{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.07718985, "auxiliary_loss_mlp": 1.70153117, "balance_loss_clip": 3.41505909, "balance_loss_mlp": 2.47734356, "epoch": 6.012325266796934e-05, "flos": 24466939372800.0, "grad_norm": 80.750913664829, "language_loss": 3.30601239, "learning_rate": 0.0, "loss": 3.35351992, "num_input_tokens_seen": 19155, "router_z_loss_clip": 43.03125, "router_z_loss_mlp": 1679.0, "step": 1, "time_per_iteration": 19.43519115447998 }, { "auxiliary_loss_clip": 0.05053996, "auxiliary_loss_mlp": 0.99255323, "balance_loss_clip": 2.26067638, "balance_loss_mlp": 1.55024052, "epoch": 0.00012024650533593868, "flos": 20234506619520.0, "grad_norm": 65.25129112022476, "language_loss": 2.05704641, "learning_rate": 4.4628432569317594e-07, "loss": 3.10013962, "num_input_tokens_seen": 36175, "router_z_loss_clip": 27.890625, "router_z_loss_mlp": 977.0, "step": 2, "time_per_iteration": 2.70206618309021 }, { "auxiliary_loss_clip": 0.05078094, "auxiliary_loss_mlp": 1.09766936, "balance_loss_clip": 2.2767148, "balance_loss_mlp": 1.5638144, "epoch": 0.000180369758003908, "flos": 22320171780480.0, "grad_norm": 72.49628618074873, "language_loss": 1.8270725, "learning_rate": 7.073439208833112e-07, "loss": 2.97552299, "num_input_tokens_seen": 54870, "router_z_loss_clip": 28.0, "router_z_loss_mlp": 1082.5, "step": 3, "time_per_iteration": 2.5937623977661133 }, { "auxiliary_loss_clip": 0.05120585, "auxiliary_loss_mlp": 0.9843846, "balance_loss_clip": 2.2918458, "balance_loss_mlp": 1.56345916, "epoch": 0.00024049301067187735, "flos": 22423683179520.0, "grad_norm": 71.5263710431209, "language_loss": 1.99349856, "learning_rate": 8.925686513863519e-07, "loss": 3.02908897, "num_input_tokens_seen": 74575, "router_z_loss_clip": 28.28125, "router_z_loss_mlp": 969.0, "step": 4, "time_per_iteration": 2.8225133419036865 }, { "auxiliary_loss_clip": 0.05109305, "auxiliary_loss_mlp": 1.06129885, "balance_loss_clip": 2.28268504, "balance_loss_mlp": 1.58887625, "epoch": 0.0003006162633398467, "flos": 21406766878080.0, "grad_norm": 74.13974949679341, "language_loss": 2.2624855, "learning_rate": 1.0362401141348472e-06, "loss": 3.37487745, "num_input_tokens_seen": 92580, "router_z_loss_clip": 28.28125, "router_z_loss_mlp": 1046.0, "step": 5, "time_per_iteration": 2.9290919303894043 }, { "auxiliary_loss_clip": 0.05097248, "auxiliary_loss_mlp": 1.13614821, "balance_loss_clip": 2.2707057, "balance_loss_mlp": 1.55427611, "epoch": 0.000360739516007816, "flos": 21662228874240.0, "grad_norm": 65.4975568880652, "language_loss": 1.89590144, "learning_rate": 1.153628246576487e-06, "loss": 3.08302212, "num_input_tokens_seen": 109705, "router_z_loss_clip": 28.25, "router_z_loss_mlp": 1122.0, "step": 6, "time_per_iteration": 3.007115364074707 }, { "auxiliary_loss_clip": 0.05072753, "auxiliary_loss_mlp": 1.1702292, "balance_loss_clip": 2.24336386, "balance_loss_mlp": 1.59323204, "epoch": 0.0004208627686757854, "flos": 27170511102720.0, "grad_norm": 66.3124465874403, "language_loss": 1.7740798, "learning_rate": 1.2528784983718962e-06, "loss": 2.9950366, "num_input_tokens_seen": 129425, "router_z_loss_clip": 28.265625, "router_z_loss_mlp": 1154.0, "step": 7, "time_per_iteration": 3.123600482940674 }, { "auxiliary_loss_clip": 0.04936389, "auxiliary_loss_mlp": 0.77675629, "balance_loss_clip": 2.25196505, "balance_loss_mlp": 1.4061029, "epoch": 0.0004809860213437547, "flos": 31330937427840.0, "grad_norm": 56.04965158379521, "language_loss": 1.75922132, "learning_rate": 1.338852977079528e-06, "loss": 2.58534145, "num_input_tokens_seen": 149210, "router_z_loss_clip": 26.84375, "router_z_loss_mlp": 763.0, "step": 8, "time_per_iteration": 2.997500419616699 }, { "auxiliary_loss_clip": 0.0494687, "auxiliary_loss_mlp": 0.89586616, "balance_loss_clip": 2.2349596, "balance_loss_mlp": 1.45184994, "epoch": 0.000541109274011724, "flos": 32173027246080.0, "grad_norm": 56.731733031942944, "language_loss": 1.84469974, "learning_rate": 1.4146878417666224e-06, "loss": 2.79003477, "num_input_tokens_seen": 169055, "router_z_loss_clip": 27.09375, "router_z_loss_mlp": 882.0, "step": 9, "time_per_iteration": 3.1255698204040527 }, { "auxiliary_loss_clip": 0.04919174, "auxiliary_loss_mlp": 0.80704236, "balance_loss_clip": 2.23462343, "balance_loss_mlp": 1.40736294, "epoch": 0.0006012325266796934, "flos": 18926176688640.0, "grad_norm": 47.71260207760991, "language_loss": 1.74780297, "learning_rate": 1.4825244398280232e-06, "loss": 2.60403705, "num_input_tokens_seen": 188045, "router_z_loss_clip": 26.859375, "router_z_loss_mlp": 793.0, "step": 10, "time_per_iteration": 2.9445340633392334 }, { "auxiliary_loss_clip": 0.04901214, "auxiliary_loss_mlp": 0.73953408, "balance_loss_clip": 2.26035595, "balance_loss_mlp": 1.39481604, "epoch": 0.0006613557793476627, "flos": 20784006443520.0, "grad_norm": 48.1537727282832, "language_loss": 1.75507092, "learning_rate": 1.5438901072051983e-06, "loss": 2.54361701, "num_input_tokens_seen": 207035, "router_z_loss_clip": 26.390625, "router_z_loss_mlp": 726.0, "step": 11, "time_per_iteration": 2.9828543663024902 }, { "auxiliary_loss_clip": 0.0488665, "auxiliary_loss_mlp": 0.71851075, "balance_loss_clip": 2.25469971, "balance_loss_mlp": 1.39208663, "epoch": 0.000721479032015632, "flos": 16590433662720.0, "grad_norm": 43.00170113714157, "language_loss": 1.68411064, "learning_rate": 1.5999125722696629e-06, "loss": 2.45148778, "num_input_tokens_seen": 223225, "router_z_loss_clip": 26.296875, "router_z_loss_mlp": 704.0, "step": 12, "time_per_iteration": 2.938222646713257 }, { "auxiliary_loss_clip": 0.04851279, "auxiliary_loss_mlp": 0.79162264, "balance_loss_clip": 2.27056551, "balance_loss_mlp": 1.33023047, "epoch": 0.0007816022846836014, "flos": 23815996738560.0, "grad_norm": 44.59606807630656, "language_loss": 1.61133099, "learning_rate": 1.6514482443788434e-06, "loss": 2.45146656, "num_input_tokens_seen": 242570, "router_z_loss_clip": 25.78125, "router_z_loss_mlp": 778.5, "step": 13, "time_per_iteration": 2.975118637084961 }, { "auxiliary_loss_clip": 0.0482919, "auxiliary_loss_mlp": 0.4812108, "balance_loss_clip": 2.38134813, "balance_loss_mlp": 1.34373498, "epoch": 0.0008417255373515708, "flos": 19181638684800.0, "grad_norm": 24.752099696183123, "language_loss": 1.42605567, "learning_rate": 1.6991628240650723e-06, "loss": 1.95555842, "num_input_tokens_seen": 261215, "router_z_loss_clip": 24.46875, "router_z_loss_mlp": 467.5, "step": 14, "time_per_iteration": 3.005387783050537 }, { "auxiliary_loss_clip": 0.04843317, "auxiliary_loss_mlp": 0.42258424, "balance_loss_clip": 2.42058206, "balance_loss_mlp": 1.34045625, "epoch": 0.00090184879001954, "flos": 26406658823040.0, "grad_norm": 21.522688589637365, "language_loss": 1.34224033, "learning_rate": 1.7435840350181584e-06, "loss": 1.81325781, "num_input_tokens_seen": 280035, "router_z_loss_clip": 24.1875, "router_z_loss_mlp": 409.5, "step": 15, "time_per_iteration": 4.444719314575195 }, { "auxiliary_loss_clip": 0.0485227, "auxiliary_loss_mlp": 0.40017533, "balance_loss_clip": 2.44129586, "balance_loss_mlp": 1.37007236, "epoch": 0.0009619720426875094, "flos": 24689830423680.0, "grad_norm": 20.54737274620766, "language_loss": 1.28753221, "learning_rate": 1.7851373027727038e-06, "loss": 1.73623025, "num_input_tokens_seen": 300265, "router_z_loss_clip": 24.125, "router_z_loss_mlp": 386.75, "step": 16, "time_per_iteration": 5.87269401550293 }, { "auxiliary_loss_clip": 0.04792338, "auxiliary_loss_mlp": 0.3982912, "balance_loss_clip": 2.41458082, "balance_loss_mlp": 1.32814205, "epoch": 0.0010220952953554788, "flos": 18633722428800.0, "grad_norm": 20.218220196956235, "language_loss": 1.39150214, "learning_rate": 1.8241705979033208e-06, "loss": 1.83771682, "num_input_tokens_seen": 317375, "router_z_loss_clip": 23.734375, "router_z_loss_mlp": 385.0, "step": 17, "time_per_iteration": 2.999260902404785 }, { "auxiliary_loss_clip": 0.04784064, "auxiliary_loss_mlp": 0.32224673, "balance_loss_clip": 2.46232653, "balance_loss_mlp": 1.29205656, "epoch": 0.001082218548023448, "flos": 26154182983680.0, "grad_norm": 16.732686601673127, "language_loss": 1.23413396, "learning_rate": 1.860972167459798e-06, "loss": 1.60422134, "num_input_tokens_seen": 337975, "router_z_loss_clip": 23.203125, "router_z_loss_mlp": 309.5, "step": 18, "time_per_iteration": 3.023937463760376 }, { "auxiliary_loss_clip": 0.04811134, "auxiliary_loss_mlp": 0.31757289, "balance_loss_clip": 2.48741102, "balance_loss_mlp": 1.31295526, "epoch": 0.0011423418006914173, "flos": 19619256821760.0, "grad_norm": 16.316076017634586, "language_loss": 1.22099996, "learning_rate": 1.89578346593066e-06, "loss": 1.58668423, "num_input_tokens_seen": 356635, "router_z_loss_clip": 23.203125, "router_z_loss_mlp": 304.5, "step": 19, "time_per_iteration": 3.008336067199707 }, { "auxiliary_loss_clip": 0.04806993, "auxiliary_loss_mlp": 0.33392182, "balance_loss_clip": 2.49775839, "balance_loss_mlp": 1.29989696, "epoch": 0.0012024650533593868, "flos": 17904509683200.0, "grad_norm": 17.142970550289178, "language_loss": 1.31938076, "learning_rate": 1.928808765521199e-06, "loss": 1.70137239, "num_input_tokens_seen": 375625, "router_z_loss_clip": 23.078125, "router_z_loss_mlp": 321.125, "step": 20, "time_per_iteration": 2.952523946762085 }, { "auxiliary_loss_clip": 0.04744621, "auxiliary_loss_mlp": 0.27582109, "balance_loss_clip": 2.52278638, "balance_loss_mlp": 1.22713017, "epoch": 0.001262588306027356, "flos": 21262055448960.0, "grad_norm": 14.549432931011598, "language_loss": 1.28722882, "learning_rate": 1.9602224192552076e-06, "loss": 1.61049604, "num_input_tokens_seen": 394350, "router_z_loss_clip": 22.21875, "router_z_loss_mlp": 263.5, "step": 21, "time_per_iteration": 3.0091187953948975 }, { "auxiliary_loss_clip": 0.04678907, "auxiliary_loss_mlp": 0.13504824, "balance_loss_clip": 2.73326254, "balance_loss_mlp": 1.35882723, "epoch": 0.0013227115586953253, "flos": 26115199948800.0, "grad_norm": 7.4316860915247265, "language_loss": 1.28335118, "learning_rate": 1.9901744328983746e-06, "loss": 1.4651885, "num_input_tokens_seen": 413255, "router_z_loss_clip": 19.4375, "router_z_loss_mlp": 121.4375, "step": 22, "time_per_iteration": 3.0606420040130615 }, { "auxiliary_loss_clip": 0.04738327, "auxiliary_loss_mlp": 0.10603125, "balance_loss_clip": 2.8633604, "balance_loss_mlp": 1.39291942, "epoch": 0.0013828348113632948, "flos": 23961929777280.0, "grad_norm": 5.122555311389441, "language_loss": 1.08021665, "learning_rate": 2.018794797290208e-06, "loss": 1.23363113, "num_input_tokens_seen": 433065, "router_z_loss_clip": 18.734375, "router_z_loss_mlp": 92.0625, "step": 23, "time_per_iteration": 2.9850363731384277 }, { "auxiliary_loss_clip": 0.04788831, "auxiliary_loss_mlp": 0.09025525, "balance_loss_clip": 2.97458267, "balance_loss_mlp": 1.44801044, "epoch": 0.001442958064031264, "flos": 15968035186560.0, "grad_norm": 4.53283244496276, "language_loss": 1.19218946, "learning_rate": 2.046196897962839e-06, "loss": 1.33033299, "num_input_tokens_seen": 451175, "router_z_loss_clip": 18.171875, "router_z_loss_mlp": 75.75, "step": 24, "time_per_iteration": 2.898371696472168 }, { "auxiliary_loss_clip": 0.04741364, "auxiliary_loss_mlp": 0.09615618, "balance_loss_clip": 2.94651365, "balance_loss_mlp": 1.44911408, "epoch": 0.0015030813166992333, "flos": 18116011716480.0, "grad_norm": 5.890458377108494, "language_loss": 1.21581137, "learning_rate": 2.0724802282696944e-06, "loss": 1.3593812, "num_input_tokens_seen": 468775, "router_z_loss_clip": 17.96875, "router_z_loss_mlp": 81.625, "step": 25, "time_per_iteration": 2.886779546737671 }, { "auxiliary_loss_clip": 0.0476236, "auxiliary_loss_mlp": 0.09070724, "balance_loss_clip": 2.99907613, "balance_loss_mlp": 1.4748987, "epoch": 0.0015632045693672028, "flos": 22244151237120.0, "grad_norm": 4.4364931290458, "language_loss": 1.18391669, "learning_rate": 2.0977325700720194e-06, "loss": 1.32224751, "num_input_tokens_seen": 488530, "router_z_loss_clip": 17.609375, "router_z_loss_mlp": 75.96875, "step": 26, "time_per_iteration": 3.0007307529449463 }, { "auxiliary_loss_clip": 0.04789963, "auxiliary_loss_mlp": 0.08426295, "balance_loss_clip": 3.04724741, "balance_loss_mlp": 1.47133934, "epoch": 0.001623327822035172, "flos": 24003582255360.0, "grad_norm": 3.7605434031046165, "language_loss": 1.05572927, "learning_rate": 2.122031762649933e-06, "loss": 1.18789196, "num_input_tokens_seen": 510495, "router_z_loss_clip": 17.4375, "router_z_loss_mlp": 69.5625, "step": 27, "time_per_iteration": 2.9833555221557617 }, { "auxiliary_loss_clip": 0.0477493, "auxiliary_loss_mlp": 0.06973341, "balance_loss_clip": 3.07912683, "balance_loss_mlp": 1.48628032, "epoch": 0.0016834510747031415, "flos": 19685821201920.0, "grad_norm": 3.2178699917375533, "language_loss": 1.18026257, "learning_rate": 2.1454471497582483e-06, "loss": 1.29774523, "num_input_tokens_seen": 528605, "router_z_loss_clip": 16.953125, "router_z_loss_mlp": 54.84375, "step": 28, "time_per_iteration": 2.917419672012329 }, { "auxiliary_loss_clip": 0.04737539, "auxiliary_loss_mlp": 0.07382212, "balance_loss_clip": 3.0716362, "balance_loss_mlp": 1.49842286, "epoch": 0.0017435743273711108, "flos": 20933785290240.0, "grad_norm": 3.11924325803866, "language_loss": 1.15037429, "learning_rate": 2.1680407726407727e-06, "loss": 1.27157176, "num_input_tokens_seen": 548515, "router_z_loss_clip": 16.671875, "router_z_loss_mlp": 58.84375, "step": 29, "time_per_iteration": 2.983382225036621 }, { "auxiliary_loss_clip": 0.04715473, "auxiliary_loss_mlp": 0.07599083, "balance_loss_clip": 3.05350828, "balance_loss_mlp": 1.55354989, "epoch": 0.00180369758003908, "flos": 19536404313600.0, "grad_norm": 3.824928631446582, "language_loss": 1.37850773, "learning_rate": 2.189868360711334e-06, "loss": 1.50165331, "num_input_tokens_seen": 564025, "router_z_loss_clip": 16.65625, "router_z_loss_mlp": 60.46875, "step": 30, "time_per_iteration": 2.9731624126434326 }, { "auxiliary_loss_clip": 0.04724666, "auxiliary_loss_mlp": 0.06635766, "balance_loss_clip": 3.10167885, "balance_loss_mlp": 1.65071881, "epoch": 0.0018638208327070496, "flos": 27464413196160.0, "grad_norm": 3.153053452271225, "language_loss": 1.18014097, "learning_rate": 2.2109801597326265e-06, "loss": 1.29374528, "num_input_tokens_seen": 583345, "router_z_loss_clip": 16.234375, "router_z_loss_mlp": 49.84375, "step": 31, "time_per_iteration": 2.966228723526001 }, { "auxiliary_loss_clip": 0.04728903, "auxiliary_loss_mlp": 0.0608696, "balance_loss_clip": 3.12140417, "balance_loss_mlp": 1.71684301, "epoch": 0.0019239440853750188, "flos": 13597335912960.0, "grad_norm": 2.6499782485116, "language_loss": 1.06642807, "learning_rate": 2.2314216284658796e-06, "loss": 1.17458677, "num_input_tokens_seen": 600010, "router_z_loss_clip": 16.078125, "router_z_loss_mlp": 43.71875, "step": 32, "time_per_iteration": 2.940821647644043 }, { "auxiliary_loss_clip": 0.04738251, "auxiliary_loss_mlp": 0.06605868, "balance_loss_clip": 3.12230968, "balance_loss_mlp": 1.73221064, "epoch": 0.001984067338042988, "flos": 11261004704640.0, "grad_norm": 3.2304432423690685, "language_loss": 1.12521827, "learning_rate": 2.2512340280885094e-06, "loss": 1.23865938, "num_input_tokens_seen": 616295, "router_z_loss_clip": 16.15625, "router_z_loss_mlp": 48.765625, "step": 33, "time_per_iteration": 2.955939292907715 }, { "auxiliary_loss_clip": 0.04739591, "auxiliary_loss_mlp": 0.05051812, "balance_loss_clip": 3.15371943, "balance_loss_mlp": 1.92681217, "epoch": 0.0020441905907109576, "flos": 22397368688640.0, "grad_norm": 2.193599177835473, "language_loss": 0.99594772, "learning_rate": 2.270454923596497e-06, "loss": 1.09386158, "num_input_tokens_seen": 637640, "router_z_loss_clip": 15.859375, "router_z_loss_mlp": 31.265625, "step": 34, "time_per_iteration": 3.0313732624053955 }, { "auxiliary_loss_clip": 0.04648167, "auxiliary_loss_mlp": 0.04997702, "balance_loss_clip": 3.08843613, "balance_loss_mlp": 2.02986765, "epoch": 0.0021043138433789266, "flos": 49794611339520.0, "grad_norm": 2.2977998165312448, "language_loss": 0.8940649, "learning_rate": 2.2891186125067434e-06, "loss": 0.99052364, "num_input_tokens_seen": 659710, "router_z_loss_clip": 15.6015625, "router_z_loss_mlp": 29.671875, "step": 35, "time_per_iteration": 3.249321937561035 }, { "auxiliary_loss_clip": 0.04644082, "auxiliary_loss_mlp": 0.04726864, "balance_loss_clip": 3.1093595, "balance_loss_mlp": 2.17559481, "epoch": 0.002164437096046896, "flos": 20567572727040.0, "grad_norm": 2.2566672745340646, "language_loss": 1.00816774, "learning_rate": 2.307256493152974e-06, "loss": 1.10187721, "num_input_tokens_seen": 679670, "router_z_loss_clip": 15.34375, "router_z_loss_mlp": 25.515625, "step": 36, "time_per_iteration": 3.08524489402771 }, { "auxiliary_loss_clip": 0.04630248, "auxiliary_loss_mlp": 0.04990356, "balance_loss_clip": 3.12265873, "balance_loss_mlp": 2.22088623, "epoch": 0.0022245603487148656, "flos": 26553632492160.0, "grad_norm": 2.5399102877001334, "language_loss": 1.0566318, "learning_rate": 2.3248973825097614e-06, "loss": 1.15283775, "num_input_tokens_seen": 700170, "router_z_loss_clip": 15.078125, "router_z_loss_mlp": 27.65625, "step": 37, "time_per_iteration": 2.983510971069336 }, { "auxiliary_loss_clip": 0.04611097, "auxiliary_loss_mlp": 0.03961299, "balance_loss_clip": 3.16870356, "balance_loss_mlp": 2.31106138, "epoch": 0.0022846836013828346, "flos": 20347519426560.0, "grad_norm": 2.0424402484536337, "language_loss": 1.11575007, "learning_rate": 2.3420677916238357e-06, "loss": 1.20147407, "num_input_tokens_seen": 718545, "router_z_loss_clip": 14.4453125, "router_z_loss_mlp": 16.5, "step": 38, "time_per_iteration": 2.9655370712280273 }, { "auxiliary_loss_clip": 0.04506429, "auxiliary_loss_mlp": 0.03775921, "balance_loss_clip": 3.14303923, "balance_loss_mlp": 2.16230488, "epoch": 0.002344806854050804, "flos": 26258101585920.0, "grad_norm": 2.027333862960444, "language_loss": 0.95167744, "learning_rate": 2.358792165262154e-06, "loss": 1.03450096, "num_input_tokens_seen": 739865, "router_z_loss_clip": 13.6328125, "router_z_loss_mlp": 16.140625, "step": 39, "time_per_iteration": 3.008152723312378 }, { "auxiliary_loss_clip": 0.0438172, "auxiliary_loss_mlp": 0.03697147, "balance_loss_clip": 3.07590675, "balance_loss_mlp": 1.97595596, "epoch": 0.0024049301067187736, "flos": 11808920960640.0, "grad_norm": 2.8344510332509296, "language_loss": 1.06198907, "learning_rate": 2.3750930912143747e-06, "loss": 1.14277792, "num_input_tokens_seen": 755770, "router_z_loss_clip": 13.078125, "router_z_loss_mlp": 17.2109375, "step": 40, "time_per_iteration": 2.933713436126709 }, { "auxiliary_loss_clip": 0.04263376, "auxiliary_loss_mlp": 0.03392493, "balance_loss_clip": 3.03415275, "balance_loss_mlp": 1.76819491, "epoch": 0.0024650533593867426, "flos": 20641285785600.0, "grad_norm": 2.038095027841407, "language_loss": 1.05127823, "learning_rate": 2.3909914837471044e-06, "loss": 1.12783694, "num_input_tokens_seen": 773440, "router_z_loss_clip": 12.3046875, "router_z_loss_mlp": 16.234375, "step": 41, "time_per_iteration": 3.0024852752685547 }, { "auxiliary_loss_clip": 0.04179972, "auxiliary_loss_mlp": 0.03160781, "balance_loss_clip": 3.00309896, "balance_loss_mlp": 1.64405775, "epoch": 0.002525176612054712, "flos": 18415388430720.0, "grad_norm": 2.064139333797363, "language_loss": 1.06096089, "learning_rate": 2.4065067449483835e-06, "loss": 1.13436842, "num_input_tokens_seen": 790455, "router_z_loss_clip": 11.7734375, "router_z_loss_mlp": 15.15625, "step": 42, "time_per_iteration": 2.8881754875183105 }, { "auxiliary_loss_clip": 0.04125922, "auxiliary_loss_mlp": 0.02867808, "balance_loss_clip": 2.97397232, "balance_loss_mlp": 1.53953028, "epoch": 0.0025852998647226816, "flos": 28195752447360.0, "grad_norm": 2.0903387315667756, "language_loss": 1.1070857, "learning_rate": 2.4216569070848724e-06, "loss": 1.17702317, "num_input_tokens_seen": 810645, "router_z_loss_clip": 11.515625, "router_z_loss_mlp": 13.2890625, "step": 43, "time_per_iteration": 2.9615306854248047 }, { "auxiliary_loss_clip": 0.04024056, "auxiliary_loss_mlp": 0.03335081, "balance_loss_clip": 2.8991127, "balance_loss_mlp": 1.48190141, "epoch": 0.0026454231173906506, "flos": 14291275697280.0, "grad_norm": 2.4122634317931295, "language_loss": 1.08213997, "learning_rate": 2.4364587585915504e-06, "loss": 1.15573144, "num_input_tokens_seen": 827470, "router_z_loss_clip": 11.2421875, "router_z_loss_mlp": 18.53125, "step": 44, "time_per_iteration": 3.0093536376953125 }, { "auxiliary_loss_clip": 0.03949196, "auxiliary_loss_mlp": 0.02542776, "balance_loss_clip": 2.86159801, "balance_loss_mlp": 1.42735875, "epoch": 0.00270554637005862, "flos": 22429429269120.0, "grad_norm": 1.8787174249155538, "language_loss": 1.09263754, "learning_rate": 2.450927955901469e-06, "loss": 1.15755725, "num_input_tokens_seen": 847285, "router_z_loss_clip": 10.875, "router_z_loss_mlp": 11.1640625, "step": 45, "time_per_iteration": 2.9262454509735107 }, { "auxiliary_loss_clip": 0.03814146, "auxiliary_loss_mlp": 0.02616596, "balance_loss_clip": 2.76267052, "balance_loss_mlp": 1.39818192, "epoch": 0.0027656696227265896, "flos": 23995800149760.0, "grad_norm": 1.6735211082321715, "language_loss": 1.10298002, "learning_rate": 2.465079122983384e-06, "loss": 1.16728747, "num_input_tokens_seen": 867545, "router_z_loss_clip": 10.515625, "router_z_loss_mlp": 12.1796875, "step": 46, "time_per_iteration": 3.021876096725464 }, { "auxiliary_loss_clip": 0.03728038, "auxiliary_loss_mlp": 0.0228702, "balance_loss_clip": 2.70781136, "balance_loss_mlp": 1.39628839, "epoch": 0.0028257928753945586, "flos": 37683481224960.0, "grad_norm": 2.1245409214462296, "language_loss": 0.99205673, "learning_rate": 2.4789259401737868e-06, "loss": 1.05220735, "num_input_tokens_seen": 889915, "router_z_loss_clip": 10.1953125, "router_z_loss_mlp": 8.90234375, "step": 47, "time_per_iteration": 3.1279637813568115 }, { "auxiliary_loss_clip": 0.03609342, "auxiliary_loss_mlp": 0.02256737, "balance_loss_clip": 2.62699032, "balance_loss_mlp": 1.43886566, "epoch": 0.002885916128062528, "flos": 22464476006400.0, "grad_norm": 1.7315270840674621, "language_loss": 0.95409727, "learning_rate": 2.492481223656015e-06, "loss": 1.01275802, "num_input_tokens_seen": 908975, "router_z_loss_clip": 9.8359375, "router_z_loss_mlp": 8.18359375, "step": 48, "time_per_iteration": 2.9573256969451904 }, { "auxiliary_loss_clip": 0.03492365, "auxiliary_loss_mlp": 0.0239595, "balance_loss_clip": 2.54325104, "balance_loss_mlp": 1.47241127, "epoch": 0.0029460393807304976, "flos": 27023265964800.0, "grad_norm": 1.8450388987066868, "language_loss": 0.98168725, "learning_rate": 2.5057569967437924e-06, "loss": 1.04057026, "num_input_tokens_seen": 929810, "router_z_loss_clip": 9.484375, "router_z_loss_mlp": 9.23828125, "step": 49, "time_per_iteration": 3.0204553604125977 }, { "auxiliary_loss_clip": 0.03365751, "auxiliary_loss_mlp": 0.02232909, "balance_loss_clip": 2.44827795, "balance_loss_mlp": 1.51917887, "epoch": 0.0030061626333984666, "flos": 15860361265920.0, "grad_norm": 1.9479088444717867, "language_loss": 0.99412388, "learning_rate": 2.51876455396287e-06, "loss": 1.05011046, "num_input_tokens_seen": 948650, "router_z_loss_clip": 9.1875, "router_z_loss_mlp": 7.140625, "step": 50, "time_per_iteration": 5.775744438171387 }, { "auxiliary_loss_clip": 0.03264582, "auxiliary_loss_mlp": 0.02216198, "balance_loss_clip": 2.39411044, "balance_loss_mlp": 1.5341301, "epoch": 0.003066285886066436, "flos": 31838287080960.0, "grad_norm": 1.9753530157624453, "language_loss": 1.0056417, "learning_rate": 2.5315145187866316e-06, "loss": 1.0604496, "num_input_tokens_seen": 966455, "router_z_loss_clip": 8.703125, "router_z_loss_mlp": 6.82421875, "step": 51, "time_per_iteration": 4.440434455871582 }, { "auxiliary_loss_clip": 0.03135578, "auxiliary_loss_mlp": 0.02258513, "balance_loss_clip": 2.30240512, "balance_loss_mlp": 1.54440165, "epoch": 0.0031264091387344056, "flos": 41442422025600.0, "grad_norm": 1.8739398191585512, "language_loss": 1.02895951, "learning_rate": 2.5440168957651953e-06, "loss": 1.08290029, "num_input_tokens_seen": 988110, "router_z_loss_clip": 8.3359375, "router_z_loss_mlp": 7.140625, "step": 52, "time_per_iteration": 3.1230568885803223 }, { "auxiliary_loss_clip": 0.03017838, "auxiliary_loss_mlp": 0.02039637, "balance_loss_clip": 2.23043537, "balance_loss_mlp": 1.52331805, "epoch": 0.0031865323914023747, "flos": 23451458232960.0, "grad_norm": 1.8070749898571337, "language_loss": 1.01873517, "learning_rate": 2.5562811176888872e-06, "loss": 1.06930995, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 7.88671875, "router_z_loss_mlp": 5.16210938, "step": 53, "time_per_iteration": 2.9906809329986572 }, { "auxiliary_loss_clip": 0.0290796, "auxiliary_loss_mlp": 0.02149796, "balance_loss_clip": 2.17462349, "balance_loss_mlp": 1.49252415, "epoch": 0.003246655644070344, "flos": 14437434960000.0, "grad_norm": 1.9792945125495223, "language_loss": 0.93568718, "learning_rate": 2.5683160883431093e-06, "loss": 0.98626477, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 7.33203125, "router_z_loss_mlp": 6.5625, "step": 54, "time_per_iteration": 2.9702131748199463 }, { "auxiliary_loss_clip": 0.02827891, "auxiliary_loss_mlp": 0.02136015, "balance_loss_clip": 2.13814402, "balance_loss_mlp": 1.44975078, "epoch": 0.0033067788967383136, "flos": 35931787067520.0, "grad_norm": 2.0477595097481287, "language_loss": 0.92444384, "learning_rate": 2.580130221340046e-06, "loss": 0.97408283, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 6.90234375, "router_z_loss_mlp": 6.86328125, "step": 55, "time_per_iteration": 3.1214632987976074 }, { "auxiliary_loss_clip": 0.02756652, "auxiliary_loss_mlp": 0.02227438, "balance_loss_clip": 2.10026264, "balance_loss_mlp": 1.41948545, "epoch": 0.003366902149406283, "flos": 22967074955520.0, "grad_norm": 2.3362825082226046, "language_loss": 1.02704704, "learning_rate": 2.5917314754514246e-06, "loss": 1.07688785, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 6.55859375, "router_z_loss_mlp": 8.0703125, "step": 56, "time_per_iteration": 3.0924503803253174 }, { "auxiliary_loss_clip": 0.02700366, "auxiliary_loss_mlp": 0.02074311, "balance_loss_clip": 2.07978201, "balance_loss_mlp": 1.38308835, "epoch": 0.003427025402074252, "flos": 26595239725440.0, "grad_norm": 4.372956429906548, "language_loss": 1.04043138, "learning_rate": 2.6031273868139713e-06, "loss": 1.08817816, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 6.20703125, "router_z_loss_mlp": 6.9140625, "step": 57, "time_per_iteration": 3.024188756942749 }, { "auxiliary_loss_clip": 0.02633189, "auxiliary_loss_mlp": 0.02124644, "balance_loss_clip": 2.02887058, "balance_loss_mlp": 1.40175891, "epoch": 0.0034871486547422216, "flos": 23961703553280.0, "grad_norm": 2.0204378954054003, "language_loss": 1.08251572, "learning_rate": 2.614325098333948e-06, "loss": 1.13009405, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 6.046875, "router_z_loss_mlp": 7.23046875, "step": 58, "time_per_iteration": 3.0998780727386475 }, { "auxiliary_loss_clip": 0.02581605, "auxiliary_loss_mlp": 0.02027596, "balance_loss_clip": 1.99360061, "balance_loss_mlp": 1.37871587, "epoch": 0.003547271907410191, "flos": 21224836961280.0, "grad_norm": 1.8913346586629278, "language_loss": 0.98887563, "learning_rate": 2.625331386578098e-06, "loss": 1.03496754, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 5.8828125, "router_z_loss_mlp": 6.4921875, "step": 59, "time_per_iteration": 2.9935126304626465 }, { "auxiliary_loss_clip": 0.02565694, "auxiliary_loss_mlp": 0.02080627, "balance_loss_clip": 1.97923756, "balance_loss_mlp": 1.42659712, "epoch": 0.00360739516007816, "flos": 16512648572160.0, "grad_norm": 1.8217685591083403, "language_loss": 1.03651714, "learning_rate": 2.63615268640451e-06, "loss": 1.08298028, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 5.875, "router_z_loss_mlp": 6.54101562, "step": 60, "time_per_iteration": 2.921691656112671 }, { "auxiliary_loss_clip": 0.02525818, "auxiliary_loss_mlp": 0.01972963, "balance_loss_clip": 1.94882774, "balance_loss_mlp": 1.43432772, "epoch": 0.0036675184127461296, "flos": 19474500147840.0, "grad_norm": 2.012211350550266, "language_loss": 1.00783324, "learning_rate": 2.6467951135575943e-06, "loss": 1.05282104, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 5.7734375, "router_z_loss_mlp": 5.390625, "step": 61, "time_per_iteration": 2.9752259254455566 }, { "auxiliary_loss_clip": 0.02525007, "auxiliary_loss_mlp": 0.01933186, "balance_loss_clip": 1.95319808, "balance_loss_mlp": 1.42296982, "epoch": 0.003727641665414099, "flos": 20966524542720.0, "grad_norm": 1.7233977874020083, "language_loss": 0.97835529, "learning_rate": 2.657264485425803e-06, "loss": 1.0229373, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 5.71875, "router_z_loss_mlp": 5.09960938, "step": 62, "time_per_iteration": 2.9686691761016846 }, { "auxiliary_loss_clip": 0.02493904, "auxiliary_loss_mlp": 0.01856282, "balance_loss_clip": 1.92791915, "balance_loss_mlp": 1.39317799, "epoch": 0.003787764918082068, "flos": 18415297941120.0, "grad_norm": 1.686873854699331, "language_loss": 1.02980542, "learning_rate": 2.6675663401385186e-06, "loss": 1.07330728, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 5.65625, "router_z_loss_mlp": 4.6328125, "step": 63, "time_per_iteration": 2.951929807662964 }, { "auxiliary_loss_clip": 0.02488836, "auxiliary_loss_mlp": 0.01824781, "balance_loss_clip": 1.93110788, "balance_loss_mlp": 1.38437414, "epoch": 0.0038478881707500376, "flos": 12467044846080.0, "grad_norm": 1.9303686318249158, "language_loss": 1.11379838, "learning_rate": 2.677705954159056e-06, "loss": 1.15693462, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 5.5859375, "router_z_loss_mlp": 4.40234375, "step": 64, "time_per_iteration": 2.9584901332855225 }, { "auxiliary_loss_clip": 0.0245959, "auxiliary_loss_mlp": 0.01820107, "balance_loss_clip": 1.91570032, "balance_loss_mlp": 1.36882806, "epoch": 0.003908011423418007, "flos": 13561293790080.0, "grad_norm": 1.9293013249698734, "language_loss": 1.00365043, "learning_rate": 2.6876883585136904e-06, "loss": 1.0464474, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 5.4453125, "router_z_loss_mlp": 4.51171875, "step": 65, "time_per_iteration": 2.9369256496429443 }, { "auxiliary_loss_clip": 0.02428748, "auxiliary_loss_mlp": 0.01830616, "balance_loss_clip": 1.90669298, "balance_loss_mlp": 1.3783834, "epoch": 0.003968134676085976, "flos": 18342806492160.0, "grad_norm": 1.6295432689463107, "language_loss": 0.98608351, "learning_rate": 2.697518353781685e-06, "loss": 1.02867711, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 5.21484375, "router_z_loss_mlp": 4.5234375, "step": 66, "time_per_iteration": 3.0221219062805176 }, { "auxiliary_loss_clip": 0.02392277, "auxiliary_loss_mlp": 0.01814904, "balance_loss_clip": 1.89489579, "balance_loss_mlp": 1.3765955, "epoch": 0.004028257928753946, "flos": 20494900298880.0, "grad_norm": 1.9643108818682868, "language_loss": 1.10504806, "learning_rate": 2.7072005239581103e-06, "loss": 1.14712, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 4.96875, "router_z_loss_mlp": 4.38476562, "step": 67, "time_per_iteration": 2.9501683712005615 }, { "auxiliary_loss_clip": 0.02350519, "auxiliary_loss_mlp": 0.01734376, "balance_loss_clip": 1.87984216, "balance_loss_mlp": 1.37255144, "epoch": 0.004088381181421915, "flos": 18853187546880.0, "grad_norm": 1.7730617643888935, "language_loss": 1.01648676, "learning_rate": 2.7167392492896727e-06, "loss": 1.05733562, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 4.69921875, "router_z_loss_mlp": 3.62109375, "step": 68, "time_per_iteration": 2.964909553527832 }, { "auxiliary_loss_clip": 0.02310557, "auxiliary_loss_mlp": 0.01778697, "balance_loss_clip": 1.86005521, "balance_loss_mlp": 1.36899877, "epoch": 0.004148504434089885, "flos": 19437372149760.0, "grad_norm": 1.7152465305511237, "language_loss": 1.06029165, "learning_rate": 2.7261387181735195e-06, "loss": 1.10118413, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 4.51171875, "router_z_loss_mlp": 4.09570312, "step": 69, "time_per_iteration": 2.9616217613220215 }, { "auxiliary_loss_clip": 0.02292586, "auxiliary_loss_mlp": 0.01737615, "balance_loss_clip": 1.86308813, "balance_loss_mlp": 1.36358428, "epoch": 0.004208627686757853, "flos": 20820093811200.0, "grad_norm": 2.0847122496335153, "language_loss": 1.0963186, "learning_rate": 2.7354029381999196e-06, "loss": 1.13662052, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 4.2890625, "router_z_loss_mlp": 3.74023438, "step": 70, "time_per_iteration": 2.942866086959839 }, { "auxiliary_loss_clip": 0.02260396, "auxiliary_loss_mlp": 0.01844666, "balance_loss_clip": 1.84152269, "balance_loss_mlp": 1.35981774, "epoch": 0.004268750939425823, "flos": 19107880381440.0, "grad_norm": 2.080099157641855, "language_loss": 1.17807257, "learning_rate": 2.7445357464116983e-06, "loss": 1.21912324, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 4.1875, "router_z_loss_mlp": 4.84570312, "step": 71, "time_per_iteration": 2.967806816101074 }, { "auxiliary_loss_clip": 0.02468186, "auxiliary_loss_mlp": 0.01745151, "balance_loss_clip": 2.12512517, "balance_loss_mlp": 1.37741411, "epoch": 0.004328874192093792, "flos": 52465203002880.0, "grad_norm": 2.627424663235757, "language_loss": 0.67090684, "learning_rate": 2.75354081884615e-06, "loss": 0.71304029, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 3.671875, "step": 72, "time_per_iteration": 3.3788864612579346 }, { "auxiliary_loss_clip": 0.02465628, "auxiliary_loss_mlp": 0.01579652, "balance_loss_clip": 2.12709475, "balance_loss_mlp": 1.34771872, "epoch": 0.004388997444761762, "flos": 66508436851200.0, "grad_norm": 2.4574586075941327, "language_loss": 0.64710939, "learning_rate": 2.7624216794188286e-06, "loss": 0.68756223, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 2.3125, "step": 73, "time_per_iteration": 3.451493978500366 }, { "auxiliary_loss_clip": 0.02220821, "auxiliary_loss_mlp": 0.0171673, "balance_loss_clip": 1.83534443, "balance_loss_mlp": 1.36444235, "epoch": 0.004449120697429731, "flos": 18962264056320.0, "grad_norm": 1.8122737928190449, "language_loss": 0.98491162, "learning_rate": 2.771181708202938e-06, "loss": 1.02428722, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 3.859375, "router_z_loss_mlp": 3.5234375, "step": 74, "time_per_iteration": 3.0129518508911133 }, { "auxiliary_loss_clip": 0.0220345, "auxiliary_loss_mlp": 0.01728201, "balance_loss_clip": 1.81980813, "balance_loss_mlp": 1.35340667, "epoch": 0.004509243950097701, "flos": 21115308003840.0, "grad_norm": 1.7986986515436891, "language_loss": 1.07602251, "learning_rate": 2.779824149153005e-06, "loss": 1.11533904, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 3.8359375, "router_z_loss_mlp": 3.75, "step": 75, "time_per_iteration": 2.9826927185058594 }, { "auxiliary_loss_clip": 0.02203747, "auxiliary_loss_mlp": 0.0172515, "balance_loss_clip": 1.82403791, "balance_loss_mlp": 1.36046553, "epoch": 0.004569367202765669, "flos": 20707035759360.0, "grad_norm": 1.824084662435007, "language_loss": 0.9909516, "learning_rate": 2.788352117317012e-06, "loss": 1.03024054, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 3.79882812, "router_z_loss_mlp": 3.6484375, "step": 76, "time_per_iteration": 3.0636885166168213 }, { "auxiliary_loss_clip": 0.02194324, "auxiliary_loss_mlp": 0.01687185, "balance_loss_clip": 1.81513953, "balance_loss_mlp": 1.35587883, "epoch": 0.004629490455433639, "flos": 28670362848000.0, "grad_norm": 1.7042981619568738, "language_loss": 1.02161932, "learning_rate": 2.796768605577095e-06, "loss": 1.06043446, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 3.7890625, "router_z_loss_mlp": 3.31054688, "step": 77, "time_per_iteration": 3.0720362663269043 }, { "auxiliary_loss_clip": 0.02171185, "auxiliary_loss_mlp": 0.01636194, "balance_loss_clip": 1.80305076, "balance_loss_mlp": 1.36592269, "epoch": 0.004689613708101608, "flos": 11079753459840.0, "grad_norm": 1.989328546204836, "language_loss": 1.04702544, "learning_rate": 2.80507649095533e-06, "loss": 1.08509922, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 3.68164062, "router_z_loss_mlp": 2.703125, "step": 78, "time_per_iteration": 2.9188883304595947 }, { "auxiliary_loss_clip": 0.02167085, "auxiliary_loss_mlp": 0.01657007, "balance_loss_clip": 1.79622591, "balance_loss_mlp": 1.36308455, "epoch": 0.004749736960769578, "flos": 21809202543360.0, "grad_norm": 2.1524239162120753, "language_loss": 0.96298873, "learning_rate": 2.813278540517843e-06, "loss": 1.00122976, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 3.70898438, "router_z_loss_mlp": 2.94140625, "step": 79, "time_per_iteration": 2.968212127685547 }, { "auxiliary_loss_clip": 0.02152993, "auxiliary_loss_mlp": 0.01706955, "balance_loss_clip": 1.78584027, "balance_loss_mlp": 1.38385034, "epoch": 0.004809860213437547, "flos": 19802227368960.0, "grad_norm": 1.6417094800798346, "language_loss": 0.99399751, "learning_rate": 2.8213774169075505e-06, "loss": 1.03259695, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 3.671875, "router_z_loss_mlp": 3.22851562, "step": 80, "time_per_iteration": 2.999403238296509 }, { "auxiliary_loss_clip": 0.02145505, "auxiliary_loss_mlp": 0.01643657, "balance_loss_clip": 1.78738248, "balance_loss_mlp": 1.3602246, "epoch": 0.004869983466105517, "flos": 26584516707840.0, "grad_norm": 1.8651087935124366, "language_loss": 1.05145454, "learning_rate": 2.829375683533245e-06, "loss": 1.08934617, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 3.58203125, "router_z_loss_mlp": 2.83398438, "step": 81, "time_per_iteration": 2.990549087524414 }, { "auxiliary_loss_clip": 0.02134164, "auxiliary_loss_mlp": 0.01654487, "balance_loss_clip": 1.77317882, "balance_loss_mlp": 1.35713112, "epoch": 0.004930106718773485, "flos": 12831085658880.0, "grad_norm": 2.3593275805769856, "language_loss": 1.13555384, "learning_rate": 2.8372758094402803e-06, "loss": 1.17344034, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 3.61132812, "router_z_loss_mlp": 2.97265625, "step": 82, "time_per_iteration": 2.936556816101074 }, { "auxiliary_loss_clip": 0.02119173, "auxiliary_loss_mlp": 0.0163481, "balance_loss_clip": 1.76616168, "balance_loss_mlp": 1.35500205, "epoch": 0.004990229971441455, "flos": 25785753425280.0, "grad_norm": 1.790325806226351, "language_loss": 0.94502568, "learning_rate": 2.84508017388607e-06, "loss": 0.98256558, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 3.52929688, "router_z_loss_mlp": 2.79882812, "step": 83, "time_per_iteration": 3.0076515674591064 }, { "auxiliary_loss_clip": 0.02100869, "auxiliary_loss_mlp": 0.01634419, "balance_loss_clip": 1.75793338, "balance_loss_mlp": 1.3485074, "epoch": 0.005050353224109424, "flos": 17466212874240.0, "grad_norm": 2.018160282273101, "language_loss": 1.04113054, "learning_rate": 2.852791070641559e-06, "loss": 1.07848334, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 2.859375, "step": 84, "time_per_iteration": 2.968029737472534 }, { "auxiliary_loss_clip": 0.02327821, "auxiliary_loss_mlp": 0.01425662, "balance_loss_clip": 2.03096032, "balance_loss_mlp": 1.33906794, "epoch": 0.005110476476777394, "flos": 69835170908160.0, "grad_norm": 1.6250768740902481, "language_loss": 0.63235581, "learning_rate": 2.8604107120381682e-06, "loss": 0.66989064, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.8671875, "step": 85, "time_per_iteration": 6.263977289199829 }, { "auxiliary_loss_clip": 0.02069861, "auxiliary_loss_mlp": 0.01643009, "balance_loss_clip": 1.73488259, "balance_loss_mlp": 1.35576189, "epoch": 0.005170599729445363, "flos": 24800761969920.0, "grad_norm": 1.4888853033301752, "language_loss": 0.96089995, "learning_rate": 2.8679412327780482e-06, "loss": 0.99802858, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 2.87109375, "step": 86, "time_per_iteration": 4.483587741851807 }, { "auxiliary_loss_clip": 0.02060358, "auxiliary_loss_mlp": 0.01618375, "balance_loss_clip": 1.72654772, "balance_loss_mlp": 1.35106003, "epoch": 0.005230722982113333, "flos": 23268397196160.0, "grad_norm": 2.031680688672302, "language_loss": 0.95613682, "learning_rate": 2.8753846935240833e-06, "loss": 0.99292409, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 3.33789062, "router_z_loss_mlp": 2.67285156, "step": 87, "time_per_iteration": 2.964277982711792 }, { "auxiliary_loss_clip": 0.02037031, "auxiliary_loss_mlp": 0.01560917, "balance_loss_clip": 1.71697092, "balance_loss_mlp": 1.35339785, "epoch": 0.005290846234781301, "flos": 16736773904640.0, "grad_norm": 1.585897659189128, "language_loss": 1.03720117, "learning_rate": 2.8827430842847267e-06, "loss": 1.07318068, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 2.07714844, "step": 88, "time_per_iteration": 2.954820394515991 }, { "auxiliary_loss_clip": 0.02026895, "auxiliary_loss_mlp": 0.01580087, "balance_loss_clip": 1.70528781, "balance_loss_mlp": 1.34843946, "epoch": 0.005350969487449271, "flos": 20895707151360.0, "grad_norm": 1.675591249683518, "language_loss": 0.95536608, "learning_rate": 2.8900183276075957e-06, "loss": 0.99143589, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 3.21875, "router_z_loss_mlp": 2.31640625, "step": 89, "time_per_iteration": 2.976139783859253 }, { "auxiliary_loss_clip": 0.02013525, "auxiliary_loss_mlp": 0.01567141, "balance_loss_clip": 1.69726038, "balance_loss_mlp": 1.35256433, "epoch": 0.00541109274011724, "flos": 26220204426240.0, "grad_norm": 1.6140515868788872, "language_loss": 1.0082438, "learning_rate": 2.8972122815946455e-06, "loss": 1.04405046, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 2.14746094, "step": 90, "time_per_iteration": 2.970111846923828 }, { "auxiliary_loss_clip": 0.01994247, "auxiliary_loss_mlp": 0.01596177, "balance_loss_clip": 1.68450856, "balance_loss_mlp": 1.35499239, "epoch": 0.00547121599278521, "flos": 21188432880000.0, "grad_norm": 1.8403228620390235, "language_loss": 0.94911039, "learning_rate": 2.90432674275074e-06, "loss": 0.98501468, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 2.41308594, "step": 91, "time_per_iteration": 2.971445322036743 }, { "auxiliary_loss_clip": 0.01995211, "auxiliary_loss_mlp": 0.01589467, "balance_loss_clip": 1.68318522, "balance_loss_mlp": 1.35467279, "epoch": 0.005531339245453179, "flos": 19728197596800.0, "grad_norm": 2.068406962320896, "language_loss": 1.00197899, "learning_rate": 2.91136344867656e-06, "loss": 1.03782582, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 3.125, "router_z_loss_mlp": 2.34667969, "step": 92, "time_per_iteration": 2.9482290744781494 }, { "auxiliary_loss_clip": 0.0198818, "auxiliary_loss_mlp": 0.01574655, "balance_loss_clip": 1.67300308, "balance_loss_mlp": 1.35340309, "epoch": 0.005591462498121149, "flos": 17644161248640.0, "grad_norm": 2.2334664655249714, "language_loss": 1.11671638, "learning_rate": 2.918324080615938e-06, "loss": 1.15234458, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 2.21289062, "step": 93, "time_per_iteration": 2.943408489227295 }, { "auxiliary_loss_clip": 0.01991725, "auxiliary_loss_mlp": 0.01604058, "balance_loss_clip": 1.67189646, "balance_loss_mlp": 1.35667479, "epoch": 0.005651585750789117, "flos": 20020832835840.0, "grad_norm": 2.253313963518644, "language_loss": 1.00802302, "learning_rate": 2.925210265866963e-06, "loss": 1.04398084, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 2.47558594, "step": 94, "time_per_iteration": 3.011888027191162 }, { "auxiliary_loss_clip": 0.02080452, "auxiliary_loss_mlp": 0.01412516, "balance_loss_clip": 1.81388235, "balance_loss_mlp": 1.33240759, "epoch": 0.005711709003457087, "flos": 59841268842240.0, "grad_norm": 1.4637687301060223, "language_loss": 0.68633616, "learning_rate": 2.932023580065507e-06, "loss": 0.72126579, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.80078125, "step": 95, "time_per_iteration": 3.239597797393799 }, { "auxiliary_loss_clip": 0.01981705, "auxiliary_loss_mlp": 0.01586513, "balance_loss_clip": 1.66928828, "balance_loss_mlp": 1.34933424, "epoch": 0.005771832256125056, "flos": 15567952250880.0, "grad_norm": 1.764136207853058, "language_loss": 1.02087808, "learning_rate": 2.9387655493491906e-06, "loss": 1.0565604, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 2.37207031, "step": 96, "time_per_iteration": 2.994492769241333 }, { "auxiliary_loss_clip": 0.01974666, "auxiliary_loss_mlp": 0.01579647, "balance_loss_clip": 1.66532207, "balance_loss_mlp": 1.35477054, "epoch": 0.005831955508793026, "flos": 22538551023360.0, "grad_norm": 2.0376589297783205, "language_loss": 1.05534458, "learning_rate": 2.9454376524092147e-06, "loss": 1.09088778, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 2.25, "step": 97, "time_per_iteration": 2.9825820922851562 }, { "auxiliary_loss_clip": 0.0196442, "auxiliary_loss_mlp": 0.01601222, "balance_loss_clip": 1.65420675, "balance_loss_mlp": 1.35002446, "epoch": 0.005892078761460995, "flos": 22058782715520.0, "grad_norm": 1.8372653349549066, "language_loss": 0.84753704, "learning_rate": 2.952041322436969e-06, "loss": 0.88319343, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 2.51660156, "step": 98, "time_per_iteration": 3.0707077980041504 }, { "auxiliary_loss_clip": 0.01981568, "auxiliary_loss_mlp": 0.01431458, "balance_loss_clip": 1.73114038, "balance_loss_mlp": 1.35287571, "epoch": 0.005952202014128965, "flos": 68571298632960.0, "grad_norm": 1.0903465547067244, "language_loss": 0.65863246, "learning_rate": 2.9585779489718204e-06, "loss": 0.69276273, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.78515625, "step": 99, "time_per_iteration": 3.4346020221710205 }, { "auxiliary_loss_clip": 0.01945887, "auxiliary_loss_mlp": 0.01549802, "balance_loss_clip": 1.64866436, "balance_loss_mlp": 1.34991217, "epoch": 0.006012325266796933, "flos": 22969789643520.0, "grad_norm": 1.7755210256902771, "language_loss": 1.03168666, "learning_rate": 2.9650488796560464e-06, "loss": 1.0666436, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 2.97265625, "router_z_loss_mlp": 1.99902344, "step": 100, "time_per_iteration": 2.9528040885925293 }, { "auxiliary_loss_clip": 0.01944294, "auxiliary_loss_mlp": 0.01559737, "balance_loss_clip": 1.64519811, "balance_loss_mlp": 1.3622309, "epoch": 0.006072448519464903, "flos": 17356910140800.0, "grad_norm": 1.851347115892186, "language_loss": 1.01091123, "learning_rate": 2.971455421902446e-06, "loss": 1.0459516, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 1.97460938, "step": 101, "time_per_iteration": 2.959949016571045 }, { "auxiliary_loss_clip": 0.01937535, "auxiliary_loss_mlp": 0.01559691, "balance_loss_clip": 1.64149296, "balance_loss_mlp": 1.35837042, "epoch": 0.006132571772132872, "flos": 24691866439680.0, "grad_norm": 1.7239595815909345, "language_loss": 1.03446138, "learning_rate": 2.9777988444798075e-06, "loss": 1.06943369, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 2.01367188, "step": 102, "time_per_iteration": 3.0102484226226807 }, { "auxiliary_loss_clip": 0.01935015, "auxiliary_loss_mlp": 0.01614495, "balance_loss_clip": 1.63839757, "balance_loss_mlp": 1.36320138, "epoch": 0.006192695024800842, "flos": 21474371888640.0, "grad_norm": 1.9250394651509122, "language_loss": 0.97853917, "learning_rate": 2.9840803790210285e-06, "loss": 1.01403427, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 2.96679688, "router_z_loss_mlp": 2.515625, "step": 103, "time_per_iteration": 2.9484057426452637 }, { "auxiliary_loss_clip": 0.01927277, "auxiliary_loss_mlp": 0.01590141, "balance_loss_clip": 1.63277698, "balance_loss_mlp": 1.36345267, "epoch": 0.006252818277468811, "flos": 17429265855360.0, "grad_norm": 1.664814914521033, "language_loss": 1.00576234, "learning_rate": 2.990301221458371e-06, "loss": 1.04093659, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 2.26757812, "step": 104, "time_per_iteration": 2.9664459228515625 }, { "auxiliary_loss_clip": 0.01933122, "auxiliary_loss_mlp": 0.01557284, "balance_loss_clip": 1.64216352, "balance_loss_mlp": 1.36664486, "epoch": 0.006312941530136781, "flos": 19109056746240.0, "grad_norm": 1.8866654983844848, "language_loss": 1.0761739, "learning_rate": 2.9964625333900544e-06, "loss": 1.11107802, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 1.90527344, "step": 105, "time_per_iteration": 2.984001636505127 }, { "auxiliary_loss_clip": 0.0192205, "auxiliary_loss_mlp": 0.01555855, "balance_loss_clip": 1.63220656, "balance_loss_mlp": 1.3480494, "epoch": 0.006373064782804749, "flos": 24071413489920.0, "grad_norm": 1.9538587272618906, "language_loss": 1.04559231, "learning_rate": 3.002565443382063e-06, "loss": 1.08037138, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 2.08007812, "step": 106, "time_per_iteration": 3.048912763595581 }, { "auxiliary_loss_clip": 0.01920889, "auxiliary_loss_mlp": 0.01538576, "balance_loss_clip": 1.62742102, "balance_loss_mlp": 1.34860444, "epoch": 0.006433188035472719, "flos": 18341494392960.0, "grad_norm": 1.862783685773675, "language_loss": 0.99073768, "learning_rate": 3.008611048208843e-06, "loss": 1.02533245, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 1.90039062, "step": 107, "time_per_iteration": 3.03747820854187 }, { "auxiliary_loss_clip": 0.01922726, "auxiliary_loss_mlp": 0.01489736, "balance_loss_clip": 1.69902253, "balance_loss_mlp": 1.42870128, "epoch": 0.006493311288140688, "flos": 62594785520640.0, "grad_norm": 1.0550469301656948, "language_loss": 0.65021467, "learning_rate": 3.014600414036285e-06, "loss": 0.68433928, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.609375, "step": 108, "time_per_iteration": 3.3504483699798584 }, { "auxiliary_loss_clip": 0.0191401, "auxiliary_loss_mlp": 0.0154659, "balance_loss_clip": 1.62987506, "balance_loss_mlp": 1.36272204, "epoch": 0.006553434540808658, "flos": 19509546885120.0, "grad_norm": 1.792568200840074, "language_loss": 1.08283758, "learning_rate": 3.0205345775501937e-06, "loss": 1.11744368, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 1.83886719, "step": 109, "time_per_iteration": 2.962038040161133 }, { "auxiliary_loss_clip": 0.01898747, "auxiliary_loss_mlp": 0.01584883, "balance_loss_clip": 1.61868119, "balance_loss_mlp": 1.38804448, "epoch": 0.006613557793476627, "flos": 21114855555840.0, "grad_norm": 1.4914829380756742, "language_loss": 0.94560754, "learning_rate": 3.0264145470332218e-06, "loss": 0.98044389, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 1.96679688, "step": 110, "time_per_iteration": 2.996743679046631 }, { "auxiliary_loss_clip": 0.01886148, "auxiliary_loss_mlp": 0.01583833, "balance_loss_clip": 1.60763562, "balance_loss_mlp": 1.39538682, "epoch": 0.006673681046144597, "flos": 26041758359040.0, "grad_norm": 1.8055190977799156, "language_loss": 0.89272934, "learning_rate": 3.032241303393073e-06, "loss": 0.9274292, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 1.88476562, "step": 111, "time_per_iteration": 3.0377683639526367 }, { "auxiliary_loss_clip": 0.01886626, "auxiliary_loss_mlp": 0.01576551, "balance_loss_clip": 1.60440516, "balance_loss_mlp": 1.37570679, "epoch": 0.006733804298812566, "flos": 23157872853120.0, "grad_norm": 1.64758868341786, "language_loss": 1.01985645, "learning_rate": 3.0380158011446e-06, "loss": 1.05448818, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 2.00976562, "step": 112, "time_per_iteration": 3.0487022399902344 }, { "auxiliary_loss_clip": 0.01875265, "auxiliary_loss_mlp": 0.01545316, "balance_loss_clip": 1.59934902, "balance_loss_mlp": 1.3559165, "epoch": 0.006793927551480535, "flos": 11771657228160.0, "grad_norm": 2.0481936995810166, "language_loss": 0.94080055, "learning_rate": 3.0437389693482466e-06, "loss": 0.97500634, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 1.89257812, "step": 113, "time_per_iteration": 2.9369571208953857 }, { "auxiliary_loss_clip": 0.01871203, "auxiliary_loss_mlp": 0.01505836, "balance_loss_clip": 1.59276152, "balance_loss_mlp": 1.36998463, "epoch": 0.006854050804148504, "flos": 19181321971200.0, "grad_norm": 1.653140138910595, "language_loss": 1.03894496, "learning_rate": 3.0494117125071475e-06, "loss": 1.0727154, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 1.35888672, "step": 114, "time_per_iteration": 2.9705049991607666 }, { "auxiliary_loss_clip": 0.01866799, "auxiliary_loss_mlp": 0.01551521, "balance_loss_clip": 1.58869743, "balance_loss_mlp": 1.37094283, "epoch": 0.006914174056816474, "flos": 21992127845760.0, "grad_norm": 1.7331904134594145, "language_loss": 1.05513215, "learning_rate": 3.055034911425055e-06, "loss": 1.08931541, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 1.80712891, "step": 115, "time_per_iteration": 3.013333320617676 }, { "auxiliary_loss_clip": 0.01878037, "auxiliary_loss_mlp": 0.01671563, "balance_loss_clip": 1.59157312, "balance_loss_mlp": 1.38674784, "epoch": 0.006974297309484443, "flos": 16297662689280.0, "grad_norm": 1.8038799674825148, "language_loss": 0.96423781, "learning_rate": 3.0606094240271244e-06, "loss": 0.99973392, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 2.85107422, "step": 116, "time_per_iteration": 2.9376189708709717 }, { "auxiliary_loss_clip": 0.01846166, "auxiliary_loss_mlp": 0.01523902, "balance_loss_clip": 1.57392287, "balance_loss_mlp": 1.37779856, "epoch": 0.007034420562152413, "flos": 26115109459200.0, "grad_norm": 1.8016206569490554, "language_loss": 1.02880001, "learning_rate": 3.0661360861454656e-06, "loss": 1.06250072, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 1.46191406, "step": 117, "time_per_iteration": 2.9882030487060547 }, { "auxiliary_loss_clip": 0.01839617, "auxiliary_loss_mlp": 0.01510947, "balance_loss_clip": 1.56601071, "balance_loss_mlp": 1.36379528, "epoch": 0.007094543814820382, "flos": 14211318856320.0, "grad_norm": 1.8741808881633313, "language_loss": 0.98083568, "learning_rate": 3.071615712271274e-06, "loss": 1.01434135, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 1.47167969, "step": 118, "time_per_iteration": 2.9267263412475586 }, { "auxiliary_loss_clip": 0.01833788, "auxiliary_loss_mlp": 0.01527576, "balance_loss_clip": 1.56065798, "balance_loss_mlp": 1.37022018, "epoch": 0.007154667067488351, "flos": 14984265340800.0, "grad_norm": 1.8385648159151324, "language_loss": 1.08662391, "learning_rate": 3.0770490962752172e-06, "loss": 1.12023759, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 1.57421875, "step": 119, "time_per_iteration": 2.9875338077545166 }, { "auxiliary_loss_clip": 0.01828494, "auxiliary_loss_mlp": 0.01523588, "balance_loss_clip": 1.55392456, "balance_loss_mlp": 1.36537349, "epoch": 0.00721479032015632, "flos": 20202853242240.0, "grad_norm": 2.02862627281142, "language_loss": 1.10576797, "learning_rate": 3.082437012097686e-06, "loss": 1.1392889, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 1.58203125, "step": 120, "time_per_iteration": 7.1235082149505615 }, { "auxiliary_loss_clip": 0.01809403, "auxiliary_loss_mlp": 0.01510399, "balance_loss_clip": 1.54465103, "balance_loss_mlp": 1.37345147, "epoch": 0.00727491357282429, "flos": 23157194181120.0, "grad_norm": 1.6537702279639956, "language_loss": 0.98594522, "learning_rate": 3.0877802144103967e-06, "loss": 1.01914322, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 2.64453125, "router_z_loss_mlp": 1.37011719, "step": 121, "time_per_iteration": 4.4040961265563965 }, { "auxiliary_loss_clip": 0.01813054, "auxiliary_loss_mlp": 0.01535398, "balance_loss_clip": 1.54480803, "balance_loss_mlp": 1.36926746, "epoch": 0.007335036825492259, "flos": 15529647888000.0, "grad_norm": 2.1449802853017257, "language_loss": 1.050385, "learning_rate": 3.09307943925077e-06, "loss": 1.08386946, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 1.66210938, "step": 122, "time_per_iteration": 2.9283711910247803 }, { "auxiliary_loss_clip": 0.0180488, "auxiliary_loss_mlp": 0.01494523, "balance_loss_clip": 1.54170537, "balance_loss_mlp": 1.35519147, "epoch": 0.007395160078160229, "flos": 24254022078720.0, "grad_norm": 1.881524266622204, "language_loss": 1.04281998, "learning_rate": 3.0983354046304154e-06, "loss": 1.07581413, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 1.39257812, "step": 123, "time_per_iteration": 3.007431983947754 }, { "auxiliary_loss_clip": 0.01806349, "auxiliary_loss_mlp": 0.0150086, "balance_loss_clip": 1.53740692, "balance_loss_mlp": 1.35742736, "epoch": 0.007455283330828198, "flos": 31772491862400.0, "grad_norm": 1.7140578427898214, "language_loss": 0.84739226, "learning_rate": 3.103548811118979e-06, "loss": 0.88046438, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 1.43603516, "step": 124, "time_per_iteration": 3.075922966003418 }, { "auxiliary_loss_clip": 0.01793709, "auxiliary_loss_mlp": 0.01520004, "balance_loss_clip": 1.52771068, "balance_loss_mlp": 1.3671298, "epoch": 0.007515406583496167, "flos": 26626485899520.0, "grad_norm": 1.8289813203271517, "language_loss": 1.00631356, "learning_rate": 3.108720342404542e-06, "loss": 1.03945065, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 1.52929688, "step": 125, "time_per_iteration": 3.031848430633545 }, { "auxiliary_loss_clip": 0.01795018, "auxiliary_loss_mlp": 0.01515022, "balance_loss_clip": 1.52735257, "balance_loss_mlp": 1.37402105, "epoch": 0.007575529836164136, "flos": 18232915576320.0, "grad_norm": 3.125298966529811, "language_loss": 1.01942134, "learning_rate": 3.1138506658316945e-06, "loss": 1.05252171, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 1.40917969, "step": 126, "time_per_iteration": 3.087946891784668 }, { "auxiliary_loss_clip": 0.01793919, "auxiliary_loss_mlp": 0.01507171, "balance_loss_clip": 1.52451944, "balance_loss_mlp": 1.38552988, "epoch": 0.007635653088832106, "flos": 21590506586880.0, "grad_norm": 2.0968660593484723, "language_loss": 0.8475852, "learning_rate": 3.1189404329183404e-06, "loss": 0.8805961, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 1.21582031, "step": 127, "time_per_iteration": 3.032813310623169 }, { "auxiliary_loss_clip": 0.01798734, "auxiliary_loss_mlp": 0.01537827, "balance_loss_clip": 1.53317142, "balance_loss_mlp": 1.37689507, "epoch": 0.007695776341500075, "flos": 25386168182400.0, "grad_norm": 1.7634310780686029, "language_loss": 0.96107388, "learning_rate": 3.1239902798522317e-06, "loss": 0.99443942, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 1.60986328, "step": 128, "time_per_iteration": 2.994908332824707 }, { "auxiliary_loss_clip": 0.01775609, "auxiliary_loss_mlp": 0.01485588, "balance_loss_clip": 1.51239061, "balance_loss_mlp": 1.34921277, "epoch": 0.007755899594168045, "flos": 22353499215360.0, "grad_norm": 1.5447440556940202, "language_loss": 0.91816604, "learning_rate": 3.129000827968184e-06, "loss": 0.95077795, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 1.36376953, "step": 129, "time_per_iteration": 3.036813497543335 }, { "auxiliary_loss_clip": 0.01771995, "auxiliary_loss_mlp": 0.0147112, "balance_loss_clip": 1.51097655, "balance_loss_mlp": 1.34804797, "epoch": 0.007816022846836013, "flos": 22648532428800.0, "grad_norm": 1.740957399337759, "language_loss": 1.06647468, "learning_rate": 3.133972684206866e-06, "loss": 1.0989058, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 1.22949219, "step": 130, "time_per_iteration": 3.003504514694214 }, { "auxiliary_loss_clip": 0.01766666, "auxiliary_loss_mlp": 0.01488458, "balance_loss_clip": 1.50428295, "balance_loss_mlp": 1.35494328, "epoch": 0.007876146099503984, "flos": 18190991629440.0, "grad_norm": 1.633591787982827, "language_loss": 0.91260767, "learning_rate": 3.138906441556014e-06, "loss": 0.94515896, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 1.3359375, "step": 131, "time_per_iteration": 2.9374780654907227 }, { "auxiliary_loss_clip": 0.01769541, "auxiliary_loss_mlp": 0.01500769, "balance_loss_clip": 1.51053822, "balance_loss_mlp": 1.36224818, "epoch": 0.007936269352171952, "flos": 27129265827840.0, "grad_norm": 1.7986380249471208, "language_loss": 0.91651535, "learning_rate": 3.143802679474861e-06, "loss": 0.94921839, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 1.38623047, "step": 132, "time_per_iteration": 3.0551247596740723 }, { "auxiliary_loss_clip": 0.01759184, "auxiliary_loss_mlp": 0.01509459, "balance_loss_clip": 1.49819088, "balance_loss_mlp": 1.3616395, "epoch": 0.007996392604839923, "flos": 19035750890880.0, "grad_norm": 1.7781138983719795, "language_loss": 1.04958653, "learning_rate": 3.1486619643025565e-06, "loss": 1.08227301, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 1.47851562, "step": 133, "time_per_iteration": 3.035219430923462 }, { "auxiliary_loss_clip": 0.01743115, "auxiliary_loss_mlp": 0.01522793, "balance_loss_clip": 1.48822713, "balance_loss_mlp": 1.35036814, "epoch": 0.008056515857507891, "flos": 25495199447040.0, "grad_norm": 1.4242186650854611, "language_loss": 0.80555087, "learning_rate": 3.153484849651286e-06, "loss": 0.83820999, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 1.72363281, "step": 134, "time_per_iteration": 2.9813473224639893 }, { "auxiliary_loss_clip": 0.01744416, "auxiliary_loss_mlp": 0.01483782, "balance_loss_clip": 1.48811674, "balance_loss_mlp": 1.35069668, "epoch": 0.00811663911017586, "flos": 20567075034240.0, "grad_norm": 2.360031493174988, "language_loss": 1.03100455, "learning_rate": 3.1582718767847806e-06, "loss": 1.06328654, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 1.33105469, "step": 135, "time_per_iteration": 2.951490879058838 }, { "auxiliary_loss_clip": 0.01739459, "auxiliary_loss_mlp": 0.01474254, "balance_loss_clip": 1.48635364, "balance_loss_mlp": 1.34856021, "epoch": 0.00817676236284383, "flos": 18807101078400.0, "grad_norm": 2.0791676106209125, "language_loss": 1.03863001, "learning_rate": 3.1630235749828485e-06, "loss": 1.07076716, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 1.25585938, "step": 136, "time_per_iteration": 2.939040184020996 }, { "auxiliary_loss_clip": 0.01731514, "auxiliary_loss_mlp": 0.01481807, "balance_loss_clip": 1.47920465, "balance_loss_mlp": 1.34752953, "epoch": 0.008236885615511799, "flos": 23882877832320.0, "grad_norm": 1.8035601242269115, "language_loss": 0.96884584, "learning_rate": 3.1677404618925676e-06, "loss": 1.00097907, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 1.34228516, "step": 137, "time_per_iteration": 3.0082385540008545 }, { "auxiliary_loss_clip": 0.01728285, "auxiliary_loss_mlp": 0.01482164, "balance_loss_clip": 1.4768219, "balance_loss_mlp": 1.34960294, "epoch": 0.00829700886817977, "flos": 24654105014400.0, "grad_norm": 1.4912579024644579, "language_loss": 0.97569871, "learning_rate": 3.1724230438666953e-06, "loss": 1.0078032, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 1.32519531, "step": 138, "time_per_iteration": 2.988132953643799 }, { "auxiliary_loss_clip": 0.01720391, "auxiliary_loss_mlp": 0.01497493, "balance_loss_clip": 1.46845675, "balance_loss_mlp": 1.34943545, "epoch": 0.008357132120847738, "flos": 25272159989760.0, "grad_norm": 1.7767025349051841, "language_loss": 0.98709744, "learning_rate": 3.177071816289865e-06, "loss": 1.01927638, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 1.48046875, "step": 139, "time_per_iteration": 3.0043931007385254 }, { "auxiliary_loss_clip": 0.01720858, "auxiliary_loss_mlp": 0.01470096, "balance_loss_clip": 1.46969604, "balance_loss_mlp": 1.34850264, "epoch": 0.008417255373515706, "flos": 27356422561920.0, "grad_norm": 1.8838243025125034, "language_loss": 1.01136756, "learning_rate": 3.181687263893095e-06, "loss": 1.04327714, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 1.21630859, "step": 140, "time_per_iteration": 3.014277458190918 }, { "auxiliary_loss_clip": 0.01716456, "auxiliary_loss_mlp": 0.01467969, "balance_loss_clip": 1.4673841, "balance_loss_mlp": 1.34856856, "epoch": 0.008477378626183677, "flos": 17647916567040.0, "grad_norm": 2.0274133416777254, "language_loss": 0.9790231, "learning_rate": 3.186269861057098e-06, "loss": 1.01086736, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 1.19433594, "step": 141, "time_per_iteration": 2.9487948417663574 }, { "auxiliary_loss_clip": 0.01712216, "auxiliary_loss_mlp": 0.01469021, "balance_loss_clip": 1.46430659, "balance_loss_mlp": 1.34814322, "epoch": 0.008537501878851645, "flos": 13889292480000.0, "grad_norm": 2.2138094256594463, "language_loss": 0.98319864, "learning_rate": 3.1908200721048745e-06, "loss": 1.01501107, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 1.20898438, "step": 142, "time_per_iteration": 3.000056028366089 }, { "auxiliary_loss_clip": 0.01760019, "auxiliary_loss_mlp": 0.01493213, "balance_loss_clip": 1.54942608, "balance_loss_mlp": 1.42950761, "epoch": 0.008597625131519616, "flos": 71283298567680.0, "grad_norm": 1.1725613867174547, "language_loss": 0.6717701, "learning_rate": 3.195338351584042e-06, "loss": 0.70430243, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.63671875, "step": 143, "time_per_iteration": 3.505128860473633 }, { "auxiliary_loss_clip": 0.01713448, "auxiliary_loss_mlp": 0.01477787, "balance_loss_clip": 1.46557307, "balance_loss_mlp": 1.35872102, "epoch": 0.008657748384187584, "flos": 17611738709760.0, "grad_norm": 1.7054812241713622, "language_loss": 0.9672817, "learning_rate": 3.1998251445393258e-06, "loss": 0.99919403, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 1.19140625, "step": 144, "time_per_iteration": 3.0075223445892334 }, { "auxiliary_loss_clip": 0.01715618, "auxiliary_loss_mlp": 0.01546279, "balance_loss_clip": 1.46553135, "balance_loss_mlp": 1.39264166, "epoch": 0.008717871636855555, "flos": 19723808851200.0, "grad_norm": 1.5138212695528415, "language_loss": 0.99771917, "learning_rate": 3.204280886775619e-06, "loss": 1.03033805, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 1.53613281, "step": 145, "time_per_iteration": 3.086247205734253 }, { "auxiliary_loss_clip": 0.01730241, "auxiliary_loss_mlp": 0.01551944, "balance_loss_clip": 1.47565234, "balance_loss_mlp": 1.41995525, "epoch": 0.008777994889523523, "flos": 24728134786560.0, "grad_norm": 1.520543892784519, "language_loss": 0.97562361, "learning_rate": 3.208706005112005e-06, "loss": 1.0084455, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 1.31933594, "step": 146, "time_per_iteration": 2.9851768016815186 }, { "auxiliary_loss_clip": 0.01717858, "auxiliary_loss_mlp": 0.0147494, "balance_loss_clip": 1.51347804, "balance_loss_mlp": 1.4371742, "epoch": 0.008838118142191492, "flos": 70161405050880.0, "grad_norm": 0.9327860305439852, "language_loss": 0.60406101, "learning_rate": 3.213100917627104e-06, "loss": 0.63598901, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.37695312, "step": 147, "time_per_iteration": 3.480670928955078 }, { "auxiliary_loss_clip": 0.01712017, "auxiliary_loss_mlp": 0.01493891, "balance_loss_clip": 1.46028781, "balance_loss_mlp": 1.37749517, "epoch": 0.008898241394859462, "flos": 20053662577920.0, "grad_norm": 1.7733948706889147, "language_loss": 0.91982633, "learning_rate": 3.2174660338961135e-06, "loss": 0.9518854, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 1.16455078, "step": 148, "time_per_iteration": 2.989441394805908 }, { "auxiliary_loss_clip": 0.01709266, "auxiliary_loss_mlp": 0.01483898, "balance_loss_clip": 1.46102834, "balance_loss_mlp": 1.3573451, "epoch": 0.008958364647527431, "flos": 10750759384320.0, "grad_norm": 1.846657737799584, "language_loss": 0.98415065, "learning_rate": 3.2218017552198588e-06, "loss": 1.01608229, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 1.265625, "step": 149, "time_per_iteration": 2.9582393169403076 }, { "auxiliary_loss_clip": 0.01707349, "auxiliary_loss_mlp": 0.01485741, "balance_loss_clip": 1.4617424, "balance_loss_mlp": 1.37788033, "epoch": 0.009018487900195401, "flos": 29138096039040.0, "grad_norm": 3.5482986287921734, "language_loss": 1.04718316, "learning_rate": 3.226108474846181e-06, "loss": 1.07911408, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 1.07861328, "step": 150, "time_per_iteration": 3.035717725753784 }, { "auxiliary_loss_clip": 0.01698054, "auxiliary_loss_mlp": 0.01502167, "balance_loss_clip": 1.45451343, "balance_loss_mlp": 1.40126812, "epoch": 0.00907861115286337, "flos": 32976631722240.0, "grad_norm": 1.7070340963942663, "language_loss": 0.85386127, "learning_rate": 3.2303865781839817e-06, "loss": 0.88586354, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 1.00927734, "step": 151, "time_per_iteration": 3.0584869384765625 }, { "auxiliary_loss_clip": 0.01714564, "auxiliary_loss_mlp": 0.01537387, "balance_loss_clip": 1.46676886, "balance_loss_mlp": 1.42208743, "epoch": 0.009138734405531338, "flos": 21772481748480.0, "grad_norm": 1.8963026642612548, "language_loss": 1.01383948, "learning_rate": 3.234636443010188e-06, "loss": 1.04635906, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 1.15283203, "step": 152, "time_per_iteration": 3.015085458755493 }, { "auxiliary_loss_clip": 0.01697658, "auxiliary_loss_mlp": 0.01505996, "balance_loss_clip": 1.453601, "balance_loss_mlp": 1.4037149, "epoch": 0.009198857658199309, "flos": 20850299354880.0, "grad_norm": 2.5095207462199944, "language_loss": 1.0263381, "learning_rate": 3.238858439669943e-06, "loss": 1.05837464, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 1.02294922, "step": 153, "time_per_iteration": 2.9863834381103516 }, { "auxiliary_loss_clip": 0.01686311, "auxiliary_loss_mlp": 0.01482566, "balance_loss_clip": 1.44646311, "balance_loss_mlp": 1.36826825, "epoch": 0.009258980910867277, "flos": 24838251926400.0, "grad_norm": 1.58784688596587, "language_loss": 0.9567551, "learning_rate": 3.2430529312702712e-06, "loss": 0.98844391, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 1.14306641, "step": 154, "time_per_iteration": 2.988161325454712 }, { "auxiliary_loss_clip": 0.01683941, "auxiliary_loss_mlp": 0.01451792, "balance_loss_clip": 1.44340765, "balance_loss_mlp": 1.34278738, "epoch": 0.009319104163535248, "flos": 28779665581440.0, "grad_norm": 1.7612335960786463, "language_loss": 0.96971238, "learning_rate": 3.2472202738674737e-06, "loss": 1.00106966, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 1.09033203, "step": 155, "time_per_iteration": 5.857283592224121 }, { "auxiliary_loss_clip": 0.01700492, "auxiliary_loss_mlp": 0.01467731, "balance_loss_clip": 1.45578206, "balance_loss_mlp": 1.3586781, "epoch": 0.009379227416203216, "flos": 16590840865920.0, "grad_norm": 1.9042846459747682, "language_loss": 0.99431241, "learning_rate": 3.2513608166485063e-06, "loss": 1.02599466, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 1.09033203, "step": 156, "time_per_iteration": 5.756657600402832 }, { "auxiliary_loss_clip": 0.01692196, "auxiliary_loss_mlp": 0.015152, "balance_loss_clip": 1.44911659, "balance_loss_mlp": 1.37849081, "epoch": 0.009439350668871187, "flos": 18338327256960.0, "grad_norm": 2.102312069099806, "language_loss": 1.1224153, "learning_rate": 3.2554749021065498e-06, "loss": 1.15448916, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 1.36767578, "step": 157, "time_per_iteration": 2.929694652557373 }, { "auxiliary_loss_clip": 0.01687177, "auxiliary_loss_mlp": 0.01494498, "balance_loss_clip": 1.44721782, "balance_loss_mlp": 1.38263166, "epoch": 0.009499473921539155, "flos": 24359840962560.0, "grad_norm": 1.7898147420392636, "language_loss": 0.99449217, "learning_rate": 3.2595628662110186e-06, "loss": 1.02630901, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 1.11865234, "step": 158, "time_per_iteration": 3.1394481658935547 }, { "auxiliary_loss_clip": 0.01677624, "auxiliary_loss_mlp": 0.01512768, "balance_loss_clip": 1.43753374, "balance_loss_mlp": 1.38488019, "epoch": 0.009559597174207124, "flos": 16408051297920.0, "grad_norm": 2.0099245126604957, "language_loss": 0.99955839, "learning_rate": 3.2636250385721982e-06, "loss": 1.03146219, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 1.27783203, "step": 159, "time_per_iteration": 3.36445689201355 }, { "auxiliary_loss_clip": 0.01676278, "auxiliary_loss_mlp": 0.01492735, "balance_loss_clip": 1.43387544, "balance_loss_mlp": 1.3758142, "epoch": 0.009619720426875094, "flos": 22867183140480.0, "grad_norm": 1.4472853637188445, "language_loss": 0.93441355, "learning_rate": 3.2676617426007263e-06, "loss": 0.96610367, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 1.16943359, "step": 160, "time_per_iteration": 3.147494316101074 }, { "auxiliary_loss_clip": 0.01671263, "auxiliary_loss_mlp": 0.01479541, "balance_loss_clip": 1.43019044, "balance_loss_mlp": 1.36600614, "epoch": 0.009679843679543063, "flos": 19144239217920.0, "grad_norm": 1.9611011875678692, "language_loss": 1.0494926, "learning_rate": 3.2716732956621042e-06, "loss": 1.08100057, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 1.13623047, "step": 161, "time_per_iteration": 3.083754539489746 }, { "auxiliary_loss_clip": 0.01667278, "auxiliary_loss_mlp": 0.01456613, "balance_loss_clip": 1.42786336, "balance_loss_mlp": 1.35352039, "epoch": 0.009739966932211033, "flos": 20312653668480.0, "grad_norm": 1.5627629857261016, "language_loss": 1.02950001, "learning_rate": 3.2756600092264203e-06, "loss": 1.0607388, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 1.03076172, "step": 162, "time_per_iteration": 3.0321624279022217 }, { "auxiliary_loss_clip": 0.01646538, "auxiliary_loss_mlp": 0.0147447, "balance_loss_clip": 1.45340502, "balance_loss_mlp": 1.41229022, "epoch": 0.009800090184879002, "flos": 67063664782080.0, "grad_norm": 1.2254356704368674, "language_loss": 0.72704124, "learning_rate": 3.279622189013474e-06, "loss": 0.75825131, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.62109375, "step": 163, "time_per_iteration": 3.41477370262146 }, { "auxiliary_loss_clip": 0.01654993, "auxiliary_loss_mlp": 0.01472794, "balance_loss_clip": 1.41852486, "balance_loss_mlp": 1.36169064, "epoch": 0.00986021343754697, "flos": 17173396656000.0, "grad_norm": 1.8904270217147379, "language_loss": 0.97955823, "learning_rate": 3.283560135133457e-06, "loss": 1.01083612, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 1.11083984, "step": 164, "time_per_iteration": 2.9637222290039062 }, { "auxiliary_loss_clip": 0.01657604, "auxiliary_loss_mlp": 0.01501523, "balance_loss_clip": 1.41857445, "balance_loss_mlp": 1.38236129, "epoch": 0.00992033669021494, "flos": 17758531399680.0, "grad_norm": 1.8437212215427194, "language_loss": 1.00707221, "learning_rate": 3.2874741422233565e-06, "loss": 1.03866363, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 1.19238281, "step": 165, "time_per_iteration": 3.0621936321258545 }, { "auxiliary_loss_clip": 0.01648986, "auxiliary_loss_mlp": 0.01488579, "balance_loss_clip": 1.41218567, "balance_loss_mlp": 1.39478469, "epoch": 0.00998045994288291, "flos": 25306980503040.0, "grad_norm": 1.5728534755454364, "language_loss": 0.90479118, "learning_rate": 3.2913644995792465e-06, "loss": 0.93616682, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.93847656, "step": 166, "time_per_iteration": 2.993326425552368 }, { "auxiliary_loss_clip": 0.01646425, "auxiliary_loss_mlp": 0.01485213, "balance_loss_clip": 1.41008055, "balance_loss_mlp": 1.38297963, "epoch": 0.01004058319555088, "flos": 32309685100800.0, "grad_norm": 1.8408636647865255, "language_loss": 1.01632929, "learning_rate": 3.2952314912845914e-06, "loss": 1.04764569, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 1.02197266, "step": 167, "time_per_iteration": 3.109844207763672 }, { "auxiliary_loss_clip": 0.0164675, "auxiliary_loss_mlp": 0.01458415, "balance_loss_clip": 1.41309142, "balance_loss_mlp": 1.37029552, "epoch": 0.010100706448218848, "flos": 11325306844800.0, "grad_norm": 2.0017223838491023, "language_loss": 1.06033587, "learning_rate": 3.299075396334735e-06, "loss": 1.09138751, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.88085938, "step": 168, "time_per_iteration": 3.099179267883301 }, { "auxiliary_loss_clip": 0.01648768, "auxiliary_loss_mlp": 0.01443044, "balance_loss_clip": 1.41379213, "balance_loss_mlp": 1.35058498, "epoch": 0.010160829700886819, "flos": 29732415477120.0, "grad_norm": 1.3823934475823065, "language_loss": 0.94224906, "learning_rate": 3.3028964887576868e-06, "loss": 0.97316718, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.92431641, "step": 169, "time_per_iteration": 3.0935239791870117 }, { "auxiliary_loss_clip": 0.01650068, "auxiliary_loss_mlp": 0.01440419, "balance_loss_clip": 1.41612494, "balance_loss_mlp": 1.35382497, "epoch": 0.010220952953554787, "flos": 20422001646720.0, "grad_norm": 1.4773365170470347, "language_loss": 0.94947594, "learning_rate": 3.306695037731344e-06, "loss": 0.98038083, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.86523438, "step": 170, "time_per_iteration": 3.008519172668457 }, { "auxiliary_loss_clip": 0.0167029, "auxiliary_loss_mlp": 0.01460026, "balance_loss_clip": 1.4290185, "balance_loss_mlp": 1.36637545, "epoch": 0.010281076206222756, "flos": 31297609992960.0, "grad_norm": 1.6130596964371056, "language_loss": 1.00798678, "learning_rate": 3.3104713076972827e-06, "loss": 1.03928983, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.93701172, "step": 171, "time_per_iteration": 3.111489772796631 }, { "auxiliary_loss_clip": 0.01650145, "auxiliary_loss_mlp": 0.01454002, "balance_loss_clip": 1.41761374, "balance_loss_mlp": 1.36197269, "epoch": 0.010341199458890726, "flos": 21992625538560.0, "grad_norm": 1.6574259950964503, "language_loss": 0.98412812, "learning_rate": 3.314225558471224e-06, "loss": 1.0151695, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.92041016, "step": 172, "time_per_iteration": 3.0119850635528564 }, { "auxiliary_loss_clip": 0.0164008, "auxiliary_loss_mlp": 0.01411296, "balance_loss_clip": 1.41633415, "balance_loss_mlp": 1.3497839, "epoch": 0.010401322711558695, "flos": 30822818613120.0, "grad_norm": 1.4251684275897927, "language_loss": 0.89418882, "learning_rate": 3.317958045350308e-06, "loss": 0.92470258, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.61425781, "step": 173, "time_per_iteration": 3.2355563640594482 }, { "auxiliary_loss_clip": 0.01645054, "auxiliary_loss_mlp": 0.01452845, "balance_loss_clip": 1.41507065, "balance_loss_mlp": 1.35022962, "epoch": 0.010461445964226665, "flos": 24725239119360.0, "grad_norm": 1.5776203616061641, "language_loss": 0.92519748, "learning_rate": 3.3216690192172596e-06, "loss": 0.95617652, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 1.02587891, "step": 174, "time_per_iteration": 3.2617673873901367 }, { "auxiliary_loss_clip": 0.01643694, "auxiliary_loss_mlp": 0.01442153, "balance_loss_clip": 1.40976715, "balance_loss_mlp": 1.34359097, "epoch": 0.010521569216894634, "flos": 27722137432320.0, "grad_norm": 1.6991857998138788, "language_loss": 0.82230949, "learning_rate": 3.325358726641591e-06, "loss": 0.85316801, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.98535156, "step": 175, "time_per_iteration": 3.1029117107391357 }, { "auxiliary_loss_clip": 0.01628773, "auxiliary_loss_mlp": 0.01463245, "balance_loss_clip": 1.40014338, "balance_loss_mlp": 1.34775496, "epoch": 0.010581692469562603, "flos": 12465913726080.0, "grad_norm": 3.1192036637042633, "language_loss": 1.0955677, "learning_rate": 3.329027409977902e-06, "loss": 1.12648797, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 1.15576172, "step": 176, "time_per_iteration": 3.086076498031616 }, { "auxiliary_loss_clip": 0.01620001, "auxiliary_loss_mlp": 0.01462779, "balance_loss_clip": 1.39549148, "balance_loss_mlp": 1.35844743, "epoch": 0.010641815722230573, "flos": 19437100680960.0, "grad_norm": 1.8462771906213882, "language_loss": 0.89843351, "learning_rate": 3.3326753074614087e-06, "loss": 0.92926133, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 1.04394531, "step": 177, "time_per_iteration": 3.0327537059783936 }, { "auxiliary_loss_clip": 0.01618232, "auxiliary_loss_mlp": 0.0143988, "balance_loss_clip": 1.39239311, "balance_loss_mlp": 1.35686231, "epoch": 0.010701938974898541, "flos": 18341675372160.0, "grad_norm": 2.103736868146801, "language_loss": 0.94473076, "learning_rate": 3.3363026533007716e-06, "loss": 0.97531188, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.83056641, "step": 178, "time_per_iteration": 2.995852470397949 }, { "auxiliary_loss_clip": 0.01610333, "auxiliary_loss_mlp": 0.01445442, "balance_loss_clip": 1.38644528, "balance_loss_mlp": 1.36266279, "epoch": 0.010762062227566512, "flos": 19212206186880.0, "grad_norm": 1.8843429627550965, "language_loss": 0.95800889, "learning_rate": 3.3399096777683303e-06, "loss": 0.98856664, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.82861328, "step": 179, "time_per_iteration": 3.0423085689544678 }, { "auxiliary_loss_clip": 0.01607489, "auxiliary_loss_mlp": 0.01454743, "balance_loss_clip": 1.38320935, "balance_loss_mlp": 1.3504113, "epoch": 0.01082218548023448, "flos": 31436484842880.0, "grad_norm": 1.812687952719425, "language_loss": 0.97297537, "learning_rate": 3.3434966072878213e-06, "loss": 1.00359762, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 1.04443359, "step": 180, "time_per_iteration": 3.0637238025665283 }, { "auxiliary_loss_clip": 0.01603919, "auxiliary_loss_mlp": 0.01428238, "balance_loss_clip": 1.37961936, "balance_loss_mlp": 1.34560251, "epoch": 0.01088230873290245, "flos": 25057400330880.0, "grad_norm": 1.7641694181066094, "language_loss": 0.88037115, "learning_rate": 3.3470636645196674e-06, "loss": 0.91069275, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.82568359, "step": 181, "time_per_iteration": 3.085604429244995 }, { "auxiliary_loss_clip": 0.01605751, "auxiliary_loss_mlp": 0.01464032, "balance_loss_clip": 1.37691689, "balance_loss_mlp": 1.34372616, "epoch": 0.01094243198557042, "flos": 22903722956160.0, "grad_norm": 2.1307819986005034, "language_loss": 0.99614644, "learning_rate": 3.3506110684439156e-06, "loss": 1.02684426, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 1.20410156, "step": 182, "time_per_iteration": 2.995382308959961 }, { "auxiliary_loss_clip": 0.01600262, "auxiliary_loss_mlp": 0.01448144, "balance_loss_clip": 1.37345481, "balance_loss_mlp": 1.35206163, "epoch": 0.011002555238238388, "flos": 17173849104000.0, "grad_norm": 1.8525369895595036, "language_loss": 1.01507986, "learning_rate": 3.3541390344409054e-06, "loss": 1.04556394, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.9609375, "step": 183, "time_per_iteration": 2.994041919708252 }, { "auxiliary_loss_clip": 0.01596406, "auxiliary_loss_mlp": 0.01471138, "balance_loss_clip": 1.37230945, "balance_loss_mlp": 1.36027384, "epoch": 0.011062678490906358, "flos": 22320443249280.0, "grad_norm": 1.7988334841039793, "language_loss": 0.98383915, "learning_rate": 3.357647774369736e-06, "loss": 1.01451468, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 1.10839844, "step": 184, "time_per_iteration": 3.105968475341797 }, { "auxiliary_loss_clip": 0.01589508, "auxiliary_loss_mlp": 0.0145589, "balance_loss_clip": 1.37041354, "balance_loss_mlp": 1.35399008, "epoch": 0.011122801743574327, "flos": 24398823997440.0, "grad_norm": 1.606355745395951, "language_loss": 0.94632804, "learning_rate": 3.3611374966446085e-06, "loss": 0.97678202, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 1.01904297, "step": 185, "time_per_iteration": 2.9795873165130615 }, { "auxiliary_loss_clip": 0.01590443, "auxiliary_loss_mlp": 0.01423736, "balance_loss_clip": 1.36884439, "balance_loss_mlp": 1.34000301, "epoch": 0.011182924996242297, "flos": 18159338252160.0, "grad_norm": 1.8106894182956066, "language_loss": 0.86341554, "learning_rate": 3.3646084063091142e-06, "loss": 0.89355731, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.83789062, "step": 186, "time_per_iteration": 2.9593544006347656 }, { "auxiliary_loss_clip": 0.01596973, "auxiliary_loss_mlp": 0.01433203, "balance_loss_clip": 1.37253225, "balance_loss_mlp": 1.34403408, "epoch": 0.011243048248910266, "flos": 15495189333120.0, "grad_norm": 2.1098877541394394, "language_loss": 1.16447306, "learning_rate": 3.3680607051085194e-06, "loss": 1.19477475, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.89111328, "step": 187, "time_per_iteration": 2.933722734451294 }, { "auxiliary_loss_clip": 0.01586543, "auxiliary_loss_mlp": 0.01437228, "balance_loss_clip": 1.36790895, "balance_loss_mlp": 1.34367251, "epoch": 0.011303171501578235, "flos": 40931724257280.0, "grad_norm": 1.3705166734980192, "language_loss": 0.82476997, "learning_rate": 3.371494591560139e-06, "loss": 0.85500765, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.93603516, "step": 188, "time_per_iteration": 3.143360137939453 }, { "auxiliary_loss_clip": 0.01586828, "auxiliary_loss_mlp": 0.01475275, "balance_loss_clip": 1.40591264, "balance_loss_mlp": 1.41767311, "epoch": 0.011363294754246205, "flos": 66331465879680.0, "grad_norm": 0.867892790654316, "language_loss": 0.56381094, "learning_rate": 3.3749102610218297e-06, "loss": 0.594432, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.57421875, "step": 189, "time_per_iteration": 4.888100624084473 }, { "auxiliary_loss_clip": 0.01582753, "auxiliary_loss_mlp": 0.01423865, "balance_loss_clip": 1.36430097, "balance_loss_mlp": 1.34556842, "epoch": 0.011423418006914174, "flos": 24911150578560.0, "grad_norm": 1.792213522382581, "language_loss": 1.06865633, "learning_rate": 3.3783079057586833e-06, "loss": 1.09872258, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.78320312, "step": 190, "time_per_iteration": 5.986846923828125 }, { "auxiliary_loss_clip": 0.01588894, "auxiliary_loss_mlp": 0.0144313, "balance_loss_clip": 1.36869991, "balance_loss_mlp": 1.36383212, "epoch": 0.011483541259582144, "flos": 19801322472960.0, "grad_norm": 1.9985818129320894, "language_loss": 0.98708963, "learning_rate": 3.3816877150079665e-06, "loss": 1.0174098, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.79345703, "step": 191, "time_per_iteration": 3.0297658443450928 }, { "auxiliary_loss_clip": 0.01586426, "auxiliary_loss_mlp": 0.01482982, "balance_loss_clip": 1.36770689, "balance_loss_mlp": 1.3854686, "epoch": 0.011543664512250112, "flos": 26188053356160.0, "grad_norm": 1.7106597154035308, "language_loss": 1.00054991, "learning_rate": 3.385049875042367e-06, "loss": 1.03124392, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.97460938, "step": 192, "time_per_iteration": 4.4699742794036865 }, { "auxiliary_loss_clip": 0.0159415, "auxiliary_loss_mlp": 0.0145886, "balance_loss_clip": 1.37441766, "balance_loss_mlp": 1.38027692, "epoch": 0.011603787764918083, "flos": 23779502167680.0, "grad_norm": 1.8966645310430514, "language_loss": 0.98729956, "learning_rate": 3.3883945692315938e-06, "loss": 1.01782966, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.78564453, "step": 193, "time_per_iteration": 3.0018441677093506 }, { "auxiliary_loss_clip": 0.01588637, "auxiliary_loss_mlp": 0.01457368, "balance_loss_clip": 1.3726126, "balance_loss_mlp": 1.38269556, "epoch": 0.011663911017586051, "flos": 25964878164480.0, "grad_norm": 1.7196389491649033, "language_loss": 1.03056097, "learning_rate": 3.3917219781023906e-06, "loss": 1.06102109, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.74707031, "step": 194, "time_per_iteration": 3.0725462436676025 }, { "auxiliary_loss_clip": 0.01600153, "auxiliary_loss_mlp": 0.01436216, "balance_loss_clip": 1.37777197, "balance_loss_mlp": 1.35343742, "epoch": 0.01172403427025402, "flos": 17904238214400.0, "grad_norm": 1.8620773395163162, "language_loss": 1.06341648, "learning_rate": 3.3950322793970014e-06, "loss": 1.09378028, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.82714844, "step": 195, "time_per_iteration": 2.960056781768799 }, { "auxiliary_loss_clip": 0.01598722, "auxiliary_loss_mlp": 0.01445266, "balance_loss_clip": 1.37920976, "balance_loss_mlp": 1.34513021, "epoch": 0.01178415752292199, "flos": 17903469052800.0, "grad_norm": 2.0132020838664912, "language_loss": 0.99307179, "learning_rate": 3.3983256481301445e-06, "loss": 1.02351165, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 1.00244141, "step": 196, "time_per_iteration": 2.994088649749756 }, { "auxiliary_loss_clip": 0.01595048, "auxiliary_loss_mlp": 0.01447349, "balance_loss_clip": 1.37689674, "balance_loss_mlp": 1.34354186, "epoch": 0.011844280775589959, "flos": 22903903935360.0, "grad_norm": 1.7019718340320535, "language_loss": 1.06016111, "learning_rate": 3.4016022566445335e-06, "loss": 1.09058499, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 1.03710938, "step": 197, "time_per_iteration": 3.001882791519165 }, { "auxiliary_loss_clip": 0.01588873, "auxiliary_loss_mlp": 0.01428747, "balance_loss_clip": 1.37480688, "balance_loss_mlp": 1.35250032, "epoch": 0.01190440402825793, "flos": 26991838811520.0, "grad_norm": 1.7998534599724627, "language_loss": 0.90103662, "learning_rate": 3.4048622746649966e-06, "loss": 0.93121278, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.76269531, "step": 198, "time_per_iteration": 3.0731801986694336 }, { "auxiliary_loss_clip": 0.01594941, "auxiliary_loss_mlp": 0.01455012, "balance_loss_clip": 1.3803103, "balance_loss_mlp": 1.36860907, "epoch": 0.011964527280925898, "flos": 20531213890560.0, "grad_norm": 1.5120346432146654, "language_loss": 0.94269764, "learning_rate": 3.4081058693512278e-06, "loss": 0.97319716, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.86425781, "step": 199, "time_per_iteration": 3.017489194869995 }, { "auxiliary_loss_clip": 0.01589983, "auxiliary_loss_mlp": 0.0149015, "balance_loss_clip": 1.375211, "balance_loss_mlp": 1.38085949, "epoch": 0.012024650533593867, "flos": 27757546128000.0, "grad_norm": 1.5399294366883403, "language_loss": 0.92364168, "learning_rate": 3.411333205349222e-06, "loss": 0.95444298, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 1.09179688, "step": 200, "time_per_iteration": 3.12546706199646 }, { "auxiliary_loss_clip": 0.01579263, "auxiliary_loss_mlp": 0.01453223, "balance_loss_clip": 1.36549819, "balance_loss_mlp": 1.36553311, "epoch": 0.012084773786261837, "flos": 10459164775680.0, "grad_norm": 1.6793761058769672, "language_loss": 1.0211432, "learning_rate": 3.4145444448414217e-06, "loss": 1.05146813, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.87695312, "step": 201, "time_per_iteration": 3.0311992168426514 }, { "auxiliary_loss_clip": 0.01580439, "auxiliary_loss_mlp": 0.01461107, "balance_loss_clip": 1.3682791, "balance_loss_mlp": 1.36335516, "epoch": 0.012144897038929806, "flos": 23114410583040.0, "grad_norm": 1.5656129433252899, "language_loss": 0.93370402, "learning_rate": 3.4177397475956223e-06, "loss": 0.96411949, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.97851562, "step": 202, "time_per_iteration": 3.074474334716797 }, { "auxiliary_loss_clip": 0.01568137, "auxiliary_loss_mlp": 0.01426323, "balance_loss_clip": 1.35298061, "balance_loss_mlp": 1.34588087, "epoch": 0.012205020291597776, "flos": 21043178513280.0, "grad_norm": 1.6077091105372745, "language_loss": 0.99539471, "learning_rate": 3.4209192710126685e-06, "loss": 1.02533925, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.80517578, "step": 203, "time_per_iteration": 2.967900037765503 }, { "auxiliary_loss_clip": 0.01578951, "auxiliary_loss_mlp": 0.01460392, "balance_loss_clip": 1.40472007, "balance_loss_mlp": 1.37341678, "epoch": 0.012265143544265745, "flos": 68475144153600.0, "grad_norm": 1.0529902784300085, "language_loss": 0.61530805, "learning_rate": 3.4240831701729837e-06, "loss": 0.64570153, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.87109375, "step": 204, "time_per_iteration": 3.2920563220977783 }, { "auxiliary_loss_clip": 0.0156772, "auxiliary_loss_mlp": 0.01448896, "balance_loss_clip": 1.35566282, "balance_loss_mlp": 1.35147834, "epoch": 0.012325266796933715, "flos": 17028051799680.0, "grad_norm": 1.901813848608763, "language_loss": 1.03444135, "learning_rate": 3.4272315978819516e-06, "loss": 1.06460762, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.97412109, "step": 205, "time_per_iteration": 2.985851526260376 }, { "auxiliary_loss_clip": 0.01570419, "auxiliary_loss_mlp": 0.01464171, "balance_loss_clip": 1.35819101, "balance_loss_mlp": 1.3507787, "epoch": 0.012385390049601683, "flos": 20198781210240.0, "grad_norm": 1.8404623470408281, "language_loss": 1.01589632, "learning_rate": 3.4303647047142043e-06, "loss": 1.04624224, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 1.13427734, "step": 206, "time_per_iteration": 2.9894423484802246 }, { "auxiliary_loss_clip": 0.01570843, "auxiliary_loss_mlp": 0.01455271, "balance_loss_clip": 1.35698056, "balance_loss_mlp": 1.35885477, "epoch": 0.012445513302269652, "flos": 16261756300800.0, "grad_norm": 1.7020108087685581, "language_loss": 1.08177805, "learning_rate": 3.43348263905683e-06, "loss": 1.11203933, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.96435547, "step": 207, "time_per_iteration": 2.9436988830566406 }, { "auxiliary_loss_clip": 0.0156513, "auxiliary_loss_mlp": 0.01454605, "balance_loss_clip": 1.35251713, "balance_loss_mlp": 1.34679234, "epoch": 0.012505636554937622, "flos": 23779999860480.0, "grad_norm": 1.5428073141987484, "language_loss": 0.85805249, "learning_rate": 3.436585547151547e-06, "loss": 0.88824981, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 1.07910156, "step": 208, "time_per_iteration": 3.064143657684326 }, { "auxiliary_loss_clip": 0.01555433, "auxiliary_loss_mlp": 0.01426469, "balance_loss_clip": 1.34742963, "balance_loss_mlp": 1.34531164, "epoch": 0.012565759807605591, "flos": 30603760698240.0, "grad_norm": 1.851827969060184, "language_loss": 1.10349154, "learning_rate": 3.4396735731358586e-06, "loss": 1.13331056, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.81201172, "step": 209, "time_per_iteration": 2.996107816696167 }, { "auxiliary_loss_clip": 0.01555773, "auxiliary_loss_mlp": 0.01444747, "balance_loss_clip": 1.34767008, "balance_loss_mlp": 1.34737718, "epoch": 0.012625883060273561, "flos": 40127984046720.0, "grad_norm": 2.183608990792774, "language_loss": 0.97831374, "learning_rate": 3.4427468590832302e-06, "loss": 1.00831902, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.97363281, "step": 210, "time_per_iteration": 3.157759428024292 }, { "auxiliary_loss_clip": 0.01549514, "auxiliary_loss_mlp": 0.01449906, "balance_loss_clip": 1.3399713, "balance_loss_mlp": 1.34628963, "epoch": 0.01268600631294153, "flos": 27100553362560.0, "grad_norm": 1.8298789375655278, "language_loss": 1.04656506, "learning_rate": 3.445805545042314e-06, "loss": 1.07655931, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 1.03515625, "step": 211, "time_per_iteration": 2.976926565170288 }, { "auxiliary_loss_clip": 0.01545301, "auxiliary_loss_mlp": 0.01429967, "balance_loss_clip": 1.33997726, "balance_loss_mlp": 1.35996699, "epoch": 0.012746129565609499, "flos": 16991557228800.0, "grad_norm": 1.9047585529259456, "language_loss": 1.09408629, "learning_rate": 3.448849769075239e-06, "loss": 1.1238389, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.69970703, "step": 212, "time_per_iteration": 2.9629440307617188 }, { "auxiliary_loss_clip": 0.01542534, "auxiliary_loss_mlp": 0.01446048, "balance_loss_clip": 1.33732605, "balance_loss_mlp": 1.36150515, "epoch": 0.012806252818277469, "flos": 46549218729600.0, "grad_norm": 1.5140581898646686, "language_loss": 0.86952293, "learning_rate": 3.4518796672950093e-06, "loss": 0.89940876, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.84472656, "step": 213, "time_per_iteration": 3.3281195163726807 }, { "auxiliary_loss_clip": 0.01543328, "auxiliary_loss_mlp": 0.01443704, "balance_loss_clip": 1.33556056, "balance_loss_mlp": 1.34929061, "epoch": 0.012866376070945438, "flos": 14395782481920.0, "grad_norm": 2.0071758723376036, "language_loss": 1.00026679, "learning_rate": 3.4548953739020187e-06, "loss": 1.03013706, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.94482422, "step": 214, "time_per_iteration": 3.098564386367798 }, { "auxiliary_loss_clip": 0.01540683, "auxiliary_loss_mlp": 0.01431085, "balance_loss_clip": 1.33509338, "balance_loss_mlp": 1.34492087, "epoch": 0.012926499323613408, "flos": 26152011233280.0, "grad_norm": 1.7264916970998099, "language_loss": 0.90226293, "learning_rate": 3.4578970212197196e-06, "loss": 0.93198061, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.86181641, "step": 215, "time_per_iteration": 3.114870309829712 }, { "auxiliary_loss_clip": 0.0154069, "auxiliary_loss_mlp": 0.01419949, "balance_loss_clip": 1.3332057, "balance_loss_mlp": 1.34408414, "epoch": 0.012986622576281377, "flos": 30129286032000.0, "grad_norm": 1.8948763808052436, "language_loss": 1.05602026, "learning_rate": 3.460884739729461e-06, "loss": 1.0856266, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.75878906, "step": 216, "time_per_iteration": 3.1361331939697266 }, { "auxiliary_loss_clip": 0.01547965, "auxiliary_loss_mlp": 0.01434006, "balance_loss_clip": 1.33968568, "balance_loss_mlp": 1.34560049, "epoch": 0.013046745828949347, "flos": 13961919663360.0, "grad_norm": 2.1322378241289504, "language_loss": 1.11902142, "learning_rate": 3.463858658104523e-06, "loss": 1.14884114, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.88525391, "step": 217, "time_per_iteration": 3.0592234134674072 }, { "auxiliary_loss_clip": 0.01547375, "auxiliary_loss_mlp": 0.01429443, "balance_loss_clip": 1.34041226, "balance_loss_mlp": 1.35262465, "epoch": 0.013106869081617315, "flos": 17356774406400.0, "grad_norm": 1.6561432610172337, "language_loss": 1.03728271, "learning_rate": 3.4668189032433696e-06, "loss": 1.06705093, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.76855469, "step": 218, "time_per_iteration": 3.036843776702881 }, { "auxiliary_loss_clip": 0.01547172, "auxiliary_loss_mlp": 0.01439344, "balance_loss_clip": 1.34101033, "balance_loss_mlp": 1.35761392, "epoch": 0.013166992334285284, "flos": 25895915809920.0, "grad_norm": 1.6917249011666626, "language_loss": 0.97110111, "learning_rate": 3.46976560030214e-06, "loss": 1.00096631, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.81640625, "step": 219, "time_per_iteration": 3.0055127143859863 }, { "auxiliary_loss_clip": 0.01548611, "auxiliary_loss_mlp": 0.0142526, "balance_loss_clip": 1.3441056, "balance_loss_mlp": 1.35444951, "epoch": 0.013227115586953254, "flos": 31188488238720.0, "grad_norm": 1.514770208155254, "language_loss": 0.9790594, "learning_rate": 3.4726988727263976e-06, "loss": 1.00879812, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.70703125, "step": 220, "time_per_iteration": 3.127387285232544 }, { "auxiliary_loss_clip": 0.01550258, "auxiliary_loss_mlp": 0.01412156, "balance_loss_clip": 1.34800196, "balance_loss_mlp": 1.3456372, "epoch": 0.013287238839621223, "flos": 20418382062720.0, "grad_norm": 1.624010904639624, "language_loss": 0.97542638, "learning_rate": 3.475618842282164e-06, "loss": 1.00505042, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.66552734, "step": 221, "time_per_iteration": 3.010651111602783 }, { "auxiliary_loss_clip": 0.01545648, "auxiliary_loss_mlp": 0.01412944, "balance_loss_clip": 1.34445381, "balance_loss_mlp": 1.33970201, "epoch": 0.013347362092289193, "flos": 14145252168960.0, "grad_norm": 1.7979664062556922, "language_loss": 1.06572032, "learning_rate": 3.4785256290862486e-06, "loss": 1.0953064, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.73291016, "step": 222, "time_per_iteration": 2.9702508449554443 }, { "auxiliary_loss_clip": 0.01544633, "auxiliary_loss_mlp": 0.0143031, "balance_loss_clip": 1.34095025, "balance_loss_mlp": 1.35449314, "epoch": 0.013407485344957162, "flos": 21807437996160.0, "grad_norm": 1.8929668477572879, "language_loss": 1.0565865, "learning_rate": 3.481419351635897e-06, "loss": 1.08633602, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.75830078, "step": 223, "time_per_iteration": 3.011080265045166 }, { "auxiliary_loss_clip": 0.01538105, "auxiliary_loss_mlp": 0.01440734, "balance_loss_clip": 1.33741093, "balance_loss_mlp": 1.36997163, "epoch": 0.013467608597625132, "flos": 18630057600000.0, "grad_norm": 2.0340043540875357, "language_loss": 1.01779819, "learning_rate": 3.484300126837776e-06, "loss": 1.04758656, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.70703125, "step": 224, "time_per_iteration": 4.353667736053467 }, { "auxiliary_loss_clip": 0.01537377, "auxiliary_loss_mlp": 0.01461071, "balance_loss_clip": 1.33349156, "balance_loss_mlp": 1.38396645, "epoch": 0.013527731850293101, "flos": 18561683427840.0, "grad_norm": 1.6696514359732253, "language_loss": 1.0311991, "learning_rate": 3.487168070036317e-06, "loss": 1.06118357, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.77197266, "step": 225, "time_per_iteration": 6.091062307357788 }, { "auxiliary_loss_clip": 0.01545256, "auxiliary_loss_mlp": 0.01462484, "balance_loss_clip": 1.33913267, "balance_loss_mlp": 1.3785603, "epoch": 0.01358785510296107, "flos": 19173042172800.0, "grad_norm": 1.6628444424216993, "language_loss": 1.0870378, "learning_rate": 3.4900232950414224e-06, "loss": 1.11711514, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.83935547, "step": 226, "time_per_iteration": 4.461630582809448 }, { "auxiliary_loss_clip": 0.01542054, "auxiliary_loss_mlp": 0.01419012, "balance_loss_clip": 1.33768368, "balance_loss_mlp": 1.35344672, "epoch": 0.01364797835562904, "flos": 23340028993920.0, "grad_norm": 1.9115911600531583, "language_loss": 1.03947318, "learning_rate": 3.4928659141555727e-06, "loss": 1.06908381, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.65625, "step": 227, "time_per_iteration": 3.015738010406494 }, { "auxiliary_loss_clip": 0.01533437, "auxiliary_loss_mlp": 0.01392338, "balance_loss_clip": 1.36927652, "balance_loss_mlp": 1.34503591, "epoch": 0.013708101608297009, "flos": 71029148670720.0, "grad_norm": 0.9499424665603502, "language_loss": 0.57789373, "learning_rate": 3.4956960382003234e-06, "loss": 0.60715151, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.47265625, "step": 228, "time_per_iteration": 3.4632973670959473 }, { "auxiliary_loss_clip": 0.01538427, "auxiliary_loss_mlp": 0.01401956, "balance_loss_clip": 1.33955169, "balance_loss_mlp": 1.3410641, "epoch": 0.013768224860964979, "flos": 16333569077760.0, "grad_norm": 2.0534891648260056, "language_loss": 1.01966214, "learning_rate": 3.4985137765422354e-06, "loss": 1.04906595, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.60839844, "step": 229, "time_per_iteration": 3.0026321411132812 }, { "auxiliary_loss_clip": 0.01546391, "auxiliary_loss_mlp": 0.01402634, "balance_loss_clip": 1.34813881, "balance_loss_mlp": 1.34107423, "epoch": 0.013828348113632948, "flos": 20202536528640.0, "grad_norm": 2.039958872605183, "language_loss": 0.98587745, "learning_rate": 3.501319237118231e-06, "loss": 1.01536775, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.61572266, "step": 230, "time_per_iteration": 2.9712865352630615 }, { "auxiliary_loss_clip": 0.01537479, "auxiliary_loss_mlp": 0.01412425, "balance_loss_clip": 1.33900666, "balance_loss_mlp": 1.34106636, "epoch": 0.013888471366300916, "flos": 20750905232640.0, "grad_norm": 1.5512726377315005, "language_loss": 1.00041687, "learning_rate": 3.5041125264604056e-06, "loss": 1.02991593, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.71337891, "step": 231, "time_per_iteration": 3.0177319049835205 }, { "auxiliary_loss_clip": 0.01543093, "auxiliary_loss_mlp": 0.01407073, "balance_loss_clip": 1.34452498, "balance_loss_mlp": 1.34355879, "epoch": 0.013948594618968886, "flos": 22100706662400.0, "grad_norm": 1.6557998835614465, "language_loss": 0.986265, "learning_rate": 3.5068937497203002e-06, "loss": 1.01576674, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.63476562, "step": 232, "time_per_iteration": 3.034668445587158 }, { "auxiliary_loss_clip": 0.01538857, "auxiliary_loss_mlp": 0.01426975, "balance_loss_clip": 1.34251785, "balance_loss_mlp": 1.34004736, "epoch": 0.014008717871636855, "flos": 19072607420160.0, "grad_norm": 2.360152341163756, "language_loss": 0.89624757, "learning_rate": 3.509663010692652e-06, "loss": 0.92590582, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.86865234, "step": 233, "time_per_iteration": 3.021674394607544 }, { "auxiliary_loss_clip": 0.01544517, "auxiliary_loss_mlp": 0.01423078, "balance_loss_clip": 1.34542036, "balance_loss_mlp": 1.34616458, "epoch": 0.014068841124304825, "flos": 14537055306240.0, "grad_norm": 1.823594101367171, "language_loss": 0.99260366, "learning_rate": 3.512420411838642e-06, "loss": 1.02227962, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.76953125, "step": 234, "time_per_iteration": 2.9990434646606445 }, { "auxiliary_loss_clip": 0.01541841, "auxiliary_loss_mlp": 0.0141043, "balance_loss_clip": 1.34432316, "balance_loss_mlp": 1.34381568, "epoch": 0.014128964376972794, "flos": 18086484844800.0, "grad_norm": 2.106158128252329, "language_loss": 1.06949103, "learning_rate": 3.515166054308634e-06, "loss": 1.09901381, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.66625977, "step": 235, "time_per_iteration": 2.953233480453491 }, { "auxiliary_loss_clip": 0.01540034, "auxiliary_loss_mlp": 0.01425278, "balance_loss_clip": 1.33978438, "balance_loss_mlp": 1.34030616, "epoch": 0.014189087629640764, "flos": 25344334725120.0, "grad_norm": 1.973853259890551, "language_loss": 0.95837873, "learning_rate": 3.5179000379644498e-06, "loss": 0.98803186, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.85009766, "step": 236, "time_per_iteration": 3.108091354370117 }, { "auxiliary_loss_clip": 0.01534465, "auxiliary_loss_mlp": 0.01411695, "balance_loss_clip": 1.33595824, "balance_loss_mlp": 1.3408848, "epoch": 0.014249210882308733, "flos": 36154012118400.0, "grad_norm": 1.6472033711304293, "language_loss": 0.94469661, "learning_rate": 3.520622461401154e-06, "loss": 0.97415823, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.70703125, "step": 237, "time_per_iteration": 3.147148370742798 }, { "auxiliary_loss_clip": 0.01531647, "auxiliary_loss_mlp": 0.01441091, "balance_loss_clip": 1.33193135, "balance_loss_mlp": 1.34105003, "epoch": 0.014309334134976702, "flos": 12940750350720.0, "grad_norm": 1.6857618712516538, "language_loss": 0.92148811, "learning_rate": 3.5233334219683935e-06, "loss": 0.95121545, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 1.00048828, "step": 238, "time_per_iteration": 3.0487539768218994 }, { "auxiliary_loss_clip": 0.01526684, "auxiliary_loss_mlp": 0.01433568, "balance_loss_clip": 1.33139622, "balance_loss_mlp": 1.35121799, "epoch": 0.014369457387644672, "flos": 20787354558720.0, "grad_norm": 1.4702974274791385, "language_loss": 0.97015762, "learning_rate": 3.526033015791284e-06, "loss": 0.99976009, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.82324219, "step": 239, "time_per_iteration": 3.03141188621521 }, { "auxiliary_loss_clip": 0.01517616, "auxiliary_loss_mlp": 0.0139697, "balance_loss_clip": 1.32820475, "balance_loss_mlp": 1.34435058, "epoch": 0.01442958064031264, "flos": 25859556973440.0, "grad_norm": 1.8281698220235636, "language_loss": 1.02019739, "learning_rate": 3.528721337790862e-06, "loss": 1.04934335, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.52636719, "step": 240, "time_per_iteration": 3.03023624420166 }, { "auxiliary_loss_clip": 0.01537564, "auxiliary_loss_mlp": 0.01433203, "balance_loss_clip": 1.33819675, "balance_loss_mlp": 1.34317613, "epoch": 0.014489703892980611, "flos": 28231251632640.0, "grad_norm": 1.5706241175056228, "language_loss": 0.95799619, "learning_rate": 3.531398481704111e-06, "loss": 0.98770386, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.90014648, "step": 241, "time_per_iteration": 3.0657451152801514 }, { "auxiliary_loss_clip": 0.01537643, "auxiliary_loss_mlp": 0.01391238, "balance_loss_clip": 1.34420371, "balance_loss_mlp": 1.34145594, "epoch": 0.01454982714564858, "flos": 22500834842880.0, "grad_norm": 1.57702522257575, "language_loss": 0.97568578, "learning_rate": 3.534064540103573e-06, "loss": 1.0049746, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.49755859, "step": 242, "time_per_iteration": 3.0432698726654053 }, { "auxiliary_loss_clip": 0.01535947, "auxiliary_loss_mlp": 0.01395094, "balance_loss_clip": 1.34300327, "balance_loss_mlp": 1.34161711, "epoch": 0.014609950398316548, "flos": 21663269504640.0, "grad_norm": 1.7671406578339897, "language_loss": 0.96698225, "learning_rate": 3.536719604416555e-06, "loss": 0.99629271, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.53515625, "step": 243, "time_per_iteration": 2.973078966140747 }, { "auxiliary_loss_clip": 0.0153846, "auxiliary_loss_mlp": 0.01402678, "balance_loss_clip": 1.34361792, "balance_loss_mlp": 1.34672117, "epoch": 0.014670073650984519, "flos": 21879567486720.0, "grad_norm": 1.5513554859779264, "language_loss": 0.94136047, "learning_rate": 3.5393637649439464e-06, "loss": 0.97077185, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.55981445, "step": 244, "time_per_iteration": 3.0259532928466797 }, { "auxiliary_loss_clip": 0.01549616, "auxiliary_loss_mlp": 0.01429884, "balance_loss_clip": 1.34524202, "balance_loss_mlp": 1.35182548, "epoch": 0.014730196903652487, "flos": 23193688752000.0, "grad_norm": 2.136069430411029, "language_loss": 0.96741927, "learning_rate": 3.54199711087864e-06, "loss": 0.9972142, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.78027344, "step": 245, "time_per_iteration": 2.927734136581421 }, { "auxiliary_loss_clip": 0.01521465, "auxiliary_loss_mlp": 0.01422144, "balance_loss_clip": 1.3281002, "balance_loss_mlp": 1.35114336, "epoch": 0.014790320156320457, "flos": 23233214724480.0, "grad_norm": 1.742875451921813, "language_loss": 0.94243884, "learning_rate": 3.5446197303235913e-06, "loss": 0.97187495, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.7097168, "step": 246, "time_per_iteration": 3.0417020320892334 }, { "auxiliary_loss_clip": 0.01514138, "auxiliary_loss_mlp": 0.01425263, "balance_loss_clip": 1.32048953, "balance_loss_mlp": 1.35464299, "epoch": 0.014850443408988426, "flos": 15824047674240.0, "grad_norm": 1.516718986174222, "language_loss": 1.00428391, "learning_rate": 3.5472317103095034e-06, "loss": 1.03367805, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.70654297, "step": 247, "time_per_iteration": 3.0131680965423584 }, { "auxiliary_loss_clip": 0.01503631, "auxiliary_loss_mlp": 0.01416549, "balance_loss_clip": 1.3083998, "balance_loss_mlp": 1.34864712, "epoch": 0.014910566661656396, "flos": 22791343576320.0, "grad_norm": 1.7280001728929737, "language_loss": 0.90848935, "learning_rate": 3.549833136812155e-06, "loss": 0.93769115, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.67871094, "step": 248, "time_per_iteration": 3.0582101345062256 }, { "auxiliary_loss_clip": 0.01501963, "auxiliary_loss_mlp": 0.01417489, "balance_loss_clip": 1.30928552, "balance_loss_mlp": 1.34977841, "epoch": 0.014970689914324365, "flos": 26874980196480.0, "grad_norm": 1.620061910370753, "language_loss": 0.93865281, "learning_rate": 3.552424094769381e-06, "loss": 0.96784729, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.67626953, "step": 249, "time_per_iteration": 3.1427114009857178 }, { "auxiliary_loss_clip": 0.0149497, "auxiliary_loss_mlp": 0.014032, "balance_loss_clip": 1.30651999, "balance_loss_mlp": 1.34166384, "epoch": 0.015030813166992334, "flos": 13992306186240.0, "grad_norm": 1.9939782907251342, "language_loss": 1.06728542, "learning_rate": 3.5550046680977174e-06, "loss": 1.09626722, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.61547852, "step": 250, "time_per_iteration": 2.955688714981079 }, { "auxiliary_loss_clip": 0.01507225, "auxiliary_loss_mlp": 0.01430628, "balance_loss_clip": 1.31432784, "balance_loss_mlp": 1.34761024, "epoch": 0.015090936419660304, "flos": 24728632479360.0, "grad_norm": 1.75651532953761, "language_loss": 1.09728301, "learning_rate": 3.5575749397087034e-06, "loss": 1.12666154, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.83105469, "step": 251, "time_per_iteration": 3.048722505569458 }, { "auxiliary_loss_clip": 0.01499759, "auxiliary_loss_mlp": 0.0141744, "balance_loss_clip": 1.30865979, "balance_loss_mlp": 1.34953868, "epoch": 0.015151059672328273, "flos": 25749711302400.0, "grad_norm": 1.7049632383354647, "language_loss": 0.97725958, "learning_rate": 3.5601349915248707e-06, "loss": 1.00643158, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.6796875, "step": 252, "time_per_iteration": 3.11375093460083 }, { "auxiliary_loss_clip": 0.01496236, "auxiliary_loss_mlp": 0.01421074, "balance_loss_clip": 1.30496168, "balance_loss_mlp": 1.3498348, "epoch": 0.015211182924996243, "flos": 21881105809920.0, "grad_norm": 1.9149137336692048, "language_loss": 1.12649786, "learning_rate": 3.5626849044954064e-06, "loss": 1.15567088, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.71240234, "step": 253, "time_per_iteration": 2.958259344100952 }, { "auxiliary_loss_clip": 0.01521347, "auxiliary_loss_mlp": 0.01427505, "balance_loss_clip": 1.36887252, "balance_loss_mlp": 1.39221895, "epoch": 0.015271306177664212, "flos": 66926237765760.0, "grad_norm": 0.9097627796219767, "language_loss": 0.55826509, "learning_rate": 3.5652247586115167e-06, "loss": 0.58775359, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.35351562, "step": 254, "time_per_iteration": 3.3988163471221924 }, { "auxiliary_loss_clip": 0.01493778, "auxiliary_loss_mlp": 0.01430392, "balance_loss_clip": 1.30201626, "balance_loss_mlp": 1.36096442, "epoch": 0.01533142943033218, "flos": 26845453324800.0, "grad_norm": 1.67899170853955, "language_loss": 1.0334152, "learning_rate": 3.567754632921479e-06, "loss": 1.06265688, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.69433594, "step": 255, "time_per_iteration": 3.1387276649475098 }, { "auxiliary_loss_clip": 0.01492574, "auxiliary_loss_mlp": 0.01451185, "balance_loss_clip": 1.30023885, "balance_loss_mlp": 1.38118553, "epoch": 0.01539155268300015, "flos": 20823532416000.0, "grad_norm": 1.8500275296933248, "language_loss": 0.97365046, "learning_rate": 3.5702746055454075e-06, "loss": 1.003088, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.70019531, "step": 256, "time_per_iteration": 3.1409034729003906 }, { "auxiliary_loss_clip": 0.01499804, "auxiliary_loss_mlp": 0.01443717, "balance_loss_clip": 1.30421281, "balance_loss_mlp": 1.36346483, "epoch": 0.01545167593566812, "flos": 15970568895360.0, "grad_norm": 2.306947397733322, "language_loss": 0.92229998, "learning_rate": 3.5727847536897254e-06, "loss": 0.95173526, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.80273438, "step": 257, "time_per_iteration": 2.9621479511260986 }, { "auxiliary_loss_clip": 0.01494134, "auxiliary_loss_mlp": 0.01451152, "balance_loss_clip": 1.2997936, "balance_loss_mlp": 1.34953809, "epoch": 0.01551179918833609, "flos": 22612037857920.0, "grad_norm": 1.8186499191267613, "language_loss": 1.05892444, "learning_rate": 3.5752851536613596e-06, "loss": 1.08837736, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 1.01464844, "step": 258, "time_per_iteration": 3.1616013050079346 }, { "auxiliary_loss_clip": 0.01493301, "auxiliary_loss_mlp": 0.01412727, "balance_loss_clip": 1.3009882, "balance_loss_mlp": 1.34010458, "epoch": 0.015571922441004058, "flos": 22826028355200.0, "grad_norm": 1.9649827033688687, "language_loss": 1.02177918, "learning_rate": 3.577775880881658e-06, "loss": 1.05083942, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.72607422, "step": 259, "time_per_iteration": 4.476131916046143 }, { "auxiliary_loss_clip": 0.01496346, "auxiliary_loss_mlp": 0.01412601, "balance_loss_clip": 1.30462313, "balance_loss_mlp": 1.3415997, "epoch": 0.015632045693672027, "flos": 18955296357120.0, "grad_norm": 1.6180244478368793, "language_loss": 1.03253531, "learning_rate": 3.5802570099000424e-06, "loss": 1.06162488, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.71044922, "step": 260, "time_per_iteration": 4.6127049922943115 }, { "auxiliary_loss_clip": 0.01504015, "auxiliary_loss_mlp": 0.01423853, "balance_loss_clip": 1.30994344, "balance_loss_mlp": 1.34684408, "epoch": 0.015692168946339995, "flos": 29983895930880.0, "grad_norm": 1.7491298145402792, "language_loss": 1.02591562, "learning_rate": 3.5827286144073947e-06, "loss": 1.05519426, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.76904297, "step": 261, "time_per_iteration": 4.452184438705444 }, { "auxiliary_loss_clip": 0.01516066, "auxiliary_loss_mlp": 0.01442714, "balance_loss_clip": 1.32149994, "balance_loss_mlp": 1.35960126, "epoch": 0.015752292199007967, "flos": 19401827719680.0, "grad_norm": 1.5811001112028633, "language_loss": 0.77230787, "learning_rate": 3.5851907672491904e-06, "loss": 0.80189562, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.83105469, "step": 262, "time_per_iteration": 3.0023179054260254 }, { "auxiliary_loss_clip": 0.01512664, "auxiliary_loss_mlp": 0.01441328, "balance_loss_clip": 1.31741226, "balance_loss_mlp": 1.35325599, "epoch": 0.015812415451675936, "flos": 20349555442560.0, "grad_norm": 1.6833239508127824, "language_loss": 0.8233102, "learning_rate": 3.587643540438383e-06, "loss": 0.85285014, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.88085938, "step": 263, "time_per_iteration": 2.983851432800293 }, { "auxiliary_loss_clip": 0.01510672, "auxiliary_loss_mlp": 0.01433856, "balance_loss_clip": 1.31800413, "balance_loss_mlp": 1.35145867, "epoch": 0.015872538704343905, "flos": 17533139212800.0, "grad_norm": 2.086961113888894, "language_loss": 1.03318727, "learning_rate": 3.590087005168037e-06, "loss": 1.06263256, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.82421875, "step": 264, "time_per_iteration": 2.9981093406677246 }, { "auxiliary_loss_clip": 0.01508437, "auxiliary_loss_mlp": 0.01396569, "balance_loss_clip": 1.31915498, "balance_loss_mlp": 1.34268665, "epoch": 0.015932661957011873, "flos": 15266901479040.0, "grad_norm": 1.9130482137158251, "language_loss": 1.14311886, "learning_rate": 3.5925212318237344e-06, "loss": 1.17216897, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.53833008, "step": 265, "time_per_iteration": 2.989739179611206 }, { "auxiliary_loss_clip": 0.01511034, "auxiliary_loss_mlp": 0.01413019, "balance_loss_clip": 1.31729448, "balance_loss_mlp": 1.34027791, "epoch": 0.015992785209679845, "flos": 20312065486080.0, "grad_norm": 1.7922365078398237, "language_loss": 0.91443074, "learning_rate": 3.5949462899957323e-06, "loss": 0.94367129, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.7277832, "step": 266, "time_per_iteration": 3.0518651008605957 }, { "auxiliary_loss_clip": 0.01497402, "auxiliary_loss_mlp": 0.01397294, "balance_loss_clip": 1.30961037, "balance_loss_mlp": 1.34286296, "epoch": 0.016052908462347814, "flos": 23371410902400.0, "grad_norm": 1.899517010506227, "language_loss": 0.97124958, "learning_rate": 3.5973622484909068e-06, "loss": 1.00019646, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.54394531, "step": 267, "time_per_iteration": 3.119874954223633 }, { "auxiliary_loss_clip": 0.01500339, "auxiliary_loss_mlp": 0.01401077, "balance_loss_clip": 1.30966687, "balance_loss_mlp": 1.34035194, "epoch": 0.016113031715015783, "flos": 21296106800640.0, "grad_norm": 1.8967484384952342, "language_loss": 0.99606454, "learning_rate": 3.599769175344462e-06, "loss": 1.02507877, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.60693359, "step": 268, "time_per_iteration": 3.0038843154907227 }, { "auxiliary_loss_clip": 0.01499467, "auxiliary_loss_mlp": 0.01415009, "balance_loss_clip": 1.3107481, "balance_loss_mlp": 1.34324479, "epoch": 0.01617315496768375, "flos": 18923507245440.0, "grad_norm": 1.7709569959392844, "language_loss": 0.99513739, "learning_rate": 3.602167137831432e-06, "loss": 1.0242821, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.71728516, "step": 269, "time_per_iteration": 3.0612123012542725 }, { "auxiliary_loss_clip": 0.01497056, "auxiliary_loss_mlp": 0.01410551, "balance_loss_clip": 1.3065927, "balance_loss_mlp": 1.34124255, "epoch": 0.01623327822035172, "flos": 16555567904640.0, "grad_norm": 1.8287034574259209, "language_loss": 1.07515216, "learning_rate": 3.6045562024779565e-06, "loss": 1.10422826, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.69311523, "step": 270, "time_per_iteration": 3.061020612716675 }, { "auxiliary_loss_clip": 0.01488502, "auxiliary_loss_mlp": 0.01410139, "balance_loss_clip": 1.30125809, "balance_loss_mlp": 1.34247553, "epoch": 0.016293401473019692, "flos": 23523904437120.0, "grad_norm": 1.7254651216900898, "language_loss": 0.98590124, "learning_rate": 3.606936435072361e-06, "loss": 1.01488769, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.67675781, "step": 271, "time_per_iteration": 3.0728659629821777 }, { "auxiliary_loss_clip": 0.01495146, "auxiliary_loss_mlp": 0.01404902, "balance_loss_clip": 1.30626762, "balance_loss_mlp": 1.34489191, "epoch": 0.01635352472568766, "flos": 29026169107200.0, "grad_norm": 1.9776424046432988, "language_loss": 0.96737254, "learning_rate": 3.609307900676025e-06, "loss": 0.99637306, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.59985352, "step": 272, "time_per_iteration": 3.103900909423828 }, { "auxiliary_loss_clip": 0.01490369, "auxiliary_loss_mlp": 0.01414603, "balance_loss_clip": 1.30223227, "balance_loss_mlp": 1.34786987, "epoch": 0.01641364797835563, "flos": 13378323242880.0, "grad_norm": 1.8709334937543125, "language_loss": 0.9319911, "learning_rate": 3.611670663634051e-06, "loss": 0.96104085, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.66772461, "step": 273, "time_per_iteration": 3.0304439067840576 }, { "auxiliary_loss_clip": 0.01499912, "auxiliary_loss_mlp": 0.01423625, "balance_loss_clip": 1.30913794, "balance_loss_mlp": 1.35326767, "epoch": 0.016473771231023598, "flos": 18887057919360.0, "grad_norm": 1.7943621424665783, "language_loss": 1.05686402, "learning_rate": 3.614024787585744e-06, "loss": 1.08609939, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.70410156, "step": 274, "time_per_iteration": 3.044799566268921 }, { "auxiliary_loss_clip": 0.01496866, "auxiliary_loss_mlp": 0.01425705, "balance_loss_clip": 1.30865598, "balance_loss_mlp": 1.3518424, "epoch": 0.016533894483691566, "flos": 22611902123520.0, "grad_norm": 1.6615583293566911, "language_loss": 0.99816883, "learning_rate": 3.6163703354748927e-06, "loss": 1.02739453, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.73925781, "step": 275, "time_per_iteration": 3.052940607070923 }, { "auxiliary_loss_clip": 0.014945, "auxiliary_loss_mlp": 0.01414059, "balance_loss_clip": 1.30802441, "balance_loss_mlp": 1.34932864, "epoch": 0.01659401773635954, "flos": 21517200731520.0, "grad_norm": 1.475723944904423, "language_loss": 0.90201354, "learning_rate": 3.6187073695598707e-06, "loss": 0.93109918, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.64672852, "step": 276, "time_per_iteration": 3.0280468463897705 }, { "auxiliary_loss_clip": 0.01505257, "auxiliary_loss_mlp": 0.01410711, "balance_loss_clip": 1.31645894, "balance_loss_mlp": 1.34423971, "epoch": 0.016654140989027507, "flos": 32863075977600.0, "grad_norm": 1.5797113314272933, "language_loss": 0.8910737, "learning_rate": 3.621035951423551e-06, "loss": 0.92023337, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.66455078, "step": 277, "time_per_iteration": 3.2062783241271973 }, { "auxiliary_loss_clip": 0.01498274, "auxiliary_loss_mlp": 0.01396574, "balance_loss_clip": 1.31197357, "balance_loss_mlp": 1.34066498, "epoch": 0.016714264241695476, "flos": 12312153336960.0, "grad_norm": 1.8414944991782476, "language_loss": 0.9014163, "learning_rate": 3.623356141983041e-06, "loss": 0.93036473, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.55883789, "step": 278, "time_per_iteration": 3.0505058765411377 }, { "auxiliary_loss_clip": 0.01503943, "auxiliary_loss_mlp": 0.01408682, "balance_loss_clip": 1.31156635, "balance_loss_mlp": 1.34001732, "epoch": 0.016774387494363444, "flos": 27134695203840.0, "grad_norm": 1.6753241897745959, "language_loss": 1.02659738, "learning_rate": 3.6256680014992486e-06, "loss": 1.05572367, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.68701172, "step": 279, "time_per_iteration": 3.133037805557251 }, { "auxiliary_loss_clip": 0.01497442, "auxiliary_loss_mlp": 0.01421199, "balance_loss_clip": 1.30784976, "balance_loss_mlp": 1.34256911, "epoch": 0.016834510747031413, "flos": 20200591002240.0, "grad_norm": 1.8016591020591932, "language_loss": 1.07473278, "learning_rate": 3.6279715895862713e-06, "loss": 1.10391927, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.78613281, "step": 280, "time_per_iteration": 3.116851329803467 }, { "auxiliary_loss_clip": 0.01487226, "auxiliary_loss_mlp": 0.01413694, "balance_loss_clip": 1.29906964, "balance_loss_mlp": 1.34302628, "epoch": 0.016894633999699385, "flos": 27286374332160.0, "grad_norm": 1.5058306002681243, "language_loss": 0.84433413, "learning_rate": 3.6302669652206183e-06, "loss": 0.87334335, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.70654297, "step": 281, "time_per_iteration": 3.167877435684204 }, { "auxiliary_loss_clip": 0.01485974, "auxiliary_loss_mlp": 0.01417188, "balance_loss_clip": 1.29811001, "balance_loss_mlp": 1.34556723, "epoch": 0.016954757252367354, "flos": 14911185709440.0, "grad_norm": 2.1072468692071835, "language_loss": 1.01152253, "learning_rate": 3.632554186750274e-06, "loss": 1.04055417, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.71582031, "step": 282, "time_per_iteration": 3.1417341232299805 }, { "auxiliary_loss_clip": 0.0148228, "auxiliary_loss_mlp": 0.01412605, "balance_loss_clip": 1.29277134, "balance_loss_mlp": 1.34155655, "epoch": 0.017014880505035322, "flos": 21368145801600.0, "grad_norm": 1.6627071864352685, "language_loss": 0.9007085, "learning_rate": 3.6348333119035937e-06, "loss": 0.92965734, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.71044922, "step": 283, "time_per_iteration": 3.3196709156036377 }, { "auxiliary_loss_clip": 0.01472782, "auxiliary_loss_mlp": 0.01415784, "balance_loss_clip": 1.28571892, "balance_loss_mlp": 1.34268534, "epoch": 0.01707500375770329, "flos": 35346199875840.0, "grad_norm": 1.784532941800421, "language_loss": 0.94354904, "learning_rate": 3.6371043977980503e-06, "loss": 0.97243464, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.73095703, "step": 284, "time_per_iteration": 3.245760917663574 }, { "auxiliary_loss_clip": 0.01465664, "auxiliary_loss_mlp": 0.01411844, "balance_loss_clip": 1.28227806, "balance_loss_mlp": 1.34632707, "epoch": 0.01713512701037126, "flos": 23591373713280.0, "grad_norm": 1.9563699841610152, "language_loss": 1.09732497, "learning_rate": 3.639367500948819e-06, "loss": 1.12609994, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.65527344, "step": 285, "time_per_iteration": 3.0912346839904785 }, { "auxiliary_loss_clip": 0.01463259, "auxiliary_loss_mlp": 0.01408419, "balance_loss_clip": 1.28129053, "balance_loss_mlp": 1.3433789, "epoch": 0.01719525026303923, "flos": 27645709685760.0, "grad_norm": 1.6831460302368153, "language_loss": 1.05088341, "learning_rate": 3.6416226772772178e-06, "loss": 1.07960033, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.64990234, "step": 286, "time_per_iteration": 3.253262996673584 }, { "auxiliary_loss_clip": 0.01469827, "auxiliary_loss_mlp": 0.01402503, "balance_loss_clip": 1.28674436, "balance_loss_mlp": 1.34287429, "epoch": 0.0172553735157072, "flos": 26991069649920.0, "grad_norm": 1.489969933521248, "language_loss": 0.9993127, "learning_rate": 3.643869982119001e-06, "loss": 1.028036, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.59643555, "step": 287, "time_per_iteration": 3.0771868228912354 }, { "auxiliary_loss_clip": 0.01479763, "auxiliary_loss_mlp": 0.01405626, "balance_loss_clip": 1.29350686, "balance_loss_mlp": 1.33984613, "epoch": 0.01731549676837517, "flos": 14062761619200.0, "grad_norm": 2.082134750542835, "language_loss": 1.17210579, "learning_rate": 3.646109470232502e-06, "loss": 1.20095968, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.6574707, "step": 288, "time_per_iteration": 3.045431613922119 }, { "auxiliary_loss_clip": 0.01477235, "auxiliary_loss_mlp": 0.01368571, "balance_loss_clip": 1.32320094, "balance_loss_mlp": 1.33900678, "epoch": 0.017375620021043137, "flos": 66546153031680.0, "grad_norm": 0.9375127621114936, "language_loss": 0.64246446, "learning_rate": 3.6483411958066417e-06, "loss": 0.67092252, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.29492188, "step": 289, "time_per_iteration": 3.5431721210479736 }, { "auxiliary_loss_clip": 0.01486309, "auxiliary_loss_mlp": 0.01396211, "balance_loss_clip": 1.30509067, "balance_loss_mlp": 1.34256721, "epoch": 0.01743574327371111, "flos": 15231854741760.0, "grad_norm": 2.154210799896163, "language_loss": 1.00688875, "learning_rate": 3.6505652124687957e-06, "loss": 1.03571403, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.53686523, "step": 290, "time_per_iteration": 3.0662574768066406 }, { "auxiliary_loss_clip": 0.01508007, "auxiliary_loss_mlp": 0.01405689, "balance_loss_clip": 1.32217836, "balance_loss_mlp": 1.3489933, "epoch": 0.017495866526379078, "flos": 25384675104000.0, "grad_norm": 1.5705570950264276, "language_loss": 0.97315371, "learning_rate": 3.6527815732925258e-06, "loss": 1.00229073, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.56738281, "step": 291, "time_per_iteration": 3.085784435272217 }, { "auxiliary_loss_clip": 0.01506241, "auxiliary_loss_mlp": 0.01397565, "balance_loss_clip": 1.32405698, "balance_loss_mlp": 1.34291995, "epoch": 0.017555989779047047, "flos": 26370661944960.0, "grad_norm": 1.5352469087520764, "language_loss": 0.81681895, "learning_rate": 3.6549903308051806e-06, "loss": 0.84585702, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.54663086, "step": 292, "time_per_iteration": 3.007880449295044 }, { "auxiliary_loss_clip": 0.01498802, "auxiliary_loss_mlp": 0.01395748, "balance_loss_clip": 1.31978095, "balance_loss_mlp": 1.3505919, "epoch": 0.017616113031715015, "flos": 22347074453760.0, "grad_norm": 1.8670207636289677, "language_loss": 0.9868027, "learning_rate": 3.6571915369953646e-06, "loss": 1.01574814, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.45166016, "step": 293, "time_per_iteration": 4.44026780128479 }, { "auxiliary_loss_clip": 0.01496488, "auxiliary_loss_mlp": 0.01393086, "balance_loss_clip": 1.31870604, "balance_loss_mlp": 1.34685707, "epoch": 0.017676236284382984, "flos": 20166494405760.0, "grad_norm": 1.520819554894164, "language_loss": 0.93919694, "learning_rate": 3.6593852433202797e-06, "loss": 0.96809268, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.46240234, "step": 294, "time_per_iteration": 3.053316831588745 }, { "auxiliary_loss_clip": 0.01490006, "auxiliary_loss_mlp": 0.01400349, "balance_loss_clip": 1.31014943, "balance_loss_mlp": 1.34548938, "epoch": 0.017736359537050956, "flos": 25232905486080.0, "grad_norm": 2.065134015617671, "language_loss": 0.9333865, "learning_rate": 3.6615715007129453e-06, "loss": 0.96229005, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.54833984, "step": 295, "time_per_iteration": 4.520801305770874 }, { "auxiliary_loss_clip": 0.0151354, "auxiliary_loss_mlp": 0.01393216, "balance_loss_clip": 1.33275843, "balance_loss_mlp": 1.34600925, "epoch": 0.017796482789718925, "flos": 20348288588160.0, "grad_norm": 1.8143509918538279, "language_loss": 0.93494141, "learning_rate": 3.6637503595892897e-06, "loss": 0.96400905, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.47216797, "step": 296, "time_per_iteration": 6.0985267162323 }, { "auxiliary_loss_clip": 0.01510128, "auxiliary_loss_mlp": 0.01390328, "balance_loss_clip": 1.3312403, "balance_loss_mlp": 1.33944988, "epoch": 0.017856606042386893, "flos": 22388500707840.0, "grad_norm": 1.7342284669151882, "language_loss": 0.98527503, "learning_rate": 3.665921869855132e-06, "loss": 1.0142796, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.50830078, "step": 297, "time_per_iteration": 3.0394985675811768 }, { "auxiliary_loss_clip": 0.01520207, "auxiliary_loss_mlp": 0.01388346, "balance_loss_clip": 1.34330201, "balance_loss_mlp": 1.34318995, "epoch": 0.017916729295054862, "flos": 20239709771520.0, "grad_norm": 1.6885585161482857, "language_loss": 1.01833534, "learning_rate": 3.6680860809130346e-06, "loss": 1.04742098, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.45166016, "step": 298, "time_per_iteration": 2.981853485107422 }, { "auxiliary_loss_clip": 0.01522472, "auxiliary_loss_mlp": 0.01406211, "balance_loss_clip": 1.34464145, "balance_loss_mlp": 1.35023046, "epoch": 0.01797685254772283, "flos": 19400425130880.0, "grad_norm": 1.496350976297131, "language_loss": 0.97697282, "learning_rate": 3.6702430416690516e-06, "loss": 1.00625968, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.55957031, "step": 299, "time_per_iteration": 3.018101692199707 }, { "auxiliary_loss_clip": 0.01523025, "auxiliary_loss_mlp": 0.01420803, "balance_loss_clip": 1.34110975, "balance_loss_mlp": 1.36007786, "epoch": 0.018036975800390802, "flos": 24436766401920.0, "grad_norm": 2.0791540134678446, "language_loss": 0.82400465, "learning_rate": 3.672392800539357e-06, "loss": 0.85344297, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.60742188, "step": 300, "time_per_iteration": 3.007295608520508 }, { "auxiliary_loss_clip": 0.01533687, "auxiliary_loss_mlp": 0.01417113, "balance_loss_clip": 1.35504127, "balance_loss_mlp": 1.35934424, "epoch": 0.01809709905305877, "flos": 15787462613760.0, "grad_norm": 1.7327589260779785, "language_loss": 1.01556432, "learning_rate": 3.6745354054567686e-06, "loss": 1.04507232, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.5769043, "step": 301, "time_per_iteration": 3.038264274597168 }, { "auxiliary_loss_clip": 0.01549346, "auxiliary_loss_mlp": 0.01361224, "balance_loss_clip": 1.39953136, "balance_loss_mlp": 1.34176922, "epoch": 0.01815722230572674, "flos": 67383673125120.0, "grad_norm": 0.8555211435471284, "language_loss": 0.62499094, "learning_rate": 3.676670903877158e-06, "loss": 0.65409666, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.19433594, "step": 302, "time_per_iteration": 3.5381991863250732 }, { "auxiliary_loss_clip": 0.01497692, "auxiliary_loss_mlp": 0.01442441, "balance_loss_clip": 1.31959164, "balance_loss_mlp": 1.38352799, "epoch": 0.01821734555839471, "flos": 15493696254720.0, "grad_norm": 1.7998762991168389, "language_loss": 1.02892613, "learning_rate": 3.6787993427857567e-06, "loss": 1.05832756, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.58911133, "step": 303, "time_per_iteration": 3.114450216293335 }, { "auxiliary_loss_clip": 0.014903, "auxiliary_loss_mlp": 0.01449872, "balance_loss_clip": 1.31330061, "balance_loss_mlp": 1.40366626, "epoch": 0.018277468811062677, "flos": 24108043795200.0, "grad_norm": 1.5777160398055636, "language_loss": 0.91424119, "learning_rate": 3.680920768703364e-06, "loss": 0.94364297, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.46166992, "step": 304, "time_per_iteration": 3.2559657096862793 }, { "auxiliary_loss_clip": 0.01488879, "auxiliary_loss_mlp": 0.01437166, "balance_loss_clip": 1.31249475, "balance_loss_mlp": 1.38705063, "epoch": 0.01833759206373065, "flos": 20969058251520.0, "grad_norm": 1.4462071373287906, "language_loss": 0.88289249, "learning_rate": 3.6830352276924415e-06, "loss": 0.91215301, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.5012207, "step": 305, "time_per_iteration": 3.1393206119537354 }, { "auxiliary_loss_clip": 0.01482995, "auxiliary_loss_mlp": 0.01403499, "balance_loss_clip": 1.30736363, "balance_loss_mlp": 1.35412252, "epoch": 0.018397715316398618, "flos": 19400153662080.0, "grad_norm": 1.5734879561229687, "language_loss": 0.99025142, "learning_rate": 3.685142765363119e-06, "loss": 1.0191164, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.49389648, "step": 306, "time_per_iteration": 2.957449436187744 }, { "auxiliary_loss_clip": 0.01483647, "auxiliary_loss_mlp": 0.01400988, "balance_loss_clip": 1.30548954, "balance_loss_mlp": 1.3478446, "epoch": 0.018457838569066586, "flos": 29144475555840.0, "grad_norm": 1.6645942564336902, "language_loss": 0.98157763, "learning_rate": 3.687243426879095e-06, "loss": 1.0104239, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.53125, "step": 307, "time_per_iteration": 3.1539502143859863 }, { "auxiliary_loss_clip": 0.01487184, "auxiliary_loss_mlp": 0.01418091, "balance_loss_clip": 1.30950236, "balance_loss_mlp": 1.3624208, "epoch": 0.018517961821734555, "flos": 19218088010880.0, "grad_norm": 1.7528135992564817, "language_loss": 0.8607589, "learning_rate": 3.6893372569634466e-06, "loss": 0.88981164, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.55688477, "step": 308, "time_per_iteration": 3.064777374267578 }, { "auxiliary_loss_clip": 0.01496588, "auxiliary_loss_mlp": 0.01460572, "balance_loss_clip": 1.31005168, "balance_loss_mlp": 1.3750751, "epoch": 0.018578085074402523, "flos": 19872004129920.0, "grad_norm": 1.6750426611418898, "language_loss": 0.9778837, "learning_rate": 3.6914242999043395e-06, "loss": 1.00745535, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.85498047, "step": 309, "time_per_iteration": 3.1307332515716553 }, { "auxiliary_loss_clip": 0.0149202, "auxiliary_loss_mlp": 0.01443482, "balance_loss_clip": 1.30658054, "balance_loss_mlp": 1.37324381, "epoch": 0.018638208327070496, "flos": 29619040711680.0, "grad_norm": 1.7402551650750409, "language_loss": 0.88026601, "learning_rate": 3.69350459956065e-06, "loss": 0.909621, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.70263672, "step": 310, "time_per_iteration": 3.207200050354004 }, { "auxiliary_loss_clip": 0.01476897, "auxiliary_loss_mlp": 0.01411427, "balance_loss_clip": 1.29884362, "balance_loss_mlp": 1.36209834, "epoch": 0.018698331579738464, "flos": 45747288311040.0, "grad_norm": 1.5071328675910307, "language_loss": 0.83718276, "learning_rate": 3.695578199367497e-06, "loss": 0.86606598, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.49389648, "step": 311, "time_per_iteration": 3.3521690368652344 }, { "auxiliary_loss_clip": 0.01474421, "auxiliary_loss_mlp": 0.01408315, "balance_loss_clip": 1.29499292, "balance_loss_mlp": 1.34997427, "epoch": 0.018758454832406433, "flos": 20492909527680.0, "grad_norm": 2.07319267649501, "language_loss": 1.02580142, "learning_rate": 3.6976451423416825e-06, "loss": 1.05462885, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.58374023, "step": 312, "time_per_iteration": 2.9914748668670654 }, { "auxiliary_loss_clip": 0.01469237, "auxiliary_loss_mlp": 0.01400846, "balance_loss_clip": 1.29221463, "balance_loss_mlp": 1.34636736, "epoch": 0.0188185780850744, "flos": 15786014780160.0, "grad_norm": 1.7539398931101273, "language_loss": 1.03918719, "learning_rate": 3.699705471087043e-06, "loss": 1.06788802, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.54492188, "step": 313, "time_per_iteration": 2.9839365482330322 }, { "auxiliary_loss_clip": 0.01486014, "auxiliary_loss_mlp": 0.01410003, "balance_loss_clip": 1.30269504, "balance_loss_mlp": 1.34787118, "epoch": 0.018878701337742373, "flos": 22465969084800.0, "grad_norm": 1.911464390569888, "language_loss": 0.94432068, "learning_rate": 3.7017592277997256e-06, "loss": 0.97328079, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.62182617, "step": 314, "time_per_iteration": 3.0625627040863037 }, { "auxiliary_loss_clip": 0.01478496, "auxiliary_loss_mlp": 0.01405175, "balance_loss_clip": 1.30213809, "balance_loss_mlp": 1.35143518, "epoch": 0.018938824590410342, "flos": 31006015384320.0, "grad_norm": 2.3323486976310037, "language_loss": 1.04136896, "learning_rate": 3.7038064542733654e-06, "loss": 1.07020569, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.53710938, "step": 315, "time_per_iteration": 3.095252275466919 }, { "auxiliary_loss_clip": 0.01497209, "auxiliary_loss_mlp": 0.01440329, "balance_loss_clip": 1.31497765, "balance_loss_mlp": 1.35404527, "epoch": 0.01899894784307831, "flos": 23269483071360.0, "grad_norm": 1.5835644772275472, "language_loss": 0.91874284, "learning_rate": 3.7058471919041945e-06, "loss": 0.94811821, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.86254883, "step": 316, "time_per_iteration": 2.9922244548797607 }, { "auxiliary_loss_clip": 0.01493459, "auxiliary_loss_mlp": 0.01410624, "balance_loss_clip": 1.31684017, "balance_loss_mlp": 1.35547793, "epoch": 0.01905907109574628, "flos": 17467027280640.0, "grad_norm": 1.6459503298263989, "language_loss": 0.96892136, "learning_rate": 3.7078814816960605e-06, "loss": 0.99796218, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.55126953, "step": 317, "time_per_iteration": 3.0228896141052246 }, { "auxiliary_loss_clip": 0.01513492, "auxiliary_loss_mlp": 0.01444572, "balance_loss_clip": 1.33365369, "balance_loss_mlp": 1.35380578, "epoch": 0.019119194348414248, "flos": 14976980928000.0, "grad_norm": 1.9903374222222234, "language_loss": 1.05596328, "learning_rate": 3.709909364265374e-06, "loss": 1.08554399, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.90771484, "step": 318, "time_per_iteration": 3.077000856399536 }, { "auxiliary_loss_clip": 0.01514164, "auxiliary_loss_mlp": 0.01392695, "balance_loss_clip": 1.33813977, "balance_loss_mlp": 1.34253144, "epoch": 0.01917931760108222, "flos": 25493525389440.0, "grad_norm": 1.9090698910439856, "language_loss": 1.05335855, "learning_rate": 3.7119308798459706e-06, "loss": 1.08242726, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.50170898, "step": 319, "time_per_iteration": 3.0136423110961914 }, { "auxiliary_loss_clip": 0.01523239, "auxiliary_loss_mlp": 0.01398995, "balance_loss_clip": 1.38626301, "balance_loss_mlp": 1.37038481, "epoch": 0.01923944085375019, "flos": 71587181779200.0, "grad_norm": 0.9624002766621472, "language_loss": 0.59982169, "learning_rate": 3.7139460682939026e-06, "loss": 0.62904394, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.28515625, "step": 320, "time_per_iteration": 3.3411078453063965 }, { "auxiliary_loss_clip": 0.01509156, "auxiliary_loss_mlp": 0.01399642, "balance_loss_clip": 1.33212852, "balance_loss_mlp": 1.35262609, "epoch": 0.019299564106418157, "flos": 19691929249920.0, "grad_norm": 1.771317087982245, "language_loss": 1.06636834, "learning_rate": 3.715954969092154e-06, "loss": 1.09545624, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.4699707, "step": 321, "time_per_iteration": 3.104935646057129 }, { "auxiliary_loss_clip": 0.01507086, "auxiliary_loss_mlp": 0.01425781, "balance_loss_clip": 1.33182752, "balance_loss_mlp": 1.38305664, "epoch": 0.019359687359086126, "flos": 24397285674240.0, "grad_norm": 1.7814399770519727, "language_loss": 0.95840096, "learning_rate": 3.7179576213552805e-06, "loss": 0.98772955, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.42700195, "step": 322, "time_per_iteration": 3.078573226928711 }, { "auxiliary_loss_clip": 0.01497133, "auxiliary_loss_mlp": 0.01431927, "balance_loss_clip": 1.32489681, "balance_loss_mlp": 1.38007081, "epoch": 0.019419810611754094, "flos": 23961929777280.0, "grad_norm": 1.931863835200685, "language_loss": 0.87765032, "learning_rate": 3.719954063833981e-06, "loss": 0.90694094, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.51928711, "step": 323, "time_per_iteration": 3.072577953338623 }, { "auxiliary_loss_clip": 0.01501231, "auxiliary_loss_mlp": 0.01419917, "balance_loss_clip": 1.32731128, "balance_loss_mlp": 1.38041091, "epoch": 0.019479933864422067, "flos": 22169442792960.0, "grad_norm": 1.5837378961118929, "language_loss": 1.01682043, "learning_rate": 3.721944334919596e-06, "loss": 1.04603183, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.39526367, "step": 324, "time_per_iteration": 3.0561697483062744 }, { "auxiliary_loss_clip": 0.01505146, "auxiliary_loss_mlp": 0.01408207, "balance_loss_clip": 1.33186126, "balance_loss_mlp": 1.3645525, "epoch": 0.019540057117090035, "flos": 22247001659520.0, "grad_norm": 1.7131676642877194, "language_loss": 0.82971179, "learning_rate": 3.7239284726485375e-06, "loss": 0.85884535, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.43652344, "step": 325, "time_per_iteration": 3.0457406044006348 }, { "auxiliary_loss_clip": 0.01493283, "auxiliary_loss_mlp": 0.01386142, "balance_loss_clip": 1.32303524, "balance_loss_mlp": 1.34499121, "epoch": 0.019600180369758004, "flos": 23087372175360.0, "grad_norm": 1.5096052595433331, "language_loss": 0.86447024, "learning_rate": 3.72590651470665e-06, "loss": 0.89326453, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.41162109, "step": 326, "time_per_iteration": 3.2306222915649414 }, { "auxiliary_loss_clip": 0.01477275, "auxiliary_loss_mlp": 0.01416236, "balance_loss_clip": 1.30882549, "balance_loss_mlp": 1.36712146, "epoch": 0.019660303622425972, "flos": 25421803102080.0, "grad_norm": 1.7092177694503095, "language_loss": 0.89608318, "learning_rate": 3.727878498433505e-06, "loss": 0.92501831, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.49121094, "step": 327, "time_per_iteration": 3.0549449920654297 }, { "auxiliary_loss_clip": 0.01485842, "auxiliary_loss_mlp": 0.01467873, "balance_loss_clip": 1.3133111, "balance_loss_mlp": 1.42088056, "epoch": 0.01972042687509394, "flos": 23667484746240.0, "grad_norm": 1.8342278057771246, "language_loss": 0.89528757, "learning_rate": 3.7298444608266328e-06, "loss": 0.92482471, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.4699707, "step": 328, "time_per_iteration": 3.0483720302581787 }, { "auxiliary_loss_clip": 0.01474137, "auxiliary_loss_mlp": 0.01510382, "balance_loss_clip": 1.29972148, "balance_loss_mlp": 1.45692909, "epoch": 0.019780550127761913, "flos": 18232644107520.0, "grad_norm": 2.0624704831482807, "language_loss": 1.14614737, "learning_rate": 3.731804438545683e-06, "loss": 1.17599261, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.53491211, "step": 329, "time_per_iteration": 4.45777440071106 }, { "auxiliary_loss_clip": 0.01464542, "auxiliary_loss_mlp": 0.01512852, "balance_loss_clip": 1.29056382, "balance_loss_mlp": 1.45408154, "epoch": 0.01984067338042988, "flos": 22428886331520.0, "grad_norm": 1.906942680224435, "language_loss": 0.86636215, "learning_rate": 3.7337584679165324e-06, "loss": 0.89613605, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.58789062, "step": 330, "time_per_iteration": 4.598118782043457 }, { "auxiliary_loss_clip": 0.01464744, "auxiliary_loss_mlp": 0.0147667, "balance_loss_clip": 1.28726912, "balance_loss_mlp": 1.41484797, "epoch": 0.01990079663309785, "flos": 17063867698560.0, "grad_norm": 1.9352809832485176, "language_loss": 1.11481023, "learning_rate": 3.7357065849353186e-06, "loss": 1.14422441, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.61791992, "step": 331, "time_per_iteration": 6.723036050796509 }, { "auxiliary_loss_clip": 0.01463144, "auxiliary_loss_mlp": 0.01464907, "balance_loss_clip": 1.28754973, "balance_loss_mlp": 1.40639949, "epoch": 0.01996091988576582, "flos": 15970478405760.0, "grad_norm": 1.6518732858021503, "language_loss": 1.03887153, "learning_rate": 3.737648825272422e-06, "loss": 1.06815219, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.58496094, "step": 332, "time_per_iteration": 3.059453248977661 }, { "auxiliary_loss_clip": 0.01469571, "auxiliary_loss_mlp": 0.01460515, "balance_loss_clip": 1.28973866, "balance_loss_mlp": 1.38603282, "epoch": 0.02002104313843379, "flos": 23596893578880.0, "grad_norm": 2.067335079294609, "language_loss": 0.93447161, "learning_rate": 3.739585224276384e-06, "loss": 0.96377254, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.74414062, "step": 333, "time_per_iteration": 3.004992723464966 }, { "auxiliary_loss_clip": 0.01472687, "auxiliary_loss_mlp": 0.01455281, "balance_loss_clip": 1.29409552, "balance_loss_mlp": 1.38203859, "epoch": 0.02008116639110176, "flos": 34108189643520.0, "grad_norm": 1.6350103243163914, "language_loss": 0.94027656, "learning_rate": 3.7415158169777673e-06, "loss": 0.96955621, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.73291016, "step": 334, "time_per_iteration": 3.2349965572357178 }, { "auxiliary_loss_clip": 0.0147642, "auxiliary_loss_mlp": 0.01427596, "balance_loss_clip": 1.29691744, "balance_loss_mlp": 1.35897923, "epoch": 0.020141289643769728, "flos": 19693467573120.0, "grad_norm": 1.5715361575108515, "language_loss": 0.92593014, "learning_rate": 3.7434406380929575e-06, "loss": 0.9549703, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.68603516, "step": 335, "time_per_iteration": 3.1878459453582764 }, { "auxiliary_loss_clip": 0.01478459, "auxiliary_loss_mlp": 0.01440047, "balance_loss_clip": 1.29956186, "balance_loss_mlp": 1.37476826, "epoch": 0.020201412896437697, "flos": 20750271805440.0, "grad_norm": 1.96762225571726, "language_loss": 1.04488349, "learning_rate": 3.745359722027911e-06, "loss": 1.07406855, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.65380859, "step": 336, "time_per_iteration": 3.0201406478881836 }, { "auxiliary_loss_clip": 0.01472405, "auxiliary_loss_mlp": 0.0145465, "balance_loss_clip": 1.29384494, "balance_loss_mlp": 1.39587986, "epoch": 0.020261536149105665, "flos": 20276159097600.0, "grad_norm": 1.5103736592518329, "language_loss": 0.96229464, "learning_rate": 3.7472731028818428e-06, "loss": 0.99156523, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.58837891, "step": 337, "time_per_iteration": 3.1171329021453857 }, { "auxiliary_loss_clip": 0.01459495, "auxiliary_loss_mlp": 0.01435409, "balance_loss_clip": 1.28435302, "balance_loss_mlp": 1.38021469, "epoch": 0.020321659401773638, "flos": 25859828442240.0, "grad_norm": 1.2721854678417928, "language_loss": 0.95535713, "learning_rate": 3.7491808144508626e-06, "loss": 0.98430622, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.55175781, "step": 338, "time_per_iteration": 3.07940673828125 }, { "auxiliary_loss_clip": 0.01454658, "auxiliary_loss_mlp": 0.01400357, "balance_loss_clip": 1.28217638, "balance_loss_mlp": 1.35126662, "epoch": 0.020381782654441606, "flos": 17504336257920.0, "grad_norm": 1.6471674983854503, "language_loss": 0.96002573, "learning_rate": 3.7510828902315576e-06, "loss": 0.98857582, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.49145508, "step": 339, "time_per_iteration": 3.021568536758423 }, { "auxiliary_loss_clip": 0.01459348, "auxiliary_loss_mlp": 0.01405556, "balance_loss_clip": 1.28401589, "balance_loss_mlp": 1.3539623, "epoch": 0.020441905907109575, "flos": 24254745995520.0, "grad_norm": 1.4845437094376257, "language_loss": 0.9766953, "learning_rate": 3.75297936342452e-06, "loss": 1.00534439, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.51611328, "step": 340, "time_per_iteration": 3.0535926818847656 }, { "auxiliary_loss_clip": 0.01477674, "auxiliary_loss_mlp": 0.01421407, "balance_loss_clip": 1.2992878, "balance_loss_mlp": 1.35937035, "epoch": 0.020502029159777543, "flos": 22242567669120.0, "grad_norm": 1.6394163382626339, "language_loss": 0.97501427, "learning_rate": 3.7548702669378253e-06, "loss": 1.00400507, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.62036133, "step": 341, "time_per_iteration": 3.0693359375 }, { "auxiliary_loss_clip": 0.01487616, "auxiliary_loss_mlp": 0.01424647, "balance_loss_clip": 1.31366706, "balance_loss_mlp": 1.3724339, "epoch": 0.020562152412445512, "flos": 23998424348160.0, "grad_norm": 1.8438483522548366, "language_loss": 0.97410512, "learning_rate": 3.756755633390458e-06, "loss": 1.00322771, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.52197266, "step": 342, "time_per_iteration": 3.195490598678589 }, { "auxiliary_loss_clip": 0.01497527, "auxiliary_loss_mlp": 0.01427227, "balance_loss_clip": 1.32412255, "balance_loss_mlp": 1.37463176, "epoch": 0.020622275665113484, "flos": 26985504539520.0, "grad_norm": 1.4232196874778749, "language_loss": 0.97543871, "learning_rate": 3.7586354951156886e-06, "loss": 1.00468612, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.52587891, "step": 343, "time_per_iteration": 3.173346996307373 }, { "auxiliary_loss_clip": 0.01502068, "auxiliary_loss_mlp": 0.01416996, "balance_loss_clip": 1.32960653, "balance_loss_mlp": 1.37036109, "epoch": 0.020682398917781453, "flos": 22610725758720.0, "grad_norm": 1.523119582949282, "language_loss": 0.86881483, "learning_rate": 3.7605098841644e-06, "loss": 0.89800549, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.46679688, "step": 344, "time_per_iteration": 3.075843572616577 }, { "auxiliary_loss_clip": 0.01515132, "auxiliary_loss_mlp": 0.01397996, "balance_loss_clip": 1.34014606, "balance_loss_mlp": 1.3530302, "epoch": 0.02074252217044942, "flos": 15022162500480.0, "grad_norm": 1.4009184950519529, "language_loss": 0.88423657, "learning_rate": 3.7623788323083666e-06, "loss": 0.91336793, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.44921875, "step": 345, "time_per_iteration": 3.0539114475250244 }, { "auxiliary_loss_clip": 0.01522942, "auxiliary_loss_mlp": 0.01401536, "balance_loss_clip": 1.34606242, "balance_loss_mlp": 1.35089564, "epoch": 0.02080264542311739, "flos": 25348904449920.0, "grad_norm": 1.7071571912615158, "language_loss": 0.97690898, "learning_rate": 3.7642423710434837e-06, "loss": 1.0061537, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.50610352, "step": 346, "time_per_iteration": 3.038041114807129 }, { "auxiliary_loss_clip": 0.01537242, "auxiliary_loss_mlp": 0.01403328, "balance_loss_clip": 1.35929668, "balance_loss_mlp": 1.34503472, "epoch": 0.02086276867578536, "flos": 24399366935040.0, "grad_norm": 1.6685732937941726, "language_loss": 0.91264701, "learning_rate": 3.7661005315929563e-06, "loss": 0.9420526, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.58325195, "step": 347, "time_per_iteration": 3.0891525745391846 }, { "auxiliary_loss_clip": 0.0153872, "auxiliary_loss_mlp": 0.01402186, "balance_loss_clip": 1.35928476, "balance_loss_mlp": 1.34801722, "epoch": 0.02092289192845333, "flos": 24472899014400.0, "grad_norm": 1.5293046524872853, "language_loss": 0.83135027, "learning_rate": 3.7679533449104354e-06, "loss": 0.86075932, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.54150391, "step": 348, "time_per_iteration": 3.0590052604675293 }, { "auxiliary_loss_clip": 0.01541126, "auxiliary_loss_mlp": 0.01410458, "balance_loss_clip": 1.3610636, "balance_loss_mlp": 1.34961414, "epoch": 0.0209830151811213, "flos": 17458566503040.0, "grad_norm": 1.851508479393449, "language_loss": 0.92335117, "learning_rate": 3.7698008416831116e-06, "loss": 0.95286703, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.60864258, "step": 349, "time_per_iteration": 2.971245050430298 }, { "auxiliary_loss_clip": 0.01533332, "auxiliary_loss_mlp": 0.01385645, "balance_loss_clip": 1.35909522, "balance_loss_mlp": 1.34301567, "epoch": 0.021043138433789268, "flos": 24585323639040.0, "grad_norm": 1.5157037398508373, "language_loss": 0.93425339, "learning_rate": 3.7716430523347664e-06, "loss": 0.9634431, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.42626953, "step": 350, "time_per_iteration": 3.0633199214935303 }, { "auxiliary_loss_clip": 0.01527433, "auxiliary_loss_mlp": 0.01390161, "balance_loss_clip": 1.35112095, "balance_loss_mlp": 1.34352672, "epoch": 0.021103261686457236, "flos": 24462628444800.0, "grad_norm": 1.78754116670291, "language_loss": 0.89276218, "learning_rate": 3.773480007028776e-06, "loss": 0.92193812, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.46655273, "step": 351, "time_per_iteration": 3.0954644680023193 }, { "auxiliary_loss_clip": 0.01531319, "auxiliary_loss_mlp": 0.01396814, "balance_loss_clip": 1.35297775, "balance_loss_mlp": 1.34302652, "epoch": 0.021163384939125205, "flos": 14690996674560.0, "grad_norm": 1.5979956049716375, "language_loss": 0.96232545, "learning_rate": 3.775311735671078e-06, "loss": 0.99160677, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.5378418, "step": 352, "time_per_iteration": 3.1323671340942383 }, { "auxiliary_loss_clip": 0.01527189, "auxiliary_loss_mlp": 0.01393762, "balance_loss_clip": 1.34998524, "balance_loss_mlp": 1.34681737, "epoch": 0.021223508191793177, "flos": 24502471130880.0, "grad_norm": 1.556924245880262, "language_loss": 0.90965623, "learning_rate": 3.7771382679130878e-06, "loss": 0.93886578, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.46948242, "step": 353, "time_per_iteration": 3.077589750289917 }, { "auxiliary_loss_clip": 0.01514095, "auxiliary_loss_mlp": 0.01411254, "balance_loss_clip": 1.33765483, "balance_loss_mlp": 1.35842061, "epoch": 0.021283631444461146, "flos": 24135806119680.0, "grad_norm": 1.7590114882651124, "language_loss": 0.88947845, "learning_rate": 3.7789596331545845e-06, "loss": 0.91873199, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.52856445, "step": 354, "time_per_iteration": 2.9961435794830322 }, { "auxiliary_loss_clip": 0.01498583, "auxiliary_loss_mlp": 0.01418702, "balance_loss_clip": 1.32054603, "balance_loss_mlp": 1.35728502, "epoch": 0.021343754697129114, "flos": 25203197635200.0, "grad_norm": 1.760801692489904, "language_loss": 0.91473269, "learning_rate": 3.780775860546545e-06, "loss": 0.94390559, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.61401367, "step": 355, "time_per_iteration": 3.031515121459961 }, { "auxiliary_loss_clip": 0.01495181, "auxiliary_loss_mlp": 0.01425081, "balance_loss_clip": 1.3175869, "balance_loss_mlp": 1.36430788, "epoch": 0.021403877949797083, "flos": 17282925613440.0, "grad_norm": 1.8409357690168766, "language_loss": 1.02948475, "learning_rate": 3.7825869789939474e-06, "loss": 1.05868733, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.60742188, "step": 356, "time_per_iteration": 2.9889063835144043 }, { "auxiliary_loss_clip": 0.01469107, "auxiliary_loss_mlp": 0.01428397, "balance_loss_clip": 1.29765463, "balance_loss_mlp": 1.37205863, "epoch": 0.021464001202465055, "flos": 30929089944960.0, "grad_norm": 1.6436245803737446, "language_loss": 0.90599585, "learning_rate": 3.784393017158528e-06, "loss": 0.93497086, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.56323242, "step": 357, "time_per_iteration": 3.0896823406219482 }, { "auxiliary_loss_clip": 0.01462172, "auxiliary_loss_mlp": 0.01420497, "balance_loss_clip": 1.28911722, "balance_loss_mlp": 1.36740136, "epoch": 0.021524124455133024, "flos": 18195380375040.0, "grad_norm": 1.9704477456252407, "language_loss": 0.8992632, "learning_rate": 3.786194003461506e-06, "loss": 0.92808992, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.53051758, "step": 358, "time_per_iteration": 3.0209639072418213 }, { "auxiliary_loss_clip": 0.01457875, "auxiliary_loss_mlp": 0.01414363, "balance_loss_clip": 1.28148413, "balance_loss_mlp": 1.35685611, "epoch": 0.021584247707800992, "flos": 13813045712640.0, "grad_norm": 1.7466749884829962, "language_loss": 1.01175082, "learning_rate": 3.787989966086264e-06, "loss": 1.0404731, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.57519531, "step": 359, "time_per_iteration": 3.0023770332336426 }, { "auxiliary_loss_clip": 0.01468335, "auxiliary_loss_mlp": 0.01407062, "balance_loss_clip": 1.28754759, "balance_loss_mlp": 1.34664679, "epoch": 0.02164437096046896, "flos": 23305117991040.0, "grad_norm": 1.9815774087440103, "language_loss": 0.95779788, "learning_rate": 3.789780932980997e-06, "loss": 0.98655182, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.60400391, "step": 360, "time_per_iteration": 3.0000691413879395 }, { "auxiliary_loss_clip": 0.0149684, "auxiliary_loss_mlp": 0.0139793, "balance_loss_clip": 1.34285188, "balance_loss_mlp": 1.36817586, "epoch": 0.02170449421313693, "flos": 68930679231360.0, "grad_norm": 0.8714581387023677, "language_loss": 0.65157449, "learning_rate": 3.79156693186132e-06, "loss": 0.6805222, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.296875, "step": 361, "time_per_iteration": 3.525885581970215 }, { "auxiliary_loss_clip": 0.01439667, "auxiliary_loss_mlp": 0.01410311, "balance_loss_clip": 1.26543677, "balance_loss_mlp": 1.34460247, "epoch": 0.0217646174658049, "flos": 25239465982080.0, "grad_norm": 2.079753360825001, "language_loss": 0.9527486, "learning_rate": 3.7933479902128433e-06, "loss": 0.98124838, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.65771484, "step": 362, "time_per_iteration": 3.015835762023926 }, { "auxiliary_loss_clip": 0.01438653, "auxiliary_loss_mlp": 0.01416791, "balance_loss_clip": 1.26202929, "balance_loss_mlp": 1.34927106, "epoch": 0.02182474071847287, "flos": 22903360997760.0, "grad_norm": 1.677656112524445, "language_loss": 1.01712966, "learning_rate": 3.7951241352937077e-06, "loss": 1.04568422, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.67529297, "step": 363, "time_per_iteration": 4.384565591812134 }, { "auxiliary_loss_clip": 0.01436558, "auxiliary_loss_mlp": 0.01424127, "balance_loss_clip": 1.26181901, "balance_loss_mlp": 1.35512853, "epoch": 0.02188486397114084, "flos": 23669339783040.0, "grad_norm": 1.6760356263019067, "language_loss": 1.00927758, "learning_rate": 3.7968953941370915e-06, "loss": 1.03788447, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.68945312, "step": 364, "time_per_iteration": 3.041700839996338 }, { "auxiliary_loss_clip": 0.01431452, "auxiliary_loss_mlp": 0.01434458, "balance_loss_clip": 1.25763702, "balance_loss_mlp": 1.3497721, "epoch": 0.021944987223808807, "flos": 21553650057600.0, "grad_norm": 1.6876051991043017, "language_loss": 0.93163347, "learning_rate": 3.798661793553676e-06, "loss": 0.96029258, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.84619141, "step": 365, "time_per_iteration": 4.378239393234253 }, { "auxiliary_loss_clip": 0.01430169, "auxiliary_loss_mlp": 0.01413898, "balance_loss_clip": 1.25651252, "balance_loss_mlp": 1.34013104, "epoch": 0.022005110476476776, "flos": 16079509670400.0, "grad_norm": 1.4857720072229692, "language_loss": 0.91796309, "learning_rate": 3.8004233601340808e-06, "loss": 0.94640374, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.73681641, "step": 366, "time_per_iteration": 6.557047367095947 }, { "auxiliary_loss_clip": 0.01431823, "auxiliary_loss_mlp": 0.01405276, "balance_loss_clip": 1.25627637, "balance_loss_mlp": 1.34128475, "epoch": 0.022065233729144748, "flos": 21443578162560.0, "grad_norm": 1.7479548207497289, "language_loss": 1.00543082, "learning_rate": 3.8021801202512694e-06, "loss": 1.03380179, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.63891602, "step": 367, "time_per_iteration": 2.987262725830078 }, { "auxiliary_loss_clip": 0.01433545, "auxiliary_loss_mlp": 0.01410776, "balance_loss_clip": 1.25360799, "balance_loss_mlp": 1.34535384, "epoch": 0.022125356981812717, "flos": 21553740547200.0, "grad_norm": 1.5688373830425824, "language_loss": 0.9672749, "learning_rate": 3.803932100062912e-06, "loss": 0.99571806, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.65380859, "step": 368, "time_per_iteration": 2.9796643257141113 }, { "auxiliary_loss_clip": 0.01429409, "auxiliary_loss_mlp": 0.01409932, "balance_loss_clip": 1.25231242, "balance_loss_mlp": 1.35161448, "epoch": 0.022185480234480685, "flos": 20713867724160.0, "grad_norm": 2.1678097600846558, "language_loss": 0.96813887, "learning_rate": 3.8056793255137264e-06, "loss": 0.99653232, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.58349609, "step": 369, "time_per_iteration": 3.066594362258911 }, { "auxiliary_loss_clip": 0.01420545, "auxiliary_loss_mlp": 0.01423145, "balance_loss_clip": 1.24796224, "balance_loss_mlp": 1.34437203, "epoch": 0.022245603487148654, "flos": 25204419244800.0, "grad_norm": 1.7243772129759378, "language_loss": 0.94816613, "learning_rate": 3.8074218223377844e-06, "loss": 0.97660303, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.78637695, "step": 370, "time_per_iteration": 3.007535934448242 }, { "auxiliary_loss_clip": 0.01420156, "auxiliary_loss_mlp": 0.01405832, "balance_loss_clip": 1.24880195, "balance_loss_mlp": 1.34541714, "epoch": 0.022305726739816623, "flos": 21405454778880.0, "grad_norm": 1.396836741858576, "language_loss": 0.8987062, "learning_rate": 3.8091596160607834e-06, "loss": 0.92696607, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.60498047, "step": 371, "time_per_iteration": 2.941368818283081 }, { "auxiliary_loss_clip": 0.01424207, "auxiliary_loss_mlp": 0.01405674, "balance_loss_clip": 1.25183988, "balance_loss_mlp": 1.34494925, "epoch": 0.022365849992484595, "flos": 22502373166080.0, "grad_norm": 2.0712633649793264, "language_loss": 0.98791945, "learning_rate": 3.8108927320022896e-06, "loss": 1.01621819, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.60717773, "step": 372, "time_per_iteration": 2.941314935684204 }, { "auxiliary_loss_clip": 0.01418251, "auxiliary_loss_mlp": 0.01412321, "balance_loss_clip": 1.24824047, "balance_loss_mlp": 1.35059404, "epoch": 0.022425973245152563, "flos": 17865119445120.0, "grad_norm": 2.0194244935171204, "language_loss": 0.9441303, "learning_rate": 3.8126211952779548e-06, "loss": 0.97243607, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.61791992, "step": 373, "time_per_iteration": 2.9012603759765625 }, { "auxiliary_loss_clip": 0.01428781, "auxiliary_loss_mlp": 0.01416824, "balance_loss_clip": 1.25678647, "balance_loss_mlp": 1.34770656, "epoch": 0.022486096497820532, "flos": 15490438629120.0, "grad_norm": 1.9337030588875195, "language_loss": 0.95945686, "learning_rate": 3.8143450308016952e-06, "loss": 0.98791289, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.69140625, "step": 374, "time_per_iteration": 2.9126269817352295 }, { "auxiliary_loss_clip": 0.01419895, "auxiliary_loss_mlp": 0.01409538, "balance_loss_clip": 1.25123644, "balance_loss_mlp": 1.34819293, "epoch": 0.0225462197504885, "flos": 27796167204480.0, "grad_norm": 1.5101593797601496, "language_loss": 0.92323256, "learning_rate": 3.8160642632878525e-06, "loss": 0.95152688, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.61328125, "step": 375, "time_per_iteration": 3.106255054473877 }, { "auxiliary_loss_clip": 0.01423486, "auxiliary_loss_mlp": 0.01388757, "balance_loss_clip": 1.25807095, "balance_loss_mlp": 1.34092987, "epoch": 0.02260634300315647, "flos": 19985288405760.0, "grad_norm": 1.9035116013405038, "language_loss": 0.98437035, "learning_rate": 3.817778917253314e-06, "loss": 1.01249278, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.4777832, "step": 376, "time_per_iteration": 3.031017780303955 }, { "auxiliary_loss_clip": 0.01428907, "auxiliary_loss_mlp": 0.01398933, "balance_loss_clip": 1.26043415, "balance_loss_mlp": 1.33966231, "epoch": 0.02266646625582444, "flos": 16035187749120.0, "grad_norm": 2.487692859361529, "language_loss": 0.93319046, "learning_rate": 3.8194890170196155e-06, "loss": 0.96146894, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.59228516, "step": 377, "time_per_iteration": 2.947862148284912 }, { "auxiliary_loss_clip": 0.01424874, "auxiliary_loss_mlp": 0.01390467, "balance_loss_clip": 1.26091862, "balance_loss_mlp": 1.34213948, "epoch": 0.02272658950849241, "flos": 20412274014720.0, "grad_norm": 1.7820099384838644, "language_loss": 1.08027172, "learning_rate": 3.8211945867150055e-06, "loss": 1.10842514, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.48339844, "step": 378, "time_per_iteration": 2.962033987045288 }, { "auxiliary_loss_clip": 0.01415652, "auxiliary_loss_mlp": 0.01397076, "balance_loss_clip": 1.27318418, "balance_loss_mlp": 1.36598623, "epoch": 0.02278671276116038, "flos": 69878452199040.0, "grad_norm": 1.0108386916639804, "language_loss": 0.75607866, "learning_rate": 3.822895650276492e-06, "loss": 0.78420597, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.31054688, "step": 379, "time_per_iteration": 3.4685165882110596 }, { "auxiliary_loss_clip": 0.0142333, "auxiliary_loss_mlp": 0.01417852, "balance_loss_clip": 1.25590694, "balance_loss_mlp": 1.35641193, "epoch": 0.022846836013828347, "flos": 38522992089600.0, "grad_norm": 1.929103013988106, "language_loss": 0.92802167, "learning_rate": 3.824592231451859e-06, "loss": 0.95643353, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.61450195, "step": 380, "time_per_iteration": 3.1631617546081543 }, { "auxiliary_loss_clip": 0.01418127, "auxiliary_loss_mlp": 0.01417501, "balance_loss_clip": 1.25637436, "balance_loss_mlp": 1.36984134, "epoch": 0.02290695926649632, "flos": 20969239230720.0, "grad_norm": 1.7691760748065462, "language_loss": 1.0748477, "learning_rate": 3.826284353801652e-06, "loss": 1.10320401, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.47680664, "step": 381, "time_per_iteration": 3.0069222450256348 }, { "auxiliary_loss_clip": 0.01420619, "auxiliary_loss_mlp": 0.01423419, "balance_loss_clip": 1.25696027, "balance_loss_mlp": 1.36169195, "epoch": 0.022967082519164288, "flos": 24032475699840.0, "grad_norm": 1.7801391124094386, "language_loss": 0.9878068, "learning_rate": 3.827972040701142e-06, "loss": 1.01624715, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.61791992, "step": 382, "time_per_iteration": 3.013899087905884 }, { "auxiliary_loss_clip": 0.01410668, "auxiliary_loss_mlp": 0.01395707, "balance_loss_clip": 1.24989867, "balance_loss_mlp": 1.34382701, "epoch": 0.023027205771832256, "flos": 21007226880000.0, "grad_norm": 1.5384667393928166, "language_loss": 0.96763223, "learning_rate": 3.829655315342268e-06, "loss": 0.99569595, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.51904297, "step": 383, "time_per_iteration": 2.913273572921753 }, { "auxiliary_loss_clip": 0.01410722, "auxiliary_loss_mlp": 0.01392522, "balance_loss_clip": 1.24868751, "balance_loss_mlp": 1.33923507, "epoch": 0.023087329024500225, "flos": 21370679510400.0, "grad_norm": 1.6365186973659018, "language_loss": 0.96944022, "learning_rate": 3.831334200735543e-06, "loss": 0.9974727, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.53271484, "step": 384, "time_per_iteration": 2.9126317501068115 }, { "auxiliary_loss_clip": 0.01408058, "auxiliary_loss_mlp": 0.01405683, "balance_loss_clip": 1.24840367, "balance_loss_mlp": 1.36012149, "epoch": 0.023147452277168194, "flos": 21882644133120.0, "grad_norm": 1.962256499625677, "language_loss": 0.97008049, "learning_rate": 3.8330087197119426e-06, "loss": 0.998218, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.45581055, "step": 385, "time_per_iteration": 2.906268835067749 }, { "auxiliary_loss_clip": 0.01411438, "auxiliary_loss_mlp": 0.01425657, "balance_loss_clip": 1.25042713, "balance_loss_mlp": 1.37039185, "epoch": 0.023207575529836166, "flos": 18926040954240.0, "grad_norm": 1.5169892406388585, "language_loss": 0.76595026, "learning_rate": 3.83467889492477e-06, "loss": 0.79432124, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.55273438, "step": 386, "time_per_iteration": 2.958756923675537 }, { "auxiliary_loss_clip": 0.01414276, "auxiliary_loss_mlp": 0.01446646, "balance_loss_clip": 1.24932146, "balance_loss_mlp": 1.36954165, "epoch": 0.023267698782504134, "flos": 25056857393280.0, "grad_norm": 1.6164475051241824, "language_loss": 0.96341622, "learning_rate": 3.836344748851495e-06, "loss": 0.99202549, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.77050781, "step": 387, "time_per_iteration": 3.0218093395233154 }, { "auxiliary_loss_clip": 0.01417861, "auxiliary_loss_mlp": 0.01397459, "balance_loss_clip": 1.25201678, "balance_loss_mlp": 1.34610343, "epoch": 0.023327822035172103, "flos": 28891637758080.0, "grad_norm": 1.5233004525436982, "language_loss": 0.94490337, "learning_rate": 3.838006303795566e-06, "loss": 0.9730565, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.51318359, "step": 388, "time_per_iteration": 3.0295302867889404 }, { "auxiliary_loss_clip": 0.01413909, "auxiliary_loss_mlp": 0.01388534, "balance_loss_clip": 1.25124776, "balance_loss_mlp": 1.34168434, "epoch": 0.02338794528784007, "flos": 27131844781440.0, "grad_norm": 1.8298004973904598, "language_loss": 1.06572104, "learning_rate": 3.839663581888206e-06, "loss": 1.09374535, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.46875, "step": 389, "time_per_iteration": 2.9822375774383545 }, { "auxiliary_loss_clip": 0.01416686, "auxiliary_loss_mlp": 0.01390751, "balance_loss_clip": 1.2534982, "balance_loss_mlp": 1.34297156, "epoch": 0.02344806854050804, "flos": 21331696475520.0, "grad_norm": 1.6355058499117958, "language_loss": 0.97054803, "learning_rate": 3.841316605090178e-06, "loss": 0.99862236, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.47827148, "step": 390, "time_per_iteration": 3.0225462913513184 }, { "auxiliary_loss_clip": 0.01417435, "auxiliary_loss_mlp": 0.01394344, "balance_loss_clip": 1.25555551, "balance_loss_mlp": 1.34859204, "epoch": 0.023508191793176012, "flos": 24800626235520.0, "grad_norm": 1.8998978110571239, "language_loss": 1.04456878, "learning_rate": 3.842965395193529e-06, "loss": 1.07268655, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.45776367, "step": 391, "time_per_iteration": 3.075310468673706 }, { "auxiliary_loss_clip": 0.01424582, "auxiliary_loss_mlp": 0.01397412, "balance_loss_clip": 1.25918221, "balance_loss_mlp": 1.34455442, "epoch": 0.02356831504584398, "flos": 26006666376960.0, "grad_norm": 1.6777613334588555, "language_loss": 0.9790647, "learning_rate": 3.84460997382332e-06, "loss": 1.00728464, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.52832031, "step": 392, "time_per_iteration": 3.035109519958496 }, { "auxiliary_loss_clip": 0.01420273, "auxiliary_loss_mlp": 0.0139555, "balance_loss_clip": 1.25692046, "balance_loss_mlp": 1.34574389, "epoch": 0.02362843829851195, "flos": 19071793013760.0, "grad_norm": 1.6557398565930521, "language_loss": 0.97907925, "learning_rate": 3.8462503624393256e-06, "loss": 1.00723743, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.49780273, "step": 393, "time_per_iteration": 2.9025425910949707 }, { "auxiliary_loss_clip": 0.01417072, "auxiliary_loss_mlp": 0.01389342, "balance_loss_clip": 1.25435269, "balance_loss_mlp": 1.34072864, "epoch": 0.023688561551179918, "flos": 16079690649600.0, "grad_norm": 1.5429131146218331, "language_loss": 0.89719033, "learning_rate": 3.84788658233771e-06, "loss": 0.92525446, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.48632812, "step": 394, "time_per_iteration": 2.9892895221710205 }, { "auxiliary_loss_clip": 0.01414507, "auxiliary_loss_mlp": 0.01406078, "balance_loss_clip": 1.25088549, "balance_loss_mlp": 1.34158587, "epoch": 0.023748684803847887, "flos": 21733996406400.0, "grad_norm": 1.4948832172421616, "language_loss": 0.93770057, "learning_rate": 3.84951865465269e-06, "loss": 0.96590644, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.64428711, "step": 395, "time_per_iteration": 2.9781875610351562 }, { "auxiliary_loss_clip": 0.01374215, "auxiliary_loss_mlp": 0.01363218, "balance_loss_clip": 1.24009013, "balance_loss_mlp": 1.33966208, "epoch": 0.02380880805651586, "flos": 61954289124480.0, "grad_norm": 0.9497876900872289, "language_loss": 0.64010942, "learning_rate": 3.851146600358172e-06, "loss": 0.66748369, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.23535156, "step": 396, "time_per_iteration": 3.201403856277466 }, { "auxiliary_loss_clip": 0.01408417, "auxiliary_loss_mlp": 0.01391393, "balance_loss_clip": 1.24388313, "balance_loss_mlp": 1.34204006, "epoch": 0.023868931309183827, "flos": 20275932873600.0, "grad_norm": 1.9052428541560735, "language_loss": 0.9938761, "learning_rate": 3.852770440269372e-06, "loss": 1.02187419, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.49365234, "step": 397, "time_per_iteration": 2.9417977333068848 }, { "auxiliary_loss_clip": 0.01409678, "auxiliary_loss_mlp": 0.01392491, "balance_loss_clip": 1.24542904, "balance_loss_mlp": 1.34537947, "epoch": 0.023929054561851796, "flos": 21148182990720.0, "grad_norm": 1.7170932428235586, "language_loss": 1.0027554, "learning_rate": 3.854390195044404e-06, "loss": 1.0307771, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.47119141, "step": 398, "time_per_iteration": 4.419039726257324 }, { "auxiliary_loss_clip": 0.01404018, "auxiliary_loss_mlp": 0.01420851, "balance_loss_clip": 1.23968101, "balance_loss_mlp": 1.34408045, "epoch": 0.023989177814519765, "flos": 13706321932800.0, "grad_norm": 2.0404837169316954, "language_loss": 1.07362795, "learning_rate": 3.856005885185868e-06, "loss": 1.10187662, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.76708984, "step": 399, "time_per_iteration": 2.8647799491882324 }, { "auxiliary_loss_clip": 0.01402744, "auxiliary_loss_mlp": 0.0138984, "balance_loss_clip": 1.24073505, "balance_loss_mlp": 1.34396887, "epoch": 0.024049301067187733, "flos": 26332945764480.0, "grad_norm": 1.6443111007752744, "language_loss": 0.93580818, "learning_rate": 3.857617531042398e-06, "loss": 0.96373403, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.45874023, "step": 400, "time_per_iteration": 4.4383416175842285 }, { "auxiliary_loss_clip": 0.01408905, "auxiliary_loss_mlp": 0.01395649, "balance_loss_clip": 1.24383187, "balance_loss_mlp": 1.34152746, "epoch": 0.024109424319855705, "flos": 24436042485120.0, "grad_norm": 1.5000425070600032, "language_loss": 0.8928504, "learning_rate": 3.8592251528102065e-06, "loss": 0.92089593, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.54125977, "step": 401, "time_per_iteration": 5.888904333114624 }, { "auxiliary_loss_clip": 0.01403965, "auxiliary_loss_mlp": 0.01394754, "balance_loss_clip": 1.24218559, "balance_loss_mlp": 1.3442812, "epoch": 0.024169547572523674, "flos": 29615828330880.0, "grad_norm": 1.6399373181979282, "language_loss": 0.90155423, "learning_rate": 3.8608287705345976e-06, "loss": 0.92954147, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.50488281, "step": 402, "time_per_iteration": 3.050919532775879 }, { "auxiliary_loss_clip": 0.01409755, "auxiliary_loss_mlp": 0.01393463, "balance_loss_clip": 1.24203622, "balance_loss_mlp": 1.34112978, "epoch": 0.024229670825191642, "flos": 22611675899520.0, "grad_norm": 2.0705636146900686, "language_loss": 1.08263421, "learning_rate": 3.86242840411147e-06, "loss": 1.11066628, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.52270508, "step": 403, "time_per_iteration": 2.9243438243865967 }, { "auxiliary_loss_clip": 0.01399575, "auxiliary_loss_mlp": 0.01398846, "balance_loss_clip": 1.23578787, "balance_loss_mlp": 1.34207821, "epoch": 0.02428979407785961, "flos": 18159338252160.0, "grad_norm": 1.8548026670563853, "language_loss": 1.1123991, "learning_rate": 3.864024073288798e-06, "loss": 1.14038324, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.56787109, "step": 404, "time_per_iteration": 2.909031629562378 }, { "auxiliary_loss_clip": 0.01400673, "auxiliary_loss_mlp": 0.01396473, "balance_loss_clip": 1.23807812, "balance_loss_mlp": 1.34602344, "epoch": 0.024349917330527583, "flos": 15313485640320.0, "grad_norm": 1.6281389854642492, "language_loss": 1.00279319, "learning_rate": 3.865615797668091e-06, "loss": 1.03076482, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.50439453, "step": 405, "time_per_iteration": 2.8969004154205322 }, { "auxiliary_loss_clip": 0.01400338, "auxiliary_loss_mlp": 0.01389689, "balance_loss_clip": 1.23801303, "balance_loss_mlp": 1.33988297, "epoch": 0.024410040583195552, "flos": 20782784833920.0, "grad_norm": 1.7097613299573007, "language_loss": 1.04452968, "learning_rate": 3.867203596705844e-06, "loss": 1.0724299, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.49853516, "step": 406, "time_per_iteration": 2.934605360031128 }, { "auxiliary_loss_clip": 0.01398272, "auxiliary_loss_mlp": 0.01386491, "balance_loss_clip": 1.23508644, "balance_loss_mlp": 1.33930767, "epoch": 0.02447016383586352, "flos": 21808885829760.0, "grad_norm": 1.4899744765643543, "language_loss": 0.97820926, "learning_rate": 3.86878748971496e-06, "loss": 1.00605679, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.47167969, "step": 407, "time_per_iteration": 2.9809410572052 }, { "auxiliary_loss_clip": 0.01393676, "auxiliary_loss_mlp": 0.0140858, "balance_loss_clip": 1.2330811, "balance_loss_mlp": 1.34523225, "epoch": 0.02453028708853149, "flos": 33961170729600.0, "grad_norm": 1.3913886376998021, "language_loss": 0.82262158, "learning_rate": 3.8703674958661596e-06, "loss": 0.85064411, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.63305664, "step": 408, "time_per_iteration": 3.0335466861724854 }, { "auxiliary_loss_clip": 0.0139531, "auxiliary_loss_mlp": 0.01388018, "balance_loss_clip": 1.23351383, "balance_loss_mlp": 1.34176445, "epoch": 0.024590410341199458, "flos": 21801510927360.0, "grad_norm": 3.086162629915322, "language_loss": 1.06479311, "learning_rate": 3.871943634189376e-06, "loss": 1.09262645, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.46289062, "step": 409, "time_per_iteration": 2.96094012260437 }, { "auxiliary_loss_clip": 0.01402858, "auxiliary_loss_mlp": 0.01397965, "balance_loss_clip": 1.23773444, "balance_loss_mlp": 1.34656167, "epoch": 0.02465053359386743, "flos": 35127911122560.0, "grad_norm": 1.7839932470057396, "language_loss": 0.93995953, "learning_rate": 3.873515923575128e-06, "loss": 0.96796781, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.51391602, "step": 410, "time_per_iteration": 3.063957452774048 }, { "auxiliary_loss_clip": 0.01393456, "auxiliary_loss_mlp": 0.01409216, "balance_loss_clip": 1.23241997, "balance_loss_mlp": 1.34956384, "epoch": 0.0247106568465354, "flos": 27462196200960.0, "grad_norm": 1.9660835906450607, "language_loss": 0.91820765, "learning_rate": 3.875084382775879e-06, "loss": 0.94623435, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.59643555, "step": 411, "time_per_iteration": 3.0753536224365234 }, { "auxiliary_loss_clip": 0.01395271, "auxiliary_loss_mlp": 0.01407578, "balance_loss_clip": 1.23251987, "balance_loss_mlp": 1.35293245, "epoch": 0.024770780099203367, "flos": 20713415276160.0, "grad_norm": 1.8564675148160124, "language_loss": 0.98956716, "learning_rate": 3.87664903040738e-06, "loss": 1.01759565, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.54711914, "step": 412, "time_per_iteration": 2.9543721675872803 }, { "auxiliary_loss_clip": 0.01367631, "auxiliary_loss_mlp": 0.01409191, "balance_loss_clip": 1.23895931, "balance_loss_mlp": 1.38096189, "epoch": 0.024830903351871336, "flos": 69581971152000.0, "grad_norm": 0.8686447461050046, "language_loss": 0.5884645, "learning_rate": 3.878209884949994e-06, "loss": 0.61623269, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.28320312, "step": 413, "time_per_iteration": 3.42899751663208 }, { "auxiliary_loss_clip": 0.01393279, "auxiliary_loss_mlp": 0.01409888, "balance_loss_clip": 1.23052263, "balance_loss_mlp": 1.34217715, "epoch": 0.024891026604539304, "flos": 32283280120320.0, "grad_norm": 1.3623774937065019, "language_loss": 0.88302422, "learning_rate": 3.879766964750006e-06, "loss": 0.91105592, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.67749023, "step": 414, "time_per_iteration": 3.034453868865967 }, { "auxiliary_loss_clip": 0.01390613, "auxiliary_loss_mlp": 0.01382706, "balance_loss_clip": 1.23197889, "balance_loss_mlp": 1.34036326, "epoch": 0.024951149857207276, "flos": 18848708311680.0, "grad_norm": 1.6416095718939654, "language_loss": 0.91087341, "learning_rate": 3.881320288020917e-06, "loss": 0.93860662, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.42333984, "step": 415, "time_per_iteration": 2.92095947265625 }, { "auxiliary_loss_clip": 0.01402184, "auxiliary_loss_mlp": 0.01395512, "balance_loss_clip": 1.23761499, "balance_loss_mlp": 1.34439492, "epoch": 0.025011273109875245, "flos": 15385524641280.0, "grad_norm": 1.915516112430771, "language_loss": 1.17814624, "learning_rate": 3.882869872844723e-06, "loss": 1.20612323, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.51123047, "step": 416, "time_per_iteration": 2.9286961555480957 }, { "auxiliary_loss_clip": 0.01396875, "auxiliary_loss_mlp": 0.01417205, "balance_loss_clip": 1.23453426, "balance_loss_mlp": 1.35264099, "epoch": 0.025071396362543213, "flos": 18924547875840.0, "grad_norm": 1.3825791710074287, "language_loss": 0.87238657, "learning_rate": 3.884415737173176e-06, "loss": 0.90052736, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.64599609, "step": 417, "time_per_iteration": 3.029409646987915 }, { "auxiliary_loss_clip": 0.01397121, "auxiliary_loss_mlp": 0.01393547, "balance_loss_clip": 1.2375468, "balance_loss_mlp": 1.34958267, "epoch": 0.025131519615211182, "flos": 25348904449920.0, "grad_norm": 1.5118812616867696, "language_loss": 0.8642205, "learning_rate": 3.8859578988290344e-06, "loss": 0.89212716, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.43969727, "step": 418, "time_per_iteration": 2.9942479133605957 }, { "auxiliary_loss_clip": 0.01401634, "auxiliary_loss_mlp": 0.01392822, "balance_loss_clip": 1.24175024, "balance_loss_mlp": 1.34382665, "epoch": 0.02519164286787915, "flos": 18962490280320.0, "grad_norm": 1.9549665690522424, "language_loss": 1.01676965, "learning_rate": 3.887496375507294e-06, "loss": 1.04471409, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.49023438, "step": 419, "time_per_iteration": 3.0266122817993164 }, { "auxiliary_loss_clip": 0.01398229, "auxiliary_loss_mlp": 0.01387285, "balance_loss_clip": 1.24019802, "balance_loss_mlp": 1.33995903, "epoch": 0.025251766120547123, "flos": 17430125506560.0, "grad_norm": 1.4827028826682658, "language_loss": 0.84841162, "learning_rate": 3.8890311847764065e-06, "loss": 0.87626672, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.47314453, "step": 420, "time_per_iteration": 2.925823926925659 }, { "auxiliary_loss_clip": 0.01403063, "auxiliary_loss_mlp": 0.01389467, "balance_loss_clip": 1.24438667, "balance_loss_mlp": 1.33997166, "epoch": 0.02531188937321509, "flos": 25056133476480.0, "grad_norm": 1.5284838003167749, "language_loss": 0.88954908, "learning_rate": 3.890562344079484e-06, "loss": 0.91747439, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.49487305, "step": 421, "time_per_iteration": 2.99147367477417 }, { "auxiliary_loss_clip": 0.01404969, "auxiliary_loss_mlp": 0.01412558, "balance_loss_clip": 1.24627173, "balance_loss_mlp": 1.34668326, "epoch": 0.02537201262588306, "flos": 30604620349440.0, "grad_norm": 1.6931249223096987, "language_loss": 0.97400844, "learning_rate": 3.89208987073549e-06, "loss": 1.00218368, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.65893555, "step": 422, "time_per_iteration": 3.0301687717437744 }, { "auxiliary_loss_clip": 0.01404636, "auxiliary_loss_mlp": 0.01395875, "balance_loss_clip": 1.24663889, "balance_loss_mlp": 1.35062301, "epoch": 0.02543213587855103, "flos": 26075900200320.0, "grad_norm": 1.4579814032477771, "language_loss": 0.93270713, "learning_rate": 3.893613781940409e-06, "loss": 0.96071231, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.45263672, "step": 423, "time_per_iteration": 2.9439830780029297 }, { "auxiliary_loss_clip": 0.01401004, "auxiliary_loss_mlp": 0.01395924, "balance_loss_clip": 1.24583828, "balance_loss_mlp": 1.34914601, "epoch": 0.025492259131218997, "flos": 36035117487360.0, "grad_norm": 1.473053770949093, "language_loss": 0.84502828, "learning_rate": 3.895134094768415e-06, "loss": 0.87299764, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.46728516, "step": 424, "time_per_iteration": 3.1030938625335693 }, { "auxiliary_loss_clip": 0.01398381, "auxiliary_loss_mlp": 0.01398871, "balance_loss_clip": 1.24246073, "balance_loss_mlp": 1.35521662, "epoch": 0.02555238238388697, "flos": 18597227857920.0, "grad_norm": 1.7119743807831254, "language_loss": 0.97581804, "learning_rate": 3.896650826173015e-06, "loss": 1.00379062, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.43652344, "step": 425, "time_per_iteration": 2.9173872470855713 }, { "auxiliary_loss_clip": 0.01401549, "auxiliary_loss_mlp": 0.01394044, "balance_loss_clip": 1.24450099, "balance_loss_mlp": 1.35215354, "epoch": 0.025612505636554938, "flos": 24253795854720.0, "grad_norm": 1.8073242196451935, "language_loss": 0.99149489, "learning_rate": 3.898163992988186e-06, "loss": 1.01945078, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.41894531, "step": 426, "time_per_iteration": 2.956328868865967 }, { "auxiliary_loss_clip": 0.01364835, "auxiliary_loss_mlp": 0.0135326, "balance_loss_clip": 1.24177933, "balance_loss_mlp": 1.33027601, "epoch": 0.025672628889222907, "flos": 60617472952320.0, "grad_norm": 0.8757884556402402, "language_loss": 0.57315564, "learning_rate": 3.899673611929491e-06, "loss": 0.60033655, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.22949219, "step": 427, "time_per_iteration": 3.5057671070098877 }, { "auxiliary_loss_clip": 0.01395341, "auxiliary_loss_mlp": 0.0138328, "balance_loss_clip": 1.24001908, "balance_loss_mlp": 1.34282017, "epoch": 0.025732752141890875, "flos": 19582717006080.0, "grad_norm": 1.8159926443199037, "language_loss": 1.01561594, "learning_rate": 3.901179699595194e-06, "loss": 1.04340219, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.40429688, "step": 428, "time_per_iteration": 2.901517391204834 }, { "auxiliary_loss_clip": 0.01392683, "auxiliary_loss_mlp": 0.01386575, "balance_loss_clip": 1.23747241, "balance_loss_mlp": 1.34106064, "epoch": 0.025792875394558847, "flos": 31296524117760.0, "grad_norm": 1.385685399917727, "language_loss": 0.92941296, "learning_rate": 3.902682272467353e-06, "loss": 0.95720553, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.45507812, "step": 429, "time_per_iteration": 3.0726804733276367 }, { "auxiliary_loss_clip": 0.01397484, "auxiliary_loss_mlp": 0.0138321, "balance_loss_clip": 1.23908448, "balance_loss_mlp": 1.34217799, "epoch": 0.025852998647226816, "flos": 32392356629760.0, "grad_norm": 1.6523392557458885, "language_loss": 0.96333861, "learning_rate": 3.904181346912895e-06, "loss": 0.99114561, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.41040039, "step": 430, "time_per_iteration": 3.0614352226257324 }, { "auxiliary_loss_clip": 0.01404627, "auxiliary_loss_mlp": 0.0139908, "balance_loss_clip": 1.24561357, "balance_loss_mlp": 1.34443426, "epoch": 0.025913121899894784, "flos": 20202853242240.0, "grad_norm": 1.2954869332652592, "language_loss": 0.93149275, "learning_rate": 3.905676939184698e-06, "loss": 0.95952982, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.54711914, "step": 431, "time_per_iteration": 2.9164254665374756 }, { "auxiliary_loss_clip": 0.01399544, "auxiliary_loss_mlp": 0.0138687, "balance_loss_clip": 1.24134183, "balance_loss_mlp": 1.34173787, "epoch": 0.025973245152562753, "flos": 14728577120640.0, "grad_norm": 1.7946940387086892, "language_loss": 1.02198172, "learning_rate": 3.907169065422638e-06, "loss": 1.04984593, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.45166016, "step": 432, "time_per_iteration": 2.9389102458953857 }, { "auxiliary_loss_clip": 0.013944, "auxiliary_loss_mlp": 0.01380363, "balance_loss_clip": 1.23937201, "balance_loss_mlp": 1.34104741, "epoch": 0.02603336840523072, "flos": 31005336712320.0, "grad_norm": 1.5179713286549281, "language_loss": 0.88159579, "learning_rate": 3.908657741654636e-06, "loss": 0.90934348, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.39331055, "step": 433, "time_per_iteration": 4.500396013259888 }, { "auxiliary_loss_clip": 0.0139473, "auxiliary_loss_mlp": 0.01393043, "balance_loss_clip": 1.23726118, "balance_loss_mlp": 1.34073412, "epoch": 0.026093491657898694, "flos": 17682782325120.0, "grad_norm": 1.720657523640355, "language_loss": 1.00807548, "learning_rate": 3.910142983797699e-06, "loss": 1.03595328, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.52294922, "step": 434, "time_per_iteration": 2.913849353790283 }, { "auxiliary_loss_clip": 0.01397763, "auxiliary_loss_mlp": 0.01390417, "balance_loss_clip": 1.23954082, "balance_loss_mlp": 1.34719181, "epoch": 0.026153614910566662, "flos": 17866748257920.0, "grad_norm": 1.7119908151992933, "language_loss": 0.94537556, "learning_rate": 3.9116248076589305e-06, "loss": 0.9732573, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.43212891, "step": 435, "time_per_iteration": 5.990097761154175 }, { "auxiliary_loss_clip": 0.01391564, "auxiliary_loss_mlp": 0.01390757, "balance_loss_clip": 1.23408782, "balance_loss_mlp": 1.34517109, "epoch": 0.02621373816323463, "flos": 20020697101440.0, "grad_norm": 1.7765572630960804, "language_loss": 1.00634158, "learning_rate": 3.913103228936546e-06, "loss": 1.03416479, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.45556641, "step": 436, "time_per_iteration": 4.30284571647644 }, { "auxiliary_loss_clip": 0.01386504, "auxiliary_loss_mlp": 0.01390833, "balance_loss_clip": 1.23267043, "balance_loss_mlp": 1.34429348, "epoch": 0.0262738614159026, "flos": 19290534215040.0, "grad_norm": 1.7667629061327987, "language_loss": 0.90416002, "learning_rate": 3.914578263220868e-06, "loss": 0.93193334, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.46533203, "step": 437, "time_per_iteration": 2.9344027042388916 }, { "auxiliary_loss_clip": 0.01392165, "auxiliary_loss_mlp": 0.01395532, "balance_loss_clip": 1.23491108, "balance_loss_mlp": 1.34195948, "epoch": 0.026333984668570568, "flos": 18816511996800.0, "grad_norm": 1.871606671805799, "language_loss": 1.0253365, "learning_rate": 3.916049925995316e-06, "loss": 1.05321348, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.53540039, "step": 438, "time_per_iteration": 2.9069290161132812 }, { "auxiliary_loss_clip": 0.01341885, "auxiliary_loss_mlp": 0.01368823, "balance_loss_clip": 1.21738458, "balance_loss_mlp": 1.35127509, "epoch": 0.02639410792123854, "flos": 64605696992640.0, "grad_norm": 0.8881325415306814, "language_loss": 0.62714016, "learning_rate": 3.917518232637377e-06, "loss": 0.65424728, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.17578125, "step": 439, "time_per_iteration": 3.4116811752319336 }, { "auxiliary_loss_clip": 0.0139053, "auxiliary_loss_mlp": 0.0139002, "balance_loss_clip": 1.23565936, "balance_loss_mlp": 1.34619856, "epoch": 0.02645423117390651, "flos": 28484632368000.0, "grad_norm": 1.6358271261305932, "language_loss": 0.87253404, "learning_rate": 3.918983198419573e-06, "loss": 0.90033948, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.43823242, "step": 440, "time_per_iteration": 3.0350475311279297 }, { "auxiliary_loss_clip": 0.01391654, "auxiliary_loss_mlp": 0.01393591, "balance_loss_clip": 1.23760378, "balance_loss_mlp": 1.35191536, "epoch": 0.026514354426574478, "flos": 18560326083840.0, "grad_norm": 1.545352241203962, "language_loss": 0.94404054, "learning_rate": 3.920444838510415e-06, "loss": 0.97189295, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.41674805, "step": 441, "time_per_iteration": 2.8807730674743652 }, { "auxiliary_loss_clip": 0.01395765, "auxiliary_loss_mlp": 0.01409837, "balance_loss_clip": 1.24056935, "balance_loss_mlp": 1.36437011, "epoch": 0.026574477679242446, "flos": 20677554132480.0, "grad_norm": 1.6466820553337695, "language_loss": 0.89875805, "learning_rate": 3.92190316797534e-06, "loss": 0.92681408, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.45507812, "step": 442, "time_per_iteration": 2.946699619293213 }, { "auxiliary_loss_clip": 0.01330035, "auxiliary_loss_mlp": 0.01364959, "balance_loss_clip": 1.20807445, "balance_loss_mlp": 1.34769762, "epoch": 0.026634600931910415, "flos": 57984977410560.0, "grad_norm": 0.961870205521472, "language_loss": 0.64679015, "learning_rate": 3.92335820177765e-06, "loss": 0.67374015, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.17285156, "step": 443, "time_per_iteration": 3.2001097202301025 }, { "auxiliary_loss_clip": 0.01393847, "auxiliary_loss_mlp": 0.01413296, "balance_loss_clip": 1.23834443, "balance_loss_mlp": 1.37104845, "epoch": 0.026694724184578387, "flos": 15823595226240.0, "grad_norm": 1.696066373000528, "language_loss": 0.98958337, "learning_rate": 3.924809954779425e-06, "loss": 1.01765478, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.42236328, "step": 444, "time_per_iteration": 2.8932900428771973 }, { "auxiliary_loss_clip": 0.01399802, "auxiliary_loss_mlp": 0.01415403, "balance_loss_clip": 1.24138057, "balance_loss_mlp": 1.36862528, "epoch": 0.026754847437246355, "flos": 23450598581760.0, "grad_norm": 1.891215762411875, "language_loss": 1.05967486, "learning_rate": 3.9262584417424425e-06, "loss": 1.08782685, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.46801758, "step": 445, "time_per_iteration": 2.998979091644287 }, { "auxiliary_loss_clip": 0.01395505, "auxiliary_loss_mlp": 0.01414267, "balance_loss_clip": 1.23962259, "balance_loss_mlp": 1.36517704, "epoch": 0.026814970689914324, "flos": 17349173280000.0, "grad_norm": 1.7231729153345738, "language_loss": 1.06627774, "learning_rate": 3.9277036773290725e-06, "loss": 1.09437537, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.49121094, "step": 446, "time_per_iteration": 2.9196202754974365 }, { "auxiliary_loss_clip": 0.01396812, "auxiliary_loss_mlp": 0.01418474, "balance_loss_clip": 1.2432183, "balance_loss_mlp": 1.37448537, "epoch": 0.026875093942582293, "flos": 17903695276800.0, "grad_norm": 1.60495784845932, "language_loss": 0.90625036, "learning_rate": 3.92914567610317e-06, "loss": 0.93440318, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.43994141, "step": 447, "time_per_iteration": 3.000779390335083 }, { "auxiliary_loss_clip": 0.01402573, "auxiliary_loss_mlp": 0.01407204, "balance_loss_clip": 1.24788427, "balance_loss_mlp": 1.3569926, "epoch": 0.026935217195250265, "flos": 21733498713600.0, "grad_norm": 1.6459954513717325, "language_loss": 0.97970629, "learning_rate": 3.930584452530952e-06, "loss": 1.00780404, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.50244141, "step": 448, "time_per_iteration": 2.9637033939361572 }, { "auxiliary_loss_clip": 0.01407288, "auxiliary_loss_mlp": 0.01387741, "balance_loss_clip": 1.25114012, "balance_loss_mlp": 1.34592271, "epoch": 0.026995340447918233, "flos": 23633071436160.0, "grad_norm": 1.9923106691001402, "language_loss": 0.97877586, "learning_rate": 3.9320200209818755e-06, "loss": 1.00672626, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.41845703, "step": 449, "time_per_iteration": 2.9294071197509766 }, { "auxiliary_loss_clip": 0.01422271, "auxiliary_loss_mlp": 0.0140812, "balance_loss_clip": 1.26023197, "balance_loss_mlp": 1.34660769, "epoch": 0.027055463700586202, "flos": 17940280337280.0, "grad_norm": 1.6448201583312032, "language_loss": 0.95780754, "learning_rate": 3.933452395729493e-06, "loss": 0.9861114, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.61572266, "step": 450, "time_per_iteration": 2.943301200866699 }, { "auxiliary_loss_clip": 0.01420491, "auxiliary_loss_mlp": 0.0140163, "balance_loss_clip": 1.26398015, "balance_loss_mlp": 1.35666466, "epoch": 0.02711558695325417, "flos": 25129756045440.0, "grad_norm": 1.3423579669924286, "language_loss": 0.87928998, "learning_rate": 3.934881590952304e-06, "loss": 0.90751117, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.44970703, "step": 451, "time_per_iteration": 3.01282000541687 }, { "auxiliary_loss_clip": 0.01413283, "auxiliary_loss_mlp": 0.01404288, "balance_loss_clip": 1.25681424, "balance_loss_mlp": 1.36220717, "epoch": 0.02717571020592214, "flos": 24249226129920.0, "grad_norm": 1.3995550304768931, "language_loss": 0.82752109, "learning_rate": 3.936307620734599e-06, "loss": 0.8556968, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.42114258, "step": 452, "time_per_iteration": 3.0229203701019287 }, { "auxiliary_loss_clip": 0.01414978, "auxiliary_loss_mlp": 0.01409309, "balance_loss_clip": 1.25545239, "balance_loss_mlp": 1.3611964, "epoch": 0.02723583345859011, "flos": 25129801290240.0, "grad_norm": 1.3968705017557292, "language_loss": 0.79238933, "learning_rate": 3.937730499067294e-06, "loss": 0.82063222, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.48120117, "step": 453, "time_per_iteration": 3.002072334289551 }, { "auxiliary_loss_clip": 0.01400529, "auxiliary_loss_mlp": 0.01393735, "balance_loss_clip": 1.245749, "balance_loss_mlp": 1.35251212, "epoch": 0.02729595671125808, "flos": 42757040983680.0, "grad_norm": 1.5344341816424516, "language_loss": 0.93651056, "learning_rate": 3.939150239848748e-06, "loss": 0.96445322, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.41235352, "step": 454, "time_per_iteration": 3.089520215988159 }, { "auxiliary_loss_clip": 0.01395591, "auxiliary_loss_mlp": 0.01401952, "balance_loss_clip": 1.24100053, "balance_loss_mlp": 1.3535533, "epoch": 0.02735607996392605, "flos": 21440139557760.0, "grad_norm": 1.3592123467678543, "language_loss": 0.83798641, "learning_rate": 3.9405668568855866e-06, "loss": 0.86596179, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.48388672, "step": 455, "time_per_iteration": 2.975954532623291 }, { "auxiliary_loss_clip": 0.01395962, "auxiliary_loss_mlp": 0.01386229, "balance_loss_clip": 1.23839784, "balance_loss_mlp": 1.34321856, "epoch": 0.027416203216594017, "flos": 20861339086080.0, "grad_norm": 1.5838004258745126, "language_loss": 0.92867589, "learning_rate": 3.941980363893499e-06, "loss": 0.95649779, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.42993164, "step": 456, "time_per_iteration": 2.9180235862731934 }, { "auxiliary_loss_clip": 0.01395749, "auxiliary_loss_mlp": 0.01406482, "balance_loss_clip": 1.23816264, "balance_loss_mlp": 1.355937, "epoch": 0.027476326469261986, "flos": 13232254469760.0, "grad_norm": 1.4667757240477777, "language_loss": 0.9012388, "learning_rate": 3.9433907744980384e-06, "loss": 0.92926109, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.50537109, "step": 457, "time_per_iteration": 2.953564405441284 }, { "auxiliary_loss_clip": 0.01393254, "auxiliary_loss_mlp": 0.01415634, "balance_loss_clip": 1.23626959, "balance_loss_mlp": 1.36966717, "epoch": 0.027536449721929958, "flos": 24035145143040.0, "grad_norm": 1.7860379551689107, "language_loss": 1.04052508, "learning_rate": 3.944798102235412e-06, "loss": 1.06861389, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.45922852, "step": 458, "time_per_iteration": 3.1575000286102295 }, { "auxiliary_loss_clip": 0.01392861, "auxiliary_loss_mlp": 0.01424783, "balance_loss_clip": 1.23459816, "balance_loss_mlp": 1.37028027, "epoch": 0.027596572974597926, "flos": 13013875226880.0, "grad_norm": 1.7813968559654139, "language_loss": 0.95093888, "learning_rate": 3.9462023605532545e-06, "loss": 0.97911531, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.54516602, "step": 459, "time_per_iteration": 2.9447152614593506 }, { "auxiliary_loss_clip": 0.01388817, "auxiliary_loss_mlp": 0.01383823, "balance_loss_clip": 1.23493648, "balance_loss_mlp": 1.3450563, "epoch": 0.027656696227265895, "flos": 26154861655680.0, "grad_norm": 1.4659047235051597, "language_loss": 0.91302001, "learning_rate": 3.947603562811407e-06, "loss": 0.94074649, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.38745117, "step": 460, "time_per_iteration": 3.053457260131836 }, { "auxiliary_loss_clip": 0.01345289, "auxiliary_loss_mlp": 0.01352943, "balance_loss_clip": 1.22423649, "balance_loss_mlp": 1.3336786, "epoch": 0.027716819479933864, "flos": 60727997295360.0, "grad_norm": 1.5583128175533738, "language_loss": 0.73720002, "learning_rate": 3.949001722282675e-06, "loss": 0.76418233, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 1.2109375, "router_z_loss_mlp": 0.19238281, "step": 461, "time_per_iteration": 3.3921680450439453 }, { "auxiliary_loss_clip": 0.01393577, "auxiliary_loss_mlp": 0.01393772, "balance_loss_clip": 1.23783088, "balance_loss_mlp": 1.35154796, "epoch": 0.027776942732601832, "flos": 31224349382400.0, "grad_norm": 1.8583306578271923, "language_loss": 0.95236367, "learning_rate": 3.950396852153582e-06, "loss": 0.98023719, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.42236328, "step": 462, "time_per_iteration": 3.0480215549468994 }, { "auxiliary_loss_clip": 0.01385656, "auxiliary_loss_mlp": 0.0140004, "balance_loss_clip": 1.23103058, "balance_loss_mlp": 1.35893643, "epoch": 0.027837065985269804, "flos": 22684936510080.0, "grad_norm": 1.812960128268221, "language_loss": 1.04469562, "learning_rate": 3.951788965525118e-06, "loss": 1.07255244, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.41064453, "step": 463, "time_per_iteration": 3.029693841934204 }, { "auxiliary_loss_clip": 0.01342879, "auxiliary_loss_mlp": 0.01374128, "balance_loss_clip": 1.22463417, "balance_loss_mlp": 1.35209787, "epoch": 0.027897189237937773, "flos": 62210882240640.0, "grad_norm": 0.8934585770852488, "language_loss": 0.59342104, "learning_rate": 3.953178075413476e-06, "loss": 0.6205911, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.22070312, "step": 464, "time_per_iteration": 3.3332359790802 }, { "auxiliary_loss_clip": 0.01402421, "auxiliary_loss_mlp": 0.01401433, "balance_loss_clip": 1.24197197, "balance_loss_mlp": 1.35327208, "epoch": 0.02795731249060574, "flos": 24502425886080.0, "grad_norm": 1.631430914523931, "language_loss": 0.97409999, "learning_rate": 3.954564194750784e-06, "loss": 1.00213861, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.48144531, "step": 465, "time_per_iteration": 3.0281388759613037 }, { "auxiliary_loss_clip": 0.01386456, "auxiliary_loss_mlp": 0.01382408, "balance_loss_clip": 1.23317468, "balance_loss_mlp": 1.34128129, "epoch": 0.02801743574327371, "flos": 23743641024000.0, "grad_norm": 1.62046004387898, "language_loss": 0.87181485, "learning_rate": 3.955947336385828e-06, "loss": 0.89950347, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.41137695, "step": 466, "time_per_iteration": 3.0103933811187744 }, { "auxiliary_loss_clip": 0.01384214, "auxiliary_loss_mlp": 0.01385689, "balance_loss_clip": 1.23188627, "balance_loss_mlp": 1.34317899, "epoch": 0.02807755899594168, "flos": 20638254384000.0, "grad_norm": 1.592974291934706, "language_loss": 0.94272721, "learning_rate": 3.957327513084761e-06, "loss": 0.9704262, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.42504883, "step": 467, "time_per_iteration": 2.9819204807281494 }, { "auxiliary_loss_clip": 0.01391666, "auxiliary_loss_mlp": 0.01401553, "balance_loss_clip": 1.23797643, "balance_loss_mlp": 1.35858989, "epoch": 0.02813768224860965, "flos": 19253858664960.0, "grad_norm": 1.7023492181544089, "language_loss": 0.96316338, "learning_rate": 3.958704737531818e-06, "loss": 0.99109554, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.42944336, "step": 468, "time_per_iteration": 4.536728858947754 }, { "auxiliary_loss_clip": 0.0139219, "auxiliary_loss_mlp": 0.01397046, "balance_loss_clip": 1.23710251, "balance_loss_mlp": 1.35136557, "epoch": 0.02819780550127762, "flos": 20823803884800.0, "grad_norm": 1.7877427659459773, "language_loss": 1.04302728, "learning_rate": 3.9600790223300065e-06, "loss": 1.07091963, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.45678711, "step": 469, "time_per_iteration": 3.0760984420776367 }, { "auxiliary_loss_clip": 0.01386101, "auxiliary_loss_mlp": 0.01400987, "balance_loss_clip": 1.23119903, "balance_loss_mlp": 1.35416174, "epoch": 0.028257928753945588, "flos": 19983569103360.0, "grad_norm": 1.741033123569448, "language_loss": 0.98274845, "learning_rate": 3.96145038000181e-06, "loss": 1.0106194, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.46850586, "step": 470, "time_per_iteration": 6.044286251068115 }, { "auxiliary_loss_clip": 0.01392183, "auxiliary_loss_mlp": 0.01396169, "balance_loss_clip": 1.23528922, "balance_loss_mlp": 1.35284841, "epoch": 0.028318052006613557, "flos": 20493950158080.0, "grad_norm": 1.5820727714074132, "language_loss": 1.04733515, "learning_rate": 3.962818822989861e-06, "loss": 1.07521868, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.43334961, "step": 471, "time_per_iteration": 3.0550925731658936 }, { "auxiliary_loss_clip": 0.01387844, "auxiliary_loss_mlp": 0.01416427, "balance_loss_clip": 1.23063016, "balance_loss_mlp": 1.36204338, "epoch": 0.02837817525928153, "flos": 28527280231680.0, "grad_norm": 1.5595663105136184, "language_loss": 0.86103892, "learning_rate": 3.964184363657625e-06, "loss": 0.8890816, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.54418945, "step": 472, "time_per_iteration": 3.018585205078125 }, { "auxiliary_loss_clip": 0.01388575, "auxiliary_loss_mlp": 0.01418744, "balance_loss_clip": 1.23426294, "balance_loss_mlp": 1.3634789, "epoch": 0.028438298511949497, "flos": 18560869021440.0, "grad_norm": 1.5753504677617136, "language_loss": 1.0451405, "learning_rate": 3.965547014290071e-06, "loss": 1.07321382, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.55297852, "step": 473, "time_per_iteration": 2.959052085876465 }, { "auxiliary_loss_clip": 0.0139158, "auxiliary_loss_mlp": 0.01415794, "balance_loss_clip": 1.23660386, "balance_loss_mlp": 1.36901641, "epoch": 0.028498421764617466, "flos": 16919111024640.0, "grad_norm": 1.7869397134839766, "language_loss": 1.04671931, "learning_rate": 3.96690678709433e-06, "loss": 1.07479298, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.4675293, "step": 474, "time_per_iteration": 2.8996384143829346 }, { "auxiliary_loss_clip": 0.01385194, "auxiliary_loss_mlp": 0.0139348, "balance_loss_clip": 1.23230219, "balance_loss_mlp": 1.3457005, "epoch": 0.028558545017285435, "flos": 27789289994880.0, "grad_norm": 1.7831213893897782, "language_loss": 0.89693069, "learning_rate": 3.968263694200355e-06, "loss": 0.92471743, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.47753906, "step": 475, "time_per_iteration": 3.032871723175049 }, { "auxiliary_loss_clip": 0.01346133, "auxiliary_loss_mlp": 0.01425698, "balance_loss_clip": 1.22225761, "balance_loss_mlp": 1.39651585, "epoch": 0.028618668269953403, "flos": 65685150887040.0, "grad_norm": 0.9710452519682823, "language_loss": 0.67209744, "learning_rate": 3.969617747661569e-06, "loss": 0.69981575, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.29101562, "step": 476, "time_per_iteration": 3.3518314361572266 }, { "auxiliary_loss_clip": 0.01397256, "auxiliary_loss_mlp": 0.01419079, "balance_loss_clip": 1.23945737, "balance_loss_mlp": 1.36536384, "epoch": 0.028678791522621375, "flos": 21945136481280.0, "grad_norm": 1.7313659423111778, "language_loss": 0.98940587, "learning_rate": 3.970968959455509e-06, "loss": 1.01756918, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.53735352, "step": 477, "time_per_iteration": 2.969330310821533 }, { "auxiliary_loss_clip": 0.01391901, "auxiliary_loss_mlp": 0.01432608, "balance_loss_clip": 1.23873854, "balance_loss_mlp": 1.37786734, "epoch": 0.028738914775289344, "flos": 24582970909440.0, "grad_norm": 1.8757472518641047, "language_loss": 0.95890367, "learning_rate": 3.97231734148446e-06, "loss": 0.98714876, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.54760742, "step": 478, "time_per_iteration": 2.9957146644592285 }, { "auxiliary_loss_clip": 0.01387514, "auxiliary_loss_mlp": 0.01433185, "balance_loss_clip": 1.23384476, "balance_loss_mlp": 1.37596464, "epoch": 0.028799038027957313, "flos": 23268306706560.0, "grad_norm": 1.4470676800656015, "language_loss": 0.92238498, "learning_rate": 3.973662905576082e-06, "loss": 0.95059198, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.57226562, "step": 479, "time_per_iteration": 3.0036611557006836 }, { "auxiliary_loss_clip": 0.0138214, "auxiliary_loss_mlp": 0.01407357, "balance_loss_clip": 1.22935414, "balance_loss_mlp": 1.36384535, "epoch": 0.02885916128062528, "flos": 22174329231360.0, "grad_norm": 1.7120811404400147, "language_loss": 0.8532092, "learning_rate": 3.975005663484038e-06, "loss": 0.88110411, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.43505859, "step": 480, "time_per_iteration": 3.052868366241455 }, { "auxiliary_loss_clip": 0.0138378, "auxiliary_loss_mlp": 0.01387542, "balance_loss_clip": 1.23049438, "balance_loss_mlp": 1.34474635, "epoch": 0.02891928453329325, "flos": 22943927600640.0, "grad_norm": 1.435140853335933, "language_loss": 0.95851576, "learning_rate": 3.976345626888605e-06, "loss": 0.98622894, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.42797852, "step": 481, "time_per_iteration": 3.131495475769043 }, { "auxiliary_loss_clip": 0.01328846, "auxiliary_loss_mlp": 0.0136574, "balance_loss_clip": 1.20733905, "balance_loss_mlp": 1.34428215, "epoch": 0.028979407785961222, "flos": 57458443962240.0, "grad_norm": 0.8443228180500754, "language_loss": 0.6636076, "learning_rate": 3.9776828073972864e-06, "loss": 0.69055343, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.21484375, "step": 482, "time_per_iteration": 3.1530187129974365 }, { "auxiliary_loss_clip": 0.01387044, "auxiliary_loss_mlp": 0.01398808, "balance_loss_clip": 1.2309773, "balance_loss_mlp": 1.34986019, "epoch": 0.02903953103862919, "flos": 16730258653440.0, "grad_norm": 1.9488692586413054, "language_loss": 0.98439902, "learning_rate": 3.979017216545415e-06, "loss": 1.01225758, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.48974609, "step": 483, "time_per_iteration": 2.965873956680298 }, { "auxiliary_loss_clip": 0.01393316, "auxiliary_loss_mlp": 0.01407522, "balance_loss_clip": 1.23676753, "balance_loss_mlp": 1.36038697, "epoch": 0.02909965429129716, "flos": 16772318334720.0, "grad_norm": 1.4540944066892914, "language_loss": 0.87004936, "learning_rate": 3.980348865796749e-06, "loss": 0.8980577, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.47119141, "step": 484, "time_per_iteration": 3.045238971710205 }, { "auxiliary_loss_clip": 0.01391126, "auxiliary_loss_mlp": 0.01416725, "balance_loss_clip": 1.23546386, "balance_loss_mlp": 1.35139823, "epoch": 0.029159777543965128, "flos": 19793087919360.0, "grad_norm": 1.8418630867811154, "language_loss": 0.94000095, "learning_rate": 3.9816777665440615e-06, "loss": 0.96807945, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.65356445, "step": 485, "time_per_iteration": 3.074298143386841 }, { "auxiliary_loss_clip": 0.01386934, "auxiliary_loss_mlp": 0.01386102, "balance_loss_clip": 1.23393691, "balance_loss_mlp": 1.34340167, "epoch": 0.029219900796633096, "flos": 19651226912640.0, "grad_norm": 1.6911738387022914, "language_loss": 1.00434935, "learning_rate": 3.983003930109732e-06, "loss": 1.0320797, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.42675781, "step": 486, "time_per_iteration": 3.085508108139038 }, { "auxiliary_loss_clip": 0.01389204, "auxiliary_loss_mlp": 0.01388613, "balance_loss_clip": 1.2342732, "balance_loss_mlp": 1.34555435, "epoch": 0.02928002404930107, "flos": 25896911195520.0, "grad_norm": 1.4838180744284526, "language_loss": 0.98852217, "learning_rate": 3.984327367746315e-06, "loss": 1.01630032, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.43041992, "step": 487, "time_per_iteration": 2.9954655170440674 }, { "auxiliary_loss_clip": 0.01396275, "auxiliary_loss_mlp": 0.01394573, "balance_loss_clip": 1.24080586, "balance_loss_mlp": 1.3440758, "epoch": 0.029340147301969037, "flos": 20668007479680.0, "grad_norm": 2.1816299400909656, "language_loss": 1.05241919, "learning_rate": 3.985648090637122e-06, "loss": 1.08032775, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.50537109, "step": 488, "time_per_iteration": 2.9837381839752197 }, { "auxiliary_loss_clip": 0.01393009, "auxiliary_loss_mlp": 0.01388786, "balance_loss_clip": 1.23978245, "balance_loss_mlp": 1.34417808, "epoch": 0.029400270554637006, "flos": 24439300110720.0, "grad_norm": 1.5692332047972928, "language_loss": 0.94955957, "learning_rate": 3.986966109896785e-06, "loss": 0.97737747, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.44604492, "step": 489, "time_per_iteration": 3.0294408798217773 }, { "auxiliary_loss_clip": 0.01393313, "auxiliary_loss_mlp": 0.01393123, "balance_loss_clip": 1.23912573, "balance_loss_mlp": 1.34446192, "epoch": 0.029460393807304974, "flos": 20130588017280.0, "grad_norm": 1.4491188411076936, "language_loss": 0.95083821, "learning_rate": 3.988281436571815e-06, "loss": 0.97870255, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.48681641, "step": 490, "time_per_iteration": 2.9073665142059326 }, { "auxiliary_loss_clip": 0.01398045, "auxiliary_loss_mlp": 0.01391866, "balance_loss_clip": 1.24218082, "balance_loss_mlp": 1.34520733, "epoch": 0.029520517059972943, "flos": 17684863585920.0, "grad_norm": 1.8406277389500665, "language_loss": 1.05263448, "learning_rate": 3.989594081641164e-06, "loss": 1.0805335, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.46606445, "step": 491, "time_per_iteration": 3.0073463916778564 }, { "auxiliary_loss_clip": 0.01392238, "auxiliary_loss_mlp": 0.01392325, "balance_loss_clip": 1.23919392, "balance_loss_mlp": 1.34933782, "epoch": 0.029580640312640915, "flos": 18962535525120.0, "grad_norm": 1.5519043220247328, "language_loss": 0.93984282, "learning_rate": 3.9909040560167675e-06, "loss": 0.9676885, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.4296875, "step": 492, "time_per_iteration": 2.9352288246154785 }, { "auxiliary_loss_clip": 0.0140917, "auxiliary_loss_mlp": 0.01427459, "balance_loss_clip": 1.25452554, "balance_loss_mlp": 1.35343051, "epoch": 0.029640763565308884, "flos": 18733297530240.0, "grad_norm": 1.9499404470537487, "language_loss": 0.99370396, "learning_rate": 3.992211370544093e-06, "loss": 1.02207017, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.73974609, "step": 493, "time_per_iteration": 2.9547882080078125 }, { "auxiliary_loss_clip": 0.01408833, "auxiliary_loss_mlp": 0.01389746, "balance_loss_clip": 1.25428677, "balance_loss_mlp": 1.3501687, "epoch": 0.029700886817976852, "flos": 20605153173120.0, "grad_norm": 1.444415619017881, "language_loss": 0.98846906, "learning_rate": 3.99351603600268e-06, "loss": 1.01645494, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.39575195, "step": 494, "time_per_iteration": 2.949354887008667 }, { "auxiliary_loss_clip": 0.01412453, "auxiliary_loss_mlp": 0.01389366, "balance_loss_clip": 1.25701046, "balance_loss_mlp": 1.3465941, "epoch": 0.02976101007064482, "flos": 22247046904320.0, "grad_norm": 1.8480463043505453, "language_loss": 0.97758102, "learning_rate": 3.994818063106668e-06, "loss": 1.00559926, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.42797852, "step": 495, "time_per_iteration": 2.9766459465026855 }, { "auxiliary_loss_clip": 0.01401994, "auxiliary_loss_mlp": 0.01379683, "balance_loss_clip": 1.2488637, "balance_loss_mlp": 1.3427043, "epoch": 0.029821133323312793, "flos": 23743188576000.0, "grad_norm": 1.4243236317464374, "language_loss": 0.72786117, "learning_rate": 3.99611746250533e-06, "loss": 0.75567794, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.36987305, "step": 496, "time_per_iteration": 3.0946428775787354 }, { "auxiliary_loss_clip": 0.01406656, "auxiliary_loss_mlp": 0.01392992, "balance_loss_clip": 1.25194752, "balance_loss_mlp": 1.35229349, "epoch": 0.02988125657598076, "flos": 22429791227520.0, "grad_norm": 1.3815379439829476, "language_loss": 0.96899009, "learning_rate": 3.997414244783595e-06, "loss": 0.99698657, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.40673828, "step": 497, "time_per_iteration": 2.9681286811828613 }, { "auxiliary_loss_clip": 0.0139576, "auxiliary_loss_mlp": 0.01397834, "balance_loss_clip": 1.24123824, "balance_loss_mlp": 1.3505795, "epoch": 0.02994137982864873, "flos": 13853114622720.0, "grad_norm": 1.9164314637955884, "language_loss": 1.02092266, "learning_rate": 3.998708420462557e-06, "loss": 1.04885864, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.47265625, "step": 498, "time_per_iteration": 2.9890737533569336 }, { "auxiliary_loss_clip": 0.01391419, "auxiliary_loss_mlp": 0.01401417, "balance_loss_clip": 1.23638237, "balance_loss_mlp": 1.35637999, "epoch": 0.0300015030813167, "flos": 23917200652800.0, "grad_norm": 1.9508563847400384, "language_loss": 0.97575581, "learning_rate": 4e-06, "loss": 1.00368428, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.45043945, "step": 499, "time_per_iteration": 2.9326705932617188 }, { "auxiliary_loss_clip": 0.01382613, "auxiliary_loss_mlp": 0.01395076, "balance_loss_clip": 1.23100603, "balance_loss_mlp": 1.34834599, "epoch": 0.030061626333984667, "flos": 22026631645440.0, "grad_norm": 1.3859092865242784, "language_loss": 0.90384626, "learning_rate": 3.9999999620799e-06, "loss": 0.9316231, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.46777344, "step": 500, "time_per_iteration": 3.1282660961151123 }, { "auxiliary_loss_clip": 0.01385132, "auxiliary_loss_mlp": 0.01417048, "balance_loss_clip": 1.23113632, "balance_loss_mlp": 1.35374844, "epoch": 0.03012174958665264, "flos": 23050198932480.0, "grad_norm": 1.8813208962994148, "language_loss": 1.03797114, "learning_rate": 3.9999998483196e-06, "loss": 1.06599295, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.63354492, "step": 501, "time_per_iteration": 2.9867758750915527 }, { "auxiliary_loss_clip": 0.01386401, "auxiliary_loss_mlp": 0.01398615, "balance_loss_clip": 1.23308694, "balance_loss_mlp": 1.3543644, "epoch": 0.030181872839320608, "flos": 18962626014720.0, "grad_norm": 1.8442159602792667, "language_loss": 0.98640454, "learning_rate": 3.9999996587191065e-06, "loss": 1.01425469, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.44262695, "step": 502, "time_per_iteration": 2.873424768447876 }, { "auxiliary_loss_clip": 0.01383441, "auxiliary_loss_mlp": 0.01396099, "balance_loss_clip": 1.23239124, "balance_loss_mlp": 1.35573423, "epoch": 0.030241996091988577, "flos": 16736909639040.0, "grad_norm": 1.9187942634401132, "language_loss": 0.95452714, "learning_rate": 3.999999393278425e-06, "loss": 0.98232257, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.40380859, "step": 503, "time_per_iteration": 3.0077877044677734 }, { "auxiliary_loss_clip": 0.01372528, "auxiliary_loss_mlp": 0.01395638, "balance_loss_clip": 1.2272408, "balance_loss_mlp": 1.35920751, "epoch": 0.030302119344656545, "flos": 28632963381120.0, "grad_norm": 1.478112613598757, "language_loss": 0.97800672, "learning_rate": 3.999999051997567e-06, "loss": 1.00568843, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.36425781, "step": 504, "time_per_iteration": 4.583301067352295 }, { "auxiliary_loss_clip": 0.01380949, "auxiliary_loss_mlp": 0.0138645, "balance_loss_clip": 1.23060441, "balance_loss_mlp": 1.34892309, "epoch": 0.030362242597324514, "flos": 15677797921920.0, "grad_norm": 1.4534440519363268, "language_loss": 0.84831536, "learning_rate": 3.9999986348765425e-06, "loss": 0.87598926, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.375, "step": 505, "time_per_iteration": 5.2347893714904785 }, { "auxiliary_loss_clip": 0.0138243, "auxiliary_loss_mlp": 0.01409152, "balance_loss_clip": 1.26197577, "balance_loss_mlp": 1.38187671, "epoch": 0.030422365849992486, "flos": 72161973446400.0, "grad_norm": 0.8825622931093922, "language_loss": 0.5520798, "learning_rate": 3.999998141915371e-06, "loss": 0.57999557, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.2734375, "step": 506, "time_per_iteration": 5.04321551322937 }, { "auxiliary_loss_clip": 0.01384931, "auxiliary_loss_mlp": 0.01419956, "balance_loss_clip": 1.23507762, "balance_loss_mlp": 1.36287892, "epoch": 0.030482489102660455, "flos": 19437372149760.0, "grad_norm": 1.4770197951000967, "language_loss": 0.90798068, "learning_rate": 3.999997573114069e-06, "loss": 0.93602955, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 1.50097656, "router_z_loss_mlp": 0.57080078, "step": 507, "time_per_iteration": 2.9115123748779297 }, { "auxiliary_loss_clip": 0.0138816, "auxiliary_loss_mlp": 0.0142399, "balance_loss_clip": 1.2396543, "balance_loss_mlp": 1.38388777, "epoch": 0.030542612355328423, "flos": 20385461831040.0, "grad_norm": 1.801841636527473, "language_loss": 1.00916982, "learning_rate": 3.999996928472659e-06, "loss": 1.03729141, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 1.48535156, "router_z_loss_mlp": 0.40136719, "step": 508, "time_per_iteration": 2.9402434825897217 }, { "auxiliary_loss_clip": 0.01389368, "auxiliary_loss_mlp": 0.01437864, "balance_loss_clip": 1.23873377, "balance_loss_mlp": 1.39454341, "epoch": 0.030602735607996392, "flos": 34690473964800.0, "grad_norm": 1.6227045912075388, "language_loss": 0.79833537, "learning_rate": 3.999996207991165e-06, "loss": 0.8266077, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 1.50390625, "router_z_loss_mlp": 0.43310547, "step": 509, "time_per_iteration": 3.067511558532715 }, { "auxiliary_loss_clip": 0.01380324, "auxiliary_loss_mlp": 0.0139072, "balance_loss_clip": 1.23200727, "balance_loss_mlp": 1.35123801, "epoch": 0.03066285886066436, "flos": 23668932579840.0, "grad_norm": 1.7821230854378898, "language_loss": 0.92262346, "learning_rate": 3.999995411669614e-06, "loss": 0.95033383, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 1.48242188, "router_z_loss_mlp": 0.39477539, "step": 510, "time_per_iteration": 2.9140372276306152 }, { "auxiliary_loss_clip": 0.01382837, "auxiliary_loss_mlp": 0.01378616, "balance_loss_clip": 1.23423159, "balance_loss_mlp": 1.34130335, "epoch": 0.030722982113332332, "flos": 23013342403200.0, "grad_norm": 1.7832758260586248, "language_loss": 0.97462076, "learning_rate": 3.999994539508036e-06, "loss": 1.00223517, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.37329102, "step": 511, "time_per_iteration": 2.9507713317871094 }, { "auxiliary_loss_clip": 0.01385504, "auxiliary_loss_mlp": 0.01401197, "balance_loss_clip": 1.23218274, "balance_loss_mlp": 1.35859168, "epoch": 0.0307831053660003, "flos": 24760783549440.0, "grad_norm": 1.5482808632358973, "language_loss": 0.93070579, "learning_rate": 3.9999935915064655e-06, "loss": 0.95857286, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.42626953, "step": 512, "time_per_iteration": 2.9955101013183594 }, { "auxiliary_loss_clip": 0.01381079, "auxiliary_loss_mlp": 0.01424607, "balance_loss_clip": 1.22898185, "balance_loss_mlp": 1.38009429, "epoch": 0.03084322861866827, "flos": 26152101722880.0, "grad_norm": 1.6439685943218116, "language_loss": 0.96728939, "learning_rate": 3.9999925676649374e-06, "loss": 0.99534625, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.44555664, "step": 513, "time_per_iteration": 3.0988457202911377 }, { "auxiliary_loss_clip": 0.01381079, "auxiliary_loss_mlp": 0.01428278, "balance_loss_clip": 1.22732019, "balance_loss_mlp": 1.38819945, "epoch": 0.03090335187133624, "flos": 18780741342720.0, "grad_norm": 1.2908774511688603, "language_loss": 0.90234423, "learning_rate": 3.999991467983491e-06, "loss": 0.9304378, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.40087891, "step": 514, "time_per_iteration": 3.0078907012939453 }, { "auxiliary_loss_clip": 0.0139261, "auxiliary_loss_mlp": 0.01415255, "balance_loss_clip": 1.23924172, "balance_loss_mlp": 1.37319827, "epoch": 0.030963475124004207, "flos": 23232309828480.0, "grad_norm": 2.378660491198348, "language_loss": 0.92467397, "learning_rate": 3.999990292462167e-06, "loss": 0.95275259, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.42089844, "step": 515, "time_per_iteration": 2.931396245956421 }, { "auxiliary_loss_clip": 0.01382406, "auxiliary_loss_mlp": 0.01401265, "balance_loss_clip": 1.2282238, "balance_loss_mlp": 1.35479772, "epoch": 0.03102359837667218, "flos": 42541738387200.0, "grad_norm": 1.5105274688297254, "language_loss": 0.95988578, "learning_rate": 3.999989041101011e-06, "loss": 0.98772246, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.46459961, "step": 516, "time_per_iteration": 3.1434574127197266 }, { "auxiliary_loss_clip": 0.01381521, "auxiliary_loss_mlp": 0.01393495, "balance_loss_clip": 1.22707498, "balance_loss_mlp": 1.35317791, "epoch": 0.031083721629340148, "flos": 21186396864000.0, "grad_norm": 1.5355368633898163, "language_loss": 0.86996794, "learning_rate": 3.999987713900071e-06, "loss": 0.89771807, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.40332031, "step": 517, "time_per_iteration": 2.9121408462524414 }, { "auxiliary_loss_clip": 0.01377672, "auxiliary_loss_mlp": 0.01412103, "balance_loss_clip": 1.22401285, "balance_loss_mlp": 1.36637402, "epoch": 0.031143844882008116, "flos": 29728252955520.0, "grad_norm": 1.3408313006073282, "language_loss": 0.96561956, "learning_rate": 3.999986310859396e-06, "loss": 0.99351734, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.45703125, "step": 518, "time_per_iteration": 3.0218162536621094 }, { "auxiliary_loss_clip": 0.01386037, "auxiliary_loss_mlp": 0.01405725, "balance_loss_clip": 1.23136044, "balance_loss_mlp": 1.35971022, "epoch": 0.031203968134676085, "flos": 23122871360640.0, "grad_norm": 1.6397878325037119, "language_loss": 0.96773589, "learning_rate": 3.999984831979039e-06, "loss": 0.99565351, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.46020508, "step": 519, "time_per_iteration": 2.99403977394104 }, { "auxiliary_loss_clip": 0.01379204, "auxiliary_loss_mlp": 0.01398535, "balance_loss_clip": 1.22603059, "balance_loss_mlp": 1.35747933, "epoch": 0.03126409138734405, "flos": 20962814469120.0, "grad_norm": 1.63570805755852, "language_loss": 0.96088082, "learning_rate": 3.999983277259057e-06, "loss": 0.98865819, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.41064453, "step": 520, "time_per_iteration": 2.8942577838897705 }, { "auxiliary_loss_clip": 0.01385572, "auxiliary_loss_mlp": 0.01386406, "balance_loss_clip": 1.23213184, "balance_loss_mlp": 1.34420586, "epoch": 0.031324214640012026, "flos": 21659378451840.0, "grad_norm": 1.5367765307014523, "language_loss": 0.96037579, "learning_rate": 3.999981646699509e-06, "loss": 0.98809552, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.42163086, "step": 521, "time_per_iteration": 2.9566125869750977 }, { "auxiliary_loss_clip": 0.01388699, "auxiliary_loss_mlp": 0.01389588, "balance_loss_clip": 1.23249733, "balance_loss_mlp": 1.3469348, "epoch": 0.03138433789267999, "flos": 23451955925760.0, "grad_norm": 1.5779766309577878, "language_loss": 0.79782134, "learning_rate": 3.999979940300456e-06, "loss": 0.8256042, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.42700195, "step": 522, "time_per_iteration": 2.943603754043579 }, { "auxiliary_loss_clip": 0.01395642, "auxiliary_loss_mlp": 0.01390611, "balance_loss_clip": 1.23521614, "balance_loss_mlp": 1.34986567, "epoch": 0.03144446114534796, "flos": 18989573932800.0, "grad_norm": 2.0136101770400527, "language_loss": 1.00219035, "learning_rate": 3.999978158061963e-06, "loss": 1.0300529, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.40673828, "step": 523, "time_per_iteration": 2.9485507011413574 }, { "auxiliary_loss_clip": 0.01397135, "auxiliary_loss_mlp": 0.01410399, "balance_loss_clip": 1.23499906, "balance_loss_mlp": 1.36416936, "epoch": 0.031504584398015935, "flos": 22647853756800.0, "grad_norm": 1.674414843428846, "language_loss": 1.03941453, "learning_rate": 3.999976299984099e-06, "loss": 1.06748986, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.4621582, "step": 524, "time_per_iteration": 3.0019829273223877 }, { "auxiliary_loss_clip": 0.013954, "auxiliary_loss_mlp": 0.0140097, "balance_loss_clip": 1.23676395, "balance_loss_mlp": 1.36408627, "epoch": 0.0315647076506839, "flos": 25306844768640.0, "grad_norm": 1.8008962303306546, "language_loss": 0.94083709, "learning_rate": 3.999974366066933e-06, "loss": 0.96880078, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.36889648, "step": 525, "time_per_iteration": 3.0052995681762695 }, { "auxiliary_loss_clip": 0.01390021, "auxiliary_loss_mlp": 0.01404591, "balance_loss_clip": 1.23248565, "balance_loss_mlp": 1.35461831, "epoch": 0.03162483090335187, "flos": 16991647718400.0, "grad_norm": 1.661271286255701, "language_loss": 0.90558642, "learning_rate": 3.999972356310538e-06, "loss": 0.9335326, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.49975586, "step": 526, "time_per_iteration": 2.934609889984131 }, { "auxiliary_loss_clip": 0.0138842, "auxiliary_loss_mlp": 0.01384785, "balance_loss_clip": 1.23024106, "balance_loss_mlp": 1.3406775, "epoch": 0.03168495415601984, "flos": 18743839568640.0, "grad_norm": 1.601443643064492, "language_loss": 0.95155841, "learning_rate": 3.999970270714991e-06, "loss": 0.97929043, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.44116211, "step": 527, "time_per_iteration": 2.9149956703186035 }, { "auxiliary_loss_clip": 0.01392083, "auxiliary_loss_mlp": 0.0138915, "balance_loss_clip": 1.23354363, "balance_loss_mlp": 1.34432757, "epoch": 0.03174507740868781, "flos": 21224701226880.0, "grad_norm": 1.6999017800949616, "language_loss": 1.08205235, "learning_rate": 3.999968109280371e-06, "loss": 1.10986471, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.44824219, "step": 528, "time_per_iteration": 2.883425235748291 }, { "auxiliary_loss_clip": 0.01393634, "auxiliary_loss_mlp": 0.01389864, "balance_loss_clip": 1.23388791, "balance_loss_mlp": 1.34980965, "epoch": 0.03180520066135578, "flos": 24797685323520.0, "grad_norm": 1.6407552048578695, "language_loss": 0.93865645, "learning_rate": 3.99996587200676e-06, "loss": 0.96649146, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.40039062, "step": 529, "time_per_iteration": 3.064373254776001 }, { "auxiliary_loss_clip": 0.01396485, "auxiliary_loss_mlp": 0.01394397, "balance_loss_clip": 1.23704076, "balance_loss_mlp": 1.34957457, "epoch": 0.03186532391402375, "flos": 24875244190080.0, "grad_norm": 1.5404479783076748, "language_loss": 1.00049734, "learning_rate": 3.999963558894243e-06, "loss": 1.02840614, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.44824219, "step": 530, "time_per_iteration": 2.948352575302124 }, { "auxiliary_loss_clip": 0.01398473, "auxiliary_loss_mlp": 0.01393585, "balance_loss_clip": 1.23581767, "balance_loss_mlp": 1.34988248, "epoch": 0.03192544716669172, "flos": 21224927450880.0, "grad_norm": 1.7580558168853224, "language_loss": 0.89432806, "learning_rate": 3.999961169942907e-06, "loss": 0.92224866, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.43676758, "step": 531, "time_per_iteration": 2.9925355911254883 }, { "auxiliary_loss_clip": 0.01399831, "auxiliary_loss_mlp": 0.01389245, "balance_loss_clip": 1.23727727, "balance_loss_mlp": 1.34988201, "epoch": 0.03198557041935969, "flos": 24364003484160.0, "grad_norm": 1.5905847935105442, "language_loss": 1.01604247, "learning_rate": 3.999958705152843e-06, "loss": 1.04393327, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.39355469, "step": 532, "time_per_iteration": 3.0931036472320557 }, { "auxiliary_loss_clip": 0.01360488, "auxiliary_loss_mlp": 0.01363705, "balance_loss_clip": 1.23154283, "balance_loss_mlp": 1.34100759, "epoch": 0.032045693672027656, "flos": 61858197872640.0, "grad_norm": 0.7286025446375463, "language_loss": 0.58068454, "learning_rate": 3.9999561645241445e-06, "loss": 0.60792649, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 1.2890625, "router_z_loss_mlp": 0.2265625, "step": 533, "time_per_iteration": 3.486196756362915 }, { "auxiliary_loss_clip": 0.01402596, "auxiliary_loss_mlp": 0.01390816, "balance_loss_clip": 1.23838091, "balance_loss_mlp": 1.35119104, "epoch": 0.03210581692469563, "flos": 28412502877440.0, "grad_norm": 1.4797032329372963, "language_loss": 0.96599734, "learning_rate": 3.999953548056907e-06, "loss": 0.99393147, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.39599609, "step": 534, "time_per_iteration": 2.998612403869629 }, { "auxiliary_loss_clip": 0.01403492, "auxiliary_loss_mlp": 0.01387605, "balance_loss_clip": 1.2360146, "balance_loss_mlp": 1.34597707, "epoch": 0.03216594017736359, "flos": 24728496744960.0, "grad_norm": 1.7414919972863243, "language_loss": 0.90999508, "learning_rate": 3.999950855751232e-06, "loss": 0.93790615, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.41625977, "step": 535, "time_per_iteration": 2.98520565032959 }, { "auxiliary_loss_clip": 0.01404672, "auxiliary_loss_mlp": 0.01391438, "balance_loss_clip": 1.23800397, "balance_loss_mlp": 1.34849834, "epoch": 0.032226063430031565, "flos": 31187854811520.0, "grad_norm": 1.9769038313455278, "language_loss": 0.93482828, "learning_rate": 3.999948087607219e-06, "loss": 0.96278942, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.42944336, "step": 536, "time_per_iteration": 3.1017744541168213 }, { "auxiliary_loss_clip": 0.01414027, "auxiliary_loss_mlp": 0.01395451, "balance_loss_clip": 1.24216807, "balance_loss_mlp": 1.35069966, "epoch": 0.03228618668269954, "flos": 32211195874560.0, "grad_norm": 1.4828893014279971, "language_loss": 0.83592248, "learning_rate": 3.999945243624975e-06, "loss": 0.86401731, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.44750977, "step": 537, "time_per_iteration": 3.110311269760132 }, { "auxiliary_loss_clip": 0.01409134, "auxiliary_loss_mlp": 0.01385102, "balance_loss_clip": 1.2376653, "balance_loss_mlp": 1.34295011, "epoch": 0.0323463099353675, "flos": 22679280910080.0, "grad_norm": 1.6555354543230907, "language_loss": 0.94436759, "learning_rate": 3.999942323804607e-06, "loss": 0.97230995, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.42138672, "step": 538, "time_per_iteration": 3.068390130996704 }, { "auxiliary_loss_clip": 0.01405225, "auxiliary_loss_mlp": 0.01387194, "balance_loss_clip": 1.23500729, "balance_loss_mlp": 1.34518492, "epoch": 0.032406433188035474, "flos": 26916089736960.0, "grad_norm": 1.6173157136304888, "language_loss": 0.88633031, "learning_rate": 3.999939328146225e-06, "loss": 0.91425443, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.41992188, "step": 539, "time_per_iteration": 4.450425148010254 }, { "auxiliary_loss_clip": 0.01408529, "auxiliary_loss_mlp": 0.01394137, "balance_loss_clip": 1.23601818, "balance_loss_mlp": 1.34392619, "epoch": 0.03246655644070344, "flos": 31516441683840.0, "grad_norm": 1.6390056716875077, "language_loss": 0.88359332, "learning_rate": 3.999936256649943e-06, "loss": 0.9116199, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.50170898, "step": 540, "time_per_iteration": 4.552294731140137 }, { "auxiliary_loss_clip": 0.01417582, "auxiliary_loss_mlp": 0.01396456, "balance_loss_clip": 1.24124503, "balance_loss_mlp": 1.3509419, "epoch": 0.03252667969337141, "flos": 23227830593280.0, "grad_norm": 1.7019306959655565, "language_loss": 0.9771533, "learning_rate": 3.999933109315878e-06, "loss": 1.00529373, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.45532227, "step": 541, "time_per_iteration": 5.889995813369751 }, { "auxiliary_loss_clip": 0.01415401, "auxiliary_loss_mlp": 0.01391975, "balance_loss_clip": 1.23799074, "balance_loss_mlp": 1.3518014, "epoch": 0.032586802946039384, "flos": 14765750363520.0, "grad_norm": 1.7455096562493742, "language_loss": 0.9773379, "learning_rate": 3.9999298861441496e-06, "loss": 1.00541162, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.40185547, "step": 542, "time_per_iteration": 2.981987476348877 }, { "auxiliary_loss_clip": 0.01419446, "auxiliary_loss_mlp": 0.01406006, "balance_loss_clip": 1.23915958, "balance_loss_mlp": 1.36175561, "epoch": 0.03264692619870735, "flos": 24291557280000.0, "grad_norm": 1.5078472738815625, "language_loss": 0.81774366, "learning_rate": 3.999926587134879e-06, "loss": 0.84599829, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.44262695, "step": 543, "time_per_iteration": 2.9540932178497314 }, { "auxiliary_loss_clip": 0.01430083, "auxiliary_loss_mlp": 0.01400923, "balance_loss_clip": 1.24284554, "balance_loss_mlp": 1.35643399, "epoch": 0.03270704945137532, "flos": 22903406242560.0, "grad_norm": 1.9370127086916347, "language_loss": 1.05949759, "learning_rate": 3.999923212288192e-06, "loss": 1.08780766, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.44458008, "step": 544, "time_per_iteration": 3.057410478591919 }, { "auxiliary_loss_clip": 0.01425491, "auxiliary_loss_mlp": 0.01409114, "balance_loss_clip": 1.24187613, "balance_loss_mlp": 1.36028552, "epoch": 0.032767172704043286, "flos": 18050126008320.0, "grad_norm": 1.9524213903566685, "language_loss": 0.84328485, "learning_rate": 3.999919761604216e-06, "loss": 0.87163097, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.48803711, "step": 545, "time_per_iteration": 2.934523344039917 }, { "auxiliary_loss_clip": 0.01432438, "auxiliary_loss_mlp": 0.01395592, "balance_loss_clip": 1.24188435, "balance_loss_mlp": 1.35062671, "epoch": 0.03282729595671126, "flos": 22539003471360.0, "grad_norm": 1.8226463613501447, "language_loss": 1.03038907, "learning_rate": 3.999916235083083e-06, "loss": 1.05866933, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.44946289, "step": 546, "time_per_iteration": 2.908083438873291 }, { "auxiliary_loss_clip": 0.01446037, "auxiliary_loss_mlp": 0.01393571, "balance_loss_clip": 1.25105596, "balance_loss_mlp": 1.34991646, "epoch": 0.03288741920937923, "flos": 20419829896320.0, "grad_norm": 1.8617514936145956, "language_loss": 0.99947137, "learning_rate": 3.999912632724925e-06, "loss": 1.02786732, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.4362793, "step": 547, "time_per_iteration": 2.9383485317230225 }, { "auxiliary_loss_clip": 0.01439037, "auxiliary_loss_mlp": 0.0140349, "balance_loss_clip": 1.24744642, "balance_loss_mlp": 1.34700871, "epoch": 0.032947542462047195, "flos": 20787942741120.0, "grad_norm": 1.462629543571229, "language_loss": 0.95405847, "learning_rate": 3.999908954529881e-06, "loss": 0.9824838, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.56445312, "step": 548, "time_per_iteration": 2.965420722961426 }, { "auxiliary_loss_clip": 0.01454649, "auxiliary_loss_mlp": 0.01399372, "balance_loss_clip": 1.25546622, "balance_loss_mlp": 1.35009098, "epoch": 0.03300766571471517, "flos": 19910715696000.0, "grad_norm": 1.913275016463635, "language_loss": 0.87104952, "learning_rate": 3.999905200498087e-06, "loss": 0.89958966, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.49316406, "step": 549, "time_per_iteration": 2.972935438156128 }, { "auxiliary_loss_clip": 0.01450822, "auxiliary_loss_mlp": 0.01399339, "balance_loss_clip": 1.25678253, "balance_loss_mlp": 1.34898472, "epoch": 0.03306778896738313, "flos": 17976774908160.0, "grad_norm": 1.5685326313515922, "language_loss": 0.94064575, "learning_rate": 3.999901370629689e-06, "loss": 0.96914738, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.50366211, "step": 550, "time_per_iteration": 2.9491541385650635 }, { "auxiliary_loss_clip": 0.01469297, "auxiliary_loss_mlp": 0.01401381, "balance_loss_clip": 1.27222669, "balance_loss_mlp": 1.34778476, "epoch": 0.033127912220051105, "flos": 21663495728640.0, "grad_norm": 1.4016472453168107, "language_loss": 0.86652619, "learning_rate": 3.99989746492483e-06, "loss": 0.89523292, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.53588867, "step": 551, "time_per_iteration": 2.9162535667419434 }, { "auxiliary_loss_clip": 0.01474119, "auxiliary_loss_mlp": 0.01391931, "balance_loss_clip": 1.27341855, "balance_loss_mlp": 1.34729874, "epoch": 0.03318803547271908, "flos": 30200012933760.0, "grad_norm": 1.9473256714319205, "language_loss": 1.06166327, "learning_rate": 3.999893483383658e-06, "loss": 1.09032369, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.4465332, "step": 552, "time_per_iteration": 3.022292137145996 }, { "auxiliary_loss_clip": 0.01476424, "auxiliary_loss_mlp": 0.01391541, "balance_loss_clip": 1.27649856, "balance_loss_mlp": 1.34228325, "epoch": 0.03324815872538704, "flos": 20385733299840.0, "grad_norm": 1.9504226313854764, "language_loss": 1.04576027, "learning_rate": 3.999889426006326e-06, "loss": 1.07444, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.49291992, "step": 553, "time_per_iteration": 2.897512912750244 }, { "auxiliary_loss_clip": 0.01491939, "auxiliary_loss_mlp": 0.01388291, "balance_loss_clip": 1.28249943, "balance_loss_mlp": 1.34239566, "epoch": 0.033308281978055014, "flos": 24504190433280.0, "grad_norm": 1.745131349360767, "language_loss": 0.8820616, "learning_rate": 3.999885292792986e-06, "loss": 0.91086394, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.45898438, "step": 554, "time_per_iteration": 3.0998427867889404 }, { "auxiliary_loss_clip": 0.01493839, "auxiliary_loss_mlp": 0.01399226, "balance_loss_clip": 1.2877872, "balance_loss_mlp": 1.3488009, "epoch": 0.03336840523072298, "flos": 23409760510080.0, "grad_norm": 1.708469788618555, "language_loss": 0.90916187, "learning_rate": 3.999881083743795e-06, "loss": 0.93809253, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.50415039, "step": 555, "time_per_iteration": 2.939497232437134 }, { "auxiliary_loss_clip": 0.01525754, "auxiliary_loss_mlp": 0.01395023, "balance_loss_clip": 1.30556285, "balance_loss_mlp": 1.34731567, "epoch": 0.03342852848339095, "flos": 30561610527360.0, "grad_norm": 1.9202911351776153, "language_loss": 1.02395892, "learning_rate": 3.999876798858914e-06, "loss": 1.05316663, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.47705078, "step": 556, "time_per_iteration": 3.023836612701416 }, { "auxiliary_loss_clip": 0.01518416, "auxiliary_loss_mlp": 0.01397441, "balance_loss_clip": 1.3020649, "balance_loss_mlp": 1.35016227, "epoch": 0.03348865173605892, "flos": 22903587221760.0, "grad_norm": 1.8669638468318388, "language_loss": 0.94562364, "learning_rate": 3.999872438138503e-06, "loss": 0.97478217, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.47314453, "step": 557, "time_per_iteration": 2.9554860591888428 }, { "auxiliary_loss_clip": 0.01553419, "auxiliary_loss_mlp": 0.01404297, "balance_loss_clip": 1.3264966, "balance_loss_mlp": 1.34936523, "epoch": 0.03354877498872689, "flos": 17684999320320.0, "grad_norm": 2.080247149894823, "language_loss": 1.10047805, "learning_rate": 3.999868001582729e-06, "loss": 1.13005519, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.54907227, "step": 558, "time_per_iteration": 2.8692195415496826 }, { "auxiliary_loss_clip": 0.0152942, "auxiliary_loss_mlp": 0.01397831, "balance_loss_clip": 1.30539191, "balance_loss_mlp": 1.34938407, "epoch": 0.03360889824139486, "flos": 21662998035840.0, "grad_norm": 1.844863204598611, "language_loss": 0.91878843, "learning_rate": 3.99986348919176e-06, "loss": 0.94806093, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.48510742, "step": 559, "time_per_iteration": 2.9709036350250244 }, { "auxiliary_loss_clip": 0.01548703, "auxiliary_loss_mlp": 0.01394481, "balance_loss_clip": 1.31528199, "balance_loss_mlp": 1.34245849, "epoch": 0.033669021494062826, "flos": 21804949532160.0, "grad_norm": 1.5933235318591445, "language_loss": 0.93449032, "learning_rate": 3.9998589009657675e-06, "loss": 0.96392214, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.52001953, "step": 560, "time_per_iteration": 2.90649151802063 }, { "auxiliary_loss_clip": 0.01542154, "auxiliary_loss_mlp": 0.01392166, "balance_loss_clip": 1.31243253, "balance_loss_mlp": 1.34658027, "epoch": 0.0337291447467308, "flos": 21874816782720.0, "grad_norm": 1.95546844922186, "language_loss": 0.90095866, "learning_rate": 3.999854236904925e-06, "loss": 0.9303019, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.45581055, "step": 561, "time_per_iteration": 2.9607489109039307 }, { "auxiliary_loss_clip": 0.0157492, "auxiliary_loss_mlp": 0.013993, "balance_loss_clip": 1.32602048, "balance_loss_mlp": 1.35545468, "epoch": 0.03378926799939877, "flos": 24257279704320.0, "grad_norm": 1.5364889026421389, "language_loss": 0.87829781, "learning_rate": 3.999849497009409e-06, "loss": 0.90804005, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.4387207, "step": 562, "time_per_iteration": 3.085115432739258 }, { "auxiliary_loss_clip": 0.01574845, "auxiliary_loss_mlp": 0.01415154, "balance_loss_clip": 1.32363439, "balance_loss_mlp": 1.35640764, "epoch": 0.033849391252066735, "flos": 16516313400960.0, "grad_norm": 1.685787422507711, "language_loss": 0.92255175, "learning_rate": 3.999844681279401e-06, "loss": 0.95245177, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.58789062, "step": 563, "time_per_iteration": 2.916097640991211 }, { "auxiliary_loss_clip": 0.01566128, "auxiliary_loss_mlp": 0.01397724, "balance_loss_clip": 1.31944573, "balance_loss_mlp": 1.34949231, "epoch": 0.03390951450473471, "flos": 15677752677120.0, "grad_norm": 1.8652635008686627, "language_loss": 0.99611664, "learning_rate": 3.99983978971508e-06, "loss": 1.02575517, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.48266602, "step": 564, "time_per_iteration": 2.9636433124542236 }, { "auxiliary_loss_clip": 0.01554717, "auxiliary_loss_mlp": 0.01413911, "balance_loss_clip": 1.31344008, "balance_loss_mlp": 1.34708214, "epoch": 0.03396963775740267, "flos": 22685117489280.0, "grad_norm": 1.9772971195326738, "language_loss": 1.03355336, "learning_rate": 3.999834822316635e-06, "loss": 1.06323981, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.66821289, "step": 565, "time_per_iteration": 2.902076005935669 }, { "auxiliary_loss_clip": 0.01426553, "auxiliary_loss_mlp": 0.01361786, "balance_loss_clip": 1.27183127, "balance_loss_mlp": 1.33737159, "epoch": 0.034029761010070644, "flos": 64427115191040.0, "grad_norm": 0.9737345547455311, "language_loss": 0.550174, "learning_rate": 3.9998297790842535e-06, "loss": 0.57805741, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.24414062, "step": 566, "time_per_iteration": 3.475527048110962 }, { "auxiliary_loss_clip": 0.01544389, "auxiliary_loss_mlp": 0.01393325, "balance_loss_clip": 1.30670536, "balance_loss_mlp": 1.3473103, "epoch": 0.034089884262738616, "flos": 25013666592000.0, "grad_norm": 1.977168972354823, "language_loss": 0.89368403, "learning_rate": 3.999824660018126e-06, "loss": 0.92306119, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.46044922, "step": 567, "time_per_iteration": 3.06400728225708 }, { "auxiliary_loss_clip": 0.0152702, "auxiliary_loss_mlp": 0.01411977, "balance_loss_clip": 1.30008912, "balance_loss_mlp": 1.36541343, "epoch": 0.03415000751540658, "flos": 28450807240320.0, "grad_norm": 1.8598290678917226, "language_loss": 0.88551795, "learning_rate": 3.999819465118447e-06, "loss": 0.91490793, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.46606445, "step": 568, "time_per_iteration": 3.1752214431762695 }, { "auxiliary_loss_clip": 0.0154429, "auxiliary_loss_mlp": 0.01419374, "balance_loss_clip": 1.31187963, "balance_loss_mlp": 1.37157059, "epoch": 0.034210130768074554, "flos": 21478217696640.0, "grad_norm": 1.5523308431901075, "language_loss": 0.94625556, "learning_rate": 3.999814194385413e-06, "loss": 0.97589225, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.47827148, "step": 569, "time_per_iteration": 3.0447850227355957 }, { "auxiliary_loss_clip": 0.01553742, "auxiliary_loss_mlp": 0.01414346, "balance_loss_clip": 1.31067061, "balance_loss_mlp": 1.36244202, "epoch": 0.03427025402074252, "flos": 18706711570560.0, "grad_norm": 1.5836588248020789, "language_loss": 1.01278734, "learning_rate": 3.9998088478192255e-06, "loss": 1.04246819, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.51953125, "step": 570, "time_per_iteration": 2.9158406257629395 }, { "auxiliary_loss_clip": 0.01547575, "auxiliary_loss_mlp": 0.01411431, "balance_loss_clip": 1.31243038, "balance_loss_mlp": 1.36091006, "epoch": 0.03433037727341049, "flos": 20859710273280.0, "grad_norm": 1.7575915748496391, "language_loss": 0.90065736, "learning_rate": 3.9998034254200846e-06, "loss": 0.93024743, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.50537109, "step": 571, "time_per_iteration": 2.9750373363494873 }, { "auxiliary_loss_clip": 0.01531726, "auxiliary_loss_mlp": 0.01404086, "balance_loss_clip": 1.30553555, "balance_loss_mlp": 1.35287333, "epoch": 0.03439050052607846, "flos": 25421079185280.0, "grad_norm": 1.8091068039489129, "language_loss": 0.91639858, "learning_rate": 3.999797927188199e-06, "loss": 0.94575667, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.51196289, "step": 572, "time_per_iteration": 3.0706939697265625 }, { "auxiliary_loss_clip": 0.01544496, "auxiliary_loss_mlp": 0.01387988, "balance_loss_clip": 1.31299996, "balance_loss_mlp": 1.34585905, "epoch": 0.03445062377874643, "flos": 17649273911040.0, "grad_norm": 1.6600077499875048, "language_loss": 0.92382163, "learning_rate": 3.999792353123774e-06, "loss": 0.95314646, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.42138672, "step": 573, "time_per_iteration": 4.458945035934448 }, { "auxiliary_loss_clip": 0.01552719, "auxiliary_loss_mlp": 0.01399822, "balance_loss_clip": 1.3191011, "balance_loss_mlp": 1.35199523, "epoch": 0.0345107470314144, "flos": 16773404209920.0, "grad_norm": 1.8508423895187767, "language_loss": 0.91364622, "learning_rate": 3.999786703227023e-06, "loss": 0.94317156, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.47851562, "step": 574, "time_per_iteration": 2.9615890979766846 }, { "auxiliary_loss_clip": 0.01561784, "auxiliary_loss_mlp": 0.01406269, "balance_loss_clip": 1.32681191, "balance_loss_mlp": 1.36120772, "epoch": 0.03457087028408237, "flos": 14692715976960.0, "grad_norm": 1.95884858121731, "language_loss": 0.94009417, "learning_rate": 3.9997809774981606e-06, "loss": 0.96977472, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.45043945, "step": 575, "time_per_iteration": 5.783670663833618 }, { "auxiliary_loss_clip": 0.0156227, "auxiliary_loss_mlp": 0.01412789, "balance_loss_clip": 1.328969, "balance_loss_mlp": 1.36531985, "epoch": 0.03463099353675034, "flos": 20020923325440.0, "grad_norm": 1.8911038298399365, "language_loss": 0.89538419, "learning_rate": 3.9997751759374025e-06, "loss": 0.92513484, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.47460938, "step": 576, "time_per_iteration": 4.725090026855469 }, { "auxiliary_loss_clip": 0.01566596, "auxiliary_loss_mlp": 0.01415614, "balance_loss_clip": 1.33039618, "balance_loss_mlp": 1.37067223, "epoch": 0.03469111678941831, "flos": 25312002675840.0, "grad_norm": 1.8773244416313495, "language_loss": 0.92319477, "learning_rate": 3.99976929854497e-06, "loss": 0.95301682, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.44946289, "step": 577, "time_per_iteration": 3.0688271522521973 }, { "auxiliary_loss_clip": 0.0159292, "auxiliary_loss_mlp": 0.01412019, "balance_loss_clip": 1.34544909, "balance_loss_mlp": 1.36533618, "epoch": 0.034751240042086275, "flos": 23270116498560.0, "grad_norm": 1.7727618356935604, "language_loss": 0.78086531, "learning_rate": 3.9997633453210845e-06, "loss": 0.81091475, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.46704102, "step": 578, "time_per_iteration": 2.976106643676758 }, { "auxiliary_loss_clip": 0.01620519, "auxiliary_loss_mlp": 0.01419709, "balance_loss_clip": 1.35702312, "balance_loss_mlp": 1.36635125, "epoch": 0.03481136329475425, "flos": 23779818881280.0, "grad_norm": 1.63921187455204, "language_loss": 0.83574361, "learning_rate": 3.999757316265973e-06, "loss": 0.86614585, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.53369141, "step": 579, "time_per_iteration": 2.9703712463378906 }, { "auxiliary_loss_clip": 0.01603844, "auxiliary_loss_mlp": 0.01404163, "balance_loss_clip": 1.34932995, "balance_loss_mlp": 1.35512018, "epoch": 0.03487148654742222, "flos": 20167263567360.0, "grad_norm": 1.742623779028622, "language_loss": 0.93522108, "learning_rate": 3.999751211379863e-06, "loss": 0.96530116, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.49072266, "step": 580, "time_per_iteration": 2.944106340408325 }, { "auxiliary_loss_clip": 0.01630414, "auxiliary_loss_mlp": 0.01403878, "balance_loss_clip": 1.36538768, "balance_loss_mlp": 1.35750556, "epoch": 0.034931609800090184, "flos": 15678114635520.0, "grad_norm": 1.911331932963442, "language_loss": 0.93035293, "learning_rate": 3.999745030662987e-06, "loss": 0.96069586, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.46386719, "step": 581, "time_per_iteration": 3.028916835784912 }, { "auxiliary_loss_clip": 0.01657324, "auxiliary_loss_mlp": 0.01406587, "balance_loss_clip": 1.38635778, "balance_loss_mlp": 1.35499287, "epoch": 0.034991733052758156, "flos": 16370968544640.0, "grad_norm": 1.6989088379021084, "language_loss": 0.84250855, "learning_rate": 3.99973877411558e-06, "loss": 0.87314767, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.51635742, "step": 582, "time_per_iteration": 3.1410257816314697 }, { "auxiliary_loss_clip": 0.01646788, "auxiliary_loss_mlp": 0.01407181, "balance_loss_clip": 1.37820065, "balance_loss_mlp": 1.35289311, "epoch": 0.03505185630542612, "flos": 19395810161280.0, "grad_norm": 1.7233054618047592, "language_loss": 0.93704462, "learning_rate": 3.999732441737877e-06, "loss": 0.96758431, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.54272461, "step": 583, "time_per_iteration": 2.9909024238586426 }, { "auxiliary_loss_clip": 0.01662865, "auxiliary_loss_mlp": 0.01406104, "balance_loss_clip": 1.38821793, "balance_loss_mlp": 1.34740579, "epoch": 0.03511197955809409, "flos": 21333551512320.0, "grad_norm": 2.222458601761601, "language_loss": 0.91606987, "learning_rate": 3.99972603353012e-06, "loss": 0.94675964, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.5871582, "step": 584, "time_per_iteration": 2.9337964057922363 }, { "auxiliary_loss_clip": 0.01673411, "auxiliary_loss_mlp": 0.01400228, "balance_loss_clip": 1.39251792, "balance_loss_mlp": 1.34596384, "epoch": 0.035172102810762065, "flos": 14144573496960.0, "grad_norm": 2.4465801816659063, "language_loss": 1.06761026, "learning_rate": 3.999719549492551e-06, "loss": 1.09834671, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.54248047, "step": 585, "time_per_iteration": 2.921839952468872 }, { "auxiliary_loss_clip": 0.01683938, "auxiliary_loss_mlp": 0.01408947, "balance_loss_clip": 1.39824033, "balance_loss_mlp": 1.34638631, "epoch": 0.03523222606343003, "flos": 20304962052480.0, "grad_norm": 2.1229876399611443, "language_loss": 0.96464878, "learning_rate": 3.9997129896254165e-06, "loss": 0.99557763, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.62597656, "step": 586, "time_per_iteration": 2.8853392601013184 }, { "auxiliary_loss_clip": 0.01720813, "auxiliary_loss_mlp": 0.01407195, "balance_loss_clip": 1.41742241, "balance_loss_mlp": 1.34930742, "epoch": 0.035292349316098, "flos": 20385461831040.0, "grad_norm": 1.951628450169371, "language_loss": 0.86362016, "learning_rate": 3.999706353928965e-06, "loss": 0.89490026, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.57885742, "step": 587, "time_per_iteration": 2.9347589015960693 }, { "auxiliary_loss_clip": 0.01743331, "auxiliary_loss_mlp": 0.01408699, "balance_loss_clip": 1.4238832, "balance_loss_mlp": 1.34649539, "epoch": 0.03535247256876597, "flos": 21478398675840.0, "grad_norm": 1.5591245626304466, "language_loss": 0.86218131, "learning_rate": 3.999699642403449e-06, "loss": 0.89370161, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.62231445, "step": 588, "time_per_iteration": 2.9129388332366943 }, { "auxiliary_loss_clip": 0.01756896, "auxiliary_loss_mlp": 0.01414689, "balance_loss_clip": 1.43345726, "balance_loss_mlp": 1.3465488, "epoch": 0.03541259582143394, "flos": 23633523884160.0, "grad_norm": 1.848168934006035, "language_loss": 1.05603802, "learning_rate": 3.99969285504912e-06, "loss": 1.08775389, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.68164062, "step": 589, "time_per_iteration": 3.0794179439544678 }, { "auxiliary_loss_clip": 0.01793693, "auxiliary_loss_mlp": 0.01420307, "balance_loss_clip": 1.4631784, "balance_loss_mlp": 1.35908079, "epoch": 0.03547271907410191, "flos": 33738628965120.0, "grad_norm": 2.287648887012532, "language_loss": 0.91153169, "learning_rate": 3.99968599186624e-06, "loss": 0.9436717, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.61279297, "step": 590, "time_per_iteration": 3.0532338619232178 }, { "auxiliary_loss_clip": 0.01821535, "auxiliary_loss_mlp": 0.01429073, "balance_loss_clip": 1.49155951, "balance_loss_mlp": 1.36021757, "epoch": 0.03553284232676988, "flos": 21152707470720.0, "grad_norm": 1.9762038539876163, "language_loss": 0.94691497, "learning_rate": 3.999679052855065e-06, "loss": 0.97942102, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.68847656, "step": 591, "time_per_iteration": 2.9257352352142334 }, { "auxiliary_loss_clip": 0.01862717, "auxiliary_loss_mlp": 0.01451327, "balance_loss_clip": 1.51514554, "balance_loss_mlp": 1.37427056, "epoch": 0.03559296557943785, "flos": 20055924817920.0, "grad_norm": 2.1810165776733474, "language_loss": 0.90176773, "learning_rate": 3.999672038015861e-06, "loss": 0.93490815, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 3.47460938, "router_z_loss_mlp": 0.77099609, "step": 592, "time_per_iteration": 2.947638988494873 }, { "auxiliary_loss_clip": 0.01740423, "auxiliary_loss_mlp": 0.0148205, "balance_loss_clip": 1.48756504, "balance_loss_mlp": 1.41605544, "epoch": 0.035653088832105814, "flos": 60365856764160.0, "grad_norm": 0.873922115681388, "language_loss": 0.59890729, "learning_rate": 3.999664947348893e-06, "loss": 0.63113207, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.66015625, "step": 593, "time_per_iteration": 3.5285894870758057 }, { "auxiliary_loss_clip": 0.01888832, "auxiliary_loss_mlp": 0.01433082, "balance_loss_clip": 1.54534328, "balance_loss_mlp": 1.35206783, "epoch": 0.035713212084773786, "flos": 20120815140480.0, "grad_norm": 1.9080466413344728, "language_loss": 0.91681135, "learning_rate": 3.999657780854429e-06, "loss": 0.95003045, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 3.43164062, "router_z_loss_mlp": 0.80957031, "step": 594, "time_per_iteration": 2.970724582672119 }, { "auxiliary_loss_clip": 0.01893908, "auxiliary_loss_mlp": 0.01432502, "balance_loss_clip": 1.54917574, "balance_loss_mlp": 1.35105872, "epoch": 0.03577333533744176, "flos": 26296767907200.0, "grad_norm": 1.837331053138254, "language_loss": 0.90361065, "learning_rate": 3.999650538532742e-06, "loss": 0.93687475, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.81494141, "step": 595, "time_per_iteration": 3.101132869720459 }, { "auxiliary_loss_clip": 0.01892363, "auxiliary_loss_mlp": 0.01432611, "balance_loss_clip": 1.55467236, "balance_loss_mlp": 1.35035682, "epoch": 0.035833458590109724, "flos": 10896963891840.0, "grad_norm": 2.6076380909210455, "language_loss": 1.06050766, "learning_rate": 3.999643220384106e-06, "loss": 1.09375751, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 3.37304688, "router_z_loss_mlp": 0.82226562, "step": 596, "time_per_iteration": 3.223883867263794 }, { "auxiliary_loss_clip": 0.01901068, "auxiliary_loss_mlp": 0.01447962, "balance_loss_clip": 1.5656594, "balance_loss_mlp": 1.35741055, "epoch": 0.035893581842777696, "flos": 22100163724800.0, "grad_norm": 2.7167327480983103, "language_loss": 0.89809322, "learning_rate": 3.999635826408799e-06, "loss": 0.93158352, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 3.35546875, "router_z_loss_mlp": 0.90429688, "step": 597, "time_per_iteration": 3.0605852603912354 }, { "auxiliary_loss_clip": 0.01884526, "auxiliary_loss_mlp": 0.01437787, "balance_loss_clip": 1.55691886, "balance_loss_mlp": 1.36197007, "epoch": 0.03595370509544566, "flos": 23048434385280.0, "grad_norm": 1.5508005042904893, "language_loss": 0.84828365, "learning_rate": 3.999628356607101e-06, "loss": 0.8815068, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.7578125, "step": 598, "time_per_iteration": 3.0795273780822754 }, { "auxiliary_loss_clip": 0.01906464, "auxiliary_loss_mlp": 0.01420613, "balance_loss_clip": 1.57834888, "balance_loss_mlp": 1.35600185, "epoch": 0.03601382834811363, "flos": 20787626027520.0, "grad_norm": 2.2941943231481536, "language_loss": 0.85047519, "learning_rate": 3.999620810979295e-06, "loss": 0.88374597, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.64599609, "step": 599, "time_per_iteration": 3.1124086380004883 }, { "auxiliary_loss_clip": 0.01947726, "auxiliary_loss_mlp": 0.0144355, "balance_loss_clip": 1.59417224, "balance_loss_mlp": 1.36515772, "epoch": 0.036073951600781605, "flos": 23962246490880.0, "grad_norm": 2.3527894683130315, "language_loss": 0.94584858, "learning_rate": 3.999613189525668e-06, "loss": 0.97976142, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.78320312, "step": 600, "time_per_iteration": 3.151994228363037 }, { "auxiliary_loss_clip": 0.01930073, "auxiliary_loss_mlp": 0.0144983, "balance_loss_clip": 1.5954659, "balance_loss_mlp": 1.37463236, "epoch": 0.03613407485344957, "flos": 18920475843840.0, "grad_norm": 1.539176234993382, "language_loss": 0.86816466, "learning_rate": 3.999605492246508e-06, "loss": 0.90196371, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 3.34570312, "router_z_loss_mlp": 0.75195312, "step": 601, "time_per_iteration": 3.00903058052063 }, { "auxiliary_loss_clip": 0.01929101, "auxiliary_loss_mlp": 0.01440509, "balance_loss_clip": 1.59515429, "balance_loss_mlp": 1.36302269, "epoch": 0.03619419810611754, "flos": 23048660609280.0, "grad_norm": 2.2456939514783096, "language_loss": 0.83116972, "learning_rate": 3.999597719142107e-06, "loss": 0.86486578, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 3.34179688, "router_z_loss_mlp": 0.77392578, "step": 602, "time_per_iteration": 3.0253920555114746 }, { "auxiliary_loss_clip": 0.01944493, "auxiliary_loss_mlp": 0.01440998, "balance_loss_clip": 1.59861147, "balance_loss_mlp": 1.37180924, "epoch": 0.03625432135878551, "flos": 29468990396160.0, "grad_norm": 1.8356301208711294, "language_loss": 0.84219337, "learning_rate": 3.999589870212761e-06, "loss": 0.87604827, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 3.45898438, "router_z_loss_mlp": 0.69140625, "step": 603, "time_per_iteration": 3.133394479751587 }, { "auxiliary_loss_clip": 0.01932313, "auxiliary_loss_mlp": 0.01452384, "balance_loss_clip": 1.60335541, "balance_loss_mlp": 1.35291553, "epoch": 0.03631444461145348, "flos": 23517660654720.0, "grad_norm": 1.8812945710816888, "language_loss": 0.90467238, "learning_rate": 3.9995819454587664e-06, "loss": 0.93851942, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.99462891, "step": 604, "time_per_iteration": 2.9759998321533203 }, { "auxiliary_loss_clip": 0.01928248, "auxiliary_loss_mlp": 0.01443333, "balance_loss_clip": 1.59704578, "balance_loss_mlp": 1.35526109, "epoch": 0.03637456786412145, "flos": 16626882988800.0, "grad_norm": 1.9062668440924413, "language_loss": 0.8915112, "learning_rate": 3.999573944880424e-06, "loss": 0.92522705, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.88037109, "step": 605, "time_per_iteration": 3.019666910171509 }, { "auxiliary_loss_clip": 0.01930871, "auxiliary_loss_mlp": 0.01424753, "balance_loss_clip": 1.59140468, "balance_loss_mlp": 1.35794806, "epoch": 0.03643469111678942, "flos": 15860361265920.0, "grad_norm": 2.8454459646878076, "language_loss": 0.91682136, "learning_rate": 3.9995658684780375e-06, "loss": 0.95037758, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.66748047, "step": 606, "time_per_iteration": 2.983440399169922 }, { "auxiliary_loss_clip": 0.01934032, "auxiliary_loss_mlp": 0.01440346, "balance_loss_clip": 1.59333563, "balance_loss_mlp": 1.36409986, "epoch": 0.03649481436945739, "flos": 23630175768960.0, "grad_norm": 2.23284893499446, "language_loss": 0.8723774, "learning_rate": 3.999557716251912e-06, "loss": 0.90612119, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 3.40429688, "router_z_loss_mlp": 0.76220703, "step": 607, "time_per_iteration": 3.020555019378662 }, { "auxiliary_loss_clip": 0.01912361, "auxiliary_loss_mlp": 0.01424373, "balance_loss_clip": 1.58429635, "balance_loss_mlp": 1.35260868, "epoch": 0.036554937622125354, "flos": 21763839991680.0, "grad_norm": 2.286078587468741, "language_loss": 0.8705709, "learning_rate": 3.999549488202358e-06, "loss": 0.90393817, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.71826172, "step": 608, "time_per_iteration": 4.450667381286621 }, { "auxiliary_loss_clip": 0.0191477, "auxiliary_loss_mlp": 0.01426967, "balance_loss_clip": 1.58949876, "balance_loss_mlp": 1.35396314, "epoch": 0.036615060874793326, "flos": 17828217671040.0, "grad_norm": 2.323977553959948, "language_loss": 0.87034273, "learning_rate": 3.999541184329688e-06, "loss": 0.90376008, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.72998047, "step": 609, "time_per_iteration": 2.9909160137176514 }, { "auxiliary_loss_clip": 0.01931804, "auxiliary_loss_mlp": 0.01418835, "balance_loss_clip": 1.59376013, "balance_loss_mlp": 1.34921646, "epoch": 0.0366751841274613, "flos": 26763279488640.0, "grad_norm": 2.1419707970279918, "language_loss": 0.85734242, "learning_rate": 3.999532804634215e-06, "loss": 0.89084876, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 3.3828125, "router_z_loss_mlp": 0.69628906, "step": 610, "time_per_iteration": 5.774108409881592 }, { "auxiliary_loss_clip": 0.01940297, "auxiliary_loss_mlp": 0.01436349, "balance_loss_clip": 1.60339344, "balance_loss_mlp": 1.35905361, "epoch": 0.03673530738012926, "flos": 22206254077440.0, "grad_norm": 1.926008475677365, "language_loss": 0.92885894, "learning_rate": 3.9995243491162575e-06, "loss": 0.96262538, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.77392578, "step": 611, "time_per_iteration": 4.349674463272095 }, { "auxiliary_loss_clip": 0.01946333, "auxiliary_loss_mlp": 0.01431096, "balance_loss_clip": 1.61029577, "balance_loss_mlp": 1.35842562, "epoch": 0.036795430632797235, "flos": 24692545111680.0, "grad_norm": 3.94706731067507, "language_loss": 0.77191204, "learning_rate": 3.999515817776136e-06, "loss": 0.80568635, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 3.36132812, "router_z_loss_mlp": 0.72607422, "step": 612, "time_per_iteration": 2.924837350845337 }, { "auxiliary_loss_clip": 0.01955085, "auxiliary_loss_mlp": 0.01435615, "balance_loss_clip": 1.6044724, "balance_loss_mlp": 1.36351705, "epoch": 0.0368555538854652, "flos": 17757897972480.0, "grad_norm": 2.5676435594147384, "language_loss": 0.85787749, "learning_rate": 3.999507210614175e-06, "loss": 0.89178449, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 3.50585938, "router_z_loss_mlp": 0.72119141, "step": 613, "time_per_iteration": 3.089292287826538 }, { "auxiliary_loss_clip": 0.01939063, "auxiliary_loss_mlp": 0.01430163, "balance_loss_clip": 1.60057998, "balance_loss_mlp": 1.35835099, "epoch": 0.03691567713813317, "flos": 20604203032320.0, "grad_norm": 3.155582607142536, "language_loss": 0.9845022, "learning_rate": 3.9994985276307e-06, "loss": 1.01819444, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 3.38671875, "router_z_loss_mlp": 0.71826172, "step": 614, "time_per_iteration": 3.069502592086792 }, { "auxiliary_loss_clip": 0.0197522, "auxiliary_loss_mlp": 0.0143698, "balance_loss_clip": 1.61723971, "balance_loss_mlp": 1.35515499, "epoch": 0.036975800390801145, "flos": 33661839260160.0, "grad_norm": 3.0095886195578037, "language_loss": 0.82343149, "learning_rate": 3.999489768826041e-06, "loss": 0.85755348, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 3.578125, "router_z_loss_mlp": 0.81835938, "step": 615, "time_per_iteration": 3.0997555255889893 }, { "auxiliary_loss_clip": 0.01984445, "auxiliary_loss_mlp": 0.01453485, "balance_loss_clip": 1.62321246, "balance_loss_mlp": 1.37618971, "epoch": 0.03703592364346911, "flos": 28305507628800.0, "grad_norm": 1.601603677165002, "language_loss": 0.87257296, "learning_rate": 3.999480934200528e-06, "loss": 0.90695226, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.7734375, "step": 616, "time_per_iteration": 3.0701889991760254 }, { "auxiliary_loss_clip": 0.0199195, "auxiliary_loss_mlp": 0.01430348, "balance_loss_clip": 1.62976766, "balance_loss_mlp": 1.3622086, "epoch": 0.03709604689613708, "flos": 31516803642240.0, "grad_norm": 2.177943192355028, "language_loss": 0.73765069, "learning_rate": 3.999472023754499e-06, "loss": 0.77187365, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 3.61914062, "router_z_loss_mlp": 0.68115234, "step": 617, "time_per_iteration": 3.031184673309326 }, { "auxiliary_loss_clip": 0.01965922, "auxiliary_loss_mlp": 0.0143351, "balance_loss_clip": 1.62356257, "balance_loss_mlp": 1.36660957, "epoch": 0.03715617014880505, "flos": 19618759128960.0, "grad_norm": 2.112586893724527, "language_loss": 0.85774469, "learning_rate": 3.99946303748829e-06, "loss": 0.89173907, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.66894531, "step": 618, "time_per_iteration": 3.038404941558838 }, { "auxiliary_loss_clip": 0.0198962, "auxiliary_loss_mlp": 0.01546613, "balance_loss_clip": 1.62751937, "balance_loss_mlp": 1.38229489, "epoch": 0.03721629340147302, "flos": 15932581246080.0, "grad_norm": 2.2050070208031958, "language_loss": 0.95398563, "learning_rate": 3.999453975402242e-06, "loss": 0.98934793, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 3.62304688, "router_z_loss_mlp": 1.64404297, "step": 619, "time_per_iteration": 2.9268627166748047 }, { "auxiliary_loss_clip": 0.01990775, "auxiliary_loss_mlp": 0.01464894, "balance_loss_clip": 1.62606525, "balance_loss_mlp": 1.38869524, "epoch": 0.03727641665414099, "flos": 21113543456640.0, "grad_norm": 2.093550364324197, "language_loss": 0.98569179, "learning_rate": 3.9994448374967e-06, "loss": 1.02024841, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 3.6484375, "router_z_loss_mlp": 0.76171875, "step": 620, "time_per_iteration": 2.9641520977020264 }, { "auxiliary_loss_clip": 0.02010425, "auxiliary_loss_mlp": 0.01458263, "balance_loss_clip": 1.63691294, "balance_loss_mlp": 1.3900274, "epoch": 0.037336539906808956, "flos": 24141733188480.0, "grad_norm": 1.8063927247271854, "language_loss": 0.8125475, "learning_rate": 3.999435623772008e-06, "loss": 0.84723437, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 3.73632812, "router_z_loss_mlp": 0.68212891, "step": 621, "time_per_iteration": 3.092707633972168 }, { "auxiliary_loss_clip": 0.01988741, "auxiliary_loss_mlp": 0.01445774, "balance_loss_clip": 1.63122034, "balance_loss_mlp": 1.37977982, "epoch": 0.03739666315947693, "flos": 22356621106560.0, "grad_norm": 2.5836089076134385, "language_loss": 0.9271642, "learning_rate": 3.999426334228518e-06, "loss": 0.96150929, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 3.57226562, "router_z_loss_mlp": 0.65917969, "step": 622, "time_per_iteration": 3.104374647140503 }, { "auxiliary_loss_clip": 0.01991309, "auxiliary_loss_mlp": 0.01435521, "balance_loss_clip": 1.63254762, "balance_loss_mlp": 1.36900246, "epoch": 0.0374567864121449, "flos": 20459355868800.0, "grad_norm": 2.056702133791348, "language_loss": 0.9466567, "learning_rate": 3.999416968866581e-06, "loss": 0.98092502, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.66503906, "step": 623, "time_per_iteration": 3.034270763397217 }, { "auxiliary_loss_clip": 0.02000277, "auxiliary_loss_mlp": 0.01465206, "balance_loss_clip": 1.63302922, "balance_loss_mlp": 1.38400054, "epoch": 0.037516909664812866, "flos": 19217318849280.0, "grad_norm": 2.7509971422018205, "language_loss": 0.87283587, "learning_rate": 3.999407527686551e-06, "loss": 0.90749079, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 3.67578125, "router_z_loss_mlp": 0.81152344, "step": 624, "time_per_iteration": 3.065488815307617 }, { "auxiliary_loss_clip": 0.01989801, "auxiliary_loss_mlp": 0.01431066, "balance_loss_clip": 1.62796307, "balance_loss_mlp": 1.36645436, "epoch": 0.03757703291748084, "flos": 35018291675520.0, "grad_norm": 2.338898533237047, "language_loss": 0.7313059, "learning_rate": 3.999398010688788e-06, "loss": 0.76551461, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 3.6171875, "router_z_loss_mlp": 0.64648438, "step": 625, "time_per_iteration": 3.1954362392425537 }, { "auxiliary_loss_clip": 0.01981862, "auxiliary_loss_mlp": 0.0143992, "balance_loss_clip": 1.62162209, "balance_loss_mlp": 1.36601055, "epoch": 0.0376371561701488, "flos": 25494882733440.0, "grad_norm": 1.9093250920311804, "language_loss": 0.80136228, "learning_rate": 3.999388417873652e-06, "loss": 0.83558005, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 3.59765625, "router_z_loss_mlp": 0.73925781, "step": 626, "time_per_iteration": 3.1030514240264893 }, { "auxiliary_loss_clip": 0.01965828, "auxiliary_loss_mlp": 0.0142893, "balance_loss_clip": 1.60747957, "balance_loss_mlp": 1.36059904, "epoch": 0.037697279422816775, "flos": 18194430234240.0, "grad_norm": 2.83098725595125, "language_loss": 0.84185112, "learning_rate": 3.999378749241506e-06, "loss": 0.87579876, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 3.58398438, "router_z_loss_mlp": 0.68359375, "step": 627, "time_per_iteration": 3.173980951309204 }, { "auxiliary_loss_clip": 0.01965883, "auxiliary_loss_mlp": 0.01439176, "balance_loss_clip": 1.60075998, "balance_loss_mlp": 1.36946237, "epoch": 0.03775740267548475, "flos": 24654919420800.0, "grad_norm": 1.705400091856147, "language_loss": 0.91859508, "learning_rate": 3.999369004792719e-06, "loss": 0.95264566, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 3.65039062, "router_z_loss_mlp": 0.69628906, "step": 628, "time_per_iteration": 3.044499397277832 }, { "auxiliary_loss_clip": 0.01937686, "auxiliary_loss_mlp": 0.01440135, "balance_loss_clip": 1.58500576, "balance_loss_mlp": 1.36493731, "epoch": 0.03781752592815271, "flos": 21298142816640.0, "grad_norm": 2.0619044621492413, "language_loss": 0.83408713, "learning_rate": 3.999359184527658e-06, "loss": 0.86786532, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 3.5234375, "router_z_loss_mlp": 0.75195312, "step": 629, "time_per_iteration": 2.999502420425415 }, { "auxiliary_loss_clip": 0.01938917, "auxiliary_loss_mlp": 0.01425595, "balance_loss_clip": 1.5889039, "balance_loss_mlp": 1.35807514, "epoch": 0.037877649180820684, "flos": 22099439808000.0, "grad_norm": 1.7071060015623658, "language_loss": 0.82495666, "learning_rate": 3.999349288446696e-06, "loss": 0.85860181, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.67480469, "step": 630, "time_per_iteration": 2.98579740524292 }, { "auxiliary_loss_clip": 0.0194274, "auxiliary_loss_mlp": 0.01420214, "balance_loss_clip": 1.58717513, "balance_loss_mlp": 1.3508817, "epoch": 0.03793777243348865, "flos": 14509383471360.0, "grad_norm": 2.6494900699367276, "language_loss": 0.98618186, "learning_rate": 3.99933931655021e-06, "loss": 1.01981139, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 3.5546875, "router_z_loss_mlp": 0.69335938, "step": 631, "time_per_iteration": 3.008906126022339 }, { "auxiliary_loss_clip": 0.01899916, "auxiliary_loss_mlp": 0.01432099, "balance_loss_clip": 1.56636965, "balance_loss_mlp": 1.36104989, "epoch": 0.03799789568615662, "flos": 21918505276800.0, "grad_norm": 1.8301356800961104, "language_loss": 0.938528, "learning_rate": 3.999329268838575e-06, "loss": 0.97184813, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.71044922, "step": 632, "time_per_iteration": 2.9540345668792725 }, { "auxiliary_loss_clip": 0.01927582, "auxiliary_loss_mlp": 0.01439588, "balance_loss_clip": 1.5803442, "balance_loss_mlp": 1.36529696, "epoch": 0.03805801893882459, "flos": 24837799478400.0, "grad_norm": 1.7480285827524378, "language_loss": 0.8796767, "learning_rate": 3.999319145312175e-06, "loss": 0.91334844, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 3.47070312, "router_z_loss_mlp": 0.7421875, "step": 633, "time_per_iteration": 2.9677391052246094 }, { "auxiliary_loss_clip": 0.01935275, "auxiliary_loss_mlp": 0.01432711, "balance_loss_clip": 1.58136535, "balance_loss_mlp": 1.35918283, "epoch": 0.03811814219149256, "flos": 30495362860800.0, "grad_norm": 2.3455166878697504, "language_loss": 0.73889506, "learning_rate": 3.999308945971392e-06, "loss": 0.77257496, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.73583984, "step": 634, "time_per_iteration": 3.0682854652404785 }, { "auxiliary_loss_clip": 0.01734273, "auxiliary_loss_mlp": 0.0138631, "balance_loss_clip": 1.46508932, "balance_loss_mlp": 1.33290446, "epoch": 0.03817826544416053, "flos": 67020654960000.0, "grad_norm": 0.9123790216905204, "language_loss": 0.61735392, "learning_rate": 3.999298670816614e-06, "loss": 0.64855975, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 0.53515625, "step": 635, "time_per_iteration": 3.4068970680236816 }, { "auxiliary_loss_clip": 0.019051, "auxiliary_loss_mlp": 0.01442545, "balance_loss_clip": 1.5672183, "balance_loss_mlp": 1.35857415, "epoch": 0.038238388696828496, "flos": 20495036033280.0, "grad_norm": 2.299456674945861, "language_loss": 0.89363319, "learning_rate": 3.9992883198482294e-06, "loss": 0.9271096, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.83984375, "step": 636, "time_per_iteration": 2.9538116455078125 }, { "auxiliary_loss_clip": 0.01904595, "auxiliary_loss_mlp": 0.01423407, "balance_loss_clip": 1.55946088, "balance_loss_mlp": 1.35374093, "epoch": 0.03829851194949647, "flos": 17974422178560.0, "grad_norm": 2.527381808393012, "language_loss": 0.86820251, "learning_rate": 3.999277893066632e-06, "loss": 0.90148252, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 3.453125, "router_z_loss_mlp": 0.69677734, "step": 637, "time_per_iteration": 3.0464611053466797 }, { "auxiliary_loss_clip": 0.01913999, "auxiliary_loss_mlp": 0.01440562, "balance_loss_clip": 1.56176019, "balance_loss_mlp": 1.36412489, "epoch": 0.03835863520216444, "flos": 22466964470400.0, "grad_norm": 1.8864941358407028, "language_loss": 0.89376199, "learning_rate": 3.999267390472215e-06, "loss": 0.92730761, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 3.52148438, "router_z_loss_mlp": 0.76513672, "step": 638, "time_per_iteration": 2.938405990600586 }, { "auxiliary_loss_clip": 0.01917575, "auxiliary_loss_mlp": 0.01436482, "balance_loss_clip": 1.55777884, "balance_loss_mlp": 1.35947239, "epoch": 0.038418758454832405, "flos": 22174736434560.0, "grad_norm": 2.5582157337740594, "language_loss": 0.75675887, "learning_rate": 3.999256812065381e-06, "loss": 0.79029942, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 3.59570312, "router_z_loss_mlp": 0.77050781, "step": 639, "time_per_iteration": 3.016514539718628 }, { "auxiliary_loss_clip": 0.01907902, "auxiliary_loss_mlp": 0.0144234, "balance_loss_clip": 1.55320704, "balance_loss_mlp": 1.35608017, "epoch": 0.03847888170750038, "flos": 22757699427840.0, "grad_norm": 2.1302036355985834, "language_loss": 0.92118192, "learning_rate": 3.999246157846526e-06, "loss": 0.95468438, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 3.54882812, "router_z_loss_mlp": 0.86230469, "step": 640, "time_per_iteration": 2.9296979904174805 }, { "auxiliary_loss_clip": 0.01899025, "auxiliary_loss_mlp": 0.01429637, "balance_loss_clip": 1.55108666, "balance_loss_mlp": 1.3654542, "epoch": 0.03853900496016834, "flos": 22721521570560.0, "grad_norm": 1.8983695019685312, "language_loss": 0.87132448, "learning_rate": 3.9992354278160574e-06, "loss": 0.90461111, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 3.47851562, "router_z_loss_mlp": 0.64160156, "step": 641, "time_per_iteration": 3.0239412784576416 }, { "auxiliary_loss_clip": 0.01695042, "auxiliary_loss_mlp": 0.01373602, "balance_loss_clip": 1.43437839, "balance_loss_mlp": 1.32839739, "epoch": 0.038599128212836314, "flos": 70431571607040.0, "grad_norm": 0.9020523739176979, "language_loss": 0.65704423, "learning_rate": 3.999224621974381e-06, "loss": 0.68773061, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.45117188, "step": 642, "time_per_iteration": 3.3538568019866943 }, { "auxiliary_loss_clip": 0.01864784, "auxiliary_loss_mlp": 0.01436553, "balance_loss_clip": 1.52845466, "balance_loss_mlp": 1.37122655, "epoch": 0.03865925146550429, "flos": 23305660928640.0, "grad_norm": 1.7744260865943782, "language_loss": 0.84141541, "learning_rate": 3.999213740321906e-06, "loss": 0.87442875, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 3.36523438, "router_z_loss_mlp": 0.65332031, "step": 643, "time_per_iteration": 4.3217246532440186 }, { "auxiliary_loss_clip": 0.01863137, "auxiliary_loss_mlp": 0.01426457, "balance_loss_clip": 1.51912498, "balance_loss_mlp": 1.35311985, "epoch": 0.03871937471817225, "flos": 21439460885760.0, "grad_norm": 1.716062926633632, "language_loss": 0.85317433, "learning_rate": 3.999202782859046e-06, "loss": 0.88607025, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 3.4375, "router_z_loss_mlp": 0.73291016, "step": 644, "time_per_iteration": 2.926156759262085 }, { "auxiliary_loss_clip": 0.01866781, "auxiliary_loss_mlp": 0.01420001, "balance_loss_clip": 1.52869511, "balance_loss_mlp": 1.35748792, "epoch": 0.038779497970840224, "flos": 34290662497920.0, "grad_norm": 2.0923708374500243, "language_loss": 0.85123581, "learning_rate": 3.9991917495862165e-06, "loss": 0.88410366, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.62451172, "step": 645, "time_per_iteration": 4.45543360710144 }, { "auxiliary_loss_clip": 0.01864505, "auxiliary_loss_mlp": 0.0142313, "balance_loss_clip": 1.52902365, "balance_loss_mlp": 1.35823202, "epoch": 0.03883962122350819, "flos": 22758649568640.0, "grad_norm": 2.7283505714124883, "language_loss": 0.86663806, "learning_rate": 3.9991806405038345e-06, "loss": 0.89951444, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.64868164, "step": 646, "time_per_iteration": 4.4281065464019775 }, { "auxiliary_loss_clip": 0.01845573, "auxiliary_loss_mlp": 0.01421225, "balance_loss_clip": 1.51264119, "balance_loss_mlp": 1.35833037, "epoch": 0.03889974447617616, "flos": 21956176212480.0, "grad_norm": 3.326580865876682, "language_loss": 0.85393858, "learning_rate": 3.999169455612323e-06, "loss": 0.88660657, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 3.33007812, "router_z_loss_mlp": 0.62890625, "step": 647, "time_per_iteration": 2.881925582885742 }, { "auxiliary_loss_clip": 0.01845996, "auxiliary_loss_mlp": 0.01415517, "balance_loss_clip": 1.51274228, "balance_loss_mlp": 1.35128653, "epoch": 0.03895986772884413, "flos": 31517572803840.0, "grad_norm": 2.263818984224509, "language_loss": 0.89751983, "learning_rate": 3.999158194912106e-06, "loss": 0.93013501, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 3.33203125, "router_z_loss_mlp": 0.64208984, "step": 648, "time_per_iteration": 2.986081838607788 }, { "auxiliary_loss_clip": 0.01847286, "auxiliary_loss_mlp": 0.01415413, "balance_loss_clip": 1.51501799, "balance_loss_mlp": 1.35318518, "epoch": 0.0390199909815121, "flos": 19910398982400.0, "grad_norm": 1.8080214588962245, "language_loss": 0.87803572, "learning_rate": 3.9991468584036086e-06, "loss": 0.91066277, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 3.31835938, "router_z_loss_mlp": 0.62207031, "step": 649, "time_per_iteration": 2.902810573577881 }, { "auxiliary_loss_clip": 0.01867056, "auxiliary_loss_mlp": 0.01418014, "balance_loss_clip": 1.52446318, "balance_loss_mlp": 1.35678792, "epoch": 0.03908011423418007, "flos": 21621436047360.0, "grad_norm": 1.7147108180707142, "language_loss": 0.82984376, "learning_rate": 3.999135446087263e-06, "loss": 0.8626945, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.61230469, "step": 650, "time_per_iteration": 2.916159152984619 }, { "auxiliary_loss_clip": 0.01848007, "auxiliary_loss_mlp": 0.01419496, "balance_loss_clip": 1.51558495, "balance_loss_mlp": 1.35622001, "epoch": 0.039140237486848035, "flos": 18670714692480.0, "grad_norm": 2.1479990763192647, "language_loss": 0.82980067, "learning_rate": 3.9991239579635e-06, "loss": 0.86247569, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 3.3203125, "router_z_loss_mlp": 0.63232422, "step": 651, "time_per_iteration": 2.916741132736206 }, { "auxiliary_loss_clip": 0.01865738, "auxiliary_loss_mlp": 0.01422804, "balance_loss_clip": 1.52524722, "balance_loss_mlp": 1.36148286, "epoch": 0.03920036073951601, "flos": 18670352734080.0, "grad_norm": 2.189500565319255, "language_loss": 0.91284573, "learning_rate": 3.999112394032757e-06, "loss": 0.94573104, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 3.40820312, "router_z_loss_mlp": 0.61328125, "step": 652, "time_per_iteration": 2.8671233654022217 }, { "auxiliary_loss_clip": 0.01856369, "auxiliary_loss_mlp": 0.01414307, "balance_loss_clip": 1.51763535, "balance_loss_mlp": 1.35079229, "epoch": 0.03926048399218398, "flos": 31366029409920.0, "grad_norm": 2.516393821812266, "language_loss": 0.86500233, "learning_rate": 3.999100754295471e-06, "loss": 0.89770901, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 3.390625, "router_z_loss_mlp": 0.63574219, "step": 653, "time_per_iteration": 3.0481810569763184 }, { "auxiliary_loss_clip": 0.01910817, "auxiliary_loss_mlp": 0.01429159, "balance_loss_clip": 1.54609156, "balance_loss_mlp": 1.36268783, "epoch": 0.039320607244851945, "flos": 29614697210880.0, "grad_norm": 1.9226558760436334, "language_loss": 0.90441477, "learning_rate": 3.999089038752085e-06, "loss": 0.93781459, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 3.64453125, "router_z_loss_mlp": 0.6640625, "step": 654, "time_per_iteration": 3.0295088291168213 }, { "auxiliary_loss_clip": 0.0166176, "auxiliary_loss_mlp": 0.01389718, "balance_loss_clip": 1.41711056, "balance_loss_mlp": 1.35214365, "epoch": 0.03938073049751992, "flos": 66566811922560.0, "grad_norm": 0.7550562334137961, "language_loss": 0.50148559, "learning_rate": 3.999077247403041e-06, "loss": 0.53200042, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.375, "step": 655, "time_per_iteration": 3.4123547077178955 }, { "auxiliary_loss_clip": 0.01862617, "auxiliary_loss_mlp": 0.0142485, "balance_loss_clip": 1.52751708, "balance_loss_mlp": 1.36872637, "epoch": 0.03944085375018788, "flos": 23378288112000.0, "grad_norm": 2.0602852960731664, "language_loss": 0.82973337, "learning_rate": 3.9990653802487886e-06, "loss": 0.86260808, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.56103516, "step": 656, "time_per_iteration": 2.993741750717163 }, { "auxiliary_loss_clip": 0.01920202, "auxiliary_loss_mlp": 0.01433585, "balance_loss_clip": 1.55846417, "balance_loss_mlp": 1.36959374, "epoch": 0.039500977002855854, "flos": 18556616010240.0, "grad_norm": 2.418791301111852, "language_loss": 0.84842271, "learning_rate": 3.999053437289776e-06, "loss": 0.88196057, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 3.61523438, "router_z_loss_mlp": 0.64013672, "step": 657, "time_per_iteration": 2.9691553115844727 }, { "auxiliary_loss_clip": 0.01898823, "auxiliary_loss_mlp": 0.01431643, "balance_loss_clip": 1.54260051, "balance_loss_mlp": 1.36979771, "epoch": 0.039561100255523826, "flos": 25349175918720.0, "grad_norm": 2.005841512242625, "language_loss": 0.85498816, "learning_rate": 3.999041418526457e-06, "loss": 0.88829285, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 3.56640625, "router_z_loss_mlp": 0.61865234, "step": 658, "time_per_iteration": 2.9596269130706787 }, { "auxiliary_loss_clip": 0.01888085, "auxiliary_loss_mlp": 0.01426321, "balance_loss_clip": 1.54105771, "balance_loss_mlp": 1.36728859, "epoch": 0.03962122350819179, "flos": 18228119627520.0, "grad_norm": 1.8234646566849932, "language_loss": 0.9473331, "learning_rate": 3.999029323959287e-06, "loss": 0.98047709, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 3.46875, "router_z_loss_mlp": 0.59033203, "step": 659, "time_per_iteration": 2.960434675216675 }, { "auxiliary_loss_clip": 0.01911292, "auxiliary_loss_mlp": 0.01426167, "balance_loss_clip": 1.55172038, "balance_loss_mlp": 1.36546612, "epoch": 0.03968134676085976, "flos": 20532028296960.0, "grad_norm": 2.123162369032689, "language_loss": 0.83870721, "learning_rate": 3.999017153588724e-06, "loss": 0.87208176, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 3.59375, "router_z_loss_mlp": 0.60742188, "step": 660, "time_per_iteration": 3.0618982315063477 }, { "auxiliary_loss_clip": 0.01913527, "auxiliary_loss_mlp": 0.01430668, "balance_loss_clip": 1.55949771, "balance_loss_mlp": 1.36434054, "epoch": 0.03974147001352773, "flos": 22433682280320.0, "grad_norm": 1.9628177620222127, "language_loss": 0.84569657, "learning_rate": 3.999004907415231e-06, "loss": 0.87913853, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 3.53710938, "router_z_loss_mlp": 0.66308594, "step": 661, "time_per_iteration": 3.0876083374023438 }, { "auxiliary_loss_clip": 0.0169653, "auxiliary_loss_mlp": 0.01372212, "balance_loss_clip": 1.44540238, "balance_loss_mlp": 1.3430295, "epoch": 0.0398015932661957, "flos": 71161372535040.0, "grad_norm": 0.9321118218152142, "language_loss": 0.69561112, "learning_rate": 3.998992585439272e-06, "loss": 0.72629851, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.29101562, "step": 662, "time_per_iteration": 3.5154201984405518 }, { "auxiliary_loss_clip": 0.01934405, "auxiliary_loss_mlp": 0.01436495, "balance_loss_clip": 1.57016182, "balance_loss_mlp": 1.37202692, "epoch": 0.03986171651886367, "flos": 16809853536000.0, "grad_norm": 1.7963993071818343, "language_loss": 0.86277544, "learning_rate": 3.998980187661314e-06, "loss": 0.89648449, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.64453125, "step": 663, "time_per_iteration": 2.8989055156707764 }, { "auxiliary_loss_clip": 0.01926693, "auxiliary_loss_mlp": 0.01420133, "balance_loss_clip": 1.56250846, "balance_loss_mlp": 1.35881209, "epoch": 0.03992183977153164, "flos": 24545752421760.0, "grad_norm": 1.9511389514799267, "language_loss": 0.90215009, "learning_rate": 3.998967714081826e-06, "loss": 0.93561834, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 3.64257812, "router_z_loss_mlp": 0.61376953, "step": 664, "time_per_iteration": 3.0310490131378174 }, { "auxiliary_loss_clip": 0.01870944, "auxiliary_loss_mlp": 0.01412269, "balance_loss_clip": 1.54272759, "balance_loss_mlp": 1.35221159, "epoch": 0.03998196302419961, "flos": 15604220597760.0, "grad_norm": 1.9292840558995679, "language_loss": 0.8748585, "learning_rate": 3.998955164701281e-06, "loss": 0.90769064, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.60107422, "step": 665, "time_per_iteration": 2.916304111480713 }, { "auxiliary_loss_clip": 0.01928323, "auxiliary_loss_mlp": 0.01431631, "balance_loss_clip": 1.5622592, "balance_loss_mlp": 1.36563671, "epoch": 0.04004208627686758, "flos": 25316934359040.0, "grad_norm": 1.94374697864009, "language_loss": 0.84413898, "learning_rate": 3.998942539520158e-06, "loss": 0.87773848, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 3.66210938, "router_z_loss_mlp": 0.65966797, "step": 666, "time_per_iteration": 2.979661226272583 }, { "auxiliary_loss_clip": 0.01892872, "auxiliary_loss_mlp": 0.01415505, "balance_loss_clip": 1.54545093, "balance_loss_mlp": 1.35475636, "epoch": 0.04010220952953555, "flos": 23485962032640.0, "grad_norm": 1.8909717933166827, "language_loss": 0.90105104, "learning_rate": 3.998929838538932e-06, "loss": 0.93413484, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.60742188, "step": 667, "time_per_iteration": 2.935433864593506 }, { "auxiliary_loss_clip": 0.01877965, "auxiliary_loss_mlp": 0.01422149, "balance_loss_clip": 1.54295158, "balance_loss_mlp": 1.3608036, "epoch": 0.04016233278220352, "flos": 18624266265600.0, "grad_norm": 2.211125468405515, "language_loss": 0.85424697, "learning_rate": 3.998917061758087e-06, "loss": 0.8872481, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 3.34960938, "router_z_loss_mlp": 0.61352539, "step": 668, "time_per_iteration": 2.887366771697998 }, { "auxiliary_loss_clip": 0.01661343, "auxiliary_loss_mlp": 0.01386464, "balance_loss_clip": 1.41890478, "balance_loss_mlp": 1.34755397, "epoch": 0.040222456034871484, "flos": 70937790140160.0, "grad_norm": 0.8589919875942349, "language_loss": 0.60309774, "learning_rate": 3.998904209178107e-06, "loss": 0.6335758, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.38867188, "step": 669, "time_per_iteration": 3.48913311958313 }, { "auxiliary_loss_clip": 0.01886383, "auxiliary_loss_mlp": 0.01420693, "balance_loss_clip": 1.53702879, "balance_loss_mlp": 1.35202837, "epoch": 0.040282579287539456, "flos": 23774118036480.0, "grad_norm": 1.8331547380817905, "language_loss": 0.89137757, "learning_rate": 3.9988912807994785e-06, "loss": 0.92444831, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 3.4921875, "router_z_loss_mlp": 0.68701172, "step": 670, "time_per_iteration": 2.9291563034057617 }, { "auxiliary_loss_clip": 0.01876047, "auxiliary_loss_mlp": 0.01417794, "balance_loss_clip": 1.53944075, "balance_loss_mlp": 1.3616941, "epoch": 0.04034270254020743, "flos": 18487653655680.0, "grad_norm": 4.428579487525348, "language_loss": 0.77549636, "learning_rate": 3.998878276622692e-06, "loss": 0.80843484, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 3.3671875, "router_z_loss_mlp": 0.56079102, "step": 671, "time_per_iteration": 2.951420783996582 }, { "auxiliary_loss_clip": 0.01881956, "auxiliary_loss_mlp": 0.0142333, "balance_loss_clip": 1.5373559, "balance_loss_mlp": 1.36105514, "epoch": 0.040402825792875394, "flos": 17210750878080.0, "grad_norm": 2.1542671324686564, "language_loss": 0.96062577, "learning_rate": 3.998865196648242e-06, "loss": 0.99367857, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.62255859, "step": 672, "time_per_iteration": 2.8764255046844482 }, { "auxiliary_loss_clip": 0.01877707, "auxiliary_loss_mlp": 0.01441526, "balance_loss_clip": 1.53363895, "balance_loss_mlp": 1.36971474, "epoch": 0.040462949045543366, "flos": 19181593440000.0, "grad_norm": 1.934278569793815, "language_loss": 0.9276436, "learning_rate": 3.998852040876622e-06, "loss": 0.96083587, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 3.44140625, "router_z_loss_mlp": 0.7175293, "step": 673, "time_per_iteration": 2.9121477603912354 }, { "auxiliary_loss_clip": 0.018724, "auxiliary_loss_mlp": 0.01432367, "balance_loss_clip": 1.53451514, "balance_loss_mlp": 1.36961508, "epoch": 0.04052307229821133, "flos": 24029218074240.0, "grad_norm": 1.803658760214416, "language_loss": 0.77880013, "learning_rate": 3.998838809308334e-06, "loss": 0.81184781, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.62744141, "step": 674, "time_per_iteration": 2.9818060398101807 }, { "auxiliary_loss_clip": 0.01900931, "auxiliary_loss_mlp": 0.01423454, "balance_loss_clip": 1.54986596, "balance_loss_mlp": 1.3602736, "epoch": 0.0405831955508793, "flos": 16445541254400.0, "grad_norm": 2.4015005255456257, "language_loss": 0.82635844, "learning_rate": 3.9988255019438766e-06, "loss": 0.85960227, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 3.51171875, "router_z_loss_mlp": 0.63208008, "step": 675, "time_per_iteration": 2.879110813140869 }, { "auxiliary_loss_clip": 0.01857873, "auxiliary_loss_mlp": 0.01427985, "balance_loss_clip": 1.52898538, "balance_loss_mlp": 1.37038267, "epoch": 0.040643318803547275, "flos": 24290607139200.0, "grad_norm": 1.580471145716994, "language_loss": 0.79494107, "learning_rate": 3.998812118783757e-06, "loss": 0.82779962, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.57543945, "step": 676, "time_per_iteration": 2.9655325412750244 }, { "auxiliary_loss_clip": 0.01889619, "auxiliary_loss_mlp": 0.01434585, "balance_loss_clip": 1.54212189, "balance_loss_mlp": 1.37977242, "epoch": 0.04070344205621524, "flos": 17720950953600.0, "grad_norm": 2.105352965424125, "language_loss": 0.88311815, "learning_rate": 3.9987986598284804e-06, "loss": 0.9163602, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 3.47265625, "router_z_loss_mlp": 0.54736328, "step": 677, "time_per_iteration": 4.350664377212524 }, { "auxiliary_loss_clip": 0.01890635, "auxiliary_loss_mlp": 0.01419903, "balance_loss_clip": 1.55372143, "balance_loss_mlp": 1.36110914, "epoch": 0.04076356530888321, "flos": 26188958252160.0, "grad_norm": 1.9219578950959613, "language_loss": 0.79372543, "learning_rate": 3.998785125078559e-06, "loss": 0.82683086, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 3.37109375, "router_z_loss_mlp": 0.58837891, "step": 678, "time_per_iteration": 2.9264941215515137 }, { "auxiliary_loss_clip": 0.01881921, "auxiliary_loss_mlp": 0.01432368, "balance_loss_clip": 1.54716015, "balance_loss_mlp": 1.38184738, "epoch": 0.04082368856155118, "flos": 35787256617600.0, "grad_norm": 1.6239066200833818, "language_loss": 0.84643549, "learning_rate": 3.998771514534505e-06, "loss": 0.87957841, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 3.34765625, "router_z_loss_mlp": 0.50439453, "step": 679, "time_per_iteration": 3.057399034500122 }, { "auxiliary_loss_clip": 0.01881568, "auxiliary_loss_mlp": 0.01423099, "balance_loss_clip": 1.55263519, "balance_loss_mlp": 1.36800086, "epoch": 0.04088381181421915, "flos": 28158126756480.0, "grad_norm": 1.8513016466523973, "language_loss": 0.79441047, "learning_rate": 3.998757828196835e-06, "loss": 0.82745719, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 3.29101562, "router_z_loss_mlp": 0.55126953, "step": 680, "time_per_iteration": 5.890503883361816 }, { "auxiliary_loss_clip": 0.01901783, "auxiliary_loss_mlp": 0.01433088, "balance_loss_clip": 1.55541158, "balance_loss_mlp": 1.37403142, "epoch": 0.04094393506688712, "flos": 27608581687680.0, "grad_norm": 1.6542279445311474, "language_loss": 0.8598094, "learning_rate": 3.9987440660660685e-06, "loss": 0.89315808, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.59057617, "step": 681, "time_per_iteration": 3.004376173019409 }, { "auxiliary_loss_clip": 0.01911731, "auxiliary_loss_mlp": 0.0141565, "balance_loss_clip": 1.56634855, "balance_loss_mlp": 1.35943127, "epoch": 0.04100405831955509, "flos": 23122328423040.0, "grad_norm": 1.6818273753128772, "language_loss": 0.75456005, "learning_rate": 3.998730228142726e-06, "loss": 0.78783393, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 3.45507812, "router_z_loss_mlp": 0.56225586, "step": 682, "time_per_iteration": 2.9098241329193115 }, { "auxiliary_loss_clip": 0.01882517, "auxiliary_loss_mlp": 0.01412526, "balance_loss_clip": 1.5453794, "balance_loss_mlp": 1.35377991, "epoch": 0.04106418157222306, "flos": 20166268181760.0, "grad_norm": 1.672625039716468, "language_loss": 0.75341654, "learning_rate": 3.998716314427333e-06, "loss": 0.78636694, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.58764648, "step": 683, "time_per_iteration": 2.9536118507385254 }, { "auxiliary_loss_clip": 0.01877363, "auxiliary_loss_mlp": 0.01412375, "balance_loss_clip": 1.54620409, "balance_loss_mlp": 1.36068606, "epoch": 0.041124304824891024, "flos": 17429627813760.0, "grad_norm": 2.44255990949796, "language_loss": 0.84105563, "learning_rate": 3.998702324920417e-06, "loss": 0.87395304, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 3.3125, "router_z_loss_mlp": 0.51733398, "step": 684, "time_per_iteration": 2.8678042888641357 }, { "auxiliary_loss_clip": 0.0186854, "auxiliary_loss_mlp": 0.01403294, "balance_loss_clip": 1.53617358, "balance_loss_mlp": 1.34902954, "epoch": 0.041184428077558996, "flos": 25792359166080.0, "grad_norm": 1.7904478562112298, "language_loss": 0.92644894, "learning_rate": 3.9986882596225085e-06, "loss": 0.9591673, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 3.32226562, "router_z_loss_mlp": 0.54223633, "step": 685, "time_per_iteration": 2.9945783615112305 }, { "auxiliary_loss_clip": 0.01888534, "auxiliary_loss_mlp": 0.01408772, "balance_loss_clip": 1.54666662, "balance_loss_mlp": 1.35319638, "epoch": 0.04124455133022697, "flos": 22974676081920.0, "grad_norm": 2.2106828472277673, "language_loss": 0.91037917, "learning_rate": 3.998674118534141e-06, "loss": 0.94335222, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 3.421875, "router_z_loss_mlp": 0.5559082, "step": 686, "time_per_iteration": 2.9504990577697754 }, { "auxiliary_loss_clip": 0.01885941, "auxiliary_loss_mlp": 0.01419542, "balance_loss_clip": 1.54158914, "balance_loss_mlp": 1.36232173, "epoch": 0.04130467458289493, "flos": 21299228691840.0, "grad_norm": 2.016049254943442, "language_loss": 0.73946941, "learning_rate": 3.998659901655851e-06, "loss": 0.77252424, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.5715332, "step": 687, "time_per_iteration": 2.9170024394989014 }, { "auxiliary_loss_clip": 0.0183868, "auxiliary_loss_mlp": 0.01408747, "balance_loss_clip": 1.52008343, "balance_loss_mlp": 1.35875034, "epoch": 0.041364797835562905, "flos": 19983704837760.0, "grad_norm": 1.420124221429297, "language_loss": 0.88659251, "learning_rate": 3.998645608988177e-06, "loss": 0.91906679, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.50024414, "step": 688, "time_per_iteration": 2.93145751953125 }, { "auxiliary_loss_clip": 0.01838327, "auxiliary_loss_mlp": 0.01403956, "balance_loss_clip": 1.52289879, "balance_loss_mlp": 1.35341144, "epoch": 0.04142492108823087, "flos": 21915745344000.0, "grad_norm": 1.9017765394804462, "language_loss": 0.8579247, "learning_rate": 3.998631240531661e-06, "loss": 0.89034754, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.50537109, "step": 689, "time_per_iteration": 2.898087739944458 }, { "auxiliary_loss_clip": 0.01842574, "auxiliary_loss_mlp": 0.01424882, "balance_loss_clip": 1.51690888, "balance_loss_mlp": 1.36561096, "epoch": 0.04148504434089884, "flos": 27651953468160.0, "grad_norm": 1.956580045528937, "language_loss": 0.71512437, "learning_rate": 3.998616796286848e-06, "loss": 0.74779892, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.59326172, "step": 690, "time_per_iteration": 3.041569471359253 }, { "auxiliary_loss_clip": 0.01846391, "auxiliary_loss_mlp": 0.01408434, "balance_loss_clip": 1.52527702, "balance_loss_mlp": 1.35917687, "epoch": 0.041545167593566815, "flos": 20527187103360.0, "grad_norm": 1.6961855779836625, "language_loss": 0.77274734, "learning_rate": 3.998602276254286e-06, "loss": 0.80529559, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 3.20898438, "router_z_loss_mlp": 0.49194336, "step": 691, "time_per_iteration": 2.936180830001831 }, { "auxiliary_loss_clip": 0.01835314, "auxiliary_loss_mlp": 0.01409073, "balance_loss_clip": 1.51812124, "balance_loss_mlp": 1.35523796, "epoch": 0.04160529084623478, "flos": 11874851913600.0, "grad_norm": 2.1722637304999552, "language_loss": 0.86083645, "learning_rate": 3.998587680434526e-06, "loss": 0.89328033, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.53857422, "step": 692, "time_per_iteration": 2.8621623516082764 }, { "auxiliary_loss_clip": 0.0185828, "auxiliary_loss_mlp": 0.01409178, "balance_loss_clip": 1.5180285, "balance_loss_mlp": 1.35050273, "epoch": 0.04166541409890275, "flos": 14836070062080.0, "grad_norm": 2.266761029064862, "language_loss": 0.93417192, "learning_rate": 3.99857300882812e-06, "loss": 0.96684647, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 3.40234375, "router_z_loss_mlp": 0.58691406, "step": 693, "time_per_iteration": 2.884765863418579 }, { "auxiliary_loss_clip": 0.01848824, "auxiliary_loss_mlp": 0.01401082, "balance_loss_clip": 1.51981378, "balance_loss_mlp": 1.34488702, "epoch": 0.04172553735157072, "flos": 25818130719360.0, "grad_norm": 1.9467063765605717, "language_loss": 0.85516679, "learning_rate": 3.998558261435626e-06, "loss": 0.88766587, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.56176758, "step": 694, "time_per_iteration": 2.947791337966919 }, { "auxiliary_loss_clip": 0.01872993, "auxiliary_loss_mlp": 0.01411732, "balance_loss_clip": 1.5277288, "balance_loss_mlp": 1.3636663, "epoch": 0.04178566060423869, "flos": 24290471404800.0, "grad_norm": 2.6033693232401256, "language_loss": 0.8734439, "learning_rate": 3.9985434382576015e-06, "loss": 0.90629113, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 3.45507812, "router_z_loss_mlp": 0.48046875, "step": 695, "time_per_iteration": 2.991652250289917 }, { "auxiliary_loss_clip": 0.01834696, "auxiliary_loss_mlp": 0.01412095, "balance_loss_clip": 1.51167929, "balance_loss_mlp": 1.35902286, "epoch": 0.04184578385690666, "flos": 18230789070720.0, "grad_norm": 2.246575441005443, "language_loss": 0.8801986, "learning_rate": 3.99852853929461e-06, "loss": 0.91266656, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.53125, "step": 696, "time_per_iteration": 2.9283676147460938 }, { "auxiliary_loss_clip": 0.01840252, "auxiliary_loss_mlp": 0.01404549, "balance_loss_clip": 1.51386762, "balance_loss_mlp": 1.35007024, "epoch": 0.041905907109574626, "flos": 22784647345920.0, "grad_norm": 1.9688901479121834, "language_loss": 0.95167482, "learning_rate": 3.998513564547216e-06, "loss": 0.98412281, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 3.26367188, "router_z_loss_mlp": 0.54418945, "step": 697, "time_per_iteration": 2.942450523376465 }, { "auxiliary_loss_clip": 0.018095, "auxiliary_loss_mlp": 0.01403152, "balance_loss_clip": 1.49613905, "balance_loss_mlp": 1.35050964, "epoch": 0.0419660303622426, "flos": 20166539650560.0, "grad_norm": 2.6527356008532714, "language_loss": 0.87111253, "learning_rate": 3.998498514015987e-06, "loss": 0.90323907, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.52685547, "step": 698, "time_per_iteration": 2.943557024002075 }, { "auxiliary_loss_clip": 0.01834468, "auxiliary_loss_mlp": 0.01415486, "balance_loss_clip": 1.50868201, "balance_loss_mlp": 1.36525154, "epoch": 0.042026153614910564, "flos": 23086874482560.0, "grad_norm": 2.235809062833807, "language_loss": 0.9316889, "learning_rate": 3.998483387701495e-06, "loss": 0.96418834, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 3.2578125, "router_z_loss_mlp": 0.50268555, "step": 699, "time_per_iteration": 2.893601894378662 }, { "auxiliary_loss_clip": 0.01656688, "auxiliary_loss_mlp": 0.01428847, "balance_loss_clip": 1.43760383, "balance_loss_mlp": 1.40347934, "epoch": 0.042086276867578536, "flos": 64527911902080.0, "grad_norm": 0.9084685749167275, "language_loss": 0.67912495, "learning_rate": 3.998468185604312e-06, "loss": 0.70998031, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25390625, "step": 700, "time_per_iteration": 3.380621910095215 }, { "auxiliary_loss_clip": 0.01835094, "auxiliary_loss_mlp": 0.01423899, "balance_loss_clip": 1.5081811, "balance_loss_mlp": 1.36326933, "epoch": 0.04214640012024651, "flos": 15495460801920.0, "grad_norm": 2.200241090378552, "language_loss": 0.91733503, "learning_rate": 3.998452907725016e-06, "loss": 0.94992495, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 3.26757812, "router_z_loss_mlp": 0.60644531, "step": 701, "time_per_iteration": 2.894984006881714 }, { "auxiliary_loss_clip": 0.01838811, "auxiliary_loss_mlp": 0.01414127, "balance_loss_clip": 1.50803781, "balance_loss_mlp": 1.35859895, "epoch": 0.04220652337291447, "flos": 23887221333120.0, "grad_norm": 1.7841697170120672, "language_loss": 0.70937371, "learning_rate": 3.998437554064184e-06, "loss": 0.74190307, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.5546875, "step": 702, "time_per_iteration": 2.91745662689209 }, { "auxiliary_loss_clip": 0.0160452, "auxiliary_loss_mlp": 0.0137902, "balance_loss_clip": 1.38627684, "balance_loss_mlp": 1.35517788, "epoch": 0.042266646625582445, "flos": 63826235256960.0, "grad_norm": 0.8520686425842733, "language_loss": 0.6106168, "learning_rate": 3.9984221246224006e-06, "loss": 0.64045215, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.23828125, "step": 703, "time_per_iteration": 3.423539638519287 }, { "auxiliary_loss_clip": 0.0159064, "auxiliary_loss_mlp": 0.01371895, "balance_loss_clip": 1.37667143, "balance_loss_mlp": 1.35177267, "epoch": 0.04232676987825041, "flos": 50049973566720.0, "grad_norm": 1.0395438235276593, "language_loss": 0.57906008, "learning_rate": 3.9984066194002494e-06, "loss": 0.60868543, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.20117188, "step": 704, "time_per_iteration": 3.164651870727539 }, { "auxiliary_loss_clip": 0.01844929, "auxiliary_loss_mlp": 0.01418002, "balance_loss_clip": 1.51236641, "balance_loss_mlp": 1.36020899, "epoch": 0.04238689313091838, "flos": 21626005772160.0, "grad_norm": 2.7090755696354125, "language_loss": 0.91177702, "learning_rate": 3.998391038398319e-06, "loss": 0.94440639, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.57788086, "step": 705, "time_per_iteration": 2.936997890472412 }, { "auxiliary_loss_clip": 0.01815578, "auxiliary_loss_mlp": 0.01435747, "balance_loss_clip": 1.50898385, "balance_loss_mlp": 1.38312769, "epoch": 0.042447016383586354, "flos": 19144374952320.0, "grad_norm": 1.8025493171591411, "language_loss": 0.74006224, "learning_rate": 3.998375381617201e-06, "loss": 0.7725755, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.52612305, "step": 706, "time_per_iteration": 2.9508841037750244 }, { "auxiliary_loss_clip": 0.01840996, "auxiliary_loss_mlp": 0.01432665, "balance_loss_clip": 1.51314437, "balance_loss_mlp": 1.37391901, "epoch": 0.04250713963625432, "flos": 24436766401920.0, "grad_norm": 2.514182690714767, "language_loss": 0.95214689, "learning_rate": 3.9983596490574875e-06, "loss": 0.98488343, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 3.27734375, "router_z_loss_mlp": 0.58789062, "step": 707, "time_per_iteration": 2.974381446838379 }, { "auxiliary_loss_clip": 0.01873567, "auxiliary_loss_mlp": 0.01428884, "balance_loss_clip": 1.53425312, "balance_loss_mlp": 1.36417675, "epoch": 0.04256726288892229, "flos": 30378187532160.0, "grad_norm": 1.7254142115679436, "language_loss": 0.83380532, "learning_rate": 3.998343840719776e-06, "loss": 0.86682975, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 3.38867188, "router_z_loss_mlp": 0.64697266, "step": 708, "time_per_iteration": 3.045398473739624 }, { "auxiliary_loss_clip": 0.0187123, "auxiliary_loss_mlp": 0.01414633, "balance_loss_clip": 1.52891517, "balance_loss_mlp": 1.35359836, "epoch": 0.04262738614159026, "flos": 16371330503040.0, "grad_norm": 2.165310153970776, "language_loss": 0.85613942, "learning_rate": 3.998327956604666e-06, "loss": 0.88899803, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 3.42382812, "router_z_loss_mlp": 0.61083984, "step": 709, "time_per_iteration": 2.9153432846069336 }, { "auxiliary_loss_clip": 0.01899001, "auxiliary_loss_mlp": 0.01412544, "balance_loss_clip": 1.54926682, "balance_loss_mlp": 1.34798002, "epoch": 0.04268750939425823, "flos": 20422046891520.0, "grad_norm": 2.6497421113705313, "language_loss": 0.89039063, "learning_rate": 3.99831199671276e-06, "loss": 0.92350614, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 3.49804688, "router_z_loss_mlp": 0.64501953, "step": 710, "time_per_iteration": 2.986358404159546 }, { "auxiliary_loss_clip": 0.01890333, "auxiliary_loss_mlp": 0.0141115, "balance_loss_clip": 1.55238152, "balance_loss_mlp": 1.35431099, "epoch": 0.0427476326469262, "flos": 20312608423680.0, "grad_norm": 2.0018863062527354, "language_loss": 0.86911535, "learning_rate": 3.998295961044662e-06, "loss": 0.90213013, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.56762695, "step": 711, "time_per_iteration": 2.94836163520813 }, { "auxiliary_loss_clip": 0.0187944, "auxiliary_loss_mlp": 0.01406386, "balance_loss_clip": 1.5419724, "balance_loss_mlp": 1.34973764, "epoch": 0.042807755899594166, "flos": 21660238103040.0, "grad_norm": 1.686725212099122, "language_loss": 0.87494421, "learning_rate": 3.9982798496009804e-06, "loss": 0.90780246, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 3.375, "router_z_loss_mlp": 0.56640625, "step": 712, "time_per_iteration": 4.401049375534058 }, { "auxiliary_loss_clip": 0.01913099, "auxiliary_loss_mlp": 0.01429716, "balance_loss_clip": 1.56280684, "balance_loss_mlp": 1.36598706, "epoch": 0.04286787915226214, "flos": 21445206975360.0, "grad_norm": 2.2050732453180215, "language_loss": 0.94239867, "learning_rate": 3.998263662382328e-06, "loss": 0.9758268, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 3.5, "router_z_loss_mlp": 0.63745117, "step": 713, "time_per_iteration": 2.935232400894165 }, { "auxiliary_loss_clip": 0.0161998, "auxiliary_loss_mlp": 0.0138637, "balance_loss_clip": 1.40543556, "balance_loss_mlp": 1.35890424, "epoch": 0.04292800240493011, "flos": 66432552042240.0, "grad_norm": 0.8892425080044829, "language_loss": 0.63811636, "learning_rate": 3.9982473993893165e-06, "loss": 0.66817987, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.27539062, "step": 714, "time_per_iteration": 3.4342894554138184 }, { "auxiliary_loss_clip": 0.018662, "auxiliary_loss_mlp": 0.01416303, "balance_loss_clip": 1.53611696, "balance_loss_mlp": 1.35793781, "epoch": 0.042988125657598075, "flos": 31662917660160.0, "grad_norm": 1.7261391371845218, "language_loss": 0.769256, "learning_rate": 3.998231060622563e-06, "loss": 0.80208111, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 3.30078125, "router_z_loss_mlp": 0.58447266, "step": 715, "time_per_iteration": 5.839420795440674 }, { "auxiliary_loss_clip": 0.01892364, "auxiliary_loss_mlp": 0.01423129, "balance_loss_clip": 1.54726517, "balance_loss_mlp": 1.36953259, "epoch": 0.04304824891026605, "flos": 33259675063680.0, "grad_norm": 2.0276379052797586, "language_loss": 0.75453043, "learning_rate": 3.998214646082688e-06, "loss": 0.78768539, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 3.44726562, "router_z_loss_mlp": 0.53613281, "step": 716, "time_per_iteration": 3.0408191680908203 }, { "auxiliary_loss_clip": 0.01597324, "auxiliary_loss_mlp": 0.01351057, "balance_loss_clip": 1.38407516, "balance_loss_mlp": 1.33131552, "epoch": 0.04310837216293401, "flos": 64099252235520.0, "grad_norm": 0.9088058074208603, "language_loss": 0.65822923, "learning_rate": 3.998198155770314e-06, "loss": 0.68771303, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.19726562, "step": 717, "time_per_iteration": 3.354990243911743 }, { "auxiliary_loss_clip": 0.01578036, "auxiliary_loss_mlp": 0.01349515, "balance_loss_clip": 1.36827695, "balance_loss_mlp": 1.32996488, "epoch": 0.043168495415601985, "flos": 61372050048000.0, "grad_norm": 0.9809814160912551, "language_loss": 0.58976793, "learning_rate": 3.998181589686065e-06, "loss": 0.61904347, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.1953125, "step": 718, "time_per_iteration": 3.2009143829345703 }, { "auxiliary_loss_clip": 0.01867079, "auxiliary_loss_mlp": 0.01419621, "balance_loss_clip": 1.53106618, "balance_loss_mlp": 1.36421227, "epoch": 0.04322861866826996, "flos": 20714003458560.0, "grad_norm": 2.076529409708467, "language_loss": 0.94171143, "learning_rate": 3.99816494783057e-06, "loss": 0.97457844, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.55395508, "step": 719, "time_per_iteration": 2.9428839683532715 }, { "auxiliary_loss_clip": 0.01882313, "auxiliary_loss_mlp": 0.01427644, "balance_loss_clip": 1.53601754, "balance_loss_mlp": 1.3676343, "epoch": 0.04328874192093792, "flos": 30385698168960.0, "grad_norm": 1.6273678316622155, "language_loss": 0.67888725, "learning_rate": 3.99814823020446e-06, "loss": 0.7119869, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 3.46289062, "router_z_loss_mlp": 0.59985352, "step": 720, "time_per_iteration": 3.049025058746338 }, { "auxiliary_loss_clip": 0.0185489, "auxiliary_loss_mlp": 0.01424686, "balance_loss_clip": 1.5253253, "balance_loss_mlp": 1.3678236, "epoch": 0.043348865173605894, "flos": 21954864113280.0, "grad_norm": 1.7857669997040797, "language_loss": 0.799528, "learning_rate": 3.9981314368083684e-06, "loss": 0.83232379, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.56860352, "step": 721, "time_per_iteration": 2.9399807453155518 }, { "auxiliary_loss_clip": 0.01885821, "auxiliary_loss_mlp": 0.01438031, "balance_loss_clip": 1.54395938, "balance_loss_mlp": 1.38162088, "epoch": 0.04340898842627386, "flos": 15271606938240.0, "grad_norm": 2.4318385868713466, "language_loss": 0.91326874, "learning_rate": 3.998114567642933e-06, "loss": 0.94650722, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 3.41796875, "router_z_loss_mlp": 0.56420898, "step": 722, "time_per_iteration": 3.0237135887145996 }, { "auxiliary_loss_clip": 0.0188476, "auxiliary_loss_mlp": 0.01418902, "balance_loss_clip": 1.54072654, "balance_loss_mlp": 1.36160994, "epoch": 0.04346911167894183, "flos": 27977192225280.0, "grad_norm": 1.6611790582638595, "language_loss": 0.87100774, "learning_rate": 3.998097622708792e-06, "loss": 0.90404439, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 3.43945312, "router_z_loss_mlp": 0.57299805, "step": 723, "time_per_iteration": 2.9661364555358887 }, { "auxiliary_loss_clip": 0.01876483, "auxiliary_loss_mlp": 0.01421548, "balance_loss_clip": 1.54038143, "balance_loss_mlp": 1.36175275, "epoch": 0.0435292349316098, "flos": 29254230737280.0, "grad_norm": 1.9287581479853886, "language_loss": 0.84676349, "learning_rate": 3.99808060200659e-06, "loss": 0.87974381, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.59814453, "step": 724, "time_per_iteration": 2.921574831008911 }, { "auxiliary_loss_clip": 0.01857113, "auxiliary_loss_mlp": 0.01426174, "balance_loss_clip": 1.52804804, "balance_loss_mlp": 1.36487687, "epoch": 0.04358935818427777, "flos": 20568070419840.0, "grad_norm": 2.1401418010799445, "language_loss": 0.82473868, "learning_rate": 3.998063505536971e-06, "loss": 0.8575716, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.61328125, "step": 725, "time_per_iteration": 2.8997013568878174 }, { "auxiliary_loss_clip": 0.0188996, "auxiliary_loss_mlp": 0.01409792, "balance_loss_clip": 1.5404017, "balance_loss_mlp": 1.35414529, "epoch": 0.04364948143694574, "flos": 14472979390080.0, "grad_norm": 1.9426785383783498, "language_loss": 0.89713764, "learning_rate": 3.998046333300584e-06, "loss": 0.93013513, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 3.49414062, "router_z_loss_mlp": 0.55615234, "step": 726, "time_per_iteration": 2.8861801624298096 }, { "auxiliary_loss_clip": 0.01545121, "auxiliary_loss_mlp": 0.01399475, "balance_loss_clip": 1.3435781, "balance_loss_mlp": 1.38040137, "epoch": 0.043709604689613706, "flos": 50092015265280.0, "grad_norm": 0.9066111944712127, "language_loss": 0.5606811, "learning_rate": 3.998029085298079e-06, "loss": 0.59012711, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.19042969, "step": 727, "time_per_iteration": 3.4231810569763184 }, { "auxiliary_loss_clip": 0.01878452, "auxiliary_loss_mlp": 0.01415019, "balance_loss_clip": 1.54053032, "balance_loss_mlp": 1.36266208, "epoch": 0.04376972794228168, "flos": 13999771578240.0, "grad_norm": 2.0011747958995967, "language_loss": 0.85094321, "learning_rate": 3.998011761530112e-06, "loss": 0.88387787, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 3.37695312, "router_z_loss_mlp": 0.52368164, "step": 728, "time_per_iteration": 2.8805689811706543 }, { "auxiliary_loss_clip": 0.01881251, "auxiliary_loss_mlp": 0.01409838, "balance_loss_clip": 1.5514549, "balance_loss_mlp": 1.35950804, "epoch": 0.04382985119494965, "flos": 22017944643840.0, "grad_norm": 1.9612928068288507, "language_loss": 0.77830446, "learning_rate": 3.997994361997338e-06, "loss": 0.81121528, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.50268555, "step": 729, "time_per_iteration": 2.9253108501434326 }, { "auxiliary_loss_clip": 0.0188017, "auxiliary_loss_mlp": 0.01437363, "balance_loss_clip": 1.54280567, "balance_loss_mlp": 1.37465882, "epoch": 0.043889974447617615, "flos": 24217075059840.0, "grad_norm": 4.75827974862781, "language_loss": 0.97816634, "learning_rate": 3.997976886700417e-06, "loss": 1.01134169, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 3.36914062, "router_z_loss_mlp": 0.62670898, "step": 730, "time_per_iteration": 2.910001754760742 }, { "auxiliary_loss_clip": 0.0190936, "auxiliary_loss_mlp": 0.0144303, "balance_loss_clip": 1.56059241, "balance_loss_mlp": 1.38750231, "epoch": 0.04395009770028559, "flos": 17283378061440.0, "grad_norm": 1.9971074187491038, "language_loss": 0.90748799, "learning_rate": 3.997959335640013e-06, "loss": 0.94101191, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 3.484375, "router_z_loss_mlp": 0.55493164, "step": 731, "time_per_iteration": 2.9167802333831787 }, { "auxiliary_loss_clip": 0.01892854, "auxiliary_loss_mlp": 0.01423794, "balance_loss_clip": 1.55966783, "balance_loss_mlp": 1.3691721, "epoch": 0.04401022095295355, "flos": 12317265999360.0, "grad_norm": 2.686306618743098, "language_loss": 0.93006575, "learning_rate": 3.997941708816791e-06, "loss": 0.96323228, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 3.328125, "router_z_loss_mlp": 0.54663086, "step": 732, "time_per_iteration": 2.8737337589263916 }, { "auxiliary_loss_clip": 0.01894209, "auxiliary_loss_mlp": 0.01442912, "balance_loss_clip": 1.55442023, "balance_loss_mlp": 1.38936293, "epoch": 0.044070344205621524, "flos": 20969465454720.0, "grad_norm": 2.1442995468662414, "language_loss": 0.88115162, "learning_rate": 3.997924006231419e-06, "loss": 0.91452283, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 3.3984375, "router_z_loss_mlp": 0.53588867, "step": 733, "time_per_iteration": 2.9321916103363037 }, { "auxiliary_loss_clip": 0.01892767, "auxiliary_loss_mlp": 0.01448693, "balance_loss_clip": 1.56099737, "balance_loss_mlp": 1.39774346, "epoch": 0.044130467458289496, "flos": 13853340846720.0, "grad_norm": 2.8942354956444145, "language_loss": 0.94055319, "learning_rate": 3.9979062278845685e-06, "loss": 0.97396773, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 3.31054688, "router_z_loss_mlp": 0.50976562, "step": 734, "time_per_iteration": 2.8936517238616943 }, { "auxiliary_loss_clip": 0.01874641, "auxiliary_loss_mlp": 0.01439175, "balance_loss_clip": 1.55348217, "balance_loss_mlp": 1.38870168, "epoch": 0.04419059071095746, "flos": 28666064592000.0, "grad_norm": 2.150201630083173, "language_loss": 0.81715298, "learning_rate": 3.9978883737769125e-06, "loss": 0.85029113, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.50463867, "step": 735, "time_per_iteration": 2.996674060821533 }, { "auxiliary_loss_clip": 0.01876597, "auxiliary_loss_mlp": 0.01426291, "balance_loss_clip": 1.54754567, "balance_loss_mlp": 1.37627029, "epoch": 0.04425071396362543, "flos": 28194847551360.0, "grad_norm": 2.3276130780406974, "language_loss": 0.91140306, "learning_rate": 3.9978704439091305e-06, "loss": 0.9444319, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.50048828, "step": 736, "time_per_iteration": 2.995058059692383 }, { "auxiliary_loss_clip": 0.01869817, "auxiliary_loss_mlp": 0.01423546, "balance_loss_clip": 1.55083799, "balance_loss_mlp": 1.3730973, "epoch": 0.0443108372162934, "flos": 23668661111040.0, "grad_norm": 1.7285589272321336, "language_loss": 0.87185109, "learning_rate": 3.997852438281901e-06, "loss": 0.90478474, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 3.18164062, "router_z_loss_mlp": 0.50415039, "step": 737, "time_per_iteration": 2.9087588787078857 }, { "auxiliary_loss_clip": 0.01872622, "auxiliary_loss_mlp": 0.01409727, "balance_loss_clip": 1.55355358, "balance_loss_mlp": 1.35925376, "epoch": 0.04437096046896137, "flos": 33991692986880.0, "grad_norm": 1.7775664963375757, "language_loss": 0.87357605, "learning_rate": 3.997834356895906e-06, "loss": 0.90639949, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 3.19140625, "router_z_loss_mlp": 0.50512695, "step": 738, "time_per_iteration": 3.0028109550476074 }, { "auxiliary_loss_clip": 0.01588643, "auxiliary_loss_mlp": 0.01377567, "balance_loss_clip": 1.40529346, "balance_loss_mlp": 1.35973382, "epoch": 0.04443108372162934, "flos": 67426619719680.0, "grad_norm": 0.8774947206118064, "language_loss": 0.59251225, "learning_rate": 3.9978161997518324e-06, "loss": 0.62217426, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.17871094, "step": 739, "time_per_iteration": 3.331681489944458 }, { "auxiliary_loss_clip": 0.01850388, "auxiliary_loss_mlp": 0.01406158, "balance_loss_clip": 1.53516042, "balance_loss_mlp": 1.35053515, "epoch": 0.04449120697429731, "flos": 29764747526400.0, "grad_norm": 2.146540833101568, "language_loss": 0.94922328, "learning_rate": 3.997797966850369e-06, "loss": 0.98178881, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.55664062, "step": 740, "time_per_iteration": 2.9626245498657227 }, { "auxiliary_loss_clip": 0.01834548, "auxiliary_loss_mlp": 0.01405322, "balance_loss_clip": 1.52528477, "balance_loss_mlp": 1.36042786, "epoch": 0.04455133022696528, "flos": 36515835936000.0, "grad_norm": 1.8109677680183802, "language_loss": 0.74031162, "learning_rate": 3.997779658192205e-06, "loss": 0.77271032, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.44897461, "step": 741, "time_per_iteration": 3.0658676624298096 }, { "auxiliary_loss_clip": 0.01816081, "auxiliary_loss_mlp": 0.01412371, "balance_loss_clip": 1.51785052, "balance_loss_mlp": 1.36196911, "epoch": 0.044611453479633245, "flos": 28815390990720.0, "grad_norm": 1.6545422574039155, "language_loss": 0.89935893, "learning_rate": 3.997761273778037e-06, "loss": 0.93164349, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.50390625, "step": 742, "time_per_iteration": 2.944192886352539 }, { "auxiliary_loss_clip": 0.01829265, "auxiliary_loss_mlp": 0.01406171, "balance_loss_clip": 1.52322972, "balance_loss_mlp": 1.35584092, "epoch": 0.04467157673230122, "flos": 20020516122240.0, "grad_norm": 1.6689098756707486, "language_loss": 0.86307043, "learning_rate": 3.997742813608561e-06, "loss": 0.89542484, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.50341797, "step": 743, "time_per_iteration": 2.9224765300750732 }, { "auxiliary_loss_clip": 0.01844098, "auxiliary_loss_mlp": 0.01419169, "balance_loss_clip": 1.5290345, "balance_loss_mlp": 1.36676478, "epoch": 0.04473169998496919, "flos": 18014174375040.0, "grad_norm": 2.1167540475477704, "language_loss": 0.82972831, "learning_rate": 3.997724277684479e-06, "loss": 0.86236095, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.52392578, "step": 744, "time_per_iteration": 2.8655498027801514 }, { "auxiliary_loss_clip": 0.01828467, "auxiliary_loss_mlp": 0.01428365, "balance_loss_clip": 1.51996446, "balance_loss_mlp": 1.37312317, "epoch": 0.044791823237637154, "flos": 20641557254400.0, "grad_norm": 1.8840597342492515, "language_loss": 0.87216598, "learning_rate": 3.99770566600649e-06, "loss": 0.90473431, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.55297852, "step": 745, "time_per_iteration": 2.8988864421844482 }, { "auxiliary_loss_clip": 0.01821615, "auxiliary_loss_mlp": 0.01400654, "balance_loss_clip": 1.51276243, "balance_loss_mlp": 1.35223126, "epoch": 0.04485194649030513, "flos": 31188759707520.0, "grad_norm": 1.5940569941983864, "language_loss": 0.70484185, "learning_rate": 3.997686978575302e-06, "loss": 0.73706448, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.48413086, "step": 746, "time_per_iteration": 2.9693877696990967 }, { "auxiliary_loss_clip": 0.01829242, "auxiliary_loss_mlp": 0.01408519, "balance_loss_clip": 1.51754999, "balance_loss_mlp": 1.35756886, "epoch": 0.04491206974297309, "flos": 26155133124480.0, "grad_norm": 4.50444213445697, "language_loss": 0.72110617, "learning_rate": 3.997668215391625e-06, "loss": 0.75348377, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.50952148, "step": 747, "time_per_iteration": 4.337121486663818 }, { "auxiliary_loss_clip": 0.01831279, "auxiliary_loss_mlp": 0.01406919, "balance_loss_clip": 1.51720095, "balance_loss_mlp": 1.35830593, "epoch": 0.044972192995641064, "flos": 20677373153280.0, "grad_norm": 1.7543771950286817, "language_loss": 0.69462407, "learning_rate": 3.997649376456168e-06, "loss": 0.72700608, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 3.14453125, "router_z_loss_mlp": 0.48583984, "step": 748, "time_per_iteration": 3.0355687141418457 }, { "auxiliary_loss_clip": 0.01821218, "auxiliary_loss_mlp": 0.01410234, "balance_loss_clip": 1.50347352, "balance_loss_mlp": 1.36045265, "epoch": 0.045032316248309036, "flos": 16115325569280.0, "grad_norm": 2.098925474326493, "language_loss": 0.79656464, "learning_rate": 3.997630461769647e-06, "loss": 0.82887918, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.49780273, "step": 749, "time_per_iteration": 4.484265565872192 }, { "auxiliary_loss_clip": 0.01818191, "auxiliary_loss_mlp": 0.01399623, "balance_loss_clip": 1.50725865, "balance_loss_mlp": 1.35062838, "epoch": 0.045092439500977, "flos": 17867969867520.0, "grad_norm": 2.2087650294882195, "language_loss": 0.91904932, "learning_rate": 3.997611471332778e-06, "loss": 0.95122743, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.49023438, "step": 750, "time_per_iteration": 4.368940114974976 }, { "auxiliary_loss_clip": 0.01828831, "auxiliary_loss_mlp": 0.014155, "balance_loss_clip": 1.50582111, "balance_loss_mlp": 1.36173701, "epoch": 0.04515256275364497, "flos": 24473668176000.0, "grad_norm": 1.8919848037991864, "language_loss": 0.76874387, "learning_rate": 3.9975924051462825e-06, "loss": 0.80118716, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 3.22460938, "router_z_loss_mlp": 0.53735352, "step": 751, "time_per_iteration": 2.9266202449798584 }, { "auxiliary_loss_clip": 0.01827861, "auxiliary_loss_mlp": 0.01398185, "balance_loss_clip": 1.50911379, "balance_loss_mlp": 1.34806991, "epoch": 0.04521268600631294, "flos": 20924419616640.0, "grad_norm": 2.044618179238129, "language_loss": 0.72757006, "learning_rate": 3.997573263210883e-06, "loss": 0.75983059, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.50073242, "step": 752, "time_per_iteration": 2.948711633682251 }, { "auxiliary_loss_clip": 0.01837854, "auxiliary_loss_mlp": 0.01407321, "balance_loss_clip": 1.51439106, "balance_loss_mlp": 1.35505939, "epoch": 0.04527280925898091, "flos": 13379409118080.0, "grad_norm": 2.94500083802943, "language_loss": 0.95578623, "learning_rate": 3.997554045527305e-06, "loss": 0.98823798, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.52270508, "step": 753, "time_per_iteration": 2.946030378341675 }, { "auxiliary_loss_clip": 0.01839672, "auxiliary_loss_mlp": 0.01416577, "balance_loss_clip": 1.51202047, "balance_loss_mlp": 1.36693859, "epoch": 0.04533293251164888, "flos": 23264008450560.0, "grad_norm": 1.864918050199659, "language_loss": 0.93731219, "learning_rate": 3.997534752096277e-06, "loss": 0.96987468, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 3.2734375, "router_z_loss_mlp": 0.49658203, "step": 754, "time_per_iteration": 2.9738311767578125 }, { "auxiliary_loss_clip": 0.01799635, "auxiliary_loss_mlp": 0.0141314, "balance_loss_clip": 1.48793459, "balance_loss_mlp": 1.36476433, "epoch": 0.04539305576431685, "flos": 12429600134400.0, "grad_norm": 2.1802391860680417, "language_loss": 0.82788706, "learning_rate": 3.997515382918531e-06, "loss": 0.8600148, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.48339844, "step": 755, "time_per_iteration": 2.904738426208496 }, { "auxiliary_loss_clip": 0.01861373, "auxiliary_loss_mlp": 0.01415794, "balance_loss_clip": 1.5292325, "balance_loss_mlp": 1.36059988, "epoch": 0.04545317901698482, "flos": 16079645404800.0, "grad_norm": 2.2249185344744586, "language_loss": 0.81624317, "learning_rate": 3.9974959379948015e-06, "loss": 0.84901482, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 3.32421875, "router_z_loss_mlp": 0.55175781, "step": 756, "time_per_iteration": 2.874044418334961 }, { "auxiliary_loss_clip": 0.01527385, "auxiliary_loss_mlp": 0.01363339, "balance_loss_clip": 1.34259439, "balance_loss_mlp": 1.34331167, "epoch": 0.045513302269652785, "flos": 66430244557440.0, "grad_norm": 0.8143196125274184, "language_loss": 0.62874645, "learning_rate": 3.997476417325827e-06, "loss": 0.65765369, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.20019531, "step": 757, "time_per_iteration": 3.380831003189087 }, { "auxiliary_loss_clip": 0.0182952, "auxiliary_loss_mlp": 0.01402036, "balance_loss_clip": 1.5164001, "balance_loss_mlp": 1.35635519, "epoch": 0.04557342552232076, "flos": 21481294343040.0, "grad_norm": 1.4304769051346566, "language_loss": 0.85912132, "learning_rate": 3.997456820912346e-06, "loss": 0.89143693, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.45703125, "step": 758, "time_per_iteration": 2.9174766540527344 }, { "auxiliary_loss_clip": 0.0182339, "auxiliary_loss_mlp": 0.01403253, "balance_loss_clip": 1.51454926, "balance_loss_mlp": 1.35471058, "epoch": 0.04563354877498873, "flos": 23743233820800.0, "grad_norm": 1.572253467288795, "language_loss": 0.90430075, "learning_rate": 3.997437148755101e-06, "loss": 0.93656719, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.4855957, "step": 759, "time_per_iteration": 2.9826741218566895 }, { "auxiliary_loss_clip": 0.01851553, "auxiliary_loss_mlp": 0.01411191, "balance_loss_clip": 1.52170324, "balance_loss_mlp": 1.36283958, "epoch": 0.045693672027656694, "flos": 25745865494400.0, "grad_norm": 1.9694097359266136, "language_loss": 0.76808208, "learning_rate": 3.9974174008548405e-06, "loss": 0.80070955, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 3.30273438, "router_z_loss_mlp": 0.48339844, "step": 760, "time_per_iteration": 2.9721078872680664 }, { "auxiliary_loss_clip": 0.0182119, "auxiliary_loss_mlp": 0.01414614, "balance_loss_clip": 1.50802028, "balance_loss_mlp": 1.36538029, "epoch": 0.045753795280324666, "flos": 19728469065600.0, "grad_norm": 2.02199454136294, "language_loss": 0.85850781, "learning_rate": 3.9973975772123105e-06, "loss": 0.8908658, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.49267578, "step": 761, "time_per_iteration": 2.88466215133667 }, { "auxiliary_loss_clip": 0.01823041, "auxiliary_loss_mlp": 0.01403745, "balance_loss_clip": 1.51203454, "balance_loss_mlp": 1.35312915, "epoch": 0.04581391853299264, "flos": 23265546773760.0, "grad_norm": 1.8711302250722732, "language_loss": 0.81321716, "learning_rate": 3.997377677828266e-06, "loss": 0.84548503, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.50634766, "step": 762, "time_per_iteration": 3.0334715843200684 }, { "auxiliary_loss_clip": 0.01533627, "auxiliary_loss_mlp": 0.01350375, "balance_loss_clip": 1.35450578, "balance_loss_mlp": 1.33311367, "epoch": 0.0458740417856606, "flos": 64261500664320.0, "grad_norm": 1.0230698571538615, "language_loss": 0.58958972, "learning_rate": 3.9973577027034585e-06, "loss": 0.61842972, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.17285156, "step": 763, "time_per_iteration": 3.4223034381866455 }, { "auxiliary_loss_clip": 0.0182625, "auxiliary_loss_mlp": 0.01391582, "balance_loss_clip": 1.50772262, "balance_loss_mlp": 1.34225309, "epoch": 0.045934165038328575, "flos": 20778034129920.0, "grad_norm": 2.1360309516049054, "language_loss": 0.90989161, "learning_rate": 3.9973376518386475e-06, "loss": 0.94206989, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.49365234, "step": 764, "time_per_iteration": 2.913908004760742 }, { "auxiliary_loss_clip": 0.01852021, "auxiliary_loss_mlp": 0.01406893, "balance_loss_clip": 1.52348483, "balance_loss_mlp": 1.35525167, "epoch": 0.04599428829099654, "flos": 30274947601920.0, "grad_norm": 2.4258240181928388, "language_loss": 0.90691137, "learning_rate": 3.997317525234592e-06, "loss": 0.93950057, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.51635742, "step": 765, "time_per_iteration": 3.030827760696411 }, { "auxiliary_loss_clip": 0.01860152, "auxiliary_loss_mlp": 0.01406598, "balance_loss_clip": 1.52474284, "balance_loss_mlp": 1.35595822, "epoch": 0.04605441154366451, "flos": 23049158302080.0, "grad_norm": 2.4334746267586627, "language_loss": 0.9224034, "learning_rate": 3.997297322892056e-06, "loss": 0.95507097, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 3.3515625, "router_z_loss_mlp": 0.50634766, "step": 766, "time_per_iteration": 2.925028085708618 }, { "auxiliary_loss_clip": 0.01845157, "auxiliary_loss_mlp": 0.01391694, "balance_loss_clip": 1.51981235, "balance_loss_mlp": 1.34703779, "epoch": 0.046114534796332485, "flos": 22027400807040.0, "grad_norm": 2.1255610463993126, "language_loss": 0.86258286, "learning_rate": 3.997277044811806e-06, "loss": 0.89495134, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.44677734, "step": 767, "time_per_iteration": 2.928607225418091 }, { "auxiliary_loss_clip": 0.01830262, "auxiliary_loss_mlp": 0.01390609, "balance_loss_clip": 1.51123428, "balance_loss_mlp": 1.34316409, "epoch": 0.04617465804900045, "flos": 29874547952640.0, "grad_norm": 2.6164681187059013, "language_loss": 0.90550232, "learning_rate": 3.99725669099461e-06, "loss": 0.937711, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.47485352, "step": 768, "time_per_iteration": 3.0374767780303955 }, { "auxiliary_loss_clip": 0.01825738, "auxiliary_loss_mlp": 0.01393484, "balance_loss_clip": 1.5027678, "balance_loss_mlp": 1.34560978, "epoch": 0.04623478130166842, "flos": 25641132485760.0, "grad_norm": 1.855598811037543, "language_loss": 0.77536201, "learning_rate": 3.9972362614412395e-06, "loss": 0.80755424, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 3.23046875, "router_z_loss_mlp": 0.47875977, "step": 769, "time_per_iteration": 2.930849313735962 }, { "auxiliary_loss_clip": 0.01840537, "auxiliary_loss_mlp": 0.01396492, "balance_loss_clip": 1.52008522, "balance_loss_mlp": 1.34954739, "epoch": 0.04629490455433639, "flos": 20458812931200.0, "grad_norm": 1.7108334592982801, "language_loss": 0.88691765, "learning_rate": 3.997215756152471e-06, "loss": 0.91928792, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.46923828, "step": 770, "time_per_iteration": 2.9216597080230713 }, { "auxiliary_loss_clip": 0.01859199, "auxiliary_loss_mlp": 0.01394573, "balance_loss_clip": 1.52304351, "balance_loss_mlp": 1.34536326, "epoch": 0.04635502780700436, "flos": 23159094462720.0, "grad_norm": 1.9048594332260138, "language_loss": 0.89956772, "learning_rate": 3.99719517512908e-06, "loss": 0.93210554, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 3.36328125, "router_z_loss_mlp": 0.49194336, "step": 771, "time_per_iteration": 2.9021871089935303 }, { "auxiliary_loss_clip": 0.018788, "auxiliary_loss_mlp": 0.01402846, "balance_loss_clip": 1.53030348, "balance_loss_mlp": 1.34755707, "epoch": 0.04641515105967233, "flos": 23301724631040.0, "grad_norm": 2.458990416584798, "language_loss": 0.86799014, "learning_rate": 3.997174518371848e-06, "loss": 0.90080661, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 3.48242188, "router_z_loss_mlp": 0.5534668, "step": 772, "time_per_iteration": 2.9592127799987793 }, { "auxiliary_loss_clip": 0.0184297, "auxiliary_loss_mlp": 0.0140165, "balance_loss_clip": 1.51696324, "balance_loss_mlp": 1.35384727, "epoch": 0.046475274312340296, "flos": 25125095831040.0, "grad_norm": 2.0053639612289373, "language_loss": 0.76235378, "learning_rate": 3.997153785881557e-06, "loss": 0.79480004, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.47802734, "step": 773, "time_per_iteration": 2.937987804412842 }, { "auxiliary_loss_clip": 0.01826529, "auxiliary_loss_mlp": 0.01398626, "balance_loss_clip": 1.51024282, "balance_loss_mlp": 1.35087037, "epoch": 0.04653539756500827, "flos": 25275824818560.0, "grad_norm": 1.892682913212188, "language_loss": 0.80694079, "learning_rate": 3.997132977658996e-06, "loss": 0.83919227, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.47753906, "step": 774, "time_per_iteration": 2.9249420166015625 }, { "auxiliary_loss_clip": 0.01847365, "auxiliary_loss_mlp": 0.01397992, "balance_loss_clip": 1.5220468, "balance_loss_mlp": 1.3462069, "epoch": 0.046595520817676234, "flos": 35416700553600.0, "grad_norm": 1.9830329975893493, "language_loss": 0.75557733, "learning_rate": 3.997112093704952e-06, "loss": 0.78803086, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.51806641, "step": 775, "time_per_iteration": 3.0458877086639404 }, { "auxiliary_loss_clip": 0.01845925, "auxiliary_loss_mlp": 0.01396654, "balance_loss_clip": 1.51606452, "balance_loss_mlp": 1.34501219, "epoch": 0.046655644070344206, "flos": 18121260113280.0, "grad_norm": 1.577608297709537, "language_loss": 0.79408789, "learning_rate": 3.997091134020217e-06, "loss": 0.82651371, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 3.29296875, "router_z_loss_mlp": 0.5168457, "step": 776, "time_per_iteration": 2.8755991458892822 }, { "auxiliary_loss_clip": 0.01814775, "auxiliary_loss_mlp": 0.01393668, "balance_loss_clip": 1.49961662, "balance_loss_mlp": 1.34336162, "epoch": 0.04671576732301218, "flos": 29217193228800.0, "grad_norm": 2.0017917932076426, "language_loss": 0.74597222, "learning_rate": 3.997070098605585e-06, "loss": 0.77805662, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 3.15234375, "router_z_loss_mlp": 0.50244141, "step": 777, "time_per_iteration": 2.9886343479156494 }, { "auxiliary_loss_clip": 0.01824814, "auxiliary_loss_mlp": 0.01390522, "balance_loss_clip": 1.50977933, "balance_loss_mlp": 1.33969092, "epoch": 0.04677589057568014, "flos": 30489526281600.0, "grad_norm": 1.8400168204724396, "language_loss": 0.78576303, "learning_rate": 3.997048987461856e-06, "loss": 0.81791633, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.50927734, "step": 778, "time_per_iteration": 2.9512522220611572 }, { "auxiliary_loss_clip": 0.01831366, "auxiliary_loss_mlp": 0.01393039, "balance_loss_clip": 1.51449943, "balance_loss_mlp": 1.34392512, "epoch": 0.046836013828348115, "flos": 20567210768640.0, "grad_norm": 1.7805126195342975, "language_loss": 0.81811976, "learning_rate": 3.997027800589829e-06, "loss": 0.85036385, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.49169922, "step": 779, "time_per_iteration": 2.9345955848693848 }, { "auxiliary_loss_clip": 0.01817615, "auxiliary_loss_mlp": 0.01395925, "balance_loss_clip": 1.50507855, "balance_loss_mlp": 1.34731126, "epoch": 0.04689613708101608, "flos": 25458342917760.0, "grad_norm": 1.6001421332730201, "language_loss": 0.78782463, "learning_rate": 3.997006537990308e-06, "loss": 0.81996012, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 3.12304688, "router_z_loss_mlp": 0.4855957, "step": 780, "time_per_iteration": 2.9785804748535156 }, { "auxiliary_loss_clip": 0.01809633, "auxiliary_loss_mlp": 0.01391016, "balance_loss_clip": 1.50308776, "balance_loss_mlp": 1.34574008, "epoch": 0.04695626033368405, "flos": 23011351632000.0, "grad_norm": 1.8454841549771226, "language_loss": 0.78052449, "learning_rate": 3.996985199664099e-06, "loss": 0.81253099, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.45288086, "step": 781, "time_per_iteration": 4.298590898513794 }, { "auxiliary_loss_clip": 0.01864387, "auxiliary_loss_mlp": 0.01403717, "balance_loss_clip": 1.52601779, "balance_loss_mlp": 1.35219526, "epoch": 0.047016383586352024, "flos": 29145516186240.0, "grad_norm": 3.0953733130967858, "language_loss": 0.78691792, "learning_rate": 3.99696378561201e-06, "loss": 0.81959897, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.515625, "step": 782, "time_per_iteration": 2.9480690956115723 }, { "auxiliary_loss_clip": 0.0184835, "auxiliary_loss_mlp": 0.0140193, "balance_loss_clip": 1.5195384, "balance_loss_mlp": 1.35467517, "epoch": 0.04707650683901999, "flos": 14984672544000.0, "grad_norm": 2.630421796350862, "language_loss": 0.83736241, "learning_rate": 3.996942295834855e-06, "loss": 0.86986518, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 3.28710938, "router_z_loss_mlp": 0.47216797, "step": 783, "time_per_iteration": 2.8645083904266357 }, { "auxiliary_loss_clip": 0.01821279, "auxiliary_loss_mlp": 0.01397082, "balance_loss_clip": 1.51320028, "balance_loss_mlp": 1.35278392, "epoch": 0.04713663009168796, "flos": 21660645306240.0, "grad_norm": 1.8812322655608364, "language_loss": 0.83666897, "learning_rate": 3.996920730333448e-06, "loss": 0.86885256, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.44311523, "step": 784, "time_per_iteration": 4.293826580047607 }, { "auxiliary_loss_clip": 0.01840727, "auxiliary_loss_mlp": 0.01407437, "balance_loss_clip": 1.51957297, "balance_loss_mlp": 1.3555814, "epoch": 0.04719675334435593, "flos": 21335542283520.0, "grad_norm": 2.038855176247955, "language_loss": 0.82196027, "learning_rate": 3.996899089108607e-06, "loss": 0.85444188, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.51855469, "step": 785, "time_per_iteration": 5.750572919845581 }, { "auxiliary_loss_clip": 0.01862031, "auxiliary_loss_mlp": 0.0140669, "balance_loss_clip": 1.53198457, "balance_loss_mlp": 1.3582201, "epoch": 0.0472568765970239, "flos": 17940416071680.0, "grad_norm": 2.371061068008672, "language_loss": 0.92075324, "learning_rate": 3.996877372161152e-06, "loss": 0.95344049, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 3.29882812, "router_z_loss_mlp": 0.48461914, "step": 786, "time_per_iteration": 2.8435044288635254 }, { "auxiliary_loss_clip": 0.01864519, "auxiliary_loss_mlp": 0.0140156, "balance_loss_clip": 1.52150631, "balance_loss_mlp": 1.35070515, "epoch": 0.04731699984969187, "flos": 18086394355200.0, "grad_norm": 2.3130492437649974, "language_loss": 0.8175329, "learning_rate": 3.9968555794919065e-06, "loss": 0.85019368, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 3.4296875, "router_z_loss_mlp": 0.50854492, "step": 787, "time_per_iteration": 2.906297445297241 }, { "auxiliary_loss_clip": 0.01858608, "auxiliary_loss_mlp": 0.01400122, "balance_loss_clip": 1.53087068, "balance_loss_mlp": 1.34805119, "epoch": 0.047377123102359836, "flos": 23195408054400.0, "grad_norm": 2.1134192986518854, "language_loss": 0.83571285, "learning_rate": 3.996833711101698e-06, "loss": 0.8683002, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.52099609, "step": 788, "time_per_iteration": 2.8946852684020996 }, { "auxiliary_loss_clip": 0.01841545, "auxiliary_loss_mlp": 0.01396683, "balance_loss_clip": 1.52317727, "balance_loss_mlp": 1.34568512, "epoch": 0.04743724635502781, "flos": 22758151875840.0, "grad_norm": 1.7962856076281197, "language_loss": 0.86332119, "learning_rate": 3.996811766991355e-06, "loss": 0.89570343, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.51025391, "step": 789, "time_per_iteration": 2.9510281085968018 }, { "auxiliary_loss_clip": 0.01851716, "auxiliary_loss_mlp": 0.01405319, "balance_loss_clip": 1.526088, "balance_loss_mlp": 1.35043561, "epoch": 0.04749736960769577, "flos": 17247697896960.0, "grad_norm": 2.234319510346056, "language_loss": 0.84486669, "learning_rate": 3.996789747161709e-06, "loss": 0.87743706, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.54931641, "step": 790, "time_per_iteration": 3.005979299545288 }, { "auxiliary_loss_clip": 0.01838206, "auxiliary_loss_mlp": 0.01404923, "balance_loss_clip": 1.51464772, "balance_loss_mlp": 1.35344887, "epoch": 0.047557492860363745, "flos": 40494060875520.0, "grad_norm": 1.8067560405250707, "language_loss": 0.90425384, "learning_rate": 3.996767651613597e-06, "loss": 0.9366852, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 3.234375, "router_z_loss_mlp": 0.515625, "step": 791, "time_per_iteration": 3.03739595413208 }, { "auxiliary_loss_clip": 0.01845247, "auxiliary_loss_mlp": 0.0139394, "balance_loss_clip": 1.5269959, "balance_loss_mlp": 1.3427515, "epoch": 0.04761761611303172, "flos": 18707344997760.0, "grad_norm": 1.9780024267043066, "language_loss": 0.91547269, "learning_rate": 3.996745480347854e-06, "loss": 0.94786447, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.51123047, "step": 792, "time_per_iteration": 2.91386079788208 }, { "auxiliary_loss_clip": 0.01851402, "auxiliary_loss_mlp": 0.01397307, "balance_loss_clip": 1.52458668, "balance_loss_mlp": 1.3437109, "epoch": 0.04767773936569968, "flos": 20931477805440.0, "grad_norm": 1.746686072804086, "language_loss": 0.75069177, "learning_rate": 3.996723233365324e-06, "loss": 0.78317893, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.53588867, "step": 793, "time_per_iteration": 2.9040868282318115 }, { "auxiliary_loss_clip": 0.01870766, "auxiliary_loss_mlp": 0.01389, "balance_loss_clip": 1.53725767, "balance_loss_mlp": 1.33907485, "epoch": 0.047737862618367655, "flos": 23743233820800.0, "grad_norm": 1.8813575652980163, "language_loss": 0.89648056, "learning_rate": 3.996700910666847e-06, "loss": 0.92907822, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.4987793, "step": 794, "time_per_iteration": 2.917881488800049 }, { "auxiliary_loss_clip": 0.01864417, "auxiliary_loss_mlp": 0.01406163, "balance_loss_clip": 1.53375578, "balance_loss_mlp": 1.35018277, "epoch": 0.04779798587103562, "flos": 23706196312320.0, "grad_norm": 2.4200086282525017, "language_loss": 0.73541319, "learning_rate": 3.996678512253272e-06, "loss": 0.76811898, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 3.3046875, "router_z_loss_mlp": 0.55932617, "step": 795, "time_per_iteration": 2.9173803329467773 }, { "auxiliary_loss_clip": 0.01827111, "auxiliary_loss_mlp": 0.0140686, "balance_loss_clip": 1.51393044, "balance_loss_mlp": 1.33936346, "epoch": 0.04785810912370359, "flos": 23193598262400.0, "grad_norm": 2.2317957737512293, "language_loss": 0.82829022, "learning_rate": 3.996656038125449e-06, "loss": 0.86062998, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.67504883, "step": 796, "time_per_iteration": 2.9835941791534424 }, { "auxiliary_loss_clip": 0.01853467, "auxiliary_loss_mlp": 0.01400692, "balance_loss_clip": 1.52777743, "balance_loss_mlp": 1.34390104, "epoch": 0.047918232376371564, "flos": 18049628315520.0, "grad_norm": 2.072032321952671, "language_loss": 0.84482729, "learning_rate": 3.996633488284228e-06, "loss": 0.87736887, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 3.25585938, "router_z_loss_mlp": 0.56787109, "step": 797, "time_per_iteration": 2.8903167247772217 }, { "auxiliary_loss_clip": 0.01616355, "auxiliary_loss_mlp": 0.01391907, "balance_loss_clip": 1.41686726, "balance_loss_mlp": 1.36730194, "epoch": 0.04797835562903953, "flos": 62472271305600.0, "grad_norm": 0.9351957346055234, "language_loss": 0.64653492, "learning_rate": 3.996610862730465e-06, "loss": 0.6766175, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.24609375, "step": 798, "time_per_iteration": 3.2863686084747314 }, { "auxiliary_loss_clip": 0.01867027, "auxiliary_loss_mlp": 0.01392628, "balance_loss_clip": 1.52881086, "balance_loss_mlp": 1.33745825, "epoch": 0.0480384788817075, "flos": 21517336465920.0, "grad_norm": 1.9815362972049368, "language_loss": 0.93685377, "learning_rate": 3.996588161465018e-06, "loss": 0.96945035, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 3.38085938, "router_z_loss_mlp": 0.55224609, "step": 799, "time_per_iteration": 2.9071173667907715 }, { "auxiliary_loss_clip": 0.01850862, "auxiliary_loss_mlp": 0.01389532, "balance_loss_clip": 1.52861166, "balance_loss_mlp": 1.34079969, "epoch": 0.048098602134375466, "flos": 21736937318400.0, "grad_norm": 2.1364826074748073, "language_loss": 0.88807458, "learning_rate": 3.996565384488748e-06, "loss": 0.92047846, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.48779297, "step": 800, "time_per_iteration": 2.979412078857422 }, { "auxiliary_loss_clip": 0.01866784, "auxiliary_loss_mlp": 0.01395592, "balance_loss_clip": 1.53351831, "balance_loss_mlp": 1.34561944, "epoch": 0.04815872538704344, "flos": 22941710605440.0, "grad_norm": 2.061690072831917, "language_loss": 0.86406684, "learning_rate": 3.996542531802518e-06, "loss": 0.89669061, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 3.3359375, "router_z_loss_mlp": 0.50048828, "step": 801, "time_per_iteration": 2.905458927154541 }, { "auxiliary_loss_clip": 0.01863367, "auxiliary_loss_mlp": 0.01404691, "balance_loss_clip": 1.52951956, "balance_loss_mlp": 1.35092807, "epoch": 0.04821884863971141, "flos": 43189048765440.0, "grad_norm": 1.7019540161384774, "language_loss": 0.82406479, "learning_rate": 3.996519603407196e-06, "loss": 0.85674536, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 3.33984375, "router_z_loss_mlp": 0.53759766, "step": 802, "time_per_iteration": 3.1033570766448975 }, { "auxiliary_loss_clip": 0.01823356, "auxiliary_loss_mlp": 0.01400601, "balance_loss_clip": 1.5065794, "balance_loss_mlp": 1.35375166, "epoch": 0.048278971892379376, "flos": 18628021584000.0, "grad_norm": 1.739054873352336, "language_loss": 0.8805483, "learning_rate": 3.996496599303649e-06, "loss": 0.91278785, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.46875, "step": 803, "time_per_iteration": 2.910318613052368 }, { "auxiliary_loss_clip": 0.01859458, "auxiliary_loss_mlp": 0.01391074, "balance_loss_clip": 1.53174305, "balance_loss_mlp": 1.34138751, "epoch": 0.04833909514504735, "flos": 20239709771520.0, "grad_norm": 2.2358467299767324, "language_loss": 0.88474125, "learning_rate": 3.996473519492753e-06, "loss": 0.91724658, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 3.27929688, "router_z_loss_mlp": 0.49682617, "step": 804, "time_per_iteration": 2.914029598236084 }, { "auxiliary_loss_clip": 0.01839535, "auxiliary_loss_mlp": 0.01416621, "balance_loss_clip": 1.51489258, "balance_loss_mlp": 1.36471725, "epoch": 0.04839921839771532, "flos": 24656050540800.0, "grad_norm": 1.9013931102613904, "language_loss": 0.88015497, "learning_rate": 3.99645036397538e-06, "loss": 0.91271645, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 3.24414062, "router_z_loss_mlp": 0.51904297, "step": 805, "time_per_iteration": 3.0111243724823 }, { "auxiliary_loss_clip": 0.01839249, "auxiliary_loss_mlp": 0.0140756, "balance_loss_clip": 1.52127957, "balance_loss_mlp": 1.35575116, "epoch": 0.048459341650383285, "flos": 24838342416000.0, "grad_norm": 2.1353189903897465, "language_loss": 0.70154035, "learning_rate": 3.9964271327524085e-06, "loss": 0.73400843, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.51855469, "step": 806, "time_per_iteration": 2.932370901107788 }, { "auxiliary_loss_clip": 0.01817468, "auxiliary_loss_mlp": 0.01407163, "balance_loss_clip": 1.50604773, "balance_loss_mlp": 1.35337567, "epoch": 0.04851946490305126, "flos": 22173198111360.0, "grad_norm": 2.0724064528612103, "language_loss": 0.78897703, "learning_rate": 3.9964038258247214e-06, "loss": 0.82122338, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.53759766, "step": 807, "time_per_iteration": 2.962855339050293 }, { "auxiliary_loss_clip": 0.0182054, "auxiliary_loss_mlp": 0.01398479, "balance_loss_clip": 1.50889182, "balance_loss_mlp": 1.35072351, "epoch": 0.04857958815571922, "flos": 19801503452160.0, "grad_norm": 2.1480360511518506, "language_loss": 0.89356852, "learning_rate": 3.9963804431932005e-06, "loss": 0.92575872, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.4777832, "step": 808, "time_per_iteration": 2.9136788845062256 }, { "auxiliary_loss_clip": 0.01850997, "auxiliary_loss_mlp": 0.01401411, "balance_loss_clip": 1.52032948, "balance_loss_mlp": 1.34979367, "epoch": 0.048639711408387194, "flos": 18707299752960.0, "grad_norm": 1.6030490585742618, "language_loss": 0.91894817, "learning_rate": 3.996356984858732e-06, "loss": 0.95147228, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 3.30859375, "router_z_loss_mlp": 0.51635742, "step": 809, "time_per_iteration": 2.9515011310577393 }, { "auxiliary_loss_clip": 0.01837949, "auxiliary_loss_mlp": 0.01405112, "balance_loss_clip": 1.5183723, "balance_loss_mlp": 1.35707045, "epoch": 0.048699834661055166, "flos": 24874294049280.0, "grad_norm": 1.785782932243432, "language_loss": 0.87767208, "learning_rate": 3.996333450822208e-06, "loss": 0.91010273, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.48022461, "step": 810, "time_per_iteration": 3.045872688293457 }, { "auxiliary_loss_clip": 0.01847411, "auxiliary_loss_mlp": 0.01397066, "balance_loss_clip": 1.52350378, "balance_loss_mlp": 1.34683156, "epoch": 0.04875995791372313, "flos": 20713686744960.0, "grad_norm": 1.8408782498077456, "language_loss": 0.82856613, "learning_rate": 3.99630984108452e-06, "loss": 0.86101091, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.50219727, "step": 811, "time_per_iteration": 2.903075933456421 }, { "auxiliary_loss_clip": 0.01818789, "auxiliary_loss_mlp": 0.01390545, "balance_loss_clip": 1.50683105, "balance_loss_mlp": 1.34019077, "epoch": 0.048820081166391104, "flos": 18597544571520.0, "grad_norm": 1.6497366446825628, "language_loss": 0.76484424, "learning_rate": 3.9962861556465615e-06, "loss": 0.79693758, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.50292969, "step": 812, "time_per_iteration": 3.0392627716064453 }, { "auxiliary_loss_clip": 0.01798738, "auxiliary_loss_mlp": 0.01396716, "balance_loss_clip": 1.5012238, "balance_loss_mlp": 1.3534193, "epoch": 0.04888020441905907, "flos": 22716635132160.0, "grad_norm": 1.7809746580975487, "language_loss": 0.92108989, "learning_rate": 3.996262394509233e-06, "loss": 0.95304447, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 2.97460938, "router_z_loss_mlp": 0.43310547, "step": 813, "time_per_iteration": 2.8799514770507812 }, { "auxiliary_loss_clip": 0.01808244, "auxiliary_loss_mlp": 0.01386942, "balance_loss_clip": 1.49865031, "balance_loss_mlp": 1.34359753, "epoch": 0.04894032767172704, "flos": 22794736936320.0, "grad_norm": 2.336550220074976, "language_loss": 0.77508688, "learning_rate": 3.9962385576734335e-06, "loss": 0.80703872, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.43334961, "step": 814, "time_per_iteration": 2.940197706222534 }, { "auxiliary_loss_clip": 0.01799965, "auxiliary_loss_mlp": 0.01394469, "balance_loss_clip": 1.49066353, "balance_loss_mlp": 1.34566522, "epoch": 0.04900045092439501, "flos": 25526038417920.0, "grad_norm": 3.910530535185917, "language_loss": 0.86133736, "learning_rate": 3.9962146451400675e-06, "loss": 0.8932817, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.48852539, "step": 815, "time_per_iteration": 2.9385859966278076 }, { "auxiliary_loss_clip": 0.01822635, "auxiliary_loss_mlp": 0.01388171, "balance_loss_clip": 1.50163722, "balance_loss_mlp": 1.33993936, "epoch": 0.04906057417706298, "flos": 25969221665280.0, "grad_norm": 2.0774461332568364, "language_loss": 0.94099939, "learning_rate": 3.996190656910043e-06, "loss": 0.97310746, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 3.20703125, "router_z_loss_mlp": 0.48217773, "step": 816, "time_per_iteration": 4.376782417297363 }, { "auxiliary_loss_clip": 0.01814921, "auxiliary_loss_mlp": 0.01388295, "balance_loss_clip": 1.49504447, "balance_loss_mlp": 1.34132648, "epoch": 0.04912069742973095, "flos": 18633858163200.0, "grad_norm": 2.1324283968048285, "language_loss": 0.82597482, "learning_rate": 3.996166592984268e-06, "loss": 0.85800701, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 3.203125, "router_z_loss_mlp": 0.46948242, "step": 817, "time_per_iteration": 2.895831823348999 }, { "auxiliary_loss_clip": 0.01800808, "auxiliary_loss_mlp": 0.01402223, "balance_loss_clip": 1.49211407, "balance_loss_mlp": 1.352036, "epoch": 0.049180820682398915, "flos": 23711263729920.0, "grad_norm": 1.5462755635436127, "language_loss": 0.86202067, "learning_rate": 3.996142453363656e-06, "loss": 0.89405096, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.50219727, "step": 818, "time_per_iteration": 2.9735167026519775 }, { "auxiliary_loss_clip": 0.01826964, "auxiliary_loss_mlp": 0.01393234, "balance_loss_clip": 1.49724865, "balance_loss_mlp": 1.34652781, "epoch": 0.04924094393506689, "flos": 22430922347520.0, "grad_norm": 2.7282464017841166, "language_loss": 0.79514813, "learning_rate": 3.996118238049124e-06, "loss": 0.82735014, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 3.296875, "router_z_loss_mlp": 0.46679688, "step": 819, "time_per_iteration": 4.409702301025391 }, { "auxiliary_loss_clip": 0.0180604, "auxiliary_loss_mlp": 0.01396224, "balance_loss_clip": 1.48923528, "balance_loss_mlp": 1.3488977, "epoch": 0.04930106718773486, "flos": 15745674401280.0, "grad_norm": 2.1236838463034413, "language_loss": 0.86490518, "learning_rate": 3.996093947041586e-06, "loss": 0.89692783, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 3.1640625, "router_z_loss_mlp": 0.47363281, "step": 820, "time_per_iteration": 4.201559543609619 }, { "auxiliary_loss_clip": 0.01781165, "auxiliary_loss_mlp": 0.01380241, "balance_loss_clip": 1.46974015, "balance_loss_mlp": 1.33441651, "epoch": 0.049361190440402825, "flos": 26261947393920.0, "grad_norm": 1.6835459718250223, "language_loss": 0.92528224, "learning_rate": 3.996069580341966e-06, "loss": 0.95689625, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.45825195, "step": 821, "time_per_iteration": 2.9575586318969727 }, { "auxiliary_loss_clip": 0.01794888, "auxiliary_loss_mlp": 0.01399648, "balance_loss_clip": 1.48152208, "balance_loss_mlp": 1.34855485, "epoch": 0.0494213136930708, "flos": 21262462652160.0, "grad_norm": 1.7941132314032007, "language_loss": 0.91358036, "learning_rate": 3.996045137951188e-06, "loss": 0.94552571, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 3.1328125, "router_z_loss_mlp": 0.51074219, "step": 822, "time_per_iteration": 2.920083999633789 }, { "auxiliary_loss_clip": 0.01802128, "auxiliary_loss_mlp": 0.01390564, "balance_loss_clip": 1.48946881, "balance_loss_mlp": 1.34135473, "epoch": 0.04948143694573876, "flos": 27977282714880.0, "grad_norm": 1.7603327171710064, "language_loss": 0.69669139, "learning_rate": 3.996020619870178e-06, "loss": 0.72861838, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.49291992, "step": 823, "time_per_iteration": 2.91367506980896 }, { "auxiliary_loss_clip": 0.01516922, "auxiliary_loss_mlp": 0.01401135, "balance_loss_clip": 1.33240712, "balance_loss_mlp": 1.38740194, "epoch": 0.049541560198406734, "flos": 66206553690240.0, "grad_norm": 1.3610024248720263, "language_loss": 0.62392616, "learning_rate": 3.995996026099866e-06, "loss": 0.65310669, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.13769531, "step": 824, "time_per_iteration": 3.3961522579193115 }, { "auxiliary_loss_clip": 0.01825708, "auxiliary_loss_mlp": 0.01394966, "balance_loss_clip": 1.49865556, "balance_loss_mlp": 1.3435396, "epoch": 0.049601683451074706, "flos": 22902818060160.0, "grad_norm": 1.9881053459045397, "language_loss": 0.92805862, "learning_rate": 3.995971356641185e-06, "loss": 0.96026534, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 3.27539062, "router_z_loss_mlp": 0.51489258, "step": 825, "time_per_iteration": 2.879167318344116 }, { "auxiliary_loss_clip": 0.01786378, "auxiliary_loss_mlp": 0.01380799, "balance_loss_clip": 1.4775238, "balance_loss_mlp": 1.33292496, "epoch": 0.04966180670374267, "flos": 21443487672960.0, "grad_norm": 4.2119845320806, "language_loss": 0.69332087, "learning_rate": 3.9959466114950695e-06, "loss": 0.72499263, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.47875977, "step": 826, "time_per_iteration": 2.9242491722106934 }, { "auxiliary_loss_clip": 0.01788047, "auxiliary_loss_mlp": 0.01381852, "balance_loss_clip": 1.47656679, "balance_loss_mlp": 1.33695745, "epoch": 0.04972192995641064, "flos": 23116446599040.0, "grad_norm": 1.871044601870244, "language_loss": 0.80878162, "learning_rate": 3.995921790662459e-06, "loss": 0.84048057, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 3.11132812, "router_z_loss_mlp": 0.44873047, "step": 827, "time_per_iteration": 2.9052062034606934 }, { "auxiliary_loss_clip": 0.01799373, "auxiliary_loss_mlp": 0.01387627, "balance_loss_clip": 1.48233485, "balance_loss_mlp": 1.33734488, "epoch": 0.04978205320907861, "flos": 40420890754560.0, "grad_norm": 1.7401849402315255, "language_loss": 0.80930394, "learning_rate": 3.995896894144294e-06, "loss": 0.84117401, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.50268555, "step": 828, "time_per_iteration": 3.0711886882781982 }, { "auxiliary_loss_clip": 0.01774237, "auxiliary_loss_mlp": 0.01384949, "balance_loss_clip": 1.4699049, "balance_loss_mlp": 1.33800399, "epoch": 0.04984217646174658, "flos": 25239465982080.0, "grad_norm": 1.6343608822225304, "language_loss": 0.86034715, "learning_rate": 3.995871921941519e-06, "loss": 0.8919391, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.46899414, "step": 829, "time_per_iteration": 2.9460737705230713 }, { "auxiliary_loss_clip": 0.01802786, "auxiliary_loss_mlp": 0.01398795, "balance_loss_clip": 1.48619795, "balance_loss_mlp": 1.34488869, "epoch": 0.04990229971441455, "flos": 15967763717760.0, "grad_norm": 1.851268271977176, "language_loss": 0.77943528, "learning_rate": 3.99584687405508e-06, "loss": 0.81145108, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 3.16210938, "router_z_loss_mlp": 0.53881836, "step": 830, "time_per_iteration": 2.8687191009521484 }, { "auxiliary_loss_clip": 0.01787295, "auxiliary_loss_mlp": 0.01393857, "balance_loss_clip": 1.47876096, "balance_loss_mlp": 1.34262109, "epoch": 0.04996242296708252, "flos": 18414121576320.0, "grad_norm": 1.8460346272955606, "language_loss": 0.80827886, "learning_rate": 3.995821750485929e-06, "loss": 0.84009039, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.51220703, "step": 831, "time_per_iteration": 2.877595901489258 }, { "auxiliary_loss_clip": 0.01805221, "auxiliary_loss_mlp": 0.01409239, "balance_loss_clip": 1.48660588, "balance_loss_mlp": 1.35959983, "epoch": 0.05002254621975049, "flos": 17866974481920.0, "grad_norm": 2.8871010573190796, "language_loss": 0.94391936, "learning_rate": 3.995796551235016e-06, "loss": 0.97606391, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.49682617, "step": 832, "time_per_iteration": 2.890181303024292 }, { "auxiliary_loss_clip": 0.01788666, "auxiliary_loss_mlp": 0.01395125, "balance_loss_clip": 1.48139071, "balance_loss_mlp": 1.3493011, "epoch": 0.050082669472418455, "flos": 45676425674880.0, "grad_norm": 2.051987689918829, "language_loss": 0.84519434, "learning_rate": 3.9957712763032974e-06, "loss": 0.87703222, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.45874023, "step": 833, "time_per_iteration": 3.150804281234741 }, { "auxiliary_loss_clip": 0.01816112, "auxiliary_loss_mlp": 0.01402476, "balance_loss_clip": 1.50085545, "balance_loss_mlp": 1.34718704, "epoch": 0.05014279272508643, "flos": 37976297443200.0, "grad_norm": 2.065844511031575, "language_loss": 0.84388304, "learning_rate": 3.995745925691733e-06, "loss": 0.87606889, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 3.14648438, "router_z_loss_mlp": 0.55273438, "step": 834, "time_per_iteration": 3.060631275177002 }, { "auxiliary_loss_clip": 0.01836929, "auxiliary_loss_mlp": 0.01389722, "balance_loss_clip": 1.51282406, "balance_loss_mlp": 1.34432733, "epoch": 0.0502029159777544, "flos": 21006005270400.0, "grad_norm": 15.201290321965066, "language_loss": 0.94532466, "learning_rate": 3.995720499401282e-06, "loss": 0.97759116, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 3.23828125, "router_z_loss_mlp": 0.45361328, "step": 835, "time_per_iteration": 2.848585605621338 }, { "auxiliary_loss_clip": 0.0183201, "auxiliary_loss_mlp": 0.01413752, "balance_loss_clip": 1.51348972, "balance_loss_mlp": 1.36449456, "epoch": 0.050263039230422364, "flos": 15896539123200.0, "grad_norm": 2.4706956653806955, "language_loss": 0.80907446, "learning_rate": 3.995694997432911e-06, "loss": 0.84153211, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.4921875, "step": 836, "time_per_iteration": 2.8848702907562256 }, { "auxiliary_loss_clip": 0.018032, "auxiliary_loss_mlp": 0.01392972, "balance_loss_clip": 1.50184894, "balance_loss_mlp": 1.34605145, "epoch": 0.050323162483090336, "flos": 23743324310400.0, "grad_norm": 2.1252050587452906, "language_loss": 0.85682911, "learning_rate": 3.9956694197875855e-06, "loss": 0.88879079, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.46948242, "step": 837, "time_per_iteration": 2.866690158843994 }, { "auxiliary_loss_clip": 0.01819871, "auxiliary_loss_mlp": 0.0138325, "balance_loss_clip": 1.50013995, "balance_loss_mlp": 1.33888054, "epoch": 0.0503832857357583, "flos": 20275797139200.0, "grad_norm": 2.4298288551517895, "language_loss": 0.75213742, "learning_rate": 3.995643766466275e-06, "loss": 0.78416866, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.44433594, "step": 838, "time_per_iteration": 2.90195369720459 }, { "auxiliary_loss_clip": 0.01816102, "auxiliary_loss_mlp": 0.01389304, "balance_loss_clip": 1.49762249, "balance_loss_mlp": 1.33852053, "epoch": 0.05044340898842627, "flos": 17793894850560.0, "grad_norm": 2.267826514107252, "language_loss": 0.85492152, "learning_rate": 3.995618037469953e-06, "loss": 0.88697553, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.5078125, "step": 839, "time_per_iteration": 2.8225862979888916 }, { "auxiliary_loss_clip": 0.01823601, "auxiliary_loss_mlp": 0.01387291, "balance_loss_clip": 1.51064348, "balance_loss_mlp": 1.34013176, "epoch": 0.050503532241094246, "flos": 22976893077120.0, "grad_norm": 1.8300055540046678, "language_loss": 0.87262237, "learning_rate": 3.995592232799595e-06, "loss": 0.90473127, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.47192383, "step": 840, "time_per_iteration": 2.916903018951416 }, { "auxiliary_loss_clip": 0.0182705, "auxiliary_loss_mlp": 0.01383786, "balance_loss_clip": 1.51147962, "balance_loss_mlp": 1.33567333, "epoch": 0.05056365549376221, "flos": 22785461752320.0, "grad_norm": 2.010340981670334, "language_loss": 0.96352673, "learning_rate": 3.99556635245618e-06, "loss": 0.99563515, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.48120117, "step": 841, "time_per_iteration": 3.005314826965332 }, { "auxiliary_loss_clip": 0.01807994, "auxiliary_loss_mlp": 0.01391783, "balance_loss_clip": 1.497877, "balance_loss_mlp": 1.33863902, "epoch": 0.05062377874643018, "flos": 30928501762560.0, "grad_norm": 1.9655716273118453, "language_loss": 0.79264289, "learning_rate": 3.995540396440688e-06, "loss": 0.82464063, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 3.09960938, "router_z_loss_mlp": 0.53100586, "step": 842, "time_per_iteration": 3.043440341949463 }, { "auxiliary_loss_clip": 0.01855147, "auxiliary_loss_mlp": 0.01380734, "balance_loss_clip": 1.526613, "balance_loss_mlp": 1.33295536, "epoch": 0.05068390199909815, "flos": 19656882512640.0, "grad_norm": 3.20927757314938, "language_loss": 0.79720527, "learning_rate": 3.995514364754105e-06, "loss": 0.82956409, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 0.47827148, "step": 843, "time_per_iteration": 3.000242233276367 }, { "auxiliary_loss_clip": 0.01838416, "auxiliary_loss_mlp": 0.01382012, "balance_loss_clip": 1.51292562, "balance_loss_mlp": 1.33113313, "epoch": 0.05074402525176612, "flos": 37976342688000.0, "grad_norm": 2.0326589152989456, "language_loss": 0.85111976, "learning_rate": 3.995488257397417e-06, "loss": 0.88332403, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.50854492, "step": 844, "time_per_iteration": 3.0275068283081055 }, { "auxiliary_loss_clip": 0.01826131, "auxiliary_loss_mlp": 0.0137729, "balance_loss_clip": 1.50440192, "balance_loss_mlp": 1.33349252, "epoch": 0.05080414850443409, "flos": 22064800273920.0, "grad_norm": 2.6589327775192992, "language_loss": 0.77616584, "learning_rate": 3.995462074371614e-06, "loss": 0.80820012, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 3.22070312, "router_z_loss_mlp": 0.43798828, "step": 845, "time_per_iteration": 2.9278721809387207 }, { "auxiliary_loss_clip": 0.01816248, "auxiliary_loss_mlp": 0.01372795, "balance_loss_clip": 1.50177646, "balance_loss_mlp": 1.3196516, "epoch": 0.05086427175710206, "flos": 20234325640320.0, "grad_norm": 1.8893195906928064, "language_loss": 0.89712608, "learning_rate": 3.99543581567769e-06, "loss": 0.92901647, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.53100586, "step": 846, "time_per_iteration": 2.9648349285125732 }, { "auxiliary_loss_clip": 0.01813592, "auxiliary_loss_mlp": 0.01369228, "balance_loss_clip": 1.50278819, "balance_loss_mlp": 1.32481062, "epoch": 0.05092439500977003, "flos": 15167733580800.0, "grad_norm": 2.524893424300868, "language_loss": 0.8969394, "learning_rate": 3.9954094813166394e-06, "loss": 0.92876762, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.4440918, "step": 847, "time_per_iteration": 2.8614354133605957 }, { "auxiliary_loss_clip": 0.01806018, "auxiliary_loss_mlp": 0.01367872, "balance_loss_clip": 1.49729681, "balance_loss_mlp": 1.32123733, "epoch": 0.050984518262437994, "flos": 22065478945920.0, "grad_norm": 4.636223784752397, "language_loss": 0.84552395, "learning_rate": 3.995383071289462e-06, "loss": 0.87726289, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.46655273, "step": 848, "time_per_iteration": 3.0094499588012695 }, { "auxiliary_loss_clip": 0.01799842, "auxiliary_loss_mlp": 0.01377914, "balance_loss_clip": 1.49196172, "balance_loss_mlp": 1.33433104, "epoch": 0.05104464151510597, "flos": 30237095687040.0, "grad_norm": 1.7011880709186042, "language_loss": 0.89660692, "learning_rate": 3.995356585597158e-06, "loss": 0.92838448, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 3.078125, "router_z_loss_mlp": 0.43579102, "step": 849, "time_per_iteration": 2.95316219329834 }, { "auxiliary_loss_clip": 0.01813582, "auxiliary_loss_mlp": 0.013616, "balance_loss_clip": 1.50708103, "balance_loss_mlp": 1.31689668, "epoch": 0.05110476476777394, "flos": 18342082575360.0, "grad_norm": 1.790855200726768, "language_loss": 0.86346138, "learning_rate": 3.995330024240732e-06, "loss": 0.89521313, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.44702148, "step": 850, "time_per_iteration": 2.878868818283081 }, { "auxiliary_loss_clip": 0.01817026, "auxiliary_loss_mlp": 0.01378849, "balance_loss_clip": 1.50307941, "balance_loss_mlp": 1.32842302, "epoch": 0.051164888020441904, "flos": 38013018238080.0, "grad_norm": 2.1813917151089823, "language_loss": 0.67864013, "learning_rate": 3.995303387221192e-06, "loss": 0.71059883, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.50439453, "step": 851, "time_per_iteration": 4.476234436035156 }, { "auxiliary_loss_clip": 0.01822883, "auxiliary_loss_mlp": 0.01377187, "balance_loss_clip": 1.50723672, "balance_loss_mlp": 1.33050513, "epoch": 0.051225011273109876, "flos": 23048796343680.0, "grad_norm": 2.095090364139327, "language_loss": 0.85511827, "learning_rate": 3.995276674539547e-06, "loss": 0.88711894, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.46679688, "step": 852, "time_per_iteration": 2.8926923274993896 }, { "auxiliary_loss_clip": 0.01823062, "auxiliary_loss_mlp": 0.01375601, "balance_loss_clip": 1.50540531, "balance_loss_mlp": 1.33275747, "epoch": 0.05128513452577785, "flos": 18268957699200.0, "grad_norm": 3.4281229407797915, "language_loss": 0.81977731, "learning_rate": 3.995249886196811e-06, "loss": 0.8517639, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 3.171875, "router_z_loss_mlp": 0.42797852, "step": 853, "time_per_iteration": 2.8876209259033203 }, { "auxiliary_loss_clip": 0.01804689, "auxiliary_loss_mlp": 0.01377132, "balance_loss_clip": 1.49787354, "balance_loss_mlp": 1.33085454, "epoch": 0.05134525777844581, "flos": 27210444278400.0, "grad_norm": 1.7967464202972343, "language_loss": 0.79279524, "learning_rate": 3.995223022193999e-06, "loss": 0.82461345, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.46289062, "step": 854, "time_per_iteration": 7.1201441287994385 }, { "auxiliary_loss_clip": 0.01829336, "auxiliary_loss_mlp": 0.01362233, "balance_loss_clip": 1.5116694, "balance_loss_mlp": 1.3195796, "epoch": 0.051405381031113785, "flos": 28372931660160.0, "grad_norm": 2.066049808287637, "language_loss": 0.8404395, "learning_rate": 3.99519608253213e-06, "loss": 0.87235522, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 3.17773438, "router_z_loss_mlp": 0.42675781, "step": 855, "time_per_iteration": 2.9628236293792725 }, { "auxiliary_loss_clip": 0.01576137, "auxiliary_loss_mlp": 0.01350152, "balance_loss_clip": 1.39099693, "balance_loss_mlp": 1.32983899, "epoch": 0.05146550428378175, "flos": 65649018274560.0, "grad_norm": 1.0166848510085653, "language_loss": 0.65837896, "learning_rate": 3.995169067212227e-06, "loss": 0.68764186, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.203125, "step": 856, "time_per_iteration": 3.348721504211426 }, { "auxiliary_loss_clip": 0.01808813, "auxiliary_loss_mlp": 0.01356211, "balance_loss_clip": 1.50570428, "balance_loss_mlp": 1.31434441, "epoch": 0.05152562753644972, "flos": 22065116987520.0, "grad_norm": 1.6972895422584147, "language_loss": 0.78500831, "learning_rate": 3.9951419762353116e-06, "loss": 0.81665862, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.41821289, "step": 857, "time_per_iteration": 2.9286608695983887 }, { "auxiliary_loss_clip": 0.01817284, "auxiliary_loss_mlp": 0.01366413, "balance_loss_clip": 1.50524783, "balance_loss_mlp": 1.32356966, "epoch": 0.051585750789117694, "flos": 18517271016960.0, "grad_norm": 2.1992065336226343, "language_loss": 0.8981958, "learning_rate": 3.995114809602412e-06, "loss": 0.93003279, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.4284668, "step": 858, "time_per_iteration": 2.9300036430358887 }, { "auxiliary_loss_clip": 0.01830388, "auxiliary_loss_mlp": 0.01362196, "balance_loss_clip": 1.51691341, "balance_loss_mlp": 1.31720686, "epoch": 0.05164587404178566, "flos": 23740157174400.0, "grad_norm": 2.003928726043768, "language_loss": 0.78726923, "learning_rate": 3.9950875673145605e-06, "loss": 0.81919509, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.44995117, "step": 859, "time_per_iteration": 2.8781325817108154 }, { "auxiliary_loss_clip": 0.01850452, "auxiliary_loss_mlp": 0.0136897, "balance_loss_clip": 1.52527332, "balance_loss_mlp": 1.32390881, "epoch": 0.05170599729445363, "flos": 16261439587200.0, "grad_norm": 2.1278202593354476, "language_loss": 0.92207611, "learning_rate": 3.995060249372788e-06, "loss": 0.9542703, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 3.25390625, "router_z_loss_mlp": 0.45019531, "step": 860, "time_per_iteration": 2.8851518630981445 }, { "auxiliary_loss_clip": 0.01828204, "auxiliary_loss_mlp": 0.01358449, "balance_loss_clip": 1.51953816, "balance_loss_mlp": 1.31708312, "epoch": 0.0517661205471216, "flos": 23995981128960.0, "grad_norm": 1.7694978813659668, "language_loss": 0.83461797, "learning_rate": 3.99503285577813e-06, "loss": 0.86648452, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.41381836, "step": 861, "time_per_iteration": 2.9634711742401123 }, { "auxiliary_loss_clip": 0.01852806, "auxiliary_loss_mlp": 0.01369825, "balance_loss_clip": 1.53507268, "balance_loss_mlp": 1.32917476, "epoch": 0.05182624379978957, "flos": 29289503698560.0, "grad_norm": 1.5563595125341918, "language_loss": 0.80394101, "learning_rate": 3.995005386531627e-06, "loss": 0.83616734, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.40649414, "step": 862, "time_per_iteration": 2.9379584789276123 }, { "auxiliary_loss_clip": 0.01830652, "auxiliary_loss_mlp": 0.01357801, "balance_loss_clip": 1.52242565, "balance_loss_mlp": 1.31397963, "epoch": 0.05188636705245754, "flos": 24181575874560.0, "grad_norm": 2.2707327723332122, "language_loss": 0.91866994, "learning_rate": 3.9949778416343195e-06, "loss": 0.95055443, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.4387207, "step": 863, "time_per_iteration": 2.9594500064849854 }, { "auxiliary_loss_clip": 0.018547, "auxiliary_loss_mlp": 0.01378785, "balance_loss_clip": 1.5396688, "balance_loss_mlp": 1.33369994, "epoch": 0.051946490305125506, "flos": 26771966490240.0, "grad_norm": 1.9135553070425162, "language_loss": 0.77285695, "learning_rate": 3.9949502210872525e-06, "loss": 0.80519176, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 3.1484375, "router_z_loss_mlp": 0.45068359, "step": 864, "time_per_iteration": 2.916667938232422 }, { "auxiliary_loss_clip": 0.01851128, "auxiliary_loss_mlp": 0.01362304, "balance_loss_clip": 1.53160191, "balance_loss_mlp": 1.32170153, "epoch": 0.05200661355779348, "flos": 21511318907520.0, "grad_norm": 2.3824763250921954, "language_loss": 0.814924, "learning_rate": 3.994922524891474e-06, "loss": 0.84705842, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 3.19726562, "router_z_loss_mlp": 0.40600586, "step": 865, "time_per_iteration": 2.9067957401275635 }, { "auxiliary_loss_clip": 0.0182903, "auxiliary_loss_mlp": 0.01361395, "balance_loss_clip": 1.52244174, "balance_loss_mlp": 1.31909978, "epoch": 0.05206673681046144, "flos": 18123748577280.0, "grad_norm": 2.1988326702584735, "language_loss": 0.88584638, "learning_rate": 3.994894753048032e-06, "loss": 0.9177506, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 3.06640625, "router_z_loss_mlp": 0.42333984, "step": 866, "time_per_iteration": 2.9035234451293945 }, { "auxiliary_loss_clip": 0.01844, "auxiliary_loss_mlp": 0.01377901, "balance_loss_clip": 1.5372839, "balance_loss_mlp": 1.33779883, "epoch": 0.052126860063129415, "flos": 17531600889600.0, "grad_norm": 2.1165075224252465, "language_loss": 0.90509367, "learning_rate": 3.9948669055579815e-06, "loss": 0.93731272, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.40087891, "step": 867, "time_per_iteration": 2.9128005504608154 }, { "auxiliary_loss_clip": 0.01818964, "auxiliary_loss_mlp": 0.01368096, "balance_loss_clip": 1.52205837, "balance_loss_mlp": 1.32732606, "epoch": 0.05218698331579739, "flos": 32610735872640.0, "grad_norm": 1.4607265956456723, "language_loss": 0.64220965, "learning_rate": 3.9948389824223785e-06, "loss": 0.67408025, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.4074707, "step": 868, "time_per_iteration": 3.0640904903411865 }, { "auxiliary_loss_clip": 0.01866036, "auxiliary_loss_mlp": 0.01358639, "balance_loss_clip": 1.54187369, "balance_loss_mlp": 1.31546199, "epoch": 0.05224710656846535, "flos": 22137427457280.0, "grad_norm": 1.7261248696424438, "language_loss": 0.8508594, "learning_rate": 3.994810983642281e-06, "loss": 0.88310611, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 3.23632812, "router_z_loss_mlp": 0.43164062, "step": 869, "time_per_iteration": 2.917043924331665 }, { "auxiliary_loss_clip": 0.01863737, "auxiliary_loss_mlp": 0.01381443, "balance_loss_clip": 1.53866887, "balance_loss_mlp": 1.33683527, "epoch": 0.052307229821133325, "flos": 11152380643200.0, "grad_norm": 2.0377219942512115, "language_loss": 0.8958711, "learning_rate": 3.994782909218751e-06, "loss": 0.92832291, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 3.25, "router_z_loss_mlp": 0.44580078, "step": 870, "time_per_iteration": 2.97171688079834 }, { "auxiliary_loss_clip": 0.01850786, "auxiliary_loss_mlp": 0.0136775, "balance_loss_clip": 1.53517175, "balance_loss_mlp": 1.3231895, "epoch": 0.05236735307380129, "flos": 19135506971520.0, "grad_norm": 2.1034959426379363, "language_loss": 0.82252818, "learning_rate": 3.994754759152854e-06, "loss": 0.85471356, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 3.15429688, "router_z_loss_mlp": 0.44580078, "step": 871, "time_per_iteration": 2.9046144485473633 }, { "auxiliary_loss_clip": 0.01819981, "auxiliary_loss_mlp": 0.01360846, "balance_loss_clip": 1.52098656, "balance_loss_mlp": 1.31764436, "epoch": 0.05242747632646926, "flos": 20970732309120.0, "grad_norm": 1.8439899003995517, "language_loss": 0.82167101, "learning_rate": 3.994726533445656e-06, "loss": 0.85347927, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 2.99023438, "router_z_loss_mlp": 0.43212891, "step": 872, "time_per_iteration": 2.9418015480041504 }, { "auxiliary_loss_clip": 0.01568037, "auxiliary_loss_mlp": 0.01384349, "balance_loss_clip": 1.38824129, "balance_loss_mlp": 1.36928082, "epoch": 0.052487599579137234, "flos": 65047640647680.0, "grad_norm": 0.9114661984303519, "language_loss": 0.61750519, "learning_rate": 3.9946982320982274e-06, "loss": 0.64702904, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.15039062, "step": 873, "time_per_iteration": 3.269615650177002 }, { "auxiliary_loss_clip": 0.01828929, "auxiliary_loss_mlp": 0.01364362, "balance_loss_clip": 1.51872659, "balance_loss_mlp": 1.31875205, "epoch": 0.0525477228318052, "flos": 23297743088640.0, "grad_norm": 2.232765692489517, "language_loss": 0.91665459, "learning_rate": 3.994669855111643e-06, "loss": 0.94858748, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 3.1015625, "router_z_loss_mlp": 0.45629883, "step": 874, "time_per_iteration": 2.8996899127960205 }, { "auxiliary_loss_clip": 0.01826651, "auxiliary_loss_mlp": 0.01357293, "balance_loss_clip": 1.51545358, "balance_loss_mlp": 1.31092012, "epoch": 0.05260784608447317, "flos": 32242034845440.0, "grad_norm": 1.8286756194771692, "language_loss": 0.7659229, "learning_rate": 3.994641402486977e-06, "loss": 0.79776227, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.46386719, "step": 875, "time_per_iteration": 3.078219413757324 }, { "auxiliary_loss_clip": 0.01800907, "auxiliary_loss_mlp": 0.01358723, "balance_loss_clip": 1.49698472, "balance_loss_mlp": 1.31547427, "epoch": 0.052667969337141136, "flos": 24474165868800.0, "grad_norm": 1.8111117713375375, "language_loss": 0.93941855, "learning_rate": 3.99461287422531e-06, "loss": 0.97101486, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.43261719, "step": 876, "time_per_iteration": 2.949913263320923 }, { "auxiliary_loss_clip": 0.01541522, "auxiliary_loss_mlp": 0.01363613, "balance_loss_clip": 1.36036658, "balance_loss_mlp": 1.34463537, "epoch": 0.05272809258980911, "flos": 57815897016960.0, "grad_norm": 0.8244072927814181, "language_loss": 0.62998748, "learning_rate": 3.994584270327722e-06, "loss": 0.65903878, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.18945312, "step": 877, "time_per_iteration": 3.343492269515991 }, { "auxiliary_loss_clip": 0.01800666, "auxiliary_loss_mlp": 0.01359869, "balance_loss_clip": 1.49258852, "balance_loss_mlp": 1.30992055, "epoch": 0.05278821584247708, "flos": 17429763548160.0, "grad_norm": 4.189960888742742, "language_loss": 0.87971097, "learning_rate": 3.994555590795299e-06, "loss": 0.91131639, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.49951172, "step": 878, "time_per_iteration": 2.9486749172210693 }, { "auxiliary_loss_clip": 0.01837989, "auxiliary_loss_mlp": 0.01363508, "balance_loss_clip": 1.51474774, "balance_loss_mlp": 1.31432247, "epoch": 0.052848339095145046, "flos": 26147531998080.0, "grad_norm": 1.9200719947467402, "language_loss": 0.85137105, "learning_rate": 3.9945268356291275e-06, "loss": 0.88338602, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 3.23242188, "router_z_loss_mlp": 0.4921875, "step": 879, "time_per_iteration": 2.924668550491333 }, { "auxiliary_loss_clip": 0.01821014, "auxiliary_loss_mlp": 0.01371327, "balance_loss_clip": 1.51379347, "balance_loss_mlp": 1.31467879, "epoch": 0.05290846234781302, "flos": 16480768970880.0, "grad_norm": 1.7029594594745732, "language_loss": 0.86198425, "learning_rate": 3.9944980048302985e-06, "loss": 0.89390779, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.56640625, "step": 880, "time_per_iteration": 2.940678834915161 }, { "auxiliary_loss_clip": 0.01849287, "auxiliary_loss_mlp": 0.01353515, "balance_loss_clip": 1.53299463, "balance_loss_mlp": 1.30764341, "epoch": 0.05296858560048098, "flos": 19874718817920.0, "grad_norm": 6.342344532104024, "language_loss": 0.90035778, "learning_rate": 3.994469098399906e-06, "loss": 0.9323858, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.45898438, "step": 881, "time_per_iteration": 2.980752468109131 }, { "auxiliary_loss_clip": 0.01827373, "auxiliary_loss_mlp": 0.01356874, "balance_loss_clip": 1.51075494, "balance_loss_mlp": 1.29912949, "epoch": 0.053028708853148955, "flos": 24399095466240.0, "grad_norm": 2.619292917965124, "language_loss": 0.89445192, "learning_rate": 3.994440116339046e-06, "loss": 0.92629439, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 3.16796875, "router_z_loss_mlp": 0.5769043, "step": 882, "time_per_iteration": 2.9575963020324707 }, { "auxiliary_loss_clip": 0.01830939, "auxiliary_loss_mlp": 0.01369397, "balance_loss_clip": 1.51536751, "balance_loss_mlp": 1.32364464, "epoch": 0.05308883210581693, "flos": 36406985650560.0, "grad_norm": 2.292568020602893, "language_loss": 0.72086704, "learning_rate": 3.994411058648816e-06, "loss": 0.75287038, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 3.15820312, "router_z_loss_mlp": 0.45776367, "step": 883, "time_per_iteration": 3.0957469940185547 }, { "auxiliary_loss_clip": 0.01820832, "auxiliary_loss_mlp": 0.01350373, "balance_loss_clip": 1.50926507, "balance_loss_mlp": 1.29734898, "epoch": 0.05314895535848489, "flos": 22864920900480.0, "grad_norm": 1.8801506760104214, "language_loss": 0.78126299, "learning_rate": 3.994381925330319e-06, "loss": 0.81297505, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 3.109375, "router_z_loss_mlp": 0.53076172, "step": 884, "time_per_iteration": 2.891754627227783 }, { "auxiliary_loss_clip": 0.01820893, "auxiliary_loss_mlp": 0.01340818, "balance_loss_clip": 1.51645029, "balance_loss_mlp": 1.28965378, "epoch": 0.053209078611152864, "flos": 12868168412160.0, "grad_norm": 2.0867899253512787, "language_loss": 0.88063407, "learning_rate": 3.994352716384659e-06, "loss": 0.91225117, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.51196289, "step": 885, "time_per_iteration": 4.38989782333374 }, { "auxiliary_loss_clip": 0.01840395, "auxiliary_loss_mlp": 0.01341788, "balance_loss_clip": 1.5228883, "balance_loss_mlp": 1.28626013, "epoch": 0.05326920186382083, "flos": 12171604429440.0, "grad_norm": 2.3494068003137394, "language_loss": 0.89352345, "learning_rate": 3.994323431812945e-06, "loss": 0.92534524, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 3.17578125, "router_z_loss_mlp": 0.5559082, "step": 886, "time_per_iteration": 2.9353585243225098 }, { "auxiliary_loss_clip": 0.01808147, "auxiliary_loss_mlp": 0.01352665, "balance_loss_clip": 1.50146484, "balance_loss_mlp": 1.29897308, "epoch": 0.0533293251164888, "flos": 22713060792960.0, "grad_norm": 1.9677783655486352, "language_loss": 0.91324037, "learning_rate": 3.994294071616286e-06, "loss": 0.94484842, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.53710938, "step": 887, "time_per_iteration": 2.8994715213775635 }, { "auxiliary_loss_clip": 0.01825567, "auxiliary_loss_mlp": 0.01342842, "balance_loss_clip": 1.51131713, "balance_loss_mlp": 1.28819621, "epoch": 0.053389448369156774, "flos": 26951679411840.0, "grad_norm": 2.42149636260106, "language_loss": 0.77242804, "learning_rate": 3.994264635795796e-06, "loss": 0.80411208, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.54711914, "step": 888, "time_per_iteration": 2.9645450115203857 }, { "auxiliary_loss_clip": 0.01825061, "auxiliary_loss_mlp": 0.01358587, "balance_loss_clip": 1.51172471, "balance_loss_mlp": 1.30398881, "epoch": 0.05344957162182474, "flos": 25567193203200.0, "grad_norm": 2.0852687813171733, "language_loss": 0.90151858, "learning_rate": 3.994235124352592e-06, "loss": 0.93335509, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.5456543, "step": 889, "time_per_iteration": 6.02805233001709 }, { "auxiliary_loss_clip": 0.01819501, "auxiliary_loss_mlp": 0.01338042, "balance_loss_clip": 1.51269126, "balance_loss_mlp": 1.28806901, "epoch": 0.05350969487449271, "flos": 19729192982400.0, "grad_norm": 1.8594039118978922, "language_loss": 0.90204954, "learning_rate": 3.994205537287791e-06, "loss": 0.93362498, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 3.06445312, "router_z_loss_mlp": 0.49926758, "step": 890, "time_per_iteration": 4.285792112350464 }, { "auxiliary_loss_clip": 0.01819079, "auxiliary_loss_mlp": 0.01336548, "balance_loss_clip": 1.50529587, "balance_loss_mlp": 1.28884077, "epoch": 0.053569818127160676, "flos": 27027745200000.0, "grad_norm": 2.4204191542134263, "language_loss": 0.95190752, "learning_rate": 3.994175874602517e-06, "loss": 0.98346376, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 3.13476562, "router_z_loss_mlp": 0.47680664, "step": 891, "time_per_iteration": 2.991344451904297 }, { "auxiliary_loss_clip": 0.01817344, "auxiliary_loss_mlp": 0.01334956, "balance_loss_clip": 1.50421977, "balance_loss_mlp": 1.28410125, "epoch": 0.05362994137982865, "flos": 13196167102080.0, "grad_norm": 1.7341180593992296, "language_loss": 0.73255563, "learning_rate": 3.994146136297893e-06, "loss": 0.76407862, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.5090332, "step": 892, "time_per_iteration": 2.8669216632843018 }, { "auxiliary_loss_clip": 0.01827967, "auxiliary_loss_mlp": 0.01344879, "balance_loss_clip": 1.51594377, "balance_loss_mlp": 1.29760051, "epoch": 0.05369006463249662, "flos": 28669774665600.0, "grad_norm": 1.9776620580344464, "language_loss": 0.84789604, "learning_rate": 3.994116322375049e-06, "loss": 0.87962449, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 3.12109375, "router_z_loss_mlp": 0.47241211, "step": 893, "time_per_iteration": 2.973574161529541 }, { "auxiliary_loss_clip": 0.01823167, "auxiliary_loss_mlp": 0.01336882, "balance_loss_clip": 1.50769567, "balance_loss_mlp": 1.28631306, "epoch": 0.053750187885164585, "flos": 28924919948160.0, "grad_norm": 1.774188471381303, "language_loss": 0.83699286, "learning_rate": 3.994086432835114e-06, "loss": 0.86859334, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.50610352, "step": 894, "time_per_iteration": 2.9355788230895996 }, { "auxiliary_loss_clip": 0.01815447, "auxiliary_loss_mlp": 0.01327712, "balance_loss_clip": 1.50680447, "balance_loss_mlp": 1.28224516, "epoch": 0.05381031113783256, "flos": 15167235888000.0, "grad_norm": 2.6488484702625748, "language_loss": 0.77675927, "learning_rate": 3.994056467679221e-06, "loss": 0.80819082, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 3.0859375, "router_z_loss_mlp": 0.4543457, "step": 895, "time_per_iteration": 2.8974344730377197 }, { "auxiliary_loss_clip": 0.01836184, "auxiliary_loss_mlp": 0.01345751, "balance_loss_clip": 1.51685596, "balance_loss_mlp": 1.29539704, "epoch": 0.05387043439050053, "flos": 21845244666240.0, "grad_norm": 2.047597700048923, "language_loss": 0.88850421, "learning_rate": 3.9940264269085065e-06, "loss": 0.92032349, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 3.1953125, "router_z_loss_mlp": 0.50317383, "step": 896, "time_per_iteration": 2.938079833984375 }, { "auxiliary_loss_clip": 0.0183299, "auxiliary_loss_mlp": 0.01339245, "balance_loss_clip": 1.5149734, "balance_loss_mlp": 1.28572059, "epoch": 0.053930557643168495, "flos": 17318967736320.0, "grad_norm": 2.1000402481542433, "language_loss": 0.89968133, "learning_rate": 3.9939963105241115e-06, "loss": 0.93140376, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.53540039, "step": 897, "time_per_iteration": 2.8849101066589355 }, { "auxiliary_loss_clip": 0.01792134, "auxiliary_loss_mlp": 0.01328286, "balance_loss_clip": 1.49125373, "balance_loss_mlp": 1.27190042, "epoch": 0.05399068089583647, "flos": 17357588812800.0, "grad_norm": 1.7214176860540233, "language_loss": 0.92299122, "learning_rate": 3.993966118527175e-06, "loss": 0.95419538, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.56420898, "step": 898, "time_per_iteration": 2.948834180831909 }, { "auxiliary_loss_clip": 0.01811634, "auxiliary_loss_mlp": 0.01334077, "balance_loss_clip": 1.49247384, "balance_loss_mlp": 1.28267431, "epoch": 0.05405080414850443, "flos": 17494156177920.0, "grad_norm": 3.0181159383087737, "language_loss": 0.94630206, "learning_rate": 3.993935850918845e-06, "loss": 0.97775924, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 3.18945312, "router_z_loss_mlp": 0.51416016, "step": 899, "time_per_iteration": 2.8465311527252197 }, { "auxiliary_loss_clip": 0.01809096, "auxiliary_loss_mlp": 0.0132014, "balance_loss_clip": 1.50926292, "balance_loss_mlp": 1.26990473, "epoch": 0.054110927401172404, "flos": 24506995610880.0, "grad_norm": 2.783918634899374, "language_loss": 0.76743126, "learning_rate": 3.9939055077002665e-06, "loss": 0.79872358, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 2.99609375, "router_z_loss_mlp": 0.50317383, "step": 900, "time_per_iteration": 2.973247766494751 }, { "auxiliary_loss_clip": 0.01798848, "auxiliary_loss_mlp": 0.01335497, "balance_loss_clip": 1.49030232, "balance_loss_mlp": 1.27434266, "epoch": 0.054171050653840376, "flos": 22940398506240.0, "grad_norm": 2.513470820318757, "language_loss": 0.77238572, "learning_rate": 3.993875088872592e-06, "loss": 0.80372918, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.61206055, "step": 901, "time_per_iteration": 2.9066410064697266 }, { "auxiliary_loss_clip": 0.01771224, "auxiliary_loss_mlp": 0.01312642, "balance_loss_clip": 1.48025572, "balance_loss_mlp": 1.26436234, "epoch": 0.05423117390650834, "flos": 12941021819520.0, "grad_norm": 4.020323794041706, "language_loss": 0.86799669, "learning_rate": 3.9938445944369745e-06, "loss": 0.8988353, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.48217773, "step": 902, "time_per_iteration": 2.875523805618286 }, { "auxiliary_loss_clip": 0.01794277, "auxiliary_loss_mlp": 0.01320592, "balance_loss_clip": 1.48742819, "balance_loss_mlp": 1.26394343, "epoch": 0.05429129715917631, "flos": 19910806185600.0, "grad_norm": 1.8379763427161284, "language_loss": 0.88245296, "learning_rate": 3.993814024394569e-06, "loss": 0.91360164, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.56689453, "step": 903, "time_per_iteration": 2.99001407623291 }, { "auxiliary_loss_clip": 0.01795298, "auxiliary_loss_mlp": 0.01315418, "balance_loss_clip": 1.49021804, "balance_loss_mlp": 1.26439619, "epoch": 0.05435142041184428, "flos": 16917075008640.0, "grad_norm": 3.635522511019434, "language_loss": 0.77855074, "learning_rate": 3.993783378746537e-06, "loss": 0.80965793, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 3.04492188, "router_z_loss_mlp": 0.51000977, "step": 904, "time_per_iteration": 2.872283458709717 }, { "auxiliary_loss_clip": 0.01807286, "auxiliary_loss_mlp": 0.01310812, "balance_loss_clip": 1.49694157, "balance_loss_mlp": 1.26112604, "epoch": 0.05441154366451225, "flos": 23958083969280.0, "grad_norm": 2.2450681190330424, "language_loss": 0.88082826, "learning_rate": 3.993752657494039e-06, "loss": 0.91200918, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 3.10351562, "router_z_loss_mlp": 0.49731445, "step": 905, "time_per_iteration": 2.928906202316284 }, { "auxiliary_loss_clip": 0.01799434, "auxiliary_loss_mlp": 0.01317229, "balance_loss_clip": 1.50462413, "balance_loss_mlp": 1.26267934, "epoch": 0.05447166691718022, "flos": 19985152671360.0, "grad_norm": 2.8940891997299967, "language_loss": 0.75633049, "learning_rate": 3.993721860638241e-06, "loss": 0.7874971, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 2.94726562, "router_z_loss_mlp": 0.54541016, "step": 906, "time_per_iteration": 2.892223358154297 }, { "auxiliary_loss_clip": 0.01809935, "auxiliary_loss_mlp": 0.0132403, "balance_loss_clip": 1.50437939, "balance_loss_mlp": 1.2714107, "epoch": 0.05453179016984819, "flos": 24947328435840.0, "grad_norm": 1.839839549330035, "language_loss": 0.89762485, "learning_rate": 3.993690988180309e-06, "loss": 0.9289645, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.52612305, "step": 907, "time_per_iteration": 3.0679714679718018 }, { "auxiliary_loss_clip": 0.01805964, "auxiliary_loss_mlp": 0.01324462, "balance_loss_clip": 1.49746776, "balance_loss_mlp": 1.26342702, "epoch": 0.05459191342251616, "flos": 18124517738880.0, "grad_norm": 3.808097026398685, "language_loss": 0.88467824, "learning_rate": 3.9936600401214165e-06, "loss": 0.91598248, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.6105957, "step": 908, "time_per_iteration": 2.9402353763580322 }, { "auxiliary_loss_clip": 0.01819141, "auxiliary_loss_mlp": 0.01304859, "balance_loss_clip": 1.51185405, "balance_loss_mlp": 1.25481462, "epoch": 0.054652036675184125, "flos": 19217635562880.0, "grad_norm": 2.071742558990563, "language_loss": 0.91760516, "learning_rate": 3.9936290164627345e-06, "loss": 0.94884509, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.49975586, "step": 909, "time_per_iteration": 2.8678462505340576 }, { "auxiliary_loss_clip": 0.0181916, "auxiliary_loss_mlp": 0.01314232, "balance_loss_clip": 1.50630784, "balance_loss_mlp": 1.26278138, "epoch": 0.0547121599278521, "flos": 16334292994560.0, "grad_norm": 1.9725686797636022, "language_loss": 0.7395097, "learning_rate": 3.99359791720544e-06, "loss": 0.77084363, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 3.12890625, "router_z_loss_mlp": 0.51464844, "step": 910, "time_per_iteration": 2.9442813396453857 }, { "auxiliary_loss_clip": 0.01798481, "auxiliary_loss_mlp": 0.01306952, "balance_loss_clip": 1.49972177, "balance_loss_mlp": 1.25969768, "epoch": 0.05477228318052007, "flos": 20348560056960.0, "grad_norm": 4.296121085858535, "language_loss": 0.85511494, "learning_rate": 3.993566742350714e-06, "loss": 0.88616925, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 2.98632812, "router_z_loss_mlp": 0.47241211, "step": 911, "time_per_iteration": 2.8902196884155273 }, { "auxiliary_loss_clip": 0.01834938, "auxiliary_loss_mlp": 0.01311461, "balance_loss_clip": 1.5218792, "balance_loss_mlp": 1.25815034, "epoch": 0.054832406433188034, "flos": 21981042869760.0, "grad_norm": 2.4367855726741645, "language_loss": 0.78793681, "learning_rate": 3.993535491899736e-06, "loss": 0.81940079, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.53369141, "step": 912, "time_per_iteration": 2.887500286102295 }, { "auxiliary_loss_clip": 0.01809126, "auxiliary_loss_mlp": 0.01294013, "balance_loss_clip": 1.50858974, "balance_loss_mlp": 1.24673426, "epoch": 0.054892529685856006, "flos": 16407553605120.0, "grad_norm": 2.162502011067309, "language_loss": 0.85739934, "learning_rate": 3.993504165853694e-06, "loss": 0.88843071, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.47290039, "step": 913, "time_per_iteration": 2.8801772594451904 }, { "auxiliary_loss_clip": 0.01811921, "auxiliary_loss_mlp": 0.01277972, "balance_loss_clip": 1.51135182, "balance_loss_mlp": 1.22747517, "epoch": 0.05495265293852397, "flos": 23922222825600.0, "grad_norm": 3.5432700374361965, "language_loss": 0.85183299, "learning_rate": 3.993472764213772e-06, "loss": 0.88273191, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.50537109, "step": 914, "time_per_iteration": 2.9222562313079834 }, { "auxiliary_loss_clip": 0.01822716, "auxiliary_loss_mlp": 0.01280819, "balance_loss_clip": 1.51854992, "balance_loss_mlp": 1.23513794, "epoch": 0.055012776191191944, "flos": 23597527006080.0, "grad_norm": 2.223313911790687, "language_loss": 0.93764597, "learning_rate": 3.9934412869811655e-06, "loss": 0.96868134, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.45678711, "step": 915, "time_per_iteration": 2.9515678882598877 }, { "auxiliary_loss_clip": 0.01812385, "auxiliary_loss_mlp": 0.01283608, "balance_loss_clip": 1.51045215, "balance_loss_mlp": 1.23172808, "epoch": 0.055072899443859916, "flos": 17537030265600.0, "grad_norm": 1.6952305176205038, "language_loss": 0.91252571, "learning_rate": 3.993409734157064e-06, "loss": 0.94348562, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 3.01953125, "router_z_loss_mlp": 0.51879883, "step": 916, "time_per_iteration": 2.8573005199432373 }, { "auxiliary_loss_clip": 0.01831952, "auxiliary_loss_mlp": 0.01275272, "balance_loss_clip": 1.51393878, "balance_loss_mlp": 1.23054528, "epoch": 0.05513302269652788, "flos": 21696596939520.0, "grad_norm": 2.0728064789163625, "language_loss": 0.82017642, "learning_rate": 3.993378105742666e-06, "loss": 0.85124874, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 3.1796875, "router_z_loss_mlp": 0.44702148, "step": 917, "time_per_iteration": 3.0046277046203613 }, { "auxiliary_loss_clip": 0.01842565, "auxiliary_loss_mlp": 0.01277238, "balance_loss_clip": 1.52977335, "balance_loss_mlp": 1.22614539, "epoch": 0.05519314594919585, "flos": 21622521922560.0, "grad_norm": 1.8758102119320275, "language_loss": 0.82289231, "learning_rate": 3.9933464017391705e-06, "loss": 0.85409033, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.51049805, "step": 918, "time_per_iteration": 2.9367332458496094 }, { "auxiliary_loss_clip": 0.01841115, "auxiliary_loss_mlp": 0.01281238, "balance_loss_clip": 1.52192795, "balance_loss_mlp": 1.22304058, "epoch": 0.05525326920186382, "flos": 21808523871360.0, "grad_norm": 2.1477621549127073, "language_loss": 0.90166974, "learning_rate": 3.99331462214778e-06, "loss": 0.93289328, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 3.19335938, "router_z_loss_mlp": 0.58154297, "step": 919, "time_per_iteration": 2.947847604751587 }, { "auxiliary_loss_clip": 0.0182475, "auxiliary_loss_mlp": 0.01260962, "balance_loss_clip": 1.51625562, "balance_loss_mlp": 1.20703149, "epoch": 0.05531339245453179, "flos": 28451485912320.0, "grad_norm": 2.16259163437597, "language_loss": 0.9004454, "learning_rate": 3.993282766969699e-06, "loss": 0.93130249, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.5390625, "step": 920, "time_per_iteration": 4.363829135894775 }, { "auxiliary_loss_clip": 0.01828106, "auxiliary_loss_mlp": 0.01252203, "balance_loss_clip": 1.5244658, "balance_loss_mlp": 1.19309914, "epoch": 0.05537351570719976, "flos": 37388402766720.0, "grad_norm": 2.916465323927769, "language_loss": 0.6804232, "learning_rate": 3.993250836206136e-06, "loss": 0.71122628, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 3.03710938, "router_z_loss_mlp": 0.59106445, "step": 921, "time_per_iteration": 3.050734281539917 }, { "auxiliary_loss_clip": 0.01863353, "auxiliary_loss_mlp": 0.01234188, "balance_loss_clip": 1.53925312, "balance_loss_mlp": 1.17453575, "epoch": 0.05543363895986773, "flos": 20094183936000.0, "grad_norm": 1.8621926257525712, "language_loss": 0.74321294, "learning_rate": 3.993218829858301e-06, "loss": 0.77418834, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 3.24023438, "router_z_loss_mlp": 0.59667969, "step": 922, "time_per_iteration": 2.9207465648651123 }, { "auxiliary_loss_clip": 0.01836176, "auxiliary_loss_mlp": 0.01218895, "balance_loss_clip": 1.52290404, "balance_loss_mlp": 1.15917158, "epoch": 0.0554937622125357, "flos": 24543399692160.0, "grad_norm": 2.4075152153371104, "language_loss": 0.84960294, "learning_rate": 3.993186747927408e-06, "loss": 0.8801536, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.59692383, "step": 923, "time_per_iteration": 2.9464728832244873 }, { "auxiliary_loss_clip": 0.01813052, "auxiliary_loss_mlp": 0.01242433, "balance_loss_clip": 1.50640559, "balance_loss_mlp": 1.17851257, "epoch": 0.055553885465203665, "flos": 14328358450560.0, "grad_norm": 1.9966025039225748, "language_loss": 0.81055301, "learning_rate": 3.993154590414675e-06, "loss": 0.84110785, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.63891602, "step": 924, "time_per_iteration": 4.420054197311401 }, { "auxiliary_loss_clip": 0.01802992, "auxiliary_loss_mlp": 0.01225223, "balance_loss_clip": 1.50149763, "balance_loss_mlp": 1.17277074, "epoch": 0.05561400871787164, "flos": 27392600419200.0, "grad_norm": 2.19191681749827, "language_loss": 1.03813112, "learning_rate": 3.993122357321319e-06, "loss": 1.06841326, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.52441406, "step": 925, "time_per_iteration": 2.952658176422119 }, { "auxiliary_loss_clip": 0.01818934, "auxiliary_loss_mlp": 0.01225922, "balance_loss_clip": 1.50946283, "balance_loss_mlp": 1.16865385, "epoch": 0.05567413197053961, "flos": 23231314442880.0, "grad_norm": 2.0936737817801654, "language_loss": 0.82510746, "learning_rate": 3.993090048648564e-06, "loss": 0.85555601, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 3.09765625, "router_z_loss_mlp": 0.57324219, "step": 926, "time_per_iteration": 2.9741051197052 }, { "auxiliary_loss_clip": 0.01859545, "auxiliary_loss_mlp": 0.01214459, "balance_loss_clip": 1.52988219, "balance_loss_mlp": 1.16358006, "epoch": 0.055734255223207574, "flos": 25275870063360.0, "grad_norm": 2.8047397473570923, "language_loss": 0.76422632, "learning_rate": 3.993057664397634e-06, "loss": 0.79496634, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 3.29492188, "router_z_loss_mlp": 0.50927734, "step": 927, "time_per_iteration": 2.9632952213287354 }, { "auxiliary_loss_clip": 0.01524745, "auxiliary_loss_mlp": 0.01281707, "balance_loss_clip": 1.34803295, "balance_loss_mlp": 1.26034474, "epoch": 0.055794378475875546, "flos": 66536606378880.0, "grad_norm": 0.7989656387942049, "language_loss": 0.60015935, "learning_rate": 3.9930252045697585e-06, "loss": 0.6282239, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21386719, "step": 928, "time_per_iteration": 3.373538017272949 }, { "auxiliary_loss_clip": 0.01794104, "auxiliary_loss_mlp": 0.01201874, "balance_loss_clip": 1.49049687, "balance_loss_mlp": 1.13764453, "epoch": 0.05585450172854351, "flos": 25348497246720.0, "grad_norm": 1.993697928502927, "language_loss": 0.97807825, "learning_rate": 3.992992669166168e-06, "loss": 1.00803804, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 3.03320312, "router_z_loss_mlp": 0.64233398, "step": 929, "time_per_iteration": 2.953200101852417 }, { "auxiliary_loss_clip": 0.01800258, "auxiliary_loss_mlp": 0.01194494, "balance_loss_clip": 1.49608827, "balance_loss_mlp": 1.13331532, "epoch": 0.05591462498121148, "flos": 33924811893120.0, "grad_norm": 2.205080640536082, "language_loss": 0.73961151, "learning_rate": 3.992960058188094e-06, "loss": 0.76955903, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.61157227, "step": 930, "time_per_iteration": 2.9848570823669434 }, { "auxiliary_loss_clip": 0.01805885, "auxiliary_loss_mlp": 0.01183901, "balance_loss_clip": 1.49413872, "balance_loss_mlp": 1.12925589, "epoch": 0.055974748233879455, "flos": 17939782644480.0, "grad_norm": 2.7199080134763496, "language_loss": 0.88239849, "learning_rate": 3.992927371636776e-06, "loss": 0.9122963, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.54663086, "step": 931, "time_per_iteration": 2.961127281188965 }, { "auxiliary_loss_clip": 0.01819974, "auxiliary_loss_mlp": 0.0118285, "balance_loss_clip": 1.50521255, "balance_loss_mlp": 1.13387847, "epoch": 0.05603487148654742, "flos": 24031932762240.0, "grad_norm": 1.777550438733614, "language_loss": 0.85831845, "learning_rate": 3.9928946095134525e-06, "loss": 0.88834667, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 3.15039062, "router_z_loss_mlp": 0.48950195, "step": 932, "time_per_iteration": 2.921264410018921 }, { "auxiliary_loss_clip": 0.01819874, "auxiliary_loss_mlp": 0.01199686, "balance_loss_clip": 1.50642169, "balance_loss_mlp": 1.14506459, "epoch": 0.05609499473921539, "flos": 17315755355520.0, "grad_norm": 3.7180759795969633, "language_loss": 0.77384377, "learning_rate": 3.992861771819365e-06, "loss": 0.80403942, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.54614258, "step": 933, "time_per_iteration": 2.9201443195343018 }, { "auxiliary_loss_clip": 0.01808115, "auxiliary_loss_mlp": 0.01173153, "balance_loss_clip": 1.49650502, "balance_loss_mlp": 1.11752987, "epoch": 0.05615511799188336, "flos": 21004195478400.0, "grad_norm": 2.431947597054783, "language_loss": 0.8825593, "learning_rate": 3.99282885855576e-06, "loss": 0.91237193, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 3.11523438, "router_z_loss_mlp": 0.5559082, "step": 934, "time_per_iteration": 2.9288856983184814 }, { "auxiliary_loss_clip": 0.01781886, "auxiliary_loss_mlp": 0.01183805, "balance_loss_clip": 1.49008346, "balance_loss_mlp": 1.1302563, "epoch": 0.05621524124455133, "flos": 17282111207040.0, "grad_norm": 2.244230933642363, "language_loss": 0.81859976, "learning_rate": 3.992795869723885e-06, "loss": 0.84825671, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.53491211, "step": 935, "time_per_iteration": 2.852914333343506 }, { "auxiliary_loss_clip": 0.01506819, "auxiliary_loss_mlp": 0.01246009, "balance_loss_clip": 1.328192, "balance_loss_mlp": 1.22703099, "epoch": 0.0562753644972193, "flos": 58747653325440.0, "grad_norm": 0.8373000863460053, "language_loss": 0.69211292, "learning_rate": 3.99276280532499e-06, "loss": 0.71964121, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.18945312, "step": 936, "time_per_iteration": 3.2782094478607178 }, { "auxiliary_loss_clip": 0.01834713, "auxiliary_loss_mlp": 0.011736, "balance_loss_clip": 1.51889431, "balance_loss_mlp": 1.11544991, "epoch": 0.05633548774988727, "flos": 17465941405440.0, "grad_norm": 1.8612314170687285, "language_loss": 0.79343462, "learning_rate": 3.992729665360331e-06, "loss": 0.8235178, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 3.16015625, "router_z_loss_mlp": 0.58178711, "step": 937, "time_per_iteration": 3.016991376876831 }, { "auxiliary_loss_clip": 0.01498435, "auxiliary_loss_mlp": 0.01255596, "balance_loss_clip": 1.3230865, "balance_loss_mlp": 1.21249008, "epoch": 0.05639561100255524, "flos": 70687531296000.0, "grad_norm": 0.8923820394992804, "language_loss": 0.64593017, "learning_rate": 3.992696449831162e-06, "loss": 0.6734705, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.43164062, "step": 938, "time_per_iteration": 3.3356151580810547 }, { "auxiliary_loss_clip": 0.01852264, "auxiliary_loss_mlp": 0.01148252, "balance_loss_clip": 1.52615798, "balance_loss_mlp": 1.08802772, "epoch": 0.056455734255223204, "flos": 20495850439680.0, "grad_norm": 10.89292738387266, "language_loss": 0.82072914, "learning_rate": 3.992663158738745e-06, "loss": 0.85073423, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 3.26171875, "router_z_loss_mlp": 0.6015625, "step": 939, "time_per_iteration": 2.966855764389038 }, { "auxiliary_loss_clip": 0.01802884, "auxiliary_loss_mlp": 0.01156681, "balance_loss_clip": 1.49764073, "balance_loss_mlp": 1.09378588, "epoch": 0.056515857507891176, "flos": 22063488174720.0, "grad_norm": 2.5977853870767533, "language_loss": 0.75260365, "learning_rate": 3.992629792084341e-06, "loss": 0.78219926, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 3.05273438, "router_z_loss_mlp": 0.62890625, "step": 940, "time_per_iteration": 2.9463765621185303 }, { "auxiliary_loss_clip": 0.01821132, "auxiliary_loss_mlp": 0.01139553, "balance_loss_clip": 1.51737189, "balance_loss_mlp": 1.07696843, "epoch": 0.05657598076055915, "flos": 24036140528640.0, "grad_norm": 1.9101663160291347, "language_loss": 0.73412901, "learning_rate": 3.992596349869216e-06, "loss": 0.76373589, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 3.0390625, "router_z_loss_mlp": 0.62573242, "step": 941, "time_per_iteration": 2.972933292388916 }, { "auxiliary_loss_clip": 0.01807581, "auxiliary_loss_mlp": 0.01128786, "balance_loss_clip": 1.49848926, "balance_loss_mlp": 1.0726862, "epoch": 0.05663610401322711, "flos": 20489063719680.0, "grad_norm": 2.154991494335058, "language_loss": 0.82089889, "learning_rate": 3.992562832094637e-06, "loss": 0.85026258, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.56103516, "step": 942, "time_per_iteration": 2.9231905937194824 }, { "auxiliary_loss_clip": 0.01805441, "auxiliary_loss_mlp": 0.01154505, "balance_loss_clip": 1.49837422, "balance_loss_mlp": 1.09847653, "epoch": 0.056696227265895086, "flos": 21078858677760.0, "grad_norm": 1.969478680408, "language_loss": 0.9046042, "learning_rate": 3.9925292387618755e-06, "loss": 0.93420362, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.56054688, "step": 943, "time_per_iteration": 2.952671766281128 }, { "auxiliary_loss_clip": 0.01811321, "auxiliary_loss_mlp": 0.01126002, "balance_loss_clip": 1.50448644, "balance_loss_mlp": 1.07602906, "epoch": 0.05675635051856306, "flos": 17830163197440.0, "grad_norm": 2.3150109969683355, "language_loss": 0.7875241, "learning_rate": 3.992495569872206e-06, "loss": 0.81689727, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 3.06835938, "router_z_loss_mlp": 0.5, "step": 944, "time_per_iteration": 2.9937245845794678 }, { "auxiliary_loss_clip": 0.0182348, "auxiliary_loss_mlp": 0.01127015, "balance_loss_clip": 1.5117172, "balance_loss_mlp": 1.07637489, "epoch": 0.05681647377123102, "flos": 23125993251840.0, "grad_norm": 2.023598600908038, "language_loss": 0.80577797, "learning_rate": 3.992461825426906e-06, "loss": 0.83528292, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.5065918, "step": 945, "time_per_iteration": 2.9441916942596436 }, { "auxiliary_loss_clip": 0.01828885, "auxiliary_loss_mlp": 0.01128285, "balance_loss_clip": 1.515136, "balance_loss_mlp": 1.07077813, "epoch": 0.056876597023898995, "flos": 16079057222400.0, "grad_norm": 2.5776856782879594, "language_loss": 0.8405292, "learning_rate": 3.992428005427252e-06, "loss": 0.87010086, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.57519531, "step": 946, "time_per_iteration": 2.89601993560791 }, { "auxiliary_loss_clip": 0.01835966, "auxiliary_loss_mlp": 0.01142674, "balance_loss_clip": 1.5084995, "balance_loss_mlp": 1.08299851, "epoch": 0.05693672027656696, "flos": 16844085866880.0, "grad_norm": 1.8355066976118155, "language_loss": 0.81324726, "learning_rate": 3.992394109874529e-06, "loss": 0.84303367, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 3.27148438, "router_z_loss_mlp": 0.59619141, "step": 947, "time_per_iteration": 2.948582887649536 }, { "auxiliary_loss_clip": 0.01853746, "auxiliary_loss_mlp": 0.01139829, "balance_loss_clip": 1.5245657, "balance_loss_mlp": 1.09030986, "epoch": 0.05699684352923493, "flos": 21396858266880.0, "grad_norm": 2.8695168211434687, "language_loss": 0.89341986, "learning_rate": 3.9923601387700225e-06, "loss": 0.9233557, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 3.2890625, "router_z_loss_mlp": 0.49536133, "step": 948, "time_per_iteration": 2.9198410511016846 }, { "auxiliary_loss_clip": 0.01813829, "auxiliary_loss_mlp": 0.01126792, "balance_loss_clip": 1.50337386, "balance_loss_mlp": 1.07333875, "epoch": 0.057056966781902904, "flos": 15568857146880.0, "grad_norm": 1.843729330524657, "language_loss": 0.89480019, "learning_rate": 3.992326092115019e-06, "loss": 0.92420632, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.53417969, "step": 949, "time_per_iteration": 3.0143661499023438 }, { "auxiliary_loss_clip": 0.01815096, "auxiliary_loss_mlp": 0.01131878, "balance_loss_clip": 1.50624919, "balance_loss_mlp": 1.08064151, "epoch": 0.05711709003457087, "flos": 19947346001280.0, "grad_norm": 2.4636524784259946, "language_loss": 0.8111912, "learning_rate": 3.992291969910811e-06, "loss": 0.84066093, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 3.08789062, "router_z_loss_mlp": 0.51245117, "step": 950, "time_per_iteration": 3.043057441711426 }, { "auxiliary_loss_clip": 0.01836223, "auxiliary_loss_mlp": 0.01134918, "balance_loss_clip": 1.51527572, "balance_loss_mlp": 1.07328701, "epoch": 0.05717721328723884, "flos": 30343955201280.0, "grad_norm": 2.1464229552790712, "language_loss": 0.84199166, "learning_rate": 3.992257772158691e-06, "loss": 0.87170303, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 3.2109375, "router_z_loss_mlp": 0.61621094, "step": 951, "time_per_iteration": 3.026181221008301 }, { "auxiliary_loss_clip": 0.01826237, "auxiliary_loss_mlp": 0.01119243, "balance_loss_clip": 1.50702643, "balance_loss_mlp": 1.06209397, "epoch": 0.05723733653990681, "flos": 23663503203840.0, "grad_norm": 2.347996341450788, "language_loss": 0.89865255, "learning_rate": 3.992223498859958e-06, "loss": 0.92810738, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 3.1875, "router_z_loss_mlp": 0.5715332, "step": 952, "time_per_iteration": 2.9891815185546875 }, { "auxiliary_loss_clip": 0.01847534, "auxiliary_loss_mlp": 0.01166208, "balance_loss_clip": 1.51565874, "balance_loss_mlp": 1.10040426, "epoch": 0.05729745979257478, "flos": 22066248107520.0, "grad_norm": 1.8217459827910385, "language_loss": 0.80831194, "learning_rate": 3.9921891500159084e-06, "loss": 0.8384493, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 3.31445312, "router_z_loss_mlp": 0.65820312, "step": 953, "time_per_iteration": 3.052690267562866 }, { "auxiliary_loss_clip": 0.01810574, "auxiliary_loss_mlp": 0.01133222, "balance_loss_clip": 1.4950248, "balance_loss_mlp": 1.07366526, "epoch": 0.05735758304524275, "flos": 19612424856960.0, "grad_norm": 1.9342926455715435, "language_loss": 0.89199835, "learning_rate": 3.992154725627848e-06, "loss": 0.92143631, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 3.15625, "router_z_loss_mlp": 0.59594727, "step": 954, "time_per_iteration": 3.028079032897949 }, { "auxiliary_loss_clip": 0.01821366, "auxiliary_loss_mlp": 0.01145391, "balance_loss_clip": 1.5009191, "balance_loss_mlp": 1.09186625, "epoch": 0.057417706297910716, "flos": 19108378074240.0, "grad_norm": 2.1771568903413265, "language_loss": 0.90394211, "learning_rate": 3.9921202256970804e-06, "loss": 0.93360972, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 3.19921875, "router_z_loss_mlp": 0.53515625, "step": 955, "time_per_iteration": 4.370601177215576 }, { "auxiliary_loss_clip": 0.01803572, "auxiliary_loss_mlp": 0.01151463, "balance_loss_clip": 1.49157071, "balance_loss_mlp": 1.09786606, "epoch": 0.05747782955057869, "flos": 16663241825280.0, "grad_norm": 2.408368784324984, "language_loss": 0.91776884, "learning_rate": 3.992085650224914e-06, "loss": 0.94731927, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.53613281, "step": 956, "time_per_iteration": 2.9173810482025146 }, { "auxiliary_loss_clip": 0.01799494, "auxiliary_loss_mlp": 0.01156998, "balance_loss_clip": 1.49723983, "balance_loss_mlp": 1.10168493, "epoch": 0.05753795280324665, "flos": 14510288367360.0, "grad_norm": 1.5854873244984242, "language_loss": 0.77253294, "learning_rate": 3.99205099921266e-06, "loss": 0.80209786, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.55297852, "step": 957, "time_per_iteration": 2.9963958263397217 }, { "auxiliary_loss_clip": 0.01796244, "auxiliary_loss_mlp": 0.01163023, "balance_loss_clip": 1.48546946, "balance_loss_mlp": 1.10990334, "epoch": 0.057598076055914625, "flos": 18085127500800.0, "grad_norm": 1.9111101960021382, "language_loss": 0.81484997, "learning_rate": 3.992016272661633e-06, "loss": 0.84444261, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 3.10742188, "router_z_loss_mlp": 0.53149414, "step": 958, "time_per_iteration": 2.9076666831970215 }, { "auxiliary_loss_clip": 0.01788126, "auxiliary_loss_mlp": 0.01144111, "balance_loss_clip": 1.48001909, "balance_loss_mlp": 1.09614062, "epoch": 0.0576581993085826, "flos": 22134034097280.0, "grad_norm": 3.4016494791228715, "language_loss": 0.90427709, "learning_rate": 3.99198147057315e-06, "loss": 0.93359947, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 3.08007812, "router_z_loss_mlp": 0.47973633, "step": 959, "time_per_iteration": 5.83426570892334 }, { "auxiliary_loss_clip": 0.01789907, "auxiliary_loss_mlp": 0.01147008, "balance_loss_clip": 1.48212516, "balance_loss_mlp": 1.10249555, "epoch": 0.05771832256125056, "flos": 33194558517120.0, "grad_norm": 2.2904403450486686, "language_loss": 0.80814618, "learning_rate": 3.991946592948529e-06, "loss": 0.83751529, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 3.07421875, "router_z_loss_mlp": 0.4453125, "step": 960, "time_per_iteration": 3.1046619415283203 }, { "auxiliary_loss_clip": 0.01814349, "auxiliary_loss_mlp": 0.01170118, "balance_loss_clip": 1.49410212, "balance_loss_mlp": 1.12105167, "epoch": 0.057778445813918534, "flos": 24180716223360.0, "grad_norm": 1.7920129052323541, "language_loss": 0.9497633, "learning_rate": 3.991911639789094e-06, "loss": 0.97960794, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 3.20117188, "router_z_loss_mlp": 0.49121094, "step": 961, "time_per_iteration": 3.0662448406219482 }, { "auxiliary_loss_clip": 0.01813588, "auxiliary_loss_mlp": 0.01157318, "balance_loss_clip": 1.49518657, "balance_loss_mlp": 1.10550976, "epoch": 0.0578385690665865, "flos": 29655037589760.0, "grad_norm": 3.1496734546212877, "language_loss": 0.71032959, "learning_rate": 3.991876611096169e-06, "loss": 0.74003863, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 3.18359375, "router_z_loss_mlp": 0.51806641, "step": 962, "time_per_iteration": 3.0075128078460693 }, { "auxiliary_loss_clip": 0.01801373, "auxiliary_loss_mlp": 0.01135076, "balance_loss_clip": 1.48495126, "balance_loss_mlp": 1.0882988, "epoch": 0.05789869231925447, "flos": 20894892744960.0, "grad_norm": 2.201444155386885, "language_loss": 0.8997736, "learning_rate": 3.991841506871084e-06, "loss": 0.92913806, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 3.16601562, "router_z_loss_mlp": 0.46801758, "step": 963, "time_per_iteration": 2.976529359817505 }, { "auxiliary_loss_clip": 0.01817438, "auxiliary_loss_mlp": 0.01141877, "balance_loss_clip": 1.50018108, "balance_loss_mlp": 1.09149933, "epoch": 0.057958815571922444, "flos": 26042210807040.0, "grad_norm": 2.6425702285963646, "language_loss": 0.88256055, "learning_rate": 3.99180632711517e-06, "loss": 0.91215372, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 3.17382812, "router_z_loss_mlp": 0.50390625, "step": 964, "time_per_iteration": 2.9432666301727295 }, { "auxiliary_loss_clip": 0.01801345, "auxiliary_loss_mlp": 0.01142654, "balance_loss_clip": 1.48766971, "balance_loss_mlp": 1.09425521, "epoch": 0.05801893882459041, "flos": 18086846803200.0, "grad_norm": 4.5298790846055965, "language_loss": 0.79566801, "learning_rate": 3.99177107182976e-06, "loss": 0.82510799, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 3.13671875, "router_z_loss_mlp": 0.484375, "step": 965, "time_per_iteration": 2.9682512283325195 }, { "auxiliary_loss_clip": 0.01787567, "auxiliary_loss_mlp": 0.01136967, "balance_loss_clip": 1.48307681, "balance_loss_mlp": 1.08441949, "epoch": 0.05807906207725838, "flos": 17757852727680.0, "grad_norm": 1.9209339410999229, "language_loss": 0.8451575, "learning_rate": 3.99173574101619e-06, "loss": 0.87440282, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.52539062, "step": 966, "time_per_iteration": 3.0391228199005127 }, { "auxiliary_loss_clip": 0.0179777, "auxiliary_loss_mlp": 0.01134891, "balance_loss_clip": 1.49362993, "balance_loss_mlp": 1.0877558, "epoch": 0.058139185329926346, "flos": 18049311601920.0, "grad_norm": 2.1316567157031217, "language_loss": 0.78331923, "learning_rate": 3.9917003346758035e-06, "loss": 0.81264585, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 3.04101562, "router_z_loss_mlp": 0.47143555, "step": 967, "time_per_iteration": 2.917583703994751 }, { "auxiliary_loss_clip": 0.01491509, "auxiliary_loss_mlp": 0.01192346, "balance_loss_clip": 1.32212186, "balance_loss_mlp": 1.1654526, "epoch": 0.05819930858259432, "flos": 62391319079040.0, "grad_norm": 0.8075296042625363, "language_loss": 0.57560199, "learning_rate": 3.991664852809939e-06, "loss": 0.6024406, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.26953125, "step": 968, "time_per_iteration": 3.3976640701293945 }, { "auxiliary_loss_clip": 0.01771766, "auxiliary_loss_mlp": 0.01147993, "balance_loss_clip": 1.47534823, "balance_loss_mlp": 1.09043932, "epoch": 0.05825943183526229, "flos": 19144691665920.0, "grad_norm": 2.025823885216549, "language_loss": 0.84289056, "learning_rate": 3.991629295419945e-06, "loss": 0.87208813, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.57617188, "step": 969, "time_per_iteration": 2.9408795833587646 }, { "auxiliary_loss_clip": 0.01780061, "auxiliary_loss_mlp": 0.01154017, "balance_loss_clip": 1.4686842, "balance_loss_mlp": 1.1040448, "epoch": 0.058319555087930255, "flos": 29034222681600.0, "grad_norm": 2.091273090492494, "language_loss": 0.78792429, "learning_rate": 3.991593662507167e-06, "loss": 0.81726509, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 3.11328125, "router_z_loss_mlp": 0.5, "step": 970, "time_per_iteration": 2.9662909507751465 }, { "auxiliary_loss_clip": 0.01778524, "auxiliary_loss_mlp": 0.01143298, "balance_loss_clip": 1.47375774, "balance_loss_mlp": 1.09184694, "epoch": 0.05837967834059823, "flos": 18889274914560.0, "grad_norm": 2.607590734016365, "language_loss": 0.93898678, "learning_rate": 3.991557954072958e-06, "loss": 0.96820498, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 3.04882812, "router_z_loss_mlp": 0.51513672, "step": 971, "time_per_iteration": 2.920809745788574 }, { "auxiliary_loss_clip": 0.01756702, "auxiliary_loss_mlp": 0.01121608, "balance_loss_clip": 1.45477188, "balance_loss_mlp": 1.07094443, "epoch": 0.05843980159326619, "flos": 25714166872320.0, "grad_norm": 1.7375957140504457, "language_loss": 0.87568176, "learning_rate": 3.991522170118673e-06, "loss": 0.9044649, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.50634766, "step": 972, "time_per_iteration": 2.908933401107788 }, { "auxiliary_loss_clip": 0.01769997, "auxiliary_loss_mlp": 0.01174221, "balance_loss_clip": 1.46907532, "balance_loss_mlp": 1.12155461, "epoch": 0.058499924845934165, "flos": 25561899561600.0, "grad_norm": 2.0940785176596153, "language_loss": 0.88775885, "learning_rate": 3.991486310645667e-06, "loss": 0.91720104, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 3.0078125, "router_z_loss_mlp": 0.52685547, "step": 973, "time_per_iteration": 2.970554828643799 }, { "auxiliary_loss_clip": 0.01761172, "auxiliary_loss_mlp": 0.01160684, "balance_loss_clip": 1.46474636, "balance_loss_mlp": 1.10701609, "epoch": 0.05856004809860214, "flos": 16444998316800.0, "grad_norm": 2.0606692034576484, "language_loss": 0.76586604, "learning_rate": 3.991450375655301e-06, "loss": 0.79508466, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.53662109, "step": 974, "time_per_iteration": 2.9474990367889404 }, { "auxiliary_loss_clip": 0.01739818, "auxiliary_loss_mlp": 0.01169788, "balance_loss_clip": 1.44869041, "balance_loss_mlp": 1.11318755, "epoch": 0.0586201713512701, "flos": 39473660724480.0, "grad_norm": 1.4271591496739375, "language_loss": 0.77935803, "learning_rate": 3.991414365148936e-06, "loss": 0.8084541, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.56616211, "step": 975, "time_per_iteration": 3.0622963905334473 }, { "auxiliary_loss_clip": 0.01762792, "auxiliary_loss_mlp": 0.01149318, "balance_loss_clip": 1.45939469, "balance_loss_mlp": 1.08957052, "epoch": 0.058680294603938074, "flos": 23374939996800.0, "grad_norm": 2.47163363219876, "language_loss": 0.79049522, "learning_rate": 3.99137827912794e-06, "loss": 0.81961632, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.59765625, "step": 976, "time_per_iteration": 2.970222234725952 }, { "auxiliary_loss_clip": 0.01746409, "auxiliary_loss_mlp": 0.01148405, "balance_loss_clip": 1.45020735, "balance_loss_mlp": 1.09118426, "epoch": 0.05874041785660604, "flos": 32244297085440.0, "grad_norm": 1.797441877631057, "language_loss": 0.88619471, "learning_rate": 3.991342117593679e-06, "loss": 0.91514283, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 2.9609375, "router_z_loss_mlp": 0.57202148, "step": 977, "time_per_iteration": 2.960991621017456 }, { "auxiliary_loss_clip": 0.01758621, "auxiliary_loss_mlp": 0.01144513, "balance_loss_clip": 1.45872498, "balance_loss_mlp": 1.08827043, "epoch": 0.05880054110927401, "flos": 22320307514880.0, "grad_norm": 1.4908016473801688, "language_loss": 0.81136203, "learning_rate": 3.991305880547527e-06, "loss": 0.84039336, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.56176758, "step": 978, "time_per_iteration": 2.9335594177246094 }, { "auxiliary_loss_clip": 0.01777462, "auxiliary_loss_mlp": 0.01131803, "balance_loss_clip": 1.47058034, "balance_loss_mlp": 1.07243681, "epoch": 0.05886066436194198, "flos": 27391740768000.0, "grad_norm": 1.9355336357238397, "language_loss": 0.82057917, "learning_rate": 3.991269567990855e-06, "loss": 0.84967184, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.59399414, "step": 979, "time_per_iteration": 2.945584297180176 }, { "auxiliary_loss_clip": 0.01473931, "auxiliary_loss_mlp": 0.0113949, "balance_loss_clip": 1.30363488, "balance_loss_mlp": 1.06968153, "epoch": 0.05892078761460995, "flos": 59610266300160.0, "grad_norm": 0.9615661736269862, "language_loss": 0.59125507, "learning_rate": 3.9912331799250415e-06, "loss": 0.61738932, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.69921875, "step": 980, "time_per_iteration": 3.2757680416107178 }, { "auxiliary_loss_clip": 0.0174837, "auxiliary_loss_mlp": 0.01112781, "balance_loss_clip": 1.45818305, "balance_loss_mlp": 1.05615628, "epoch": 0.05898091086727792, "flos": 15422516904960.0, "grad_norm": 2.7763120702718154, "language_loss": 0.88997209, "learning_rate": 3.9911967163514665e-06, "loss": 0.91858363, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.56713867, "step": 981, "time_per_iteration": 2.8843767642974854 }, { "auxiliary_loss_clip": 0.0178376, "auxiliary_loss_mlp": 0.01113488, "balance_loss_clip": 1.48050678, "balance_loss_mlp": 1.05989099, "epoch": 0.059041034119945886, "flos": 23664724813440.0, "grad_norm": 1.9114560187327032, "language_loss": 0.81018984, "learning_rate": 3.991160177271513e-06, "loss": 0.83916235, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 3.02929688, "router_z_loss_mlp": 0.53564453, "step": 982, "time_per_iteration": 2.9167864322662354 }, { "auxiliary_loss_clip": 0.01798443, "auxiliary_loss_mlp": 0.0111861, "balance_loss_clip": 1.48551524, "balance_loss_mlp": 1.06518006, "epoch": 0.05910115737261386, "flos": 24764855581440.0, "grad_norm": 2.2682188939168757, "language_loss": 0.87547767, "learning_rate": 3.9911235626865654e-06, "loss": 0.90464818, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.53442383, "step": 983, "time_per_iteration": 2.9428207874298096 }, { "auxiliary_loss_clip": 0.0175733, "auxiliary_loss_mlp": 0.01118514, "balance_loss_clip": 1.46027482, "balance_loss_mlp": 1.06615734, "epoch": 0.05916128062528183, "flos": 11736158042880.0, "grad_norm": 2.303648113199355, "language_loss": 0.86889964, "learning_rate": 3.9910868725980125e-06, "loss": 0.89765811, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 0.52392578, "step": 984, "time_per_iteration": 2.8902368545532227 }, { "auxiliary_loss_clip": 0.01741363, "auxiliary_loss_mlp": 0.01096622, "balance_loss_clip": 1.44815087, "balance_loss_mlp": 1.05046427, "epoch": 0.059221403877949795, "flos": 21911673312000.0, "grad_norm": 2.175059402342379, "language_loss": 0.79157126, "learning_rate": 3.9910501070072465e-06, "loss": 0.81995112, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 2.93164062, "router_z_loss_mlp": 0.46166992, "step": 985, "time_per_iteration": 2.893165111541748 }, { "auxiliary_loss_clip": 0.01771611, "auxiliary_loss_mlp": 0.01130213, "balance_loss_clip": 1.46997595, "balance_loss_mlp": 1.08338761, "epoch": 0.05928152713061777, "flos": 20522391154560.0, "grad_norm": 2.5719379370295283, "language_loss": 0.92309588, "learning_rate": 3.991013265915661e-06, "loss": 0.95211416, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.46826172, "step": 986, "time_per_iteration": 2.8692374229431152 }, { "auxiliary_loss_clip": 0.01772994, "auxiliary_loss_mlp": 0.01120556, "balance_loss_clip": 1.46565008, "balance_loss_mlp": 1.07280087, "epoch": 0.05934165038328574, "flos": 24504914350080.0, "grad_norm": 1.920348175342391, "language_loss": 0.78143734, "learning_rate": 3.9909763493246525e-06, "loss": 0.81037283, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 3.07226562, "router_z_loss_mlp": 0.4777832, "step": 987, "time_per_iteration": 2.957395553588867 }, { "auxiliary_loss_clip": 0.01784216, "auxiliary_loss_mlp": 0.01133454, "balance_loss_clip": 1.47021174, "balance_loss_mlp": 1.08212233, "epoch": 0.059401773635953704, "flos": 38742411962880.0, "grad_norm": 2.426251495929481, "language_loss": 0.73397893, "learning_rate": 3.990939357235621e-06, "loss": 0.76315564, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 3.14257812, "router_z_loss_mlp": 0.51318359, "step": 988, "time_per_iteration": 3.044674873352051 }, { "auxiliary_loss_clip": 0.01464726, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.30212998, "balance_loss_mlp": 1.08759105, "epoch": 0.059461896888621676, "flos": 58050998853120.0, "grad_norm": 0.9488903398449772, "language_loss": 0.71248472, "learning_rate": 3.99090228964997e-06, "loss": 0.73827684, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.26953125, "step": 989, "time_per_iteration": 4.650926351547241 }, { "auxiliary_loss_clip": 0.01780717, "auxiliary_loss_mlp": 0.01124669, "balance_loss_clip": 1.47023439, "balance_loss_mlp": 1.07660389, "epoch": 0.05952202014128964, "flos": 22138015639680.0, "grad_norm": 6.264074962202939, "language_loss": 0.82071888, "learning_rate": 3.990865146569105e-06, "loss": 0.84977275, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 3.10546875, "router_z_loss_mlp": 0.48071289, "step": 990, "time_per_iteration": 2.8928844928741455 }, { "auxiliary_loss_clip": 0.01749238, "auxiliary_loss_mlp": 0.01112948, "balance_loss_clip": 1.450845, "balance_loss_mlp": 1.06235552, "epoch": 0.059582143393957614, "flos": 20454650409600.0, "grad_norm": 1.946403534793412, "language_loss": 0.87365746, "learning_rate": 3.990827927994434e-06, "loss": 0.90227932, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 2.984375, "router_z_loss_mlp": 0.50585938, "step": 991, "time_per_iteration": 2.923243761062622 }, { "auxiliary_loss_clip": 0.01785655, "auxiliary_loss_mlp": 0.01115336, "balance_loss_clip": 1.47416401, "balance_loss_mlp": 1.06584024, "epoch": 0.059642266646625586, "flos": 20604610235520.0, "grad_norm": 4.194706136498649, "language_loss": 0.79371905, "learning_rate": 3.9907906339273674e-06, "loss": 0.82272893, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 3.1171875, "router_z_loss_mlp": 0.49487305, "step": 992, "time_per_iteration": 2.866121292114258 }, { "auxiliary_loss_clip": 0.01788712, "auxiliary_loss_mlp": 0.01116158, "balance_loss_clip": 1.48039627, "balance_loss_mlp": 1.06866503, "epoch": 0.05970238989929355, "flos": 19361623075200.0, "grad_norm": 2.6581920028364223, "language_loss": 0.76972079, "learning_rate": 3.9907532643693215e-06, "loss": 0.79876947, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 3.08398438, "router_z_loss_mlp": 0.4753418, "step": 993, "time_per_iteration": 4.399053573608398 }, { "auxiliary_loss_clip": 0.01768839, "auxiliary_loss_mlp": 0.0113828, "balance_loss_clip": 1.46545231, "balance_loss_mlp": 1.08744895, "epoch": 0.05976251315196152, "flos": 30276395435520.0, "grad_norm": 2.8629526542588923, "language_loss": 0.81012928, "learning_rate": 3.990715819321712e-06, "loss": 0.83920044, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 3.03125, "router_z_loss_mlp": 0.50830078, "step": 994, "time_per_iteration": 5.7542078495025635 }, { "auxiliary_loss_clip": 0.01764486, "auxiliary_loss_mlp": 0.01125745, "balance_loss_clip": 1.45731437, "balance_loss_mlp": 1.07603514, "epoch": 0.05982263640462949, "flos": 23195498544000.0, "grad_norm": 2.3575456302053572, "language_loss": 0.8222034, "learning_rate": 3.99067829878596e-06, "loss": 0.85110569, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 3.0703125, "router_z_loss_mlp": 0.49682617, "step": 995, "time_per_iteration": 2.9191224575042725 }, { "auxiliary_loss_clip": 0.01778943, "auxiliary_loss_mlp": 0.01115622, "balance_loss_clip": 1.46941471, "balance_loss_mlp": 1.06657887, "epoch": 0.05988275965729746, "flos": 27861283751040.0, "grad_norm": 2.3387336930079283, "language_loss": 0.88894904, "learning_rate": 3.990640702763487e-06, "loss": 0.91789472, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 3.09375, "router_z_loss_mlp": 0.49047852, "step": 996, "time_per_iteration": 2.9270241260528564 }, { "auxiliary_loss_clip": 0.01768029, "auxiliary_loss_mlp": 0.01104629, "balance_loss_clip": 1.46670771, "balance_loss_mlp": 1.05088913, "epoch": 0.05994288290996543, "flos": 24690463850880.0, "grad_norm": 3.1824842774948836, "language_loss": 0.89937687, "learning_rate": 3.990603031255718e-06, "loss": 0.92810345, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.53735352, "step": 997, "time_per_iteration": 2.8931725025177 }, { "auxiliary_loss_clip": 0.01450192, "auxiliary_loss_mlp": 0.01078129, "balance_loss_clip": 1.28053331, "balance_loss_mlp": 1.03044569, "epoch": 0.0600030061626334, "flos": 69963114499200.0, "grad_norm": 1.0614858789100483, "language_loss": 0.75583935, "learning_rate": 3.990565284264083e-06, "loss": 0.78112257, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.4765625, "step": 998, "time_per_iteration": 3.4666221141815186 }, { "auxiliary_loss_clip": 0.01743604, "auxiliary_loss_mlp": 0.01098276, "balance_loss_clip": 1.45488107, "balance_loss_mlp": 1.04947162, "epoch": 0.06006312941530137, "flos": 26550917804160.0, "grad_norm": 1.7707557400643377, "language_loss": 0.77670515, "learning_rate": 3.990527461790013e-06, "loss": 0.80512393, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.48828125, "step": 999, "time_per_iteration": 2.9835479259490967 }, { "auxiliary_loss_clip": 0.01779867, "auxiliary_loss_mlp": 0.01111157, "balance_loss_clip": 1.46700263, "balance_loss_mlp": 1.05293489, "epoch": 0.060123252667969335, "flos": 27355291441920.0, "grad_norm": 1.7339375132223012, "language_loss": 0.83587921, "learning_rate": 3.990489563834943e-06, "loss": 0.86478943, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.58178711, "step": 1000, "time_per_iteration": 3.0643134117126465 }, { "auxiliary_loss_clip": 0.01783456, "auxiliary_loss_mlp": 0.01118305, "balance_loss_clip": 1.4779247, "balance_loss_mlp": 1.05302608, "epoch": 0.06018337592063731, "flos": 27028197648000.0, "grad_norm": 2.0289755495665727, "language_loss": 0.88062441, "learning_rate": 3.990451590400309e-06, "loss": 0.90964204, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.65283203, "step": 1001, "time_per_iteration": 3.078782558441162 }, { "auxiliary_loss_clip": 0.01778232, "auxiliary_loss_mlp": 0.01127018, "balance_loss_clip": 1.4771452, "balance_loss_mlp": 1.06932116, "epoch": 0.06024349917330528, "flos": 25603868753280.0, "grad_norm": 3.424867566705001, "language_loss": 0.75499094, "learning_rate": 3.990413541487551e-06, "loss": 0.78404349, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 3.01171875, "router_z_loss_mlp": 0.57714844, "step": 1002, "time_per_iteration": 2.8902413845062256 }, { "auxiliary_loss_clip": 0.01783125, "auxiliary_loss_mlp": 0.01105495, "balance_loss_clip": 1.47705889, "balance_loss_mlp": 1.05311489, "epoch": 0.060303622425973244, "flos": 26142509825280.0, "grad_norm": 2.5630817683705303, "language_loss": 0.7803756, "learning_rate": 3.990375417098112e-06, "loss": 0.8092618, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 3.0625, "router_z_loss_mlp": 0.52392578, "step": 1003, "time_per_iteration": 2.942902088165283 }, { "auxiliary_loss_clip": 0.01777849, "auxiliary_loss_mlp": 0.01111834, "balance_loss_clip": 1.46847713, "balance_loss_mlp": 1.05761766, "epoch": 0.060363745678641216, "flos": 20387181133440.0, "grad_norm": 2.9149342127984106, "language_loss": 0.71679455, "learning_rate": 3.990337217233437e-06, "loss": 0.74569136, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 3.09570312, "router_z_loss_mlp": 0.54223633, "step": 1004, "time_per_iteration": 2.866318464279175 }, { "auxiliary_loss_clip": 0.01800068, "auxiliary_loss_mlp": 0.01120875, "balance_loss_clip": 1.48791361, "balance_loss_mlp": 1.06556225, "epoch": 0.06042386893130918, "flos": 17758214686080.0, "grad_norm": 2.236491318712935, "language_loss": 0.8563534, "learning_rate": 3.990298941894976e-06, "loss": 0.8855629, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 3.11914062, "router_z_loss_mlp": 0.5534668, "step": 1005, "time_per_iteration": 2.8661811351776123 }, { "auxiliary_loss_clip": 0.01471201, "auxiliary_loss_mlp": 0.01248823, "balance_loss_clip": 1.29227734, "balance_loss_mlp": 1.16451824, "epoch": 0.06048399218397715, "flos": 68570484226560.0, "grad_norm": 0.9328488173804859, "language_loss": 0.59108686, "learning_rate": 3.9902605910841794e-06, "loss": 0.61828709, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.84375, "step": 1006, "time_per_iteration": 3.4758994579315186 }, { "auxiliary_loss_clip": 0.01764285, "auxiliary_loss_mlp": 0.01119911, "balance_loss_clip": 1.45837474, "balance_loss_mlp": 1.06555188, "epoch": 0.060544115436645125, "flos": 23268985378560.0, "grad_norm": 1.9734780088783437, "language_loss": 0.76431131, "learning_rate": 3.990222164802503e-06, "loss": 0.79315323, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.54418945, "step": 1007, "time_per_iteration": 2.9659454822540283 }, { "auxiliary_loss_clip": 0.01772858, "auxiliary_loss_mlp": 0.01122475, "balance_loss_clip": 1.46641684, "balance_loss_mlp": 1.07052374, "epoch": 0.06060423868931309, "flos": 23888985880320.0, "grad_norm": 1.8553294946977932, "language_loss": 0.82669604, "learning_rate": 3.9901836630514006e-06, "loss": 0.85564941, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.51977539, "step": 1008, "time_per_iteration": 2.9513726234436035 }, { "auxiliary_loss_clip": 0.01772593, "auxiliary_loss_mlp": 0.01110892, "balance_loss_clip": 1.4744153, "balance_loss_mlp": 1.06182516, "epoch": 0.06066436194198106, "flos": 18735740749440.0, "grad_norm": 2.081513422032007, "language_loss": 0.80450046, "learning_rate": 3.990145085832335e-06, "loss": 0.83333534, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.49145508, "step": 1009, "time_per_iteration": 2.9590160846710205 }, { "auxiliary_loss_clip": 0.01756764, "auxiliary_loss_mlp": 0.01092227, "balance_loss_clip": 1.46559119, "balance_loss_mlp": 1.04251635, "epoch": 0.06072448519464903, "flos": 24650123472000.0, "grad_norm": 1.7450216860543388, "language_loss": 0.94533348, "learning_rate": 3.990106433146769e-06, "loss": 0.97382343, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.49682617, "step": 1010, "time_per_iteration": 2.922691822052002 }, { "auxiliary_loss_clip": 0.01802396, "auxiliary_loss_mlp": 0.01101886, "balance_loss_clip": 1.48367071, "balance_loss_mlp": 1.04719305, "epoch": 0.060784608447317, "flos": 17386346522880.0, "grad_norm": 2.5556733651468937, "language_loss": 0.74273312, "learning_rate": 3.9900677049961665e-06, "loss": 0.77177596, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 3.18554688, "router_z_loss_mlp": 0.54663086, "step": 1011, "time_per_iteration": 3.039433717727661 }, { "auxiliary_loss_clip": 0.0177328, "auxiliary_loss_mlp": 0.01116799, "balance_loss_clip": 1.47163677, "balance_loss_mlp": 1.06255841, "epoch": 0.06084473169998497, "flos": 23701717077120.0, "grad_norm": 1.8633959632349348, "language_loss": 0.88229191, "learning_rate": 3.990028901381999e-06, "loss": 0.91119266, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 3.01757812, "router_z_loss_mlp": 0.54174805, "step": 1012, "time_per_iteration": 3.075021743774414 }, { "auxiliary_loss_clip": 0.01764553, "auxiliary_loss_mlp": 0.0111357, "balance_loss_clip": 1.46178496, "balance_loss_mlp": 1.05665922, "epoch": 0.06090485495265294, "flos": 23555829283200.0, "grad_norm": 3.4309493970906235, "language_loss": 0.79051965, "learning_rate": 3.989990022305734e-06, "loss": 0.81930089, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.56933594, "step": 1013, "time_per_iteration": 3.0801842212677 }, { "auxiliary_loss_clip": 0.01785438, "auxiliary_loss_mlp": 0.01117131, "balance_loss_clip": 1.47955787, "balance_loss_mlp": 1.06262898, "epoch": 0.06096497820532091, "flos": 20348922015360.0, "grad_norm": 3.132082957752681, "language_loss": 0.87760311, "learning_rate": 3.98995106776885e-06, "loss": 0.90662885, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 3.05859375, "router_z_loss_mlp": 0.54541016, "step": 1014, "time_per_iteration": 2.954566717147827 }, { "auxiliary_loss_clip": 0.01787772, "auxiliary_loss_mlp": 0.01097586, "balance_loss_clip": 1.47414505, "balance_loss_mlp": 1.04453802, "epoch": 0.061025101457988874, "flos": 26949281437440.0, "grad_norm": 2.8504941829538786, "language_loss": 0.76601887, "learning_rate": 3.98991203777282e-06, "loss": 0.7948724, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 3.13085938, "router_z_loss_mlp": 0.53051758, "step": 1015, "time_per_iteration": 2.9903922080993652 }, { "auxiliary_loss_clip": 0.017505, "auxiliary_loss_mlp": 0.01109162, "balance_loss_clip": 1.45743346, "balance_loss_mlp": 1.06145489, "epoch": 0.061085224710656846, "flos": 25386168182400.0, "grad_norm": 1.6699326971956314, "language_loss": 0.80578256, "learning_rate": 3.9898729323191275e-06, "loss": 0.8343792, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.47705078, "step": 1016, "time_per_iteration": 3.0523488521575928 }, { "auxiliary_loss_clip": 0.0177542, "auxiliary_loss_mlp": 0.01090713, "balance_loss_clip": 1.4750495, "balance_loss_mlp": 1.03783226, "epoch": 0.06114534796332482, "flos": 24834949056000.0, "grad_norm": 1.580675696355111, "language_loss": 0.77814472, "learning_rate": 3.989833751409254e-06, "loss": 0.80680609, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.52905273, "step": 1017, "time_per_iteration": 2.9541842937469482 }, { "auxiliary_loss_clip": 0.01784733, "auxiliary_loss_mlp": 0.01119936, "balance_loss_clip": 1.47203934, "balance_loss_mlp": 1.06767416, "epoch": 0.061205471215992784, "flos": 20641195296000.0, "grad_norm": 2.3501197110012293, "language_loss": 0.88139933, "learning_rate": 3.989794495044685e-06, "loss": 0.91044605, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 3.12695312, "router_z_loss_mlp": 0.5222168, "step": 1018, "time_per_iteration": 2.9190120697021484 }, { "auxiliary_loss_clip": 0.01762296, "auxiliary_loss_mlp": 0.01110537, "balance_loss_clip": 1.46461296, "balance_loss_mlp": 1.0611372, "epoch": 0.061265594468660756, "flos": 16516992072960.0, "grad_norm": 3.210226008132924, "language_loss": 0.81643718, "learning_rate": 3.989755163226909e-06, "loss": 0.84516555, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 2.9765625, "router_z_loss_mlp": 0.49389648, "step": 1019, "time_per_iteration": 2.9377410411834717 }, { "auxiliary_loss_clip": 0.0175308, "auxiliary_loss_mlp": 0.01108285, "balance_loss_clip": 1.45773196, "balance_loss_mlp": 1.05683446, "epoch": 0.06132571772132872, "flos": 26256382283520.0, "grad_norm": 1.6369066684028653, "language_loss": 0.85087967, "learning_rate": 3.989715755957418e-06, "loss": 0.87949336, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.5144043, "step": 1020, "time_per_iteration": 2.9024717807769775 }, { "auxiliary_loss_clip": 0.01758379, "auxiliary_loss_mlp": 0.01112267, "balance_loss_clip": 1.4619689, "balance_loss_mlp": 1.06162727, "epoch": 0.06138584097399669, "flos": 37428878880000.0, "grad_norm": 2.172716191146108, "language_loss": 0.80054629, "learning_rate": 3.989676273237705e-06, "loss": 0.82925272, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 2.96484375, "router_z_loss_mlp": 0.50634766, "step": 1021, "time_per_iteration": 3.0067856311798096 }, { "auxiliary_loss_clip": 0.01734307, "auxiliary_loss_mlp": 0.0110066, "balance_loss_clip": 1.44287395, "balance_loss_mlp": 1.0499723, "epoch": 0.061445964226664665, "flos": 17429582568960.0, "grad_norm": 1.9900527885747565, "language_loss": 0.90149105, "learning_rate": 3.9896367150692705e-06, "loss": 0.92984062, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.50732422, "step": 1022, "time_per_iteration": 2.8270459175109863 }, { "auxiliary_loss_clip": 0.01727234, "auxiliary_loss_mlp": 0.01100139, "balance_loss_clip": 1.43986511, "balance_loss_mlp": 1.04818761, "epoch": 0.06150608747933263, "flos": 22609866107520.0, "grad_norm": 2.465964295804433, "language_loss": 0.8456322, "learning_rate": 3.989597081453611e-06, "loss": 0.87390596, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.51977539, "step": 1023, "time_per_iteration": 2.908308506011963 }, { "auxiliary_loss_clip": 0.01458392, "auxiliary_loss_mlp": 0.01106745, "balance_loss_clip": 1.28580499, "balance_loss_mlp": 1.06325746, "epoch": 0.0615662107320006, "flos": 56767264110720.0, "grad_norm": 0.9412958094999629, "language_loss": 0.65272641, "learning_rate": 3.989557372392231e-06, "loss": 0.67837775, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.43554688, "step": 1024, "time_per_iteration": 4.7813379764556885 }, { "auxiliary_loss_clip": 0.01762097, "auxiliary_loss_mlp": 0.01113161, "balance_loss_clip": 1.45964742, "balance_loss_mlp": 1.06311727, "epoch": 0.06162633398466857, "flos": 22574955104640.0, "grad_norm": 2.338938560779928, "language_loss": 0.89807844, "learning_rate": 3.989517587886636e-06, "loss": 0.92683101, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.5, "step": 1025, "time_per_iteration": 2.8680503368377686 }, { "auxiliary_loss_clip": 0.0175568, "auxiliary_loss_mlp": 0.01096299, "balance_loss_clip": 1.4578923, "balance_loss_mlp": 1.04809117, "epoch": 0.06168645723733654, "flos": 25604049732480.0, "grad_norm": 2.5534052411207275, "language_loss": 0.85558999, "learning_rate": 3.989477727938335e-06, "loss": 0.88410985, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 2.97851562, "router_z_loss_mlp": 0.48144531, "step": 1026, "time_per_iteration": 3.0151915550231934 }, { "auxiliary_loss_clip": 0.01766933, "auxiliary_loss_mlp": 0.01093902, "balance_loss_clip": 1.46521711, "balance_loss_mlp": 1.04605103, "epoch": 0.06174658049000451, "flos": 16006520528640.0, "grad_norm": 4.233212544919618, "language_loss": 0.83724552, "learning_rate": 3.989437792548839e-06, "loss": 0.86585391, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 3.015625, "router_z_loss_mlp": 0.47875977, "step": 1027, "time_per_iteration": 2.9180421829223633 }, { "auxiliary_loss_clip": 0.01741351, "auxiliary_loss_mlp": 0.0108745, "balance_loss_clip": 1.44877517, "balance_loss_mlp": 1.04069662, "epoch": 0.06180670374267248, "flos": 11291979409920.0, "grad_norm": 4.3802808550790795, "language_loss": 0.86246991, "learning_rate": 3.989397781719663e-06, "loss": 0.89075798, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 2.92578125, "router_z_loss_mlp": 0.46728516, "step": 1028, "time_per_iteration": 2.837451219558716 }, { "auxiliary_loss_clip": 0.01455406, "auxiliary_loss_mlp": 0.01056385, "balance_loss_clip": 1.28816664, "balance_loss_mlp": 1.01575804, "epoch": 0.06186682699534045, "flos": 65157305339520.0, "grad_norm": 1.05428391854155, "language_loss": 0.60597384, "learning_rate": 3.989357695452323e-06, "loss": 0.63109183, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.40625, "step": 1029, "time_per_iteration": 7.551038503646851 }, { "auxiliary_loss_clip": 0.0173921, "auxiliary_loss_mlp": 0.01108819, "balance_loss_clip": 1.4432013, "balance_loss_mlp": 1.05977595, "epoch": 0.061926950248008414, "flos": 21115624717440.0, "grad_norm": 1.9647172765041028, "language_loss": 0.83996975, "learning_rate": 3.98931753374834e-06, "loss": 0.86844993, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.49023438, "step": 1030, "time_per_iteration": 2.8411941528320312 }, { "auxiliary_loss_clip": 0.01750262, "auxiliary_loss_mlp": 0.0110731, "balance_loss_clip": 1.45673215, "balance_loss_mlp": 1.05929208, "epoch": 0.061987073500676386, "flos": 17757309790080.0, "grad_norm": 2.528812080894756, "language_loss": 0.82780004, "learning_rate": 3.989277296609237e-06, "loss": 0.85637575, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 2.93554688, "router_z_loss_mlp": 0.48071289, "step": 1031, "time_per_iteration": 2.912752628326416 }, { "auxiliary_loss_clip": 0.01733133, "auxiliary_loss_mlp": 0.01099299, "balance_loss_clip": 1.44583797, "balance_loss_mlp": 1.05793369, "epoch": 0.06204719675334436, "flos": 21846511520640.0, "grad_norm": 1.5592754345030688, "language_loss": 0.78595883, "learning_rate": 3.98923698403654e-06, "loss": 0.81428319, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.41357422, "step": 1032, "time_per_iteration": 2.8701016902923584 }, { "auxiliary_loss_clip": 0.01761686, "auxiliary_loss_mlp": 0.01121842, "balance_loss_clip": 1.46452475, "balance_loss_mlp": 1.07263207, "epoch": 0.06210732000601232, "flos": 19362935174400.0, "grad_norm": 1.8803227839005745, "language_loss": 0.91664344, "learning_rate": 3.989196596031776e-06, "loss": 0.94547874, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.49243164, "step": 1033, "time_per_iteration": 2.859041929244995 }, { "auxiliary_loss_clip": 0.01735547, "auxiliary_loss_mlp": 0.01116456, "balance_loss_clip": 1.44652939, "balance_loss_mlp": 1.07521009, "epoch": 0.062167443258680295, "flos": 24758928512640.0, "grad_norm": 1.8834403006290097, "language_loss": 0.86785245, "learning_rate": 3.989156132596479e-06, "loss": 0.89637244, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.41210938, "step": 1034, "time_per_iteration": 2.9061217308044434 }, { "auxiliary_loss_clip": 0.01717429, "auxiliary_loss_mlp": 0.01121075, "balance_loss_clip": 1.44049525, "balance_loss_mlp": 1.07911372, "epoch": 0.06222756651134827, "flos": 34471551784320.0, "grad_norm": 2.4027110345740725, "language_loss": 0.83155692, "learning_rate": 3.989115593732182e-06, "loss": 0.85994202, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.41967773, "step": 1035, "time_per_iteration": 2.9614744186401367 }, { "auxiliary_loss_clip": 0.01733698, "auxiliary_loss_mlp": 0.01102233, "balance_loss_clip": 1.44758916, "balance_loss_mlp": 1.06108189, "epoch": 0.06228768976401623, "flos": 25677536567040.0, "grad_norm": 2.014183682891057, "language_loss": 0.80815291, "learning_rate": 3.989074979440421e-06, "loss": 0.83651227, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.41137695, "step": 1036, "time_per_iteration": 2.9048259258270264 }, { "auxiliary_loss_clip": 0.01727459, "auxiliary_loss_mlp": 0.01105266, "balance_loss_clip": 1.44422376, "balance_loss_mlp": 1.06413913, "epoch": 0.062347813016684205, "flos": 25305034976640.0, "grad_norm": 2.289986129438155, "language_loss": 0.87771559, "learning_rate": 3.989034289722739e-06, "loss": 0.90604287, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.41113281, "step": 1037, "time_per_iteration": 2.9057538509368896 }, { "auxiliary_loss_clip": 0.01740674, "auxiliary_loss_mlp": 0.01104176, "balance_loss_clip": 1.45432663, "balance_loss_mlp": 1.06495619, "epoch": 0.06240793626935217, "flos": 26918442466560.0, "grad_norm": 2.17218124273449, "language_loss": 0.82448542, "learning_rate": 3.988993524580676e-06, "loss": 0.85293388, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.39208984, "step": 1038, "time_per_iteration": 2.9137020111083984 }, { "auxiliary_loss_clip": 0.01735794, "auxiliary_loss_mlp": 0.01125473, "balance_loss_clip": 1.45549679, "balance_loss_mlp": 1.08396459, "epoch": 0.06246805952202014, "flos": 21625598568960.0, "grad_norm": 1.7732554016872213, "language_loss": 0.87295985, "learning_rate": 3.98895268401578e-06, "loss": 0.90157253, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.4152832, "step": 1039, "time_per_iteration": 2.888058662414551 }, { "auxiliary_loss_clip": 0.01757381, "auxiliary_loss_mlp": 0.01105729, "balance_loss_clip": 1.46479321, "balance_loss_mlp": 1.06402981, "epoch": 0.0625281827746881, "flos": 19319744373120.0, "grad_norm": 1.9248844862207504, "language_loss": 0.82382154, "learning_rate": 3.9889117680296e-06, "loss": 0.85245264, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.41674805, "step": 1040, "time_per_iteration": 2.921339511871338 }, { "auxiliary_loss_clip": 0.01768295, "auxiliary_loss_mlp": 0.01120702, "balance_loss_clip": 1.47491086, "balance_loss_mlp": 1.07962286, "epoch": 0.06258830602735609, "flos": 27757274659200.0, "grad_norm": 2.3323472699951933, "language_loss": 0.71318835, "learning_rate": 3.988870776623685e-06, "loss": 0.7420783, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.41064453, "step": 1041, "time_per_iteration": 2.9496092796325684 }, { "auxiliary_loss_clip": 0.01745431, "auxiliary_loss_mlp": 0.01103614, "balance_loss_clip": 1.45403087, "balance_loss_mlp": 1.06122315, "epoch": 0.06264842928002405, "flos": 23233214724480.0, "grad_norm": 3.9091823612043366, "language_loss": 0.82761753, "learning_rate": 3.9888297097995905e-06, "loss": 0.85610801, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.42407227, "step": 1042, "time_per_iteration": 2.8704757690429688 }, { "auxiliary_loss_clip": 0.01760614, "auxiliary_loss_mlp": 0.01108625, "balance_loss_clip": 1.47254574, "balance_loss_mlp": 1.06900001, "epoch": 0.06270855253269202, "flos": 38413734600960.0, "grad_norm": 1.6159776810035342, "language_loss": 0.79290557, "learning_rate": 3.988788567558874e-06, "loss": 0.82159793, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 2.8828125, "router_z_loss_mlp": 0.39624023, "step": 1043, "time_per_iteration": 3.192606210708618 }, { "auxiliary_loss_clip": 0.01737305, "auxiliary_loss_mlp": 0.01084531, "balance_loss_clip": 1.45415425, "balance_loss_mlp": 1.04452479, "epoch": 0.06276867578535998, "flos": 22463209152000.0, "grad_norm": 2.1038082376414335, "language_loss": 0.94099969, "learning_rate": 3.988747349903097e-06, "loss": 0.96921802, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.40039062, "step": 1044, "time_per_iteration": 2.956242561340332 }, { "auxiliary_loss_clip": 0.01762031, "auxiliary_loss_mlp": 0.01107552, "balance_loss_clip": 1.47193527, "balance_loss_mlp": 1.06342125, "epoch": 0.06282879903802796, "flos": 22940896199040.0, "grad_norm": 3.299193864973366, "language_loss": 0.86993623, "learning_rate": 3.988706056833821e-06, "loss": 0.89863205, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.44140625, "step": 1045, "time_per_iteration": 3.019695281982422 }, { "auxiliary_loss_clip": 0.01755911, "auxiliary_loss_mlp": 0.01104295, "balance_loss_clip": 1.47076941, "balance_loss_mlp": 1.06450295, "epoch": 0.06288892229069593, "flos": 34831339585920.0, "grad_norm": 1.8370873263421879, "language_loss": 0.80741465, "learning_rate": 3.9886646883526125e-06, "loss": 0.83601665, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.39819336, "step": 1046, "time_per_iteration": 3.028822422027588 }, { "auxiliary_loss_clip": 0.01767714, "auxiliary_loss_mlp": 0.01108284, "balance_loss_clip": 1.47247636, "balance_loss_mlp": 1.06980324, "epoch": 0.06294904554336389, "flos": 19436512498560.0, "grad_norm": 1.9217036854499492, "language_loss": 0.79487407, "learning_rate": 3.988623244461039e-06, "loss": 0.82363403, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.38500977, "step": 1047, "time_per_iteration": 2.847055673599243 }, { "auxiliary_loss_clip": 0.01804642, "auxiliary_loss_mlp": 0.01102612, "balance_loss_clip": 1.49930358, "balance_loss_mlp": 1.05929172, "epoch": 0.06300916879603187, "flos": 40676759953920.0, "grad_norm": 2.2964380453478643, "language_loss": 0.79250073, "learning_rate": 3.988581725160672e-06, "loss": 0.82157326, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 3.05664062, "router_z_loss_mlp": 0.43286133, "step": 1048, "time_per_iteration": 3.108105421066284 }, { "auxiliary_loss_clip": 0.01776605, "auxiliary_loss_mlp": 0.01093656, "balance_loss_clip": 1.48267186, "balance_loss_mlp": 1.04973984, "epoch": 0.06306929204869983, "flos": 23814322680960.0, "grad_norm": 2.3495273603732727, "language_loss": 0.79698145, "learning_rate": 3.988540130453087e-06, "loss": 0.82568407, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.43896484, "step": 1049, "time_per_iteration": 2.916186809539795 }, { "auxiliary_loss_clip": 0.01775529, "auxiliary_loss_mlp": 0.01084095, "balance_loss_clip": 1.48018289, "balance_loss_mlp": 1.0391531, "epoch": 0.0631294153013678, "flos": 18924909834240.0, "grad_norm": 1.9284429166849173, "language_loss": 0.84785175, "learning_rate": 3.988498460339862e-06, "loss": 0.87644798, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 2.95507812, "router_z_loss_mlp": 0.44921875, "step": 1050, "time_per_iteration": 2.903005599975586 }, { "auxiliary_loss_clip": 0.01764088, "auxiliary_loss_mlp": 0.01089241, "balance_loss_clip": 1.48199725, "balance_loss_mlp": 1.04933023, "epoch": 0.06318953855403578, "flos": 24290923852800.0, "grad_norm": 2.9277066596991337, "language_loss": 0.79522878, "learning_rate": 3.988456714822575e-06, "loss": 0.82376212, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.39892578, "step": 1051, "time_per_iteration": 2.9847066402435303 }, { "auxiliary_loss_clip": 0.01787204, "auxiliary_loss_mlp": 0.01098907, "balance_loss_clip": 1.49109316, "balance_loss_mlp": 1.05754137, "epoch": 0.06324966180670374, "flos": 22539229695360.0, "grad_norm": 2.9056891946360137, "language_loss": 0.82186389, "learning_rate": 3.98841489390281e-06, "loss": 0.85072505, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.41333008, "step": 1052, "time_per_iteration": 2.92612361907959 }, { "auxiliary_loss_clip": 0.01771935, "auxiliary_loss_mlp": 0.01103932, "balance_loss_clip": 1.4738822, "balance_loss_mlp": 1.06247079, "epoch": 0.06330978505937171, "flos": 15786421983360.0, "grad_norm": 2.0808861890616317, "language_loss": 0.80976057, "learning_rate": 3.988372997582155e-06, "loss": 0.83851916, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.41479492, "step": 1053, "time_per_iteration": 2.9314537048339844 }, { "auxiliary_loss_clip": 0.01755651, "auxiliary_loss_mlp": 0.01089254, "balance_loss_clip": 1.46913409, "balance_loss_mlp": 1.04579055, "epoch": 0.06336990831203967, "flos": 21481203853440.0, "grad_norm": 1.7863040620360378, "language_loss": 0.86004102, "learning_rate": 3.988331025862195e-06, "loss": 0.88849002, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.43457031, "step": 1054, "time_per_iteration": 2.9013099670410156 }, { "auxiliary_loss_clip": 0.01757754, "auxiliary_loss_mlp": 0.01107937, "balance_loss_clip": 1.46588516, "balance_loss_mlp": 1.06709647, "epoch": 0.06343003156470765, "flos": 18488106103680.0, "grad_norm": 1.8192064518483613, "language_loss": 0.86988091, "learning_rate": 3.9882889787445225e-06, "loss": 0.89853787, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.40844727, "step": 1055, "time_per_iteration": 2.931546211242676 }, { "auxiliary_loss_clip": 0.0178881, "auxiliary_loss_mlp": 0.01104719, "balance_loss_clip": 1.4807725, "balance_loss_mlp": 1.0607065, "epoch": 0.06349015481737562, "flos": 25165617189120.0, "grad_norm": 2.6190387645977924, "language_loss": 0.83905435, "learning_rate": 3.988246856230734e-06, "loss": 0.86798966, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 3.08203125, "router_z_loss_mlp": 0.44018555, "step": 1056, "time_per_iteration": 2.962200164794922 }, { "auxiliary_loss_clip": 0.01787566, "auxiliary_loss_mlp": 0.01111884, "balance_loss_clip": 1.47861218, "balance_loss_mlp": 1.06029046, "epoch": 0.06355027807004358, "flos": 26883486218880.0, "grad_norm": 2.153540532384241, "language_loss": 0.82990301, "learning_rate": 3.988204658322426e-06, "loss": 0.85889751, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 3.08984375, "router_z_loss_mlp": 0.51611328, "step": 1057, "time_per_iteration": 2.947171926498413 }, { "auxiliary_loss_clip": 0.01722677, "auxiliary_loss_mlp": 0.01089327, "balance_loss_clip": 1.44525552, "balance_loss_mlp": 1.04500484, "epoch": 0.06361040132271156, "flos": 21406404919680.0, "grad_norm": 1.839197615920782, "language_loss": 0.84954029, "learning_rate": 3.988162385021196e-06, "loss": 0.87766027, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.44311523, "step": 1058, "time_per_iteration": 2.9516613483428955 }, { "auxiliary_loss_clip": 0.01749457, "auxiliary_loss_mlp": 0.01117299, "balance_loss_clip": 1.45961094, "balance_loss_mlp": 1.07214236, "epoch": 0.06367052457537953, "flos": 25743377030400.0, "grad_norm": 1.855521302230082, "language_loss": 0.88974363, "learning_rate": 3.988120036328651e-06, "loss": 0.9184112, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.45141602, "step": 1059, "time_per_iteration": 4.368882656097412 }, { "auxiliary_loss_clip": 0.01752919, "auxiliary_loss_mlp": 0.01106652, "balance_loss_clip": 1.46077883, "balance_loss_mlp": 1.06070864, "epoch": 0.0637306478280475, "flos": 17638279424640.0, "grad_norm": 1.887208650155512, "language_loss": 0.9380244, "learning_rate": 3.988077612246394e-06, "loss": 0.96662015, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.45922852, "step": 1060, "time_per_iteration": 2.869223117828369 }, { "auxiliary_loss_clip": 0.01723478, "auxiliary_loss_mlp": 0.01097943, "balance_loss_clip": 1.44218194, "balance_loss_mlp": 1.05357313, "epoch": 0.06379077108071547, "flos": 13670551278720.0, "grad_norm": 2.4857569845775047, "language_loss": 0.89936805, "learning_rate": 3.988035112776035e-06, "loss": 0.92758238, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.44384766, "step": 1061, "time_per_iteration": 2.9586708545684814 }, { "auxiliary_loss_clip": 0.01744387, "auxiliary_loss_mlp": 0.01129532, "balance_loss_clip": 1.44540977, "balance_loss_mlp": 1.08041799, "epoch": 0.06385089433338344, "flos": 28491419088000.0, "grad_norm": 2.485194966339801, "language_loss": 0.79436672, "learning_rate": 3.987992537919185e-06, "loss": 0.82310593, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 2.98828125, "router_z_loss_mlp": 0.49145508, "step": 1062, "time_per_iteration": 2.979839563369751 }, { "auxiliary_loss_clip": 0.01743715, "auxiliary_loss_mlp": 0.0110472, "balance_loss_clip": 1.45320678, "balance_loss_mlp": 1.06230509, "epoch": 0.0639110175860514, "flos": 24320360234880.0, "grad_norm": 2.225872801233063, "language_loss": 0.88141608, "learning_rate": 3.987949887677459e-06, "loss": 0.90990037, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 2.90429688, "router_z_loss_mlp": 0.42431641, "step": 1063, "time_per_iteration": 2.9812557697296143 }, { "auxiliary_loss_clip": 0.01753563, "auxiliary_loss_mlp": 0.01108147, "balance_loss_clip": 1.45970643, "balance_loss_mlp": 1.06196523, "epoch": 0.06397114083871938, "flos": 22100887641600.0, "grad_norm": 1.904324705273974, "language_loss": 0.8215313, "learning_rate": 3.9879071620524744e-06, "loss": 0.85014844, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.46191406, "step": 1064, "time_per_iteration": 5.847304582595825 }, { "auxiliary_loss_clip": 0.01728762, "auxiliary_loss_mlp": 0.01119061, "balance_loss_clip": 1.44704461, "balance_loss_mlp": 1.07380879, "epoch": 0.06403126409138735, "flos": 19582174068480.0, "grad_norm": 2.4519472243780625, "language_loss": 0.86275077, "learning_rate": 3.987864361045851e-06, "loss": 0.89122903, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.45263672, "step": 1065, "time_per_iteration": 2.972856283187866 }, { "auxiliary_loss_clip": 0.01721832, "auxiliary_loss_mlp": 0.01119014, "balance_loss_clip": 1.43451214, "balance_loss_mlp": 1.0745728, "epoch": 0.06409138734405531, "flos": 40822240544640.0, "grad_norm": 1.4168105153476913, "language_loss": 0.69800854, "learning_rate": 3.987821484659211e-06, "loss": 0.72641706, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.44482422, "step": 1066, "time_per_iteration": 3.0889222621917725 }, { "auxiliary_loss_clip": 0.0172932, "auxiliary_loss_mlp": 0.01110269, "balance_loss_clip": 1.44399118, "balance_loss_mlp": 1.06866479, "epoch": 0.06415151059672328, "flos": 20449402012800.0, "grad_norm": 1.9220329987313058, "language_loss": 0.92754364, "learning_rate": 3.987778532894181e-06, "loss": 0.95593953, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.41601562, "step": 1067, "time_per_iteration": 2.939772605895996 }, { "auxiliary_loss_clip": 0.01732527, "auxiliary_loss_mlp": 0.01115119, "balance_loss_clip": 1.44641566, "balance_loss_mlp": 1.06960523, "epoch": 0.06421163384939126, "flos": 18079743369600.0, "grad_norm": 2.1195306808710255, "language_loss": 0.85973728, "learning_rate": 3.987735505752391e-06, "loss": 0.88821375, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.45507812, "step": 1068, "time_per_iteration": 2.9522650241851807 }, { "auxiliary_loss_clip": 0.01719565, "auxiliary_loss_mlp": 0.01122733, "balance_loss_clip": 1.44106495, "balance_loss_mlp": 1.07297516, "epoch": 0.06427175710205922, "flos": 25130434717440.0, "grad_norm": 2.135140385739313, "language_loss": 0.91790491, "learning_rate": 3.987692403235471e-06, "loss": 0.94632792, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.49780273, "step": 1069, "time_per_iteration": 2.981790542602539 }, { "auxiliary_loss_clip": 0.01732515, "auxiliary_loss_mlp": 0.01124218, "balance_loss_clip": 1.44390666, "balance_loss_mlp": 1.07665372, "epoch": 0.06433188035472719, "flos": 17388518273280.0, "grad_norm": 2.454605877266586, "language_loss": 0.98568225, "learning_rate": 3.987649225345056e-06, "loss": 1.01424956, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.47607422, "step": 1070, "time_per_iteration": 2.917107582092285 }, { "auxiliary_loss_clip": 0.01733819, "auxiliary_loss_mlp": 0.01104872, "balance_loss_clip": 1.44733763, "balance_loss_mlp": 1.05847549, "epoch": 0.06439200360739517, "flos": 23555738793600.0, "grad_norm": 1.6570412362268654, "language_loss": 0.89191127, "learning_rate": 3.987605972082782e-06, "loss": 0.92029816, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.46386719, "step": 1071, "time_per_iteration": 2.9443490505218506 }, { "auxiliary_loss_clip": 0.01704184, "auxiliary_loss_mlp": 0.01109491, "balance_loss_clip": 1.42826271, "balance_loss_mlp": 1.06199837, "epoch": 0.06445212686006313, "flos": 21989458402560.0, "grad_norm": 2.796421678623422, "language_loss": 0.78996003, "learning_rate": 3.987562643450292e-06, "loss": 0.81809676, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.4753418, "step": 1072, "time_per_iteration": 2.9573192596435547 }, { "auxiliary_loss_clip": 0.01725489, "auxiliary_loss_mlp": 0.01118141, "balance_loss_clip": 1.43980217, "balance_loss_mlp": 1.07036233, "epoch": 0.0645122501127311, "flos": 25932274646400.0, "grad_norm": 2.5198820194986387, "language_loss": 0.83350307, "learning_rate": 3.987519239449226e-06, "loss": 0.86193937, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.47753906, "step": 1073, "time_per_iteration": 2.967038631439209 }, { "auxiliary_loss_clip": 0.01717309, "auxiliary_loss_mlp": 0.01104622, "balance_loss_clip": 1.44258928, "balance_loss_mlp": 1.0587976, "epoch": 0.06457237336539907, "flos": 25636065068160.0, "grad_norm": 2.2892290814939225, "language_loss": 0.81935132, "learning_rate": 3.987475760081233e-06, "loss": 0.84757066, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.45874023, "step": 1074, "time_per_iteration": 3.0130085945129395 }, { "auxiliary_loss_clip": 0.0172568, "auxiliary_loss_mlp": 0.01092348, "balance_loss_clip": 1.4453876, "balance_loss_mlp": 1.04909921, "epoch": 0.06463249661806704, "flos": 19473459517440.0, "grad_norm": 4.539720651722752, "language_loss": 0.82304054, "learning_rate": 3.987432205347958e-06, "loss": 0.85122085, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.43237305, "step": 1075, "time_per_iteration": 3.0138700008392334 }, { "auxiliary_loss_clip": 0.01710781, "auxiliary_loss_mlp": 0.01099048, "balance_loss_clip": 1.43353677, "balance_loss_mlp": 1.05541778, "epoch": 0.064692619870735, "flos": 24508579178880.0, "grad_norm": 2.281583428462896, "language_loss": 0.90676308, "learning_rate": 3.987388575251055e-06, "loss": 0.93486142, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.43652344, "step": 1076, "time_per_iteration": 2.9074480533599854 }, { "auxiliary_loss_clip": 0.01724738, "auxiliary_loss_mlp": 0.01106654, "balance_loss_clip": 1.44384992, "balance_loss_mlp": 1.05951834, "epoch": 0.06475274312340297, "flos": 17027327882880.0, "grad_norm": 1.8316548306322429, "language_loss": 0.82992005, "learning_rate": 3.98734486979218e-06, "loss": 0.85823405, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.47143555, "step": 1077, "time_per_iteration": 2.9090092182159424 }, { "auxiliary_loss_clip": 0.01742388, "auxiliary_loss_mlp": 0.01105175, "balance_loss_clip": 1.44995594, "balance_loss_mlp": 1.05629945, "epoch": 0.06481286637607095, "flos": 24583287623040.0, "grad_norm": 2.002357603417955, "language_loss": 0.94002444, "learning_rate": 3.987301088972986e-06, "loss": 0.96850008, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.48901367, "step": 1078, "time_per_iteration": 2.9229512214660645 }, { "auxiliary_loss_clip": 0.01760043, "auxiliary_loss_mlp": 0.0110283, "balance_loss_clip": 1.46047914, "balance_loss_mlp": 1.05311978, "epoch": 0.06487298962873891, "flos": 21115488983040.0, "grad_norm": 1.8746992090384071, "language_loss": 0.80908412, "learning_rate": 3.987257232795137e-06, "loss": 0.83771282, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 2.9921875, "router_z_loss_mlp": 0.49707031, "step": 1079, "time_per_iteration": 2.9435999393463135 }, { "auxiliary_loss_clip": 0.01732123, "auxiliary_loss_mlp": 0.01107938, "balance_loss_clip": 1.44813764, "balance_loss_mlp": 1.061113, "epoch": 0.06493311288140688, "flos": 24618922542720.0, "grad_norm": 1.8382228937895808, "language_loss": 0.72456443, "learning_rate": 3.987213301260294e-06, "loss": 0.75296497, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.46826172, "step": 1080, "time_per_iteration": 2.9569170475006104 }, { "auxiliary_loss_clip": 0.01734279, "auxiliary_loss_mlp": 0.0108878, "balance_loss_clip": 1.44639111, "balance_loss_mlp": 1.03797281, "epoch": 0.06499323613407486, "flos": 25348949694720.0, "grad_norm": 2.603785400984728, "language_loss": 0.74544013, "learning_rate": 3.987169294370123e-06, "loss": 0.77367067, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.50830078, "step": 1081, "time_per_iteration": 2.929741382598877 }, { "auxiliary_loss_clip": 0.01716126, "auxiliary_loss_mlp": 0.01087034, "balance_loss_clip": 1.4401443, "balance_loss_mlp": 1.03636992, "epoch": 0.06505335938674282, "flos": 20385326096640.0, "grad_norm": 2.662456643947311, "language_loss": 0.86840391, "learning_rate": 3.987125212126294e-06, "loss": 0.8964355, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 2.75976562, "router_z_loss_mlp": 0.50708008, "step": 1082, "time_per_iteration": 2.90867280960083 }, { "auxiliary_loss_clip": 0.01758874, "auxiliary_loss_mlp": 0.0109391, "balance_loss_clip": 1.45710778, "balance_loss_mlp": 1.04562998, "epoch": 0.06511348263941079, "flos": 25348859205120.0, "grad_norm": 2.4265526615676407, "language_loss": 0.84446776, "learning_rate": 3.987081054530478e-06, "loss": 0.87299562, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.4831543, "step": 1083, "time_per_iteration": 2.888659954071045 }, { "auxiliary_loss_clip": 0.01732819, "auxiliary_loss_mlp": 0.01103274, "balance_loss_clip": 1.44616389, "balance_loss_mlp": 1.05218053, "epoch": 0.06517360589207877, "flos": 20340732706560.0, "grad_norm": 2.1264269036891643, "language_loss": 0.81526709, "learning_rate": 3.987036821584348e-06, "loss": 0.84362805, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.51098633, "step": 1084, "time_per_iteration": 2.9511709213256836 }, { "auxiliary_loss_clip": 0.01725693, "auxiliary_loss_mlp": 0.01098321, "balance_loss_clip": 1.44075513, "balance_loss_mlp": 1.04465258, "epoch": 0.06523372914474673, "flos": 31692987469440.0, "grad_norm": 3.1313932209047604, "language_loss": 0.68742931, "learning_rate": 3.986992513289584e-06, "loss": 0.71566951, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.53637695, "step": 1085, "time_per_iteration": 3.1240668296813965 }, { "auxiliary_loss_clip": 0.0173554, "auxiliary_loss_mlp": 0.01095776, "balance_loss_clip": 1.45352054, "balance_loss_mlp": 1.04339552, "epoch": 0.0652938523974147, "flos": 20788168965120.0, "grad_norm": 1.9172528250795644, "language_loss": 0.78989136, "learning_rate": 3.9869481296478645e-06, "loss": 0.81820452, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.52368164, "step": 1086, "time_per_iteration": 3.111402750015259 }, { "auxiliary_loss_clip": 0.01732224, "auxiliary_loss_mlp": 0.01107969, "balance_loss_clip": 1.45045519, "balance_loss_mlp": 1.056041, "epoch": 0.06535397565008266, "flos": 16699736396160.0, "grad_norm": 2.327778367103435, "language_loss": 0.87041205, "learning_rate": 3.986903670660872e-06, "loss": 0.89881396, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.51953125, "step": 1087, "time_per_iteration": 2.9237046241760254 }, { "auxiliary_loss_clip": 0.0173413, "auxiliary_loss_mlp": 0.01106527, "balance_loss_clip": 1.45015228, "balance_loss_mlp": 1.05824733, "epoch": 0.06541409890275064, "flos": 26879006983680.0, "grad_norm": 2.124317695553559, "language_loss": 0.80336535, "learning_rate": 3.9868591363302945e-06, "loss": 0.83177185, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.4831543, "step": 1088, "time_per_iteration": 2.9563703536987305 }, { "auxiliary_loss_clip": 0.01731971, "auxiliary_loss_mlp": 0.01097546, "balance_loss_clip": 1.44776869, "balance_loss_mlp": 1.05119753, "epoch": 0.06547422215541861, "flos": 20531259135360.0, "grad_norm": 1.9834864647016117, "language_loss": 0.73003018, "learning_rate": 3.9868145266578186e-06, "loss": 0.75832546, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.46386719, "step": 1089, "time_per_iteration": 3.001741409301758 }, { "auxiliary_loss_clip": 0.01715105, "auxiliary_loss_mlp": 0.01078199, "balance_loss_clip": 1.43955708, "balance_loss_mlp": 1.03490257, "epoch": 0.06553434540808657, "flos": 22026405421440.0, "grad_norm": 1.6022040135244706, "language_loss": 0.87374407, "learning_rate": 3.9867698416451366e-06, "loss": 0.90167713, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.43286133, "step": 1090, "time_per_iteration": 3.0122454166412354 }, { "auxiliary_loss_clip": 0.01737361, "auxiliary_loss_mlp": 0.01099679, "balance_loss_clip": 1.45306754, "balance_loss_mlp": 1.05185258, "epoch": 0.06559446866075455, "flos": 24619510725120.0, "grad_norm": 2.796697461268191, "language_loss": 0.74027598, "learning_rate": 3.9867250812939434e-06, "loss": 0.76864642, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.47827148, "step": 1091, "time_per_iteration": 2.9875214099884033 }, { "auxiliary_loss_clip": 0.01741994, "auxiliary_loss_mlp": 0.01094948, "balance_loss_clip": 1.46008587, "balance_loss_mlp": 1.04533267, "epoch": 0.06565459191342252, "flos": 24284001398400.0, "grad_norm": 2.0419714628705514, "language_loss": 0.84173548, "learning_rate": 3.986680245605936e-06, "loss": 0.87010491, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.49584961, "step": 1092, "time_per_iteration": 2.9963228702545166 }, { "auxiliary_loss_clip": 0.01744972, "auxiliary_loss_mlp": 0.01091014, "balance_loss_clip": 1.45675588, "balance_loss_mlp": 1.04001641, "epoch": 0.06571471516609048, "flos": 24797006651520.0, "grad_norm": 1.7372875513209753, "language_loss": 0.72635341, "learning_rate": 3.986635334582814e-06, "loss": 0.75471324, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.51000977, "step": 1093, "time_per_iteration": 3.0057733058929443 }, { "auxiliary_loss_clip": 0.01737242, "auxiliary_loss_mlp": 0.01096707, "balance_loss_clip": 1.45472765, "balance_loss_mlp": 1.04907107, "epoch": 0.06577483841875846, "flos": 26225362333440.0, "grad_norm": 1.7927805142556603, "language_loss": 0.89127207, "learning_rate": 3.986590348226282e-06, "loss": 0.91961157, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.47631836, "step": 1094, "time_per_iteration": 4.393615484237671 }, { "auxiliary_loss_clip": 0.01754974, "auxiliary_loss_mlp": 0.01107473, "balance_loss_clip": 1.47047555, "balance_loss_mlp": 1.05683303, "epoch": 0.06583496167142643, "flos": 25091225458560.0, "grad_norm": 1.4772628579604468, "language_loss": 0.830634, "learning_rate": 3.986545286538044e-06, "loss": 0.85925847, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.50585938, "step": 1095, "time_per_iteration": 3.063180685043335 }, { "auxiliary_loss_clip": 0.01758167, "auxiliary_loss_mlp": 0.01088261, "balance_loss_clip": 1.4688561, "balance_loss_mlp": 1.04153073, "epoch": 0.06589508492409439, "flos": 25640815772160.0, "grad_norm": 2.3154209125445924, "language_loss": 0.73786545, "learning_rate": 3.986500149519811e-06, "loss": 0.76632971, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.46655273, "step": 1096, "time_per_iteration": 3.0325822830200195 }, { "auxiliary_loss_clip": 0.01744085, "auxiliary_loss_mlp": 0.0109002, "balance_loss_clip": 1.46101153, "balance_loss_mlp": 1.04591215, "epoch": 0.06595520817676236, "flos": 23631261644160.0, "grad_norm": 1.6178821112809478, "language_loss": 0.79414994, "learning_rate": 3.986454937173292e-06, "loss": 0.82249093, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.44091797, "step": 1097, "time_per_iteration": 3.0436770915985107 }, { "auxiliary_loss_clip": 0.01777051, "auxiliary_loss_mlp": 0.01099625, "balance_loss_clip": 1.47507608, "balance_loss_mlp": 1.05444515, "epoch": 0.06601533142943034, "flos": 33814604263680.0, "grad_norm": 2.00212914439608, "language_loss": 0.79848421, "learning_rate": 3.986409649500203e-06, "loss": 0.82725102, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 3.02148438, "router_z_loss_mlp": 0.45214844, "step": 1098, "time_per_iteration": 3.115184783935547 }, { "auxiliary_loss_clip": 0.01769933, "auxiliary_loss_mlp": 0.01085563, "balance_loss_clip": 1.48165345, "balance_loss_mlp": 1.04000151, "epoch": 0.0660754546820983, "flos": 20266929158400.0, "grad_norm": 2.036835422515395, "language_loss": 0.83505535, "learning_rate": 3.986364286502261e-06, "loss": 0.86361033, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.45605469, "step": 1099, "time_per_iteration": 7.400240421295166 }, { "auxiliary_loss_clip": 0.01750478, "auxiliary_loss_mlp": 0.01075247, "balance_loss_clip": 1.46780109, "balance_loss_mlp": 1.03013837, "epoch": 0.06613557793476627, "flos": 19363523356800.0, "grad_norm": 1.9774998621546798, "language_loss": 0.84955657, "learning_rate": 3.986318848181186e-06, "loss": 0.87781382, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.45117188, "step": 1100, "time_per_iteration": 2.974531650543213 }, { "auxiliary_loss_clip": 0.01761734, "auxiliary_loss_mlp": 0.01091089, "balance_loss_clip": 1.47157216, "balance_loss_mlp": 1.04423952, "epoch": 0.06619570118743424, "flos": 13780668418560.0, "grad_norm": 3.1202256330676112, "language_loss": 0.7497772, "learning_rate": 3.986273334538702e-06, "loss": 0.77830541, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 2.8984375, "router_z_loss_mlp": 0.46850586, "step": 1101, "time_per_iteration": 2.9259326457977295 }, { "auxiliary_loss_clip": 0.01767658, "auxiliary_loss_mlp": 0.01080252, "balance_loss_clip": 1.47729135, "balance_loss_mlp": 1.03676498, "epoch": 0.06625582444010221, "flos": 17866748257920.0, "grad_norm": 2.58149305793957, "language_loss": 0.885665, "learning_rate": 3.986227745576533e-06, "loss": 0.9141441, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.43481445, "step": 1102, "time_per_iteration": 2.9965481758117676 }, { "auxiliary_loss_clip": 0.01777894, "auxiliary_loss_mlp": 0.01083197, "balance_loss_clip": 1.48404551, "balance_loss_mlp": 1.03513217, "epoch": 0.06631594769277017, "flos": 11846275182720.0, "grad_norm": 1.9666606129193156, "language_loss": 0.84470856, "learning_rate": 3.98618208129641e-06, "loss": 0.87331951, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 2.93945312, "router_z_loss_mlp": 0.48046875, "step": 1103, "time_per_iteration": 2.9953227043151855 }, { "auxiliary_loss_clip": 0.01770883, "auxiliary_loss_mlp": 0.01114489, "balance_loss_clip": 1.48208094, "balance_loss_mlp": 1.0660423, "epoch": 0.06637607094543815, "flos": 19803177509760.0, "grad_norm": 1.841229150888138, "language_loss": 0.83678144, "learning_rate": 3.986136341700063e-06, "loss": 0.86563516, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.48388672, "step": 1104, "time_per_iteration": 2.9772369861602783 }, { "auxiliary_loss_clip": 0.01769873, "auxiliary_loss_mlp": 0.01090531, "balance_loss_clip": 1.48641241, "balance_loss_mlp": 1.0470438, "epoch": 0.06643619419810612, "flos": 25497099728640.0, "grad_norm": 1.4957116099067367, "language_loss": 0.80923736, "learning_rate": 3.986090526789227e-06, "loss": 0.83784139, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.43481445, "step": 1105, "time_per_iteration": 3.0643210411071777 }, { "auxiliary_loss_clip": 0.01767182, "auxiliary_loss_mlp": 0.01097072, "balance_loss_clip": 1.48648906, "balance_loss_mlp": 1.05320334, "epoch": 0.06649631745077408, "flos": 16955379371520.0, "grad_norm": 1.7595968954551977, "language_loss": 0.97386545, "learning_rate": 3.986044636565639e-06, "loss": 1.00250793, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.43920898, "step": 1106, "time_per_iteration": 2.998225688934326 }, { "auxiliary_loss_clip": 0.01790624, "auxiliary_loss_mlp": 0.01079645, "balance_loss_clip": 1.49478602, "balance_loss_mlp": 1.03529942, "epoch": 0.06655644070344206, "flos": 17867653153920.0, "grad_norm": 1.6565295343057573, "language_loss": 0.84513593, "learning_rate": 3.985998671031039e-06, "loss": 0.87383866, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 2.95898438, "router_z_loss_mlp": 0.44335938, "step": 1107, "time_per_iteration": 3.1502256393432617 }, { "auxiliary_loss_clip": 0.014676, "auxiliary_loss_mlp": 0.01172473, "balance_loss_clip": 1.28858137, "balance_loss_mlp": 1.13337255, "epoch": 0.06661656395611003, "flos": 61448342060160.0, "grad_norm": 0.830976931440527, "language_loss": 0.56755757, "learning_rate": 3.9859526301871705e-06, "loss": 0.59395826, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.390625, "step": 1108, "time_per_iteration": 3.35772967338562 }, { "auxiliary_loss_clip": 0.01784933, "auxiliary_loss_mlp": 0.01082812, "balance_loss_clip": 1.4914031, "balance_loss_mlp": 1.03894269, "epoch": 0.066676687208778, "flos": 20671310350080.0, "grad_norm": 2.5541455989432813, "language_loss": 0.74656141, "learning_rate": 3.9859065140357795e-06, "loss": 0.77523875, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.43823242, "step": 1109, "time_per_iteration": 2.9016807079315186 }, { "auxiliary_loss_clip": 0.01758358, "auxiliary_loss_mlp": 0.01079347, "balance_loss_clip": 1.4752115, "balance_loss_mlp": 1.03888702, "epoch": 0.06673681046144596, "flos": 20933106618240.0, "grad_norm": 1.620623019912696, "language_loss": 0.79759663, "learning_rate": 3.985860322578614e-06, "loss": 0.82597369, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.40478516, "step": 1110, "time_per_iteration": 2.9524264335632324 }, { "auxiliary_loss_clip": 0.01773748, "auxiliary_loss_mlp": 0.0110024, "balance_loss_clip": 1.48403859, "balance_loss_mlp": 1.06128192, "epoch": 0.06679693371411394, "flos": 31078144874880.0, "grad_norm": 3.659221291677558, "language_loss": 0.72957015, "learning_rate": 3.985814055817427e-06, "loss": 0.75831002, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.3894043, "step": 1111, "time_per_iteration": 2.977801561355591 }, { "auxiliary_loss_clip": 0.01780874, "auxiliary_loss_mlp": 0.01107994, "balance_loss_clip": 1.48701584, "balance_loss_mlp": 1.0677731, "epoch": 0.0668570569667819, "flos": 21736213401600.0, "grad_norm": 1.9440383483003574, "language_loss": 0.80538237, "learning_rate": 3.985767713753971e-06, "loss": 0.83427101, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 0.40234375, "step": 1112, "time_per_iteration": 2.942483425140381 }, { "auxiliary_loss_clip": 0.01774849, "auxiliary_loss_mlp": 0.01120721, "balance_loss_clip": 1.48869443, "balance_loss_mlp": 1.07763863, "epoch": 0.06691718021944987, "flos": 22757473203840.0, "grad_norm": 1.9991070425184172, "language_loss": 0.81648648, "learning_rate": 3.985721296390005e-06, "loss": 0.84544218, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.4309082, "step": 1113, "time_per_iteration": 2.9562251567840576 }, { "auxiliary_loss_clip": 0.01744177, "auxiliary_loss_mlp": 0.01109111, "balance_loss_clip": 1.4688673, "balance_loss_mlp": 1.06943822, "epoch": 0.06697730347211785, "flos": 16554482029440.0, "grad_norm": 2.7514611871893497, "language_loss": 0.8461982, "learning_rate": 3.985674803727289e-06, "loss": 0.87473106, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.3972168, "step": 1114, "time_per_iteration": 2.9056577682495117 }, { "auxiliary_loss_clip": 0.01455675, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.28287292, "balance_loss_mlp": 1.00996161, "epoch": 0.06703742672478581, "flos": 59812149173760.0, "grad_norm": 0.8451899874782671, "language_loss": 0.58309799, "learning_rate": 3.985628235767584e-06, "loss": 0.60799658, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.2421875, "step": 1115, "time_per_iteration": 3.3437414169311523 }, { "auxiliary_loss_clip": 0.01778429, "auxiliary_loss_mlp": 0.0112172, "balance_loss_clip": 1.48894501, "balance_loss_mlp": 1.07644403, "epoch": 0.06709754997745378, "flos": 16808903395200.0, "grad_norm": 2.702988190676563, "language_loss": 0.93082017, "learning_rate": 3.985581592512658e-06, "loss": 0.9598217, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 2.89257812, "router_z_loss_mlp": 0.45239258, "step": 1116, "time_per_iteration": 2.8920130729675293 }, { "auxiliary_loss_clip": 0.01780936, "auxiliary_loss_mlp": 0.01126618, "balance_loss_clip": 1.48837924, "balance_loss_mlp": 1.08589661, "epoch": 0.06715767323012176, "flos": 22133174446080.0, "grad_norm": 1.9560883740736086, "language_loss": 0.88852501, "learning_rate": 3.985534873964279e-06, "loss": 0.91760051, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.40722656, "step": 1117, "time_per_iteration": 3.0018818378448486 }, { "auxiliary_loss_clip": 0.0144369, "auxiliary_loss_mlp": 0.01073777, "balance_loss_clip": 1.27206695, "balance_loss_mlp": 1.05556166, "epoch": 0.06721779648278972, "flos": 66643583644800.0, "grad_norm": 0.8650562609671947, "language_loss": 0.59873402, "learning_rate": 3.985488080124218e-06, "loss": 0.6239087, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.18261719, "step": 1118, "time_per_iteration": 3.24596905708313 }, { "auxiliary_loss_clip": 0.01794294, "auxiliary_loss_mlp": 0.01092811, "balance_loss_clip": 1.49612546, "balance_loss_mlp": 1.05139744, "epoch": 0.06727791973545769, "flos": 22392617984640.0, "grad_norm": 2.344515846792726, "language_loss": 0.87493664, "learning_rate": 3.985441210994251e-06, "loss": 0.9038077, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 2.98046875, "router_z_loss_mlp": 0.41381836, "step": 1119, "time_per_iteration": 2.9194021224975586 }, { "auxiliary_loss_clip": 0.01763388, "auxiliary_loss_mlp": 0.01111295, "balance_loss_clip": 1.47731256, "balance_loss_mlp": 1.07102633, "epoch": 0.06733804298812565, "flos": 24290969097600.0, "grad_norm": 1.829720566023223, "language_loss": 0.86594296, "learning_rate": 3.9853942665761545e-06, "loss": 0.8946898, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.40258789, "step": 1120, "time_per_iteration": 2.983499765396118 }, { "auxiliary_loss_clip": 0.0179329, "auxiliary_loss_mlp": 0.01122749, "balance_loss_clip": 1.49916816, "balance_loss_mlp": 1.08002412, "epoch": 0.06739816624079363, "flos": 15925070609280.0, "grad_norm": 1.8461268729512665, "language_loss": 0.79897928, "learning_rate": 3.985347246871708e-06, "loss": 0.82813966, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.42749023, "step": 1121, "time_per_iteration": 2.881929874420166 }, { "auxiliary_loss_clip": 0.01439313, "auxiliary_loss_mlp": 0.01022523, "balance_loss_clip": 1.26538038, "balance_loss_mlp": 1.00240028, "epoch": 0.0674582894934616, "flos": 71434977696000.0, "grad_norm": 0.7621670857372473, "language_loss": 0.58532143, "learning_rate": 3.985300151882694e-06, "loss": 0.60993981, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.20117188, "step": 1122, "time_per_iteration": 3.4584033489227295 }, { "auxiliary_loss_clip": 0.01787508, "auxiliary_loss_mlp": 0.01110695, "balance_loss_clip": 1.49315333, "balance_loss_mlp": 1.06861424, "epoch": 0.06751841274612956, "flos": 25275870063360.0, "grad_norm": 3.5325857879232325, "language_loss": 0.73612785, "learning_rate": 3.985252981610901e-06, "loss": 0.7651099, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 2.9453125, "router_z_loss_mlp": 0.4206543, "step": 1123, "time_per_iteration": 3.012976884841919 }, { "auxiliary_loss_clip": 0.01803792, "auxiliary_loss_mlp": 0.0110326, "balance_loss_clip": 1.50131488, "balance_loss_mlp": 1.06017756, "epoch": 0.06757853599879754, "flos": 23812558133760.0, "grad_norm": 1.8359494955447002, "language_loss": 0.80354464, "learning_rate": 3.985205736058114e-06, "loss": 0.83261526, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 3.02539062, "router_z_loss_mlp": 0.4309082, "step": 1124, "time_per_iteration": 2.9381868839263916 }, { "auxiliary_loss_clip": 0.01781343, "auxiliary_loss_mlp": 0.01081859, "balance_loss_clip": 1.49227417, "balance_loss_mlp": 1.04273438, "epoch": 0.0676386592514655, "flos": 21043857185280.0, "grad_norm": 1.8885530983194523, "language_loss": 0.73892158, "learning_rate": 3.985158415226128e-06, "loss": 0.76755363, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.39111328, "step": 1125, "time_per_iteration": 2.954005241394043 }, { "auxiliary_loss_clip": 0.01784842, "auxiliary_loss_mlp": 0.01097686, "balance_loss_clip": 1.49179482, "balance_loss_mlp": 1.05787003, "epoch": 0.06769878250413347, "flos": 25567193203200.0, "grad_norm": 2.6544142005686964, "language_loss": 0.83032131, "learning_rate": 3.985111019116736e-06, "loss": 0.85914648, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 2.9296875, "router_z_loss_mlp": 0.3984375, "step": 1126, "time_per_iteration": 2.961317777633667 }, { "auxiliary_loss_clip": 0.01434716, "auxiliary_loss_mlp": 0.01014259, "balance_loss_clip": 1.26352429, "balance_loss_mlp": 0.99880904, "epoch": 0.06775890575680145, "flos": 70687938499200.0, "grad_norm": 0.7902248914487475, "language_loss": 0.59876448, "learning_rate": 3.985063547731735e-06, "loss": 0.62325418, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.15429688, "step": 1127, "time_per_iteration": 3.440735101699829 }, { "auxiliary_loss_clip": 0.01785199, "auxiliary_loss_mlp": 0.01109218, "balance_loss_clip": 1.49244273, "balance_loss_mlp": 1.06832957, "epoch": 0.06781902900946941, "flos": 24244113467520.0, "grad_norm": 1.9840662838060583, "language_loss": 0.82907218, "learning_rate": 3.985016001072925e-06, "loss": 0.85801637, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.40893555, "step": 1128, "time_per_iteration": 2.9745333194732666 }, { "auxiliary_loss_clip": 0.01815432, "auxiliary_loss_mlp": 0.01118336, "balance_loss_clip": 1.50900817, "balance_loss_mlp": 1.07458663, "epoch": 0.06787915226213738, "flos": 22427302763520.0, "grad_norm": 2.646893922195106, "language_loss": 0.78473097, "learning_rate": 3.984968379142109e-06, "loss": 0.81406868, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 3.06054688, "router_z_loss_mlp": 0.4375, "step": 1129, "time_per_iteration": 4.303941011428833 }, { "auxiliary_loss_clip": 0.017947, "auxiliary_loss_mlp": 0.01100976, "balance_loss_clip": 1.49370492, "balance_loss_mlp": 1.0572021, "epoch": 0.06793927551480534, "flos": 37721152160640.0, "grad_norm": 1.951544639686094, "language_loss": 0.75001538, "learning_rate": 3.984920681941094e-06, "loss": 0.77897215, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 3.00976562, "router_z_loss_mlp": 0.4375, "step": 1130, "time_per_iteration": 3.039942741394043 }, { "auxiliary_loss_clip": 0.01775956, "auxiliary_loss_mlp": 0.01107316, "balance_loss_clip": 1.48581553, "balance_loss_mlp": 1.06843042, "epoch": 0.06799939876747332, "flos": 20641150051200.0, "grad_norm": 2.2016438470572655, "language_loss": 0.81806189, "learning_rate": 3.984872909471688e-06, "loss": 0.84689462, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.38867188, "step": 1131, "time_per_iteration": 2.9461746215820312 }, { "auxiliary_loss_clip": 0.01763196, "auxiliary_loss_mlp": 0.01101781, "balance_loss_clip": 1.47825074, "balance_loss_mlp": 1.06015277, "epoch": 0.06805952202014129, "flos": 14872519388160.0, "grad_norm": 1.8597213994609005, "language_loss": 0.82135308, "learning_rate": 3.984825061735701e-06, "loss": 0.85000288, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.41601562, "step": 1132, "time_per_iteration": 2.884608030319214 }, { "auxiliary_loss_clip": 0.01781808, "auxiliary_loss_mlp": 0.01113013, "balance_loss_clip": 1.49054408, "balance_loss_mlp": 1.07186246, "epoch": 0.06811964527280925, "flos": 48926930947200.0, "grad_norm": 1.4297570721558497, "language_loss": 0.65490377, "learning_rate": 3.9847771387349495e-06, "loss": 0.68385196, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 2.91015625, "router_z_loss_mlp": 0.41162109, "step": 1133, "time_per_iteration": 3.280437469482422 }, { "auxiliary_loss_clip": 0.01802387, "auxiliary_loss_mlp": 0.01105853, "balance_loss_clip": 1.49957681, "balance_loss_mlp": 1.06289005, "epoch": 0.06817976852547723, "flos": 15385615130880.0, "grad_norm": 1.9842473187868983, "language_loss": 0.77088434, "learning_rate": 3.9847291404712506e-06, "loss": 0.79996669, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 3.02734375, "router_z_loss_mlp": 0.4296875, "step": 1134, "time_per_iteration": 5.790952920913696 }, { "auxiliary_loss_clip": 0.01780266, "auxiliary_loss_mlp": 0.01101158, "balance_loss_clip": 1.49274802, "balance_loss_mlp": 1.06119883, "epoch": 0.0682398917781452, "flos": 20164956082560.0, "grad_norm": 1.75619328365223, "language_loss": 0.88908952, "learning_rate": 3.984681066946423e-06, "loss": 0.91790372, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.39941406, "step": 1135, "time_per_iteration": 4.318697214126587 }, { "auxiliary_loss_clip": 0.01792087, "auxiliary_loss_mlp": 0.01106024, "balance_loss_clip": 1.49672318, "balance_loss_mlp": 1.06391895, "epoch": 0.06830001503081316, "flos": 23451051029760.0, "grad_norm": 2.1834946854949626, "language_loss": 0.79806721, "learning_rate": 3.984632918162291e-06, "loss": 0.8270483, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 2.953125, "router_z_loss_mlp": 0.42114258, "step": 1136, "time_per_iteration": 2.961277723312378 }, { "auxiliary_loss_clip": 0.01801486, "auxiliary_loss_mlp": 0.01103813, "balance_loss_clip": 1.50816965, "balance_loss_mlp": 1.06099343, "epoch": 0.06836013828348114, "flos": 34363063457280.0, "grad_norm": 2.028757697892878, "language_loss": 0.86820656, "learning_rate": 3.984584694120679e-06, "loss": 0.89725959, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 2.93359375, "router_z_loss_mlp": 0.4284668, "step": 1137, "time_per_iteration": 2.9968631267547607 }, { "auxiliary_loss_clip": 0.01768299, "auxiliary_loss_mlp": 0.01111839, "balance_loss_clip": 1.48083735, "balance_loss_mlp": 1.07414508, "epoch": 0.06842026153614911, "flos": 23159230197120.0, "grad_norm": 2.021102761838922, "language_loss": 0.80818772, "learning_rate": 3.984536394823418e-06, "loss": 0.83698905, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.37719727, "step": 1138, "time_per_iteration": 2.9238510131835938 }, { "auxiliary_loss_clip": 0.01779437, "auxiliary_loss_mlp": 0.01099406, "balance_loss_clip": 1.48964417, "balance_loss_mlp": 1.05927992, "epoch": 0.06848038478881707, "flos": 24619917928320.0, "grad_norm": 2.2607119602967467, "language_loss": 0.86203003, "learning_rate": 3.984488020272336e-06, "loss": 0.89081848, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 2.89648438, "router_z_loss_mlp": 0.40112305, "step": 1139, "time_per_iteration": 2.9141201972961426 }, { "auxiliary_loss_clip": 0.0178041, "auxiliary_loss_mlp": 0.01085807, "balance_loss_clip": 1.49157107, "balance_loss_mlp": 1.04587162, "epoch": 0.06854050804148504, "flos": 40895998848000.0, "grad_norm": 1.5130640781885838, "language_loss": 0.76314354, "learning_rate": 3.984439570469271e-06, "loss": 0.79180562, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.39941406, "step": 1140, "time_per_iteration": 3.161508798599243 }, { "auxiliary_loss_clip": 0.01785595, "auxiliary_loss_mlp": 0.0110339, "balance_loss_clip": 1.49298286, "balance_loss_mlp": 1.0636456, "epoch": 0.06860063129415302, "flos": 31698914538240.0, "grad_norm": 2.04622856788199, "language_loss": 0.7029835, "learning_rate": 3.9843910454160574e-06, "loss": 0.73187333, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 2.921875, "router_z_loss_mlp": 0.39746094, "step": 1141, "time_per_iteration": 3.042388677597046 }, { "auxiliary_loss_clip": 0.01782916, "auxiliary_loss_mlp": 0.01092359, "balance_loss_clip": 1.48284721, "balance_loss_mlp": 1.05235219, "epoch": 0.06866075454682098, "flos": 26553270533760.0, "grad_norm": 2.139667974776622, "language_loss": 0.80406606, "learning_rate": 3.984342445114538e-06, "loss": 0.83281875, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 2.99804688, "router_z_loss_mlp": 0.40039062, "step": 1142, "time_per_iteration": 3.0027992725372314 }, { "auxiliary_loss_clip": 0.0175986, "auxiliary_loss_mlp": 0.01078084, "balance_loss_clip": 1.48119748, "balance_loss_mlp": 1.03998494, "epoch": 0.06872087779948895, "flos": 29802011258880.0, "grad_norm": 1.7872533265285635, "language_loss": 0.70513737, "learning_rate": 3.984293769566553e-06, "loss": 0.73351681, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.38110352, "step": 1143, "time_per_iteration": 3.014019012451172 }, { "auxiliary_loss_clip": 0.01745685, "auxiliary_loss_mlp": 0.01088501, "balance_loss_clip": 1.46915126, "balance_loss_mlp": 1.05030656, "epoch": 0.06878100105215693, "flos": 26951905635840.0, "grad_norm": 1.806922279363485, "language_loss": 0.75769001, "learning_rate": 3.98424501877395e-06, "loss": 0.78603196, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.38208008, "step": 1144, "time_per_iteration": 3.0195586681365967 }, { "auxiliary_loss_clip": 0.01783507, "auxiliary_loss_mlp": 0.01091063, "balance_loss_clip": 1.48624361, "balance_loss_mlp": 1.04755163, "epoch": 0.06884112430482489, "flos": 10677905976960.0, "grad_norm": 2.246671589225427, "language_loss": 0.93436694, "learning_rate": 3.984196192738577e-06, "loss": 0.96311259, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 2.97070312, "router_z_loss_mlp": 0.43554688, "step": 1145, "time_per_iteration": 2.889540910720825 }, { "auxiliary_loss_clip": 0.0179275, "auxiliary_loss_mlp": 0.0110185, "balance_loss_clip": 1.492553, "balance_loss_mlp": 1.05857706, "epoch": 0.06890124755749286, "flos": 20203305690240.0, "grad_norm": 2.2112595961326162, "language_loss": 0.84479892, "learning_rate": 3.984147291462285e-06, "loss": 0.87374485, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 3.00390625, "router_z_loss_mlp": 0.43310547, "step": 1146, "time_per_iteration": 2.977402448654175 }, { "auxiliary_loss_clip": 0.01739956, "auxiliary_loss_mlp": 0.01078082, "balance_loss_clip": 1.46196139, "balance_loss_mlp": 1.03833771, "epoch": 0.06896137081016084, "flos": 20458993910400.0, "grad_norm": 1.7327090292129335, "language_loss": 0.86641192, "learning_rate": 3.98409831494693e-06, "loss": 0.89459223, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.3972168, "step": 1147, "time_per_iteration": 2.9276974201202393 }, { "auxiliary_loss_clip": 0.01754834, "auxiliary_loss_mlp": 0.01083027, "balance_loss_clip": 1.46990585, "balance_loss_mlp": 1.04406953, "epoch": 0.0690214940628288, "flos": 18377717495040.0, "grad_norm": 1.765621534859995, "language_loss": 0.87507212, "learning_rate": 3.984049263194367e-06, "loss": 0.90345073, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 2.84765625, "router_z_loss_mlp": 0.38989258, "step": 1148, "time_per_iteration": 2.961883783340454 }, { "auxiliary_loss_clip": 0.01736728, "auxiliary_loss_mlp": 0.01088318, "balance_loss_clip": 1.45327306, "balance_loss_mlp": 1.04828787, "epoch": 0.06908161731549677, "flos": 20567663216640.0, "grad_norm": 2.382562797615503, "language_loss": 0.71986365, "learning_rate": 3.9840001362064575e-06, "loss": 0.74811405, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.40039062, "step": 1149, "time_per_iteration": 2.9428791999816895 }, { "auxiliary_loss_clip": 0.01760479, "auxiliary_loss_mlp": 0.01074686, "balance_loss_clip": 1.47101974, "balance_loss_mlp": 1.03639627, "epoch": 0.06914174056816474, "flos": 27575209008000.0, "grad_norm": 2.149800710497741, "language_loss": 0.86074388, "learning_rate": 3.983950933985064e-06, "loss": 0.88909554, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.3828125, "step": 1150, "time_per_iteration": 3.00156569480896 }, { "auxiliary_loss_clip": 0.0176389, "auxiliary_loss_mlp": 0.0109314, "balance_loss_clip": 1.47581637, "balance_loss_mlp": 1.05334806, "epoch": 0.06920186382083271, "flos": 15312264030720.0, "grad_norm": 3.013811228292107, "language_loss": 0.84592628, "learning_rate": 3.983901656532052e-06, "loss": 0.87449664, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.39794922, "step": 1151, "time_per_iteration": 2.9745144844055176 }, { "auxiliary_loss_clip": 0.0176165, "auxiliary_loss_mlp": 0.01096524, "balance_loss_clip": 1.46992767, "balance_loss_mlp": 1.05918789, "epoch": 0.06926198707350067, "flos": 25201930780800.0, "grad_norm": 2.072542761457855, "language_loss": 0.87080699, "learning_rate": 3.983852303849291e-06, "loss": 0.89938873, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.37353516, "step": 1152, "time_per_iteration": 2.951582908630371 }, { "auxiliary_loss_clip": 0.01736643, "auxiliary_loss_mlp": 0.01083812, "balance_loss_clip": 1.45184994, "balance_loss_mlp": 1.04480708, "epoch": 0.06932211032616864, "flos": 13262640992640.0, "grad_norm": 2.923065751743557, "language_loss": 0.92647183, "learning_rate": 3.983802875938651e-06, "loss": 0.95467639, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.39013672, "step": 1153, "time_per_iteration": 2.955064296722412 }, { "auxiliary_loss_clip": 0.0173797, "auxiliary_loss_mlp": 0.01084936, "balance_loss_clip": 1.45431972, "balance_loss_mlp": 1.04366565, "epoch": 0.06938223357883662, "flos": 24838251926400.0, "grad_norm": 2.1225094967668166, "language_loss": 0.8310492, "learning_rate": 3.983753372802008e-06, "loss": 0.85927826, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.4128418, "step": 1154, "time_per_iteration": 2.9111173152923584 }, { "auxiliary_loss_clip": 0.01742665, "auxiliary_loss_mlp": 0.01081409, "balance_loss_clip": 1.4587245, "balance_loss_mlp": 1.04121125, "epoch": 0.06944235683150458, "flos": 27278456492160.0, "grad_norm": 2.4603130437876852, "language_loss": 0.76968384, "learning_rate": 3.983703794441237e-06, "loss": 0.79792452, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.40234375, "step": 1155, "time_per_iteration": 2.9915316104888916 }, { "auxiliary_loss_clip": 0.01747401, "auxiliary_loss_mlp": 0.01083891, "balance_loss_clip": 1.45977402, "balance_loss_mlp": 1.04452848, "epoch": 0.06950248008417255, "flos": 25818040229760.0, "grad_norm": 4.871177763113254, "language_loss": 0.72125125, "learning_rate": 3.98365414085822e-06, "loss": 0.74956429, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.39355469, "step": 1156, "time_per_iteration": 2.9384782314300537 }, { "auxiliary_loss_clip": 0.01721828, "auxiliary_loss_mlp": 0.01075663, "balance_loss_clip": 1.43956709, "balance_loss_mlp": 1.03656256, "epoch": 0.06956260333684053, "flos": 22281279235200.0, "grad_norm": 2.219946572602928, "language_loss": 0.76466644, "learning_rate": 3.98360441205484e-06, "loss": 0.7926414, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.39086914, "step": 1157, "time_per_iteration": 2.8861210346221924 }, { "auxiliary_loss_clip": 0.01745972, "auxiliary_loss_mlp": 0.01080283, "balance_loss_clip": 1.4597404, "balance_loss_mlp": 1.03877389, "epoch": 0.0696227265895085, "flos": 29693884890240.0, "grad_norm": 1.7211163533486489, "language_loss": 0.73171544, "learning_rate": 3.983554608032982e-06, "loss": 0.759978, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.41479492, "step": 1158, "time_per_iteration": 3.0217227935791016 }, { "auxiliary_loss_clip": 0.01741903, "auxiliary_loss_mlp": 0.01079304, "balance_loss_clip": 1.45792198, "balance_loss_mlp": 1.03929758, "epoch": 0.06968284984217646, "flos": 25535358846720.0, "grad_norm": 1.7231024398076913, "language_loss": 0.8121056, "learning_rate": 3.983504728794533e-06, "loss": 0.84031773, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.40014648, "step": 1159, "time_per_iteration": 2.968329429626465 }, { "auxiliary_loss_clip": 0.01742283, "auxiliary_loss_mlp": 0.01084784, "balance_loss_clip": 1.4558394, "balance_loss_mlp": 1.04220212, "epoch": 0.06974297309484444, "flos": 20706357087360.0, "grad_norm": 2.6560324991410025, "language_loss": 0.84323162, "learning_rate": 3.983454774341387e-06, "loss": 0.87150228, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 2.86132812, "router_z_loss_mlp": 0.42602539, "step": 1160, "time_per_iteration": 2.905940294265747 }, { "auxiliary_loss_clip": 0.01728161, "auxiliary_loss_mlp": 0.01073135, "balance_loss_clip": 1.44484377, "balance_loss_mlp": 1.03219819, "epoch": 0.0698030963475124, "flos": 26516368759680.0, "grad_norm": 1.8101908981803763, "language_loss": 0.7730915, "learning_rate": 3.983404744675437e-06, "loss": 0.80110443, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.40869141, "step": 1161, "time_per_iteration": 2.952885150909424 }, { "auxiliary_loss_clip": 0.01728234, "auxiliary_loss_mlp": 0.01068383, "balance_loss_clip": 1.44422698, "balance_loss_mlp": 1.02949739, "epoch": 0.06986321960018037, "flos": 23051058583680.0, "grad_norm": 1.633409104216109, "language_loss": 0.84162581, "learning_rate": 3.9833546397985794e-06, "loss": 0.86959195, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.38867188, "step": 1162, "time_per_iteration": 2.879686117172241 }, { "auxiliary_loss_clip": 0.01723115, "auxiliary_loss_mlp": 0.01067573, "balance_loss_clip": 1.44501972, "balance_loss_mlp": 1.02966487, "epoch": 0.06992334285284833, "flos": 28596514055040.0, "grad_norm": 11.040833235859107, "language_loss": 0.80968076, "learning_rate": 3.983304459712716e-06, "loss": 0.83758765, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.37915039, "step": 1163, "time_per_iteration": 4.371046543121338 }, { "auxiliary_loss_clip": 0.01738412, "auxiliary_loss_mlp": 0.01077845, "balance_loss_clip": 1.44815302, "balance_loss_mlp": 1.03800559, "epoch": 0.06998346610551631, "flos": 20605198417920.0, "grad_norm": 2.7626409823828477, "language_loss": 0.80418587, "learning_rate": 3.983254204419749e-06, "loss": 0.83234847, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.3984375, "step": 1164, "time_per_iteration": 2.923943042755127 }, { "auxiliary_loss_clip": 0.01712709, "auxiliary_loss_mlp": 0.01075891, "balance_loss_clip": 1.42895484, "balance_loss_mlp": 1.03826833, "epoch": 0.07004358935818428, "flos": 22539093960960.0, "grad_norm": 1.661469348126522, "language_loss": 0.74778467, "learning_rate": 3.983203873921583e-06, "loss": 0.77567065, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.3762207, "step": 1165, "time_per_iteration": 2.893615484237671 }, { "auxiliary_loss_clip": 0.01712109, "auxiliary_loss_mlp": 0.01074353, "balance_loss_clip": 1.43215501, "balance_loss_mlp": 1.03775609, "epoch": 0.07010371261085224, "flos": 28961866967040.0, "grad_norm": 1.7925751006522423, "language_loss": 0.82505178, "learning_rate": 3.983153468220128e-06, "loss": 0.85291642, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.3659668, "step": 1166, "time_per_iteration": 2.982210636138916 }, { "auxiliary_loss_clip": 0.01725198, "auxiliary_loss_mlp": 0.01078402, "balance_loss_clip": 1.43985724, "balance_loss_mlp": 1.04101789, "epoch": 0.07016383586352022, "flos": 23669566007040.0, "grad_norm": 1.9654444086813077, "language_loss": 0.8662982, "learning_rate": 3.983102987317295e-06, "loss": 0.89433414, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 2.85546875, "router_z_loss_mlp": 0.3737793, "step": 1167, "time_per_iteration": 2.8925750255584717 }, { "auxiliary_loss_clip": 0.01727189, "auxiliary_loss_mlp": 0.01078335, "balance_loss_clip": 1.44217777, "balance_loss_mlp": 1.03935409, "epoch": 0.07022395911618819, "flos": 19801639186560.0, "grad_norm": 2.5703404707862894, "language_loss": 0.92231417, "learning_rate": 3.983052431214997e-06, "loss": 0.95036948, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.38989258, "step": 1168, "time_per_iteration": 2.9164843559265137 }, { "auxiliary_loss_clip": 0.01723879, "auxiliary_loss_mlp": 0.01075371, "balance_loss_clip": 1.4369669, "balance_loss_mlp": 1.03698575, "epoch": 0.07028408236885615, "flos": 21699085403520.0, "grad_norm": 2.843384274012963, "language_loss": 0.90027738, "learning_rate": 3.983001799915153e-06, "loss": 0.92826986, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.38378906, "step": 1169, "time_per_iteration": 4.388768911361694 }, { "auxiliary_loss_clip": 0.01737596, "auxiliary_loss_mlp": 0.01080943, "balance_loss_clip": 1.44595742, "balance_loss_mlp": 1.04129398, "epoch": 0.07034420562152413, "flos": 25641403954560.0, "grad_norm": 2.1051026856087085, "language_loss": 0.86896288, "learning_rate": 3.982951093419681e-06, "loss": 0.89714825, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.39648438, "step": 1170, "time_per_iteration": 4.4324798583984375 }, { "auxiliary_loss_clip": 0.01712682, "auxiliary_loss_mlp": 0.01091295, "balance_loss_clip": 1.43891263, "balance_loss_mlp": 1.05395889, "epoch": 0.0704043288741921, "flos": 20819369894400.0, "grad_norm": 1.8235108999273948, "language_loss": 0.7695992, "learning_rate": 3.982900311730506e-06, "loss": 0.79763901, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.37329102, "step": 1171, "time_per_iteration": 2.895761489868164 }, { "auxiliary_loss_clip": 0.01705467, "auxiliary_loss_mlp": 0.01089557, "balance_loss_clip": 1.42738473, "balance_loss_mlp": 1.05431914, "epoch": 0.07046445212686006, "flos": 25604140222080.0, "grad_norm": 1.8031258721489132, "language_loss": 0.90964723, "learning_rate": 3.9828494548495514e-06, "loss": 0.93759745, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.35205078, "step": 1172, "time_per_iteration": 3.139467239379883 }, { "auxiliary_loss_clip": 0.01718161, "auxiliary_loss_mlp": 0.01092915, "balance_loss_clip": 1.42629385, "balance_loss_mlp": 1.05386233, "epoch": 0.07052457537952803, "flos": 25568007609600.0, "grad_norm": 1.5733268441001675, "language_loss": 0.83832705, "learning_rate": 3.982798522778748e-06, "loss": 0.86643785, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 2.91992188, "router_z_loss_mlp": 0.390625, "step": 1173, "time_per_iteration": 2.964430332183838 }, { "auxiliary_loss_clip": 0.01700662, "auxiliary_loss_mlp": 0.01081855, "balance_loss_clip": 1.42357922, "balance_loss_mlp": 1.04575872, "epoch": 0.070584698632196, "flos": 17977725048960.0, "grad_norm": 1.9552783716035997, "language_loss": 0.84077764, "learning_rate": 3.9827475155200245e-06, "loss": 0.86860275, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.36083984, "step": 1174, "time_per_iteration": 2.8872430324554443 }, { "auxiliary_loss_clip": 0.01700675, "auxiliary_loss_mlp": 0.01077218, "balance_loss_clip": 1.4258492, "balance_loss_mlp": 1.04128838, "epoch": 0.07064482188486397, "flos": 25380919785600.0, "grad_norm": 3.674582629124737, "language_loss": 0.87052703, "learning_rate": 3.982696433075317e-06, "loss": 0.89830595, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.359375, "step": 1175, "time_per_iteration": 2.9896137714385986 }, { "auxiliary_loss_clip": 0.01714685, "auxiliary_loss_mlp": 0.01087933, "balance_loss_clip": 1.43113565, "balance_loss_mlp": 1.05190849, "epoch": 0.07070494513753194, "flos": 24910607640960.0, "grad_norm": 1.723610517678584, "language_loss": 0.8547827, "learning_rate": 3.982645275446563e-06, "loss": 0.88280892, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.36035156, "step": 1176, "time_per_iteration": 2.9168763160705566 }, { "auxiliary_loss_clip": 0.01700325, "auxiliary_loss_mlp": 0.01077941, "balance_loss_clip": 1.4231739, "balance_loss_mlp": 1.0428462, "epoch": 0.07076506839019991, "flos": 22346622005760.0, "grad_norm": 1.9652279590081545, "language_loss": 0.75990933, "learning_rate": 3.982594042635701e-06, "loss": 0.78769201, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.35107422, "step": 1177, "time_per_iteration": 2.923025608062744 }, { "auxiliary_loss_clip": 0.01714983, "auxiliary_loss_mlp": 0.01089819, "balance_loss_clip": 1.4295876, "balance_loss_mlp": 1.05315065, "epoch": 0.07082519164286788, "flos": 18669945530880.0, "grad_norm": 1.8068675617996233, "language_loss": 0.87232077, "learning_rate": 3.982542734644673e-06, "loss": 0.90036881, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.36694336, "step": 1178, "time_per_iteration": 2.8652961254119873 }, { "auxiliary_loss_clip": 0.01440522, "auxiliary_loss_mlp": 0.0104938, "balance_loss_clip": 1.27498198, "balance_loss_mlp": 1.03211808, "epoch": 0.07088531489553584, "flos": 63686048307840.0, "grad_norm": 0.847612952468103, "language_loss": 0.63460118, "learning_rate": 3.982491351475427e-06, "loss": 0.65950024, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.17285156, "step": 1179, "time_per_iteration": 3.4792938232421875 }, { "auxiliary_loss_clip": 0.01721916, "auxiliary_loss_mlp": 0.01091705, "balance_loss_clip": 1.43438852, "balance_loss_mlp": 1.0559423, "epoch": 0.07094543814820382, "flos": 21580778954880.0, "grad_norm": 2.4464394103108047, "language_loss": 0.86921805, "learning_rate": 3.98243989312991e-06, "loss": 0.89735425, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.35742188, "step": 1180, "time_per_iteration": 2.8962419033050537 }, { "auxiliary_loss_clip": 0.01699546, "auxiliary_loss_mlp": 0.01075862, "balance_loss_clip": 1.41976452, "balance_loss_mlp": 1.03707147, "epoch": 0.07100556140087179, "flos": 22099847011200.0, "grad_norm": 3.8808583244963386, "language_loss": 0.90603805, "learning_rate": 3.982388359610074e-06, "loss": 0.93379205, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 2.796875, "router_z_loss_mlp": 0.38769531, "step": 1181, "time_per_iteration": 2.88250470161438 }, { "auxiliary_loss_clip": 0.01688241, "auxiliary_loss_mlp": 0.01061109, "balance_loss_clip": 1.41726637, "balance_loss_mlp": 1.0276829, "epoch": 0.07106568465353975, "flos": 47938998579840.0, "grad_norm": 1.885771899897693, "language_loss": 0.85516238, "learning_rate": 3.9823367509178725e-06, "loss": 0.88265586, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 2.70898438, "router_z_loss_mlp": 0.33422852, "step": 1182, "time_per_iteration": 3.1554551124572754 }, { "auxiliary_loss_clip": 0.01709463, "auxiliary_loss_mlp": 0.01070045, "balance_loss_clip": 1.42889595, "balance_loss_mlp": 1.03340054, "epoch": 0.07112580790620772, "flos": 23451277253760.0, "grad_norm": 2.165353215465474, "language_loss": 0.81728697, "learning_rate": 3.982285067055262e-06, "loss": 0.84508198, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.36645508, "step": 1183, "time_per_iteration": 2.9735240936279297 }, { "auxiliary_loss_clip": 0.01723271, "auxiliary_loss_mlp": 0.01068223, "balance_loss_clip": 1.43319452, "balance_loss_mlp": 1.03281784, "epoch": 0.0711859311588757, "flos": 31881523127040.0, "grad_norm": 2.063488622546912, "language_loss": 0.81384939, "learning_rate": 3.982233308024204e-06, "loss": 0.84176433, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 2.90234375, "router_z_loss_mlp": 0.35400391, "step": 1184, "time_per_iteration": 2.956021547317505 }, { "auxiliary_loss_clip": 0.01694805, "auxiliary_loss_mlp": 0.01071256, "balance_loss_clip": 1.41919231, "balance_loss_mlp": 1.03463459, "epoch": 0.07124605441154366, "flos": 19619935493760.0, "grad_norm": 1.7983909342733337, "language_loss": 0.78965789, "learning_rate": 3.98218147382666e-06, "loss": 0.8173185, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.36621094, "step": 1185, "time_per_iteration": 2.9469523429870605 }, { "auxiliary_loss_clip": 0.01713783, "auxiliary_loss_mlp": 0.01082376, "balance_loss_clip": 1.43160462, "balance_loss_mlp": 1.04782891, "epoch": 0.07130617766421163, "flos": 14692715976960.0, "grad_norm": 2.4884418341115904, "language_loss": 0.68930507, "learning_rate": 3.982129564464596e-06, "loss": 0.71726662, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.34521484, "step": 1186, "time_per_iteration": 2.8512160778045654 }, { "auxiliary_loss_clip": 0.01691548, "auxiliary_loss_mlp": 0.01069641, "balance_loss_clip": 1.41857743, "balance_loss_mlp": 1.02934873, "epoch": 0.07136630091687961, "flos": 26079112581120.0, "grad_norm": 1.8787388922612374, "language_loss": 0.71592003, "learning_rate": 3.98207757993998e-06, "loss": 0.74353194, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.40332031, "step": 1187, "time_per_iteration": 3.004605770111084 }, { "auxiliary_loss_clip": 0.01719162, "auxiliary_loss_mlp": 0.01071555, "balance_loss_clip": 1.44113469, "balance_loss_mlp": 1.03860593, "epoch": 0.07142642416954757, "flos": 15677616942720.0, "grad_norm": 2.409841600979247, "language_loss": 0.80428207, "learning_rate": 3.9820255202547845e-06, "loss": 0.83218926, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.32910156, "step": 1188, "time_per_iteration": 2.9323060512542725 }, { "auxiliary_loss_clip": 0.01712766, "auxiliary_loss_mlp": 0.01071645, "balance_loss_clip": 1.43560302, "balance_loss_mlp": 1.03690767, "epoch": 0.07148654742221554, "flos": 19764737412480.0, "grad_norm": 1.9420417860906642, "language_loss": 0.86858535, "learning_rate": 3.981973385410981e-06, "loss": 0.89642942, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.34741211, "step": 1189, "time_per_iteration": 2.942032814025879 }, { "auxiliary_loss_clip": 0.0171633, "auxiliary_loss_mlp": 0.01069603, "balance_loss_clip": 1.43951368, "balance_loss_mlp": 1.03350639, "epoch": 0.07154667067488352, "flos": 23480894615040.0, "grad_norm": 2.8304067860425843, "language_loss": 0.78945482, "learning_rate": 3.9819211754105494e-06, "loss": 0.81731415, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 2.76953125, "router_z_loss_mlp": 0.36108398, "step": 1190, "time_per_iteration": 2.926969289779663 }, { "auxiliary_loss_clip": 0.0173779, "auxiliary_loss_mlp": 0.01074461, "balance_loss_clip": 1.44896078, "balance_loss_mlp": 1.03507435, "epoch": 0.07160679392755148, "flos": 18342127820160.0, "grad_norm": 2.0474937790828425, "language_loss": 0.77704114, "learning_rate": 3.981868890255468e-06, "loss": 0.80516362, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.39404297, "step": 1191, "time_per_iteration": 2.903620719909668 }, { "auxiliary_loss_clip": 0.0172124, "auxiliary_loss_mlp": 0.01080661, "balance_loss_clip": 1.43466735, "balance_loss_mlp": 1.04582763, "epoch": 0.07166691718021945, "flos": 17755499998080.0, "grad_norm": 3.31795051111562, "language_loss": 0.76172864, "learning_rate": 3.981816529947719e-06, "loss": 0.78974771, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.34814453, "step": 1192, "time_per_iteration": 2.9529917240142822 }, { "auxiliary_loss_clip": 0.01727446, "auxiliary_loss_mlp": 0.01078597, "balance_loss_clip": 1.44257212, "balance_loss_mlp": 1.04450274, "epoch": 0.07172704043288743, "flos": 22461173136000.0, "grad_norm": 1.9096970835370717, "language_loss": 0.79305339, "learning_rate": 3.9817640944892896e-06, "loss": 0.82111388, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 2.84960938, "router_z_loss_mlp": 0.34082031, "step": 1193, "time_per_iteration": 2.911536455154419 }, { "auxiliary_loss_clip": 0.01737895, "auxiliary_loss_mlp": 0.01079276, "balance_loss_clip": 1.45515561, "balance_loss_mlp": 1.04227328, "epoch": 0.07178716368555539, "flos": 23232717031680.0, "grad_norm": 1.837871044667439, "language_loss": 0.87734944, "learning_rate": 3.981711583882166e-06, "loss": 0.90552115, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.37011719, "step": 1194, "time_per_iteration": 2.9038984775543213 }, { "auxiliary_loss_clip": 0.01741711, "auxiliary_loss_mlp": 0.01082663, "balance_loss_clip": 1.45766115, "balance_loss_mlp": 1.04728198, "epoch": 0.07184728693822336, "flos": 25160504526720.0, "grad_norm": 1.8814635220373335, "language_loss": 0.82097638, "learning_rate": 3.981658998128341e-06, "loss": 0.84922016, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.35375977, "step": 1195, "time_per_iteration": 2.9334497451782227 }, { "auxiliary_loss_clip": 0.01745805, "auxiliary_loss_mlp": 0.01077178, "balance_loss_clip": 1.46009898, "balance_loss_mlp": 1.04487276, "epoch": 0.07190741019089132, "flos": 22721566815360.0, "grad_norm": 1.867701780210213, "language_loss": 0.80789912, "learning_rate": 3.981606337229808e-06, "loss": 0.83612895, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.32299805, "step": 1196, "time_per_iteration": 2.9574496746063232 }, { "auxiliary_loss_clip": 0.01728089, "auxiliary_loss_mlp": 0.01082063, "balance_loss_clip": 1.44284189, "balance_loss_mlp": 1.04713452, "epoch": 0.0719675334435593, "flos": 29361361720320.0, "grad_norm": 2.4457021167157698, "language_loss": 0.73589826, "learning_rate": 3.9815536011885655e-06, "loss": 0.7639997, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.34936523, "step": 1197, "time_per_iteration": 3.030885934829712 }, { "auxiliary_loss_clip": 0.01739424, "auxiliary_loss_mlp": 0.01078717, "balance_loss_clip": 1.45779407, "balance_loss_mlp": 1.04459929, "epoch": 0.07202765669622727, "flos": 17648821463040.0, "grad_norm": 2.001110647556366, "language_loss": 0.87090337, "learning_rate": 3.98150079000661e-06, "loss": 0.89908481, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.34106445, "step": 1198, "time_per_iteration": 4.424349784851074 }, { "auxiliary_loss_clip": 0.01751791, "auxiliary_loss_mlp": 0.01092928, "balance_loss_clip": 1.46431303, "balance_loss_mlp": 1.05928731, "epoch": 0.07208777994889523, "flos": 21443985365760.0, "grad_norm": 2.0261192201506373, "language_loss": 0.84525955, "learning_rate": 3.981447903685947e-06, "loss": 0.8737067, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 2.87109375, "router_z_loss_mlp": 0.33642578, "step": 1199, "time_per_iteration": 2.9546959400177 }, { "auxiliary_loss_clip": 0.0174724, "auxiliary_loss_mlp": 0.01086107, "balance_loss_clip": 1.46095288, "balance_loss_mlp": 1.05449295, "epoch": 0.07214790320156321, "flos": 26951588922240.0, "grad_norm": 2.162753263759689, "language_loss": 0.77799565, "learning_rate": 3.981394942228581e-06, "loss": 0.80632913, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.31591797, "step": 1200, "time_per_iteration": 3.147609233856201 }, { "auxiliary_loss_clip": 0.01758858, "auxiliary_loss_mlp": 0.01094145, "balance_loss_clip": 1.47000027, "balance_loss_mlp": 1.05835867, "epoch": 0.07220802645423118, "flos": 23890886161920.0, "grad_norm": 2.3226840472419927, "language_loss": 0.84240931, "learning_rate": 3.98134190563652e-06, "loss": 0.87093937, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 2.88671875, "router_z_loss_mlp": 0.3581543, "step": 1201, "time_per_iteration": 2.89512038230896 }, { "auxiliary_loss_clip": 0.01743841, "auxiliary_loss_mlp": 0.01088443, "balance_loss_clip": 1.45149279, "balance_loss_mlp": 1.04929507, "epoch": 0.07226814970689914, "flos": 19252682300160.0, "grad_norm": 1.917896726347435, "language_loss": 0.70590383, "learning_rate": 3.981288793911775e-06, "loss": 0.7342267, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 2.92773438, "router_z_loss_mlp": 0.39135742, "step": 1202, "time_per_iteration": 2.9453768730163574 }, { "auxiliary_loss_clip": 0.01739272, "auxiliary_loss_mlp": 0.01071937, "balance_loss_clip": 1.45604241, "balance_loss_mlp": 1.0371753, "epoch": 0.07232827295956712, "flos": 19181412460800.0, "grad_norm": 2.0618424164882208, "language_loss": 0.89387202, "learning_rate": 3.98123560705636e-06, "loss": 0.92198414, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.34790039, "step": 1203, "time_per_iteration": 4.4452478885650635 }, { "auxiliary_loss_clip": 0.01767333, "auxiliary_loss_mlp": 0.01087884, "balance_loss_clip": 1.47615254, "balance_loss_mlp": 1.05254996, "epoch": 0.07238839621223508, "flos": 17648776218240.0, "grad_norm": 1.686556145630741, "language_loss": 0.80710328, "learning_rate": 3.981182345072293e-06, "loss": 0.83565545, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 2.91210938, "router_z_loss_mlp": 0.35375977, "step": 1204, "time_per_iteration": 4.305245637893677 }, { "auxiliary_loss_clip": 0.01737967, "auxiliary_loss_mlp": 0.01097466, "balance_loss_clip": 1.45582771, "balance_loss_mlp": 1.0603447, "epoch": 0.07244851946490305, "flos": 28303878816000.0, "grad_norm": 1.605170742336957, "language_loss": 0.83086634, "learning_rate": 3.981129007961593e-06, "loss": 0.85922068, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.37109375, "step": 1205, "time_per_iteration": 4.344234943389893 }, { "auxiliary_loss_clip": 0.0175286, "auxiliary_loss_mlp": 0.01081941, "balance_loss_clip": 1.46489418, "balance_loss_mlp": 1.04536748, "epoch": 0.07250864271757101, "flos": 22575000349440.0, "grad_norm": 1.8510455252659823, "language_loss": 0.78052974, "learning_rate": 3.981075595726283e-06, "loss": 0.80887783, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.36572266, "step": 1206, "time_per_iteration": 2.927222728729248 }, { "auxiliary_loss_clip": 0.0173858, "auxiliary_loss_mlp": 0.01071468, "balance_loss_clip": 1.45540977, "balance_loss_mlp": 1.03475118, "epoch": 0.072568765970239, "flos": 21772436503680.0, "grad_norm": 2.0375811785618083, "language_loss": 0.79009116, "learning_rate": 3.981022108368387e-06, "loss": 0.81819159, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.3671875, "step": 1207, "time_per_iteration": 2.98776912689209 }, { "auxiliary_loss_clip": 0.01725553, "auxiliary_loss_mlp": 0.01074397, "balance_loss_clip": 1.44766903, "balance_loss_mlp": 1.03863418, "epoch": 0.07262888922290696, "flos": 25530517653120.0, "grad_norm": 1.6976473278794113, "language_loss": 0.81308889, "learning_rate": 3.9809685458899345e-06, "loss": 0.84108835, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.35766602, "step": 1208, "time_per_iteration": 2.9281349182128906 }, { "auxiliary_loss_clip": 0.01737577, "auxiliary_loss_mlp": 0.01074648, "balance_loss_clip": 1.45709348, "balance_loss_mlp": 1.04129362, "epoch": 0.07268901247557492, "flos": 21255132994560.0, "grad_norm": 1.810228870763851, "language_loss": 0.79778206, "learning_rate": 3.980914908292955e-06, "loss": 0.82590431, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.33374023, "step": 1209, "time_per_iteration": 2.9338574409484863 }, { "auxiliary_loss_clip": 0.01752962, "auxiliary_loss_mlp": 0.01065357, "balance_loss_clip": 1.46704757, "balance_loss_mlp": 1.0298562, "epoch": 0.0727491357282429, "flos": 25489091399040.0, "grad_norm": 2.3990415635268616, "language_loss": 0.83185416, "learning_rate": 3.980861195579486e-06, "loss": 0.86003739, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.35498047, "step": 1210, "time_per_iteration": 3.1589083671569824 }, { "auxiliary_loss_clip": 0.01739507, "auxiliary_loss_mlp": 0.01082434, "balance_loss_clip": 1.45913279, "balance_loss_mlp": 1.0426662, "epoch": 0.07280925898091087, "flos": 24472853769600.0, "grad_norm": 1.6588875731584751, "language_loss": 0.86237931, "learning_rate": 3.98080740775156e-06, "loss": 0.89059877, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.3972168, "step": 1211, "time_per_iteration": 2.910682201385498 }, { "auxiliary_loss_clip": 0.01737098, "auxiliary_loss_mlp": 0.01066662, "balance_loss_clip": 1.45601916, "balance_loss_mlp": 1.02939737, "epoch": 0.07286938223357883, "flos": 18295226945280.0, "grad_norm": 2.1028432800332655, "language_loss": 0.93129295, "learning_rate": 3.98075354481122e-06, "loss": 0.95933056, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.37280273, "step": 1212, "time_per_iteration": 2.9579455852508545 }, { "auxiliary_loss_clip": 0.0173523, "auxiliary_loss_mlp": 0.01061275, "balance_loss_clip": 1.45369434, "balance_loss_mlp": 1.02501178, "epoch": 0.07292950548624681, "flos": 21224610737280.0, "grad_norm": 1.7221741431888318, "language_loss": 0.74217641, "learning_rate": 3.9806996067605055e-06, "loss": 0.77014148, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.36206055, "step": 1213, "time_per_iteration": 2.8672401905059814 }, { "auxiliary_loss_clip": 0.0174543, "auxiliary_loss_mlp": 0.01070239, "balance_loss_clip": 1.4595561, "balance_loss_mlp": 1.03171074, "epoch": 0.07298962873891478, "flos": 24652068998400.0, "grad_norm": 1.6503147979080937, "language_loss": 0.86349249, "learning_rate": 3.980645593601465e-06, "loss": 0.89164913, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 2.85742188, "router_z_loss_mlp": 0.38525391, "step": 1214, "time_per_iteration": 2.940476417541504 }, { "auxiliary_loss_clip": 0.0174297, "auxiliary_loss_mlp": 0.0107418, "balance_loss_clip": 1.46085334, "balance_loss_mlp": 1.0365572, "epoch": 0.07304975199158274, "flos": 27064149281280.0, "grad_norm": 2.9535399495248567, "language_loss": 0.86468691, "learning_rate": 3.980591505336144e-06, "loss": 0.89285845, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.3762207, "step": 1215, "time_per_iteration": 2.998314619064331 }, { "auxiliary_loss_clip": 0.01734469, "auxiliary_loss_mlp": 0.0106546, "balance_loss_clip": 1.44905901, "balance_loss_mlp": 1.02716994, "epoch": 0.07310987524425071, "flos": 33563214299520.0, "grad_norm": 1.6807280439957868, "language_loss": 0.83393174, "learning_rate": 3.980537341966595e-06, "loss": 0.86193103, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 2.8515625, "router_z_loss_mlp": 0.38330078, "step": 1216, "time_per_iteration": 2.9751698970794678 }, { "auxiliary_loss_clip": 0.01765809, "auxiliary_loss_mlp": 0.01065848, "balance_loss_clip": 1.47734332, "balance_loss_mlp": 1.02829695, "epoch": 0.07316999849691869, "flos": 28122627571200.0, "grad_norm": 1.9834005684413027, "language_loss": 0.78346801, "learning_rate": 3.980483103494872e-06, "loss": 0.81178463, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.37573242, "step": 1217, "time_per_iteration": 2.966101884841919 }, { "auxiliary_loss_clip": 0.01742526, "auxiliary_loss_mlp": 0.01074209, "balance_loss_clip": 1.46050537, "balance_loss_mlp": 1.03706324, "epoch": 0.07323012174958665, "flos": 14400804654720.0, "grad_norm": 1.8600213996681079, "language_loss": 0.87973988, "learning_rate": 3.98042878992303e-06, "loss": 0.90790719, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.37133789, "step": 1218, "time_per_iteration": 2.865882396697998 }, { "auxiliary_loss_clip": 0.01733777, "auxiliary_loss_mlp": 0.01067971, "balance_loss_clip": 1.45329738, "balance_loss_mlp": 1.02960896, "epoch": 0.07329024500225462, "flos": 21626412975360.0, "grad_norm": 1.9651097117463758, "language_loss": 0.88450789, "learning_rate": 3.9803744012531305e-06, "loss": 0.9125253, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.3840332, "step": 1219, "time_per_iteration": 2.907116651535034 }, { "auxiliary_loss_clip": 0.01736842, "auxiliary_loss_mlp": 0.0107386, "balance_loss_clip": 1.4583354, "balance_loss_mlp": 1.03564191, "epoch": 0.0733503682549226, "flos": 13231802021760.0, "grad_norm": 1.9035129675019191, "language_loss": 0.87007231, "learning_rate": 3.980319937487235e-06, "loss": 0.89817929, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.38232422, "step": 1220, "time_per_iteration": 2.8809688091278076 }, { "auxiliary_loss_clip": 0.01744602, "auxiliary_loss_mlp": 0.01074062, "balance_loss_clip": 1.46072721, "balance_loss_mlp": 1.0339129, "epoch": 0.07341049150759056, "flos": 20896838271360.0, "grad_norm": 2.0663208744488446, "language_loss": 0.8005898, "learning_rate": 3.98026539862741e-06, "loss": 0.82877648, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 2.83789062, "router_z_loss_mlp": 0.40112305, "step": 1221, "time_per_iteration": 2.8785417079925537 }, { "auxiliary_loss_clip": 0.01738903, "auxiliary_loss_mlp": 0.0106669, "balance_loss_clip": 1.45837414, "balance_loss_mlp": 1.02961588, "epoch": 0.07347061476025853, "flos": 15421928722560.0, "grad_norm": 1.8746087726504528, "language_loss": 0.93722951, "learning_rate": 3.980210784675722e-06, "loss": 0.96528542, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.37109375, "step": 1222, "time_per_iteration": 2.96284556388855 }, { "auxiliary_loss_clip": 0.01739344, "auxiliary_loss_mlp": 0.01069028, "balance_loss_clip": 1.45512319, "balance_loss_mlp": 1.02928352, "epoch": 0.0735307380129265, "flos": 11116202785920.0, "grad_norm": 2.9634330438431626, "language_loss": 0.9308179, "learning_rate": 3.980156095634242e-06, "loss": 0.95890158, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.3972168, "step": 1223, "time_per_iteration": 2.846430540084839 }, { "auxiliary_loss_clip": 0.01750781, "auxiliary_loss_mlp": 0.01075169, "balance_loss_clip": 1.46650648, "balance_loss_mlp": 1.03673625, "epoch": 0.07359086126559447, "flos": 23742871862400.0, "grad_norm": 2.413643272568145, "language_loss": 0.83740896, "learning_rate": 3.980101331505045e-06, "loss": 0.86566842, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.3840332, "step": 1224, "time_per_iteration": 3.008087396621704 }, { "auxiliary_loss_clip": 0.01734798, "auxiliary_loss_mlp": 0.01075699, "balance_loss_clip": 1.45144153, "balance_loss_mlp": 1.03512013, "epoch": 0.07365098451826244, "flos": 21002340441600.0, "grad_norm": 2.083858182567427, "language_loss": 0.85000324, "learning_rate": 3.9800464922902076e-06, "loss": 0.87810826, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.40600586, "step": 1225, "time_per_iteration": 3.069096326828003 }, { "auxiliary_loss_clip": 0.01737809, "auxiliary_loss_mlp": 0.01069145, "balance_loss_clip": 1.45951688, "balance_loss_mlp": 1.03021157, "epoch": 0.0737111077709304, "flos": 19941961870080.0, "grad_norm": 2.0975455494419983, "language_loss": 0.92384607, "learning_rate": 3.979991577991808e-06, "loss": 0.95191562, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.38916016, "step": 1226, "time_per_iteration": 2.9532361030578613 }, { "auxiliary_loss_clip": 0.01782014, "auxiliary_loss_mlp": 0.01073972, "balance_loss_clip": 1.48533571, "balance_loss_mlp": 1.03451407, "epoch": 0.07377123102359838, "flos": 16590252683520.0, "grad_norm": 2.5088905806470616, "language_loss": 0.79429889, "learning_rate": 3.97993658861193e-06, "loss": 0.82285869, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 2.96289062, "router_z_loss_mlp": 0.39453125, "step": 1227, "time_per_iteration": 3.056983470916748 }, { "auxiliary_loss_clip": 0.01740669, "auxiliary_loss_mlp": 0.01070421, "balance_loss_clip": 1.46197307, "balance_loss_mlp": 1.03000879, "epoch": 0.07383135427626634, "flos": 28339966183680.0, "grad_norm": 2.169365393380425, "language_loss": 0.86905336, "learning_rate": 3.9798815241526575e-06, "loss": 0.89716423, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.40332031, "step": 1228, "time_per_iteration": 2.9955191612243652 }, { "auxiliary_loss_clip": 0.01748338, "auxiliary_loss_mlp": 0.01073462, "balance_loss_clip": 1.46644998, "balance_loss_mlp": 1.03042769, "epoch": 0.07389147752893431, "flos": 20056648734720.0, "grad_norm": 4.268344981787953, "language_loss": 0.81027192, "learning_rate": 3.97982638461608e-06, "loss": 0.83848989, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.4296875, "step": 1229, "time_per_iteration": 2.953364133834839 }, { "auxiliary_loss_clip": 0.01758347, "auxiliary_loss_mlp": 0.01074346, "balance_loss_clip": 1.47193432, "balance_loss_mlp": 1.03350449, "epoch": 0.07395160078160229, "flos": 18123296129280.0, "grad_norm": 1.9638302560810479, "language_loss": 0.80922222, "learning_rate": 3.979771170004287e-06, "loss": 0.83754921, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.40844727, "step": 1230, "time_per_iteration": 2.9035606384277344 }, { "auxiliary_loss_clip": 0.01734878, "auxiliary_loss_mlp": 0.01070906, "balance_loss_clip": 1.457165, "balance_loss_mlp": 1.03197217, "epoch": 0.07401172403427025, "flos": 23597481761280.0, "grad_norm": 2.239313452335804, "language_loss": 0.83430797, "learning_rate": 3.979715880319372e-06, "loss": 0.86236578, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.3894043, "step": 1231, "time_per_iteration": 2.9736645221710205 }, { "auxiliary_loss_clip": 0.01749904, "auxiliary_loss_mlp": 0.01075794, "balance_loss_clip": 1.46252811, "balance_loss_mlp": 1.0319488, "epoch": 0.07407184728693822, "flos": 26371204882560.0, "grad_norm": 2.121473087369066, "language_loss": 0.97927123, "learning_rate": 3.979660515563434e-06, "loss": 1.00752819, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.43823242, "step": 1232, "time_per_iteration": 3.0363388061523438 }, { "auxiliary_loss_clip": 0.01740952, "auxiliary_loss_mlp": 0.01071942, "balance_loss_clip": 1.46368349, "balance_loss_mlp": 1.03391445, "epoch": 0.0741319705396062, "flos": 22210642823040.0, "grad_norm": 2.9429534614298607, "language_loss": 0.83075738, "learning_rate": 3.979605075738569e-06, "loss": 0.85888624, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.37988281, "step": 1233, "time_per_iteration": 4.419411897659302 }, { "auxiliary_loss_clip": 0.01753128, "auxiliary_loss_mlp": 0.01077026, "balance_loss_clip": 1.46535671, "balance_loss_mlp": 1.03539777, "epoch": 0.07419209379227416, "flos": 39214488654720.0, "grad_norm": 2.1884629755441427, "language_loss": 0.72009736, "learning_rate": 3.979549560846883e-06, "loss": 0.7483989, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.41625977, "step": 1234, "time_per_iteration": 3.118805170059204 }, { "auxiliary_loss_clip": 0.01753646, "auxiliary_loss_mlp": 0.01077283, "balance_loss_clip": 1.47166014, "balance_loss_mlp": 1.03815866, "epoch": 0.07425221704494213, "flos": 22791343576320.0, "grad_norm": 2.196036768049439, "language_loss": 0.78298187, "learning_rate": 3.979493970890478e-06, "loss": 0.81129116, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.39135742, "step": 1235, "time_per_iteration": 2.968195915222168 }, { "auxiliary_loss_clip": 0.01744384, "auxiliary_loss_mlp": 0.01062963, "balance_loss_clip": 1.4673667, "balance_loss_mlp": 1.02541161, "epoch": 0.0743123402976101, "flos": 22283089027200.0, "grad_norm": 2.043937499821166, "language_loss": 0.84789968, "learning_rate": 3.979438305871464e-06, "loss": 0.87597317, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 2.76757812, "router_z_loss_mlp": 0.37548828, "step": 1236, "time_per_iteration": 2.902275562286377 }, { "auxiliary_loss_clip": 0.01759652, "auxiliary_loss_mlp": 0.01068365, "balance_loss_clip": 1.47450292, "balance_loss_mlp": 1.02890694, "epoch": 0.07437246355027807, "flos": 29327038899840.0, "grad_norm": 2.538933912720982, "language_loss": 0.77946121, "learning_rate": 3.979382565791951e-06, "loss": 0.8077414, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 2.85351562, "router_z_loss_mlp": 0.39453125, "step": 1237, "time_per_iteration": 2.94968318939209 }, { "auxiliary_loss_clip": 0.01752334, "auxiliary_loss_mlp": 0.0106996, "balance_loss_clip": 1.47003424, "balance_loss_mlp": 1.032552, "epoch": 0.07443258680294604, "flos": 31958131852800.0, "grad_norm": 1.7857787820615114, "language_loss": 0.78460157, "learning_rate": 3.979326750654053e-06, "loss": 0.81282449, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.37402344, "step": 1238, "time_per_iteration": 4.414613962173462 }, { "auxiliary_loss_clip": 0.01770746, "auxiliary_loss_mlp": 0.01069184, "balance_loss_clip": 1.48345912, "balance_loss_mlp": 1.02784216, "epoch": 0.074492710055614, "flos": 22685750916480.0, "grad_norm": 2.0565679634515157, "language_loss": 0.88199615, "learning_rate": 3.9792708604598854e-06, "loss": 0.9103955, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 2.87304688, "router_z_loss_mlp": 0.41308594, "step": 1239, "time_per_iteration": 5.773285627365112 }, { "auxiliary_loss_clip": 0.0176868, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.48096228, "balance_loss_mlp": 1.0236814, "epoch": 0.07455283330828198, "flos": 21293980295040.0, "grad_norm": 4.184757193333043, "language_loss": 0.90643001, "learning_rate": 3.979214895211569e-06, "loss": 0.9347527, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.39868164, "step": 1240, "time_per_iteration": 2.9377450942993164 }, { "auxiliary_loss_clip": 0.01760012, "auxiliary_loss_mlp": 0.01071419, "balance_loss_clip": 1.47619224, "balance_loss_mlp": 1.03012514, "epoch": 0.07461295656094995, "flos": 24398869242240.0, "grad_norm": 2.3221089535548267, "language_loss": 0.90357721, "learning_rate": 3.979158854911225e-06, "loss": 0.93189156, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.4128418, "step": 1241, "time_per_iteration": 2.914361000061035 }, { "auxiliary_loss_clip": 0.01471498, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.29809165, "balance_loss_mlp": 1.00883317, "epoch": 0.07467307981361791, "flos": 62138074078080.0, "grad_norm": 0.9091378248313787, "language_loss": 0.6314882, "learning_rate": 3.979102739560979e-06, "loss": 0.6565299, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.23828125, "step": 1242, "time_per_iteration": 3.4046709537506104 }, { "auxiliary_loss_clip": 0.01804198, "auxiliary_loss_mlp": 0.01076023, "balance_loss_clip": 1.5040946, "balance_loss_mlp": 1.03546774, "epoch": 0.07473320306628589, "flos": 24873841601280.0, "grad_norm": 2.2085478184233978, "language_loss": 0.64503086, "learning_rate": 3.9790465491629595e-06, "loss": 0.67383301, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 3.0, "router_z_loss_mlp": 0.4050293, "step": 1243, "time_per_iteration": 2.9248616695404053 }, { "auxiliary_loss_clip": 0.0176037, "auxiliary_loss_mlp": 0.01082033, "balance_loss_clip": 1.47819281, "balance_loss_mlp": 1.0439105, "epoch": 0.07479332631895386, "flos": 24907576239360.0, "grad_norm": 1.848427621321491, "language_loss": 0.78092623, "learning_rate": 3.978990283719296e-06, "loss": 0.80935025, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.38110352, "step": 1244, "time_per_iteration": 2.9813613891601562 }, { "auxiliary_loss_clip": 0.01761704, "auxiliary_loss_mlp": 0.01068048, "balance_loss_clip": 1.47839069, "balance_loss_mlp": 1.03083086, "epoch": 0.07485344957162182, "flos": 17822516826240.0, "grad_norm": 2.768903890669495, "language_loss": 0.71654117, "learning_rate": 3.978933943232123e-06, "loss": 0.74483871, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.37207031, "step": 1245, "time_per_iteration": 2.8912391662597656 }, { "auxiliary_loss_clip": 0.01757208, "auxiliary_loss_mlp": 0.01073239, "balance_loss_clip": 1.47419667, "balance_loss_mlp": 1.036165, "epoch": 0.0749135728242898, "flos": 25021222473600.0, "grad_norm": 2.106114171054669, "language_loss": 0.89950359, "learning_rate": 3.978877527703576e-06, "loss": 0.92780805, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.37109375, "step": 1246, "time_per_iteration": 2.88694429397583 }, { "auxiliary_loss_clip": 0.01799153, "auxiliary_loss_mlp": 0.01087021, "balance_loss_clip": 1.49760079, "balance_loss_mlp": 1.04429698, "epoch": 0.07497369607695777, "flos": 17831113338240.0, "grad_norm": 2.5269229169624863, "language_loss": 0.91223234, "learning_rate": 3.9788210371357945e-06, "loss": 0.94109404, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 3.01367188, "router_z_loss_mlp": 0.42700195, "step": 1247, "time_per_iteration": 2.894378185272217 }, { "auxiliary_loss_clip": 0.01755564, "auxiliary_loss_mlp": 0.01080991, "balance_loss_clip": 1.47535264, "balance_loss_mlp": 1.04398882, "epoch": 0.07503381932962573, "flos": 15129157749120.0, "grad_norm": 2.722098938491771, "language_loss": 0.66454566, "learning_rate": 3.978764471530921e-06, "loss": 0.69291121, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.36987305, "step": 1248, "time_per_iteration": 2.887066125869751 }, { "auxiliary_loss_clip": 0.01729293, "auxiliary_loss_mlp": 0.01079229, "balance_loss_clip": 1.45595884, "balance_loss_mlp": 1.04189301, "epoch": 0.0750939425822937, "flos": 12822715370880.0, "grad_norm": 2.2795683105930493, "language_loss": 0.75825602, "learning_rate": 3.978707830891102e-06, "loss": 0.78634125, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.37353516, "step": 1249, "time_per_iteration": 2.9188787937164307 }, { "auxiliary_loss_clip": 0.01769071, "auxiliary_loss_mlp": 0.01087395, "balance_loss_clip": 1.47671127, "balance_loss_mlp": 1.05127466, "epoch": 0.07515406583496168, "flos": 24217482263040.0, "grad_norm": 2.630988987744165, "language_loss": 0.84280241, "learning_rate": 3.978651115218482e-06, "loss": 0.87136704, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 2.91796875, "router_z_loss_mlp": 0.36132812, "step": 1250, "time_per_iteration": 2.8844637870788574 }, { "auxiliary_loss_clip": 0.01753261, "auxiliary_loss_mlp": 0.01078826, "balance_loss_clip": 1.47415257, "balance_loss_mlp": 1.04094076, "epoch": 0.07521418908762964, "flos": 26699520286080.0, "grad_norm": 2.229562137137637, "language_loss": 0.69622493, "learning_rate": 3.978594324515215e-06, "loss": 0.72454578, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.37866211, "step": 1251, "time_per_iteration": 3.007110834121704 }, { "auxiliary_loss_clip": 0.01461099, "auxiliary_loss_mlp": 0.01108392, "balance_loss_clip": 1.29167378, "balance_loss_mlp": 1.05956399, "epoch": 0.0752743123402976, "flos": 59126353453440.0, "grad_norm": 0.9354610690679419, "language_loss": 0.70639718, "learning_rate": 3.9785374587834515e-06, "loss": 0.73209214, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.48828125, "step": 1252, "time_per_iteration": 3.478961944580078 }, { "auxiliary_loss_clip": 0.01749889, "auxiliary_loss_mlp": 0.01084817, "balance_loss_clip": 1.46743941, "balance_loss_mlp": 1.0469805, "epoch": 0.07533443559296558, "flos": 23487455111040.0, "grad_norm": 2.2850336638375666, "language_loss": 0.8139683, "learning_rate": 3.97848051802535e-06, "loss": 0.84231532, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.37841797, "step": 1253, "time_per_iteration": 2.8962042331695557 }, { "auxiliary_loss_clip": 0.01742657, "auxiliary_loss_mlp": 0.01069823, "balance_loss_clip": 1.4563272, "balance_loss_mlp": 1.03594351, "epoch": 0.07539455884563355, "flos": 20886929660160.0, "grad_norm": 2.3159823642241304, "language_loss": 0.96008432, "learning_rate": 3.978423502243069e-06, "loss": 0.98820925, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 2.859375, "router_z_loss_mlp": 0.33911133, "step": 1254, "time_per_iteration": 2.885986566543579 }, { "auxiliary_loss_clip": 0.01736942, "auxiliary_loss_mlp": 0.01074947, "balance_loss_clip": 1.4607358, "balance_loss_mlp": 1.03713417, "epoch": 0.07545468209830151, "flos": 27684421251840.0, "grad_norm": 1.9277262708035223, "language_loss": 0.8913976, "learning_rate": 3.97836641143877e-06, "loss": 0.91951644, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.37792969, "step": 1255, "time_per_iteration": 2.951539993286133 }, { "auxiliary_loss_clip": 0.01739936, "auxiliary_loss_mlp": 0.0107543, "balance_loss_clip": 1.4618721, "balance_loss_mlp": 1.03821325, "epoch": 0.0755148053509695, "flos": 14145071189760.0, "grad_norm": 1.7257814108298783, "language_loss": 0.80488253, "learning_rate": 3.978309245614618e-06, "loss": 0.83303618, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.37182617, "step": 1256, "time_per_iteration": 2.865190267562866 }, { "auxiliary_loss_clip": 0.01459123, "auxiliary_loss_mlp": 0.01046295, "balance_loss_clip": 1.2852397, "balance_loss_mlp": 1.00299823, "epoch": 0.07557492860363746, "flos": 58260872073600.0, "grad_norm": 0.7844914671057381, "language_loss": 0.58185482, "learning_rate": 3.9782520047727825e-06, "loss": 0.60690892, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.43359375, "step": 1257, "time_per_iteration": 3.495898485183716 }, { "auxiliary_loss_clip": 0.01757087, "auxiliary_loss_mlp": 0.01084338, "balance_loss_clip": 1.47047102, "balance_loss_mlp": 1.04228067, "epoch": 0.07563505185630542, "flos": 24655055155200.0, "grad_norm": 2.47743322287645, "language_loss": 0.91487241, "learning_rate": 3.978194688915432e-06, "loss": 0.94328666, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.4206543, "step": 1258, "time_per_iteration": 2.9163155555725098 }, { "auxiliary_loss_clip": 0.01718934, "auxiliary_loss_mlp": 0.01070764, "balance_loss_clip": 1.45128655, "balance_loss_mlp": 1.03433347, "epoch": 0.07569517510897339, "flos": 15531321945600.0, "grad_norm": 1.9483982590610465, "language_loss": 0.82698214, "learning_rate": 3.978137298044741e-06, "loss": 0.85487908, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.36425781, "step": 1259, "time_per_iteration": 2.9140992164611816 }, { "auxiliary_loss_clip": 0.01746552, "auxiliary_loss_mlp": 0.01069834, "balance_loss_clip": 1.46700418, "balance_loss_mlp": 1.03516769, "epoch": 0.07575529836164137, "flos": 22938498224640.0, "grad_norm": 1.7480143636119096, "language_loss": 0.77179372, "learning_rate": 3.978079832162885e-06, "loss": 0.79995763, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.34667969, "step": 1260, "time_per_iteration": 2.9436309337615967 }, { "auxiliary_loss_clip": 0.01740319, "auxiliary_loss_mlp": 0.0107198, "balance_loss_clip": 1.46633804, "balance_loss_mlp": 1.03478646, "epoch": 0.07581542161430933, "flos": 19509908843520.0, "grad_norm": 1.7031594956235663, "language_loss": 0.86320269, "learning_rate": 3.978022291272044e-06, "loss": 0.89132571, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.37182617, "step": 1261, "time_per_iteration": 2.888699769973755 }, { "auxiliary_loss_clip": 0.01733815, "auxiliary_loss_mlp": 0.01070146, "balance_loss_clip": 1.45722914, "balance_loss_mlp": 1.03543234, "epoch": 0.0758755448669773, "flos": 24984411189120.0, "grad_norm": 2.0589400484844913, "language_loss": 0.84320378, "learning_rate": 3.977964675374399e-06, "loss": 0.87124348, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.34716797, "step": 1262, "time_per_iteration": 2.9670019149780273 }, { "auxiliary_loss_clip": 0.01732578, "auxiliary_loss_mlp": 0.01080833, "balance_loss_clip": 1.45250809, "balance_loss_mlp": 1.04268634, "epoch": 0.07593566811964528, "flos": 22758378099840.0, "grad_norm": 2.6645614476911392, "language_loss": 0.84352511, "learning_rate": 3.977906984472136e-06, "loss": 0.87165928, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.3815918, "step": 1263, "time_per_iteration": 2.9595067501068115 }, { "auxiliary_loss_clip": 0.01722591, "auxiliary_loss_mlp": 0.01065261, "balance_loss_clip": 1.44611859, "balance_loss_mlp": 1.03073788, "epoch": 0.07599579137231324, "flos": 23122780871040.0, "grad_norm": 2.238079142121044, "language_loss": 0.77217853, "learning_rate": 3.977849218567442e-06, "loss": 0.80005705, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.34521484, "step": 1264, "time_per_iteration": 2.996696710586548 }, { "auxiliary_loss_clip": 0.01730685, "auxiliary_loss_mlp": 0.01065692, "balance_loss_clip": 1.45036972, "balance_loss_mlp": 1.02818918, "epoch": 0.07605591462498121, "flos": 14510288367360.0, "grad_norm": 2.343877725059831, "language_loss": 0.8303113, "learning_rate": 3.977791377662507e-06, "loss": 0.85827506, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.375, "step": 1265, "time_per_iteration": 2.968015670776367 }, { "auxiliary_loss_clip": 0.01741248, "auxiliary_loss_mlp": 0.01071394, "balance_loss_clip": 1.45523226, "balance_loss_mlp": 1.03415275, "epoch": 0.07611603787764919, "flos": 23524809333120.0, "grad_norm": 1.8358103372153176, "language_loss": 0.66649503, "learning_rate": 3.977733461759524e-06, "loss": 0.69462144, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 2.86328125, "router_z_loss_mlp": 0.37207031, "step": 1266, "time_per_iteration": 3.065746307373047 }, { "auxiliary_loss_clip": 0.0173546, "auxiliary_loss_mlp": 0.01068197, "balance_loss_clip": 1.45270658, "balance_loss_mlp": 1.03396034, "epoch": 0.07617616113031715, "flos": 21517200731520.0, "grad_norm": 2.486078464385837, "language_loss": 0.82681119, "learning_rate": 3.977675470860691e-06, "loss": 0.85484773, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.34228516, "step": 1267, "time_per_iteration": 3.2914657592773438 }, { "auxiliary_loss_clip": 0.01718345, "auxiliary_loss_mlp": 0.01060197, "balance_loss_clip": 1.44293714, "balance_loss_mlp": 1.02443433, "epoch": 0.07623628438298512, "flos": 14580924779520.0, "grad_norm": 2.053217755541146, "language_loss": 0.7529521, "learning_rate": 3.977617404968205e-06, "loss": 0.78073752, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.35766602, "step": 1268, "time_per_iteration": 4.371596097946167 }, { "auxiliary_loss_clip": 0.01716627, "auxiliary_loss_mlp": 0.01059661, "balance_loss_clip": 1.44232655, "balance_loss_mlp": 1.02416086, "epoch": 0.07629640763565308, "flos": 14728531875840.0, "grad_norm": 1.9508578651369524, "language_loss": 0.84397286, "learning_rate": 3.977559264084269e-06, "loss": 0.87173569, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.35498047, "step": 1269, "time_per_iteration": 2.8916733264923096 }, { "auxiliary_loss_clip": 0.01723385, "auxiliary_loss_mlp": 0.01065655, "balance_loss_clip": 1.4477725, "balance_loss_mlp": 1.02815223, "epoch": 0.07635653088832106, "flos": 14911140464640.0, "grad_norm": 2.1987140659948587, "language_loss": 0.91458678, "learning_rate": 3.977501048211088e-06, "loss": 0.94247723, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.375, "step": 1270, "time_per_iteration": 3.068037509918213 }, { "auxiliary_loss_clip": 0.01742019, "auxiliary_loss_mlp": 0.01070811, "balance_loss_clip": 1.46395993, "balance_loss_mlp": 1.03037524, "epoch": 0.07641665414098903, "flos": 26662889980800.0, "grad_norm": 4.5670719508048805, "language_loss": 0.73055732, "learning_rate": 3.977442757350869e-06, "loss": 0.75868565, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 2.77929688, "router_z_loss_mlp": 0.40454102, "step": 1271, "time_per_iteration": 3.007747173309326 }, { "auxiliary_loss_clip": 0.01715276, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.45187306, "balance_loss_mlp": 1.03297532, "epoch": 0.07647677739365699, "flos": 25203921552000.0, "grad_norm": 1.5208492813461223, "language_loss": 0.84197932, "learning_rate": 3.977384391505823e-06, "loss": 0.86981952, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.35717773, "step": 1272, "time_per_iteration": 2.9786059856414795 }, { "auxiliary_loss_clip": 0.01721079, "auxiliary_loss_mlp": 0.01068862, "balance_loss_clip": 1.44679165, "balance_loss_mlp": 1.03386235, "epoch": 0.07653690064632497, "flos": 20567798951040.0, "grad_norm": 1.5991624553566999, "language_loss": 0.82105732, "learning_rate": 3.977325950678162e-06, "loss": 0.8489567, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.35009766, "step": 1273, "time_per_iteration": 4.382061243057251 }, { "auxiliary_loss_clip": 0.0174976, "auxiliary_loss_mlp": 0.01075948, "balance_loss_clip": 1.46820331, "balance_loss_mlp": 1.04009008, "epoch": 0.07659702389899294, "flos": 22278474057600.0, "grad_norm": 1.6334320278034284, "language_loss": 0.8217749, "learning_rate": 3.977267434870103e-06, "loss": 0.85003197, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.35864258, "step": 1274, "time_per_iteration": 5.8802330493927 }, { "auxiliary_loss_clip": 0.01727434, "auxiliary_loss_mlp": 0.01082611, "balance_loss_clip": 1.45567441, "balance_loss_mlp": 1.0445354, "epoch": 0.0766571471516609, "flos": 32649221214720.0, "grad_norm": 1.6915330171567986, "language_loss": 0.73476589, "learning_rate": 3.977208844083865e-06, "loss": 0.76286638, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.38061523, "step": 1275, "time_per_iteration": 3.0718259811401367 }, { "auxiliary_loss_clip": 0.01724252, "auxiliary_loss_mlp": 0.01073045, "balance_loss_clip": 1.44810486, "balance_loss_mlp": 1.03759253, "epoch": 0.07671727040432888, "flos": 15275588480640.0, "grad_norm": 1.9022017864873937, "language_loss": 0.81663787, "learning_rate": 3.9771501783216685e-06, "loss": 0.84461081, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.35473633, "step": 1276, "time_per_iteration": 2.8892822265625 }, { "auxiliary_loss_clip": 0.0174845, "auxiliary_loss_mlp": 0.01076411, "balance_loss_clip": 1.46690893, "balance_loss_mlp": 1.03938472, "epoch": 0.07677739365699685, "flos": 28195842936960.0, "grad_norm": 2.2304626521020494, "language_loss": 0.61748576, "learning_rate": 3.97709143758574e-06, "loss": 0.64573443, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.37011719, "step": 1277, "time_per_iteration": 3.034935712814331 }, { "auxiliary_loss_clip": 0.01750936, "auxiliary_loss_mlp": 0.01069643, "balance_loss_clip": 1.4696182, "balance_loss_mlp": 1.03166318, "epoch": 0.07683751690966481, "flos": 18305226046080.0, "grad_norm": 2.576421489293144, "language_loss": 0.77631617, "learning_rate": 3.977032621878305e-06, "loss": 0.80452204, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.37963867, "step": 1278, "time_per_iteration": 2.8928890228271484 }, { "auxiliary_loss_clip": 0.01728835, "auxiliary_loss_mlp": 0.01073396, "balance_loss_clip": 1.45895433, "balance_loss_mlp": 1.02938437, "epoch": 0.07689764016233278, "flos": 21991132460160.0, "grad_norm": 2.0281555783937866, "language_loss": 0.89704406, "learning_rate": 3.976973731201596e-06, "loss": 0.92506635, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.43994141, "step": 1279, "time_per_iteration": 2.9805259704589844 }, { "auxiliary_loss_clip": 0.01725253, "auxiliary_loss_mlp": 0.0106869, "balance_loss_clip": 1.45405555, "balance_loss_mlp": 1.033499, "epoch": 0.07695776341500075, "flos": 22245961029120.0, "grad_norm": 2.470507607785178, "language_loss": 0.84064806, "learning_rate": 3.976914765557845e-06, "loss": 0.86858743, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.35180664, "step": 1280, "time_per_iteration": 2.976846694946289 }, { "auxiliary_loss_clip": 0.01726636, "auxiliary_loss_mlp": 0.0107868, "balance_loss_clip": 1.45712566, "balance_loss_mlp": 1.04196382, "epoch": 0.07701788666766872, "flos": 16152363077760.0, "grad_norm": 2.0441251798628914, "language_loss": 0.77052355, "learning_rate": 3.9768557249492875e-06, "loss": 0.79857671, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.36694336, "step": 1281, "time_per_iteration": 2.889867067337036 }, { "auxiliary_loss_clip": 0.01746481, "auxiliary_loss_mlp": 0.01088412, "balance_loss_clip": 1.4643898, "balance_loss_mlp": 1.05031228, "epoch": 0.07707800992033668, "flos": 19472283152640.0, "grad_norm": 1.8313120072104785, "language_loss": 0.7698704, "learning_rate": 3.9767966093781634e-06, "loss": 0.79821932, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 2.82617188, "router_z_loss_mlp": 0.38134766, "step": 1282, "time_per_iteration": 2.919570207595825 }, { "auxiliary_loss_clip": 0.01738537, "auxiliary_loss_mlp": 0.01085777, "balance_loss_clip": 1.4654814, "balance_loss_mlp": 1.04734433, "epoch": 0.07713813317300466, "flos": 18999799257600.0, "grad_norm": 2.1621741440844873, "language_loss": 0.84758776, "learning_rate": 3.976737418846713e-06, "loss": 0.87583089, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.38452148, "step": 1283, "time_per_iteration": 2.8886008262634277 }, { "auxiliary_loss_clip": 0.01754871, "auxiliary_loss_mlp": 0.01075871, "balance_loss_clip": 1.47956944, "balance_loss_mlp": 1.03710413, "epoch": 0.07719825642567263, "flos": 18123069905280.0, "grad_norm": 2.215603331025433, "language_loss": 0.77059877, "learning_rate": 3.976678153357181e-06, "loss": 0.79890621, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.38793945, "step": 1284, "time_per_iteration": 2.924194574356079 }, { "auxiliary_loss_clip": 0.01756304, "auxiliary_loss_mlp": 0.01072524, "balance_loss_clip": 1.48091066, "balance_loss_mlp": 1.03425789, "epoch": 0.0772583796783406, "flos": 42209758154880.0, "grad_norm": 2.898738287219596, "language_loss": 0.77238631, "learning_rate": 3.976618812911817e-06, "loss": 0.80067456, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.3828125, "step": 1285, "time_per_iteration": 3.0994951725006104 }, { "auxiliary_loss_clip": 0.01747339, "auxiliary_loss_mlp": 0.01068938, "balance_loss_clip": 1.47416735, "balance_loss_mlp": 1.03029013, "epoch": 0.07731850293100857, "flos": 24764357888640.0, "grad_norm": 1.951009346643708, "language_loss": 0.85257119, "learning_rate": 3.9765593975128685e-06, "loss": 0.88073397, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.38671875, "step": 1286, "time_per_iteration": 2.925128698348999 }, { "auxiliary_loss_clip": 0.0178301, "auxiliary_loss_mlp": 0.01066222, "balance_loss_clip": 1.49531364, "balance_loss_mlp": 1.02797949, "epoch": 0.07737862618367654, "flos": 17574655956480.0, "grad_norm": 2.3008927117564912, "language_loss": 0.79816276, "learning_rate": 3.97649990716259e-06, "loss": 0.82665509, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.38232422, "step": 1287, "time_per_iteration": 2.9392285346984863 }, { "auxiliary_loss_clip": 0.01755652, "auxiliary_loss_mlp": 0.0106919, "balance_loss_clip": 1.48160517, "balance_loss_mlp": 1.03070974, "epoch": 0.0774387494363445, "flos": 25637467656960.0, "grad_norm": 1.5895200849943512, "language_loss": 0.85740888, "learning_rate": 3.976440341863237e-06, "loss": 0.88565731, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.38476562, "step": 1288, "time_per_iteration": 2.9329679012298584 }, { "auxiliary_loss_clip": 0.01807792, "auxiliary_loss_mlp": 0.01070105, "balance_loss_clip": 1.51997399, "balance_loss_mlp": 1.02995539, "epoch": 0.07749887268901248, "flos": 12247489238400.0, "grad_norm": 2.1979636540319674, "language_loss": 0.88373482, "learning_rate": 3.976380701617068e-06, "loss": 0.91251385, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.40185547, "step": 1289, "time_per_iteration": 2.9131546020507812 }, { "auxiliary_loss_clip": 0.01794165, "auxiliary_loss_mlp": 0.01062036, "balance_loss_clip": 1.51546896, "balance_loss_mlp": 1.02336454, "epoch": 0.07755899594168045, "flos": 25092401823360.0, "grad_norm": 1.608742435670239, "language_loss": 0.86622036, "learning_rate": 3.976320986426344e-06, "loss": 0.8947823, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.38671875, "step": 1290, "time_per_iteration": 3.034775972366333 }, { "auxiliary_loss_clip": 0.01793012, "auxiliary_loss_mlp": 0.01079741, "balance_loss_clip": 1.51886368, "balance_loss_mlp": 1.04004455, "epoch": 0.07761911919434841, "flos": 14254509657600.0, "grad_norm": 2.004659572921241, "language_loss": 0.93054682, "learning_rate": 3.9762611962933315e-06, "loss": 0.95927441, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.39672852, "step": 1291, "time_per_iteration": 2.8811709880828857 }, { "auxiliary_loss_clip": 0.01611301, "auxiliary_loss_mlp": 0.01080905, "balance_loss_clip": 1.43155122, "balance_loss_mlp": 1.05744457, "epoch": 0.07767924244701638, "flos": 67272017662080.0, "grad_norm": 0.9301485934613928, "language_loss": 0.65147805, "learning_rate": 3.9762013312202955e-06, "loss": 0.67840004, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.234375, "step": 1292, "time_per_iteration": 3.4918689727783203 }, { "auxiliary_loss_clip": 0.01804499, "auxiliary_loss_mlp": 0.01059589, "balance_loss_clip": 1.52583015, "balance_loss_mlp": 1.0242312, "epoch": 0.07773936569968436, "flos": 28562915151360.0, "grad_norm": 1.8499176256455119, "language_loss": 0.89070344, "learning_rate": 3.9761413912095075e-06, "loss": 0.91934431, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.35351562, "step": 1293, "time_per_iteration": 2.977987289428711 }, { "auxiliary_loss_clip": 0.01828741, "auxiliary_loss_mlp": 0.01067464, "balance_loss_clip": 1.54834819, "balance_loss_mlp": 1.02862573, "epoch": 0.07779948895235232, "flos": 27501676928640.0, "grad_norm": 1.9703196581961127, "language_loss": 0.85988283, "learning_rate": 3.976081376263239e-06, "loss": 0.88884485, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.38867188, "step": 1294, "time_per_iteration": 3.0797393321990967 }, { "auxiliary_loss_clip": 0.01870879, "auxiliary_loss_mlp": 0.01065827, "balance_loss_clip": 1.57699037, "balance_loss_mlp": 1.02608252, "epoch": 0.07785961220502029, "flos": 18232417883520.0, "grad_norm": 2.1861493025714136, "language_loss": 0.82153273, "learning_rate": 3.976021286383768e-06, "loss": 0.85089982, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 2.94140625, "router_z_loss_mlp": 0.3972168, "step": 1295, "time_per_iteration": 2.927900552749634 }, { "auxiliary_loss_clip": 0.01852767, "auxiliary_loss_mlp": 0.0107366, "balance_loss_clip": 1.56830549, "balance_loss_mlp": 1.0308876, "epoch": 0.07791973545768827, "flos": 24618967787520.0, "grad_norm": 1.988307150976017, "language_loss": 0.89092273, "learning_rate": 3.975961121573371e-06, "loss": 0.920187, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.42773438, "step": 1296, "time_per_iteration": 2.9144833087921143 }, { "auxiliary_loss_clip": 0.01862245, "auxiliary_loss_mlp": 0.01073393, "balance_loss_clip": 1.57390916, "balance_loss_mlp": 1.03493595, "epoch": 0.07797985871035623, "flos": 14289963598080.0, "grad_norm": 2.438153300308794, "language_loss": 0.98303407, "learning_rate": 3.9759008818343305e-06, "loss": 1.01239049, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.38452148, "step": 1297, "time_per_iteration": 2.884141445159912 }, { "auxiliary_loss_clip": 0.01848178, "auxiliary_loss_mlp": 0.01076101, "balance_loss_clip": 1.55788994, "balance_loss_mlp": 1.04153049, "epoch": 0.0780399819630242, "flos": 26620558830720.0, "grad_norm": 2.122082091738837, "language_loss": 0.77846909, "learning_rate": 3.97584056716893e-06, "loss": 0.80771184, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.34570312, "step": 1298, "time_per_iteration": 2.965386390686035 }, { "auxiliary_loss_clip": 0.01836773, "auxiliary_loss_mlp": 0.01084868, "balance_loss_clip": 1.55378401, "balance_loss_mlp": 1.04886699, "epoch": 0.07810010521569218, "flos": 21844339770240.0, "grad_norm": 1.5520113538525946, "language_loss": 0.82647514, "learning_rate": 3.9757801775794575e-06, "loss": 0.85569155, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 2.83007812, "router_z_loss_mlp": 0.35986328, "step": 1299, "time_per_iteration": 3.009385824203491 }, { "auxiliary_loss_clip": 0.01838558, "auxiliary_loss_mlp": 0.01086957, "balance_loss_clip": 1.56167519, "balance_loss_mlp": 1.04537666, "epoch": 0.07816022846836014, "flos": 25091768396160.0, "grad_norm": 1.7672952857701527, "language_loss": 0.88196242, "learning_rate": 3.975719713068202e-06, "loss": 0.91121763, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.41577148, "step": 1300, "time_per_iteration": 3.0674591064453125 }, { "auxiliary_loss_clip": 0.01844526, "auxiliary_loss_mlp": 0.01088812, "balance_loss_clip": 1.55501914, "balance_loss_mlp": 1.05056965, "epoch": 0.0782203517210281, "flos": 40932538663680.0, "grad_norm": 1.7634050669601604, "language_loss": 0.73660433, "learning_rate": 3.975659173637458e-06, "loss": 0.76593769, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 2.89453125, "router_z_loss_mlp": 0.38232422, "step": 1301, "time_per_iteration": 3.1398513317108154 }, { "auxiliary_loss_clip": 0.01852726, "auxiliary_loss_mlp": 0.01120414, "balance_loss_clip": 1.56486654, "balance_loss_mlp": 1.08221889, "epoch": 0.07828047497369607, "flos": 41186869539840.0, "grad_norm": 1.5190403635096623, "language_loss": 0.72084296, "learning_rate": 3.97559855928952e-06, "loss": 0.75057429, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 2.87890625, "router_z_loss_mlp": 0.38232422, "step": 1302, "time_per_iteration": 3.193338632583618 }, { "auxiliary_loss_clip": 0.01838205, "auxiliary_loss_mlp": 0.01100887, "balance_loss_clip": 1.55639303, "balance_loss_mlp": 1.06374145, "epoch": 0.07834059822636405, "flos": 23517162961920.0, "grad_norm": 2.1747333138552456, "language_loss": 0.83594853, "learning_rate": 3.9755378700266864e-06, "loss": 0.8653394, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.37158203, "step": 1303, "time_per_iteration": 4.4202141761779785 }, { "auxiliary_loss_clip": 0.01840331, "auxiliary_loss_mlp": 0.01100586, "balance_loss_clip": 1.55767512, "balance_loss_mlp": 1.06365538, "epoch": 0.07840072147903202, "flos": 20203622403840.0, "grad_norm": 1.7329407836014536, "language_loss": 0.76395625, "learning_rate": 3.9754771058512585e-06, "loss": 0.79336542, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.36962891, "step": 1304, "time_per_iteration": 3.007347345352173 }, { "auxiliary_loss_clip": 0.01830605, "auxiliary_loss_mlp": 0.01101989, "balance_loss_clip": 1.5467788, "balance_loss_mlp": 1.06706071, "epoch": 0.07846084473169998, "flos": 21370679510400.0, "grad_norm": 1.729353113173296, "language_loss": 0.77417636, "learning_rate": 3.975416266765542e-06, "loss": 0.80350232, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.34936523, "step": 1305, "time_per_iteration": 2.9907658100128174 }, { "auxiliary_loss_clip": 0.01822183, "auxiliary_loss_mlp": 0.01115397, "balance_loss_clip": 1.53483343, "balance_loss_mlp": 1.07739258, "epoch": 0.07852096798436796, "flos": 25421938836480.0, "grad_norm": 2.0256731478959606, "language_loss": 0.86735284, "learning_rate": 3.975355352771841e-06, "loss": 0.89672863, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 2.87695312, "router_z_loss_mlp": 0.38012695, "step": 1306, "time_per_iteration": 3.035352945327759 }, { "auxiliary_loss_clip": 0.01814415, "auxiliary_loss_mlp": 0.0110271, "balance_loss_clip": 1.53002036, "balance_loss_mlp": 1.06506407, "epoch": 0.07858109123703592, "flos": 24582156503040.0, "grad_norm": 3.2890982257210255, "language_loss": 0.91390014, "learning_rate": 3.975294363872468e-06, "loss": 0.94307142, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.3762207, "step": 1307, "time_per_iteration": 2.9242122173309326 }, { "auxiliary_loss_clip": 0.01828437, "auxiliary_loss_mlp": 0.01091755, "balance_loss_clip": 1.54675901, "balance_loss_mlp": 1.05506253, "epoch": 0.07864121448970389, "flos": 20707216738560.0, "grad_norm": 6.61180532298452, "language_loss": 0.83968723, "learning_rate": 3.975233300069735e-06, "loss": 0.86888915, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.3671875, "step": 1308, "time_per_iteration": 4.563499927520752 }, { "auxiliary_loss_clip": 0.01808705, "auxiliary_loss_mlp": 0.01093618, "balance_loss_clip": 1.53403997, "balance_loss_mlp": 1.05885673, "epoch": 0.07870133774237187, "flos": 22976893077120.0, "grad_norm": 1.4215884598710988, "language_loss": 0.78695154, "learning_rate": 3.975172161365958e-06, "loss": 0.81597477, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.34765625, "step": 1309, "time_per_iteration": 5.931636095046997 }, { "auxiliary_loss_clip": 0.01828399, "auxiliary_loss_mlp": 0.01096926, "balance_loss_clip": 1.54174781, "balance_loss_mlp": 1.06049585, "epoch": 0.07876146099503983, "flos": 18851920692480.0, "grad_norm": 1.9012223399748578, "language_loss": 0.81805426, "learning_rate": 3.975110947763453e-06, "loss": 0.8473075, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 2.86523438, "router_z_loss_mlp": 0.36425781, "step": 1310, "time_per_iteration": 3.044184684753418 }, { "auxiliary_loss_clip": 0.01795535, "auxiliary_loss_mlp": 0.01078981, "balance_loss_clip": 1.52653968, "balance_loss_mlp": 1.04264569, "epoch": 0.0788215842477078, "flos": 23816403941760.0, "grad_norm": 1.7436961302633238, "language_loss": 0.7428003, "learning_rate": 3.9750496592645435e-06, "loss": 0.77154547, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.36328125, "step": 1311, "time_per_iteration": 2.978878974914551 }, { "auxiliary_loss_clip": 0.01815468, "auxiliary_loss_mlp": 0.01078543, "balance_loss_clip": 1.54011953, "balance_loss_mlp": 1.04320908, "epoch": 0.07888170750037576, "flos": 21589873159680.0, "grad_norm": 1.7447601827914365, "language_loss": 0.8759079, "learning_rate": 3.974988295871553e-06, "loss": 0.90484798, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.35327148, "step": 1312, "time_per_iteration": 2.9277594089508057 }, { "auxiliary_loss_clip": 0.01803052, "auxiliary_loss_mlp": 0.01072181, "balance_loss_clip": 1.53015387, "balance_loss_mlp": 1.03515506, "epoch": 0.07894183075304374, "flos": 19874311614720.0, "grad_norm": 1.6019000632817448, "language_loss": 0.83263129, "learning_rate": 3.9749268575868085e-06, "loss": 0.86138368, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.37036133, "step": 1313, "time_per_iteration": 2.9565110206604004 }, { "auxiliary_loss_clip": 0.018455, "auxiliary_loss_mlp": 0.0107586, "balance_loss_clip": 1.55665481, "balance_loss_mlp": 1.03511381, "epoch": 0.07900195400571171, "flos": 16152001119360.0, "grad_norm": 2.5188212163567454, "language_loss": 0.75324893, "learning_rate": 3.97486534441264e-06, "loss": 0.78246248, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 2.88867188, "router_z_loss_mlp": 0.4074707, "step": 1314, "time_per_iteration": 2.9270973205566406 }, { "auxiliary_loss_clip": 0.01820626, "auxiliary_loss_mlp": 0.01066646, "balance_loss_clip": 1.54203343, "balance_loss_mlp": 1.03069305, "epoch": 0.07906207725837967, "flos": 23740383398400.0, "grad_norm": 1.4776185339329515, "language_loss": 0.80779386, "learning_rate": 3.974803756351379e-06, "loss": 0.83666658, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.35986328, "step": 1315, "time_per_iteration": 3.0160586833953857 }, { "auxiliary_loss_clip": 0.01834576, "auxiliary_loss_mlp": 0.01068368, "balance_loss_clip": 1.55009842, "balance_loss_mlp": 1.0306741, "epoch": 0.07912220051104765, "flos": 24326468282880.0, "grad_norm": 1.7272118719863356, "language_loss": 0.74823552, "learning_rate": 3.974742093405362e-06, "loss": 0.77726495, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.37670898, "step": 1316, "time_per_iteration": 2.9685840606689453 }, { "auxiliary_loss_clip": 0.01850993, "auxiliary_loss_mlp": 0.01071581, "balance_loss_clip": 1.55968845, "balance_loss_mlp": 1.03472137, "epoch": 0.07918232376371562, "flos": 18889320159360.0, "grad_norm": 3.380070988256466, "language_loss": 0.66482151, "learning_rate": 3.974680355576927e-06, "loss": 0.69404721, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 2.9140625, "router_z_loss_mlp": 0.36889648, "step": 1317, "time_per_iteration": 2.975184440612793 }, { "auxiliary_loss_clip": 0.01868064, "auxiliary_loss_mlp": 0.01080751, "balance_loss_clip": 1.57240129, "balance_loss_mlp": 1.04262805, "epoch": 0.07924244701638358, "flos": 27386492371200.0, "grad_norm": 2.274152817975685, "language_loss": 0.76550329, "learning_rate": 3.974618542868415e-06, "loss": 0.79499149, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 2.95703125, "router_z_loss_mlp": 0.38110352, "step": 1318, "time_per_iteration": 3.042052984237671 }, { "auxiliary_loss_clip": 0.01816779, "auxiliary_loss_mlp": 0.01064739, "balance_loss_clip": 1.5398705, "balance_loss_mlp": 1.02866662, "epoch": 0.07930257026905156, "flos": 25131565837440.0, "grad_norm": 2.088255838787028, "language_loss": 0.91273797, "learning_rate": 3.97455665528217e-06, "loss": 0.94155312, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.36108398, "step": 1319, "time_per_iteration": 3.022975206375122 }, { "auxiliary_loss_clip": 0.01832011, "auxiliary_loss_mlp": 0.01072378, "balance_loss_clip": 1.55288577, "balance_loss_mlp": 1.03492236, "epoch": 0.07936269352171953, "flos": 21844294525440.0, "grad_norm": 2.0802619639306927, "language_loss": 0.81525147, "learning_rate": 3.974494692820539e-06, "loss": 0.84429538, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.37451172, "step": 1320, "time_per_iteration": 3.0054545402526855 }, { "auxiliary_loss_clip": 0.0184322, "auxiliary_loss_mlp": 0.01073694, "balance_loss_clip": 1.55642295, "balance_loss_mlp": 1.03447437, "epoch": 0.07942281677438749, "flos": 16947009083520.0, "grad_norm": 2.160568726827266, "language_loss": 0.70156789, "learning_rate": 3.974432655485872e-06, "loss": 0.73073703, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 2.86914062, "router_z_loss_mlp": 0.39208984, "step": 1321, "time_per_iteration": 2.9728448390960693 }, { "auxiliary_loss_clip": 0.01807986, "auxiliary_loss_mlp": 0.01072265, "balance_loss_clip": 1.53165674, "balance_loss_mlp": 1.03416538, "epoch": 0.07948294002705546, "flos": 18995591491200.0, "grad_norm": 6.318990862507566, "language_loss": 0.85557503, "learning_rate": 3.9743705432805195e-06, "loss": 0.88437754, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 2.76367188, "router_z_loss_mlp": 0.38110352, "step": 1322, "time_per_iteration": 3.0972354412078857 }, { "auxiliary_loss_clip": 0.01813729, "auxiliary_loss_mlp": 0.01065655, "balance_loss_clip": 1.53501034, "balance_loss_mlp": 1.02867591, "epoch": 0.07954306327972344, "flos": 21663586218240.0, "grad_norm": 1.996142720968327, "language_loss": 0.91959667, "learning_rate": 3.974308356206838e-06, "loss": 0.94839048, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.36987305, "step": 1323, "time_per_iteration": 3.0631399154663086 }, { "auxiliary_loss_clip": 0.01809874, "auxiliary_loss_mlp": 0.01069558, "balance_loss_clip": 1.53496861, "balance_loss_mlp": 1.03195977, "epoch": 0.0796031865323914, "flos": 23230273812480.0, "grad_norm": 1.952184094605342, "language_loss": 0.83554459, "learning_rate": 3.974246094267187e-06, "loss": 0.86433887, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.37597656, "step": 1324, "time_per_iteration": 2.945810079574585 }, { "auxiliary_loss_clip": 0.01809817, "auxiliary_loss_mlp": 0.01067467, "balance_loss_clip": 1.53293335, "balance_loss_mlp": 1.02762771, "epoch": 0.07966330978505937, "flos": 23304891767040.0, "grad_norm": 3.7263146673144742, "language_loss": 0.81387931, "learning_rate": 3.974183757463925e-06, "loss": 0.84265214, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.3984375, "step": 1325, "time_per_iteration": 2.9765818119049072 }, { "auxiliary_loss_clip": 0.01809257, "auxiliary_loss_mlp": 0.01073367, "balance_loss_clip": 1.53311133, "balance_loss_mlp": 1.0322876, "epoch": 0.07972343303772735, "flos": 18371518957440.0, "grad_norm": 2.6816626711017433, "language_loss": 0.89573443, "learning_rate": 3.974121345799418e-06, "loss": 0.92456067, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 2.76171875, "router_z_loss_mlp": 0.41113281, "step": 1326, "time_per_iteration": 2.8975560665130615 }, { "auxiliary_loss_clip": 0.01813299, "auxiliary_loss_mlp": 0.01074996, "balance_loss_clip": 1.53555131, "balance_loss_mlp": 1.03494191, "epoch": 0.07978355629039531, "flos": 21772572238080.0, "grad_norm": 1.704023197078873, "language_loss": 0.83882892, "learning_rate": 3.974058859276032e-06, "loss": 0.86771184, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 2.77734375, "router_z_loss_mlp": 0.40063477, "step": 1327, "time_per_iteration": 2.9447286128997803 }, { "auxiliary_loss_clip": 0.01826141, "auxiliary_loss_mlp": 0.01062367, "balance_loss_clip": 1.53515124, "balance_loss_mlp": 1.02367198, "epoch": 0.07984367954306328, "flos": 18560416573440.0, "grad_norm": 2.348848492272165, "language_loss": 0.81226152, "learning_rate": 3.9739962978961354e-06, "loss": 0.84114659, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.38671875, "step": 1328, "time_per_iteration": 2.9278526306152344 }, { "auxiliary_loss_clip": 0.01829402, "auxiliary_loss_mlp": 0.0106716, "balance_loss_clip": 1.54503036, "balance_loss_mlp": 1.02791643, "epoch": 0.07990380279573125, "flos": 16911328919040.0, "grad_norm": 2.4852079952311295, "language_loss": 0.75839508, "learning_rate": 3.973933661662101e-06, "loss": 0.78736067, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.39233398, "step": 1329, "time_per_iteration": 2.9189910888671875 }, { "auxiliary_loss_clip": 0.01809689, "auxiliary_loss_mlp": 0.01060681, "balance_loss_clip": 1.53234291, "balance_loss_mlp": 1.02348745, "epoch": 0.07996392604839922, "flos": 24109220160000.0, "grad_norm": 1.5476744365566868, "language_loss": 0.82471859, "learning_rate": 3.973870950576305e-06, "loss": 0.85342228, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.37182617, "step": 1330, "time_per_iteration": 3.0812740325927734 }, { "auxiliary_loss_clip": 0.01817732, "auxiliary_loss_mlp": 0.01063353, "balance_loss_clip": 1.53360605, "balance_loss_mlp": 1.02680373, "epoch": 0.08002404930106718, "flos": 14285755831680.0, "grad_norm": 1.744532148731593, "language_loss": 0.89795971, "learning_rate": 3.9738081646411255e-06, "loss": 0.92677051, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.36572266, "step": 1331, "time_per_iteration": 2.8350930213928223 }, { "auxiliary_loss_clip": 0.01866023, "auxiliary_loss_mlp": 0.01066328, "balance_loss_clip": 1.57110143, "balance_loss_mlp": 1.02779925, "epoch": 0.08008417255373516, "flos": 40420981244160.0, "grad_norm": 1.8018324982293628, "language_loss": 0.74494582, "learning_rate": 3.973745303858942e-06, "loss": 0.77426928, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 2.94921875, "router_z_loss_mlp": 0.38549805, "step": 1332, "time_per_iteration": 3.0982115268707275 }, { "auxiliary_loss_clip": 0.01831914, "auxiliary_loss_mlp": 0.01067039, "balance_loss_clip": 1.55305088, "balance_loss_mlp": 1.0297029, "epoch": 0.08014429580640313, "flos": 18488151348480.0, "grad_norm": 1.676917693243041, "language_loss": 0.83455777, "learning_rate": 3.973682368232138e-06, "loss": 0.86354727, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 2.78515625, "router_z_loss_mlp": 0.37353516, "step": 1333, "time_per_iteration": 2.974811315536499 }, { "auxiliary_loss_clip": 0.01832487, "auxiliary_loss_mlp": 0.01069712, "balance_loss_clip": 1.54960668, "balance_loss_mlp": 1.0328052, "epoch": 0.0802044190590711, "flos": 22063352440320.0, "grad_norm": 2.05187930577218, "language_loss": 0.76293468, "learning_rate": 3.9736193577631015e-06, "loss": 0.79195672, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.36889648, "step": 1334, "time_per_iteration": 2.9213056564331055 }, { "auxiliary_loss_clip": 0.01838097, "auxiliary_loss_mlp": 0.01064677, "balance_loss_clip": 1.55713403, "balance_loss_mlp": 1.02707887, "epoch": 0.08026454231173906, "flos": 24582970909440.0, "grad_norm": 1.8338618379950018, "language_loss": 0.81460565, "learning_rate": 3.973556272454221e-06, "loss": 0.84363341, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.37573242, "step": 1335, "time_per_iteration": 2.9960389137268066 }, { "auxiliary_loss_clip": 0.01596772, "auxiliary_loss_mlp": 0.01080933, "balance_loss_clip": 1.41260099, "balance_loss_mlp": 1.0532763, "epoch": 0.08032466556440704, "flos": 52606475827200.0, "grad_norm": 0.7541536853144227, "language_loss": 0.56093943, "learning_rate": 3.973493112307889e-06, "loss": 0.58771646, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.27734375, "step": 1336, "time_per_iteration": 3.4885671138763428 }, { "auxiliary_loss_clip": 0.01833512, "auxiliary_loss_mlp": 0.0106685, "balance_loss_clip": 1.55233312, "balance_loss_mlp": 1.03058612, "epoch": 0.080384788817075, "flos": 23853305715840.0, "grad_norm": 1.915288157270104, "language_loss": 0.6937685, "learning_rate": 3.9734298773265005e-06, "loss": 0.72277212, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 2.81054688, "router_z_loss_mlp": 0.36254883, "step": 1337, "time_per_iteration": 3.038100004196167 }, { "auxiliary_loss_clip": 0.01815339, "auxiliary_loss_mlp": 0.01072931, "balance_loss_clip": 1.53570533, "balance_loss_mlp": 1.03361607, "epoch": 0.08044491206974297, "flos": 25311459738240.0, "grad_norm": 2.0108154169936183, "language_loss": 0.88440186, "learning_rate": 3.973366567512453e-06, "loss": 0.91328454, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.39282227, "step": 1338, "time_per_iteration": 4.519045829772949 }, { "auxiliary_loss_clip": 0.01834899, "auxiliary_loss_mlp": 0.01072904, "balance_loss_clip": 1.54483795, "balance_loss_mlp": 1.03687918, "epoch": 0.08050503532241095, "flos": 22385152592640.0, "grad_norm": 2.0399270521239314, "language_loss": 0.88304138, "learning_rate": 3.973303182868147e-06, "loss": 0.91211939, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 2.90039062, "router_z_loss_mlp": 0.3605957, "step": 1339, "time_per_iteration": 2.953521251678467 }, { "auxiliary_loss_clip": 0.01807168, "auxiliary_loss_mlp": 0.01063051, "balance_loss_clip": 1.5333786, "balance_loss_mlp": 1.02893305, "epoch": 0.08056515857507891, "flos": 18378622391040.0, "grad_norm": 1.9466064579878857, "language_loss": 0.90142989, "learning_rate": 3.973239723395988e-06, "loss": 0.93013203, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.34155273, "step": 1340, "time_per_iteration": 2.8782846927642822 }, { "auxiliary_loss_clip": 0.01550971, "auxiliary_loss_mlp": 0.01030677, "balance_loss_clip": 1.37379944, "balance_loss_mlp": 1.00473714, "epoch": 0.08062528182774688, "flos": 51374211684480.0, "grad_norm": 0.8885486945481568, "language_loss": 0.64754367, "learning_rate": 3.97317618909838e-06, "loss": 0.67336023, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.25976562, "step": 1341, "time_per_iteration": 3.3494129180908203 }, { "auxiliary_loss_clip": 0.01848257, "auxiliary_loss_mlp": 0.01079674, "balance_loss_clip": 1.55610335, "balance_loss_mlp": 1.04138398, "epoch": 0.08068540508041486, "flos": 17607802412160.0, "grad_norm": 1.8772685402554194, "language_loss": 0.90250564, "learning_rate": 3.973112579977733e-06, "loss": 0.93178499, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 2.92382812, "router_z_loss_mlp": 0.38305664, "step": 1342, "time_per_iteration": 4.302040100097656 }, { "auxiliary_loss_clip": 0.01827484, "auxiliary_loss_mlp": 0.01073211, "balance_loss_clip": 1.54308319, "balance_loss_mlp": 1.03210771, "epoch": 0.08074552833308282, "flos": 10568286529920.0, "grad_norm": 2.3416060945046615, "language_loss": 0.77433646, "learning_rate": 3.973048896036459e-06, "loss": 0.80334336, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.41064453, "step": 1343, "time_per_iteration": 2.866652250289917 }, { "auxiliary_loss_clip": 0.01515219, "auxiliary_loss_mlp": 0.01064404, "balance_loss_clip": 1.33694005, "balance_loss_mlp": 1.03751075, "epoch": 0.08080565158575079, "flos": 60870310750080.0, "grad_norm": 0.8203614794927442, "language_loss": 0.57650089, "learning_rate": 3.972985137276974e-06, "loss": 0.60229707, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.26953125, "step": 1344, "time_per_iteration": 6.151159048080444 }, { "auxiliary_loss_clip": 0.01828071, "auxiliary_loss_mlp": 0.01065221, "balance_loss_clip": 1.53947306, "balance_loss_mlp": 1.02900493, "epoch": 0.08086577483841875, "flos": 18341403903360.0, "grad_norm": 2.6421300002750074, "language_loss": 0.87862444, "learning_rate": 3.972921303701695e-06, "loss": 0.90755737, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 2.890625, "router_z_loss_mlp": 0.36206055, "step": 1345, "time_per_iteration": 2.922560930252075 }, { "auxiliary_loss_clip": 0.01797414, "auxiliary_loss_mlp": 0.01068122, "balance_loss_clip": 1.51893413, "balance_loss_mlp": 1.03269327, "epoch": 0.08092589809108673, "flos": 21553604812800.0, "grad_norm": 1.6174773876039161, "language_loss": 0.89026421, "learning_rate": 3.972857395313042e-06, "loss": 0.91891956, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.35424805, "step": 1346, "time_per_iteration": 2.9248733520507812 }, { "auxiliary_loss_clip": 0.01786787, "auxiliary_loss_mlp": 0.01079401, "balance_loss_clip": 1.5100323, "balance_loss_mlp": 1.04013336, "epoch": 0.0809860213437547, "flos": 22138332353280.0, "grad_norm": 1.5649249669384577, "language_loss": 0.94058985, "learning_rate": 3.972793412113439e-06, "loss": 0.96925175, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.39282227, "step": 1347, "time_per_iteration": 2.885265588760376 }, { "auxiliary_loss_clip": 0.01781129, "auxiliary_loss_mlp": 0.01077046, "balance_loss_clip": 1.50633836, "balance_loss_mlp": 1.03939986, "epoch": 0.08104614459642266, "flos": 21735398995200.0, "grad_norm": 1.724542344059585, "language_loss": 0.90338033, "learning_rate": 3.972729354105312e-06, "loss": 0.93196201, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.37646484, "step": 1348, "time_per_iteration": 3.044618606567383 }, { "auxiliary_loss_clip": 0.0176168, "auxiliary_loss_mlp": 0.01075224, "balance_loss_clip": 1.48762763, "balance_loss_mlp": 1.0381496, "epoch": 0.08110626784909064, "flos": 23962563204480.0, "grad_norm": 1.5682890232437847, "language_loss": 0.77514565, "learning_rate": 3.97266522129109e-06, "loss": 0.80351472, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 2.7421875, "router_z_loss_mlp": 0.37109375, "step": 1349, "time_per_iteration": 3.0364632606506348 }, { "auxiliary_loss_clip": 0.01791197, "auxiliary_loss_mlp": 0.01079724, "balance_loss_clip": 1.50715661, "balance_loss_mlp": 1.04279256, "epoch": 0.0811663911017586, "flos": 19034936484480.0, "grad_norm": 1.8342501759779175, "language_loss": 0.90341944, "learning_rate": 3.972601013673205e-06, "loss": 0.93212867, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 0.36938477, "step": 1350, "time_per_iteration": 2.8622543811798096 }, { "auxiliary_loss_clip": 0.01762367, "auxiliary_loss_mlp": 0.01072814, "balance_loss_clip": 1.48724651, "balance_loss_mlp": 1.03571606, "epoch": 0.08122651435442657, "flos": 15349753987200.0, "grad_norm": 1.9673824673969504, "language_loss": 0.83422673, "learning_rate": 3.972536731254092e-06, "loss": 0.86257851, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.37133789, "step": 1351, "time_per_iteration": 2.8935141563415527 }, { "auxiliary_loss_clip": 0.01763052, "auxiliary_loss_mlp": 0.01075159, "balance_loss_clip": 1.48243308, "balance_loss_mlp": 1.03565347, "epoch": 0.08128663760709455, "flos": 23231902625280.0, "grad_norm": 1.7312084793301712, "language_loss": 0.76381403, "learning_rate": 3.972472374036189e-06, "loss": 0.79219615, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 2.8046875, "router_z_loss_mlp": 0.39526367, "step": 1352, "time_per_iteration": 2.8623712062835693 }, { "auxiliary_loss_clip": 0.01781567, "auxiliary_loss_mlp": 0.01081858, "balance_loss_clip": 1.49975169, "balance_loss_mlp": 1.04130328, "epoch": 0.08134676085976252, "flos": 22975761957120.0, "grad_norm": 1.8115788041420133, "language_loss": 0.84688938, "learning_rate": 3.972407942021935e-06, "loss": 0.87552357, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.40551758, "step": 1353, "time_per_iteration": 2.95131516456604 }, { "auxiliary_loss_clip": 0.01450852, "auxiliary_loss_mlp": 0.01142826, "balance_loss_clip": 1.27231765, "balance_loss_mlp": 1.03219974, "epoch": 0.08140688411243048, "flos": 64347683304960.0, "grad_norm": 0.8724739369772011, "language_loss": 0.59869313, "learning_rate": 3.972343435213775e-06, "loss": 0.62462991, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 1.109375, "step": 1354, "time_per_iteration": 3.3701465129852295 }, { "auxiliary_loss_clip": 0.01750776, "auxiliary_loss_mlp": 0.01068408, "balance_loss_clip": 1.47799754, "balance_loss_mlp": 1.03009415, "epoch": 0.08146700736509845, "flos": 22502011207680.0, "grad_norm": 1.6148054200530948, "language_loss": 0.84205008, "learning_rate": 3.972278853614154e-06, "loss": 0.87024194, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 2.72460938, "router_z_loss_mlp": 0.3828125, "step": 1355, "time_per_iteration": 2.9188740253448486 }, { "auxiliary_loss_clip": 0.01747928, "auxiliary_loss_mlp": 0.01066937, "balance_loss_clip": 1.4762485, "balance_loss_mlp": 1.02888584, "epoch": 0.08152713061776642, "flos": 20457229363200.0, "grad_norm": 1.9619286660003301, "language_loss": 0.73109454, "learning_rate": 3.972214197225521e-06, "loss": 0.75924325, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.38037109, "step": 1356, "time_per_iteration": 2.938732624053955 }, { "auxiliary_loss_clip": 0.01764057, "auxiliary_loss_mlp": 0.01063197, "balance_loss_clip": 1.48111618, "balance_loss_mlp": 1.02524078, "epoch": 0.08158725387043439, "flos": 23560715721600.0, "grad_norm": 2.0536022232714215, "language_loss": 0.72369069, "learning_rate": 3.972149466050329e-06, "loss": 0.75196326, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 2.83203125, "router_z_loss_mlp": 0.37963867, "step": 1357, "time_per_iteration": 2.907543659210205 }, { "auxiliary_loss_clip": 0.01770899, "auxiliary_loss_mlp": 0.01072242, "balance_loss_clip": 1.487957, "balance_loss_mlp": 1.03492987, "epoch": 0.08164737712310235, "flos": 22027310317440.0, "grad_norm": 2.3552324530617623, "language_loss": 0.85755682, "learning_rate": 3.97208466009103e-06, "loss": 0.88598818, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.37329102, "step": 1358, "time_per_iteration": 2.989595651626587 }, { "auxiliary_loss_clip": 0.01766915, "auxiliary_loss_mlp": 0.0107078, "balance_loss_clip": 1.48600328, "balance_loss_mlp": 1.0331099, "epoch": 0.08170750037577033, "flos": 23378288112000.0, "grad_norm": 1.7158010584416945, "language_loss": 1.03114879, "learning_rate": 3.972019779350084e-06, "loss": 1.05952573, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.37646484, "step": 1359, "time_per_iteration": 2.9303019046783447 }, { "auxiliary_loss_clip": 0.0178615, "auxiliary_loss_mlp": 0.01076286, "balance_loss_clip": 1.49834001, "balance_loss_mlp": 1.03914046, "epoch": 0.0817676236284383, "flos": 28408476090240.0, "grad_norm": 2.1175514656720993, "language_loss": 0.84717053, "learning_rate": 3.971954823829951e-06, "loss": 0.87579489, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 2.88085938, "router_z_loss_mlp": 0.37158203, "step": 1360, "time_per_iteration": 2.977163076400757 }, { "auxiliary_loss_clip": 0.01790942, "auxiliary_loss_mlp": 0.01084686, "balance_loss_clip": 1.50219238, "balance_loss_mlp": 1.04494166, "epoch": 0.08182774688110626, "flos": 19218404724480.0, "grad_norm": 1.9718761399000964, "language_loss": 0.73906976, "learning_rate": 3.971889793533093e-06, "loss": 0.76782608, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 2.88476562, "router_z_loss_mlp": 0.39746094, "step": 1361, "time_per_iteration": 2.9748871326446533 }, { "auxiliary_loss_clip": 0.01752836, "auxiliary_loss_mlp": 0.01088585, "balance_loss_clip": 1.47250462, "balance_loss_mlp": 1.05139184, "epoch": 0.08188787013377424, "flos": 22794420222720.0, "grad_norm": 1.9527998624621776, "language_loss": 0.78631604, "learning_rate": 3.971824688461976e-06, "loss": 0.81473023, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 2.80664062, "router_z_loss_mlp": 0.37207031, "step": 1362, "time_per_iteration": 2.9789035320281982 }, { "auxiliary_loss_clip": 0.01766387, "auxiliary_loss_mlp": 0.01075896, "balance_loss_clip": 1.48303092, "balance_loss_mlp": 1.03879786, "epoch": 0.08194799338644221, "flos": 16475746798080.0, "grad_norm": 2.0649014050132073, "language_loss": 0.74827838, "learning_rate": 3.971759508619069e-06, "loss": 0.77670121, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.37109375, "step": 1363, "time_per_iteration": 2.9582254886627197 }, { "auxiliary_loss_clip": 0.01772766, "auxiliary_loss_mlp": 0.01083466, "balance_loss_clip": 1.48922491, "balance_loss_mlp": 1.04438937, "epoch": 0.08200811663911017, "flos": 23923851638400.0, "grad_norm": 2.016765264038403, "language_loss": 0.79393047, "learning_rate": 3.971694254006844e-06, "loss": 0.82249284, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 2.8359375, "router_z_loss_mlp": 0.39038086, "step": 1364, "time_per_iteration": 2.923550844192505 }, { "auxiliary_loss_clip": 0.01770282, "auxiliary_loss_mlp": 0.01076344, "balance_loss_clip": 1.48800421, "balance_loss_mlp": 1.03879356, "epoch": 0.08206823989177814, "flos": 17906002761600.0, "grad_norm": 1.5447050351815912, "language_loss": 0.83267689, "learning_rate": 3.971628924627776e-06, "loss": 0.86114311, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 2.82226562, "router_z_loss_mlp": 0.37597656, "step": 1365, "time_per_iteration": 3.0659120082855225 }, { "auxiliary_loss_clip": 0.01747926, "auxiliary_loss_mlp": 0.01083482, "balance_loss_clip": 1.47391558, "balance_loss_mlp": 1.04941177, "epoch": 0.08212836314444612, "flos": 22097630016000.0, "grad_norm": 1.6112964124402194, "language_loss": 0.82764316, "learning_rate": 3.97156352048434e-06, "loss": 0.85595721, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.34057617, "step": 1366, "time_per_iteration": 3.0514895915985107 }, { "auxiliary_loss_clip": 0.01766795, "auxiliary_loss_mlp": 0.01090009, "balance_loss_clip": 1.48255849, "balance_loss_mlp": 1.05133736, "epoch": 0.08218848639711408, "flos": 17604997234560.0, "grad_norm": 1.632354579763871, "language_loss": 0.8303234, "learning_rate": 3.97149804157902e-06, "loss": 0.85889143, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 2.83984375, "router_z_loss_mlp": 0.38696289, "step": 1367, "time_per_iteration": 3.0351409912109375 }, { "auxiliary_loss_clip": 0.01794004, "auxiliary_loss_mlp": 0.01087776, "balance_loss_clip": 1.50257969, "balance_loss_mlp": 1.04993868, "epoch": 0.08224860964978205, "flos": 17866703013120.0, "grad_norm": 2.333098625595346, "language_loss": 0.85312641, "learning_rate": 3.9714324879142946e-06, "loss": 0.88194418, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 2.91601562, "router_z_loss_mlp": 0.37817383, "step": 1368, "time_per_iteration": 2.9744081497192383 }, { "auxiliary_loss_clip": 0.01744689, "auxiliary_loss_mlp": 0.01074284, "balance_loss_clip": 1.47836792, "balance_loss_mlp": 1.03899765, "epoch": 0.08230873290245003, "flos": 25238199127680.0, "grad_norm": 1.752088942775731, "language_loss": 0.82448447, "learning_rate": 3.971366859492653e-06, "loss": 0.85267419, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.35302734, "step": 1369, "time_per_iteration": 2.9599194526672363 }, { "auxiliary_loss_clip": 0.01770671, "auxiliary_loss_mlp": 0.0107247, "balance_loss_clip": 1.49050546, "balance_loss_mlp": 1.0364449, "epoch": 0.08236885615511799, "flos": 31772672841600.0, "grad_norm": 2.0336973536431557, "language_loss": 0.75926411, "learning_rate": 3.971301156316582e-06, "loss": 0.78769547, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.36010742, "step": 1370, "time_per_iteration": 2.989975690841675 }, { "auxiliary_loss_clip": 0.01768857, "auxiliary_loss_mlp": 0.01076746, "balance_loss_clip": 1.48650742, "balance_loss_mlp": 1.03905153, "epoch": 0.08242897940778596, "flos": 23196358195200.0, "grad_norm": 5.136535752383083, "language_loss": 0.7565378, "learning_rate": 3.971235378388573e-06, "loss": 0.78499377, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 2.82421875, "router_z_loss_mlp": 0.37695312, "step": 1371, "time_per_iteration": 2.9811534881591797 }, { "auxiliary_loss_clip": 0.01751356, "auxiliary_loss_mlp": 0.0105912, "balance_loss_clip": 1.47758842, "balance_loss_mlp": 1.02323794, "epoch": 0.08248910266045394, "flos": 34503657609600.0, "grad_norm": 1.8855956281567816, "language_loss": 0.72162026, "learning_rate": 3.971169525711122e-06, "loss": 0.74972498, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.35913086, "step": 1372, "time_per_iteration": 2.985748291015625 }, { "auxiliary_loss_clip": 0.01787272, "auxiliary_loss_mlp": 0.01065605, "balance_loss_clip": 1.50027013, "balance_loss_mlp": 1.02848351, "epoch": 0.0825492259131219, "flos": 13442172935040.0, "grad_norm": 2.416716014365081, "language_loss": 0.89157993, "learning_rate": 3.9711035982867246e-06, "loss": 0.92010874, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 2.8671875, "router_z_loss_mlp": 0.37109375, "step": 1373, "time_per_iteration": 4.2509777545928955 }, { "auxiliary_loss_clip": 0.01759165, "auxiliary_loss_mlp": 0.01062396, "balance_loss_clip": 1.47830153, "balance_loss_mlp": 1.02627516, "epoch": 0.08260934916578987, "flos": 25823560095360.0, "grad_norm": 1.8040252017321134, "language_loss": 0.8421368, "learning_rate": 3.971037596117882e-06, "loss": 0.87035245, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 2.8125, "router_z_loss_mlp": 0.36132812, "step": 1374, "time_per_iteration": 2.9278531074523926 }, { "auxiliary_loss_clip": 0.01465793, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.30388033, "balance_loss_mlp": 1.01594353, "epoch": 0.08266947241845783, "flos": 63488870893440.0, "grad_norm": 0.8241947101525418, "language_loss": 0.60740304, "learning_rate": 3.970971519207095e-06, "loss": 0.63236058, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.140625, "step": 1375, "time_per_iteration": 3.3244426250457764 }, { "auxiliary_loss_clip": 0.01461756, "auxiliary_loss_mlp": 0.01017944, "balance_loss_clip": 1.29773855, "balance_loss_mlp": 1.00316179, "epoch": 0.08272959567112581, "flos": 70024973420160.0, "grad_norm": 0.9067559758625536, "language_loss": 0.62412393, "learning_rate": 3.970905367556871e-06, "loss": 0.64892095, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.14746094, "step": 1376, "time_per_iteration": 3.228940963745117 }, { "auxiliary_loss_clip": 0.01774484, "auxiliary_loss_mlp": 0.01069032, "balance_loss_clip": 1.49547434, "balance_loss_mlp": 1.03338885, "epoch": 0.08278971892379378, "flos": 20423087521920.0, "grad_norm": 1.678029201777842, "language_loss": 0.83895528, "learning_rate": 3.970839141169718e-06, "loss": 0.86739051, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.35644531, "step": 1377, "time_per_iteration": 3.092580556869507 }, { "auxiliary_loss_clip": 0.01756542, "auxiliary_loss_mlp": 0.01062618, "balance_loss_clip": 1.48295379, "balance_loss_mlp": 1.02716553, "epoch": 0.08284984217646174, "flos": 26261449701120.0, "grad_norm": 1.7709881064104032, "language_loss": 0.85894746, "learning_rate": 3.970772840048147e-06, "loss": 0.88713908, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.35449219, "step": 1378, "time_per_iteration": 4.38018798828125 }, { "auxiliary_loss_clip": 0.01758226, "auxiliary_loss_mlp": 0.0106796, "balance_loss_clip": 1.47700381, "balance_loss_mlp": 1.03143442, "epoch": 0.08290996542912972, "flos": 27205060147200.0, "grad_norm": 2.0660573269683056, "language_loss": 0.89093399, "learning_rate": 3.970706464194672e-06, "loss": 0.91919583, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 2.80859375, "router_z_loss_mlp": 0.36572266, "step": 1379, "time_per_iteration": 4.387006044387817 }, { "auxiliary_loss_clip": 0.01750377, "auxiliary_loss_mlp": 0.01059168, "balance_loss_clip": 1.48139834, "balance_loss_mlp": 1.02538407, "epoch": 0.08297008868179769, "flos": 38632430557440.0, "grad_norm": 1.7566636061174814, "language_loss": 0.80076408, "learning_rate": 3.970640013611812e-06, "loss": 0.82885951, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.33789062, "step": 1380, "time_per_iteration": 4.435056924819946 }, { "auxiliary_loss_clip": 0.01739797, "auxiliary_loss_mlp": 0.01067806, "balance_loss_clip": 1.47371936, "balance_loss_mlp": 1.03278244, "epoch": 0.08303021193446565, "flos": 19984338264960.0, "grad_norm": 2.215079791918043, "language_loss": 0.87531769, "learning_rate": 3.970573488302083e-06, "loss": 0.90339375, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.3503418, "step": 1381, "time_per_iteration": 2.961040496826172 }, { "auxiliary_loss_clip": 0.01780845, "auxiliary_loss_mlp": 0.01060262, "balance_loss_clip": 1.4971261, "balance_loss_mlp": 1.02423692, "epoch": 0.08309033518713363, "flos": 13670596523520.0, "grad_norm": 2.363480393063436, "language_loss": 0.9038586, "learning_rate": 3.970506888268011e-06, "loss": 0.93226969, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 2.83398438, "router_z_loss_mlp": 0.35986328, "step": 1382, "time_per_iteration": 2.8522253036499023 }, { "auxiliary_loss_clip": 0.01776369, "auxiliary_loss_mlp": 0.01061513, "balance_loss_clip": 1.49793625, "balance_loss_mlp": 1.02794385, "epoch": 0.0831504584398016, "flos": 17977770293760.0, "grad_norm": 1.8115347387199894, "language_loss": 0.78539979, "learning_rate": 3.970440213512121e-06, "loss": 0.81377864, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 2.78320312, "router_z_loss_mlp": 0.33569336, "step": 1383, "time_per_iteration": 2.922457695007324 }, { "auxiliary_loss_clip": 0.01775112, "auxiliary_loss_mlp": 0.01067079, "balance_loss_clip": 1.49554121, "balance_loss_mlp": 1.03336668, "epoch": 0.08321058169246956, "flos": 22611585409920.0, "grad_norm": 1.8434127222020462, "language_loss": 0.84890991, "learning_rate": 3.97037346403694e-06, "loss": 0.87733179, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 2.80078125, "router_z_loss_mlp": 0.3371582, "step": 1384, "time_per_iteration": 2.91709041595459 }, { "auxiliary_loss_clip": 0.01799467, "auxiliary_loss_mlp": 0.01069352, "balance_loss_clip": 1.50863647, "balance_loss_mlp": 1.03375578, "epoch": 0.08327070494513754, "flos": 22859039076480.0, "grad_norm": 2.4367860614338164, "language_loss": 0.86898071, "learning_rate": 3.970306639845e-06, "loss": 0.89766884, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 2.90625, "router_z_loss_mlp": 0.35620117, "step": 1385, "time_per_iteration": 2.870175361633301 }, { "auxiliary_loss_clip": 0.01771334, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.48844039, "balance_loss_mlp": 1.02979064, "epoch": 0.0833308281978055, "flos": 22793017633920.0, "grad_norm": 1.8684560632085678, "language_loss": 0.71067047, "learning_rate": 3.970239740938835e-06, "loss": 0.73906296, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 2.828125, "router_z_loss_mlp": 0.38134766, "step": 1386, "time_per_iteration": 2.8969640731811523 }, { "auxiliary_loss_clip": 0.01749167, "auxiliary_loss_mlp": 0.01069108, "balance_loss_clip": 1.47469711, "balance_loss_mlp": 1.03320265, "epoch": 0.08339095145047347, "flos": 20822084582400.0, "grad_norm": 1.539133203196264, "language_loss": 0.83382404, "learning_rate": 3.97017276732098e-06, "loss": 0.86200678, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 2.74414062, "router_z_loss_mlp": 0.35888672, "step": 1387, "time_per_iteration": 2.85795521736145 }, { "auxiliary_loss_clip": 0.017588, "auxiliary_loss_mlp": 0.01074243, "balance_loss_clip": 1.47759008, "balance_loss_mlp": 1.03714466, "epoch": 0.08345107470314143, "flos": 18524555429760.0, "grad_norm": 1.8857309010351542, "language_loss": 0.79107267, "learning_rate": 3.970105718993978e-06, "loss": 0.81940311, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 2.81445312, "router_z_loss_mlp": 0.37109375, "step": 1388, "time_per_iteration": 2.8621129989624023 }, { "auxiliary_loss_clip": 0.01734578, "auxiliary_loss_mlp": 0.01078869, "balance_loss_clip": 1.46533728, "balance_loss_mlp": 1.04165232, "epoch": 0.08351119795580941, "flos": 18816964444800.0, "grad_norm": 2.4525114574714437, "language_loss": 0.81249988, "learning_rate": 3.970038595960369e-06, "loss": 0.84063441, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.37231445, "step": 1389, "time_per_iteration": 2.837688684463501 }, { "auxiliary_loss_clip": 0.01774585, "auxiliary_loss_mlp": 0.01070846, "balance_loss_clip": 1.49297249, "balance_loss_mlp": 1.03520286, "epoch": 0.08357132120847738, "flos": 18450887616000.0, "grad_norm": 5.384402542377428, "language_loss": 0.89111555, "learning_rate": 3.969971398222699e-06, "loss": 0.91956991, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.35620117, "step": 1390, "time_per_iteration": 2.844578742980957 }, { "auxiliary_loss_clip": 0.01749576, "auxiliary_loss_mlp": 0.01076883, "balance_loss_clip": 1.47308815, "balance_loss_mlp": 1.03928483, "epoch": 0.08363144446114534, "flos": 25933586745600.0, "grad_norm": 2.347095529463442, "language_loss": 0.88276225, "learning_rate": 3.969904125783517e-06, "loss": 0.9110269, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 0.3762207, "step": 1391, "time_per_iteration": 2.9085569381713867 }, { "auxiliary_loss_clip": 0.01767857, "auxiliary_loss_mlp": 0.01075147, "balance_loss_clip": 1.47996712, "balance_loss_mlp": 1.0375011, "epoch": 0.08369156771381332, "flos": 18050487966720.0, "grad_norm": 1.9889020763210228, "language_loss": 0.89444786, "learning_rate": 3.969836778645371e-06, "loss": 0.92287785, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 0.37646484, "step": 1392, "time_per_iteration": 2.8006601333618164 }, { "auxiliary_loss_clip": 0.01743371, "auxiliary_loss_mlp": 0.01078581, "balance_loss_clip": 1.46625674, "balance_loss_mlp": 1.04336667, "epoch": 0.08375169096648129, "flos": 22685524692480.0, "grad_norm": 2.55870208267229, "language_loss": 0.82534647, "learning_rate": 3.969769356810819e-06, "loss": 0.85356599, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 2.77148438, "router_z_loss_mlp": 0.35229492, "step": 1393, "time_per_iteration": 2.906816005706787 }, { "auxiliary_loss_clip": 0.01727602, "auxiliary_loss_mlp": 0.01081254, "balance_loss_clip": 1.46002972, "balance_loss_mlp": 1.04544353, "epoch": 0.08381181421914925, "flos": 26114114073600.0, "grad_norm": 1.730721656081755, "language_loss": 0.86437201, "learning_rate": 3.969701860282415e-06, "loss": 0.89246058, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.3581543, "step": 1394, "time_per_iteration": 2.8907511234283447 }, { "auxiliary_loss_clip": 0.01735164, "auxiliary_loss_mlp": 0.01079019, "balance_loss_clip": 1.45972848, "balance_loss_mlp": 1.04397166, "epoch": 0.08387193747181723, "flos": 20639114035200.0, "grad_norm": 1.7258958866692262, "language_loss": 0.84163547, "learning_rate": 3.969634289062719e-06, "loss": 0.86977726, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.35058594, "step": 1395, "time_per_iteration": 2.9512531757354736 }, { "auxiliary_loss_clip": 0.01732564, "auxiliary_loss_mlp": 0.01076692, "balance_loss_clip": 1.45518935, "balance_loss_mlp": 1.0392369, "epoch": 0.0839320607244852, "flos": 13450950426240.0, "grad_norm": 3.2617413051056214, "language_loss": 0.84993708, "learning_rate": 3.969566643154293e-06, "loss": 0.87802964, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 2.7734375, "router_z_loss_mlp": 0.375, "step": 1396, "time_per_iteration": 2.8676838874816895 }, { "auxiliary_loss_clip": 0.01720197, "auxiliary_loss_mlp": 0.01070898, "balance_loss_clip": 1.45195341, "balance_loss_mlp": 1.03279829, "epoch": 0.08399218397715316, "flos": 23487726579840.0, "grad_norm": 1.7894836092889945, "language_loss": 0.78699738, "learning_rate": 3.969498922559703e-06, "loss": 0.81490833, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.38085938, "step": 1397, "time_per_iteration": 2.9442577362060547 }, { "auxiliary_loss_clip": 0.01732432, "auxiliary_loss_mlp": 0.01068522, "balance_loss_clip": 1.45987952, "balance_loss_mlp": 1.03216338, "epoch": 0.08405230722982113, "flos": 25931550729600.0, "grad_norm": 1.7699328362967883, "language_loss": 0.79700577, "learning_rate": 3.969431127281516e-06, "loss": 0.82501525, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.36352539, "step": 1398, "time_per_iteration": 2.939739227294922 }, { "auxiliary_loss_clip": 0.01710663, "auxiliary_loss_mlp": 0.01069924, "balance_loss_clip": 1.44544983, "balance_loss_mlp": 1.03256428, "epoch": 0.0841124304824891, "flos": 17976367704960.0, "grad_norm": 2.601761082044779, "language_loss": 0.96254277, "learning_rate": 3.969363257322304e-06, "loss": 0.99034858, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.3737793, "step": 1399, "time_per_iteration": 2.8060302734375 }, { "auxiliary_loss_clip": 0.01719427, "auxiliary_loss_mlp": 0.01058339, "balance_loss_clip": 1.43792915, "balance_loss_mlp": 1.02150297, "epoch": 0.08417255373515707, "flos": 25640634792960.0, "grad_norm": 2.4764195377250817, "language_loss": 0.83375692, "learning_rate": 3.96929531268464e-06, "loss": 0.8615346, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 2.81835938, "router_z_loss_mlp": 0.36816406, "step": 1400, "time_per_iteration": 2.9495186805725098 }, { "auxiliary_loss_clip": 0.01705879, "auxiliary_loss_mlp": 0.01062138, "balance_loss_clip": 1.43512976, "balance_loss_mlp": 1.02596998, "epoch": 0.08423267698782504, "flos": 26260816273920.0, "grad_norm": 1.6750564853355931, "language_loss": 0.88053596, "learning_rate": 3.969227293371099e-06, "loss": 0.90821612, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.36157227, "step": 1401, "time_per_iteration": 2.911494016647339 }, { "auxiliary_loss_clip": 0.0170274, "auxiliary_loss_mlp": 0.01065486, "balance_loss_clip": 1.43108153, "balance_loss_mlp": 1.02750587, "epoch": 0.08429280024049302, "flos": 20129094938880.0, "grad_norm": 1.7529957645445644, "language_loss": 0.8862474, "learning_rate": 3.969159199384263e-06, "loss": 0.9139297, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.37988281, "step": 1402, "time_per_iteration": 2.9010045528411865 }, { "auxiliary_loss_clip": 0.0167554, "auxiliary_loss_mlp": 0.01060664, "balance_loss_clip": 1.41144276, "balance_loss_mlp": 1.02428102, "epoch": 0.08435292349316098, "flos": 42940147265280.0, "grad_norm": 2.0878641968795275, "language_loss": 0.89897627, "learning_rate": 3.9690910307267125e-06, "loss": 0.92633832, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.36352539, "step": 1403, "time_per_iteration": 3.0818636417388916 }, { "auxiliary_loss_clip": 0.01702916, "auxiliary_loss_mlp": 0.01062048, "balance_loss_clip": 1.42794204, "balance_loss_mlp": 1.02397251, "epoch": 0.08441304674582895, "flos": 22867680833280.0, "grad_norm": 2.5976180259762147, "language_loss": 0.81951451, "learning_rate": 3.969022787401033e-06, "loss": 0.84716409, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 0.38085938, "step": 1404, "time_per_iteration": 2.874556064605713 }, { "auxiliary_loss_clip": 0.01714956, "auxiliary_loss_mlp": 0.01065933, "balance_loss_clip": 1.43481994, "balance_loss_mlp": 1.0280478, "epoch": 0.08447316999849692, "flos": 18706711570560.0, "grad_norm": 1.8771345486199051, "language_loss": 0.85080957, "learning_rate": 3.968954469409811e-06, "loss": 0.87861848, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.37915039, "step": 1405, "time_per_iteration": 2.836319923400879 }, { "auxiliary_loss_clip": 0.01693822, "auxiliary_loss_mlp": 0.01059742, "balance_loss_clip": 1.42305446, "balance_loss_mlp": 1.02083218, "epoch": 0.08453329325116489, "flos": 25494973223040.0, "grad_norm": 1.7005052167643189, "language_loss": 0.811131, "learning_rate": 3.968886076755639e-06, "loss": 0.83866668, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.38916016, "step": 1406, "time_per_iteration": 2.8876311779022217 }, { "auxiliary_loss_clip": 0.01706433, "auxiliary_loss_mlp": 0.01071811, "balance_loss_clip": 1.43202448, "balance_loss_mlp": 1.03278136, "epoch": 0.08459341650383286, "flos": 20929034586240.0, "grad_norm": 1.9097070286358913, "language_loss": 0.80653954, "learning_rate": 3.96881760944111e-06, "loss": 0.83432198, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 2.74609375, "router_z_loss_mlp": 0.39038086, "step": 1407, "time_per_iteration": 2.892956256866455 }, { "auxiliary_loss_clip": 0.0170535, "auxiliary_loss_mlp": 0.0107212, "balance_loss_clip": 1.43234146, "balance_loss_mlp": 1.03330588, "epoch": 0.08465353975650082, "flos": 13051048469760.0, "grad_norm": 2.0263555832229065, "language_loss": 0.92362094, "learning_rate": 3.968749067468819e-06, "loss": 0.95139563, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.38842773, "step": 1408, "time_per_iteration": 4.275665521621704 }, { "auxiliary_loss_clip": 0.01451762, "auxiliary_loss_mlp": 0.01085111, "balance_loss_clip": 1.29224741, "balance_loss_mlp": 1.05707347, "epoch": 0.0847136630091688, "flos": 60907664972160.0, "grad_norm": 0.9089391392015953, "language_loss": 0.61906993, "learning_rate": 3.968680450841368e-06, "loss": 0.64443862, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.28125, "step": 1409, "time_per_iteration": 3.3845105171203613 }, { "auxiliary_loss_clip": 0.01674573, "auxiliary_loss_mlp": 0.01068142, "balance_loss_clip": 1.41418517, "balance_loss_mlp": 1.03006649, "epoch": 0.08477378626183676, "flos": 22054620193920.0, "grad_norm": 1.7096289229701715, "language_loss": 0.88204515, "learning_rate": 3.968611759561355e-06, "loss": 0.90947223, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.38085938, "step": 1410, "time_per_iteration": 2.8872504234313965 }, { "auxiliary_loss_clip": 0.01714445, "auxiliary_loss_mlp": 0.0107726, "balance_loss_clip": 1.44080353, "balance_loss_mlp": 1.03646648, "epoch": 0.08483390951450473, "flos": 16698560031360.0, "grad_norm": 2.255376832042188, "language_loss": 0.75654721, "learning_rate": 3.968542993631388e-06, "loss": 0.78446418, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.40771484, "step": 1411, "time_per_iteration": 2.8400719165802 }, { "auxiliary_loss_clip": 0.01444405, "auxiliary_loss_mlp": 0.01089337, "balance_loss_clip": 1.28138685, "balance_loss_mlp": 1.06301582, "epoch": 0.08489403276717271, "flos": 51615349061760.0, "grad_norm": 1.0261755756644533, "language_loss": 0.56826866, "learning_rate": 3.968474153054073e-06, "loss": 0.59360611, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.26367188, "step": 1412, "time_per_iteration": 4.642136812210083 }, { "auxiliary_loss_clip": 0.01705887, "auxiliary_loss_mlp": 0.01076071, "balance_loss_clip": 1.43474674, "balance_loss_mlp": 1.03542089, "epoch": 0.08495415601984067, "flos": 17101176675840.0, "grad_norm": 2.0336288613133005, "language_loss": 0.91326344, "learning_rate": 3.96840523783202e-06, "loss": 0.94108301, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.40649414, "step": 1413, "time_per_iteration": 2.8639779090881348 }, { "auxiliary_loss_clip": 0.01695898, "auxiliary_loss_mlp": 0.01063387, "balance_loss_clip": 1.4273752, "balance_loss_mlp": 1.02547848, "epoch": 0.08501427927250864, "flos": 23158687259520.0, "grad_norm": 1.710128651009867, "language_loss": 0.89320958, "learning_rate": 3.968336247967844e-06, "loss": 0.92080241, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.37890625, "step": 1414, "time_per_iteration": 4.273478031158447 }, { "auxiliary_loss_clip": 0.0170861, "auxiliary_loss_mlp": 0.01065077, "balance_loss_clip": 1.43735564, "balance_loss_mlp": 1.02814555, "epoch": 0.08507440252517662, "flos": 19073059868160.0, "grad_norm": 1.614113913989463, "language_loss": 0.78804165, "learning_rate": 3.96826718346416e-06, "loss": 0.81577849, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.36938477, "step": 1415, "time_per_iteration": 4.234161615371704 }, { "auxiliary_loss_clip": 0.01699921, "auxiliary_loss_mlp": 0.01064141, "balance_loss_clip": 1.43137252, "balance_loss_mlp": 1.02599382, "epoch": 0.08513452577784458, "flos": 60202396005120.0, "grad_norm": 1.838000390275932, "language_loss": 0.72164333, "learning_rate": 3.968198044323587e-06, "loss": 0.74928391, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.38134766, "step": 1416, "time_per_iteration": 3.2811455726623535 }, { "auxiliary_loss_clip": 0.0171938, "auxiliary_loss_mlp": 0.01063482, "balance_loss_clip": 1.43907213, "balance_loss_mlp": 1.0254786, "epoch": 0.08519464903051255, "flos": 27320018480640.0, "grad_norm": 2.0694159304260125, "language_loss": 0.76324677, "learning_rate": 3.968128830548748e-06, "loss": 0.79107541, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.37988281, "step": 1417, "time_per_iteration": 2.9654271602630615 }, { "auxiliary_loss_clip": 0.01698695, "auxiliary_loss_mlp": 0.0105679, "balance_loss_clip": 1.42641354, "balance_loss_mlp": 1.01850009, "epoch": 0.08525477228318051, "flos": 20276068608000.0, "grad_norm": 2.7134467086714196, "language_loss": 0.83940411, "learning_rate": 3.968059542142265e-06, "loss": 0.86695892, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.38305664, "step": 1418, "time_per_iteration": 3.080857992172241 }, { "auxiliary_loss_clip": 0.01428271, "auxiliary_loss_mlp": 0.01059974, "balance_loss_clip": 1.26694417, "balance_loss_mlp": 1.03670406, "epoch": 0.08531489553584849, "flos": 67643822597760.0, "grad_norm": 0.8750690451173324, "language_loss": 0.56656063, "learning_rate": 3.9679901791067685e-06, "loss": 0.59144306, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.23242188, "step": 1419, "time_per_iteration": 3.3670828342437744 }, { "auxiliary_loss_clip": 0.01689507, "auxiliary_loss_mlp": 0.01062137, "balance_loss_clip": 1.41599929, "balance_loss_mlp": 1.02675605, "epoch": 0.08537501878851646, "flos": 27538442968320.0, "grad_norm": 2.3764680350747738, "language_loss": 0.71989983, "learning_rate": 3.967920741444886e-06, "loss": 0.74741632, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 2.734375, "router_z_loss_mlp": 0.35375977, "step": 1420, "time_per_iteration": 3.0056228637695312 }, { "auxiliary_loss_clip": 0.01692804, "auxiliary_loss_mlp": 0.01059803, "balance_loss_clip": 1.42318439, "balance_loss_mlp": 1.02146566, "epoch": 0.08543514204118442, "flos": 22794601201920.0, "grad_norm": 2.4690451007342156, "language_loss": 0.89526373, "learning_rate": 3.967851229159252e-06, "loss": 0.92278981, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.38354492, "step": 1421, "time_per_iteration": 3.0284202098846436 }, { "auxiliary_loss_clip": 0.01431016, "auxiliary_loss_mlp": 0.01054971, "balance_loss_clip": 1.2702651, "balance_loss_mlp": 1.02101982, "epoch": 0.0854952652938524, "flos": 61021989878400.0, "grad_norm": 0.8036311751223029, "language_loss": 0.635903, "learning_rate": 3.967781642252502e-06, "loss": 0.66076291, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.33984375, "step": 1422, "time_per_iteration": 3.3928236961364746 }, { "auxiliary_loss_clip": 0.01684407, "auxiliary_loss_mlp": 0.01066852, "balance_loss_clip": 1.41631794, "balance_loss_mlp": 1.02827597, "epoch": 0.08555538854652037, "flos": 28049728919040.0, "grad_norm": 1.8219653336092325, "language_loss": 0.84232134, "learning_rate": 3.967711980727276e-06, "loss": 0.86983395, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.38574219, "step": 1423, "time_per_iteration": 3.0131216049194336 }, { "auxiliary_loss_clip": 0.01696909, "auxiliary_loss_mlp": 0.01066573, "balance_loss_clip": 1.42668986, "balance_loss_mlp": 1.03128731, "epoch": 0.08561551179918833, "flos": 23518610795520.0, "grad_norm": 1.7632183728226252, "language_loss": 0.75618935, "learning_rate": 3.967642244586213e-06, "loss": 0.78382409, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.35302734, "step": 1424, "time_per_iteration": 2.926640748977661 }, { "auxiliary_loss_clip": 0.01689533, "auxiliary_loss_mlp": 0.01060066, "balance_loss_clip": 1.41794598, "balance_loss_mlp": 1.02308726, "epoch": 0.08567563505185631, "flos": 17934805716480.0, "grad_norm": 1.9034944689676325, "language_loss": 0.77651429, "learning_rate": 3.96757243383196e-06, "loss": 0.80401027, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.36938477, "step": 1425, "time_per_iteration": 2.899348735809326 }, { "auxiliary_loss_clip": 0.01692776, "auxiliary_loss_mlp": 0.01065587, "balance_loss_clip": 1.42230022, "balance_loss_mlp": 1.02634311, "epoch": 0.08573575830452428, "flos": 19729102492800.0, "grad_norm": 1.943892824896567, "language_loss": 0.94842446, "learning_rate": 3.9675025484671624e-06, "loss": 0.97600806, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.39233398, "step": 1426, "time_per_iteration": 2.9380276203155518 }, { "auxiliary_loss_clip": 0.01704396, "auxiliary_loss_mlp": 0.01062891, "balance_loss_clip": 1.42579079, "balance_loss_mlp": 1.02457714, "epoch": 0.08579588155719224, "flos": 17940642295680.0, "grad_norm": 2.1700824985926315, "language_loss": 0.78074801, "learning_rate": 3.967432588494471e-06, "loss": 0.80842084, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 0.38354492, "step": 1427, "time_per_iteration": 2.9521608352661133 }, { "auxiliary_loss_clip": 0.01696848, "auxiliary_loss_mlp": 0.0106418, "balance_loss_clip": 1.42563224, "balance_loss_mlp": 1.02810717, "epoch": 0.08585600480986022, "flos": 16041657755520.0, "grad_norm": 2.312776955225836, "language_loss": 0.83301485, "learning_rate": 3.96736255391654e-06, "loss": 0.86062503, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.36083984, "step": 1428, "time_per_iteration": 2.981518507003784 }, { "auxiliary_loss_clip": 0.01704344, "auxiliary_loss_mlp": 0.0106686, "balance_loss_clip": 1.42512119, "balance_loss_mlp": 1.03133535, "epoch": 0.08591612806252819, "flos": 28669050748800.0, "grad_norm": 1.838207722422203, "language_loss": 0.82079577, "learning_rate": 3.967292444736023e-06, "loss": 0.84850776, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 2.79296875, "router_z_loss_mlp": 0.35498047, "step": 1429, "time_per_iteration": 2.9386069774627686 }, { "auxiliary_loss_clip": 0.01702808, "auxiliary_loss_mlp": 0.01069937, "balance_loss_clip": 1.43006253, "balance_loss_mlp": 1.03274357, "epoch": 0.08597625131519615, "flos": 20968786782720.0, "grad_norm": 2.388530811603044, "language_loss": 0.88910341, "learning_rate": 3.967222260955578e-06, "loss": 0.9168309, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.37207031, "step": 1430, "time_per_iteration": 2.874907970428467 }, { "auxiliary_loss_clip": 0.01681406, "auxiliary_loss_mlp": 0.01065273, "balance_loss_clip": 1.41620493, "balance_loss_mlp": 1.0326817, "epoch": 0.08603637456786412, "flos": 23266270690560.0, "grad_norm": 1.5623032415037705, "language_loss": 0.82888043, "learning_rate": 3.96715200257787e-06, "loss": 0.8563472, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.32568359, "step": 1431, "time_per_iteration": 2.861708164215088 }, { "auxiliary_loss_clip": 0.01700497, "auxiliary_loss_mlp": 0.01081992, "balance_loss_clip": 1.42888129, "balance_loss_mlp": 1.04611015, "epoch": 0.0860964978205321, "flos": 28705862033280.0, "grad_norm": 2.435225587410007, "language_loss": 0.7889396, "learning_rate": 3.967081669605559e-06, "loss": 0.81676447, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.35888672, "step": 1432, "time_per_iteration": 2.9523355960845947 }, { "auxiliary_loss_clip": 0.0169521, "auxiliary_loss_mlp": 0.01074067, "balance_loss_clip": 1.42399538, "balance_loss_mlp": 1.03792238, "epoch": 0.08615662107320006, "flos": 19327933681920.0, "grad_norm": 1.8134356160512286, "language_loss": 0.75193352, "learning_rate": 3.967011262041315e-06, "loss": 0.77962625, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.36132812, "step": 1433, "time_per_iteration": 3.0045876502990723 }, { "auxiliary_loss_clip": 0.01713174, "auxiliary_loss_mlp": 0.01069437, "balance_loss_clip": 1.43466425, "balance_loss_mlp": 1.03355479, "epoch": 0.08621674432586802, "flos": 15860180286720.0, "grad_norm": 2.816150325384874, "language_loss": 0.87046254, "learning_rate": 3.9669407798878065e-06, "loss": 0.89828873, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 2.7890625, "router_z_loss_mlp": 0.35913086, "step": 1434, "time_per_iteration": 2.818168878555298 }, { "auxiliary_loss_clip": 0.01694877, "auxiliary_loss_mlp": 0.01076575, "balance_loss_clip": 1.42270827, "balance_loss_mlp": 1.04243362, "epoch": 0.086276867578536, "flos": 14108757598080.0, "grad_norm": 2.0602233215660486, "language_loss": 0.80409151, "learning_rate": 3.966870223147707e-06, "loss": 0.83180606, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 2.72265625, "router_z_loss_mlp": 0.34130859, "step": 1435, "time_per_iteration": 2.8941714763641357 }, { "auxiliary_loss_clip": 0.01445746, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.28814626, "balance_loss_mlp": 1.00978422, "epoch": 0.08633699083120397, "flos": 70218440760960.0, "grad_norm": 0.8988382165235212, "language_loss": 0.58007669, "learning_rate": 3.96679959182369e-06, "loss": 0.60481703, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18457031, "step": 1436, "time_per_iteration": 3.4676103591918945 }, { "auxiliary_loss_clip": 0.01705789, "auxiliary_loss_mlp": 0.01074657, "balance_loss_clip": 1.43048823, "balance_loss_mlp": 1.03913283, "epoch": 0.08639711408387193, "flos": 30310808745600.0, "grad_norm": 2.597100309015104, "language_loss": 0.7176888, "learning_rate": 3.966728885918437e-06, "loss": 0.74549323, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 2.75390625, "router_z_loss_mlp": 0.35522461, "step": 1437, "time_per_iteration": 3.0359976291656494 }, { "auxiliary_loss_clip": 0.01696379, "auxiliary_loss_mlp": 0.01075258, "balance_loss_clip": 1.42335057, "balance_loss_mlp": 1.04102135, "epoch": 0.08645723733653991, "flos": 20305957438080.0, "grad_norm": 2.006618996803624, "language_loss": 0.7437768, "learning_rate": 3.966658105434627e-06, "loss": 0.7714932, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.34228516, "step": 1438, "time_per_iteration": 2.883685827255249 }, { "auxiliary_loss_clip": 0.01686215, "auxiliary_loss_mlp": 0.01072271, "balance_loss_clip": 1.41788769, "balance_loss_mlp": 1.03769994, "epoch": 0.08651736058920788, "flos": 32903506846080.0, "grad_norm": 1.6065783654927681, "language_loss": 0.6571846, "learning_rate": 3.966587250374945e-06, "loss": 0.68476951, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.34570312, "step": 1439, "time_per_iteration": 3.0110933780670166 }, { "auxiliary_loss_clip": 0.01690549, "auxiliary_loss_mlp": 0.01077003, "balance_loss_clip": 1.42228723, "balance_loss_mlp": 1.04226589, "epoch": 0.08657748384187584, "flos": 22647446553600.0, "grad_norm": 1.9724974421113723, "language_loss": 0.89311683, "learning_rate": 3.966516320742077e-06, "loss": 0.92079234, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 2.68554688, "router_z_loss_mlp": 0.34741211, "step": 1440, "time_per_iteration": 2.935922384262085 }, { "auxiliary_loss_clip": 0.01728377, "auxiliary_loss_mlp": 0.01079676, "balance_loss_clip": 1.44433522, "balance_loss_mlp": 1.04427075, "epoch": 0.08663760709454381, "flos": 23668570621440.0, "grad_norm": 2.428064854464057, "language_loss": 0.85249579, "learning_rate": 3.9664453165387124e-06, "loss": 0.88057625, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 2.84179688, "router_z_loss_mlp": 0.35375977, "step": 1441, "time_per_iteration": 2.883333206176758 }, { "auxiliary_loss_clip": 0.01469998, "auxiliary_loss_mlp": 0.01019698, "balance_loss_clip": 1.31034112, "balance_loss_mlp": 0.99738199, "epoch": 0.08669773034721179, "flos": 62713345455360.0, "grad_norm": 0.8521165574315908, "language_loss": 0.60621727, "learning_rate": 3.966374237767545e-06, "loss": 0.63111424, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.22363281, "step": 1442, "time_per_iteration": 3.5239264965057373 }, { "auxiliary_loss_clip": 0.01709005, "auxiliary_loss_mlp": 0.01084022, "balance_loss_clip": 1.42988181, "balance_loss_mlp": 1.04895055, "epoch": 0.08675785359987975, "flos": 20677146929280.0, "grad_norm": 1.9882040729084187, "language_loss": 0.81546283, "learning_rate": 3.96630308443127e-06, "loss": 0.84339309, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 2.79101562, "router_z_loss_mlp": 0.35058594, "step": 1443, "time_per_iteration": 4.283212661743164 }, { "auxiliary_loss_clip": 0.01707026, "auxiliary_loss_mlp": 0.01079233, "balance_loss_clip": 1.43125248, "balance_loss_mlp": 1.04413795, "epoch": 0.08681797685254772, "flos": 26951905635840.0, "grad_norm": 1.5589618569217094, "language_loss": 0.8396157, "learning_rate": 3.966231856532584e-06, "loss": 0.86747831, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 2.7578125, "router_z_loss_mlp": 0.35107422, "step": 1444, "time_per_iteration": 2.954648971557617 }, { "auxiliary_loss_clip": 0.01723422, "auxiliary_loss_mlp": 0.01080446, "balance_loss_clip": 1.44141483, "balance_loss_mlp": 1.04656696, "epoch": 0.0868781001052157, "flos": 17721901094400.0, "grad_norm": 2.004602500040005, "language_loss": 0.88611609, "learning_rate": 3.966160554074189e-06, "loss": 0.91415477, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 2.81640625, "router_z_loss_mlp": 0.33862305, "step": 1445, "time_per_iteration": 2.815340280532837 }, { "auxiliary_loss_clip": 0.0170191, "auxiliary_loss_mlp": 0.01081163, "balance_loss_clip": 1.42954755, "balance_loss_mlp": 1.0482372, "epoch": 0.08693822335788366, "flos": 19904743382400.0, "grad_norm": 1.8989968519371574, "language_loss": 0.83322692, "learning_rate": 3.96608917705879e-06, "loss": 0.8610577, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 2.72070312, "router_z_loss_mlp": 0.3293457, "step": 1446, "time_per_iteration": 2.8846256732940674 }, { "auxiliary_loss_clip": 0.01470507, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.30922985, "balance_loss_mlp": 1.01409173, "epoch": 0.08699834661055163, "flos": 67050453300480.0, "grad_norm": 0.7325457547512711, "language_loss": 0.54894996, "learning_rate": 3.966017725489091e-06, "loss": 0.57400191, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.20605469, "step": 1447, "time_per_iteration": 4.8357110023498535 }, { "auxiliary_loss_clip": 0.01669095, "auxiliary_loss_mlp": 0.01074486, "balance_loss_clip": 1.40504169, "balance_loss_mlp": 1.04079723, "epoch": 0.0870584698632196, "flos": 13488485627520.0, "grad_norm": 2.174017014775898, "language_loss": 0.86280084, "learning_rate": 3.965946199367804e-06, "loss": 0.89023668, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.33666992, "step": 1448, "time_per_iteration": 2.8145554065704346 }, { "auxiliary_loss_clip": 0.01719913, "auxiliary_loss_mlp": 0.01077923, "balance_loss_clip": 1.4381783, "balance_loss_mlp": 1.04330468, "epoch": 0.08711859311588757, "flos": 16115551793280.0, "grad_norm": 2.644616822273634, "language_loss": 0.82496297, "learning_rate": 3.965874598697638e-06, "loss": 0.85294127, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 2.8203125, "router_z_loss_mlp": 0.34594727, "step": 1449, "time_per_iteration": 2.8412206172943115 }, { "auxiliary_loss_clip": 0.01693473, "auxiliary_loss_mlp": 0.01077764, "balance_loss_clip": 1.42366874, "balance_loss_mlp": 1.0444572, "epoch": 0.08717871636855554, "flos": 38486588008320.0, "grad_norm": 1.7273493153539616, "language_loss": 0.72537041, "learning_rate": 3.965802923481313e-06, "loss": 0.75308269, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.33300781, "step": 1450, "time_per_iteration": 5.849255084991455 }, { "auxiliary_loss_clip": 0.01687652, "auxiliary_loss_mlp": 0.01079067, "balance_loss_clip": 1.41450989, "balance_loss_mlp": 1.04237473, "epoch": 0.0872388396212235, "flos": 17608888287360.0, "grad_norm": 1.9194261179324634, "language_loss": 0.84253937, "learning_rate": 3.965731173721542e-06, "loss": 0.87020653, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 2.73242188, "router_z_loss_mlp": 0.3671875, "step": 1451, "time_per_iteration": 2.875861406326294 }, { "auxiliary_loss_clip": 0.01679399, "auxiliary_loss_mlp": 0.01095182, "balance_loss_clip": 1.41394186, "balance_loss_mlp": 1.05956221, "epoch": 0.08729896287389148, "flos": 25268857119360.0, "grad_norm": 1.7061062845768897, "language_loss": 0.75552243, "learning_rate": 3.965659349421049e-06, "loss": 0.78326821, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.35620117, "step": 1452, "time_per_iteration": 2.885282516479492 }, { "auxiliary_loss_clip": 0.01701231, "auxiliary_loss_mlp": 0.01079702, "balance_loss_clip": 1.42550969, "balance_loss_mlp": 1.04439187, "epoch": 0.08735908612655945, "flos": 15640805658240.0, "grad_norm": 2.998545555636555, "language_loss": 0.83116233, "learning_rate": 3.965587450582556e-06, "loss": 0.85897171, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.35302734, "step": 1453, "time_per_iteration": 2.867849588394165 }, { "auxiliary_loss_clip": 0.0167399, "auxiliary_loss_mlp": 0.01090298, "balance_loss_clip": 1.40713286, "balance_loss_mlp": 1.05372465, "epoch": 0.08741920937922741, "flos": 20349374463360.0, "grad_norm": 1.9683129212356871, "language_loss": 0.73230976, "learning_rate": 3.96551547720879e-06, "loss": 0.75995266, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.36572266, "step": 1454, "time_per_iteration": 2.8371164798736572 }, { "auxiliary_loss_clip": 0.01473424, "auxiliary_loss_mlp": 0.01046824, "balance_loss_clip": 1.31099701, "balance_loss_mlp": 1.02450836, "epoch": 0.08747933263189539, "flos": 62852781225600.0, "grad_norm": 0.7776465392612809, "language_loss": 0.58726376, "learning_rate": 3.96544342930248e-06, "loss": 0.61246622, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.22363281, "step": 1455, "time_per_iteration": 3.3931353092193604 }, { "auxiliary_loss_clip": 0.01685266, "auxiliary_loss_mlp": 0.01104446, "balance_loss_clip": 1.41374278, "balance_loss_mlp": 1.06827807, "epoch": 0.08753945588456336, "flos": 33048670723200.0, "grad_norm": 1.7511202281446623, "language_loss": 0.79132116, "learning_rate": 3.965371306866359e-06, "loss": 0.81921828, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 2.71484375, "router_z_loss_mlp": 0.36157227, "step": 1456, "time_per_iteration": 2.9769954681396484 }, { "auxiliary_loss_clip": 0.01694042, "auxiliary_loss_mlp": 0.0109419, "balance_loss_clip": 1.42124677, "balance_loss_mlp": 1.05978656, "epoch": 0.08759957913723132, "flos": 35559014008320.0, "grad_norm": 1.9199576013135244, "language_loss": 0.73496616, "learning_rate": 3.96529910990316e-06, "loss": 0.7628485, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.34423828, "step": 1457, "time_per_iteration": 2.9944584369659424 }, { "auxiliary_loss_clip": 0.01671141, "auxiliary_loss_mlp": 0.01084023, "balance_loss_clip": 1.40630615, "balance_loss_mlp": 1.04821217, "epoch": 0.0876597023898993, "flos": 23920684502400.0, "grad_norm": 2.1024040208512083, "language_loss": 0.87908292, "learning_rate": 3.965226838415622e-06, "loss": 0.90663457, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.35839844, "step": 1458, "time_per_iteration": 2.94559907913208 }, { "auxiliary_loss_clip": 0.01694166, "auxiliary_loss_mlp": 0.01088486, "balance_loss_clip": 1.42233491, "balance_loss_mlp": 1.05341446, "epoch": 0.08771982564256726, "flos": 18123160394880.0, "grad_norm": 1.6049704396692936, "language_loss": 0.81521058, "learning_rate": 3.965154492406486e-06, "loss": 0.84303707, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.35083008, "step": 1459, "time_per_iteration": 2.846565008163452 }, { "auxiliary_loss_clip": 0.01694209, "auxiliary_loss_mlp": 0.0109419, "balance_loss_clip": 1.41897845, "balance_loss_mlp": 1.05711603, "epoch": 0.08777994889523523, "flos": 17720860464000.0, "grad_norm": 2.120039522535448, "language_loss": 0.85368615, "learning_rate": 3.9650820718784945e-06, "loss": 0.88157016, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 2.74804688, "router_z_loss_mlp": 0.37060547, "step": 1460, "time_per_iteration": 3.07643461227417 }, { "auxiliary_loss_clip": 0.01693482, "auxiliary_loss_mlp": 0.01091236, "balance_loss_clip": 1.4209913, "balance_loss_mlp": 1.05406654, "epoch": 0.0878400721479032, "flos": 12826742158080.0, "grad_norm": 3.6312260092922144, "language_loss": 0.82830882, "learning_rate": 3.965009576834394e-06, "loss": 0.85615599, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.37182617, "step": 1461, "time_per_iteration": 2.918724536895752 }, { "auxiliary_loss_clip": 0.01691393, "auxiliary_loss_mlp": 0.0108847, "balance_loss_clip": 1.42056823, "balance_loss_mlp": 1.05006099, "epoch": 0.08790019540057117, "flos": 26403039239040.0, "grad_norm": 2.4215007321019977, "language_loss": 0.77273512, "learning_rate": 3.964937007276932e-06, "loss": 0.80053377, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.38427734, "step": 1462, "time_per_iteration": 3.0084195137023926 }, { "auxiliary_loss_clip": 0.01712235, "auxiliary_loss_mlp": 0.01092823, "balance_loss_clip": 1.43169188, "balance_loss_mlp": 1.05572486, "epoch": 0.08796031865323914, "flos": 19143017608320.0, "grad_norm": 2.1269486431456897, "language_loss": 0.76087791, "learning_rate": 3.9648643632088634e-06, "loss": 0.78892845, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 2.80273438, "router_z_loss_mlp": 0.37084961, "step": 1463, "time_per_iteration": 2.9102320671081543 }, { "auxiliary_loss_clip": 0.01705008, "auxiliary_loss_mlp": 0.01082479, "balance_loss_clip": 1.42527044, "balance_loss_mlp": 1.04359317, "epoch": 0.0880204419059071, "flos": 26074542856320.0, "grad_norm": 1.7738267130447192, "language_loss": 0.84438133, "learning_rate": 3.964791644632941e-06, "loss": 0.87225622, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 2.79882812, "router_z_loss_mlp": 0.38891602, "step": 1464, "time_per_iteration": 2.892742395401001 }, { "auxiliary_loss_clip": 0.01694478, "auxiliary_loss_mlp": 0.01079096, "balance_loss_clip": 1.41953063, "balance_loss_mlp": 1.04056787, "epoch": 0.08808056515857508, "flos": 22386781405440.0, "grad_norm": 1.8455244796768437, "language_loss": 0.79969823, "learning_rate": 3.964718851551923e-06, "loss": 0.82743394, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.38476562, "step": 1465, "time_per_iteration": 2.916489839553833 }, { "auxiliary_loss_clip": 0.01707106, "auxiliary_loss_mlp": 0.01088937, "balance_loss_clip": 1.43181133, "balance_loss_mlp": 1.04897857, "epoch": 0.08814068841124305, "flos": 23195679523200.0, "grad_norm": 2.180820854670081, "language_loss": 0.86923927, "learning_rate": 3.9646459839685675e-06, "loss": 0.89719969, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 2.75195312, "router_z_loss_mlp": 0.39941406, "step": 1466, "time_per_iteration": 2.8658385276794434 }, { "auxiliary_loss_clip": 0.01692679, "auxiliary_loss_mlp": 0.01070206, "balance_loss_clip": 1.41867268, "balance_loss_mlp": 1.0314635, "epoch": 0.08820081166391101, "flos": 25166114881920.0, "grad_norm": 2.552061330735026, "language_loss": 0.85616136, "learning_rate": 3.964573041885641e-06, "loss": 0.88379014, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 2.74023438, "router_z_loss_mlp": 0.38720703, "step": 1467, "time_per_iteration": 2.935750722885132 }, { "auxiliary_loss_clip": 0.01687484, "auxiliary_loss_mlp": 0.01085464, "balance_loss_clip": 1.41671693, "balance_loss_mlp": 1.0474124, "epoch": 0.08826093491657899, "flos": 22240531653120.0, "grad_norm": 1.6075343445833274, "language_loss": 0.76493466, "learning_rate": 3.964500025305907e-06, "loss": 0.79266417, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 2.70703125, "router_z_loss_mlp": 0.38037109, "step": 1468, "time_per_iteration": 2.882694721221924 }, { "auxiliary_loss_clip": 0.01688703, "auxiliary_loss_mlp": 0.01069531, "balance_loss_clip": 1.42107892, "balance_loss_mlp": 1.03004849, "epoch": 0.08832105816924696, "flos": 22136839274880.0, "grad_norm": 1.7240328404565577, "language_loss": 0.81352949, "learning_rate": 3.9644269342321355e-06, "loss": 0.84111184, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.39501953, "step": 1469, "time_per_iteration": 2.873887300491333 }, { "auxiliary_loss_clip": 0.01684523, "auxiliary_loss_mlp": 0.01073465, "balance_loss_clip": 1.41281271, "balance_loss_mlp": 1.03415012, "epoch": 0.08838118142191492, "flos": 17575425118080.0, "grad_norm": 1.9588436549928667, "language_loss": 0.79851991, "learning_rate": 3.9643537686670974e-06, "loss": 0.82609981, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 2.71679688, "router_z_loss_mlp": 0.39331055, "step": 1470, "time_per_iteration": 2.901948928833008 }, { "auxiliary_loss_clip": 0.0168469, "auxiliary_loss_mlp": 0.01063463, "balance_loss_clip": 1.41662407, "balance_loss_mlp": 1.0249579, "epoch": 0.0884413046745829, "flos": 20786947355520.0, "grad_norm": 1.7079328903955453, "language_loss": 0.85606629, "learning_rate": 3.964280528613569e-06, "loss": 0.88354778, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 2.68164062, "router_z_loss_mlp": 0.38476562, "step": 1471, "time_per_iteration": 2.882233142852783 }, { "auxiliary_loss_clip": 0.01648562, "auxiliary_loss_mlp": 0.01074955, "balance_loss_clip": 1.39171231, "balance_loss_mlp": 1.03392363, "epoch": 0.08850142792725087, "flos": 22135119972480.0, "grad_norm": 1.6477712455437603, "language_loss": 0.84121907, "learning_rate": 3.964207214074324e-06, "loss": 0.86845422, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.41015625, "step": 1472, "time_per_iteration": 2.9077906608581543 }, { "auxiliary_loss_clip": 0.01675809, "auxiliary_loss_mlp": 0.01071689, "balance_loss_clip": 1.41121078, "balance_loss_mlp": 1.03151584, "epoch": 0.08856155117991883, "flos": 22428841086720.0, "grad_norm": 2.634937316587977, "language_loss": 0.85639405, "learning_rate": 3.964133825052146e-06, "loss": 0.88386905, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.40161133, "step": 1473, "time_per_iteration": 2.8921308517456055 }, { "auxiliary_loss_clip": 0.01671408, "auxiliary_loss_mlp": 0.0107022, "balance_loss_clip": 1.40414953, "balance_loss_mlp": 1.03023684, "epoch": 0.0886216744325868, "flos": 29949618355200.0, "grad_norm": 1.426364472899277, "language_loss": 0.79852486, "learning_rate": 3.964060361549816e-06, "loss": 0.82594109, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 2.67578125, "router_z_loss_mlp": 0.3996582, "step": 1474, "time_per_iteration": 2.9917385578155518 }, { "auxiliary_loss_clip": 0.01656355, "auxiliary_loss_mlp": 0.01065168, "balance_loss_clip": 1.39996362, "balance_loss_mlp": 1.0256387, "epoch": 0.08868179768525478, "flos": 23992090076160.0, "grad_norm": 1.682659188555523, "language_loss": 0.80450338, "learning_rate": 3.963986823570121e-06, "loss": 0.83171862, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.39526367, "step": 1475, "time_per_iteration": 3.0308282375335693 }, { "auxiliary_loss_clip": 0.01674579, "auxiliary_loss_mlp": 0.0106288, "balance_loss_clip": 1.40535843, "balance_loss_mlp": 1.0216341, "epoch": 0.08874192093792274, "flos": 43192623104640.0, "grad_norm": 1.4762002362913769, "language_loss": 0.75783765, "learning_rate": 3.963913211115848e-06, "loss": 0.78521222, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.41259766, "step": 1476, "time_per_iteration": 3.0930070877075195 }, { "auxiliary_loss_clip": 0.01663867, "auxiliary_loss_mlp": 0.0107097, "balance_loss_clip": 1.39975309, "balance_loss_mlp": 1.03146422, "epoch": 0.0888020441905907, "flos": 32864931014400.0, "grad_norm": 1.4055656188485488, "language_loss": 0.75966769, "learning_rate": 3.9638395241897895e-06, "loss": 0.78701603, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.39501953, "step": 1477, "time_per_iteration": 3.0115277767181396 }, { "auxiliary_loss_clip": 0.01659556, "auxiliary_loss_mlp": 0.01060351, "balance_loss_clip": 1.39239049, "balance_loss_mlp": 1.01874661, "epoch": 0.08886216744325869, "flos": 23159863624320.0, "grad_norm": 1.7128579526688374, "language_loss": 0.88039815, "learning_rate": 3.963765762794739e-06, "loss": 0.90759724, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 2.67382812, "router_z_loss_mlp": 0.41601562, "step": 1478, "time_per_iteration": 4.299095153808594 }, { "auxiliary_loss_clip": 0.0165666, "auxiliary_loss_mlp": 0.01067768, "balance_loss_clip": 1.39166903, "balance_loss_mlp": 1.02892947, "epoch": 0.08892229069592665, "flos": 23342110254720.0, "grad_norm": 1.5493895659949921, "language_loss": 0.78541017, "learning_rate": 3.963691926933495e-06, "loss": 0.81265444, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 2.65039062, "router_z_loss_mlp": 0.38842773, "step": 1479, "time_per_iteration": 2.929715156555176 }, { "auxiliary_loss_clip": 0.01656016, "auxiliary_loss_mlp": 0.01062727, "balance_loss_clip": 1.39300883, "balance_loss_mlp": 1.0217905, "epoch": 0.08898241394859462, "flos": 26225000375040.0, "grad_norm": 2.4639212824896823, "language_loss": 0.79197025, "learning_rate": 3.9636180166088555e-06, "loss": 0.81915772, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.40942383, "step": 1480, "time_per_iteration": 2.9235153198242188 }, { "auxiliary_loss_clip": 0.01693831, "auxiliary_loss_mlp": 0.01070554, "balance_loss_clip": 1.41809785, "balance_loss_mlp": 1.02868724, "epoch": 0.0890425372012626, "flos": 23561213414400.0, "grad_norm": 10.443373973339584, "language_loss": 0.68605328, "learning_rate": 3.963544031823624e-06, "loss": 0.71369708, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 2.75585938, "router_z_loss_mlp": 0.41870117, "step": 1481, "time_per_iteration": 2.9531354904174805 }, { "auxiliary_loss_clip": 0.01658667, "auxiliary_loss_mlp": 0.01071292, "balance_loss_clip": 1.39738476, "balance_loss_mlp": 1.02937758, "epoch": 0.08910266045393056, "flos": 23013116179200.0, "grad_norm": 1.9665640994860454, "language_loss": 0.97791833, "learning_rate": 3.9634699725806065e-06, "loss": 1.00521791, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.41894531, "step": 1482, "time_per_iteration": 4.270545482635498 }, { "auxiliary_loss_clip": 0.01676101, "auxiliary_loss_mlp": 0.01078856, "balance_loss_clip": 1.40607762, "balance_loss_mlp": 1.03987479, "epoch": 0.08916278370659853, "flos": 31947770793600.0, "grad_norm": 1.9059355623553798, "language_loss": 0.79606533, "learning_rate": 3.96339583888261e-06, "loss": 0.82361495, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 0.3894043, "step": 1483, "time_per_iteration": 3.0013105869293213 }, { "auxiliary_loss_clip": 0.01680897, "auxiliary_loss_mlp": 0.01077246, "balance_loss_clip": 1.41390324, "balance_loss_mlp": 1.03802633, "epoch": 0.08922290695926649, "flos": 17539428240000.0, "grad_norm": 1.9878844214004985, "language_loss": 0.87401807, "learning_rate": 3.963321630732448e-06, "loss": 0.90159953, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 2.66796875, "router_z_loss_mlp": 0.3918457, "step": 1484, "time_per_iteration": 2.878570318222046 }, { "auxiliary_loss_clip": 0.01682102, "auxiliary_loss_mlp": 0.01074906, "balance_loss_clip": 1.4093945, "balance_loss_mlp": 1.03423142, "epoch": 0.08928303021193447, "flos": 32137980508800.0, "grad_norm": 1.9650926117134675, "language_loss": 0.81299567, "learning_rate": 3.963247348132932e-06, "loss": 0.84056574, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 2.73046875, "router_z_loss_mlp": 0.40649414, "step": 1485, "time_per_iteration": 4.396559000015259 }, { "auxiliary_loss_clip": 0.01660476, "auxiliary_loss_mlp": 0.01069478, "balance_loss_clip": 1.39631033, "balance_loss_mlp": 1.02997208, "epoch": 0.08934315346460243, "flos": 22134803258880.0, "grad_norm": 1.595647567002402, "language_loss": 0.84274656, "learning_rate": 3.96317299108688e-06, "loss": 0.87004602, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.39501953, "step": 1486, "time_per_iteration": 4.358829498291016 }, { "auxiliary_loss_clip": 0.01652827, "auxiliary_loss_mlp": 0.01074429, "balance_loss_clip": 1.39119887, "balance_loss_mlp": 1.03358781, "epoch": 0.0894032767172704, "flos": 22575950490240.0, "grad_norm": 2.244217157801428, "language_loss": 0.78194451, "learning_rate": 3.963098559597111e-06, "loss": 0.8092171, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.40844727, "step": 1487, "time_per_iteration": 2.9381394386291504 }, { "auxiliary_loss_clip": 0.01635197, "auxiliary_loss_mlp": 0.01057849, "balance_loss_clip": 1.37701392, "balance_loss_mlp": 1.01884341, "epoch": 0.08946339996993838, "flos": 20202988976640.0, "grad_norm": 2.436894735809548, "language_loss": 0.8501749, "learning_rate": 3.963024053666449e-06, "loss": 0.87710536, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.39013672, "step": 1488, "time_per_iteration": 2.884742498397827 }, { "auxiliary_loss_clip": 0.01644925, "auxiliary_loss_mlp": 0.0105712, "balance_loss_clip": 1.38624668, "balance_loss_mlp": 1.01995039, "epoch": 0.08952352322260634, "flos": 48377023920000.0, "grad_norm": 2.3289671024855068, "language_loss": 0.74075156, "learning_rate": 3.962949473297718e-06, "loss": 0.76777196, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.37158203, "step": 1489, "time_per_iteration": 3.1257054805755615 }, { "auxiliary_loss_clip": 0.01633045, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.37348235, "balance_loss_mlp": 1.02361655, "epoch": 0.08958364647527431, "flos": 31804959646080.0, "grad_norm": 2.674069010172623, "language_loss": 0.9119972, "learning_rate": 3.962874818493745e-06, "loss": 0.93895292, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.38891602, "step": 1490, "time_per_iteration": 3.0113754272460938 }, { "auxiliary_loss_clip": 0.01655651, "auxiliary_loss_mlp": 0.01065339, "balance_loss_clip": 1.3933835, "balance_loss_mlp": 1.02788305, "epoch": 0.08964376972794229, "flos": 23378740560000.0, "grad_norm": 2.4490251042082325, "language_loss": 0.7669338, "learning_rate": 3.9628000892573635e-06, "loss": 0.79414368, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.37426758, "step": 1491, "time_per_iteration": 2.9008941650390625 }, { "auxiliary_loss_clip": 0.01647906, "auxiliary_loss_mlp": 0.01066715, "balance_loss_clip": 1.38630414, "balance_loss_mlp": 1.02995062, "epoch": 0.08970389298061025, "flos": 23305208480640.0, "grad_norm": 1.6066820056404474, "language_loss": 0.78682804, "learning_rate": 3.9627252855914055e-06, "loss": 0.81397426, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.36791992, "step": 1492, "time_per_iteration": 2.936413526535034 }, { "auxiliary_loss_clip": 0.01638439, "auxiliary_loss_mlp": 0.01081234, "balance_loss_clip": 1.37827098, "balance_loss_mlp": 1.03166699, "epoch": 0.08976401623327822, "flos": 33773675702400.0, "grad_norm": 1.8486330335418228, "language_loss": 0.72137928, "learning_rate": 3.962650407498707e-06, "loss": 0.74857593, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.49560547, "step": 1493, "time_per_iteration": 3.029247999191284 }, { "auxiliary_loss_clip": 0.01652156, "auxiliary_loss_mlp": 0.01066631, "balance_loss_clip": 1.38856578, "balance_loss_mlp": 1.03027236, "epoch": 0.08982413948594618, "flos": 23920955971200.0, "grad_norm": 2.0390692176178558, "language_loss": 0.88118517, "learning_rate": 3.962575454982109e-06, "loss": 0.90837306, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.36376953, "step": 1494, "time_per_iteration": 2.9170124530792236 }, { "auxiliary_loss_clip": 0.01632643, "auxiliary_loss_mlp": 0.0106423, "balance_loss_clip": 1.37449408, "balance_loss_mlp": 1.02868187, "epoch": 0.08988426273861416, "flos": 16846483841280.0, "grad_norm": 1.8717642691748706, "language_loss": 0.84031439, "learning_rate": 3.962500428044454e-06, "loss": 0.86728311, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.35546875, "step": 1495, "time_per_iteration": 2.90978741645813 }, { "auxiliary_loss_clip": 0.01669186, "auxiliary_loss_mlp": 0.01069894, "balance_loss_clip": 1.40159094, "balance_loss_mlp": 1.0360384, "epoch": 0.08994438599128213, "flos": 14801521017600.0, "grad_norm": 3.393541058830151, "language_loss": 0.72491241, "learning_rate": 3.962425326688585e-06, "loss": 0.75230318, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 2.67773438, "router_z_loss_mlp": 0.33862305, "step": 1496, "time_per_iteration": 2.8464479446411133 }, { "auxiliary_loss_clip": 0.01644599, "auxiliary_loss_mlp": 0.01070496, "balance_loss_clip": 1.38588929, "balance_loss_mlp": 1.03325534, "epoch": 0.09000450924395009, "flos": 17393268977280.0, "grad_norm": 1.5496575400627126, "language_loss": 0.81716537, "learning_rate": 3.962350150917351e-06, "loss": 0.8443163, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.37231445, "step": 1497, "time_per_iteration": 2.8943357467651367 }, { "auxiliary_loss_clip": 0.01658293, "auxiliary_loss_mlp": 0.01066662, "balance_loss_clip": 1.39332759, "balance_loss_mlp": 1.03070867, "epoch": 0.09006463249661807, "flos": 24291104832000.0, "grad_norm": 3.3952271450621967, "language_loss": 0.84967268, "learning_rate": 3.9622749007336035e-06, "loss": 0.87692219, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.35961914, "step": 1498, "time_per_iteration": 2.891442060470581 }, { "auxiliary_loss_clip": 0.01665443, "auxiliary_loss_mlp": 0.01067398, "balance_loss_clip": 1.39854503, "balance_loss_mlp": 1.03189778, "epoch": 0.09012475574928604, "flos": 13669691627520.0, "grad_norm": 2.015433740759597, "language_loss": 0.8095361, "learning_rate": 3.962199576140195e-06, "loss": 0.83686447, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.35498047, "step": 1499, "time_per_iteration": 2.927785873413086 }, { "auxiliary_loss_clip": 0.01644552, "auxiliary_loss_mlp": 0.01070456, "balance_loss_clip": 1.38787007, "balance_loss_mlp": 1.03500283, "epoch": 0.090184879001954, "flos": 23337269061120.0, "grad_norm": 1.5462937322027968, "language_loss": 0.93682665, "learning_rate": 3.962124177139981e-06, "loss": 0.96397674, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.35473633, "step": 1500, "time_per_iteration": 2.947924852371216 }, { "auxiliary_loss_clip": 0.01671828, "auxiliary_loss_mlp": 0.01072001, "balance_loss_clip": 1.40261638, "balance_loss_mlp": 1.03468895, "epoch": 0.09024500225462198, "flos": 23012980444800.0, "grad_norm": 2.5774026193694057, "language_loss": 0.76314932, "learning_rate": 3.962048703735822e-06, "loss": 0.79058754, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 2.69140625, "router_z_loss_mlp": 0.37329102, "step": 1501, "time_per_iteration": 2.8911335468292236 }, { "auxiliary_loss_clip": 0.01476009, "auxiliary_loss_mlp": 0.01057652, "balance_loss_clip": 1.31001985, "balance_loss_mlp": 1.03419137, "epoch": 0.09030512550728995, "flos": 62219343018240.0, "grad_norm": 0.739700095691954, "language_loss": 0.58482862, "learning_rate": 3.96197315593058e-06, "loss": 0.61016524, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.234375, "step": 1502, "time_per_iteration": 3.403590679168701 }, { "auxiliary_loss_clip": 0.01647799, "auxiliary_loss_mlp": 0.0106847, "balance_loss_clip": 1.38411653, "balance_loss_mlp": 1.03196836, "epoch": 0.09036524875995791, "flos": 38814993901440.0, "grad_norm": 2.05866843974999, "language_loss": 0.72498345, "learning_rate": 3.961897533727119e-06, "loss": 0.75214612, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.36499023, "step": 1503, "time_per_iteration": 3.0312466621398926 }, { "auxiliary_loss_clip": 0.01665307, "auxiliary_loss_mlp": 0.01085847, "balance_loss_clip": 1.39682531, "balance_loss_mlp": 1.04901123, "epoch": 0.09042537201262588, "flos": 21700261768320.0, "grad_norm": 1.9087514985889942, "language_loss": 0.87092113, "learning_rate": 3.961821837128306e-06, "loss": 0.89843261, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.3684082, "step": 1504, "time_per_iteration": 2.9238553047180176 }, { "auxiliary_loss_clip": 0.01670948, "auxiliary_loss_mlp": 0.01072346, "balance_loss_clip": 1.39870358, "balance_loss_mlp": 1.03653526, "epoch": 0.09048549526529386, "flos": 22275578390400.0, "grad_norm": 2.5746394782277813, "language_loss": 0.74122477, "learning_rate": 3.961746066137014e-06, "loss": 0.76865768, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.35791016, "step": 1505, "time_per_iteration": 2.953392744064331 }, { "auxiliary_loss_clip": 0.01650124, "auxiliary_loss_mlp": 0.01073541, "balance_loss_clip": 1.38625789, "balance_loss_mlp": 1.03880346, "epoch": 0.09054561851796182, "flos": 14619048163200.0, "grad_norm": 2.112731160702036, "language_loss": 0.83871639, "learning_rate": 3.961670220756114e-06, "loss": 0.86595297, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34741211, "step": 1506, "time_per_iteration": 2.8467507362365723 }, { "auxiliary_loss_clip": 0.01639445, "auxiliary_loss_mlp": 0.01070387, "balance_loss_clip": 1.38040805, "balance_loss_mlp": 1.03493464, "epoch": 0.09060574177062979, "flos": 27647609967360.0, "grad_norm": 1.6353697396254228, "language_loss": 0.77677554, "learning_rate": 3.961594300988482e-06, "loss": 0.8038739, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.35449219, "step": 1507, "time_per_iteration": 2.971989154815674 }, { "auxiliary_loss_clip": 0.01457405, "auxiliary_loss_mlp": 0.01026021, "balance_loss_clip": 1.29464793, "balance_loss_mlp": 1.00027204, "epoch": 0.09066586502329776, "flos": 66115032163200.0, "grad_norm": 0.8046106051236679, "language_loss": 0.57799071, "learning_rate": 3.961518306836998e-06, "loss": 0.60282499, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.2578125, "step": 1508, "time_per_iteration": 3.2084362506866455 }, { "auxiliary_loss_clip": 0.01655207, "auxiliary_loss_mlp": 0.0106778, "balance_loss_clip": 1.39106107, "balance_loss_mlp": 1.03235114, "epoch": 0.09072598827596573, "flos": 18925724240640.0, "grad_norm": 1.7674853944206754, "language_loss": 0.86488581, "learning_rate": 3.961442238304543e-06, "loss": 0.89211559, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.35400391, "step": 1509, "time_per_iteration": 2.8927559852600098 }, { "auxiliary_loss_clip": 0.01673208, "auxiliary_loss_mlp": 0.01094452, "balance_loss_clip": 1.39916849, "balance_loss_mlp": 1.05637646, "epoch": 0.0907861115286337, "flos": 24831148492800.0, "grad_norm": 4.48337319120008, "language_loss": 0.86248207, "learning_rate": 3.961366095394002e-06, "loss": 0.89015865, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 2.73828125, "router_z_loss_mlp": 0.38085938, "step": 1510, "time_per_iteration": 2.947505235671997 }, { "auxiliary_loss_clip": 0.01662225, "auxiliary_loss_mlp": 0.01074659, "balance_loss_clip": 1.39516759, "balance_loss_mlp": 1.03889644, "epoch": 0.09084623478130167, "flos": 21662817056640.0, "grad_norm": 2.026678293487158, "language_loss": 0.88278031, "learning_rate": 3.961289878108262e-06, "loss": 0.91014922, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.35791016, "step": 1511, "time_per_iteration": 2.859786033630371 }, { "auxiliary_loss_clip": 0.01645444, "auxiliary_loss_mlp": 0.01069385, "balance_loss_clip": 1.38609385, "balance_loss_mlp": 1.03583968, "epoch": 0.09090635803396964, "flos": 27651501020160.0, "grad_norm": 2.0124371127310314, "language_loss": 0.85914028, "learning_rate": 3.9612135864502135e-06, "loss": 0.88628858, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.33520508, "step": 1512, "time_per_iteration": 2.994142770767212 }, { "auxiliary_loss_clip": 0.01658503, "auxiliary_loss_mlp": 0.01073684, "balance_loss_clip": 1.39797199, "balance_loss_mlp": 1.03980482, "epoch": 0.0909664812866376, "flos": 17676312318720.0, "grad_norm": 2.1469279417870557, "language_loss": 0.89224726, "learning_rate": 3.961137220422749e-06, "loss": 0.91956913, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.33886719, "step": 1513, "time_per_iteration": 4.289369106292725 }, { "auxiliary_loss_clip": 0.0165105, "auxiliary_loss_mlp": 0.01065944, "balance_loss_clip": 1.38828921, "balance_loss_mlp": 1.02722454, "epoch": 0.09102660453930557, "flos": 23962020266880.0, "grad_norm": 1.6461351785800569, "language_loss": 0.87990695, "learning_rate": 3.961060780028764e-06, "loss": 0.90707684, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 2.63085938, "router_z_loss_mlp": 0.38696289, "step": 1514, "time_per_iteration": 2.90616512298584 }, { "auxiliary_loss_clip": 0.01636009, "auxiliary_loss_mlp": 0.01075415, "balance_loss_clip": 1.37680578, "balance_loss_mlp": 1.03896117, "epoch": 0.09108672779197355, "flos": 25824193522560.0, "grad_norm": 1.683506148467793, "language_loss": 0.91180718, "learning_rate": 3.960984265271159e-06, "loss": 0.93892145, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.36425781, "step": 1515, "time_per_iteration": 2.998619318008423 }, { "auxiliary_loss_clip": 0.01655958, "auxiliary_loss_mlp": 0.01071523, "balance_loss_clip": 1.39173818, "balance_loss_mlp": 1.03745317, "epoch": 0.09114685104464151, "flos": 29650558354560.0, "grad_norm": 1.7634488736913074, "language_loss": 0.86503738, "learning_rate": 3.9609076761528335e-06, "loss": 0.89231217, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.34082031, "step": 1516, "time_per_iteration": 2.938854694366455 }, { "auxiliary_loss_clip": 0.0166739, "auxiliary_loss_mlp": 0.01067818, "balance_loss_clip": 1.39876509, "balance_loss_mlp": 1.03296149, "epoch": 0.09120697429730948, "flos": 33742746241920.0, "grad_norm": 1.6036059964623912, "language_loss": 0.81596935, "learning_rate": 3.960831012676692e-06, "loss": 0.8433215, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.34863281, "step": 1517, "time_per_iteration": 4.508835077285767 }, { "auxiliary_loss_clip": 0.01663062, "auxiliary_loss_mlp": 0.01064118, "balance_loss_clip": 1.39769816, "balance_loss_mlp": 1.02897525, "epoch": 0.09126709754997746, "flos": 18409416117120.0, "grad_norm": 1.5995405068390827, "language_loss": 0.78774977, "learning_rate": 3.960754274845642e-06, "loss": 0.81502151, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.3515625, "step": 1518, "time_per_iteration": 2.919077157974243 }, { "auxiliary_loss_clip": 0.01655945, "auxiliary_loss_mlp": 0.01056672, "balance_loss_clip": 1.39179683, "balance_loss_mlp": 1.02338874, "epoch": 0.09132722080264542, "flos": 22102380720000.0, "grad_norm": 1.8266928893058352, "language_loss": 0.88356459, "learning_rate": 3.960677462662594e-06, "loss": 0.91069078, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.33251953, "step": 1519, "time_per_iteration": 2.8690521717071533 }, { "auxiliary_loss_clip": 0.01662544, "auxiliary_loss_mlp": 0.01063968, "balance_loss_clip": 1.39687383, "balance_loss_mlp": 1.02844381, "epoch": 0.09138734405531339, "flos": 21043088023680.0, "grad_norm": 2.250584127886328, "language_loss": 0.74989736, "learning_rate": 3.96060057613046e-06, "loss": 0.77716255, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.35522461, "step": 1520, "time_per_iteration": 4.306478977203369 }, { "auxiliary_loss_clip": 0.01666095, "auxiliary_loss_mlp": 0.01058503, "balance_loss_clip": 1.39719105, "balance_loss_mlp": 1.02293086, "epoch": 0.09144746730798137, "flos": 20093505264000.0, "grad_norm": 2.43809225579792, "language_loss": 0.87891495, "learning_rate": 3.960523615252156e-06, "loss": 0.90616095, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 2.68945312, "router_z_loss_mlp": 0.35546875, "step": 1521, "time_per_iteration": 4.307372808456421 }, { "auxiliary_loss_clip": 0.01669929, "auxiliary_loss_mlp": 0.01059273, "balance_loss_clip": 1.40267062, "balance_loss_mlp": 1.02589464, "epoch": 0.09150759056064933, "flos": 22787000075520.0, "grad_norm": 1.807534738591785, "language_loss": 0.85570478, "learning_rate": 3.960446580030599e-06, "loss": 0.8829968, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.33374023, "step": 1522, "time_per_iteration": 2.9227657318115234 }, { "auxiliary_loss_clip": 0.01639962, "auxiliary_loss_mlp": 0.01057061, "balance_loss_clip": 1.3840183, "balance_loss_mlp": 1.02284765, "epoch": 0.0915677138133173, "flos": 27575887680000.0, "grad_norm": 1.6233727391182309, "language_loss": 0.82577395, "learning_rate": 3.960369470468711e-06, "loss": 0.85274422, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.3425293, "step": 1523, "time_per_iteration": 2.915952682495117 }, { "auxiliary_loss_clip": 0.01667642, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.39966583, "balance_loss_mlp": 1.02129316, "epoch": 0.09162783706598528, "flos": 17683415752320.0, "grad_norm": 2.0379907384696585, "language_loss": 0.76017284, "learning_rate": 3.960292286569418e-06, "loss": 0.78740174, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.33959961, "step": 1524, "time_per_iteration": 2.8934710025787354 }, { "auxiliary_loss_clip": 0.01664751, "auxiliary_loss_mlp": 0.01058879, "balance_loss_clip": 1.40006042, "balance_loss_mlp": 1.02435625, "epoch": 0.09168796031865324, "flos": 18487246452480.0, "grad_norm": 1.8437103088332405, "language_loss": 0.87414771, "learning_rate": 3.960215028335644e-06, "loss": 0.901384, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.3449707, "step": 1525, "time_per_iteration": 2.8521816730499268 }, { "auxiliary_loss_clip": 0.01675324, "auxiliary_loss_mlp": 0.01057867, "balance_loss_clip": 1.40696514, "balance_loss_mlp": 1.02148473, "epoch": 0.0917480835713212, "flos": 29399530348800.0, "grad_norm": 2.5446778005448127, "language_loss": 0.75899804, "learning_rate": 3.96013769577032e-06, "loss": 0.78632998, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.36376953, "step": 1526, "time_per_iteration": 2.963404417037964 }, { "auxiliary_loss_clip": 0.01656664, "auxiliary_loss_mlp": 0.01060591, "balance_loss_clip": 1.39404547, "balance_loss_mlp": 1.02344561, "epoch": 0.09180820682398917, "flos": 19838948163840.0, "grad_norm": 1.7267260946179757, "language_loss": 0.78369761, "learning_rate": 3.960060288876378e-06, "loss": 0.81087017, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.37158203, "step": 1527, "time_per_iteration": 2.906580924987793 }, { "auxiliary_loss_clip": 0.01667804, "auxiliary_loss_mlp": 0.01054095, "balance_loss_clip": 1.40304434, "balance_loss_mlp": 1.01923871, "epoch": 0.09186833007665715, "flos": 23852355575040.0, "grad_norm": 1.9932991185033384, "language_loss": 0.80954593, "learning_rate": 3.959982807656753e-06, "loss": 0.83676493, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.34863281, "step": 1528, "time_per_iteration": 2.8752079010009766 }, { "auxiliary_loss_clip": 0.01684812, "auxiliary_loss_mlp": 0.01052023, "balance_loss_clip": 1.41251504, "balance_loss_mlp": 1.01952672, "epoch": 0.09192845332932512, "flos": 12940433637120.0, "grad_norm": 4.031828383907446, "language_loss": 0.79142755, "learning_rate": 3.959905252114384e-06, "loss": 0.81879586, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 2.72851562, "router_z_loss_mlp": 0.32495117, "step": 1529, "time_per_iteration": 2.8646273612976074 }, { "auxiliary_loss_clip": 0.01686021, "auxiliary_loss_mlp": 0.01061238, "balance_loss_clip": 1.41511226, "balance_loss_mlp": 1.02719188, "epoch": 0.09198857658199308, "flos": 24577993981440.0, "grad_norm": 1.8772750235243316, "language_loss": 0.83580929, "learning_rate": 3.959827622252211e-06, "loss": 0.86328197, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 2.7109375, "router_z_loss_mlp": 0.34033203, "step": 1530, "time_per_iteration": 2.8885927200317383 }, { "auxiliary_loss_clip": 0.01676379, "auxiliary_loss_mlp": 0.01057787, "balance_loss_clip": 1.41199255, "balance_loss_mlp": 1.02335894, "epoch": 0.09204869983466106, "flos": 20276702035200.0, "grad_norm": 1.8271986087380496, "language_loss": 0.85206437, "learning_rate": 3.959749918073179e-06, "loss": 0.87940598, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.34423828, "step": 1531, "time_per_iteration": 2.898883104324341 }, { "auxiliary_loss_clip": 0.01674737, "auxiliary_loss_mlp": 0.01063659, "balance_loss_clip": 1.40775132, "balance_loss_mlp": 1.02963638, "epoch": 0.09210882308732903, "flos": 20895164213760.0, "grad_norm": 2.532466387693168, "language_loss": 0.82364744, "learning_rate": 3.959672139580233e-06, "loss": 0.85103136, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.34008789, "step": 1532, "time_per_iteration": 2.896074056625366 }, { "auxiliary_loss_clip": 0.0167546, "auxiliary_loss_mlp": 0.01058557, "balance_loss_clip": 1.40920353, "balance_loss_mlp": 1.02300894, "epoch": 0.09216894633999699, "flos": 30968615917440.0, "grad_norm": 1.806584452524252, "language_loss": 0.84684324, "learning_rate": 3.9595942867763235e-06, "loss": 0.87418342, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.35571289, "step": 1533, "time_per_iteration": 2.9397881031036377 }, { "auxiliary_loss_clip": 0.01676852, "auxiliary_loss_mlp": 0.0106575, "balance_loss_clip": 1.41056919, "balance_loss_mlp": 1.03396904, "epoch": 0.09222906959266497, "flos": 13159310572800.0, "grad_norm": 3.123647290685528, "language_loss": 0.91731352, "learning_rate": 3.959516359664402e-06, "loss": 0.94473958, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.31762695, "step": 1534, "time_per_iteration": 2.9093289375305176 }, { "auxiliary_loss_clip": 0.01689488, "auxiliary_loss_mlp": 0.01075286, "balance_loss_clip": 1.42022121, "balance_loss_mlp": 1.041574, "epoch": 0.09228919284533293, "flos": 26005535256960.0, "grad_norm": 2.1953025632176155, "language_loss": 0.78038126, "learning_rate": 3.959438358247424e-06, "loss": 0.808029, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 2.69335938, "router_z_loss_mlp": 0.33691406, "step": 1535, "time_per_iteration": 2.9374282360076904 }, { "auxiliary_loss_clip": 0.01664848, "auxiliary_loss_mlp": 0.01077637, "balance_loss_clip": 1.40434861, "balance_loss_mlp": 1.0433526, "epoch": 0.0923493160980009, "flos": 18669945530880.0, "grad_norm": 2.442655396544495, "language_loss": 0.83426291, "learning_rate": 3.959360282528346e-06, "loss": 0.86168778, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.3425293, "step": 1536, "time_per_iteration": 2.895871639251709 }, { "auxiliary_loss_clip": 0.01675834, "auxiliary_loss_mlp": 0.01070177, "balance_loss_clip": 1.41351306, "balance_loss_mlp": 1.03887272, "epoch": 0.09240943935066886, "flos": 21150038027520.0, "grad_norm": 1.8130106426531971, "language_loss": 0.90937197, "learning_rate": 3.959282132510131e-06, "loss": 0.93683207, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.31298828, "step": 1537, "time_per_iteration": 2.8687620162963867 }, { "auxiliary_loss_clip": 0.01691744, "auxiliary_loss_mlp": 0.01079056, "balance_loss_clip": 1.4236424, "balance_loss_mlp": 1.04467583, "epoch": 0.09246956260333684, "flos": 20601533589120.0, "grad_norm": 2.3139860813727875, "language_loss": 0.83089095, "learning_rate": 3.959203908195741e-06, "loss": 0.85859901, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 2.68359375, "router_z_loss_mlp": 0.34375, "step": 1538, "time_per_iteration": 2.8745205402374268 }, { "auxiliary_loss_clip": 0.01485637, "auxiliary_loss_mlp": 0.01053187, "balance_loss_clip": 1.32329369, "balance_loss_mlp": 1.03125274, "epoch": 0.09252968585600481, "flos": 67591827043200.0, "grad_norm": 0.8730405108189423, "language_loss": 0.57457119, "learning_rate": 3.959125609588142e-06, "loss": 0.59995943, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.21972656, "step": 1539, "time_per_iteration": 3.492041826248169 }, { "auxiliary_loss_clip": 0.01689132, "auxiliary_loss_mlp": 0.01070051, "balance_loss_clip": 1.419572, "balance_loss_mlp": 1.03555179, "epoch": 0.09258980910867277, "flos": 17392680794880.0, "grad_norm": 2.4811633319148427, "language_loss": 0.69509053, "learning_rate": 3.959047236690304e-06, "loss": 0.72268236, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 2.69726562, "router_z_loss_mlp": 0.34521484, "step": 1540, "time_per_iteration": 2.8701462745666504 }, { "auxiliary_loss_clip": 0.01673523, "auxiliary_loss_mlp": 0.01064862, "balance_loss_clip": 1.41102493, "balance_loss_mlp": 1.03215146, "epoch": 0.09264993236134075, "flos": 19875668958720.0, "grad_norm": 1.6297506297394941, "language_loss": 0.84615725, "learning_rate": 3.958968789505198e-06, "loss": 0.87354112, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.32739258, "step": 1541, "time_per_iteration": 2.947115659713745 }, { "auxiliary_loss_clip": 0.01475852, "auxiliary_loss_mlp": 0.01046972, "balance_loss_clip": 1.31257975, "balance_loss_mlp": 1.02284372, "epoch": 0.09271005561400872, "flos": 62310864545280.0, "grad_norm": 0.9311058150022005, "language_loss": 0.61956948, "learning_rate": 3.9588902680358e-06, "loss": 0.64479774, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.24121094, "step": 1542, "time_per_iteration": 3.4006619453430176 }, { "auxiliary_loss_clip": 0.01691138, "auxiliary_loss_mlp": 0.01073376, "balance_loss_clip": 1.42473745, "balance_loss_mlp": 1.03725576, "epoch": 0.09277017886667668, "flos": 23339938504320.0, "grad_norm": 1.9644571836147888, "language_loss": 0.8400231, "learning_rate": 3.958811672285086e-06, "loss": 0.86766827, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.36108398, "step": 1543, "time_per_iteration": 2.964691162109375 }, { "auxiliary_loss_clip": 0.01663198, "auxiliary_loss_mlp": 0.01069577, "balance_loss_clip": 1.40055931, "balance_loss_mlp": 1.03681827, "epoch": 0.09283030211934466, "flos": 54763664313600.0, "grad_norm": 1.8938736090684354, "language_loss": 0.74119055, "learning_rate": 3.958733002256038e-06, "loss": 0.76851833, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.32788086, "step": 1544, "time_per_iteration": 3.303894519805908 }, { "auxiliary_loss_clip": 0.01709023, "auxiliary_loss_mlp": 0.01080336, "balance_loss_clip": 1.43616796, "balance_loss_mlp": 1.04731512, "epoch": 0.09289042537201263, "flos": 30346579399680.0, "grad_norm": 1.697534650509015, "language_loss": 0.78734583, "learning_rate": 3.958654257951637e-06, "loss": 0.81523943, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.33007812, "step": 1545, "time_per_iteration": 3.1133921146392822 }, { "auxiliary_loss_clip": 0.01685278, "auxiliary_loss_mlp": 0.01061896, "balance_loss_clip": 1.41969967, "balance_loss_mlp": 1.02911353, "epoch": 0.09295054862468059, "flos": 17755273774080.0, "grad_norm": 2.6820716914623084, "language_loss": 0.76525199, "learning_rate": 3.9585754393748706e-06, "loss": 0.79272377, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.328125, "step": 1546, "time_per_iteration": 2.9021050930023193 }, { "auxiliary_loss_clip": 0.01697143, "auxiliary_loss_mlp": 0.01068672, "balance_loss_clip": 1.42737544, "balance_loss_mlp": 1.03581762, "epoch": 0.09301067187734856, "flos": 23668118173440.0, "grad_norm": 1.8074082677916805, "language_loss": 0.85311288, "learning_rate": 3.9584965465287275e-06, "loss": 0.88077104, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.32861328, "step": 1547, "time_per_iteration": 4.337569952011108 }, { "auxiliary_loss_clip": 0.01688189, "auxiliary_loss_mlp": 0.01077346, "balance_loss_clip": 1.41800451, "balance_loss_mlp": 1.04461133, "epoch": 0.09307079513001654, "flos": 27539393109120.0, "grad_norm": 2.0606188619544596, "language_loss": 0.6944418, "learning_rate": 3.958417579416199e-06, "loss": 0.72209716, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.32714844, "step": 1548, "time_per_iteration": 3.04360294342041 }, { "auxiliary_loss_clip": 0.01701575, "auxiliary_loss_mlp": 0.01069372, "balance_loss_clip": 1.42902946, "balance_loss_mlp": 1.03482497, "epoch": 0.0931309183826845, "flos": 20636037388800.0, "grad_norm": 1.9982047661161284, "language_loss": 0.85074699, "learning_rate": 3.9583385380402795e-06, "loss": 0.87845647, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 2.7265625, "router_z_loss_mlp": 0.34545898, "step": 1549, "time_per_iteration": 2.8929972648620605 }, { "auxiliary_loss_clip": 0.01691639, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.42602694, "balance_loss_mlp": 1.02886355, "epoch": 0.09319104163535247, "flos": 29032277155200.0, "grad_norm": 1.6455810917990779, "language_loss": 0.77012366, "learning_rate": 3.958259422403966e-06, "loss": 0.79765964, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.33105469, "step": 1550, "time_per_iteration": 2.9863178730010986 }, { "auxiliary_loss_clip": 0.01698337, "auxiliary_loss_mlp": 0.01061705, "balance_loss_clip": 1.42893028, "balance_loss_mlp": 1.02801621, "epoch": 0.09325116488802045, "flos": 25312500368640.0, "grad_norm": 2.071664797882892, "language_loss": 0.84583366, "learning_rate": 3.95818023251026e-06, "loss": 0.87343413, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 2.6953125, "router_z_loss_mlp": 0.33666992, "step": 1551, "time_per_iteration": 2.893467664718628 }, { "auxiliary_loss_clip": 0.01472754, "auxiliary_loss_mlp": 0.01020382, "balance_loss_clip": 1.31332803, "balance_loss_mlp": 1.00216663, "epoch": 0.09331128814068841, "flos": 61567146201600.0, "grad_norm": 0.7624139530838739, "language_loss": 0.61921847, "learning_rate": 3.958100968362163e-06, "loss": 0.64414984, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18261719, "step": 1552, "time_per_iteration": 4.899768829345703 }, { "auxiliary_loss_clip": 0.01462663, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.30172515, "balance_loss_mlp": 1.0039773, "epoch": 0.09337141139335638, "flos": 53323354725120.0, "grad_norm": 0.8340954966249549, "language_loss": 0.59015512, "learning_rate": 3.958021629962681e-06, "loss": 0.61502558, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.20410156, "step": 1553, "time_per_iteration": 3.505605459213257 }, { "auxiliary_loss_clip": 0.0169823, "auxiliary_loss_mlp": 0.0106175, "balance_loss_clip": 1.42470193, "balance_loss_mlp": 1.02667832, "epoch": 0.09343153464602436, "flos": 23487228887040.0, "grad_norm": 1.857410680123229, "language_loss": 0.88717604, "learning_rate": 3.957942217314823e-06, "loss": 0.91477579, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 2.73632812, "router_z_loss_mlp": 0.35083008, "step": 1554, "time_per_iteration": 2.9784204959869385 }, { "auxiliary_loss_clip": 0.01668264, "auxiliary_loss_mlp": 0.01060271, "balance_loss_clip": 1.40916312, "balance_loss_mlp": 1.02694035, "epoch": 0.09349165789869232, "flos": 19362663705600.0, "grad_norm": 3.2075799729611316, "language_loss": 0.82897961, "learning_rate": 3.957862730421599e-06, "loss": 0.85626507, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.33325195, "step": 1555, "time_per_iteration": 4.3635475635528564 }, { "auxiliary_loss_clip": 0.01465751, "auxiliary_loss_mlp": 0.01027111, "balance_loss_clip": 1.30729675, "balance_loss_mlp": 1.00908649, "epoch": 0.09355178115136029, "flos": 67530583566720.0, "grad_norm": 0.9039178008312055, "language_loss": 0.59732431, "learning_rate": 3.957783169286024e-06, "loss": 0.62225294, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18066406, "step": 1556, "time_per_iteration": 4.861315488815308 }, { "auxiliary_loss_clip": 0.01681581, "auxiliary_loss_mlp": 0.01065203, "balance_loss_clip": 1.4190774, "balance_loss_mlp": 1.03284991, "epoch": 0.09361190440402825, "flos": 37355663514240.0, "grad_norm": 1.610525982407241, "language_loss": 0.85251623, "learning_rate": 3.9577035339111155e-06, "loss": 0.87998402, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.32348633, "step": 1557, "time_per_iteration": 3.1293222904205322 }, { "auxiliary_loss_clip": 0.01670151, "auxiliary_loss_mlp": 0.0106736, "balance_loss_clip": 1.40632057, "balance_loss_mlp": 1.03307509, "epoch": 0.09367202765669623, "flos": 24910426661760.0, "grad_norm": 1.5471847831115995, "language_loss": 0.79123926, "learning_rate": 3.957623824299893e-06, "loss": 0.81861436, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.34277344, "step": 1558, "time_per_iteration": 2.983290910720825 }, { "auxiliary_loss_clip": 0.01696423, "auxiliary_loss_mlp": 0.01072729, "balance_loss_clip": 1.42462504, "balance_loss_mlp": 1.03832579, "epoch": 0.0937321509093642, "flos": 15713975779200.0, "grad_norm": 1.8205707476344841, "language_loss": 0.81291062, "learning_rate": 3.957544040455379e-06, "loss": 0.84060216, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 2.71875, "router_z_loss_mlp": 0.34423828, "step": 1559, "time_per_iteration": 2.8640787601470947 }, { "auxiliary_loss_clip": 0.01679526, "auxiliary_loss_mlp": 0.01065656, "balance_loss_clip": 1.4155767, "balance_loss_mlp": 1.03339767, "epoch": 0.09379227416203216, "flos": 20492954772480.0, "grad_norm": 2.178271087878595, "language_loss": 0.77273452, "learning_rate": 3.957464182380599e-06, "loss": 0.80018634, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.32250977, "step": 1560, "time_per_iteration": 2.8942768573760986 }, { "auxiliary_loss_clip": 0.01701621, "auxiliary_loss_mlp": 0.01076726, "balance_loss_clip": 1.43133783, "balance_loss_mlp": 1.04270339, "epoch": 0.09385239741470014, "flos": 24363098588160.0, "grad_norm": 1.6092176782861025, "language_loss": 0.81653351, "learning_rate": 3.95738425007858e-06, "loss": 0.84431696, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 2.70507812, "router_z_loss_mlp": 0.34033203, "step": 1561, "time_per_iteration": 2.9103035926818848 }, { "auxiliary_loss_clip": 0.01680573, "auxiliary_loss_mlp": 0.01066424, "balance_loss_clip": 1.41299379, "balance_loss_mlp": 1.03440452, "epoch": 0.0939125206673681, "flos": 33304404188160.0, "grad_norm": 1.9364840289644394, "language_loss": 0.63734114, "learning_rate": 3.957304243552354e-06, "loss": 0.66481113, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.3203125, "step": 1562, "time_per_iteration": 2.968560218811035 }, { "auxiliary_loss_clip": 0.01658543, "auxiliary_loss_mlp": 0.01071725, "balance_loss_clip": 1.40328753, "balance_loss_mlp": 1.0400629, "epoch": 0.09397264392003607, "flos": 19254311112960.0, "grad_norm": 1.953984097138558, "language_loss": 0.86656106, "learning_rate": 3.957224162804956e-06, "loss": 0.89386374, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31616211, "step": 1563, "time_per_iteration": 2.9199600219726562 }, { "auxiliary_loss_clip": 0.01661315, "auxiliary_loss_mlp": 0.01067212, "balance_loss_clip": 1.40234923, "balance_loss_mlp": 1.03545451, "epoch": 0.09403276717270405, "flos": 19327345499520.0, "grad_norm": 1.9685165478772086, "language_loss": 0.77886653, "learning_rate": 3.9571440078394205e-06, "loss": 0.80615175, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 2.59375, "router_z_loss_mlp": 0.31738281, "step": 1564, "time_per_iteration": 2.9036755561828613 }, { "auxiliary_loss_clip": 0.01661603, "auxiliary_loss_mlp": 0.01068242, "balance_loss_clip": 1.40166974, "balance_loss_mlp": 1.03565001, "epoch": 0.09409289042537201, "flos": 23593138260480.0, "grad_norm": 1.859278118064319, "language_loss": 0.81393957, "learning_rate": 3.9570637786587895e-06, "loss": 0.84123808, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.32592773, "step": 1565, "time_per_iteration": 2.9754257202148438 }, { "auxiliary_loss_clip": 0.01665799, "auxiliary_loss_mlp": 0.01067583, "balance_loss_clip": 1.40236998, "balance_loss_mlp": 1.03451395, "epoch": 0.09415301367803998, "flos": 20086582809600.0, "grad_norm": 1.780377154388199, "language_loss": 0.77078539, "learning_rate": 3.956983475266103e-06, "loss": 0.79811919, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.33081055, "step": 1566, "time_per_iteration": 2.908154010772705 }, { "auxiliary_loss_clip": 0.01671211, "auxiliary_loss_mlp": 0.01062458, "balance_loss_clip": 1.40950513, "balance_loss_mlp": 1.03174937, "epoch": 0.09421313693070796, "flos": 21070081186560.0, "grad_norm": 1.7857000530051752, "language_loss": 0.79621357, "learning_rate": 3.956903097664407e-06, "loss": 0.82355034, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 2.61914062, "router_z_loss_mlp": 0.30688477, "step": 1567, "time_per_iteration": 2.895137310028076 }, { "auxiliary_loss_clip": 0.01669168, "auxiliary_loss_mlp": 0.01057116, "balance_loss_clip": 1.40814734, "balance_loss_mlp": 1.02519202, "epoch": 0.09427326018337592, "flos": 24326784996480.0, "grad_norm": 1.7393687082286562, "language_loss": 0.83758318, "learning_rate": 3.956822645856749e-06, "loss": 0.86484605, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.3190918, "step": 1568, "time_per_iteration": 2.899127721786499 }, { "auxiliary_loss_clip": 0.01674193, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.40871382, "balance_loss_mlp": 1.02014935, "epoch": 0.09433338343604389, "flos": 20272856227200.0, "grad_norm": 1.835052947558203, "language_loss": 0.78092921, "learning_rate": 3.9567421198461814e-06, "loss": 0.80819142, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.31860352, "step": 1569, "time_per_iteration": 2.9549431800842285 }, { "auxiliary_loss_clip": 0.01642971, "auxiliary_loss_mlp": 0.01050353, "balance_loss_clip": 1.3887912, "balance_loss_mlp": 1.01938248, "epoch": 0.09439350668871185, "flos": 12748730843520.0, "grad_norm": 24.49398075370472, "language_loss": 0.86881441, "learning_rate": 3.956661519635756e-06, "loss": 0.89574766, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.30932617, "step": 1570, "time_per_iteration": 2.929267168045044 }, { "auxiliary_loss_clip": 0.01653267, "auxiliary_loss_mlp": 0.01050483, "balance_loss_clip": 1.39278412, "balance_loss_mlp": 1.01779592, "epoch": 0.09445362994137983, "flos": 25974424817280.0, "grad_norm": 1.8294101879097515, "language_loss": 0.77577591, "learning_rate": 3.95658084522853e-06, "loss": 0.80281341, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.3269043, "step": 1571, "time_per_iteration": 2.93245530128479 }, { "auxiliary_loss_clip": 0.01631565, "auxiliary_loss_mlp": 0.01052312, "balance_loss_clip": 1.38330686, "balance_loss_mlp": 1.02022123, "epoch": 0.0945137531940478, "flos": 19723854096000.0, "grad_norm": 2.0658382312666177, "language_loss": 0.80485189, "learning_rate": 3.956500096627561e-06, "loss": 0.83169067, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.32055664, "step": 1572, "time_per_iteration": 2.929464340209961 }, { "auxiliary_loss_clip": 0.016437, "auxiliary_loss_mlp": 0.01058175, "balance_loss_clip": 1.38848889, "balance_loss_mlp": 1.02510595, "epoch": 0.09457387644671576, "flos": 23625968002560.0, "grad_norm": 1.8566483459501812, "language_loss": 0.88564354, "learning_rate": 3.956419273835913e-06, "loss": 0.91266227, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.33056641, "step": 1573, "time_per_iteration": 2.9060301780700684 }, { "auxiliary_loss_clip": 0.01673609, "auxiliary_loss_mlp": 0.01055963, "balance_loss_clip": 1.41033125, "balance_loss_mlp": 1.02239418, "epoch": 0.09463399969938374, "flos": 26918759180160.0, "grad_norm": 1.8515859176987535, "language_loss": 0.83219522, "learning_rate": 3.95633837685665e-06, "loss": 0.85949099, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.33544922, "step": 1574, "time_per_iteration": 2.965719699859619 }, { "auxiliary_loss_clip": 0.01648739, "auxiliary_loss_mlp": 0.01051738, "balance_loss_clip": 1.39284444, "balance_loss_mlp": 1.02098179, "epoch": 0.0946941229520517, "flos": 23670063699840.0, "grad_norm": 1.6742444890038077, "language_loss": 0.82612801, "learning_rate": 3.95625740569284e-06, "loss": 0.85313272, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.30761719, "step": 1575, "time_per_iteration": 2.936870813369751 }, { "auxiliary_loss_clip": 0.01640362, "auxiliary_loss_mlp": 0.01059645, "balance_loss_clip": 1.38622344, "balance_loss_mlp": 1.02731538, "epoch": 0.09475424620471967, "flos": 24144719345280.0, "grad_norm": 1.9125987984562414, "language_loss": 0.88512975, "learning_rate": 3.956176360347553e-06, "loss": 0.91212988, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.32324219, "step": 1576, "time_per_iteration": 2.870584726333618 }, { "auxiliary_loss_clip": 0.01451173, "auxiliary_loss_mlp": 0.01080725, "balance_loss_clip": 1.29460096, "balance_loss_mlp": 1.05611992, "epoch": 0.09481436945738765, "flos": 68457380929920.0, "grad_norm": 0.9971887942456493, "language_loss": 0.6594578, "learning_rate": 3.956095240823862e-06, "loss": 0.68477678, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.24609375, "step": 1577, "time_per_iteration": 3.36885929107666 }, { "auxiliary_loss_clip": 0.01647953, "auxiliary_loss_mlp": 0.01052684, "balance_loss_clip": 1.39241052, "balance_loss_mlp": 1.02183282, "epoch": 0.09487449271005562, "flos": 16662698887680.0, "grad_norm": 2.298275101277943, "language_loss": 0.8170172, "learning_rate": 3.956014047124844e-06, "loss": 0.84402359, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.30834961, "step": 1578, "time_per_iteration": 2.862560749053955 }, { "auxiliary_loss_clip": 0.01659374, "auxiliary_loss_mlp": 0.01065341, "balance_loss_clip": 1.40223265, "balance_loss_mlp": 1.03367901, "epoch": 0.09493461596272358, "flos": 24285268252800.0, "grad_norm": 1.8945186008709878, "language_loss": 0.78810775, "learning_rate": 3.955932779253578e-06, "loss": 0.81535488, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.31665039, "step": 1579, "time_per_iteration": 2.9497861862182617 }, { "auxiliary_loss_clip": 0.01654801, "auxiliary_loss_mlp": 0.01065365, "balance_loss_clip": 1.39822519, "balance_loss_mlp": 1.03253508, "epoch": 0.09499473921539155, "flos": 21879793710720.0, "grad_norm": 1.9438024873245814, "language_loss": 0.75129163, "learning_rate": 3.955851437213144e-06, "loss": 0.77849334, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.328125, "step": 1580, "time_per_iteration": 2.956225872039795 }, { "auxiliary_loss_clip": 0.01641227, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.38873696, "balance_loss_mlp": 1.03837955, "epoch": 0.09505486246805953, "flos": 33560182897920.0, "grad_norm": 1.77094123245073, "language_loss": 0.78561193, "learning_rate": 3.955770021006627e-06, "loss": 0.81271052, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.30273438, "step": 1581, "time_per_iteration": 2.9628002643585205 }, { "auxiliary_loss_clip": 0.01657623, "auxiliary_loss_mlp": 0.01067777, "balance_loss_clip": 1.40200663, "balance_loss_mlp": 1.03785503, "epoch": 0.09511498572072749, "flos": 21225289409280.0, "grad_norm": 1.9941683221572173, "language_loss": 0.89070201, "learning_rate": 3.955688530637116e-06, "loss": 0.91795605, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.29956055, "step": 1582, "time_per_iteration": 4.397181987762451 }, { "auxiliary_loss_clip": 0.01667634, "auxiliary_loss_mlp": 0.01076367, "balance_loss_clip": 1.40494275, "balance_loss_mlp": 1.04351306, "epoch": 0.09517510897339546, "flos": 14619455366400.0, "grad_norm": 2.0892438557132276, "language_loss": 0.68198711, "learning_rate": 3.955606966107699e-06, "loss": 0.70942712, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.32836914, "step": 1583, "time_per_iteration": 2.854550838470459 }, { "auxiliary_loss_clip": 0.01675859, "auxiliary_loss_mlp": 0.01076224, "balance_loss_clip": 1.41326928, "balance_loss_mlp": 1.04379928, "epoch": 0.09523523222606343, "flos": 27828499253760.0, "grad_norm": 1.7913228974781277, "language_loss": 0.721699, "learning_rate": 3.95552532742147e-06, "loss": 0.74921989, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.32446289, "step": 1584, "time_per_iteration": 2.9217662811279297 }, { "auxiliary_loss_clip": 0.01669726, "auxiliary_loss_mlp": 0.01077647, "balance_loss_clip": 1.41111684, "balance_loss_mlp": 1.04672396, "epoch": 0.0952953554787314, "flos": 20716537167360.0, "grad_norm": 1.8652426024296636, "language_loss": 0.8206653, "learning_rate": 3.955443614581525e-06, "loss": 0.84813905, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.30908203, "step": 1585, "time_per_iteration": 2.917325735092163 }, { "auxiliary_loss_clip": 0.01678077, "auxiliary_loss_mlp": 0.01086265, "balance_loss_clip": 1.40857971, "balance_loss_mlp": 1.05341101, "epoch": 0.09535547873139937, "flos": 24797640078720.0, "grad_norm": 1.590124253390054, "language_loss": 0.73552573, "learning_rate": 3.955361827590961e-06, "loss": 0.76316917, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 2.70117188, "router_z_loss_mlp": 0.32836914, "step": 1586, "time_per_iteration": 3.0434961318969727 }, { "auxiliary_loss_clip": 0.01446017, "auxiliary_loss_mlp": 0.01074623, "balance_loss_clip": 1.28522432, "balance_loss_mlp": 1.04887414, "epoch": 0.09541560198406734, "flos": 71940001881600.0, "grad_norm": 0.8539689430566186, "language_loss": 0.55537951, "learning_rate": 3.955279966452883e-06, "loss": 0.5805859, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.2578125, "step": 1587, "time_per_iteration": 4.681636810302734 }, { "auxiliary_loss_clip": 0.01662544, "auxiliary_loss_mlp": 0.01073677, "balance_loss_clip": 1.39844894, "balance_loss_mlp": 1.04184794, "epoch": 0.09547572523673531, "flos": 28993475099520.0, "grad_norm": 1.8439018553298865, "language_loss": 0.82488841, "learning_rate": 3.955198031170391e-06, "loss": 0.85225058, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.31835938, "step": 1588, "time_per_iteration": 2.9555037021636963 }, { "auxiliary_loss_clip": 0.0166501, "auxiliary_loss_mlp": 0.01066182, "balance_loss_clip": 1.40398562, "balance_loss_mlp": 1.03604615, "epoch": 0.09553584848940327, "flos": 24144538366080.0, "grad_norm": 1.4001707759982354, "language_loss": 0.82546937, "learning_rate": 3.955116021746594e-06, "loss": 0.8527813, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.30151367, "step": 1589, "time_per_iteration": 2.9987950325012207 }, { "auxiliary_loss_clip": 0.01651515, "auxiliary_loss_mlp": 0.01067807, "balance_loss_clip": 1.39294243, "balance_loss_mlp": 1.03330779, "epoch": 0.09559597174207124, "flos": 42866931899520.0, "grad_norm": 1.5258445915739465, "language_loss": 0.66045588, "learning_rate": 3.955033938184601e-06, "loss": 0.68764913, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.3449707, "step": 1590, "time_per_iteration": 4.590895414352417 }, { "auxiliary_loss_clip": 0.01651451, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.39274037, "balance_loss_mlp": 1.03112757, "epoch": 0.09565609499473922, "flos": 32683498790400.0, "grad_norm": 1.752281569722935, "language_loss": 0.83885121, "learning_rate": 3.954951780487526e-06, "loss": 0.86600858, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.33178711, "step": 1591, "time_per_iteration": 4.405884742736816 }, { "auxiliary_loss_clip": 0.01679153, "auxiliary_loss_mlp": 0.01055766, "balance_loss_clip": 1.41313875, "balance_loss_mlp": 1.02467656, "epoch": 0.09571621824740718, "flos": 18487517921280.0, "grad_norm": 3.2507097693941525, "language_loss": 0.76540911, "learning_rate": 3.9548695486584835e-06, "loss": 0.79275835, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.31103516, "step": 1592, "time_per_iteration": 2.9665539264678955 }, { "auxiliary_loss_clip": 0.01644039, "auxiliary_loss_mlp": 0.01055466, "balance_loss_clip": 1.38541389, "balance_loss_mlp": 1.01958418, "epoch": 0.09577634150007515, "flos": 29399394614400.0, "grad_norm": 2.2332493747524635, "language_loss": 0.75289357, "learning_rate": 3.954787242700592e-06, "loss": 0.77988863, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.35913086, "step": 1593, "time_per_iteration": 3.0665364265441895 }, { "auxiliary_loss_clip": 0.01644639, "auxiliary_loss_mlp": 0.01058236, "balance_loss_clip": 1.38794374, "balance_loss_mlp": 1.02402329, "epoch": 0.09583646475274313, "flos": 22758197120640.0, "grad_norm": 1.9911825847226712, "language_loss": 0.70502043, "learning_rate": 3.954704862616971e-06, "loss": 0.73204923, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.34228516, "step": 1594, "time_per_iteration": 2.925469398498535 }, { "auxiliary_loss_clip": 0.0165673, "auxiliary_loss_mlp": 0.01053609, "balance_loss_clip": 1.39399767, "balance_loss_mlp": 1.01965797, "epoch": 0.0958965880054111, "flos": 23227875838080.0, "grad_norm": 2.2906333336460225, "language_loss": 0.83821642, "learning_rate": 3.954622408410747e-06, "loss": 0.86531979, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.33959961, "step": 1595, "time_per_iteration": 2.9478964805603027 }, { "auxiliary_loss_clip": 0.01641882, "auxiliary_loss_mlp": 0.01054621, "balance_loss_clip": 1.38106644, "balance_loss_mlp": 1.02095592, "epoch": 0.09595671125807906, "flos": 21334320673920.0, "grad_norm": 1.9264892444302049, "language_loss": 0.86206651, "learning_rate": 3.954539880085045e-06, "loss": 0.88903147, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.33666992, "step": 1596, "time_per_iteration": 2.871415615081787 }, { "auxiliary_loss_clip": 0.01662945, "auxiliary_loss_mlp": 0.01057168, "balance_loss_clip": 1.4018234, "balance_loss_mlp": 1.02283525, "epoch": 0.09601683451074704, "flos": 39618010195200.0, "grad_norm": 1.729284294176226, "language_loss": 0.70215195, "learning_rate": 3.9544572776429945e-06, "loss": 0.72935307, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.34301758, "step": 1597, "time_per_iteration": 3.082613945007324 }, { "auxiliary_loss_clip": 0.01657317, "auxiliary_loss_mlp": 0.01059487, "balance_loss_clip": 1.39205611, "balance_loss_mlp": 1.0250361, "epoch": 0.096076957763415, "flos": 23743279065600.0, "grad_norm": 2.0382577318238027, "language_loss": 0.76589233, "learning_rate": 3.954374601087729e-06, "loss": 0.79306042, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 0.34423828, "step": 1598, "time_per_iteration": 2.958341360092163 }, { "auxiliary_loss_clip": 0.01670949, "auxiliary_loss_mlp": 0.01054762, "balance_loss_clip": 1.40473258, "balance_loss_mlp": 1.01954758, "epoch": 0.09613708101608297, "flos": 34691424105600.0, "grad_norm": 1.8705984064355141, "language_loss": 0.70483571, "learning_rate": 3.954291850422382e-06, "loss": 0.7320928, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 2.66015625, "router_z_loss_mlp": 0.35229492, "step": 1599, "time_per_iteration": 3.058208703994751 }, { "auxiliary_loss_clip": 0.01650815, "auxiliary_loss_mlp": 0.01052439, "balance_loss_clip": 1.39288926, "balance_loss_mlp": 1.01736784, "epoch": 0.09619720426875093, "flos": 20749593133440.0, "grad_norm": 1.9397325138855626, "language_loss": 0.85779208, "learning_rate": 3.954209025650093e-06, "loss": 0.88482463, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.35107422, "step": 1600, "time_per_iteration": 2.936253070831299 }, { "auxiliary_loss_clip": 0.01660547, "auxiliary_loss_mlp": 0.01056365, "balance_loss_clip": 1.39506805, "balance_loss_mlp": 1.020769, "epoch": 0.09625732752141891, "flos": 13050596021760.0, "grad_norm": 2.1241624338002065, "language_loss": 0.82352507, "learning_rate": 3.954126126774001e-06, "loss": 0.85069418, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.35571289, "step": 1601, "time_per_iteration": 2.853712797164917 }, { "auxiliary_loss_clip": 0.01658122, "auxiliary_loss_mlp": 0.01056592, "balance_loss_clip": 1.39030623, "balance_loss_mlp": 1.022331, "epoch": 0.09631745077408688, "flos": 22283903433600.0, "grad_norm": 2.266216223412545, "language_loss": 0.83876359, "learning_rate": 3.954043153797251e-06, "loss": 0.86591077, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.34277344, "step": 1602, "time_per_iteration": 2.93969464302063 }, { "auxiliary_loss_clip": 0.01638309, "auxiliary_loss_mlp": 0.01053401, "balance_loss_clip": 1.38091302, "balance_loss_mlp": 1.01828206, "epoch": 0.09637757402675484, "flos": 24765081805440.0, "grad_norm": 2.1491817393407646, "language_loss": 0.64454544, "learning_rate": 3.953960106722989e-06, "loss": 0.67146254, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.35131836, "step": 1603, "time_per_iteration": 2.9422457218170166 }, { "auxiliary_loss_clip": 0.01664992, "auxiliary_loss_mlp": 0.01052994, "balance_loss_clip": 1.39955497, "balance_loss_mlp": 1.01758862, "epoch": 0.09643769727942282, "flos": 22535248152960.0, "grad_norm": 2.5334867137647166, "language_loss": 0.72566396, "learning_rate": 3.953876985554364e-06, "loss": 0.7528438, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.35400391, "step": 1604, "time_per_iteration": 3.066404104232788 }, { "auxiliary_loss_clip": 0.01641104, "auxiliary_loss_mlp": 0.01053797, "balance_loss_clip": 1.38544428, "balance_loss_mlp": 1.02008462, "epoch": 0.09649782053209079, "flos": 30933614424960.0, "grad_norm": 2.11611164836917, "language_loss": 0.80490959, "learning_rate": 3.953793790294527e-06, "loss": 0.83185863, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.33666992, "step": 1605, "time_per_iteration": 3.0610921382904053 }, { "auxiliary_loss_clip": 0.0165535, "auxiliary_loss_mlp": 0.0104942, "balance_loss_clip": 1.38818598, "balance_loss_mlp": 1.01625609, "epoch": 0.09655794378475875, "flos": 25348497246720.0, "grad_norm": 1.8970586376790766, "language_loss": 0.76332021, "learning_rate": 3.953710520946634e-06, "loss": 0.79036784, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 2.66992188, "router_z_loss_mlp": 0.33154297, "step": 1606, "time_per_iteration": 2.9301247596740723 }, { "auxiliary_loss_clip": 0.0164322, "auxiliary_loss_mlp": 0.01052336, "balance_loss_clip": 1.38363028, "balance_loss_mlp": 1.01931465, "epoch": 0.09661806703742673, "flos": 22356213903360.0, "grad_norm": 1.8953103984078128, "language_loss": 0.76853716, "learning_rate": 3.953627177513843e-06, "loss": 0.79549265, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.33032227, "step": 1607, "time_per_iteration": 2.916126012802124 }, { "auxiliary_loss_clip": 0.01650555, "auxiliary_loss_mlp": 0.0104872, "balance_loss_clip": 1.38808537, "balance_loss_mlp": 1.0155561, "epoch": 0.0966781902900947, "flos": 17466620077440.0, "grad_norm": 2.167468443843805, "language_loss": 0.87912583, "learning_rate": 3.953543759999312e-06, "loss": 0.90611857, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 2.625, "router_z_loss_mlp": 0.33178711, "step": 1608, "time_per_iteration": 2.831538200378418 }, { "auxiliary_loss_clip": 0.01675008, "auxiliary_loss_mlp": 0.01058749, "balance_loss_clip": 1.40776515, "balance_loss_mlp": 1.02320087, "epoch": 0.09673831354276266, "flos": 36917954887680.0, "grad_norm": 2.077952711627616, "language_loss": 0.72956705, "learning_rate": 3.953460268406207e-06, "loss": 0.7569046, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.35546875, "step": 1609, "time_per_iteration": 2.9994795322418213 }, { "auxiliary_loss_clip": 0.01639346, "auxiliary_loss_mlp": 0.01052875, "balance_loss_clip": 1.37953639, "balance_loss_mlp": 1.01990163, "epoch": 0.09679843679543064, "flos": 20710383874560.0, "grad_norm": 2.0216202889848915, "language_loss": 0.86594737, "learning_rate": 3.953376702737693e-06, "loss": 0.89286953, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.32983398, "step": 1610, "time_per_iteration": 2.8739261627197266 }, { "auxiliary_loss_clip": 0.01642223, "auxiliary_loss_mlp": 0.01052911, "balance_loss_clip": 1.38707066, "balance_loss_mlp": 1.02077258, "epoch": 0.0968585600480986, "flos": 23525080801920.0, "grad_norm": 2.469735117942462, "language_loss": 0.68302494, "learning_rate": 3.953293062996939e-06, "loss": 0.70997632, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.32128906, "step": 1611, "time_per_iteration": 2.854700803756714 }, { "auxiliary_loss_clip": 0.01644342, "auxiliary_loss_mlp": 0.01059204, "balance_loss_clip": 1.38468432, "balance_loss_mlp": 1.02475262, "epoch": 0.09691868330076657, "flos": 20130814241280.0, "grad_norm": 2.1950074591485476, "language_loss": 0.82920468, "learning_rate": 3.953209349187115e-06, "loss": 0.85624015, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.34472656, "step": 1612, "time_per_iteration": 2.889291763305664 }, { "auxiliary_loss_clip": 0.01659098, "auxiliary_loss_mlp": 0.01058563, "balance_loss_clip": 1.3990407, "balance_loss_mlp": 1.02601886, "epoch": 0.09697880655343454, "flos": 16553305664640.0, "grad_norm": 2.332418049687465, "language_loss": 0.82654285, "learning_rate": 3.953125561311398e-06, "loss": 0.85371947, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.32543945, "step": 1613, "time_per_iteration": 2.868716239929199 }, { "auxiliary_loss_clip": 0.01643933, "auxiliary_loss_mlp": 0.01055775, "balance_loss_clip": 1.38937831, "balance_loss_mlp": 1.02149057, "epoch": 0.09703892980610251, "flos": 26115335683200.0, "grad_norm": 2.304378112165568, "language_loss": 0.86120695, "learning_rate": 3.953041699372964e-06, "loss": 0.88820404, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.34301758, "step": 1614, "time_per_iteration": 2.9557573795318604 }, { "auxiliary_loss_clip": 0.01455909, "auxiliary_loss_mlp": 0.01202433, "balance_loss_clip": 1.29597092, "balance_loss_mlp": 1.07120788, "epoch": 0.09709905305877048, "flos": 60474797539200.0, "grad_norm": 0.8811916472240193, "language_loss": 0.54673225, "learning_rate": 3.952957763374992e-06, "loss": 0.57331568, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 1.3125, "step": 1615, "time_per_iteration": 3.3516602516174316 }, { "auxiliary_loss_clip": 0.01449281, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.29068542, "balance_loss_mlp": 1.01389444, "epoch": 0.09715917631143844, "flos": 57671864259840.0, "grad_norm": 0.9169516296606716, "language_loss": 0.58343858, "learning_rate": 3.952873753320666e-06, "loss": 0.60826868, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19824219, "step": 1616, "time_per_iteration": 3.454557180404663 }, { "auxiliary_loss_clip": 0.01652103, "auxiliary_loss_mlp": 0.01056017, "balance_loss_clip": 1.39345479, "balance_loss_mlp": 1.02399743, "epoch": 0.09721929956410642, "flos": 20567889440640.0, "grad_norm": 1.739163347547808, "language_loss": 0.69494361, "learning_rate": 3.952789669213172e-06, "loss": 0.7220248, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.32006836, "step": 1617, "time_per_iteration": 4.305253505706787 }, { "auxiliary_loss_clip": 0.0164957, "auxiliary_loss_mlp": 0.01056067, "balance_loss_clip": 1.39020705, "balance_loss_mlp": 1.02252185, "epoch": 0.09727942281677439, "flos": 27355743889920.0, "grad_norm": 1.637223716991624, "language_loss": 0.81472832, "learning_rate": 3.952705511055698e-06, "loss": 0.84178472, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 2.59570312, "router_z_loss_mlp": 0.33544922, "step": 1618, "time_per_iteration": 2.968405246734619 }, { "auxiliary_loss_clip": 0.01647255, "auxiliary_loss_mlp": 0.01055738, "balance_loss_clip": 1.39423156, "balance_loss_mlp": 1.0230267, "epoch": 0.09733954606944235, "flos": 24910833864960.0, "grad_norm": 1.5403005117357864, "language_loss": 0.93926758, "learning_rate": 3.952621278851435e-06, "loss": 0.96629751, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.3269043, "step": 1619, "time_per_iteration": 2.92160701751709 }, { "auxiliary_loss_clip": 0.01645746, "auxiliary_loss_mlp": 0.01051729, "balance_loss_clip": 1.39275169, "balance_loss_mlp": 1.01968527, "epoch": 0.09739966932211033, "flos": 31516441683840.0, "grad_norm": 1.876683740818155, "language_loss": 0.90099275, "learning_rate": 3.9525369726035784e-06, "loss": 0.92796749, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.32055664, "step": 1620, "time_per_iteration": 2.9349992275238037 }, { "auxiliary_loss_clip": 0.01673294, "auxiliary_loss_mlp": 0.01059554, "balance_loss_clip": 1.41336679, "balance_loss_mlp": 1.02329063, "epoch": 0.0974597925747783, "flos": 23889393083520.0, "grad_norm": 2.5199832440775207, "language_loss": 0.78703415, "learning_rate": 3.952452592315324e-06, "loss": 0.81436259, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.36279297, "step": 1621, "time_per_iteration": 2.8896656036376953 }, { "auxiliary_loss_clip": 0.01659957, "auxiliary_loss_mlp": 0.01052864, "balance_loss_clip": 1.40289664, "balance_loss_mlp": 1.02010584, "epoch": 0.09751991582744626, "flos": 17028459002880.0, "grad_norm": 1.921034322446758, "language_loss": 0.79299259, "learning_rate": 3.952368137989871e-06, "loss": 0.82012081, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.32739258, "step": 1622, "time_per_iteration": 4.305184841156006 }, { "auxiliary_loss_clip": 0.01678574, "auxiliary_loss_mlp": 0.01053947, "balance_loss_clip": 1.41343451, "balance_loss_mlp": 1.02159357, "epoch": 0.09758003908011423, "flos": 28414584138240.0, "grad_norm": 2.849900325175906, "language_loss": 0.86250341, "learning_rate": 3.9522836096304225e-06, "loss": 0.88982868, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 2.65234375, "router_z_loss_mlp": 0.32324219, "step": 1623, "time_per_iteration": 2.9492335319519043 }, { "auxiliary_loss_clip": 0.01660984, "auxiliary_loss_mlp": 0.01050031, "balance_loss_clip": 1.40490985, "balance_loss_mlp": 1.01867926, "epoch": 0.09764016233278221, "flos": 18151827615360.0, "grad_norm": 2.0722409469359526, "language_loss": 0.81531084, "learning_rate": 3.952199007240184e-06, "loss": 0.842421, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.31347656, "step": 1624, "time_per_iteration": 2.8749661445617676 }, { "auxiliary_loss_clip": 0.01671576, "auxiliary_loss_mlp": 0.01050663, "balance_loss_clip": 1.41634953, "balance_loss_mlp": 1.01881027, "epoch": 0.09770028558545017, "flos": 15273642954240.0, "grad_norm": 2.2243270137073035, "language_loss": 0.86989129, "learning_rate": 3.952114330822364e-06, "loss": 0.89711374, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31860352, "step": 1625, "time_per_iteration": 4.385685682296753 }, { "auxiliary_loss_clip": 0.01693149, "auxiliary_loss_mlp": 0.01058917, "balance_loss_clip": 1.42737448, "balance_loss_mlp": 1.02205789, "epoch": 0.09776040883811814, "flos": 23481663776640.0, "grad_norm": 2.0026177819574533, "language_loss": 0.86380178, "learning_rate": 3.952029580380172e-06, "loss": 0.89132249, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.36865234, "step": 1626, "time_per_iteration": 4.45385479927063 }, { "auxiliary_loss_clip": 0.01688389, "auxiliary_loss_mlp": 0.01055092, "balance_loss_clip": 1.42240798, "balance_loss_mlp": 1.02164221, "epoch": 0.09782053209078612, "flos": 24510388970880.0, "grad_norm": 1.8367417036470772, "language_loss": 0.83688557, "learning_rate": 3.9519447559168234e-06, "loss": 0.8643204, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 2.66210938, "router_z_loss_mlp": 0.33447266, "step": 1627, "time_per_iteration": 2.9997692108154297 }, { "auxiliary_loss_clip": 0.01685794, "auxiliary_loss_mlp": 0.01060409, "balance_loss_clip": 1.42672801, "balance_loss_mlp": 1.02631545, "epoch": 0.09788065534345408, "flos": 21590189873280.0, "grad_norm": 1.8147677850618489, "language_loss": 0.85667801, "learning_rate": 3.951859857435534e-06, "loss": 0.88414001, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.34130859, "step": 1628, "time_per_iteration": 2.880930185317993 }, { "auxiliary_loss_clip": 0.01671965, "auxiliary_loss_mlp": 0.01054103, "balance_loss_clip": 1.41638947, "balance_loss_mlp": 1.02267981, "epoch": 0.09794077859612205, "flos": 23853124736640.0, "grad_norm": 1.6318197992984442, "language_loss": 0.77119827, "learning_rate": 3.951774884939523e-06, "loss": 0.79845893, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31420898, "step": 1629, "time_per_iteration": 2.9591474533081055 }, { "auxiliary_loss_clip": 0.01675769, "auxiliary_loss_mlp": 0.0105316, "balance_loss_clip": 1.41675663, "balance_loss_mlp": 1.02009177, "epoch": 0.09800090184879003, "flos": 23670470903040.0, "grad_norm": 2.5756197553458944, "language_loss": 0.79707205, "learning_rate": 3.951689838432013e-06, "loss": 0.82436126, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.33056641, "step": 1630, "time_per_iteration": 2.9287426471710205 }, { "auxiliary_loss_clip": 0.01686935, "auxiliary_loss_mlp": 0.0105417, "balance_loss_clip": 1.42304778, "balance_loss_mlp": 1.02076793, "epoch": 0.09806102510145799, "flos": 17064591615360.0, "grad_norm": 3.2334987700011717, "language_loss": 0.87786329, "learning_rate": 3.951604717916228e-06, "loss": 0.90527433, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 2.63867188, "router_z_loss_mlp": 0.33374023, "step": 1631, "time_per_iteration": 2.830043077468872 }, { "auxiliary_loss_clip": 0.01667944, "auxiliary_loss_mlp": 0.0105587, "balance_loss_clip": 1.41184628, "balance_loss_mlp": 1.02163315, "epoch": 0.09812114835412596, "flos": 23889031125120.0, "grad_norm": 1.845748022879846, "language_loss": 0.83683145, "learning_rate": 3.9515195233953975e-06, "loss": 0.86406958, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.3425293, "step": 1632, "time_per_iteration": 3.027871608734131 }, { "auxiliary_loss_clip": 0.01674796, "auxiliary_loss_mlp": 0.01060629, "balance_loss_clip": 1.41676235, "balance_loss_mlp": 1.02710688, "epoch": 0.09818127160679392, "flos": 20605379397120.0, "grad_norm": 1.4980867869267873, "language_loss": 0.79750866, "learning_rate": 3.951434254872751e-06, "loss": 0.82486284, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.3347168, "step": 1633, "time_per_iteration": 2.857276678085327 }, { "auxiliary_loss_clip": 0.01661805, "auxiliary_loss_mlp": 0.01054102, "balance_loss_clip": 1.40637767, "balance_loss_mlp": 1.0222255, "epoch": 0.0982413948594619, "flos": 15495868005120.0, "grad_norm": 5.829786364016594, "language_loss": 0.74253768, "learning_rate": 3.951348912351521e-06, "loss": 0.76969671, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31860352, "step": 1634, "time_per_iteration": 2.8926961421966553 }, { "auxiliary_loss_clip": 0.01685673, "auxiliary_loss_mlp": 0.01067148, "balance_loss_clip": 1.4189111, "balance_loss_mlp": 1.030527, "epoch": 0.09830151811212987, "flos": 24218884851840.0, "grad_norm": 2.7996077708593967, "language_loss": 0.74234295, "learning_rate": 3.951263495834947e-06, "loss": 0.76987118, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.36669922, "step": 1635, "time_per_iteration": 2.945892333984375 }, { "auxiliary_loss_clip": 0.0169307, "auxiliary_loss_mlp": 0.01067713, "balance_loss_clip": 1.42735028, "balance_loss_mlp": 1.03216505, "epoch": 0.09836164136479783, "flos": 20604248277120.0, "grad_norm": 1.6861794853413723, "language_loss": 0.78761971, "learning_rate": 3.951178005326264e-06, "loss": 0.81522757, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 2.65820312, "router_z_loss_mlp": 0.35546875, "step": 1636, "time_per_iteration": 2.940061569213867 }, { "auxiliary_loss_clip": 0.0168376, "auxiliary_loss_mlp": 0.01060349, "balance_loss_clip": 1.42006958, "balance_loss_mlp": 1.02575421, "epoch": 0.09842176461746581, "flos": 19942685786880.0, "grad_norm": 2.08279676563933, "language_loss": 0.7103622, "learning_rate": 3.951092440828715e-06, "loss": 0.73780322, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.34594727, "step": 1637, "time_per_iteration": 2.893254518508911 }, { "auxiliary_loss_clip": 0.01675004, "auxiliary_loss_mlp": 0.01068969, "balance_loss_clip": 1.41543412, "balance_loss_mlp": 1.03492332, "epoch": 0.09848188787013377, "flos": 21224429758080.0, "grad_norm": 2.000875303067703, "language_loss": 0.7912671, "learning_rate": 3.951006802345545e-06, "loss": 0.81870681, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.34033203, "step": 1638, "time_per_iteration": 2.841355323791504 }, { "auxiliary_loss_clip": 0.01652974, "auxiliary_loss_mlp": 0.01054674, "balance_loss_clip": 1.40154743, "balance_loss_mlp": 1.02120054, "epoch": 0.09854201112280174, "flos": 30166459274880.0, "grad_norm": 1.454420564234614, "language_loss": 0.73573679, "learning_rate": 3.950921089880003e-06, "loss": 0.76281333, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.33496094, "step": 1639, "time_per_iteration": 2.96954083442688 }, { "auxiliary_loss_clip": 0.01683684, "auxiliary_loss_mlp": 0.01058141, "balance_loss_clip": 1.42171574, "balance_loss_mlp": 1.0230937, "epoch": 0.09860213437546972, "flos": 21804949532160.0, "grad_norm": 2.0919120124570245, "language_loss": 0.89702493, "learning_rate": 3.950835303435337e-06, "loss": 0.92444324, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.35058594, "step": 1640, "time_per_iteration": 3.0248308181762695 }, { "auxiliary_loss_clip": 0.01674831, "auxiliary_loss_mlp": 0.01056629, "balance_loss_clip": 1.41494918, "balance_loss_mlp": 1.0232029, "epoch": 0.09866225762813768, "flos": 21845651869440.0, "grad_norm": 2.0434595825272694, "language_loss": 0.83053005, "learning_rate": 3.950749443014801e-06, "loss": 0.85784465, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.33398438, "step": 1641, "time_per_iteration": 2.9520416259765625 }, { "auxiliary_loss_clip": 0.01673711, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.41183388, "balance_loss_mlp": 1.02181244, "epoch": 0.09872238088080565, "flos": 17607983391360.0, "grad_norm": 3.7281091732192246, "language_loss": 0.88705671, "learning_rate": 3.95066350862165e-06, "loss": 0.91434479, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.33251953, "step": 1642, "time_per_iteration": 3.0409581661224365 }, { "auxiliary_loss_clip": 0.0167975, "auxiliary_loss_mlp": 0.01062298, "balance_loss_clip": 1.42301345, "balance_loss_mlp": 1.02894366, "epoch": 0.09878250413347361, "flos": 27647564722560.0, "grad_norm": 1.6246653175310566, "language_loss": 0.8221935, "learning_rate": 3.950577500259144e-06, "loss": 0.84961402, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.33349609, "step": 1643, "time_per_iteration": 3.002387046813965 }, { "auxiliary_loss_clip": 0.01687656, "auxiliary_loss_mlp": 0.01066427, "balance_loss_clip": 1.4263258, "balance_loss_mlp": 1.03347778, "epoch": 0.0988426273861416, "flos": 16553350909440.0, "grad_norm": 1.8461821839513446, "language_loss": 0.83488715, "learning_rate": 3.950491417930543e-06, "loss": 0.86242801, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.3293457, "step": 1644, "time_per_iteration": 2.945784330368042 }, { "auxiliary_loss_clip": 0.01672817, "auxiliary_loss_mlp": 0.010608, "balance_loss_clip": 1.41988802, "balance_loss_mlp": 1.02670634, "epoch": 0.09890275063880956, "flos": 21225198919680.0, "grad_norm": 1.6729221350538515, "language_loss": 0.69492525, "learning_rate": 3.9504052616391124e-06, "loss": 0.72226143, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.34106445, "step": 1645, "time_per_iteration": 2.962468385696411 }, { "auxiliary_loss_clip": 0.01433318, "auxiliary_loss_mlp": 0.01082604, "balance_loss_clip": 1.27307642, "balance_loss_mlp": 1.0465548, "epoch": 0.09896287389147752, "flos": 59408672878080.0, "grad_norm": 0.8576296353662578, "language_loss": 0.60905313, "learning_rate": 3.950319031388119e-06, "loss": 0.63421237, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.36132812, "step": 1646, "time_per_iteration": 3.3440983295440674 }, { "auxiliary_loss_clip": 0.01659233, "auxiliary_loss_mlp": 0.01055604, "balance_loss_clip": 1.40046811, "balance_loss_mlp": 1.02182043, "epoch": 0.0990229971441455, "flos": 29654585141760.0, "grad_norm": 1.5661887617310395, "language_loss": 0.73899096, "learning_rate": 3.950232727180833e-06, "loss": 0.76613927, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.33764648, "step": 1647, "time_per_iteration": 2.991462230682373 }, { "auxiliary_loss_clip": 0.0167897, "auxiliary_loss_mlp": 0.0105975, "balance_loss_clip": 1.41851616, "balance_loss_mlp": 1.02901816, "epoch": 0.09908312039681347, "flos": 21844792218240.0, "grad_norm": 2.6522101328662533, "language_loss": 0.86134726, "learning_rate": 3.950146349020525e-06, "loss": 0.88873446, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 2.60742188, "router_z_loss_mlp": 0.30712891, "step": 1648, "time_per_iteration": 2.9638943672180176 }, { "auxiliary_loss_clip": 0.01418548, "auxiliary_loss_mlp": 0.01028149, "balance_loss_clip": 1.2630887, "balance_loss_mlp": 1.00926626, "epoch": 0.09914324364948143, "flos": 57595527002880.0, "grad_norm": 0.7412887852309825, "language_loss": 0.55764318, "learning_rate": 3.950059896910473e-06, "loss": 0.58211017, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18847656, "step": 1649, "time_per_iteration": 3.294149398803711 }, { "auxiliary_loss_clip": 0.01662693, "auxiliary_loss_mlp": 0.0107432, "balance_loss_clip": 1.40591872, "balance_loss_mlp": 1.04394603, "epoch": 0.09920336690214941, "flos": 34135318540800.0, "grad_norm": 2.146224760986189, "language_loss": 0.91636431, "learning_rate": 3.949973370853954e-06, "loss": 0.94373447, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.30395508, "step": 1650, "time_per_iteration": 3.152336597442627 }, { "auxiliary_loss_clip": 0.01418269, "auxiliary_loss_mlp": 0.01026852, "balance_loss_clip": 1.26254106, "balance_loss_mlp": 1.00987685, "epoch": 0.09926349015481738, "flos": 71252866800000.0, "grad_norm": 0.8941358468866922, "language_loss": 0.63933432, "learning_rate": 3.94988677085425e-06, "loss": 0.66378552, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.16992188, "step": 1651, "time_per_iteration": 3.469982624053955 }, { "auxiliary_loss_clip": 0.01656066, "auxiliary_loss_mlp": 0.01091817, "balance_loss_clip": 1.4001925, "balance_loss_mlp": 1.06053674, "epoch": 0.09932361340748534, "flos": 23159049217920.0, "grad_norm": 1.8632920635116326, "language_loss": 0.8870306, "learning_rate": 3.949800096914643e-06, "loss": 0.91450953, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31298828, "step": 1652, "time_per_iteration": 4.444247484207153 }, { "auxiliary_loss_clip": 0.01656101, "auxiliary_loss_mlp": 0.01099525, "balance_loss_clip": 1.40004015, "balance_loss_mlp": 1.06869769, "epoch": 0.09938373666015332, "flos": 19838179002240.0, "grad_norm": 2.0649214290469775, "language_loss": 0.83022273, "learning_rate": 3.949713349038422e-06, "loss": 0.85777891, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.30834961, "step": 1653, "time_per_iteration": 2.9593770503997803 }, { "auxiliary_loss_clip": 0.01665012, "auxiliary_loss_mlp": 0.01109709, "balance_loss_clip": 1.40404069, "balance_loss_mlp": 1.07859564, "epoch": 0.09944385991282129, "flos": 22100751907200.0, "grad_norm": 1.6242681281374058, "language_loss": 0.81298214, "learning_rate": 3.949626527228875e-06, "loss": 0.84072936, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 2.609375, "router_z_loss_mlp": 0.3112793, "step": 1654, "time_per_iteration": 2.939091682434082 }, { "auxiliary_loss_clip": 0.01642086, "auxiliary_loss_mlp": 0.01111678, "balance_loss_clip": 1.39667869, "balance_loss_mlp": 1.08092165, "epoch": 0.09950398316548925, "flos": 19838450471040.0, "grad_norm": 1.4842317120426711, "language_loss": 0.82624382, "learning_rate": 3.949539631489295e-06, "loss": 0.85378146, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.30761719, "step": 1655, "time_per_iteration": 2.891676187515259 }, { "auxiliary_loss_clip": 0.01651685, "auxiliary_loss_mlp": 0.01112978, "balance_loss_clip": 1.39562845, "balance_loss_mlp": 1.08155417, "epoch": 0.09956410641815722, "flos": 25013033164800.0, "grad_norm": 1.8225771329591445, "language_loss": 0.82350165, "learning_rate": 3.9494526618229765e-06, "loss": 0.85114831, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.31420898, "step": 1656, "time_per_iteration": 2.968209743499756 }, { "auxiliary_loss_clip": 0.01669599, "auxiliary_loss_mlp": 0.01116259, "balance_loss_clip": 1.41334581, "balance_loss_mlp": 1.08590841, "epoch": 0.0996242296708252, "flos": 19327074030720.0, "grad_norm": 1.5325067508829209, "language_loss": 0.89919537, "learning_rate": 3.949365618233217e-06, "loss": 0.92705393, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.30371094, "step": 1657, "time_per_iteration": 4.365803241729736 }, { "auxiliary_loss_clip": 0.01676938, "auxiliary_loss_mlp": 0.01121521, "balance_loss_clip": 1.41021478, "balance_loss_mlp": 1.08964479, "epoch": 0.09968435292349316, "flos": 21881603502720.0, "grad_norm": 2.3752628990889897, "language_loss": 0.86534941, "learning_rate": 3.9492785007233195e-06, "loss": 0.89333403, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 2.66601562, "router_z_loss_mlp": 0.31884766, "step": 1658, "time_per_iteration": 2.9380922317504883 }, { "auxiliary_loss_clip": 0.01406291, "auxiliary_loss_mlp": 0.01054549, "balance_loss_clip": 1.25232887, "balance_loss_mlp": 1.03280509, "epoch": 0.09974447617616113, "flos": 65411500481280.0, "grad_norm": 0.9029586435383485, "language_loss": 0.60973328, "learning_rate": 3.949191309296585e-06, "loss": 0.63434166, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.21777344, "step": 1659, "time_per_iteration": 3.3646867275238037 }, { "auxiliary_loss_clip": 0.01654625, "auxiliary_loss_mlp": 0.01104976, "balance_loss_clip": 1.40116262, "balance_loss_mlp": 1.07321835, "epoch": 0.0998045994288291, "flos": 23670380413440.0, "grad_norm": 2.3456588179276086, "language_loss": 0.86984038, "learning_rate": 3.949104043956321e-06, "loss": 0.89743626, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.31762695, "step": 1660, "time_per_iteration": 2.888805389404297 }, { "auxiliary_loss_clip": 0.01662228, "auxiliary_loss_mlp": 0.01088557, "balance_loss_clip": 1.40853071, "balance_loss_mlp": 1.05756259, "epoch": 0.09986472268149707, "flos": 19619166332160.0, "grad_norm": 2.079259343004954, "language_loss": 0.80868685, "learning_rate": 3.949016704705836e-06, "loss": 0.83619469, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.31005859, "step": 1661, "time_per_iteration": 5.724136829376221 }, { "auxiliary_loss_clip": 0.01678603, "auxiliary_loss_mlp": 0.01090577, "balance_loss_clip": 1.41135836, "balance_loss_mlp": 1.05848575, "epoch": 0.09992484593416504, "flos": 26224412192640.0, "grad_norm": 1.8929103029583056, "language_loss": 0.84489989, "learning_rate": 3.948929291548443e-06, "loss": 0.87259167, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 2.671875, "router_z_loss_mlp": 0.32104492, "step": 1662, "time_per_iteration": 2.992192268371582 }, { "auxiliary_loss_clip": 0.01645088, "auxiliary_loss_mlp": 0.01084047, "balance_loss_clip": 1.38852763, "balance_loss_mlp": 1.05019176, "epoch": 0.09998496918683301, "flos": 17502571710720.0, "grad_norm": 1.875665560280599, "language_loss": 0.90547681, "learning_rate": 3.9488418044874546e-06, "loss": 0.93276817, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.33837891, "step": 1663, "time_per_iteration": 2.893289804458618 }, { "auxiliary_loss_clip": 0.0166504, "auxiliary_loss_mlp": 0.01073249, "balance_loss_clip": 1.40243971, "balance_loss_mlp": 1.04058564, "epoch": 0.10004509243950098, "flos": 22795460853120.0, "grad_norm": 1.6503906532829125, "language_loss": 0.7170831, "learning_rate": 3.948754243526191e-06, "loss": 0.74446595, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 2.62695312, "router_z_loss_mlp": 0.32666016, "step": 1664, "time_per_iteration": 2.9072556495666504 }, { "auxiliary_loss_clip": 0.01640771, "auxiliary_loss_mlp": 0.01066298, "balance_loss_clip": 1.38310039, "balance_loss_mlp": 1.03244233, "epoch": 0.10010521569216894, "flos": 16262073014400.0, "grad_norm": 2.6469106304180334, "language_loss": 0.80001211, "learning_rate": 3.94866660866797e-06, "loss": 0.82708281, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 2.57617188, "router_z_loss_mlp": 0.33862305, "step": 1665, "time_per_iteration": 2.8685007095336914 }, { "auxiliary_loss_clip": 0.01654298, "auxiliary_loss_mlp": 0.01072739, "balance_loss_clip": 1.39549804, "balance_loss_mlp": 1.03795362, "epoch": 0.10016533894483691, "flos": 23412610932480.0, "grad_norm": 1.716968963228285, "language_loss": 0.70704883, "learning_rate": 3.9485788999161165e-06, "loss": 0.73431921, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.34790039, "step": 1666, "time_per_iteration": 2.902575969696045 }, { "auxiliary_loss_clip": 0.0166278, "auxiliary_loss_mlp": 0.01072177, "balance_loss_clip": 1.39978373, "balance_loss_mlp": 1.03636634, "epoch": 0.10022546219750489, "flos": 19363523356800.0, "grad_norm": 1.9047857404542026, "language_loss": 0.80325389, "learning_rate": 3.948491117273956e-06, "loss": 0.83060348, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.35791016, "step": 1667, "time_per_iteration": 2.854438543319702 }, { "auxiliary_loss_clip": 0.0163957, "auxiliary_loss_mlp": 0.01063807, "balance_loss_clip": 1.38292968, "balance_loss_mlp": 1.02845001, "epoch": 0.10028558545017285, "flos": 27096255106560.0, "grad_norm": 2.455144623915397, "language_loss": 0.79675514, "learning_rate": 3.948403260744817e-06, "loss": 0.82378888, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.35327148, "step": 1668, "time_per_iteration": 2.950324058532715 }, { "auxiliary_loss_clip": 0.01629477, "auxiliary_loss_mlp": 0.01059295, "balance_loss_clip": 1.37405825, "balance_loss_mlp": 1.02222097, "epoch": 0.10034570870284082, "flos": 25857792426240.0, "grad_norm": 2.0955813013551277, "language_loss": 0.79053175, "learning_rate": 3.948315330332031e-06, "loss": 0.81741947, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.37060547, "step": 1669, "time_per_iteration": 2.970571279525757 }, { "auxiliary_loss_clip": 0.01652121, "auxiliary_loss_mlp": 0.01072164, "balance_loss_clip": 1.38983178, "balance_loss_mlp": 1.03611541, "epoch": 0.1004058319555088, "flos": 26260635294720.0, "grad_norm": 2.169091601472982, "language_loss": 0.86633027, "learning_rate": 3.948227326038933e-06, "loss": 0.89357316, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 2.62304688, "router_z_loss_mlp": 0.3605957, "step": 1670, "time_per_iteration": 2.9880945682525635 }, { "auxiliary_loss_clip": 0.0162786, "auxiliary_loss_mlp": 0.0106165, "balance_loss_clip": 1.38059139, "balance_loss_mlp": 1.02681756, "epoch": 0.10046595520817676, "flos": 25385444265600.0, "grad_norm": 1.6463926918416225, "language_loss": 0.77591807, "learning_rate": 3.9481392478688586e-06, "loss": 0.80281317, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.34838867, "step": 1671, "time_per_iteration": 2.9631965160369873 }, { "auxiliary_loss_clip": 0.01390151, "auxiliary_loss_mlp": 0.01055989, "balance_loss_clip": 1.23043489, "balance_loss_mlp": 1.02909517, "epoch": 0.10052607846084473, "flos": 67488704864640.0, "grad_norm": 0.7857511421673885, "language_loss": 0.60782909, "learning_rate": 3.948051095825149e-06, "loss": 0.63229048, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.26953125, "step": 1672, "time_per_iteration": 3.3744518756866455 }, { "auxiliary_loss_clip": 0.01650999, "auxiliary_loss_mlp": 0.01065444, "balance_loss_clip": 1.39364076, "balance_loss_mlp": 1.02667689, "epoch": 0.10058620171351271, "flos": 21370272307200.0, "grad_norm": 1.9233180959994893, "language_loss": 0.7783345, "learning_rate": 3.947962869911147e-06, "loss": 0.80549896, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.38793945, "step": 1673, "time_per_iteration": 2.835486650466919 }, { "auxiliary_loss_clip": 0.01647363, "auxiliary_loss_mlp": 0.01061035, "balance_loss_clip": 1.39086044, "balance_loss_mlp": 1.02484345, "epoch": 0.10064632496618067, "flos": 16808812905600.0, "grad_norm": 2.341539480492534, "language_loss": 0.75925916, "learning_rate": 3.947874570130197e-06, "loss": 0.78634322, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.36181641, "step": 1674, "time_per_iteration": 2.829949140548706 }, { "auxiliary_loss_clip": 0.01639911, "auxiliary_loss_mlp": 0.01071286, "balance_loss_clip": 1.3825438, "balance_loss_mlp": 1.03685784, "epoch": 0.10070644821884864, "flos": 23635288431360.0, "grad_norm": 4.7178875817820485, "language_loss": 0.80480331, "learning_rate": 3.947786196485649e-06, "loss": 0.83191532, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.34448242, "step": 1675, "time_per_iteration": 2.8814475536346436 }, { "auxiliary_loss_clip": 0.01629417, "auxiliary_loss_mlp": 0.01059034, "balance_loss_clip": 1.37404251, "balance_loss_mlp": 1.02398646, "epoch": 0.1007665714715166, "flos": 24473351462400.0, "grad_norm": 2.176203324708197, "language_loss": 0.82577938, "learning_rate": 3.947697748980853e-06, "loss": 0.85266387, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.3503418, "step": 1676, "time_per_iteration": 2.866112470626831 }, { "auxiliary_loss_clip": 0.01655655, "auxiliary_loss_mlp": 0.01065716, "balance_loss_clip": 1.39657378, "balance_loss_mlp": 1.0286901, "epoch": 0.10082669472418458, "flos": 16807546051200.0, "grad_norm": 1.8931664892652669, "language_loss": 0.87307936, "learning_rate": 3.947609227619163e-06, "loss": 0.90029299, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.37036133, "step": 1677, "time_per_iteration": 2.829728603363037 }, { "auxiliary_loss_clip": 0.01642844, "auxiliary_loss_mlp": 0.01060559, "balance_loss_clip": 1.38630915, "balance_loss_mlp": 1.02663219, "epoch": 0.10088681797685255, "flos": 13561520014080.0, "grad_norm": 1.8140118150399882, "language_loss": 0.87695408, "learning_rate": 3.947520632403936e-06, "loss": 0.90398806, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.33935547, "step": 1678, "time_per_iteration": 2.863939046859741 }, { "auxiliary_loss_clip": 0.01643288, "auxiliary_loss_mlp": 0.01064866, "balance_loss_clip": 1.38737714, "balance_loss_mlp": 1.03093934, "epoch": 0.10094694122952051, "flos": 25276820204160.0, "grad_norm": 1.9409867713278317, "language_loss": 0.91164911, "learning_rate": 3.947431963338532e-06, "loss": 0.93873066, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.33911133, "step": 1679, "time_per_iteration": 2.873224973678589 }, { "auxiliary_loss_clip": 0.01397275, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.24328172, "balance_loss_mlp": 1.02404392, "epoch": 0.10100706448218849, "flos": 69887229690240.0, "grad_norm": 0.7849845222796132, "language_loss": 0.53100234, "learning_rate": 3.947343220426312e-06, "loss": 0.5553872, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.171875, "step": 1680, "time_per_iteration": 3.4003121852874756 }, { "auxiliary_loss_clip": 0.01636305, "auxiliary_loss_mlp": 0.01059589, "balance_loss_clip": 1.38392699, "balance_loss_mlp": 1.02442181, "epoch": 0.10106718773485646, "flos": 20015720173440.0, "grad_norm": 1.8279466723189983, "language_loss": 0.77787495, "learning_rate": 3.947254403670641e-06, "loss": 0.80483389, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.3515625, "step": 1681, "time_per_iteration": 2.8465073108673096 }, { "auxiliary_loss_clip": 0.01657775, "auxiliary_loss_mlp": 0.01060418, "balance_loss_clip": 1.39468622, "balance_loss_mlp": 1.02527499, "epoch": 0.10112731098752442, "flos": 13487625976320.0, "grad_norm": 2.3866911160155375, "language_loss": 0.9551121, "learning_rate": 3.947165513074889e-06, "loss": 0.98229402, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.35180664, "step": 1682, "time_per_iteration": 2.854020595550537 }, { "auxiliary_loss_clip": 0.01636086, "auxiliary_loss_mlp": 0.01060827, "balance_loss_clip": 1.3780334, "balance_loss_mlp": 1.02730572, "epoch": 0.1011874342401924, "flos": 18525279346560.0, "grad_norm": 1.908998024014615, "language_loss": 0.88646305, "learning_rate": 3.947076548642425e-06, "loss": 0.91343218, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.33544922, "step": 1683, "time_per_iteration": 2.8425066471099854 }, { "auxiliary_loss_clip": 0.01633548, "auxiliary_loss_mlp": 0.01054732, "balance_loss_clip": 1.3801465, "balance_loss_mlp": 1.02216423, "epoch": 0.10124755749286037, "flos": 20712238911360.0, "grad_norm": 1.952470283015156, "language_loss": 0.76025844, "learning_rate": 3.946987510376624e-06, "loss": 0.7871412, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.32568359, "step": 1684, "time_per_iteration": 2.9909896850585938 }, { "auxiliary_loss_clip": 0.01411753, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.25598288, "balance_loss_mlp": 1.0148958, "epoch": 0.10130768074552833, "flos": 56141490257280.0, "grad_norm": 0.7616883329885786, "language_loss": 0.61197436, "learning_rate": 3.9468983982808615e-06, "loss": 0.6364516, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.2109375, "step": 1685, "time_per_iteration": 3.4471018314361572 }, { "auxiliary_loss_clip": 0.01631877, "auxiliary_loss_mlp": 0.01062662, "balance_loss_clip": 1.37613583, "balance_loss_mlp": 1.0272094, "epoch": 0.1013678039981963, "flos": 33415652448000.0, "grad_norm": 2.460523872673505, "language_loss": 0.62732834, "learning_rate": 3.946809212358516e-06, "loss": 0.65427375, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.35449219, "step": 1686, "time_per_iteration": 2.977872133255005 }, { "auxiliary_loss_clip": 0.01626053, "auxiliary_loss_mlp": 0.01062362, "balance_loss_clip": 1.37669873, "balance_loss_mlp": 1.02617049, "epoch": 0.10142792725086427, "flos": 31917972453120.0, "grad_norm": 2.324665368725329, "language_loss": 0.81534594, "learning_rate": 3.946719952612972e-06, "loss": 0.84223008, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.36181641, "step": 1687, "time_per_iteration": 3.0080950260162354 }, { "auxiliary_loss_clip": 0.01652481, "auxiliary_loss_mlp": 0.01063124, "balance_loss_clip": 1.39276397, "balance_loss_mlp": 1.02314115, "epoch": 0.10148805050353224, "flos": 28487709014400.0, "grad_norm": 1.6084524842911123, "language_loss": 0.73304212, "learning_rate": 3.94663061904761e-06, "loss": 0.76019812, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.40014648, "step": 1688, "time_per_iteration": 4.3482506275177 }, { "auxiliary_loss_clip": 0.01622541, "auxiliary_loss_mlp": 0.01056848, "balance_loss_clip": 1.37256205, "balance_loss_mlp": 1.02299201, "epoch": 0.1015481737562002, "flos": 25158061307520.0, "grad_norm": 2.0632919782674644, "language_loss": 0.87715614, "learning_rate": 3.94654121166582e-06, "loss": 0.90394998, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.33862305, "step": 1689, "time_per_iteration": 2.8477039337158203 }, { "auxiliary_loss_clip": 0.01601391, "auxiliary_loss_mlp": 0.01055077, "balance_loss_clip": 1.35162401, "balance_loss_mlp": 1.02331924, "epoch": 0.10160829700886818, "flos": 30894902858880.0, "grad_norm": 1.7257431313037477, "language_loss": 0.90173066, "learning_rate": 3.946451730470993e-06, "loss": 0.92829537, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.31713867, "step": 1690, "time_per_iteration": 3.057968854904175 }, { "auxiliary_loss_clip": 0.01634154, "auxiliary_loss_mlp": 0.0105823, "balance_loss_clip": 1.37967944, "balance_loss_mlp": 1.02396917, "epoch": 0.10166842026153615, "flos": 20421594443520.0, "grad_norm": 2.349401431963509, "language_loss": 0.84699166, "learning_rate": 3.946362175466521e-06, "loss": 0.87391555, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.34228516, "step": 1691, "time_per_iteration": 3.0034735202789307 }, { "auxiliary_loss_clip": 0.01631343, "auxiliary_loss_mlp": 0.01060545, "balance_loss_clip": 1.37709665, "balance_loss_mlp": 1.02711892, "epoch": 0.10172854351420411, "flos": 33490722850560.0, "grad_norm": 1.7145380462601965, "language_loss": 0.68260574, "learning_rate": 3.946272546655801e-06, "loss": 0.70952463, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.33447266, "step": 1692, "time_per_iteration": 4.420594215393066 }, { "auxiliary_loss_clip": 0.01621381, "auxiliary_loss_mlp": 0.01062277, "balance_loss_clip": 1.37000465, "balance_loss_mlp": 1.02975702, "epoch": 0.1017886667668721, "flos": 23560851456000.0, "grad_norm": 1.8693052396234329, "language_loss": 0.77762872, "learning_rate": 3.94618284404223e-06, "loss": 0.80446529, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.32519531, "step": 1693, "time_per_iteration": 2.916638135910034 }, { "auxiliary_loss_clip": 0.01624915, "auxiliary_loss_mlp": 0.01063789, "balance_loss_clip": 1.36832333, "balance_loss_mlp": 1.02940929, "epoch": 0.10184879001954006, "flos": 23306837293440.0, "grad_norm": 1.7251910500257541, "language_loss": 0.88054377, "learning_rate": 3.9460930676292105e-06, "loss": 0.90743083, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.34399414, "step": 1694, "time_per_iteration": 2.899407148361206 }, { "auxiliary_loss_clip": 0.01634678, "auxiliary_loss_mlp": 0.01071615, "balance_loss_clip": 1.37615085, "balance_loss_mlp": 1.03501797, "epoch": 0.10190891327220802, "flos": 18342354044160.0, "grad_norm": 1.9353100494807025, "language_loss": 0.81407481, "learning_rate": 3.946003217420147e-06, "loss": 0.84113777, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.36547852, "step": 1695, "time_per_iteration": 2.8439905643463135 }, { "auxiliary_loss_clip": 0.01634777, "auxiliary_loss_mlp": 0.01065726, "balance_loss_clip": 1.3777113, "balance_loss_mlp": 1.03020167, "epoch": 0.10196903652487599, "flos": 26475666422400.0, "grad_norm": 1.8203360153226271, "language_loss": 0.87410241, "learning_rate": 3.945913293418447e-06, "loss": 0.90110743, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.35522461, "step": 1696, "time_per_iteration": 5.84446120262146 }, { "auxiliary_loss_clip": 0.01610557, "auxiliary_loss_mlp": 0.01060179, "balance_loss_clip": 1.36300743, "balance_loss_mlp": 1.02577507, "epoch": 0.10202915977754397, "flos": 21878979304320.0, "grad_norm": 1.7463107283667392, "language_loss": 0.82610399, "learning_rate": 3.945823295627519e-06, "loss": 0.85281134, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.34375, "step": 1697, "time_per_iteration": 2.9421753883361816 }, { "auxiliary_loss_clip": 0.0164112, "auxiliary_loss_mlp": 0.01061551, "balance_loss_clip": 1.38431454, "balance_loss_mlp": 1.02831578, "epoch": 0.10208928303021193, "flos": 22319945556480.0, "grad_norm": 2.0228655405503746, "language_loss": 0.826581, "learning_rate": 3.9457332240507775e-06, "loss": 0.85360771, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 2.56835938, "router_z_loss_mlp": 0.33227539, "step": 1698, "time_per_iteration": 2.8640811443328857 }, { "auxiliary_loss_clip": 0.01640887, "auxiliary_loss_mlp": 0.01056985, "balance_loss_clip": 1.38619721, "balance_loss_mlp": 1.02467918, "epoch": 0.1021494062828799, "flos": 22135255706880.0, "grad_norm": 4.768320934070104, "language_loss": 0.77478015, "learning_rate": 3.945643078691637e-06, "loss": 0.80175883, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.32250977, "step": 1699, "time_per_iteration": 2.9023337364196777 }, { "auxiliary_loss_clip": 0.01620192, "auxiliary_loss_mlp": 0.01057445, "balance_loss_clip": 1.3701216, "balance_loss_mlp": 1.0254494, "epoch": 0.10220952953554788, "flos": 19656339575040.0, "grad_norm": 1.9656606228135034, "language_loss": 0.80870491, "learning_rate": 3.945552859553516e-06, "loss": 0.83548129, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.31982422, "step": 1700, "time_per_iteration": 2.926096200942993 }, { "auxiliary_loss_clip": 0.01646348, "auxiliary_loss_mlp": 0.01060103, "balance_loss_clip": 1.39144063, "balance_loss_mlp": 1.02844143, "epoch": 0.10226965278821584, "flos": 29798889367680.0, "grad_norm": 1.7342292760684124, "language_loss": 0.78124261, "learning_rate": 3.945462566639836e-06, "loss": 0.80830705, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.31665039, "step": 1701, "time_per_iteration": 2.9173777103424072 }, { "auxiliary_loss_clip": 0.01649863, "auxiliary_loss_mlp": 0.01064524, "balance_loss_clip": 1.38818121, "balance_loss_mlp": 1.03271842, "epoch": 0.10232977604088381, "flos": 27028333382400.0, "grad_norm": 1.782857266774266, "language_loss": 0.78808391, "learning_rate": 3.945372199954019e-06, "loss": 0.81522781, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.31811523, "step": 1702, "time_per_iteration": 2.984719753265381 }, { "auxiliary_loss_clip": 0.01615345, "auxiliary_loss_mlp": 0.01057498, "balance_loss_clip": 1.36606944, "balance_loss_mlp": 1.0261699, "epoch": 0.10238989929355179, "flos": 20787354558720.0, "grad_norm": 1.8807014098569284, "language_loss": 0.96009856, "learning_rate": 3.945281759499494e-06, "loss": 0.98682702, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.31347656, "step": 1703, "time_per_iteration": 2.875232458114624 }, { "auxiliary_loss_clip": 0.01399946, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.24483776, "balance_loss_mlp": 1.00802314, "epoch": 0.10245002254621975, "flos": 57726484012800.0, "grad_norm": 0.8912655234827841, "language_loss": 0.55162191, "learning_rate": 3.94519124527969e-06, "loss": 0.57591045, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.20898438, "step": 1704, "time_per_iteration": 3.2951996326446533 }, { "auxiliary_loss_clip": 0.01629984, "auxiliary_loss_mlp": 0.01061435, "balance_loss_clip": 1.37667418, "balance_loss_mlp": 1.02791381, "epoch": 0.10251014579888772, "flos": 16808812905600.0, "grad_norm": 2.783030814298188, "language_loss": 0.85636693, "learning_rate": 3.945100657298039e-06, "loss": 0.88328117, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.33544922, "step": 1705, "time_per_iteration": 2.8939990997314453 }, { "auxiliary_loss_clip": 0.01395563, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.24225283, "balance_loss_mlp": 1.01001251, "epoch": 0.1025702690515557, "flos": 68595034170240.0, "grad_norm": 0.771639341507712, "language_loss": 0.60469592, "learning_rate": 3.9450099955579765e-06, "loss": 0.62892336, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.171875, "step": 1706, "time_per_iteration": 3.397503137588501 }, { "auxiliary_loss_clip": 0.01636447, "auxiliary_loss_mlp": 0.01051307, "balance_loss_clip": 1.38145471, "balance_loss_mlp": 1.0201931, "epoch": 0.10263039230422366, "flos": 14874872117760.0, "grad_norm": 2.0092338966768484, "language_loss": 0.87398982, "learning_rate": 3.94491926006294e-06, "loss": 0.90086734, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.31103516, "step": 1707, "time_per_iteration": 2.943096876144409 }, { "auxiliary_loss_clip": 0.01620514, "auxiliary_loss_mlp": 0.01054021, "balance_loss_clip": 1.37343407, "balance_loss_mlp": 1.02283573, "epoch": 0.10269051555689163, "flos": 25348452001920.0, "grad_norm": 1.4620615314260867, "language_loss": 0.73841333, "learning_rate": 3.944828450816369e-06, "loss": 0.76515871, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.31176758, "step": 1708, "time_per_iteration": 2.9941771030426025 }, { "auxiliary_loss_clip": 0.01630087, "auxiliary_loss_mlp": 0.01061027, "balance_loss_clip": 1.37949216, "balance_loss_mlp": 1.0273385, "epoch": 0.10275063880955959, "flos": 21078587208960.0, "grad_norm": 1.5984931920141832, "language_loss": 0.91603184, "learning_rate": 3.944737567821709e-06, "loss": 0.94294298, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.33691406, "step": 1709, "time_per_iteration": 2.946044445037842 }, { "auxiliary_loss_clip": 0.01621395, "auxiliary_loss_mlp": 0.01057118, "balance_loss_clip": 1.37321138, "balance_loss_mlp": 1.02433586, "epoch": 0.10281076206222757, "flos": 30377418370560.0, "grad_norm": 1.9167532283037152, "language_loss": 0.89144075, "learning_rate": 3.944646611082406e-06, "loss": 0.91822582, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.32763672, "step": 1710, "time_per_iteration": 3.063547134399414 }, { "auxiliary_loss_clip": 0.01615704, "auxiliary_loss_mlp": 0.01051974, "balance_loss_clip": 1.36817312, "balance_loss_mlp": 1.02124166, "epoch": 0.10287088531489554, "flos": 22428298149120.0, "grad_norm": 1.7118212119594622, "language_loss": 0.80330074, "learning_rate": 3.944555580601908e-06, "loss": 0.82997751, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.30688477, "step": 1711, "time_per_iteration": 3.0635430812835693 }, { "auxiliary_loss_clip": 0.01626165, "auxiliary_loss_mlp": 0.01051418, "balance_loss_clip": 1.37362933, "balance_loss_mlp": 1.01908851, "epoch": 0.1029310085675635, "flos": 25126091216640.0, "grad_norm": 1.814239451675464, "language_loss": 0.74798751, "learning_rate": 3.944464476383668e-06, "loss": 0.77476335, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.32324219, "step": 1712, "time_per_iteration": 2.9420578479766846 }, { "auxiliary_loss_clip": 0.01615244, "auxiliary_loss_mlp": 0.01060991, "balance_loss_clip": 1.3730433, "balance_loss_mlp": 1.02918637, "epoch": 0.10299113182023148, "flos": 19875307000320.0, "grad_norm": 1.9701052394331993, "language_loss": 0.87746233, "learning_rate": 3.94437329843114e-06, "loss": 0.90422463, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.31811523, "step": 1713, "time_per_iteration": 2.9421658515930176 }, { "auxiliary_loss_clip": 0.01620043, "auxiliary_loss_mlp": 0.01058478, "balance_loss_clip": 1.37233698, "balance_loss_mlp": 1.02726865, "epoch": 0.10305125507289944, "flos": 20456957894400.0, "grad_norm": 4.83932868525954, "language_loss": 0.73517847, "learning_rate": 3.944282046747782e-06, "loss": 0.76196373, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.31176758, "step": 1714, "time_per_iteration": 2.9591064453125 }, { "auxiliary_loss_clip": 0.01629074, "auxiliary_loss_mlp": 0.01055526, "balance_loss_clip": 1.37502635, "balance_loss_mlp": 1.02415037, "epoch": 0.10311137832556741, "flos": 26262128373120.0, "grad_norm": 2.6008867428651765, "language_loss": 0.91686141, "learning_rate": 3.944190721337053e-06, "loss": 0.94370747, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.3137207, "step": 1715, "time_per_iteration": 3.167482852935791 }, { "auxiliary_loss_clip": 0.0161421, "auxiliary_loss_mlp": 0.01057665, "balance_loss_clip": 1.36636162, "balance_loss_mlp": 1.02190268, "epoch": 0.10317150157823539, "flos": 35312555727360.0, "grad_norm": 1.755137452651067, "language_loss": 0.76998538, "learning_rate": 3.944099322202418e-06, "loss": 0.79670411, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.35766602, "step": 1716, "time_per_iteration": 3.0549886226654053 }, { "auxiliary_loss_clip": 0.01626232, "auxiliary_loss_mlp": 0.0106336, "balance_loss_clip": 1.37607586, "balance_loss_mlp": 1.03255641, "epoch": 0.10323162483090335, "flos": 25751159136000.0, "grad_norm": 1.9021524887456134, "language_loss": 0.86877179, "learning_rate": 3.944007849347342e-06, "loss": 0.89566767, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.30786133, "step": 1717, "time_per_iteration": 2.9758265018463135 }, { "auxiliary_loss_clip": 0.01620729, "auxiliary_loss_mlp": 0.01069751, "balance_loss_clip": 1.36914396, "balance_loss_mlp": 1.03849483, "epoch": 0.10329174808357132, "flos": 16298567585280.0, "grad_norm": 1.8529052019820862, "language_loss": 0.83917761, "learning_rate": 3.943916302775292e-06, "loss": 0.86608243, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.3125, "step": 1718, "time_per_iteration": 2.8774306774139404 }, { "auxiliary_loss_clip": 0.01623261, "auxiliary_loss_mlp": 0.01063341, "balance_loss_clip": 1.37648225, "balance_loss_mlp": 1.03175032, "epoch": 0.10335187133623928, "flos": 36701611660800.0, "grad_norm": 1.8804761346699725, "language_loss": 0.74158484, "learning_rate": 3.943824682489742e-06, "loss": 0.7684508, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.31567383, "step": 1719, "time_per_iteration": 3.0115675926208496 }, { "auxiliary_loss_clip": 0.01613678, "auxiliary_loss_mlp": 0.01062561, "balance_loss_clip": 1.36660457, "balance_loss_mlp": 1.03108954, "epoch": 0.10341199458890726, "flos": 14983812892800.0, "grad_norm": 1.8839162917606667, "language_loss": 0.94453859, "learning_rate": 3.9437329884941665e-06, "loss": 0.97130096, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.31494141, "step": 1720, "time_per_iteration": 2.893920660018921 }, { "auxiliary_loss_clip": 0.01621202, "auxiliary_loss_mlp": 0.01070525, "balance_loss_clip": 1.37060785, "balance_loss_mlp": 1.03626406, "epoch": 0.10347211784157523, "flos": 21041142497280.0, "grad_norm": 2.260984925709028, "language_loss": 0.80841869, "learning_rate": 3.943641220792039e-06, "loss": 0.83533597, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.34228516, "step": 1721, "time_per_iteration": 2.898102283477783 }, { "auxiliary_loss_clip": 0.01631119, "auxiliary_loss_mlp": 0.01074171, "balance_loss_clip": 1.37593186, "balance_loss_mlp": 1.04272306, "epoch": 0.1035322410942432, "flos": 19801593941760.0, "grad_norm": 1.7245980568394772, "language_loss": 0.81792909, "learning_rate": 3.9435493793868434e-06, "loss": 0.84498203, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31420898, "step": 1722, "time_per_iteration": 2.916135311126709 }, { "auxiliary_loss_clip": 0.013979, "auxiliary_loss_mlp": 0.01050343, "balance_loss_clip": 1.24428964, "balance_loss_mlp": 1.02869463, "epoch": 0.10359236434691117, "flos": 52725506192640.0, "grad_norm": 0.9778290645399514, "language_loss": 0.67279178, "learning_rate": 3.943457464282059e-06, "loss": 0.69727421, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.21679688, "step": 1723, "time_per_iteration": 4.5575971603393555 }, { "auxiliary_loss_clip": 0.01625279, "auxiliary_loss_mlp": 0.01064348, "balance_loss_clip": 1.37117147, "balance_loss_mlp": 1.03204262, "epoch": 0.10365248759957914, "flos": 18414483534720.0, "grad_norm": 2.6174777984540283, "language_loss": 0.80086613, "learning_rate": 3.9433654754811745e-06, "loss": 0.82776248, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.32324219, "step": 1724, "time_per_iteration": 2.8233587741851807 }, { "auxiliary_loss_clip": 0.01626068, "auxiliary_loss_mlp": 0.0105828, "balance_loss_clip": 1.37080312, "balance_loss_mlp": 1.02699983, "epoch": 0.1037126108522471, "flos": 47569166432640.0, "grad_norm": 1.8249106344009134, "language_loss": 0.75916672, "learning_rate": 3.943273412987676e-06, "loss": 0.78601015, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.31274414, "step": 1725, "time_per_iteration": 3.142331838607788 }, { "auxiliary_loss_clip": 0.01621412, "auxiliary_loss_mlp": 0.01057339, "balance_loss_clip": 1.37257409, "balance_loss_mlp": 1.02603436, "epoch": 0.10377273410491508, "flos": 22825983110400.0, "grad_norm": 2.0201606291783634, "language_loss": 0.76212686, "learning_rate": 3.943181276805054e-06, "loss": 0.78891438, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.31298828, "step": 1726, "time_per_iteration": 2.8910768032073975 }, { "auxiliary_loss_clip": 0.01627055, "auxiliary_loss_mlp": 0.01067697, "balance_loss_clip": 1.3719933, "balance_loss_mlp": 1.0334363, "epoch": 0.10383285735758305, "flos": 26149341790080.0, "grad_norm": 2.067950945008, "language_loss": 0.76024151, "learning_rate": 3.9430890669368035e-06, "loss": 0.78718907, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.34228516, "step": 1727, "time_per_iteration": 2.9034175872802734 }, { "auxiliary_loss_clip": 0.01615915, "auxiliary_loss_mlp": 0.01059528, "balance_loss_clip": 1.36618042, "balance_loss_mlp": 1.02741313, "epoch": 0.10389298061025101, "flos": 17100498003840.0, "grad_norm": 1.9939065911736176, "language_loss": 0.86376309, "learning_rate": 3.942996783386422e-06, "loss": 0.89051759, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.32104492, "step": 1728, "time_per_iteration": 4.333642482757568 }, { "auxiliary_loss_clip": 0.01604985, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.35644317, "balance_loss_mlp": 1.02372575, "epoch": 0.10395310386291898, "flos": 20786087704320.0, "grad_norm": 1.8805302763898075, "language_loss": 0.71632564, "learning_rate": 3.942904426157406e-06, "loss": 0.74294484, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.33227539, "step": 1729, "time_per_iteration": 2.8833014965057373 }, { "auxiliary_loss_clip": 0.01635032, "auxiliary_loss_mlp": 0.01054956, "balance_loss_clip": 1.37983799, "balance_loss_mlp": 1.02126718, "epoch": 0.10401322711558696, "flos": 12827511319680.0, "grad_norm": 3.6652999697063984, "language_loss": 0.83303535, "learning_rate": 3.9428119952532605e-06, "loss": 0.85993516, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.33666992, "step": 1730, "time_per_iteration": 2.815133571624756 }, { "auxiliary_loss_clip": 0.01614463, "auxiliary_loss_mlp": 0.01062061, "balance_loss_clip": 1.36434507, "balance_loss_mlp": 1.03044677, "epoch": 0.10407335036825492, "flos": 23194910361600.0, "grad_norm": 1.9724207556201654, "language_loss": 0.7723, "learning_rate": 3.942719490677489e-06, "loss": 0.79906535, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.31616211, "step": 1731, "time_per_iteration": 4.340686321258545 }, { "auxiliary_loss_clip": 0.01610036, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.36427915, "balance_loss_mlp": 1.02352917, "epoch": 0.10413347362092289, "flos": 26115335683200.0, "grad_norm": 1.8204582053592702, "language_loss": 0.84059095, "learning_rate": 3.9426269124336e-06, "loss": 0.86724234, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.31518555, "step": 1732, "time_per_iteration": 2.9108691215515137 }, { "auxiliary_loss_clip": 0.01634259, "auxiliary_loss_mlp": 0.01062629, "balance_loss_clip": 1.38303089, "balance_loss_mlp": 1.02903569, "epoch": 0.10419359687359087, "flos": 12648748538880.0, "grad_norm": 1.9057474517808413, "language_loss": 0.85419464, "learning_rate": 3.942534260525104e-06, "loss": 0.88116348, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.3359375, "step": 1733, "time_per_iteration": 2.8295109272003174 }, { "auxiliary_loss_clip": 0.01638901, "auxiliary_loss_mlp": 0.01059647, "balance_loss_clip": 1.38396883, "balance_loss_mlp": 1.02433693, "epoch": 0.10425372012625883, "flos": 12131309295360.0, "grad_norm": 2.210485712887533, "language_loss": 0.77781367, "learning_rate": 3.942441534955514e-06, "loss": 0.8047992, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.35327148, "step": 1734, "time_per_iteration": 2.8586843013763428 }, { "auxiliary_loss_clip": 0.01620967, "auxiliary_loss_mlp": 0.01057149, "balance_loss_clip": 1.37493849, "balance_loss_mlp": 1.02574956, "epoch": 0.1043138433789268, "flos": 25348406757120.0, "grad_norm": 1.6602544486123059, "language_loss": 0.76127332, "learning_rate": 3.9423487357283465e-06, "loss": 0.78805453, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.3137207, "step": 1735, "time_per_iteration": 2.87495493888855 }, { "auxiliary_loss_clip": 0.01642019, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.38703966, "balance_loss_mlp": 1.02204859, "epoch": 0.10437396663159478, "flos": 29178617397120.0, "grad_norm": 1.7367010196584898, "language_loss": 0.79951179, "learning_rate": 3.94225586284712e-06, "loss": 0.8264755, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.32299805, "step": 1736, "time_per_iteration": 2.9355287551879883 }, { "auxiliary_loss_clip": 0.01626774, "auxiliary_loss_mlp": 0.01064528, "balance_loss_clip": 1.37747741, "balance_loss_mlp": 1.02852678, "epoch": 0.10443408988426274, "flos": 25091406437760.0, "grad_norm": 3.5098811020067897, "language_loss": 0.72250807, "learning_rate": 3.942162916315356e-06, "loss": 0.74942112, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.35986328, "step": 1737, "time_per_iteration": 2.8991878032684326 }, { "auxiliary_loss_clip": 0.01649648, "auxiliary_loss_mlp": 0.01062488, "balance_loss_clip": 1.3893292, "balance_loss_mlp": 1.02689159, "epoch": 0.1044942131369307, "flos": 26770654391040.0, "grad_norm": 2.716988155793134, "language_loss": 0.83254182, "learning_rate": 3.942069896136581e-06, "loss": 0.85966313, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.35620117, "step": 1738, "time_per_iteration": 2.9242448806762695 }, { "auxiliary_loss_clip": 0.01645684, "auxiliary_loss_mlp": 0.01062972, "balance_loss_clip": 1.38940787, "balance_loss_mlp": 1.02892566, "epoch": 0.10455433638959867, "flos": 18451837756800.0, "grad_norm": 2.2672206113952726, "language_loss": 0.76530719, "learning_rate": 3.9419768023143196e-06, "loss": 0.79239368, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 2.56445312, "router_z_loss_mlp": 0.34057617, "step": 1739, "time_per_iteration": 2.8779656887054443 }, { "auxiliary_loss_clip": 0.01637994, "auxiliary_loss_mlp": 0.01051227, "balance_loss_clip": 1.38523638, "balance_loss_mlp": 1.01844454, "epoch": 0.10461445964226665, "flos": 23228644999680.0, "grad_norm": 2.064501779280056, "language_loss": 0.78779864, "learning_rate": 3.941883634852104e-06, "loss": 0.81469095, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.32788086, "step": 1740, "time_per_iteration": 2.9048547744750977 }, { "auxiliary_loss_clip": 0.01647777, "auxiliary_loss_mlp": 0.01055398, "balance_loss_clip": 1.39894009, "balance_loss_mlp": 1.02061248, "epoch": 0.10467458289493461, "flos": 24353959138560.0, "grad_norm": 1.9460175288987018, "language_loss": 0.87532246, "learning_rate": 3.941790393753467e-06, "loss": 0.90235424, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.34790039, "step": 1741, "time_per_iteration": 2.930488348007202 }, { "auxiliary_loss_clip": 0.01672172, "auxiliary_loss_mlp": 0.01055087, "balance_loss_clip": 1.41378522, "balance_loss_mlp": 1.02177966, "epoch": 0.10473470614760258, "flos": 21297826103040.0, "grad_norm": 2.2914045243327057, "language_loss": 0.77325141, "learning_rate": 3.941697079021942e-06, "loss": 0.800524, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 2.58398438, "router_z_loss_mlp": 0.33276367, "step": 1742, "time_per_iteration": 2.8465445041656494 }, { "auxiliary_loss_clip": 0.01643238, "auxiliary_loss_mlp": 0.01059224, "balance_loss_clip": 1.39175677, "balance_loss_mlp": 1.02641714, "epoch": 0.10479482940027056, "flos": 21696642184320.0, "grad_norm": 2.2064236408363382, "language_loss": 0.88134134, "learning_rate": 3.94160369066107e-06, "loss": 0.90836596, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.328125, "step": 1743, "time_per_iteration": 2.9496910572052 }, { "auxiliary_loss_clip": 0.01626785, "auxiliary_loss_mlp": 0.01057983, "balance_loss_clip": 1.37817609, "balance_loss_mlp": 1.0234834, "epoch": 0.10485495265293852, "flos": 21582045809280.0, "grad_norm": 2.28091701033822, "language_loss": 0.77172786, "learning_rate": 3.941510228674391e-06, "loss": 0.79857552, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.3449707, "step": 1744, "time_per_iteration": 2.9551281929016113 }, { "auxiliary_loss_clip": 0.01646057, "auxiliary_loss_mlp": 0.01052004, "balance_loss_clip": 1.3967216, "balance_loss_mlp": 1.0191735, "epoch": 0.10491507590560649, "flos": 37976116464000.0, "grad_norm": 1.9689631735422777, "language_loss": 0.81944823, "learning_rate": 3.941416693065451e-06, "loss": 0.84642887, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.32836914, "step": 1745, "time_per_iteration": 3.014305353164673 }, { "auxiliary_loss_clip": 0.01635324, "auxiliary_loss_mlp": 0.01054086, "balance_loss_clip": 1.38230014, "balance_loss_mlp": 1.02321076, "epoch": 0.10497519915827447, "flos": 26407563719040.0, "grad_norm": 2.0253298486749847, "language_loss": 0.84448278, "learning_rate": 3.941323083837794e-06, "loss": 0.87137681, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.30834961, "step": 1746, "time_per_iteration": 2.9951868057250977 }, { "auxiliary_loss_clip": 0.01636734, "auxiliary_loss_mlp": 0.01060982, "balance_loss_clip": 1.38903379, "balance_loss_mlp": 1.02917719, "epoch": 0.10503532241094243, "flos": 40677167157120.0, "grad_norm": 1.5152703881523422, "language_loss": 0.71764612, "learning_rate": 3.941229400994971e-06, "loss": 0.7446233, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.31811523, "step": 1747, "time_per_iteration": 3.1420838832855225 }, { "auxiliary_loss_clip": 0.01679089, "auxiliary_loss_mlp": 0.01060797, "balance_loss_clip": 1.41551018, "balance_loss_mlp": 1.02572584, "epoch": 0.1050954456636104, "flos": 29801558810880.0, "grad_norm": 2.1219481368241087, "language_loss": 0.85959506, "learning_rate": 3.941135644540535e-06, "loss": 0.88699389, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 2.63671875, "router_z_loss_mlp": 0.35058594, "step": 1748, "time_per_iteration": 2.95402193069458 }, { "auxiliary_loss_clip": 0.01636631, "auxiliary_loss_mlp": 0.01058852, "balance_loss_clip": 1.38253999, "balance_loss_mlp": 1.02342248, "epoch": 0.10515556891627838, "flos": 23958626906880.0, "grad_norm": 1.7758675812088518, "language_loss": 0.73059434, "learning_rate": 3.941041814478041e-06, "loss": 0.75754917, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.35400391, "step": 1749, "time_per_iteration": 2.9274566173553467 }, { "auxiliary_loss_clip": 0.0162871, "auxiliary_loss_mlp": 0.01053428, "balance_loss_clip": 1.38015485, "balance_loss_mlp": 1.01985884, "epoch": 0.10521569216894634, "flos": 18268686230400.0, "grad_norm": 2.0357412186842625, "language_loss": 0.83720255, "learning_rate": 3.940947910811047e-06, "loss": 0.86402392, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.33569336, "step": 1750, "time_per_iteration": 3.0262739658355713 }, { "auxiliary_loss_clip": 0.01641418, "auxiliary_loss_mlp": 0.01059729, "balance_loss_clip": 1.38756061, "balance_loss_mlp": 1.02565897, "epoch": 0.10527581542161431, "flos": 15638860131840.0, "grad_norm": 2.465969131162752, "language_loss": 0.94313425, "learning_rate": 3.940853933543114e-06, "loss": 0.9701457, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.34082031, "step": 1751, "time_per_iteration": 2.8915250301361084 }, { "auxiliary_loss_clip": 0.01645281, "auxiliary_loss_mlp": 0.01058645, "balance_loss_clip": 1.3952477, "balance_loss_mlp": 1.02650583, "epoch": 0.10533593867428227, "flos": 18305768983680.0, "grad_norm": 1.9159637713035294, "language_loss": 0.79866433, "learning_rate": 3.940759882677805e-06, "loss": 0.82570356, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.32128906, "step": 1752, "time_per_iteration": 2.854999303817749 }, { "auxiliary_loss_clip": 0.01636921, "auxiliary_loss_mlp": 0.01063659, "balance_loss_clip": 1.38860869, "balance_loss_mlp": 1.03085268, "epoch": 0.10539606192695025, "flos": 29035172822400.0, "grad_norm": 1.7823232322914189, "language_loss": 0.77164829, "learning_rate": 3.940665758218686e-06, "loss": 0.79865408, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.328125, "step": 1753, "time_per_iteration": 2.9355738162994385 }, { "auxiliary_loss_clip": 0.01663179, "auxiliary_loss_mlp": 0.01068393, "balance_loss_clip": 1.4014132, "balance_loss_mlp": 1.03327417, "epoch": 0.10545618517961822, "flos": 19977506300160.0, "grad_norm": 1.7554161544906135, "language_loss": 0.85043174, "learning_rate": 3.940571560169328e-06, "loss": 0.87774748, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.35131836, "step": 1754, "time_per_iteration": 2.8851380348205566 }, { "auxiliary_loss_clip": 0.01666402, "auxiliary_loss_mlp": 0.01064036, "balance_loss_clip": 1.41022062, "balance_loss_mlp": 1.03010941, "epoch": 0.10551630843228618, "flos": 16151955874560.0, "grad_norm": 2.190111071812958, "language_loss": 0.70758605, "learning_rate": 3.940477288533302e-06, "loss": 0.7348904, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 2.56054688, "router_z_loss_mlp": 0.33935547, "step": 1755, "time_per_iteration": 2.8315024375915527 }, { "auxiliary_loss_clip": 0.01665246, "auxiliary_loss_mlp": 0.01068477, "balance_loss_clip": 1.40214801, "balance_loss_mlp": 1.03333426, "epoch": 0.10557643168495416, "flos": 23450417602560.0, "grad_norm": 2.1960360692476946, "language_loss": 0.78195035, "learning_rate": 3.940382943314182e-06, "loss": 0.80928761, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.3515625, "step": 1756, "time_per_iteration": 2.9532480239868164 }, { "auxiliary_loss_clip": 0.0166622, "auxiliary_loss_mlp": 0.01078873, "balance_loss_clip": 1.40685081, "balance_loss_mlp": 1.04499412, "epoch": 0.10563655493762213, "flos": 21809112053760.0, "grad_norm": 1.7558333036665354, "language_loss": 0.81170762, "learning_rate": 3.940288524515547e-06, "loss": 0.83915854, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.33837891, "step": 1757, "time_per_iteration": 2.820737838745117 }, { "auxiliary_loss_clip": 0.01657761, "auxiliary_loss_mlp": 0.01060706, "balance_loss_clip": 1.39970267, "balance_loss_mlp": 1.02758968, "epoch": 0.10569667819029009, "flos": 53820234846720.0, "grad_norm": 1.5591721674401517, "language_loss": 0.79811817, "learning_rate": 3.940194032140976e-06, "loss": 0.82530284, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.33105469, "step": 1758, "time_per_iteration": 4.524232864379883 }, { "auxiliary_loss_clip": 0.01668243, "auxiliary_loss_mlp": 0.01062286, "balance_loss_clip": 1.40790796, "balance_loss_mlp": 1.02955079, "epoch": 0.10575680144295807, "flos": 22934697661440.0, "grad_norm": 1.83881742724694, "language_loss": 0.92614478, "learning_rate": 3.940099466194054e-06, "loss": 0.95345008, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.32714844, "step": 1759, "time_per_iteration": 2.896393060684204 }, { "auxiliary_loss_clip": 0.01651909, "auxiliary_loss_mlp": 0.01063633, "balance_loss_clip": 1.39297485, "balance_loss_mlp": 1.02925253, "epoch": 0.10581692469562604, "flos": 14144663986560.0, "grad_norm": 2.984907278519586, "language_loss": 0.77870953, "learning_rate": 3.940004826678365e-06, "loss": 0.80586493, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.34375, "step": 1760, "time_per_iteration": 2.832669496536255 }, { "auxiliary_loss_clip": 0.01659411, "auxiliary_loss_mlp": 0.01063973, "balance_loss_clip": 1.39653373, "balance_loss_mlp": 1.02704191, "epoch": 0.105877047948294, "flos": 25969900337280.0, "grad_norm": 2.6940677493604657, "language_loss": 0.91170448, "learning_rate": 3.939910113597498e-06, "loss": 0.93893838, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 2.62890625, "router_z_loss_mlp": 0.36914062, "step": 1761, "time_per_iteration": 2.9727752208709717 }, { "auxiliary_loss_clip": 0.0165895, "auxiliary_loss_mlp": 0.01059936, "balance_loss_clip": 1.40139043, "balance_loss_mlp": 1.02612793, "epoch": 0.10593717120096197, "flos": 30677066553600.0, "grad_norm": 1.9657485396386047, "language_loss": 0.79839504, "learning_rate": 3.9398153269550464e-06, "loss": 0.82558388, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.33789062, "step": 1762, "time_per_iteration": 2.939302444458008 }, { "auxiliary_loss_clip": 0.01369406, "auxiliary_loss_mlp": 0.01050123, "balance_loss_clip": 1.21870828, "balance_loss_mlp": 1.03038204, "epoch": 0.10599729445362994, "flos": 66471851790720.0, "grad_norm": 0.770817293889527, "language_loss": 0.60645485, "learning_rate": 3.939720466754602e-06, "loss": 0.63065016, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.19726562, "step": 1763, "time_per_iteration": 4.973220586776733 }, { "auxiliary_loss_clip": 0.01664396, "auxiliary_loss_mlp": 0.01060981, "balance_loss_clip": 1.40468216, "balance_loss_mlp": 1.02443099, "epoch": 0.10605741770629791, "flos": 23957902990080.0, "grad_norm": 1.8800996476986331, "language_loss": 0.81792796, "learning_rate": 3.939625532999763e-06, "loss": 0.8451817, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 2.59765625, "router_z_loss_mlp": 0.36547852, "step": 1764, "time_per_iteration": 3.015533685684204 }, { "auxiliary_loss_clip": 0.01650475, "auxiliary_loss_mlp": 0.01059442, "balance_loss_clip": 1.39384472, "balance_loss_mlp": 1.02358377, "epoch": 0.10611754095896588, "flos": 19396081630080.0, "grad_norm": 2.54635969385903, "language_loss": 0.81169343, "learning_rate": 3.9395305256941314e-06, "loss": 0.83879262, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.3581543, "step": 1765, "time_per_iteration": 3.051846981048584 }, { "auxiliary_loss_clip": 0.0164507, "auxiliary_loss_mlp": 0.01061545, "balance_loss_clip": 1.39066982, "balance_loss_mlp": 1.02914357, "epoch": 0.10617766421163385, "flos": 22248178024320.0, "grad_norm": 1.9517843127258139, "language_loss": 0.77885103, "learning_rate": 3.939435444841306e-06, "loss": 0.80591714, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.32373047, "step": 1766, "time_per_iteration": 5.675248861312866 }, { "auxiliary_loss_clip": 0.01656229, "auxiliary_loss_mlp": 0.01060391, "balance_loss_clip": 1.40301871, "balance_loss_mlp": 1.026178, "epoch": 0.10623778746430182, "flos": 28416574909440.0, "grad_norm": 9.768060537263501, "language_loss": 0.78029341, "learning_rate": 3.939340290444895e-06, "loss": 0.80745959, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.34228516, "step": 1767, "time_per_iteration": 2.9640796184539795 }, { "auxiliary_loss_clip": 0.01373993, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.22112584, "balance_loss_mlp": 1.00915802, "epoch": 0.10629791071696978, "flos": 64265889409920.0, "grad_norm": 0.6946962106775932, "language_loss": 0.58051407, "learning_rate": 3.939245062508506e-06, "loss": 0.60461068, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.265625, "step": 1768, "time_per_iteration": 3.4650774002075195 }, { "auxiliary_loss_clip": 0.01660368, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.40369678, "balance_loss_mlp": 1.02675796, "epoch": 0.10635803396963776, "flos": 22757880407040.0, "grad_norm": 1.4344345373321545, "language_loss": 0.87348735, "learning_rate": 3.939149761035749e-06, "loss": 0.90068501, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.32641602, "step": 1769, "time_per_iteration": 3.1858532428741455 }, { "auxiliary_loss_clip": 0.01664356, "auxiliary_loss_mlp": 0.01071507, "balance_loss_clip": 1.40534699, "balance_loss_mlp": 1.0374136, "epoch": 0.10641815722230573, "flos": 31408360560000.0, "grad_norm": 2.071546057868812, "language_loss": 0.62956864, "learning_rate": 3.9390543860302395e-06, "loss": 0.65692729, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 2.59179688, "router_z_loss_mlp": 0.34082031, "step": 1770, "time_per_iteration": 2.957024097442627 }, { "auxiliary_loss_clip": 0.01387368, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.22880912, "balance_loss_mlp": 1.0023222, "epoch": 0.1064782804749737, "flos": 58575342568320.0, "grad_norm": 0.9005741925445382, "language_loss": 0.57217085, "learning_rate": 3.9389589374955925e-06, "loss": 0.596331, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.26367188, "step": 1771, "time_per_iteration": 3.247133255004883 }, { "auxiliary_loss_clip": 0.01669666, "auxiliary_loss_mlp": 0.01081616, "balance_loss_clip": 1.41204107, "balance_loss_mlp": 1.0469501, "epoch": 0.10653840372764166, "flos": 23998379103360.0, "grad_norm": 1.9908329289812166, "language_loss": 0.89470625, "learning_rate": 3.938863415435429e-06, "loss": 0.92221904, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 0.34692383, "step": 1772, "time_per_iteration": 2.8781545162200928 }, { "auxiliary_loss_clip": 0.01671037, "auxiliary_loss_mlp": 0.01081071, "balance_loss_clip": 1.40657508, "balance_loss_mlp": 1.04149318, "epoch": 0.10659852698030964, "flos": 18303371009280.0, "grad_norm": 2.7256897885723337, "language_loss": 0.78747731, "learning_rate": 3.93876781985337e-06, "loss": 0.81499839, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 2.64648438, "router_z_loss_mlp": 0.39575195, "step": 1773, "time_per_iteration": 2.8259241580963135 }, { "auxiliary_loss_clip": 0.01671059, "auxiliary_loss_mlp": 0.0107467, "balance_loss_clip": 1.41134799, "balance_loss_mlp": 1.04248369, "epoch": 0.1066586502329776, "flos": 32173253470080.0, "grad_norm": 2.075994853031606, "language_loss": 0.85227686, "learning_rate": 3.938672150753041e-06, "loss": 0.87973416, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 2.59960938, "router_z_loss_mlp": 0.32202148, "step": 1774, "time_per_iteration": 2.981714963912964 }, { "auxiliary_loss_clip": 0.0167782, "auxiliary_loss_mlp": 0.0108537, "balance_loss_clip": 1.41643429, "balance_loss_mlp": 1.05010796, "epoch": 0.10671877348564557, "flos": 17794075829760.0, "grad_norm": 2.7574443853371466, "language_loss": 0.78517091, "learning_rate": 3.9385764081380704e-06, "loss": 0.81280285, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.35253906, "step": 1775, "time_per_iteration": 2.8423469066619873 }, { "auxiliary_loss_clip": 0.01375899, "auxiliary_loss_mlp": 0.01050615, "balance_loss_clip": 1.21844709, "balance_loss_mlp": 1.03058755, "epoch": 0.10677889673831355, "flos": 63542603733120.0, "grad_norm": 0.8255468392421951, "language_loss": 0.57552344, "learning_rate": 3.9384805920120876e-06, "loss": 0.59978855, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.20019531, "step": 1776, "time_per_iteration": 3.4358584880828857 }, { "auxiliary_loss_clip": 0.01651181, "auxiliary_loss_mlp": 0.01076217, "balance_loss_clip": 1.39633441, "balance_loss_mlp": 1.04064488, "epoch": 0.10683901999098151, "flos": 22027762765440.0, "grad_norm": 1.5743561494173086, "language_loss": 0.84229231, "learning_rate": 3.938384702378727e-06, "loss": 0.86956632, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.35571289, "step": 1777, "time_per_iteration": 2.8997440338134766 }, { "auxiliary_loss_clip": 0.0164965, "auxiliary_loss_mlp": 0.01070178, "balance_loss_clip": 1.39817154, "balance_loss_mlp": 1.03665638, "epoch": 0.10689914324364948, "flos": 25053011585280.0, "grad_norm": 2.0149006574604034, "language_loss": 0.88569981, "learning_rate": 3.938288739241625e-06, "loss": 0.91289806, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.33520508, "step": 1778, "time_per_iteration": 2.9007174968719482 }, { "auxiliary_loss_clip": 0.01661561, "auxiliary_loss_mlp": 0.01069652, "balance_loss_clip": 1.40730691, "balance_loss_mlp": 1.03412795, "epoch": 0.10695926649631746, "flos": 16443686217600.0, "grad_norm": 2.0009593711947335, "language_loss": 0.85251212, "learning_rate": 3.938192702604417e-06, "loss": 0.87982428, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.35522461, "step": 1779, "time_per_iteration": 2.9164516925811768 }, { "auxiliary_loss_clip": 0.01647789, "auxiliary_loss_mlp": 0.01061059, "balance_loss_clip": 1.39398301, "balance_loss_mlp": 1.02582026, "epoch": 0.10701938974898542, "flos": 16987077993600.0, "grad_norm": 2.2929798521459848, "language_loss": 0.68710697, "learning_rate": 3.9380965924707495e-06, "loss": 0.71419543, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.35229492, "step": 1780, "time_per_iteration": 2.84983229637146 }, { "auxiliary_loss_clip": 0.016565, "auxiliary_loss_mlp": 0.01064065, "balance_loss_clip": 1.40190649, "balance_loss_mlp": 1.02939916, "epoch": 0.10707951300165339, "flos": 15896584368000.0, "grad_norm": 2.4682664356387902, "language_loss": 0.92984635, "learning_rate": 3.938000408844265e-06, "loss": 0.95705199, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.34643555, "step": 1781, "time_per_iteration": 2.8994290828704834 }, { "auxiliary_loss_clip": 0.01656836, "auxiliary_loss_mlp": 0.01065303, "balance_loss_clip": 1.4015193, "balance_loss_mlp": 1.03097105, "epoch": 0.10713963625432135, "flos": 14255459798400.0, "grad_norm": 1.798870937743681, "language_loss": 0.80067283, "learning_rate": 3.9379041517286105e-06, "loss": 0.82789421, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.34326172, "step": 1782, "time_per_iteration": 2.835123300552368 }, { "auxiliary_loss_clip": 0.01675397, "auxiliary_loss_mlp": 0.01061353, "balance_loss_clip": 1.41335559, "balance_loss_mlp": 1.02618575, "epoch": 0.10719975950698933, "flos": 16763676577920.0, "grad_norm": 1.77047306521287, "language_loss": 0.8050667, "learning_rate": 3.937807821127436e-06, "loss": 0.83243418, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.3515625, "step": 1783, "time_per_iteration": 2.858966112136841 }, { "auxiliary_loss_clip": 0.01667612, "auxiliary_loss_mlp": 0.0106038, "balance_loss_clip": 1.40603566, "balance_loss_mlp": 1.02516556, "epoch": 0.1072598827596573, "flos": 22720797653760.0, "grad_norm": 1.8483117221906926, "language_loss": 0.88283324, "learning_rate": 3.937711417044395e-06, "loss": 0.9101131, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.35229492, "step": 1784, "time_per_iteration": 2.864152193069458 }, { "auxiliary_loss_clip": 0.01658466, "auxiliary_loss_mlp": 0.01057425, "balance_loss_clip": 1.39988995, "balance_loss_mlp": 1.02218699, "epoch": 0.10732000601232526, "flos": 23268668664960.0, "grad_norm": 3.197251053794145, "language_loss": 1.03224194, "learning_rate": 3.937614939483143e-06, "loss": 1.0594008, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 2.58789062, "router_z_loss_mlp": 0.35253906, "step": 1785, "time_per_iteration": 2.920448064804077 }, { "auxiliary_loss_clip": 0.01637775, "auxiliary_loss_mlp": 0.01061951, "balance_loss_clip": 1.38972759, "balance_loss_mlp": 1.02418578, "epoch": 0.10738012926499324, "flos": 24217753731840.0, "grad_norm": 1.3405170509890478, "language_loss": 0.86092997, "learning_rate": 3.937518388447339e-06, "loss": 0.88792717, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.37744141, "step": 1786, "time_per_iteration": 2.94661021232605 }, { "auxiliary_loss_clip": 0.01677895, "auxiliary_loss_mlp": 0.01056498, "balance_loss_clip": 1.41478801, "balance_loss_mlp": 1.01827919, "epoch": 0.1074402525176612, "flos": 20932835149440.0, "grad_norm": 1.6708025748495405, "language_loss": 0.79802972, "learning_rate": 3.937421763940642e-06, "loss": 0.82537365, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.38256836, "step": 1787, "time_per_iteration": 2.86024808883667 }, { "auxiliary_loss_clip": 0.01668039, "auxiliary_loss_mlp": 0.01055538, "balance_loss_clip": 1.40639114, "balance_loss_mlp": 1.02106249, "epoch": 0.10750037577032917, "flos": 16955877064320.0, "grad_norm": 1.7657097803914457, "language_loss": 0.8471064, "learning_rate": 3.937325065966719e-06, "loss": 0.87434214, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.34472656, "step": 1788, "time_per_iteration": 2.9009547233581543 }, { "auxiliary_loss_clip": 0.01664713, "auxiliary_loss_mlp": 0.01061717, "balance_loss_clip": 1.40869713, "balance_loss_mlp": 1.02695549, "epoch": 0.10756049902299715, "flos": 20276204342400.0, "grad_norm": 1.801512337769345, "language_loss": 0.79350221, "learning_rate": 3.9372282945292335e-06, "loss": 0.82076651, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.34765625, "step": 1789, "time_per_iteration": 2.893228769302368 }, { "auxiliary_loss_clip": 0.01652635, "auxiliary_loss_mlp": 0.01056626, "balance_loss_clip": 1.39461446, "balance_loss_mlp": 1.02043366, "epoch": 0.10762062227566511, "flos": 23596984068480.0, "grad_norm": 2.6600240657684013, "language_loss": 0.76599503, "learning_rate": 3.937131449631859e-06, "loss": 0.7930876, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.36206055, "step": 1790, "time_per_iteration": 2.9434897899627686 }, { "auxiliary_loss_clip": 0.01667256, "auxiliary_loss_mlp": 0.01067609, "balance_loss_clip": 1.40683675, "balance_loss_mlp": 1.0310353, "epoch": 0.10768074552833308, "flos": 24319817297280.0, "grad_norm": 32.03016098518442, "language_loss": 0.80186504, "learning_rate": 3.9370345312782645e-06, "loss": 0.82921368, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 2.60351562, "router_z_loss_mlp": 0.36547852, "step": 1791, "time_per_iteration": 2.9553351402282715 }, { "auxiliary_loss_clip": 0.01642211, "auxiliary_loss_mlp": 0.01058843, "balance_loss_clip": 1.39141226, "balance_loss_mlp": 1.02436757, "epoch": 0.10774086878100106, "flos": 25310509597440.0, "grad_norm": 2.4051165939131907, "language_loss": 0.72505844, "learning_rate": 3.936937539472126e-06, "loss": 0.752069, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.34448242, "step": 1792, "time_per_iteration": 2.8725008964538574 }, { "auxiliary_loss_clip": 0.0164798, "auxiliary_loss_mlp": 0.01051931, "balance_loss_clip": 1.39233112, "balance_loss_mlp": 1.0179565, "epoch": 0.10780099203366902, "flos": 22064076357120.0, "grad_norm": 1.7055465024459227, "language_loss": 0.78020853, "learning_rate": 3.9368404742171236e-06, "loss": 0.8072077, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.34008789, "step": 1793, "time_per_iteration": 4.266247510910034 }, { "auxiliary_loss_clip": 0.01653715, "auxiliary_loss_mlp": 0.0106525, "balance_loss_clip": 1.40382361, "balance_loss_mlp": 1.02820039, "epoch": 0.10786111528633699, "flos": 22757744672640.0, "grad_norm": 1.4613807417286013, "language_loss": 0.85996896, "learning_rate": 3.936743335516936e-06, "loss": 0.88715863, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.37011719, "step": 1794, "time_per_iteration": 2.9946212768554688 }, { "auxiliary_loss_clip": 0.01683414, "auxiliary_loss_mlp": 0.01062161, "balance_loss_clip": 1.41548634, "balance_loss_mlp": 1.0241096, "epoch": 0.10792123853900495, "flos": 20861067617280.0, "grad_norm": 1.583120607367775, "language_loss": 0.76802135, "learning_rate": 3.936646123375246e-06, "loss": 0.79547715, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 2.6796875, "router_z_loss_mlp": 0.38037109, "step": 1795, "time_per_iteration": 2.9647886753082275 }, { "auxiliary_loss_clip": 0.01662046, "auxiliary_loss_mlp": 0.01061752, "balance_loss_clip": 1.40165424, "balance_loss_mlp": 1.02732396, "epoch": 0.10798136179167293, "flos": 17757626503680.0, "grad_norm": 2.29474512237384, "language_loss": 0.83238059, "learning_rate": 3.936548837795741e-06, "loss": 0.85961854, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 2.6015625, "router_z_loss_mlp": 0.34399414, "step": 1796, "time_per_iteration": 3.0808980464935303 }, { "auxiliary_loss_clip": 0.01690711, "auxiliary_loss_mlp": 0.01073733, "balance_loss_clip": 1.42690337, "balance_loss_mlp": 1.03873348, "epoch": 0.1080414850443409, "flos": 13597562136960.0, "grad_norm": 2.340393849571511, "language_loss": 0.76884443, "learning_rate": 3.936451478782111e-06, "loss": 0.79648888, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.35009766, "step": 1797, "time_per_iteration": 4.293665170669556 }, { "auxiliary_loss_clip": 0.01640993, "auxiliary_loss_mlp": 0.01058211, "balance_loss_clip": 1.39056301, "balance_loss_mlp": 1.0243082, "epoch": 0.10810160829700886, "flos": 16262118259200.0, "grad_norm": 1.8488051374576389, "language_loss": 0.82786107, "learning_rate": 3.936354046338046e-06, "loss": 0.85485315, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.33911133, "step": 1798, "time_per_iteration": 2.939959764480591 }, { "auxiliary_loss_clip": 0.01658255, "auxiliary_loss_mlp": 0.01056321, "balance_loss_clip": 1.40425158, "balance_loss_mlp": 1.02308583, "epoch": 0.10816173154967684, "flos": 15166195257600.0, "grad_norm": 2.3986012796790797, "language_loss": 0.87432593, "learning_rate": 3.936256540467242e-06, "loss": 0.90147167, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.33227539, "step": 1799, "time_per_iteration": 2.9105916023254395 }, { "auxiliary_loss_clip": 0.01655034, "auxiliary_loss_mlp": 0.01065702, "balance_loss_clip": 1.40304112, "balance_loss_mlp": 1.03270507, "epoch": 0.10822185480234481, "flos": 17794302053760.0, "grad_norm": 1.924142278656181, "language_loss": 0.79055649, "learning_rate": 3.9361589611733955e-06, "loss": 0.81776381, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.33007812, "step": 1800, "time_per_iteration": 5.824314594268799 }, { "auxiliary_loss_clip": 0.01652182, "auxiliary_loss_mlp": 0.01059406, "balance_loss_clip": 1.39958096, "balance_loss_mlp": 1.02693391, "epoch": 0.10828197805501277, "flos": 25567600406400.0, "grad_norm": 1.5844643017722237, "language_loss": 0.74124634, "learning_rate": 3.9360613084602075e-06, "loss": 0.76836228, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.32495117, "step": 1801, "time_per_iteration": 3.055722713470459 }, { "auxiliary_loss_clip": 0.01666609, "auxiliary_loss_mlp": 0.01054197, "balance_loss_clip": 1.40785897, "balance_loss_mlp": 1.02201045, "epoch": 0.10834210130768075, "flos": 28995782584320.0, "grad_norm": 1.8227728666059286, "language_loss": 0.67621613, "learning_rate": 3.935963582331381e-06, "loss": 0.70342416, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 2.58984375, "router_z_loss_mlp": 0.32202148, "step": 1802, "time_per_iteration": 2.932758331298828 }, { "auxiliary_loss_clip": 0.01666838, "auxiliary_loss_mlp": 0.01066298, "balance_loss_clip": 1.41338158, "balance_loss_mlp": 1.03201389, "epoch": 0.10840222456034872, "flos": 20273263430400.0, "grad_norm": 2.104081932561071, "language_loss": 0.83130378, "learning_rate": 3.935865782790621e-06, "loss": 0.85863513, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.34277344, "step": 1803, "time_per_iteration": 2.999124526977539 }, { "auxiliary_loss_clip": 0.01649802, "auxiliary_loss_mlp": 0.01060713, "balance_loss_clip": 1.39870381, "balance_loss_mlp": 1.021088, "epoch": 0.10846234781301668, "flos": 19872049374720.0, "grad_norm": 4.261571665983494, "language_loss": 0.92204106, "learning_rate": 3.9357679098416365e-06, "loss": 0.94914615, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.39648438, "step": 1804, "time_per_iteration": 2.9025673866271973 }, { "auxiliary_loss_clip": 0.01669299, "auxiliary_loss_mlp": 0.01054654, "balance_loss_clip": 1.41388154, "balance_loss_mlp": 1.02165675, "epoch": 0.10852247106568465, "flos": 26480281392000.0, "grad_norm": 2.080530908020713, "language_loss": 0.77698398, "learning_rate": 3.935669963488139e-06, "loss": 0.80422354, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.32983398, "step": 1805, "time_per_iteration": 2.936420202255249 }, { "auxiliary_loss_clip": 0.01653807, "auxiliary_loss_mlp": 0.01055493, "balance_loss_clip": 1.40541828, "balance_loss_mlp": 1.02373552, "epoch": 0.10858259431835263, "flos": 30093786846720.0, "grad_norm": 1.6889860810304063, "language_loss": 0.87087715, "learning_rate": 3.935571943733843e-06, "loss": 0.89797008, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.31738281, "step": 1806, "time_per_iteration": 2.9881298542022705 }, { "auxiliary_loss_clip": 0.01675394, "auxiliary_loss_mlp": 0.01060531, "balance_loss_clip": 1.42206168, "balance_loss_mlp": 1.02462554, "epoch": 0.10864271757102059, "flos": 19072697909760.0, "grad_norm": 2.0395093996297127, "language_loss": 0.81998384, "learning_rate": 3.9354738505824635e-06, "loss": 0.84734309, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.359375, "step": 1807, "time_per_iteration": 2.8164284229278564 }, { "auxiliary_loss_clip": 0.01671712, "auxiliary_loss_mlp": 0.01061213, "balance_loss_clip": 1.41857851, "balance_loss_mlp": 1.0281918, "epoch": 0.10870284082368856, "flos": 24725148629760.0, "grad_norm": 1.8362949788031764, "language_loss": 0.79798311, "learning_rate": 3.9353756840377225e-06, "loss": 0.82531238, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.33007812, "step": 1808, "time_per_iteration": 2.9492576122283936 }, { "auxiliary_loss_clip": 0.01673934, "auxiliary_loss_mlp": 0.01054853, "balance_loss_clip": 1.41644251, "balance_loss_mlp": 1.02078295, "epoch": 0.10876296407635654, "flos": 20636444592000.0, "grad_norm": 1.7266126194675402, "language_loss": 0.7990272, "learning_rate": 3.935277444103342e-06, "loss": 0.82631505, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.34082031, "step": 1809, "time_per_iteration": 3.002049207687378 }, { "auxiliary_loss_clip": 0.01661325, "auxiliary_loss_mlp": 0.01054423, "balance_loss_clip": 1.40977192, "balance_loss_mlp": 1.02149773, "epoch": 0.1088230873290245, "flos": 21589782670080.0, "grad_norm": 2.4738526607636366, "language_loss": 0.8653999, "learning_rate": 3.935179130783046e-06, "loss": 0.89255738, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.32885742, "step": 1810, "time_per_iteration": 2.904573440551758 }, { "auxiliary_loss_clip": 0.01699167, "auxiliary_loss_mlp": 0.01065831, "balance_loss_clip": 1.43554401, "balance_loss_mlp": 1.02975821, "epoch": 0.10888321058169247, "flos": 26480281392000.0, "grad_norm": 1.6441629543434517, "language_loss": 0.65335464, "learning_rate": 3.935080744080564e-06, "loss": 0.68100464, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 0.3605957, "step": 1811, "time_per_iteration": 2.8799867630004883 }, { "auxiliary_loss_clip": 0.01672289, "auxiliary_loss_mlp": 0.01059761, "balance_loss_clip": 1.41412318, "balance_loss_mlp": 1.02778959, "epoch": 0.10894333383436045, "flos": 25859647463040.0, "grad_norm": 1.9922991203738685, "language_loss": 0.75363183, "learning_rate": 3.934982283999626e-06, "loss": 0.78095222, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.31933594, "step": 1812, "time_per_iteration": 2.859090805053711 }, { "auxiliary_loss_clip": 0.01661227, "auxiliary_loss_mlp": 0.01062798, "balance_loss_clip": 1.4074558, "balance_loss_mlp": 1.0289191, "epoch": 0.10900345708702841, "flos": 19546629638400.0, "grad_norm": 1.5626948189007186, "language_loss": 0.74263477, "learning_rate": 3.934883750543966e-06, "loss": 0.76987505, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.33837891, "step": 1813, "time_per_iteration": 2.870229482650757 }, { "auxiliary_loss_clip": 0.01656631, "auxiliary_loss_mlp": 0.01071631, "balance_loss_clip": 1.40733194, "balance_loss_mlp": 1.0380137, "epoch": 0.10906358033969638, "flos": 23633840597760.0, "grad_norm": 1.6660099693804868, "language_loss": 0.83760172, "learning_rate": 3.93478514371732e-06, "loss": 0.86488432, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.33642578, "step": 1814, "time_per_iteration": 2.8601434230804443 }, { "auxiliary_loss_clip": 0.01676615, "auxiliary_loss_mlp": 0.01061646, "balance_loss_clip": 1.41923046, "balance_loss_mlp": 1.02790976, "epoch": 0.10912370359236434, "flos": 21224520247680.0, "grad_norm": 2.0016969463096252, "language_loss": 0.85410118, "learning_rate": 3.934686463523429e-06, "loss": 0.88148385, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.33740234, "step": 1815, "time_per_iteration": 2.8639605045318604 }, { "auxiliary_loss_clip": 0.01654458, "auxiliary_loss_mlp": 0.0106251, "balance_loss_clip": 1.40570319, "balance_loss_mlp": 1.02877355, "epoch": 0.10918382684503232, "flos": 13561112810880.0, "grad_norm": 2.5928474257956524, "language_loss": 0.73184514, "learning_rate": 3.9345877099660315e-06, "loss": 0.75901484, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.33764648, "step": 1816, "time_per_iteration": 2.845823287963867 }, { "auxiliary_loss_clip": 0.01666526, "auxiliary_loss_mlp": 0.01060952, "balance_loss_clip": 1.41068316, "balance_loss_mlp": 1.02514195, "epoch": 0.10924395009770028, "flos": 27975789636480.0, "grad_norm": 2.414422067454588, "language_loss": 0.75282502, "learning_rate": 3.9344888830488744e-06, "loss": 0.78009975, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.35791016, "step": 1817, "time_per_iteration": 2.9115588665008545 }, { "auxiliary_loss_clip": 0.01662003, "auxiliary_loss_mlp": 0.01065791, "balance_loss_clip": 1.41061044, "balance_loss_mlp": 1.03219819, "epoch": 0.10930407335036825, "flos": 25604864138880.0, "grad_norm": 1.719690892501037, "language_loss": 0.68458045, "learning_rate": 3.934389982775706e-06, "loss": 0.71185839, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.33618164, "step": 1818, "time_per_iteration": 2.923327684402466 }, { "auxiliary_loss_clip": 0.01682328, "auxiliary_loss_mlp": 0.01071389, "balance_loss_clip": 1.42408514, "balance_loss_mlp": 1.03829646, "epoch": 0.10936419660303623, "flos": 18415478920320.0, "grad_norm": 2.1591273767934656, "language_loss": 0.74901229, "learning_rate": 3.934291009150275e-06, "loss": 0.77654946, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.33105469, "step": 1819, "time_per_iteration": 2.8560845851898193 }, { "auxiliary_loss_clip": 0.01671865, "auxiliary_loss_mlp": 0.01072477, "balance_loss_clip": 1.41970539, "balance_loss_mlp": 1.03735828, "epoch": 0.1094243198557042, "flos": 23850093335040.0, "grad_norm": 2.3534294407903094, "language_loss": 0.74171811, "learning_rate": 3.934191962176335e-06, "loss": 0.76916158, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.35131836, "step": 1820, "time_per_iteration": 2.9484684467315674 }, { "auxiliary_loss_clip": 0.01642415, "auxiliary_loss_mlp": 0.01071342, "balance_loss_clip": 1.39250827, "balance_loss_mlp": 1.0318594, "epoch": 0.10948444310837216, "flos": 14651153988480.0, "grad_norm": 2.1726801686114814, "language_loss": 0.8347466, "learning_rate": 3.934092841857642e-06, "loss": 0.86188424, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.39526367, "step": 1821, "time_per_iteration": 2.88749623298645 }, { "auxiliary_loss_clip": 0.01642364, "auxiliary_loss_mlp": 0.01058136, "balance_loss_clip": 1.3934958, "balance_loss_mlp": 1.02404237, "epoch": 0.10954456636104014, "flos": 27830354290560.0, "grad_norm": 1.9666655539870395, "language_loss": 0.78304422, "learning_rate": 3.933993648197955e-06, "loss": 0.81004924, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.34057617, "step": 1822, "time_per_iteration": 2.9416422843933105 }, { "auxiliary_loss_clip": 0.01637683, "auxiliary_loss_mlp": 0.0106702, "balance_loss_clip": 1.39066935, "balance_loss_mlp": 1.03185368, "epoch": 0.1096046896137081, "flos": 33634212670080.0, "grad_norm": 2.7914516961817997, "language_loss": 0.81274152, "learning_rate": 3.933894381201034e-06, "loss": 0.83978844, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.3515625, "step": 1823, "time_per_iteration": 2.9525787830352783 }, { "auxiliary_loss_clip": 0.01649119, "auxiliary_loss_mlp": 0.01069381, "balance_loss_clip": 1.40302587, "balance_loss_mlp": 1.03409517, "epoch": 0.10966481286637607, "flos": 26991431608320.0, "grad_norm": 1.4705873957146391, "language_loss": 0.80902714, "learning_rate": 3.933795040870645e-06, "loss": 0.83621216, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.35302734, "step": 1824, "time_per_iteration": 2.915361166000366 }, { "auxiliary_loss_clip": 0.01649849, "auxiliary_loss_mlp": 0.01065102, "balance_loss_clip": 1.40107918, "balance_loss_mlp": 1.03141308, "epoch": 0.10972493611904403, "flos": 23046172145280.0, "grad_norm": 1.806577612849813, "language_loss": 0.89297706, "learning_rate": 3.933695627210554e-06, "loss": 0.92012656, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.33666992, "step": 1825, "time_per_iteration": 2.9158551692962646 }, { "auxiliary_loss_clip": 0.01634706, "auxiliary_loss_mlp": 0.01077994, "balance_loss_clip": 1.38566613, "balance_loss_mlp": 1.04106259, "epoch": 0.10978505937171201, "flos": 38118022715520.0, "grad_norm": 2.537309949730699, "language_loss": 0.77586746, "learning_rate": 3.933596140224532e-06, "loss": 0.80299449, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.36914062, "step": 1826, "time_per_iteration": 2.970384120941162 }, { "auxiliary_loss_clip": 0.01407482, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.25193715, "balance_loss_mlp": 1.01540875, "epoch": 0.10984518262437998, "flos": 59878080126720.0, "grad_norm": 0.8461556239851626, "language_loss": 0.550116, "learning_rate": 3.93349657991635e-06, "loss": 0.57452613, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18164062, "step": 1827, "time_per_iteration": 4.752614259719849 }, { "auxiliary_loss_clip": 0.01407866, "auxiliary_loss_mlp": 0.01020111, "balance_loss_clip": 1.25666249, "balance_loss_mlp": 1.00065601, "epoch": 0.10990530587704794, "flos": 66752813871360.0, "grad_norm": 0.7537303323695871, "language_loss": 0.55401498, "learning_rate": 3.933396946289784e-06, "loss": 0.57829475, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.19433594, "step": 1828, "time_per_iteration": 3.372375249862671 }, { "auxiliary_loss_clip": 0.01684989, "auxiliary_loss_mlp": 0.01069171, "balance_loss_clip": 1.42781055, "balance_loss_mlp": 1.03219247, "epoch": 0.10996542912971592, "flos": 25458116693760.0, "grad_norm": 2.565385374163624, "language_loss": 0.85528207, "learning_rate": 3.933297239348612e-06, "loss": 0.88282359, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.37011719, "step": 1829, "time_per_iteration": 2.9499361515045166 }, { "auxiliary_loss_clip": 0.01665615, "auxiliary_loss_mlp": 0.01074324, "balance_loss_clip": 1.4108212, "balance_loss_mlp": 1.039276, "epoch": 0.11002555238238389, "flos": 44033853271680.0, "grad_norm": 1.8824245693733597, "language_loss": 0.90118903, "learning_rate": 3.933197459096614e-06, "loss": 0.92858845, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.35058594, "step": 1830, "time_per_iteration": 3.107515811920166 }, { "auxiliary_loss_clip": 0.01396148, "auxiliary_loss_mlp": 0.01082121, "balance_loss_clip": 1.24750733, "balance_loss_mlp": 1.06543183, "epoch": 0.11008567563505185, "flos": 54096346454400.0, "grad_norm": 0.6984388693805477, "language_loss": 0.55562323, "learning_rate": 3.9330976055375756e-06, "loss": 0.58040595, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.16699219, "step": 1831, "time_per_iteration": 3.287799119949341 }, { "auxiliary_loss_clip": 0.01671055, "auxiliary_loss_mlp": 0.01087214, "balance_loss_clip": 1.4143225, "balance_loss_mlp": 1.04944861, "epoch": 0.11014579888771983, "flos": 24253750609920.0, "grad_norm": 2.230561217715295, "language_loss": 0.92645419, "learning_rate": 3.932997678675282e-06, "loss": 0.95403683, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.37768555, "step": 1832, "time_per_iteration": 2.8702781200408936 }, { "auxiliary_loss_clip": 0.01388233, "auxiliary_loss_mlp": 0.01065625, "balance_loss_clip": 1.23508716, "balance_loss_mlp": 1.04864979, "epoch": 0.1102059221403878, "flos": 57774516007680.0, "grad_norm": 0.71940886324166, "language_loss": 0.60084367, "learning_rate": 3.932897678513523e-06, "loss": 0.62538224, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.16992188, "step": 1833, "time_per_iteration": 4.771799325942993 }, { "auxiliary_loss_clip": 0.01648775, "auxiliary_loss_mlp": 0.01071218, "balance_loss_clip": 1.3947643, "balance_loss_mlp": 1.03619456, "epoch": 0.11026604539305576, "flos": 16803519264000.0, "grad_norm": 2.3018917287816927, "language_loss": 0.81952155, "learning_rate": 3.93279760505609e-06, "loss": 0.84672153, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.34985352, "step": 1834, "time_per_iteration": 2.9032599925994873 }, { "auxiliary_loss_clip": 0.01654474, "auxiliary_loss_mlp": 0.01069033, "balance_loss_clip": 1.40252173, "balance_loss_mlp": 1.03191125, "epoch": 0.11032616864572373, "flos": 23998695816960.0, "grad_norm": 2.149595243144503, "language_loss": 0.91308945, "learning_rate": 3.932697458306779e-06, "loss": 0.94032449, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.37109375, "step": 1835, "time_per_iteration": 5.702774524688721 }, { "auxiliary_loss_clip": 0.01638956, "auxiliary_loss_mlp": 0.01065215, "balance_loss_clip": 1.38836253, "balance_loss_mlp": 1.03097773, "epoch": 0.1103862918983917, "flos": 19692743656320.0, "grad_norm": 1.9907897611475591, "language_loss": 0.65434349, "learning_rate": 3.932597238269386e-06, "loss": 0.68138516, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.34228516, "step": 1836, "time_per_iteration": 2.986696243286133 }, { "auxiliary_loss_clip": 0.01627838, "auxiliary_loss_mlp": 0.01061398, "balance_loss_clip": 1.38021564, "balance_loss_mlp": 1.02499139, "epoch": 0.11044641515105967, "flos": 32173615428480.0, "grad_norm": 1.7085426176234162, "language_loss": 0.7456125, "learning_rate": 3.932496944947711e-06, "loss": 0.77250493, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.36425781, "step": 1837, "time_per_iteration": 2.9840283393859863 }, { "auxiliary_loss_clip": 0.01637966, "auxiliary_loss_mlp": 0.01058195, "balance_loss_clip": 1.38720691, "balance_loss_mlp": 1.02269495, "epoch": 0.11050653840372764, "flos": 16697519400960.0, "grad_norm": 6.5514250287017, "language_loss": 0.7942645, "learning_rate": 3.93239657834556e-06, "loss": 0.82122612, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.35498047, "step": 1838, "time_per_iteration": 2.985825300216675 }, { "auxiliary_loss_clip": 0.0162949, "auxiliary_loss_mlp": 0.01058516, "balance_loss_clip": 1.38025939, "balance_loss_mlp": 1.02256274, "epoch": 0.11056666165639562, "flos": 21217145345280.0, "grad_norm": 2.031016457326122, "language_loss": 0.72288847, "learning_rate": 3.932296138466736e-06, "loss": 0.74976856, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.359375, "step": 1839, "time_per_iteration": 2.9799442291259766 }, { "auxiliary_loss_clip": 0.01644812, "auxiliary_loss_mlp": 0.01053144, "balance_loss_clip": 1.39069057, "balance_loss_mlp": 1.0169282, "epoch": 0.11062678490906358, "flos": 19173177907200.0, "grad_norm": 2.214835780168386, "language_loss": 0.80713892, "learning_rate": 3.93219562531505e-06, "loss": 0.83411849, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.36230469, "step": 1840, "time_per_iteration": 3.0268166065216064 }, { "auxiliary_loss_clip": 0.01628844, "auxiliary_loss_mlp": 0.01052862, "balance_loss_clip": 1.38154745, "balance_loss_mlp": 1.01879215, "epoch": 0.11068690816173155, "flos": 24904861551360.0, "grad_norm": 2.0525940004975842, "language_loss": 0.89295304, "learning_rate": 3.932095038894311e-06, "loss": 0.91977006, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.34057617, "step": 1841, "time_per_iteration": 2.9861416816711426 }, { "auxiliary_loss_clip": 0.01624435, "auxiliary_loss_mlp": 0.01060254, "balance_loss_clip": 1.37614453, "balance_loss_mlp": 1.02380013, "epoch": 0.11074703141439952, "flos": 16481130929280.0, "grad_norm": 2.140971247823854, "language_loss": 0.9160136, "learning_rate": 3.931994379208334e-06, "loss": 0.94286048, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.36450195, "step": 1842, "time_per_iteration": 2.8882813453674316 }, { "auxiliary_loss_clip": 0.01635521, "auxiliary_loss_mlp": 0.01065127, "balance_loss_clip": 1.38401675, "balance_loss_mlp": 1.02953124, "epoch": 0.11080715466706749, "flos": 19181910153600.0, "grad_norm": 1.924028775228275, "language_loss": 0.87571752, "learning_rate": 3.931893646260937e-06, "loss": 0.90272403, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.35571289, "step": 1843, "time_per_iteration": 2.9157257080078125 }, { "auxiliary_loss_clip": 0.01636513, "auxiliary_loss_mlp": 0.0105775, "balance_loss_clip": 1.38599646, "balance_loss_mlp": 1.02275038, "epoch": 0.11086727791973545, "flos": 27714988753920.0, "grad_norm": 4.0143476727091745, "language_loss": 0.75696582, "learning_rate": 3.931792840055941e-06, "loss": 0.78390843, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.34960938, "step": 1844, "time_per_iteration": 2.8911240100860596 }, { "auxiliary_loss_clip": 0.0164876, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.39517415, "balance_loss_mlp": 1.02130687, "epoch": 0.11092740117240343, "flos": 18524102981760.0, "grad_norm": 2.1517356881508576, "language_loss": 0.76697141, "learning_rate": 3.931691960597165e-06, "loss": 0.79403758, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.36572266, "step": 1845, "time_per_iteration": 2.8920705318450928 }, { "auxiliary_loss_clip": 0.01635894, "auxiliary_loss_mlp": 0.01057874, "balance_loss_clip": 1.38683081, "balance_loss_mlp": 1.02194476, "epoch": 0.1109875244250714, "flos": 20532299765760.0, "grad_norm": 1.5240346941220115, "language_loss": 0.77778852, "learning_rate": 3.9315910078884375e-06, "loss": 0.80472624, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.35961914, "step": 1846, "time_per_iteration": 2.8380441665649414 }, { "auxiliary_loss_clip": 0.01665076, "auxiliary_loss_mlp": 0.01057006, "balance_loss_clip": 1.40710676, "balance_loss_mlp": 1.02067077, "epoch": 0.11104764767773936, "flos": 14106178644480.0, "grad_norm": 2.356262715001083, "language_loss": 0.87483555, "learning_rate": 3.931489981933584e-06, "loss": 0.9020564, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 2.58007812, "router_z_loss_mlp": 0.36328125, "step": 1847, "time_per_iteration": 2.8154752254486084 }, { "auxiliary_loss_clip": 0.0164584, "auxiliary_loss_mlp": 0.0105266, "balance_loss_clip": 1.39213598, "balance_loss_mlp": 1.01859057, "epoch": 0.11110777093040733, "flos": 20604157787520.0, "grad_norm": 2.0268622062555783, "language_loss": 0.78274798, "learning_rate": 3.931388882736438e-06, "loss": 0.80973303, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.34057617, "step": 1848, "time_per_iteration": 2.927241802215576 }, { "auxiliary_loss_clip": 0.01627519, "auxiliary_loss_mlp": 0.01055639, "balance_loss_clip": 1.38274431, "balance_loss_mlp": 1.02102089, "epoch": 0.11116789418307531, "flos": 21879657976320.0, "grad_norm": 2.0356482882871734, "language_loss": 0.78713334, "learning_rate": 3.931287710300832e-06, "loss": 0.81396496, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.34619141, "step": 1849, "time_per_iteration": 2.8230950832366943 }, { "auxiliary_loss_clip": 0.01642746, "auxiliary_loss_mlp": 0.01052853, "balance_loss_clip": 1.38677943, "balance_loss_mlp": 1.0173285, "epoch": 0.11122801743574327, "flos": 15531457680000.0, "grad_norm": 3.548742938997845, "language_loss": 0.73455089, "learning_rate": 3.931186464630601e-06, "loss": 0.76150686, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 2.5625, "router_z_loss_mlp": 0.35522461, "step": 1850, "time_per_iteration": 2.822286367416382 }, { "auxiliary_loss_clip": 0.01646059, "auxiliary_loss_mlp": 0.01055789, "balance_loss_clip": 1.39195824, "balance_loss_mlp": 1.02031267, "epoch": 0.11128814068841124, "flos": 14400397451520.0, "grad_norm": 2.418397696347503, "language_loss": 0.83094382, "learning_rate": 3.931085145729588e-06, "loss": 0.85796225, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.35473633, "step": 1851, "time_per_iteration": 2.777526617050171 }, { "auxiliary_loss_clip": 0.01641197, "auxiliary_loss_mlp": 0.01060903, "balance_loss_clip": 1.39177895, "balance_loss_mlp": 1.02766788, "epoch": 0.11134826394107922, "flos": 16662517908480.0, "grad_norm": 2.6622498475088943, "language_loss": 0.90035301, "learning_rate": 3.930983753601631e-06, "loss": 0.92737401, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.33227539, "step": 1852, "time_per_iteration": 2.870260715484619 }, { "auxiliary_loss_clip": 0.01638331, "auxiliary_loss_mlp": 0.01054477, "balance_loss_clip": 1.38930941, "balance_loss_mlp": 1.02109826, "epoch": 0.11140838719374718, "flos": 16699736396160.0, "grad_norm": 1.7902848803385545, "language_loss": 0.72899139, "learning_rate": 3.930882288250578e-06, "loss": 0.75591946, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.33374023, "step": 1853, "time_per_iteration": 2.809765338897705 }, { "auxiliary_loss_clip": 0.01354151, "auxiliary_loss_mlp": 0.01069483, "balance_loss_clip": 1.20754099, "balance_loss_mlp": 1.05002809, "epoch": 0.11146851044641515, "flos": 61003891958400.0, "grad_norm": 0.8057884260550521, "language_loss": 0.53891981, "learning_rate": 3.930780749680273e-06, "loss": 0.56315613, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.19433594, "step": 1854, "time_per_iteration": 3.330568313598633 }, { "auxiliary_loss_clip": 0.01660958, "auxiliary_loss_mlp": 0.01057536, "balance_loss_clip": 1.39677119, "balance_loss_mlp": 1.02208328, "epoch": 0.11152863369908313, "flos": 22202317779840.0, "grad_norm": 5.96628180795442, "language_loss": 0.86222744, "learning_rate": 3.9306791378945705e-06, "loss": 0.88941234, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 2.64257812, "router_z_loss_mlp": 0.35449219, "step": 1855, "time_per_iteration": 2.939340353012085 }, { "auxiliary_loss_clip": 0.01637186, "auxiliary_loss_mlp": 0.01054759, "balance_loss_clip": 1.38427782, "balance_loss_mlp": 1.02149975, "epoch": 0.11158875695175109, "flos": 19547172576000.0, "grad_norm": 2.450912124315484, "language_loss": 0.83289385, "learning_rate": 3.9305774528973205e-06, "loss": 0.85981321, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.33251953, "step": 1856, "time_per_iteration": 2.8480355739593506 }, { "auxiliary_loss_clip": 0.01625019, "auxiliary_loss_mlp": 0.0105757, "balance_loss_clip": 1.38110185, "balance_loss_mlp": 1.02302301, "epoch": 0.11164888020441906, "flos": 25452461093760.0, "grad_norm": 1.887611352929275, "language_loss": 0.83520371, "learning_rate": 3.93047569469238e-06, "loss": 0.86202955, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.34521484, "step": 1857, "time_per_iteration": 2.9490907192230225 }, { "auxiliary_loss_clip": 0.01645625, "auxiliary_loss_mlp": 0.01058231, "balance_loss_clip": 1.38873518, "balance_loss_mlp": 1.02311182, "epoch": 0.11170900345708702, "flos": 15641167616640.0, "grad_norm": 3.2015901476721487, "language_loss": 0.84344876, "learning_rate": 3.930373863283608e-06, "loss": 0.87048727, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.35107422, "step": 1858, "time_per_iteration": 2.819706916809082 }, { "auxiliary_loss_clip": 0.01638557, "auxiliary_loss_mlp": 0.01064856, "balance_loss_clip": 1.38975668, "balance_loss_mlp": 1.03147697, "epoch": 0.111769126709755, "flos": 23049791729280.0, "grad_norm": 1.9558637591306052, "language_loss": 0.92822933, "learning_rate": 3.930271958674866e-06, "loss": 0.95526338, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.33349609, "step": 1859, "time_per_iteration": 2.9462645053863525 }, { "auxiliary_loss_clip": 0.01647078, "auxiliary_loss_mlp": 0.01079069, "balance_loss_clip": 1.38994169, "balance_loss_mlp": 1.04566681, "epoch": 0.11182924996242297, "flos": 20860615169280.0, "grad_norm": 2.783770588737898, "language_loss": 0.83969688, "learning_rate": 3.930169980870018e-06, "loss": 0.86695838, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.33422852, "step": 1860, "time_per_iteration": 2.9032232761383057 }, { "auxiliary_loss_clip": 0.0163233, "auxiliary_loss_mlp": 0.0107674, "balance_loss_clip": 1.3847363, "balance_loss_mlp": 1.04326606, "epoch": 0.11188937321509093, "flos": 17463769655040.0, "grad_norm": 2.0594554813296497, "language_loss": 0.7726326, "learning_rate": 3.930067929872931e-06, "loss": 0.79972327, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.3347168, "step": 1861, "time_per_iteration": 2.831134557723999 }, { "auxiliary_loss_clip": 0.01627568, "auxiliary_loss_mlp": 0.010744, "balance_loss_clip": 1.38165188, "balance_loss_mlp": 1.04216599, "epoch": 0.11194949646775891, "flos": 24106143513600.0, "grad_norm": 3.065421765526497, "language_loss": 0.90322745, "learning_rate": 3.929965805687474e-06, "loss": 0.93024719, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.32226562, "step": 1862, "time_per_iteration": 4.2993481159210205 }, { "auxiliary_loss_clip": 0.01634528, "auxiliary_loss_mlp": 0.01093217, "balance_loss_clip": 1.38588977, "balance_loss_mlp": 1.0622946, "epoch": 0.11200961972042688, "flos": 25164350334720.0, "grad_norm": 2.2418783495232546, "language_loss": 0.88038135, "learning_rate": 3.92986360831752e-06, "loss": 0.90765882, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.30908203, "step": 1863, "time_per_iteration": 2.8739309310913086 }, { "auxiliary_loss_clip": 0.01644105, "auxiliary_loss_mlp": 0.01101404, "balance_loss_clip": 1.39374948, "balance_loss_mlp": 1.06709528, "epoch": 0.11206974297309484, "flos": 21298278551040.0, "grad_norm": 1.720347646684224, "language_loss": 0.66049069, "learning_rate": 3.929761337766945e-06, "loss": 0.68794572, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.34326172, "step": 1864, "time_per_iteration": 2.837022304534912 }, { "auxiliary_loss_clip": 0.01642712, "auxiliary_loss_mlp": 0.01083449, "balance_loss_clip": 1.39269328, "balance_loss_mlp": 1.05402851, "epoch": 0.11212986622576282, "flos": 18925136058240.0, "grad_norm": 2.0897821394719083, "language_loss": 0.7522462, "learning_rate": 3.929658994039627e-06, "loss": 0.77950782, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.29418945, "step": 1865, "time_per_iteration": 2.86236834526062 }, { "auxiliary_loss_clip": 0.01661242, "auxiliary_loss_mlp": 0.01084772, "balance_loss_clip": 1.40540898, "balance_loss_mlp": 1.05280066, "epoch": 0.11218998947843078, "flos": 22064981253120.0, "grad_norm": 2.1544729068850437, "language_loss": 0.85864747, "learning_rate": 3.929556577139446e-06, "loss": 0.88610768, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31982422, "step": 1866, "time_per_iteration": 2.8472747802734375 }, { "auxiliary_loss_clip": 0.01655986, "auxiliary_loss_mlp": 0.01085421, "balance_loss_clip": 1.40061617, "balance_loss_mlp": 1.05201888, "epoch": 0.11225011273109875, "flos": 24582427971840.0, "grad_norm": 2.0437837376314665, "language_loss": 0.82286239, "learning_rate": 3.929454087070286e-06, "loss": 0.85027647, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.33374023, "step": 1867, "time_per_iteration": 2.944963216781616 }, { "auxiliary_loss_clip": 0.01653831, "auxiliary_loss_mlp": 0.01093001, "balance_loss_clip": 1.40238571, "balance_loss_mlp": 1.0610528, "epoch": 0.11231023598376672, "flos": 28450264302720.0, "grad_norm": 2.2563361566156224, "language_loss": 0.87852168, "learning_rate": 3.929351523836035e-06, "loss": 0.90599, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.31933594, "step": 1868, "time_per_iteration": 4.2724609375 }, { "auxiliary_loss_clip": 0.01638305, "auxiliary_loss_mlp": 0.01092106, "balance_loss_clip": 1.39220071, "balance_loss_mlp": 1.05948997, "epoch": 0.1123703592364347, "flos": 14433996355200.0, "grad_norm": 2.5509331863416707, "language_loss": 0.70306021, "learning_rate": 3.9292488874405795e-06, "loss": 0.73036432, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.32617188, "step": 1869, "time_per_iteration": 2.8070859909057617 }, { "auxiliary_loss_clip": 0.01664939, "auxiliary_loss_mlp": 0.0108573, "balance_loss_clip": 1.40906429, "balance_loss_mlp": 1.05535507, "epoch": 0.11243048248910266, "flos": 22246322987520.0, "grad_norm": 2.877655991592143, "language_loss": 0.78348231, "learning_rate": 3.929146177887814e-06, "loss": 0.81098896, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.30371094, "step": 1870, "time_per_iteration": 4.329606056213379 }, { "auxiliary_loss_clip": 0.0166453, "auxiliary_loss_mlp": 0.01083295, "balance_loss_clip": 1.40317988, "balance_loss_mlp": 1.05094171, "epoch": 0.11249060574177062, "flos": 18592341419520.0, "grad_norm": 1.7959799112918144, "language_loss": 0.76857561, "learning_rate": 3.929043395181631e-06, "loss": 0.79605389, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 2.61328125, "router_z_loss_mlp": 0.32348633, "step": 1871, "time_per_iteration": 4.254576206207275 }, { "auxiliary_loss_clip": 0.01654372, "auxiliary_loss_mlp": 0.01071572, "balance_loss_clip": 1.39920235, "balance_loss_mlp": 1.03964758, "epoch": 0.1125507289944386, "flos": 22866775937280.0, "grad_norm": 1.876618550287375, "language_loss": 0.83641607, "learning_rate": 3.928940539325929e-06, "loss": 0.86367553, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.3190918, "step": 1872, "time_per_iteration": 2.87638783454895 }, { "auxiliary_loss_clip": 0.01657465, "auxiliary_loss_mlp": 0.01065317, "balance_loss_clip": 1.40300024, "balance_loss_mlp": 1.03272533, "epoch": 0.11261085224710657, "flos": 19684237633920.0, "grad_norm": 2.1589935520138863, "language_loss": 0.84590745, "learning_rate": 3.9288376103246095e-06, "loss": 0.87313533, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.32568359, "step": 1873, "time_per_iteration": 2.8316445350646973 }, { "auxiliary_loss_clip": 0.01672092, "auxiliary_loss_mlp": 0.01067576, "balance_loss_clip": 1.41052425, "balance_loss_mlp": 1.03331518, "epoch": 0.11267097549977453, "flos": 26073230757120.0, "grad_norm": 1.9032233603675803, "language_loss": 0.93002272, "learning_rate": 3.928734608181575e-06, "loss": 0.9574194, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 2.61523438, "router_z_loss_mlp": 0.3425293, "step": 1874, "time_per_iteration": 2.930551528930664 }, { "auxiliary_loss_clip": 0.01649259, "auxiliary_loss_mlp": 0.0106707, "balance_loss_clip": 1.3978374, "balance_loss_mlp": 1.03435922, "epoch": 0.11273109875244251, "flos": 21077591823360.0, "grad_norm": 1.7873619829906509, "language_loss": 0.75292051, "learning_rate": 3.928631532900729e-06, "loss": 0.78008378, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.32714844, "step": 1875, "time_per_iteration": 2.837376117706299 }, { "auxiliary_loss_clip": 0.01656105, "auxiliary_loss_mlp": 0.01062662, "balance_loss_clip": 1.40711713, "balance_loss_mlp": 1.03195357, "epoch": 0.11279122200511048, "flos": 27100327138560.0, "grad_norm": 1.84118064224662, "language_loss": 0.72393966, "learning_rate": 3.928528384485984e-06, "loss": 0.7511273, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.30737305, "step": 1876, "time_per_iteration": 2.896716594696045 }, { "auxiliary_loss_clip": 0.0164538, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.3978101, "balance_loss_mlp": 1.02831769, "epoch": 0.11285134525777844, "flos": 20196066522240.0, "grad_norm": 1.8343960209518386, "language_loss": 0.77695894, "learning_rate": 3.9284251629412475e-06, "loss": 0.80402088, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.32495117, "step": 1877, "time_per_iteration": 2.83687162399292 }, { "auxiliary_loss_clip": 0.01662992, "auxiliary_loss_mlp": 0.01058239, "balance_loss_clip": 1.41034865, "balance_loss_mlp": 1.02581358, "epoch": 0.11291146851044641, "flos": 12466049460480.0, "grad_norm": 2.383874108514402, "language_loss": 0.89382386, "learning_rate": 3.928321868270436e-06, "loss": 0.92103618, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.32421875, "step": 1878, "time_per_iteration": 2.8291823863983154 }, { "auxiliary_loss_clip": 0.01661658, "auxiliary_loss_mlp": 0.01061327, "balance_loss_clip": 1.4077487, "balance_loss_mlp": 1.02706623, "epoch": 0.11297159176311439, "flos": 23852943757440.0, "grad_norm": 2.1440936834894946, "language_loss": 0.83088887, "learning_rate": 3.928218500477466e-06, "loss": 0.85811865, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.34277344, "step": 1879, "time_per_iteration": 2.9513301849365234 }, { "auxiliary_loss_clip": 0.01665619, "auxiliary_loss_mlp": 0.01063175, "balance_loss_clip": 1.41115439, "balance_loss_mlp": 1.03003442, "epoch": 0.11303171501578235, "flos": 29941474291200.0, "grad_norm": 1.8970372825853974, "language_loss": 0.7156176, "learning_rate": 3.928115059566259e-06, "loss": 0.74290556, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.33129883, "step": 1880, "time_per_iteration": 2.91438889503479 }, { "auxiliary_loss_clip": 0.01652282, "auxiliary_loss_mlp": 0.01055855, "balance_loss_clip": 1.40439749, "balance_loss_mlp": 1.02381158, "epoch": 0.11309183826845032, "flos": 16189264851840.0, "grad_norm": 1.5438667018631862, "language_loss": 0.73862588, "learning_rate": 3.928011545540734e-06, "loss": 0.76570719, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.32055664, "step": 1881, "time_per_iteration": 2.8669984340667725 }, { "auxiliary_loss_clip": 0.01660831, "auxiliary_loss_mlp": 0.01053747, "balance_loss_clip": 1.40438342, "balance_loss_mlp": 1.02039194, "epoch": 0.1131519615211183, "flos": 12028205099520.0, "grad_norm": 3.4303585633456346, "language_loss": 0.75630862, "learning_rate": 3.927907958404819e-06, "loss": 0.78345442, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 2.56640625, "router_z_loss_mlp": 0.33374023, "step": 1882, "time_per_iteration": 2.810260772705078 }, { "auxiliary_loss_clip": 0.01676587, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.41819596, "balance_loss_mlp": 1.0238955, "epoch": 0.11321208477378626, "flos": 26261313966720.0, "grad_norm": 2.3838427647283993, "language_loss": 0.80935657, "learning_rate": 3.92780429816244e-06, "loss": 0.83669209, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 2.58203125, "router_z_loss_mlp": 0.33081055, "step": 1883, "time_per_iteration": 2.9002041816711426 }, { "auxiliary_loss_clip": 0.01659707, "auxiliary_loss_mlp": 0.01055521, "balance_loss_clip": 1.4043895, "balance_loss_mlp": 1.02116513, "epoch": 0.11327220802645423, "flos": 13634373421440.0, "grad_norm": 2.513887922149616, "language_loss": 0.78301257, "learning_rate": 3.927700564817529e-06, "loss": 0.81016481, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 2.55078125, "router_z_loss_mlp": 0.34326172, "step": 1884, "time_per_iteration": 2.814112424850464 }, { "auxiliary_loss_clip": 0.01382589, "auxiliary_loss_mlp": 0.01022332, "balance_loss_clip": 1.23395729, "balance_loss_mlp": 1.01031542, "epoch": 0.1133323312791222, "flos": 57220944151680.0, "grad_norm": 0.8340940054542284, "language_loss": 0.55358803, "learning_rate": 3.927596758374019e-06, "loss": 0.57763726, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.12011719, "step": 1885, "time_per_iteration": 3.2482433319091797 }, { "auxiliary_loss_clip": 0.01639852, "auxiliary_loss_mlp": 0.01059332, "balance_loss_clip": 1.39255214, "balance_loss_mlp": 1.02619219, "epoch": 0.11339245453179017, "flos": 24362103202560.0, "grad_norm": 1.9742138793123498, "language_loss": 0.91773832, "learning_rate": 3.927492878835848e-06, "loss": 0.94473016, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.33129883, "step": 1886, "time_per_iteration": 2.904172420501709 }, { "auxiliary_loss_clip": 0.01675758, "auxiliary_loss_mlp": 0.01062019, "balance_loss_clip": 1.42271233, "balance_loss_mlp": 1.0295465, "epoch": 0.11345257778445814, "flos": 22679959582080.0, "grad_norm": 1.8128975459980492, "language_loss": 0.86057675, "learning_rate": 3.927388926206953e-06, "loss": 0.88795447, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 0.32495117, "step": 1887, "time_per_iteration": 2.8360402584075928 }, { "auxiliary_loss_clip": 0.01657521, "auxiliary_loss_mlp": 0.01059757, "balance_loss_clip": 1.40727174, "balance_loss_mlp": 1.02881062, "epoch": 0.11351270103712612, "flos": 20996006169600.0, "grad_norm": 2.637537510521018, "language_loss": 0.79017317, "learning_rate": 3.927284900491277e-06, "loss": 0.81734592, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.30932617, "step": 1888, "time_per_iteration": 2.8814618587493896 }, { "auxiliary_loss_clip": 0.01693333, "auxiliary_loss_mlp": 0.01071447, "balance_loss_clip": 1.43244219, "balance_loss_mlp": 1.03630376, "epoch": 0.11357282428979408, "flos": 37363581354240.0, "grad_norm": 1.652953250774899, "language_loss": 0.69104517, "learning_rate": 3.927180801692764e-06, "loss": 0.71869296, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 2.61132812, "router_z_loss_mlp": 0.35131836, "step": 1889, "time_per_iteration": 3.017458438873291 }, { "auxiliary_loss_clip": 0.01669315, "auxiliary_loss_mlp": 0.0106626, "balance_loss_clip": 1.4172138, "balance_loss_mlp": 1.03323925, "epoch": 0.11363294754246205, "flos": 21765649783680.0, "grad_norm": 2.027506146688059, "language_loss": 0.84657806, "learning_rate": 3.927076629815362e-06, "loss": 0.87393385, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.33032227, "step": 1890, "time_per_iteration": 2.897216558456421 }, { "auxiliary_loss_clip": 0.01665401, "auxiliary_loss_mlp": 0.01067324, "balance_loss_clip": 1.41462159, "balance_loss_mlp": 1.03418422, "epoch": 0.11369307079513001, "flos": 22611902123520.0, "grad_norm": 7.54288467608163, "language_loss": 0.6680755, "learning_rate": 3.926972384863022e-06, "loss": 0.69540274, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.33105469, "step": 1891, "time_per_iteration": 2.870441198348999 }, { "auxiliary_loss_clip": 0.01697029, "auxiliary_loss_mlp": 0.01064271, "balance_loss_clip": 1.43371105, "balance_loss_mlp": 1.03120255, "epoch": 0.11375319404779799, "flos": 21954185441280.0, "grad_norm": 1.840886672137919, "language_loss": 0.88987601, "learning_rate": 3.9268680668396956e-06, "loss": 0.91748911, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 2.6328125, "router_z_loss_mlp": 0.33081055, "step": 1892, "time_per_iteration": 2.819755792617798 }, { "auxiliary_loss_clip": 0.0169679, "auxiliary_loss_mlp": 0.01064905, "balance_loss_clip": 1.43439317, "balance_loss_mlp": 1.03317165, "epoch": 0.11381331730046595, "flos": 26406523088640.0, "grad_norm": 2.385998203777028, "language_loss": 0.74447221, "learning_rate": 3.926763675749339e-06, "loss": 0.77208912, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 2.62109375, "router_z_loss_mlp": 0.31689453, "step": 1893, "time_per_iteration": 2.9209349155426025 }, { "auxiliary_loss_clip": 0.01673365, "auxiliary_loss_mlp": 0.01064614, "balance_loss_clip": 1.41975689, "balance_loss_mlp": 1.03338099, "epoch": 0.11387344055313392, "flos": 23815046597760.0, "grad_norm": 5.635954085318265, "language_loss": 0.80806255, "learning_rate": 3.92665921159591e-06, "loss": 0.83544236, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.31201172, "step": 1894, "time_per_iteration": 2.8243119716644287 }, { "auxiliary_loss_clip": 0.01698643, "auxiliary_loss_mlp": 0.0106767, "balance_loss_clip": 1.4351685, "balance_loss_mlp": 1.03479242, "epoch": 0.1139335638058019, "flos": 34533727418880.0, "grad_norm": 3.2410356574812047, "language_loss": 0.81184793, "learning_rate": 3.926554674383371e-06, "loss": 0.83951104, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 2.63476562, "router_z_loss_mlp": 0.32861328, "step": 1895, "time_per_iteration": 2.9811582565307617 }, { "auxiliary_loss_clip": 0.01381629, "auxiliary_loss_mlp": 0.01048075, "balance_loss_clip": 1.23468375, "balance_loss_mlp": 1.03443718, "epoch": 0.11399368705846986, "flos": 70621917056640.0, "grad_norm": 0.8205756428737664, "language_loss": 0.63462269, "learning_rate": 3.926450064115686e-06, "loss": 0.65891969, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.13671875, "step": 1896, "time_per_iteration": 3.414736270904541 }, { "auxiliary_loss_clip": 0.01664781, "auxiliary_loss_mlp": 0.01067746, "balance_loss_clip": 1.41420054, "balance_loss_mlp": 1.03246033, "epoch": 0.11405381031113783, "flos": 21334094449920.0, "grad_norm": 1.770385539491751, "language_loss": 0.85732973, "learning_rate": 3.926345380796821e-06, "loss": 0.88465506, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.35302734, "step": 1897, "time_per_iteration": 4.300881624221802 }, { "auxiliary_loss_clip": 0.0167727, "auxiliary_loss_mlp": 0.01055738, "balance_loss_clip": 1.42179346, "balance_loss_mlp": 1.02512527, "epoch": 0.11411393356380581, "flos": 19729238227200.0, "grad_norm": 2.4746044532681637, "language_loss": 0.81371707, "learning_rate": 3.9262406244307465e-06, "loss": 0.84104711, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.30615234, "step": 1898, "time_per_iteration": 2.94209623336792 }, { "auxiliary_loss_clip": 0.01690943, "auxiliary_loss_mlp": 0.01059817, "balance_loss_clip": 1.42956042, "balance_loss_mlp": 1.0278213, "epoch": 0.11417405681647377, "flos": 17539473484800.0, "grad_norm": 1.891187306031339, "language_loss": 0.74964106, "learning_rate": 3.926135795021435e-06, "loss": 0.77714866, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 2.6171875, "router_z_loss_mlp": 0.32006836, "step": 1899, "time_per_iteration": 2.9555864334106445 }, { "auxiliary_loss_clip": 0.01385252, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.23745561, "balance_loss_mlp": 1.01140928, "epoch": 0.11423418006914174, "flos": 59704927701120.0, "grad_norm": 0.9260692499633738, "language_loss": 0.63511467, "learning_rate": 3.92603089257286e-06, "loss": 0.65925014, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 1.484375, "router_z_loss_mlp": 0.16894531, "step": 1900, "time_per_iteration": 3.2517733573913574 }, { "auxiliary_loss_clip": 0.0166306, "auxiliary_loss_mlp": 0.01062834, "balance_loss_clip": 1.41216469, "balance_loss_mlp": 1.03074265, "epoch": 0.1142943033218097, "flos": 22972594821120.0, "grad_norm": 1.6641204532924387, "language_loss": 0.79010677, "learning_rate": 3.925925917089001e-06, "loss": 0.81736565, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.32080078, "step": 1901, "time_per_iteration": 2.8988401889801025 }, { "auxiliary_loss_clip": 0.01672062, "auxiliary_loss_mlp": 0.01056433, "balance_loss_clip": 1.4176439, "balance_loss_mlp": 1.02529562, "epoch": 0.11435442657447768, "flos": 18264478464000.0, "grad_norm": 1.9676991721351533, "language_loss": 0.85125136, "learning_rate": 3.925820868573839e-06, "loss": 0.87853628, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 2.546875, "router_z_loss_mlp": 0.31103516, "step": 1902, "time_per_iteration": 2.7864139080047607 }, { "auxiliary_loss_clip": 0.01654376, "auxiliary_loss_mlp": 0.01053938, "balance_loss_clip": 1.39953578, "balance_loss_mlp": 1.02363455, "epoch": 0.11441454982714565, "flos": 24071865937920.0, "grad_norm": 4.265625057928174, "language_loss": 0.78950047, "learning_rate": 3.925715747031356e-06, "loss": 0.81658363, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.30322266, "step": 1903, "time_per_iteration": 4.1965553760528564 }, { "auxiliary_loss_clip": 0.01660706, "auxiliary_loss_mlp": 0.01049136, "balance_loss_clip": 1.40800953, "balance_loss_mlp": 1.01776052, "epoch": 0.11447467307981361, "flos": 25348587736320.0, "grad_norm": 2.089030685452198, "language_loss": 0.76493818, "learning_rate": 3.925610552465539e-06, "loss": 0.79203659, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.31396484, "step": 1904, "time_per_iteration": 2.8917083740234375 }, { "auxiliary_loss_clip": 0.0164803, "auxiliary_loss_mlp": 0.01058796, "balance_loss_clip": 1.39880157, "balance_loss_mlp": 1.02670527, "epoch": 0.11453479633248159, "flos": 21736122912000.0, "grad_norm": 2.289761968153971, "language_loss": 0.93618685, "learning_rate": 3.9255052848803764e-06, "loss": 0.96325505, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.32104492, "step": 1905, "time_per_iteration": 4.213242053985596 }, { "auxiliary_loss_clip": 0.01686888, "auxiliary_loss_mlp": 0.01053888, "balance_loss_clip": 1.42166448, "balance_loss_mlp": 1.02213049, "epoch": 0.11459491958514956, "flos": 12977561635200.0, "grad_norm": 2.827277672448864, "language_loss": 0.7868185, "learning_rate": 3.925399944279861e-06, "loss": 0.81422627, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 2.65429688, "router_z_loss_mlp": 0.31762695, "step": 1906, "time_per_iteration": 4.239685535430908 }, { "auxiliary_loss_clip": 0.01641635, "auxiliary_loss_mlp": 0.01062565, "balance_loss_clip": 1.39188266, "balance_loss_mlp": 1.02911472, "epoch": 0.11465504283781752, "flos": 22721612060160.0, "grad_norm": 2.5467313309445743, "language_loss": 0.83686471, "learning_rate": 3.925294530667986e-06, "loss": 0.86390668, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.33422852, "step": 1907, "time_per_iteration": 2.820119619369507 }, { "auxiliary_loss_clip": 0.01638303, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.38808608, "balance_loss_mlp": 1.02393377, "epoch": 0.1147151660904855, "flos": 23407317290880.0, "grad_norm": 2.117392929735174, "language_loss": 0.86423796, "learning_rate": 3.92518904404875e-06, "loss": 0.89116573, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.30517578, "step": 1908, "time_per_iteration": 2.87188982963562 }, { "auxiliary_loss_clip": 0.01374896, "auxiliary_loss_mlp": 0.01014223, "balance_loss_clip": 1.22866607, "balance_loss_mlp": 0.99534017, "epoch": 0.11477528934315347, "flos": 63037905540480.0, "grad_norm": 0.9217337795532026, "language_loss": 0.61163139, "learning_rate": 3.925083484426153e-06, "loss": 0.6355226, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.18847656, "step": 1909, "time_per_iteration": 3.0713250637054443 }, { "auxiliary_loss_clip": 0.01634338, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.38582718, "balance_loss_mlp": 1.01866758, "epoch": 0.11483541259582143, "flos": 16334112015360.0, "grad_norm": 1.8785937704739248, "language_loss": 0.80510473, "learning_rate": 3.924977851804197e-06, "loss": 0.83195698, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 2.48828125, "router_z_loss_mlp": 0.32177734, "step": 1910, "time_per_iteration": 2.843212604522705 }, { "auxiliary_loss_clip": 0.01645012, "auxiliary_loss_mlp": 0.01056312, "balance_loss_clip": 1.39531612, "balance_loss_mlp": 1.02298164, "epoch": 0.1148955358484894, "flos": 21590506586880.0, "grad_norm": 2.0611640541525524, "language_loss": 0.77724105, "learning_rate": 3.9248721461868875e-06, "loss": 0.80425429, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.33325195, "step": 1911, "time_per_iteration": 2.8195383548736572 }, { "auxiliary_loss_clip": 0.01614505, "auxiliary_loss_mlp": 0.01046438, "balance_loss_clip": 1.37500489, "balance_loss_mlp": 1.01398945, "epoch": 0.11495565910115738, "flos": 27685416637440.0, "grad_norm": 1.6856586141314842, "language_loss": 0.80012608, "learning_rate": 3.9247663675782336e-06, "loss": 0.8267355, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.32470703, "step": 1912, "time_per_iteration": 2.9688098430633545 }, { "auxiliary_loss_clip": 0.01631711, "auxiliary_loss_mlp": 0.01052105, "balance_loss_clip": 1.38720512, "balance_loss_mlp": 1.02118254, "epoch": 0.11501578235382534, "flos": 20642371660800.0, "grad_norm": 1.870224539798296, "language_loss": 0.79869175, "learning_rate": 3.924660515982246e-06, "loss": 0.82552993, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30957031, "step": 1913, "time_per_iteration": 2.8613626956939697 }, { "auxiliary_loss_clip": 0.01638379, "auxiliary_loss_mlp": 0.01050411, "balance_loss_clip": 1.39150214, "balance_loss_mlp": 1.01977396, "epoch": 0.1150759056064933, "flos": 19838676695040.0, "grad_norm": 1.7774886424880454, "language_loss": 0.71086192, "learning_rate": 3.924554591402939e-06, "loss": 0.73774987, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.3059082, "step": 1914, "time_per_iteration": 2.8420538902282715 }, { "auxiliary_loss_clip": 0.01392863, "auxiliary_loss_mlp": 0.0105075, "balance_loss_clip": 1.24118495, "balance_loss_mlp": 1.03139079, "epoch": 0.11513602885916129, "flos": 70081104234240.0, "grad_norm": 0.7720932567460119, "language_loss": 0.61094427, "learning_rate": 3.92444859384433e-06, "loss": 0.63538039, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.19335938, "step": 1915, "time_per_iteration": 3.4700827598571777 }, { "auxiliary_loss_clip": 0.01636639, "auxiliary_loss_mlp": 0.01056967, "balance_loss_clip": 1.39040875, "balance_loss_mlp": 1.02540016, "epoch": 0.11519615211182925, "flos": 15750063146880.0, "grad_norm": 2.4507270409296984, "language_loss": 0.95198739, "learning_rate": 3.924342523310436e-06, "loss": 0.97892344, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.31518555, "step": 1916, "time_per_iteration": 2.8086276054382324 }, { "auxiliary_loss_clip": 0.01630103, "auxiliary_loss_mlp": 0.01059519, "balance_loss_clip": 1.38365626, "balance_loss_mlp": 1.02974057, "epoch": 0.11525627536449722, "flos": 20677327908480.0, "grad_norm": 1.958313030732003, "language_loss": 0.73675263, "learning_rate": 3.9242363798052806e-06, "loss": 0.76364887, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.29760742, "step": 1917, "time_per_iteration": 2.8850314617156982 }, { "auxiliary_loss_clip": 0.01623877, "auxiliary_loss_mlp": 0.01051912, "balance_loss_clip": 1.38295829, "balance_loss_mlp": 1.01989198, "epoch": 0.1153163986171652, "flos": 20313015626880.0, "grad_norm": 2.1549398954251595, "language_loss": 0.76779997, "learning_rate": 3.92413016333289e-06, "loss": 0.79455787, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.3203125, "step": 1918, "time_per_iteration": 2.9272842407226562 }, { "auxiliary_loss_clip": 0.01633424, "auxiliary_loss_mlp": 0.01058415, "balance_loss_clip": 1.38499808, "balance_loss_mlp": 1.0262289, "epoch": 0.11537652186983316, "flos": 17648233280640.0, "grad_norm": 2.1126943363991524, "language_loss": 0.8881954, "learning_rate": 3.92402387389729e-06, "loss": 0.91511381, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.3215332, "step": 1919, "time_per_iteration": 2.814169406890869 }, { "auxiliary_loss_clip": 0.01626699, "auxiliary_loss_mlp": 0.01060252, "balance_loss_clip": 1.38417029, "balance_loss_mlp": 1.02820837, "epoch": 0.11543664512250112, "flos": 21079311125760.0, "grad_norm": 1.8369416067190785, "language_loss": 0.87410855, "learning_rate": 3.923917511502512e-06, "loss": 0.90097803, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.3203125, "step": 1920, "time_per_iteration": 2.8678860664367676 }, { "auxiliary_loss_clip": 0.0161576, "auxiliary_loss_mlp": 0.01051158, "balance_loss_clip": 1.37714303, "balance_loss_mlp": 1.0200212, "epoch": 0.11549676837516909, "flos": 22757835162240.0, "grad_norm": 1.9120480231546784, "language_loss": 0.81114936, "learning_rate": 3.923811076152589e-06, "loss": 0.8378185, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.3112793, "step": 1921, "time_per_iteration": 2.800992012023926 }, { "auxiliary_loss_clip": 0.01635643, "auxiliary_loss_mlp": 0.01076258, "balance_loss_clip": 1.38180542, "balance_loss_mlp": 1.0415678, "epoch": 0.11555689162783707, "flos": 19177566652800.0, "grad_norm": 1.7523872475809967, "language_loss": 0.7932651, "learning_rate": 3.923704567851557e-06, "loss": 0.82038414, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.34692383, "step": 1922, "time_per_iteration": 2.879377841949463 }, { "auxiliary_loss_clip": 0.01618249, "auxiliary_loss_mlp": 0.01067342, "balance_loss_clip": 1.37285495, "balance_loss_mlp": 1.03606153, "epoch": 0.11561701488050503, "flos": 24582880419840.0, "grad_norm": 1.7946493062269744, "language_loss": 0.85143238, "learning_rate": 3.923597986603456e-06, "loss": 0.87828827, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.31298828, "step": 1923, "time_per_iteration": 2.903189182281494 }, { "auxiliary_loss_clip": 0.0162974, "auxiliary_loss_mlp": 0.01065785, "balance_loss_clip": 1.38401794, "balance_loss_mlp": 1.03254962, "epoch": 0.115677138133173, "flos": 17101448144640.0, "grad_norm": 1.971587017038377, "language_loss": 0.82354975, "learning_rate": 3.9234913324123264e-06, "loss": 0.85050499, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.33227539, "step": 1924, "time_per_iteration": 2.8708131313323975 }, { "auxiliary_loss_clip": 0.01401595, "auxiliary_loss_mlp": 0.01050279, "balance_loss_clip": 1.2491293, "balance_loss_mlp": 1.02805841, "epoch": 0.11573726138584098, "flos": 62735153448960.0, "grad_norm": 0.969200324207284, "language_loss": 0.61336768, "learning_rate": 3.923384605282212e-06, "loss": 0.63788646, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.22265625, "step": 1925, "time_per_iteration": 3.368759870529175 }, { "auxiliary_loss_clip": 0.01616445, "auxiliary_loss_mlp": 0.01070957, "balance_loss_clip": 1.37339354, "balance_loss_mlp": 1.03920007, "epoch": 0.11579738463850894, "flos": 22611268696320.0, "grad_norm": 1.7578155488573342, "language_loss": 0.76553285, "learning_rate": 3.923277805217161e-06, "loss": 0.7924068, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.31738281, "step": 1926, "time_per_iteration": 2.8073651790618896 }, { "auxiliary_loss_clip": 0.01635733, "auxiliary_loss_mlp": 0.01061601, "balance_loss_clip": 1.38034832, "balance_loss_mlp": 1.02970099, "epoch": 0.11585750789117691, "flos": 21736213401600.0, "grad_norm": 3.1362773429820536, "language_loss": 0.74668348, "learning_rate": 3.923170932221222e-06, "loss": 0.77365685, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.31884766, "step": 1927, "time_per_iteration": 2.842369556427002 }, { "auxiliary_loss_clip": 0.01620441, "auxiliary_loss_mlp": 0.01056061, "balance_loss_clip": 1.3727622, "balance_loss_mlp": 1.02516246, "epoch": 0.11591763114384489, "flos": 26298984902400.0, "grad_norm": 2.1250443331058584, "language_loss": 0.87823772, "learning_rate": 3.92306398629845e-06, "loss": 0.90500271, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.30883789, "step": 1928, "time_per_iteration": 2.8482730388641357 }, { "auxiliary_loss_clip": 0.0162706, "auxiliary_loss_mlp": 0.01055206, "balance_loss_clip": 1.37674904, "balance_loss_mlp": 1.02561784, "epoch": 0.11597775439651285, "flos": 23010175267200.0, "grad_norm": 1.6132691148975156, "language_loss": 0.79100442, "learning_rate": 3.922956967452898e-06, "loss": 0.81782705, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.29589844, "step": 1929, "time_per_iteration": 2.8569881916046143 }, { "auxiliary_loss_clip": 0.01618724, "auxiliary_loss_mlp": 0.010582, "balance_loss_clip": 1.37592947, "balance_loss_mlp": 1.02794468, "epoch": 0.11603787764918082, "flos": 31954331289600.0, "grad_norm": 1.7771524798174432, "language_loss": 0.78176236, "learning_rate": 3.922849875688626e-06, "loss": 0.80853164, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.30249023, "step": 1930, "time_per_iteration": 2.947993278503418 }, { "auxiliary_loss_clip": 0.01614923, "auxiliary_loss_mlp": 0.0105988, "balance_loss_clip": 1.37149298, "balance_loss_mlp": 1.02933824, "epoch": 0.1160980009018488, "flos": 22281414969600.0, "grad_norm": 5.600405423930789, "language_loss": 0.73321497, "learning_rate": 3.922742711009693e-06, "loss": 0.75996298, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.30517578, "step": 1931, "time_per_iteration": 2.851008653640747 }, { "auxiliary_loss_clip": 0.01626813, "auxiliary_loss_mlp": 0.01057477, "balance_loss_clip": 1.38073719, "balance_loss_mlp": 1.02593446, "epoch": 0.11615812415451676, "flos": 22793832040320.0, "grad_norm": 1.5306770750771606, "language_loss": 0.83253825, "learning_rate": 3.922635473420164e-06, "loss": 0.85938114, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.31542969, "step": 1932, "time_per_iteration": 4.297504425048828 }, { "auxiliary_loss_clip": 0.01399413, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.2548461, "balance_loss_mlp": 1.00284922, "epoch": 0.11621824740718473, "flos": 67176179896320.0, "grad_norm": 0.7819516798766193, "language_loss": 0.6104387, "learning_rate": 3.922528162924105e-06, "loss": 0.63468349, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.22265625, "step": 1933, "time_per_iteration": 3.234625816345215 }, { "auxiliary_loss_clip": 0.01630379, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.37848091, "balance_loss_mlp": 1.02162623, "epoch": 0.11627837065985269, "flos": 20385597565440.0, "grad_norm": 2.0536543416230963, "language_loss": 0.87242043, "learning_rate": 3.922420779525586e-06, "loss": 0.89925456, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.3137207, "step": 1934, "time_per_iteration": 2.831000804901123 }, { "auxiliary_loss_clip": 0.01649902, "auxiliary_loss_mlp": 0.01057245, "balance_loss_clip": 1.39246106, "balance_loss_mlp": 1.02400959, "epoch": 0.11633849391252067, "flos": 21735806198400.0, "grad_norm": 7.0849099866476175, "language_loss": 0.68154275, "learning_rate": 3.9223133232286776e-06, "loss": 0.70861423, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 2.57421875, "router_z_loss_mlp": 0.33251953, "step": 1935, "time_per_iteration": 2.8797683715820312 }, { "auxiliary_loss_clip": 0.01628501, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.38033128, "balance_loss_mlp": 1.0234828, "epoch": 0.11639861716518864, "flos": 18814159267200.0, "grad_norm": 2.3927861703147424, "language_loss": 0.77390611, "learning_rate": 3.922205794037456e-06, "loss": 0.80073851, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.3125, "step": 1936, "time_per_iteration": 2.849095582962036 }, { "auxiliary_loss_clip": 0.01638754, "auxiliary_loss_mlp": 0.01051576, "balance_loss_clip": 1.38656974, "balance_loss_mlp": 1.01691031, "epoch": 0.1164587404178566, "flos": 21224836961280.0, "grad_norm": 1.9258805335079625, "language_loss": 0.85200059, "learning_rate": 3.922098191955998e-06, "loss": 0.87890387, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 2.52148438, "router_z_loss_mlp": 0.34667969, "step": 1937, "time_per_iteration": 4.372204065322876 }, { "auxiliary_loss_clip": 0.01614733, "auxiliary_loss_mlp": 0.01049373, "balance_loss_clip": 1.37230146, "balance_loss_mlp": 1.01771057, "epoch": 0.11651886367052458, "flos": 27830128066560.0, "grad_norm": 1.7918743152486405, "language_loss": 0.77388954, "learning_rate": 3.921990516988384e-06, "loss": 0.80053061, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.31665039, "step": 1938, "time_per_iteration": 2.958155632019043 }, { "auxiliary_loss_clip": 0.01636528, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.38695073, "balance_loss_mlp": 1.01886511, "epoch": 0.11657898692319255, "flos": 22898881762560.0, "grad_norm": 1.6745859729483639, "language_loss": 0.80420578, "learning_rate": 3.921882769138696e-06, "loss": 0.83108783, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 2.49609375, "router_z_loss_mlp": 0.328125, "step": 1939, "time_per_iteration": 2.8416757583618164 }, { "auxiliary_loss_clip": 0.01630732, "auxiliary_loss_mlp": 0.010587, "balance_loss_clip": 1.38280058, "balance_loss_mlp": 1.02510643, "epoch": 0.11663911017586051, "flos": 24326468282880.0, "grad_norm": 2.9626539954795392, "language_loss": 0.87410265, "learning_rate": 3.9217749484110215e-06, "loss": 0.90099698, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.3359375, "step": 1940, "time_per_iteration": 4.378810882568359 }, { "auxiliary_loss_clip": 0.01611851, "auxiliary_loss_mlp": 0.0104899, "balance_loss_clip": 1.37223339, "balance_loss_mlp": 1.01639867, "epoch": 0.11669923342852849, "flos": 42355464969600.0, "grad_norm": 6.858556919654926, "language_loss": 0.76876587, "learning_rate": 3.921667054809449e-06, "loss": 0.79537427, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.32592773, "step": 1941, "time_per_iteration": 4.345696210861206 }, { "auxiliary_loss_clip": 0.01628025, "auxiliary_loss_mlp": 0.01063419, "balance_loss_clip": 1.38198423, "balance_loss_mlp": 1.02791858, "epoch": 0.11675935668119646, "flos": 14649072727680.0, "grad_norm": 2.100945004161073, "language_loss": 0.90263677, "learning_rate": 3.921559088338068e-06, "loss": 0.92955124, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.35498047, "step": 1942, "time_per_iteration": 2.85488224029541 }, { "auxiliary_loss_clip": 0.01616599, "auxiliary_loss_mlp": 0.01051221, "balance_loss_clip": 1.37515855, "balance_loss_mlp": 1.01943994, "epoch": 0.11681947993386442, "flos": 35130037628160.0, "grad_norm": 1.8343489287506176, "language_loss": 0.69120634, "learning_rate": 3.921451049000975e-06, "loss": 0.71788454, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.31787109, "step": 1943, "time_per_iteration": 2.9278244972229004 }, { "auxiliary_loss_clip": 0.01618483, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.37522507, "balance_loss_mlp": 1.01732457, "epoch": 0.11687960318653239, "flos": 38997647735040.0, "grad_norm": 1.8250316012368597, "language_loss": 0.70805049, "learning_rate": 3.921342936802265e-06, "loss": 0.73474282, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.33447266, "step": 1944, "time_per_iteration": 2.977511167526245 }, { "auxiliary_loss_clip": 0.01622775, "auxiliary_loss_mlp": 0.01054882, "balance_loss_clip": 1.38199544, "balance_loss_mlp": 1.02119303, "epoch": 0.11693972643920036, "flos": 26006168684160.0, "grad_norm": 1.6095932837683644, "language_loss": 0.8338933, "learning_rate": 3.921234751746038e-06, "loss": 0.86066985, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.33691406, "step": 1945, "time_per_iteration": 2.8427557945251465 }, { "auxiliary_loss_clip": 0.01614885, "auxiliary_loss_mlp": 0.01051464, "balance_loss_clip": 1.37029243, "balance_loss_mlp": 1.01841879, "epoch": 0.11699984969186833, "flos": 27283659644160.0, "grad_norm": 2.017841983843651, "language_loss": 0.77471745, "learning_rate": 3.9211264938363975e-06, "loss": 0.80138093, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.33056641, "step": 1946, "time_per_iteration": 2.902353525161743 }, { "auxiliary_loss_clip": 0.01604489, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.36244774, "balance_loss_mlp": 1.01801372, "epoch": 0.1170599729445363, "flos": 15276176663040.0, "grad_norm": 2.0398332160398787, "language_loss": 0.69937718, "learning_rate": 3.921018163077448e-06, "loss": 0.72592884, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.32666016, "step": 1947, "time_per_iteration": 2.84840726852417 }, { "auxiliary_loss_clip": 0.01620663, "auxiliary_loss_mlp": 0.01051643, "balance_loss_clip": 1.37659955, "balance_loss_mlp": 1.01897943, "epoch": 0.11712009619720427, "flos": 17173079942400.0, "grad_norm": 3.0739033906511493, "language_loss": 0.85927612, "learning_rate": 3.920909759473295e-06, "loss": 0.8859992, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.3269043, "step": 1948, "time_per_iteration": 2.806398391723633 }, { "auxiliary_loss_clip": 0.01401078, "auxiliary_loss_mlp": 0.01054135, "balance_loss_clip": 1.25390148, "balance_loss_mlp": 1.02800405, "epoch": 0.11718021944987224, "flos": 70972158205440.0, "grad_norm": 0.8857654417852417, "language_loss": 0.65239882, "learning_rate": 3.920801283028054e-06, "loss": 0.67695093, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.26171875, "step": 1949, "time_per_iteration": 3.400667190551758 }, { "auxiliary_loss_clip": 0.01625736, "auxiliary_loss_mlp": 0.01059015, "balance_loss_clip": 1.3822639, "balance_loss_mlp": 1.02575517, "epoch": 0.1172403427025402, "flos": 27464775154560.0, "grad_norm": 1.5962900295590228, "language_loss": 0.7364347, "learning_rate": 3.920692733745835e-06, "loss": 0.76328224, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.33251953, "step": 1950, "time_per_iteration": 2.89705491065979 }, { "auxiliary_loss_clip": 0.01622045, "auxiliary_loss_mlp": 0.01052963, "balance_loss_clip": 1.373564, "balance_loss_mlp": 1.01953697, "epoch": 0.11730046595520818, "flos": 15677028760320.0, "grad_norm": 8.98494982383053, "language_loss": 0.78145981, "learning_rate": 3.920584111630755e-06, "loss": 0.8082099, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.33447266, "step": 1951, "time_per_iteration": 2.841495990753174 }, { "auxiliary_loss_clip": 0.01635878, "auxiliary_loss_mlp": 0.01053754, "balance_loss_clip": 1.38794291, "balance_loss_mlp": 1.02218747, "epoch": 0.11736058920787615, "flos": 25641268220160.0, "grad_norm": 4.680276844814568, "language_loss": 0.77217746, "learning_rate": 3.9204754166869325e-06, "loss": 0.79907382, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.31518555, "step": 1952, "time_per_iteration": 2.912318468093872 }, { "auxiliary_loss_clip": 0.01638673, "auxiliary_loss_mlp": 0.0105861, "balance_loss_clip": 1.38808548, "balance_loss_mlp": 1.02659059, "epoch": 0.11742071246054411, "flos": 21444302079360.0, "grad_norm": 2.1966172253268934, "language_loss": 0.73236883, "learning_rate": 3.920366648918491e-06, "loss": 0.7593416, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.3203125, "step": 1953, "time_per_iteration": 2.817875385284424 }, { "auxiliary_loss_clip": 0.01636054, "auxiliary_loss_mlp": 0.01056592, "balance_loss_clip": 1.38251925, "balance_loss_mlp": 1.02237892, "epoch": 0.11748083571321208, "flos": 16006430039040.0, "grad_norm": 2.9283380922772073, "language_loss": 0.81883478, "learning_rate": 3.920257808329552e-06, "loss": 0.84576124, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.3425293, "step": 1954, "time_per_iteration": 2.8646225929260254 }, { "auxiliary_loss_clip": 0.01637084, "auxiliary_loss_mlp": 0.0105813, "balance_loss_clip": 1.38594925, "balance_loss_mlp": 1.02401268, "epoch": 0.11754095896588006, "flos": 16188631424640.0, "grad_norm": 1.8589066641785723, "language_loss": 0.86869931, "learning_rate": 3.920148894924246e-06, "loss": 0.89565146, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.34106445, "step": 1955, "time_per_iteration": 2.7937235832214355 }, { "auxiliary_loss_clip": 0.0164272, "auxiliary_loss_mlp": 0.01057672, "balance_loss_clip": 1.39204121, "balance_loss_mlp": 1.02467513, "epoch": 0.11760108221854802, "flos": 13269518202240.0, "grad_norm": 2.2919183555517275, "language_loss": 0.78554517, "learning_rate": 3.920039908706701e-06, "loss": 0.81254905, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.32983398, "step": 1956, "time_per_iteration": 2.823009729385376 }, { "auxiliary_loss_clip": 0.01612334, "auxiliary_loss_mlp": 0.01057773, "balance_loss_clip": 1.36899126, "balance_loss_mlp": 1.02441812, "epoch": 0.11766120547121599, "flos": 24509076871680.0, "grad_norm": 2.379460927913242, "language_loss": 0.81788468, "learning_rate": 3.91993084968105e-06, "loss": 0.84458572, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.33325195, "step": 1957, "time_per_iteration": 2.8674838542938232 }, { "auxiliary_loss_clip": 0.01639825, "auxiliary_loss_mlp": 0.01055665, "balance_loss_clip": 1.38950109, "balance_loss_mlp": 1.02457559, "epoch": 0.11772132872388397, "flos": 17792627996160.0, "grad_norm": 2.0853427656977224, "language_loss": 0.79771483, "learning_rate": 3.919821717851428e-06, "loss": 0.82466972, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.31079102, "step": 1958, "time_per_iteration": 2.8826892375946045 }, { "auxiliary_loss_clip": 0.0163205, "auxiliary_loss_mlp": 0.01060318, "balance_loss_clip": 1.38215148, "balance_loss_mlp": 1.02393508, "epoch": 0.11778145197655193, "flos": 13222843551360.0, "grad_norm": 1.6573814230741817, "language_loss": 0.78356433, "learning_rate": 3.919712513221976e-06, "loss": 0.81048799, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.36425781, "step": 1959, "time_per_iteration": 2.8553364276885986 }, { "auxiliary_loss_clip": 0.01634177, "auxiliary_loss_mlp": 0.01052799, "balance_loss_clip": 1.38609338, "balance_loss_mlp": 1.0194447, "epoch": 0.1178415752292199, "flos": 20239890750720.0, "grad_norm": 1.9989154594490788, "language_loss": 0.71348727, "learning_rate": 3.919603235796832e-06, "loss": 0.74035698, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.33325195, "step": 1960, "time_per_iteration": 2.8179805278778076 }, { "auxiliary_loss_clip": 0.01651076, "auxiliary_loss_mlp": 0.01060945, "balance_loss_clip": 1.39600825, "balance_loss_mlp": 1.02911603, "epoch": 0.11790169848188788, "flos": 13047338396160.0, "grad_norm": 2.307705689389334, "language_loss": 0.82922328, "learning_rate": 3.9194938855801406e-06, "loss": 0.85634351, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.31835938, "step": 1961, "time_per_iteration": 2.8357348442077637 }, { "auxiliary_loss_clip": 0.01605067, "auxiliary_loss_mlp": 0.01052452, "balance_loss_clip": 1.36577058, "balance_loss_mlp": 1.02055192, "epoch": 0.11796182173455584, "flos": 22273949577600.0, "grad_norm": 1.8368248602544195, "language_loss": 0.93396556, "learning_rate": 3.919384462576049e-06, "loss": 0.96054077, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.3190918, "step": 1962, "time_per_iteration": 2.8189504146575928 }, { "auxiliary_loss_clip": 0.01638184, "auxiliary_loss_mlp": 0.01056928, "balance_loss_clip": 1.38833714, "balance_loss_mlp": 1.02347755, "epoch": 0.1180219449872238, "flos": 10642361546880.0, "grad_norm": 2.2350262647410433, "language_loss": 0.8869524, "learning_rate": 3.919274966788707e-06, "loss": 0.91390359, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.33422852, "step": 1963, "time_per_iteration": 2.824310302734375 }, { "auxiliary_loss_clip": 0.01649931, "auxiliary_loss_mlp": 0.01056923, "balance_loss_clip": 1.39766192, "balance_loss_mlp": 1.02564216, "epoch": 0.11808206823989177, "flos": 20933423331840.0, "grad_norm": 1.91263783886209, "language_loss": 0.85249692, "learning_rate": 3.919165398222265e-06, "loss": 0.87956548, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.3125, "step": 1964, "time_per_iteration": 2.81848406791687 }, { "auxiliary_loss_clip": 0.01646597, "auxiliary_loss_mlp": 0.01054888, "balance_loss_clip": 1.39873147, "balance_loss_mlp": 1.02358377, "epoch": 0.11814219149255975, "flos": 20787671272320.0, "grad_norm": 1.7930537874067154, "language_loss": 0.84301108, "learning_rate": 3.919055756880879e-06, "loss": 0.87002587, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.31323242, "step": 1965, "time_per_iteration": 2.8481717109680176 }, { "auxiliary_loss_clip": 0.01633826, "auxiliary_loss_mlp": 0.01059943, "balance_loss_clip": 1.38282156, "balance_loss_mlp": 1.02799535, "epoch": 0.11820231474522772, "flos": 48776473428480.0, "grad_norm": 1.8864015839202002, "language_loss": 0.75510311, "learning_rate": 3.918946042768707e-06, "loss": 0.78204083, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.31933594, "step": 1966, "time_per_iteration": 3.1544883251190186 }, { "auxiliary_loss_clip": 0.01639138, "auxiliary_loss_mlp": 0.01060867, "balance_loss_clip": 1.38861752, "balance_loss_mlp": 1.02856112, "epoch": 0.11826243799789568, "flos": 16699238703360.0, "grad_norm": 2.4480582315859434, "language_loss": 0.74822688, "learning_rate": 3.918836255889908e-06, "loss": 0.77522695, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.32324219, "step": 1967, "time_per_iteration": 4.176625728607178 }, { "auxiliary_loss_clip": 0.01621732, "auxiliary_loss_mlp": 0.01050274, "balance_loss_clip": 1.37471867, "balance_loss_mlp": 1.01853991, "epoch": 0.11832256125056366, "flos": 16918658576640.0, "grad_norm": 2.718382355311837, "language_loss": 0.90111315, "learning_rate": 3.9187263962486456e-06, "loss": 0.9278332, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.31713867, "step": 1968, "time_per_iteration": 2.834907054901123 }, { "auxiliary_loss_clip": 0.01623874, "auxiliary_loss_mlp": 0.01050778, "balance_loss_clip": 1.37882495, "balance_loss_mlp": 1.01961637, "epoch": 0.11838268450323162, "flos": 22830688569600.0, "grad_norm": 1.8599788077303476, "language_loss": 0.6822868, "learning_rate": 3.918616463849087e-06, "loss": 0.70903331, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.31201172, "step": 1969, "time_per_iteration": 2.882693290710449 }, { "auxiliary_loss_clip": 0.01630633, "auxiliary_loss_mlp": 0.01053524, "balance_loss_clip": 1.38541651, "balance_loss_mlp": 1.02119482, "epoch": 0.11844280775589959, "flos": 33558554085120.0, "grad_norm": 2.9259227076254173, "language_loss": 0.82404208, "learning_rate": 3.918506458695399e-06, "loss": 0.8508836, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.32299805, "step": 1970, "time_per_iteration": 2.9069132804870605 }, { "auxiliary_loss_clip": 0.01391961, "auxiliary_loss_mlp": 0.01124374, "balance_loss_clip": 1.2440691, "balance_loss_mlp": 1.10377443, "epoch": 0.11850293100856757, "flos": 66382845989760.0, "grad_norm": 0.8170064597394192, "language_loss": 0.66161942, "learning_rate": 3.918396380791754e-06, "loss": 0.68678284, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 0.20605469, "step": 1971, "time_per_iteration": 3.4016199111938477 }, { "auxiliary_loss_clip": 0.01647161, "auxiliary_loss_mlp": 0.01054808, "balance_loss_clip": 1.39633191, "balance_loss_mlp": 1.0224545, "epoch": 0.11856305426123553, "flos": 24691775950080.0, "grad_norm": 3.2765403030292495, "language_loss": 0.80649674, "learning_rate": 3.918286230142327e-06, "loss": 0.83351636, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.32373047, "step": 1972, "time_per_iteration": 2.84053897857666 }, { "auxiliary_loss_clip": 0.01625499, "auxiliary_loss_mlp": 0.01054747, "balance_loss_clip": 1.38142276, "balance_loss_mlp": 1.02289391, "epoch": 0.1186231775139035, "flos": 24290833363200.0, "grad_norm": 5.542512978830529, "language_loss": 0.74306273, "learning_rate": 3.918176006751292e-06, "loss": 0.76986516, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.31835938, "step": 1973, "time_per_iteration": 4.197857618331909 }, { "auxiliary_loss_clip": 0.0162081, "auxiliary_loss_mlp": 0.01045268, "balance_loss_clip": 1.37907052, "balance_loss_mlp": 1.01583564, "epoch": 0.11868330076657148, "flos": 21766147476480.0, "grad_norm": 1.6757297008400915, "language_loss": 0.73398846, "learning_rate": 3.918065710622832e-06, "loss": 0.76064926, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.29431152, "step": 1974, "time_per_iteration": 2.897937774658203 }, { "auxiliary_loss_clip": 0.01629611, "auxiliary_loss_mlp": 0.01048469, "balance_loss_clip": 1.3839463, "balance_loss_mlp": 1.01809418, "epoch": 0.11874342401923944, "flos": 17199937370880.0, "grad_norm": 5.865056289949529, "language_loss": 0.79432863, "learning_rate": 3.917955341761128e-06, "loss": 0.82110941, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.3034668, "step": 1975, "time_per_iteration": 5.693732500076294 }, { "auxiliary_loss_clip": 0.01622737, "auxiliary_loss_mlp": 0.01052649, "balance_loss_clip": 1.38310981, "balance_loss_mlp": 1.02139246, "epoch": 0.11880354727190741, "flos": 15237419852160.0, "grad_norm": 7.114599470975693, "language_loss": 0.76920015, "learning_rate": 3.917844900170364e-06, "loss": 0.79595399, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.31225586, "step": 1976, "time_per_iteration": 2.8625831604003906 }, { "auxiliary_loss_clip": 0.01632616, "auxiliary_loss_mlp": 0.01048267, "balance_loss_clip": 1.38764226, "balance_loss_mlp": 1.01648545, "epoch": 0.11886367052457537, "flos": 27321556803840.0, "grad_norm": 1.7946953498693983, "language_loss": 0.76112443, "learning_rate": 3.91773438585473e-06, "loss": 0.78793323, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.31762695, "step": 1977, "time_per_iteration": 2.897394895553589 }, { "auxiliary_loss_clip": 0.01643081, "auxiliary_loss_mlp": 0.01056974, "balance_loss_clip": 1.39340413, "balance_loss_mlp": 1.02648067, "epoch": 0.11892379377724335, "flos": 21808161912960.0, "grad_norm": 2.2135717929174605, "language_loss": 0.75168341, "learning_rate": 3.9176237988184165e-06, "loss": 0.77868396, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.30517578, "step": 1978, "time_per_iteration": 2.8633077144622803 }, { "auxiliary_loss_clip": 0.01631046, "auxiliary_loss_mlp": 0.01054735, "balance_loss_clip": 1.38814652, "balance_loss_mlp": 1.02405024, "epoch": 0.11898391702991132, "flos": 13999047661440.0, "grad_norm": 1.9443553269737481, "language_loss": 0.74152172, "learning_rate": 3.917513139065616e-06, "loss": 0.76837951, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.30688477, "step": 1979, "time_per_iteration": 2.8243184089660645 }, { "auxiliary_loss_clip": 0.01634357, "auxiliary_loss_mlp": 0.01058523, "balance_loss_clip": 1.38964629, "balance_loss_mlp": 1.02779078, "epoch": 0.11904404028257928, "flos": 32247509466240.0, "grad_norm": 1.6316963206858905, "language_loss": 0.99888003, "learning_rate": 3.917402406600525e-06, "loss": 1.02580893, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.30688477, "step": 1980, "time_per_iteration": 2.959196090698242 }, { "auxiliary_loss_clip": 0.01649781, "auxiliary_loss_mlp": 0.01057721, "balance_loss_clip": 1.39881849, "balance_loss_mlp": 1.0243659, "epoch": 0.11910416353524726, "flos": 23596667354880.0, "grad_norm": 1.649537440218896, "language_loss": 0.86876345, "learning_rate": 3.917291601427342e-06, "loss": 0.89583844, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.33349609, "step": 1981, "time_per_iteration": 2.8342201709747314 }, { "auxiliary_loss_clip": 0.0163372, "auxiliary_loss_mlp": 0.01054004, "balance_loss_clip": 1.38758159, "balance_loss_mlp": 1.02191246, "epoch": 0.11916428678791523, "flos": 25342705912320.0, "grad_norm": 1.7026565616149083, "language_loss": 0.86245465, "learning_rate": 3.91718072355027e-06, "loss": 0.88933194, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.32055664, "step": 1982, "time_per_iteration": 2.9098153114318848 }, { "auxiliary_loss_clip": 0.01635011, "auxiliary_loss_mlp": 0.0104871, "balance_loss_clip": 1.39207435, "balance_loss_mlp": 1.01854992, "epoch": 0.11922441004058319, "flos": 19796978972160.0, "grad_norm": 1.8571862911600066, "language_loss": 0.8615942, "learning_rate": 3.917069772973513e-06, "loss": 0.88843137, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.30126953, "step": 1983, "time_per_iteration": 2.839355707168579 }, { "auxiliary_loss_clip": 0.016441, "auxiliary_loss_mlp": 0.01051902, "balance_loss_clip": 1.39477718, "balance_loss_mlp": 1.0200491, "epoch": 0.11928453329325117, "flos": 21544465363200.0, "grad_norm": 2.546085826451954, "language_loss": 0.78706074, "learning_rate": 3.916958749701277e-06, "loss": 0.81402075, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31860352, "step": 1984, "time_per_iteration": 2.7857542037963867 }, { "auxiliary_loss_clip": 0.01646715, "auxiliary_loss_mlp": 0.01052873, "balance_loss_clip": 1.3984108, "balance_loss_mlp": 1.02445304, "epoch": 0.11934465654591914, "flos": 20825115984000.0, "grad_norm": 1.7074737498949688, "language_loss": 0.84247816, "learning_rate": 3.9168476537377745e-06, "loss": 0.86947405, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.28442383, "step": 1985, "time_per_iteration": 2.861555337905884 }, { "auxiliary_loss_clip": 0.01629966, "auxiliary_loss_mlp": 0.01054774, "balance_loss_clip": 1.38519251, "balance_loss_mlp": 1.0246141, "epoch": 0.1194047797985871, "flos": 19069304549760.0, "grad_norm": 2.000799394102718, "language_loss": 0.75199908, "learning_rate": 3.916736485087216e-06, "loss": 0.77884638, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.30151367, "step": 1986, "time_per_iteration": 2.7705626487731934 }, { "auxiliary_loss_clip": 0.01641384, "auxiliary_loss_mlp": 0.01057417, "balance_loss_clip": 1.39512956, "balance_loss_mlp": 1.02692342, "epoch": 0.11946490305125507, "flos": 27200173708800.0, "grad_norm": 1.783154624070136, "language_loss": 0.73950398, "learning_rate": 3.916625243753819e-06, "loss": 0.76649207, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.3046875, "step": 1987, "time_per_iteration": 2.9006142616271973 }, { "auxiliary_loss_clip": 0.0164535, "auxiliary_loss_mlp": 0.01056965, "balance_loss_clip": 1.39704418, "balance_loss_mlp": 1.02587557, "epoch": 0.11952502630392305, "flos": 21150309496320.0, "grad_norm": 3.21165335386776, "language_loss": 0.73814934, "learning_rate": 3.916513929741799e-06, "loss": 0.76517254, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.31103516, "step": 1988, "time_per_iteration": 2.8191041946411133 }, { "auxiliary_loss_clip": 0.01634008, "auxiliary_loss_mlp": 0.01057791, "balance_loss_clip": 1.39064336, "balance_loss_mlp": 1.02710629, "epoch": 0.11958514955659101, "flos": 22133581649280.0, "grad_norm": 1.7524285138265456, "language_loss": 0.81368124, "learning_rate": 3.91640254305538e-06, "loss": 0.84059918, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.30664062, "step": 1989, "time_per_iteration": 2.9077816009521484 }, { "auxiliary_loss_clip": 0.01641137, "auxiliary_loss_mlp": 0.01057548, "balance_loss_clip": 1.39341569, "balance_loss_mlp": 1.02679229, "epoch": 0.11964527280925898, "flos": 17430577954560.0, "grad_norm": 2.18717446910425, "language_loss": 0.77796072, "learning_rate": 3.916291083698784e-06, "loss": 0.80494756, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.30810547, "step": 1990, "time_per_iteration": 2.853093147277832 }, { "auxiliary_loss_clip": 0.01415727, "auxiliary_loss_mlp": 0.01046463, "balance_loss_clip": 1.26876545, "balance_loss_mlp": 1.02462375, "epoch": 0.11970539606192696, "flos": 70709321306880.0, "grad_norm": 0.8554876932194154, "language_loss": 0.55336595, "learning_rate": 3.916179551676238e-06, "loss": 0.57798779, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.21875, "step": 1991, "time_per_iteration": 3.3667848110198975 }, { "auxiliary_loss_clip": 0.01624975, "auxiliary_loss_mlp": 0.01054974, "balance_loss_clip": 1.38706458, "balance_loss_mlp": 1.02507663, "epoch": 0.11976551931459492, "flos": 21224882206080.0, "grad_norm": 2.15023502285844, "language_loss": 0.79853141, "learning_rate": 3.916067946991971e-06, "loss": 0.82533091, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.29882812, "step": 1992, "time_per_iteration": 2.8614344596862793 }, { "auxiliary_loss_clip": 0.01641222, "auxiliary_loss_mlp": 0.01055345, "balance_loss_clip": 1.39407158, "balance_loss_mlp": 1.02504253, "epoch": 0.11982564256726289, "flos": 25999020005760.0, "grad_norm": 1.6427895646402095, "language_loss": 0.79822439, "learning_rate": 3.915956269650216e-06, "loss": 0.82519007, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.30273438, "step": 1993, "time_per_iteration": 2.896195888519287 }, { "auxiliary_loss_clip": 0.01634601, "auxiliary_loss_mlp": 0.01050249, "balance_loss_clip": 1.39066386, "balance_loss_mlp": 1.02116179, "epoch": 0.11988576581993086, "flos": 21660328592640.0, "grad_norm": 1.9588540730003674, "language_loss": 0.83678639, "learning_rate": 3.915844519655208e-06, "loss": 0.86363494, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29101562, "step": 1994, "time_per_iteration": 2.894418478012085 }, { "auxiliary_loss_clip": 0.0163142, "auxiliary_loss_mlp": 0.01056538, "balance_loss_clip": 1.3920325, "balance_loss_mlp": 1.02547216, "epoch": 0.11994588907259883, "flos": 17866160075520.0, "grad_norm": 2.722007902691198, "language_loss": 0.90140629, "learning_rate": 3.915732697011183e-06, "loss": 0.92828584, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.31054688, "step": 1995, "time_per_iteration": 2.8299598693847656 }, { "auxiliary_loss_clip": 0.01631871, "auxiliary_loss_mlp": 0.01049652, "balance_loss_clip": 1.38650286, "balance_loss_mlp": 1.01899135, "epoch": 0.1200060123252668, "flos": 24473577686400.0, "grad_norm": 2.3139917583602885, "language_loss": 0.74968302, "learning_rate": 3.9156208017223825e-06, "loss": 0.7764982, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.30664062, "step": 1996, "time_per_iteration": 2.850292444229126 }, { "auxiliary_loss_clip": 0.01628128, "auxiliary_loss_mlp": 0.01047788, "balance_loss_clip": 1.38765502, "balance_loss_mlp": 1.01667428, "epoch": 0.12006613557793476, "flos": 18740943901440.0, "grad_norm": 1.7770349507597787, "language_loss": 0.88840735, "learning_rate": 3.915508833793048e-06, "loss": 0.91516656, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.31103516, "step": 1997, "time_per_iteration": 2.813540458679199 }, { "auxiliary_loss_clip": 0.0161829, "auxiliary_loss_mlp": 0.01051624, "balance_loss_clip": 1.37703943, "balance_loss_mlp": 1.0202477, "epoch": 0.12012625883060274, "flos": 22276483286400.0, "grad_norm": 1.8488331638574855, "language_loss": 0.79832798, "learning_rate": 3.915396793227428e-06, "loss": 0.82502711, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.31347656, "step": 1998, "time_per_iteration": 2.817448616027832 }, { "auxiliary_loss_clip": 0.01618624, "auxiliary_loss_mlp": 0.01049176, "balance_loss_clip": 1.38106346, "balance_loss_mlp": 1.01882505, "epoch": 0.1201863820832707, "flos": 21768002513280.0, "grad_norm": 1.7128665612678604, "language_loss": 0.74748743, "learning_rate": 3.915284680029769e-06, "loss": 0.77416539, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.30322266, "step": 1999, "time_per_iteration": 2.8867669105529785 }, { "auxiliary_loss_clip": 0.01630135, "auxiliary_loss_mlp": 0.01050208, "balance_loss_clip": 1.38606238, "balance_loss_mlp": 1.02069163, "epoch": 0.12024650533593867, "flos": 21917962339200.0, "grad_norm": 3.608338786650729, "language_loss": 0.77667332, "learning_rate": 3.915172494204323e-06, "loss": 0.80347681, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.29516602, "step": 2000, "time_per_iteration": 2.8472232818603516 }, { "auxiliary_loss_clip": 0.01617985, "auxiliary_loss_mlp": 0.01049975, "balance_loss_clip": 1.37765408, "balance_loss_mlp": 1.01931405, "epoch": 0.12030662858860665, "flos": 21699175893120.0, "grad_norm": 1.4814596738643855, "language_loss": 0.85903108, "learning_rate": 3.915060235755344e-06, "loss": 0.88571066, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.30664062, "step": 2001, "time_per_iteration": 4.317075967788696 }, { "auxiliary_loss_clip": 0.01617357, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.37809515, "balance_loss_mlp": 1.01702738, "epoch": 0.12036675184127461, "flos": 12941067064320.0, "grad_norm": 3.7620045920970493, "language_loss": 0.75965548, "learning_rate": 3.91494790468709e-06, "loss": 0.78629732, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.29785156, "step": 2002, "time_per_iteration": 2.8462488651275635 }, { "auxiliary_loss_clip": 0.01637492, "auxiliary_loss_mlp": 0.0105935, "balance_loss_clip": 1.3893249, "balance_loss_mlp": 1.02804542, "epoch": 0.12042687509394258, "flos": 20861022372480.0, "grad_norm": 2.3809941058326247, "language_loss": 0.80095649, "learning_rate": 3.9148355010038185e-06, "loss": 0.82792497, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.31298828, "step": 2003, "time_per_iteration": 2.826965093612671 }, { "auxiliary_loss_clip": 0.01614765, "auxiliary_loss_mlp": 0.01056053, "balance_loss_clip": 1.37722647, "balance_loss_mlp": 1.02412868, "epoch": 0.12048699834661056, "flos": 23889076369920.0, "grad_norm": 1.8262103782388928, "language_loss": 0.73449266, "learning_rate": 3.914723024709793e-06, "loss": 0.7612009, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.3190918, "step": 2004, "time_per_iteration": 2.8599326610565186 }, { "auxiliary_loss_clip": 0.01637994, "auxiliary_loss_mlp": 0.01060316, "balance_loss_clip": 1.39074266, "balance_loss_mlp": 1.02805781, "epoch": 0.12054712159927852, "flos": 19766049511680.0, "grad_norm": 1.5702345220040477, "language_loss": 0.7960068, "learning_rate": 3.914610475809279e-06, "loss": 0.82298988, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.32250977, "step": 2005, "time_per_iteration": 2.8613545894622803 }, { "auxiliary_loss_clip": 0.01410898, "auxiliary_loss_mlp": 0.01107105, "balance_loss_clip": 1.2707485, "balance_loss_mlp": 1.08393121, "epoch": 0.12060724485194649, "flos": 51697957363200.0, "grad_norm": 1.044777046042512, "language_loss": 0.58150733, "learning_rate": 3.914497854306543e-06, "loss": 0.60668731, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.23144531, "step": 2006, "time_per_iteration": 3.1565887928009033 }, { "auxiliary_loss_clip": 0.01612811, "auxiliary_loss_mlp": 0.01057133, "balance_loss_clip": 1.37726843, "balance_loss_mlp": 1.022753, "epoch": 0.12066736810461445, "flos": 18999980236800.0, "grad_norm": 1.6405931835173404, "language_loss": 0.77916747, "learning_rate": 3.9143851602058575e-06, "loss": 0.8058669, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.34375, "step": 2007, "time_per_iteration": 2.8739397525787354 }, { "auxiliary_loss_clip": 0.01617659, "auxiliary_loss_mlp": 0.01054622, "balance_loss_clip": 1.37417841, "balance_loss_mlp": 1.02279305, "epoch": 0.12072749135728243, "flos": 16480633236480.0, "grad_norm": 5.135065103620375, "language_loss": 0.85593915, "learning_rate": 3.914272393511494e-06, "loss": 0.88266194, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.31811523, "step": 2008, "time_per_iteration": 4.412937164306641 }, { "auxiliary_loss_clip": 0.01607496, "auxiliary_loss_mlp": 0.01061741, "balance_loss_clip": 1.36626005, "balance_loss_mlp": 1.02817178, "epoch": 0.1207876146099504, "flos": 18086122886400.0, "grad_norm": 1.982377361588975, "language_loss": 0.8492341, "learning_rate": 3.91415955422773e-06, "loss": 0.87592643, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.33544922, "step": 2009, "time_per_iteration": 2.8293211460113525 }, { "auxiliary_loss_clip": 0.01594824, "auxiliary_loss_mlp": 0.01055255, "balance_loss_clip": 1.35961223, "balance_loss_mlp": 1.02440405, "epoch": 0.12084773786261836, "flos": 21881558257920.0, "grad_norm": 1.8519385814977776, "language_loss": 0.85227579, "learning_rate": 3.914046642358844e-06, "loss": 0.87877661, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.30859375, "step": 2010, "time_per_iteration": 5.669464826583862 }, { "auxiliary_loss_clip": 0.01608427, "auxiliary_loss_mlp": 0.01059629, "balance_loss_clip": 1.36823368, "balance_loss_mlp": 1.02715611, "epoch": 0.12090786111528634, "flos": 18342263554560.0, "grad_norm": 3.3810644587327103, "language_loss": 0.85478783, "learning_rate": 3.9139336579091174e-06, "loss": 0.88146842, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.32446289, "step": 2011, "time_per_iteration": 2.8427391052246094 }, { "auxiliary_loss_clip": 0.01613522, "auxiliary_loss_mlp": 0.0105387, "balance_loss_clip": 1.3719672, "balance_loss_mlp": 1.02299452, "epoch": 0.1209679843679543, "flos": 21115760451840.0, "grad_norm": 2.5393634901421636, "language_loss": 0.97459733, "learning_rate": 3.913820600882834e-06, "loss": 1.00127125, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.30834961, "step": 2012, "time_per_iteration": 2.8172810077667236 }, { "auxiliary_loss_clip": 0.01595965, "auxiliary_loss_mlp": 0.01050823, "balance_loss_clip": 1.36152864, "balance_loss_mlp": 1.01877987, "epoch": 0.12102810762062227, "flos": 29252782903680.0, "grad_norm": 1.8472118826441335, "language_loss": 0.8209306, "learning_rate": 3.913707471284283e-06, "loss": 0.84739846, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.32006836, "step": 2013, "time_per_iteration": 2.943225860595703 }, { "auxiliary_loss_clip": 0.01616816, "auxiliary_loss_mlp": 0.01057815, "balance_loss_clip": 1.37174797, "balance_loss_mlp": 1.02639174, "epoch": 0.12108823087329025, "flos": 17939556420480.0, "grad_norm": 3.2206260927798245, "language_loss": 0.78378904, "learning_rate": 3.9135942691177515e-06, "loss": 0.81053543, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.31396484, "step": 2014, "time_per_iteration": 2.8425710201263428 }, { "auxiliary_loss_clip": 0.01613316, "auxiliary_loss_mlp": 0.01060558, "balance_loss_clip": 1.37494349, "balance_loss_mlp": 1.03015924, "epoch": 0.12114835412595822, "flos": 22102244985600.0, "grad_norm": 2.7961281310746084, "language_loss": 0.88378966, "learning_rate": 3.913480994387535e-06, "loss": 0.9105283, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.3034668, "step": 2015, "time_per_iteration": 2.8273189067840576 }, { "auxiliary_loss_clip": 0.01594537, "auxiliary_loss_mlp": 0.01056396, "balance_loss_clip": 1.35861802, "balance_loss_mlp": 1.02502012, "epoch": 0.12120847737862618, "flos": 20422001646720.0, "grad_norm": 1.9284370533913628, "language_loss": 0.70881188, "learning_rate": 3.913367647097926e-06, "loss": 0.73532116, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.3137207, "step": 2016, "time_per_iteration": 2.8837335109710693 }, { "auxiliary_loss_clip": 0.01620071, "auxiliary_loss_mlp": 0.01062108, "balance_loss_clip": 1.37646377, "balance_loss_mlp": 1.03068495, "epoch": 0.12126860063129415, "flos": 22319040660480.0, "grad_norm": 2.372933768031466, "language_loss": 0.82272518, "learning_rate": 3.913254227253225e-06, "loss": 0.84954691, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.3145752, "step": 2017, "time_per_iteration": 2.824951410293579 }, { "auxiliary_loss_clip": 0.0161418, "auxiliary_loss_mlp": 0.01066063, "balance_loss_clip": 1.37149048, "balance_loss_mlp": 1.03597403, "epoch": 0.12132872388396213, "flos": 13707272073600.0, "grad_norm": 2.382223348942275, "language_loss": 0.71578526, "learning_rate": 3.913140734857731e-06, "loss": 0.74258775, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.30065918, "step": 2018, "time_per_iteration": 2.850609064102173 }, { "auxiliary_loss_clip": 0.01607876, "auxiliary_loss_mlp": 0.01059474, "balance_loss_clip": 1.36720335, "balance_loss_mlp": 1.02955282, "epoch": 0.12138884713663009, "flos": 26477657193600.0, "grad_norm": 1.6655361279672307, "language_loss": 0.73499447, "learning_rate": 3.91302716991575e-06, "loss": 0.76166797, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29931641, "step": 2019, "time_per_iteration": 2.867161989212036 }, { "auxiliary_loss_clip": 0.01609412, "auxiliary_loss_mlp": 0.01063381, "balance_loss_clip": 1.36775088, "balance_loss_mlp": 1.0324347, "epoch": 0.12144897038929806, "flos": 26153006618880.0, "grad_norm": 2.16267773991476, "language_loss": 0.93590027, "learning_rate": 3.912913532431586e-06, "loss": 0.96262825, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.30957031, "step": 2020, "time_per_iteration": 2.9060051441192627 }, { "auxiliary_loss_clip": 0.01619783, "auxiliary_loss_mlp": 0.01056743, "balance_loss_clip": 1.3786931, "balance_loss_mlp": 1.02694094, "epoch": 0.12150909364196603, "flos": 24728270520960.0, "grad_norm": 1.8179579846040788, "language_loss": 0.7910918, "learning_rate": 3.912799822409549e-06, "loss": 0.81785703, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.29785156, "step": 2021, "time_per_iteration": 2.890885353088379 }, { "auxiliary_loss_clip": 0.0160521, "auxiliary_loss_mlp": 0.01059462, "balance_loss_clip": 1.36697638, "balance_loss_mlp": 1.02880132, "epoch": 0.121569216894634, "flos": 25196591894400.0, "grad_norm": 1.9627962233383696, "language_loss": 0.81516278, "learning_rate": 3.912686039853952e-06, "loss": 0.84180951, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.30664062, "step": 2022, "time_per_iteration": 2.8331868648529053 }, { "auxiliary_loss_clip": 0.01616074, "auxiliary_loss_mlp": 0.01061996, "balance_loss_clip": 1.37334764, "balance_loss_mlp": 1.03164577, "epoch": 0.12162934014730196, "flos": 13452081546240.0, "grad_norm": 1.6948097649401113, "language_loss": 0.85581809, "learning_rate": 3.912572184769108e-06, "loss": 0.88259876, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.3034668, "step": 2023, "time_per_iteration": 2.9060311317443848 }, { "auxiliary_loss_clip": 0.01623903, "auxiliary_loss_mlp": 0.01060799, "balance_loss_clip": 1.37965965, "balance_loss_mlp": 1.03093696, "epoch": 0.12168946339996994, "flos": 16954881678720.0, "grad_norm": 2.8929058913279957, "language_loss": 0.88144171, "learning_rate": 3.912458257159335e-06, "loss": 0.90828872, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.29846191, "step": 2024, "time_per_iteration": 2.8102779388427734 }, { "auxiliary_loss_clip": 0.0160577, "auxiliary_loss_mlp": 0.01058922, "balance_loss_clip": 1.36631131, "balance_loss_mlp": 1.02854729, "epoch": 0.12174958665263791, "flos": 29832624005760.0, "grad_norm": 3.009166264133383, "language_loss": 0.73268396, "learning_rate": 3.912344257028954e-06, "loss": 0.75933087, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.3034668, "step": 2025, "time_per_iteration": 2.883385419845581 }, { "auxiliary_loss_clip": 0.01610539, "auxiliary_loss_mlp": 0.01063539, "balance_loss_clip": 1.37072647, "balance_loss_mlp": 1.0303514, "epoch": 0.12180970990530587, "flos": 24651933264000.0, "grad_norm": 1.583315508506995, "language_loss": 0.7689324, "learning_rate": 3.912230184382286e-06, "loss": 0.79567319, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.33178711, "step": 2026, "time_per_iteration": 2.904548168182373 }, { "auxiliary_loss_clip": 0.01614673, "auxiliary_loss_mlp": 0.01060934, "balance_loss_clip": 1.37188292, "balance_loss_mlp": 1.02951002, "epoch": 0.12186983315797385, "flos": 20531349624960.0, "grad_norm": 2.494158278865927, "language_loss": 0.9006424, "learning_rate": 3.912116039223659e-06, "loss": 0.92739856, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.31396484, "step": 2027, "time_per_iteration": 2.8451380729675293 }, { "auxiliary_loss_clip": 0.01594947, "auxiliary_loss_mlp": 0.01062326, "balance_loss_clip": 1.35983729, "balance_loss_mlp": 1.03156948, "epoch": 0.12192995641064182, "flos": 27829766108160.0, "grad_norm": 1.5757892421732966, "language_loss": 0.76913273, "learning_rate": 3.912001821557399e-06, "loss": 0.79570544, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.30761719, "step": 2028, "time_per_iteration": 2.9301092624664307 }, { "auxiliary_loss_clip": 0.01603348, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.36393785, "balance_loss_mlp": 1.03657544, "epoch": 0.12199007966330978, "flos": 22027038848640.0, "grad_norm": 1.8307123857150265, "language_loss": 0.7841711, "learning_rate": 3.911887531387839e-06, "loss": 0.81089199, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.3215332, "step": 2029, "time_per_iteration": 2.880915403366089 }, { "auxiliary_loss_clip": 0.01611125, "auxiliary_loss_mlp": 0.01065045, "balance_loss_clip": 1.37028217, "balance_loss_mlp": 1.03281116, "epoch": 0.12205020291597775, "flos": 23305751418240.0, "grad_norm": 1.905353357031078, "language_loss": 0.80856109, "learning_rate": 3.911773168719313e-06, "loss": 0.83532274, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.32250977, "step": 2030, "time_per_iteration": 2.9699089527130127 }, { "auxiliary_loss_clip": 0.01590415, "auxiliary_loss_mlp": 0.01062462, "balance_loss_clip": 1.35463321, "balance_loss_mlp": 1.02953613, "epoch": 0.12211032616864573, "flos": 26042844234240.0, "grad_norm": 1.938281287557282, "language_loss": 0.75926119, "learning_rate": 3.911658733556155e-06, "loss": 0.78578997, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.32910156, "step": 2031, "time_per_iteration": 2.8645670413970947 }, { "auxiliary_loss_clip": 0.01611133, "auxiliary_loss_mlp": 0.01057899, "balance_loss_clip": 1.37108469, "balance_loss_mlp": 1.02797771, "epoch": 0.12217044942131369, "flos": 20420237099520.0, "grad_norm": 2.117690868002167, "language_loss": 0.76141101, "learning_rate": 3.911544225902707e-06, "loss": 0.78810132, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.29882812, "step": 2032, "time_per_iteration": 2.869957685470581 }, { "auxiliary_loss_clip": 0.01581308, "auxiliary_loss_mlp": 0.01058047, "balance_loss_clip": 1.34976387, "balance_loss_mlp": 1.02748203, "epoch": 0.12223057267398166, "flos": 22867499854080.0, "grad_norm": 1.8030644261618107, "language_loss": 0.90073138, "learning_rate": 3.911429645763311e-06, "loss": 0.92712498, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.30566406, "step": 2033, "time_per_iteration": 2.9194748401641846 }, { "auxiliary_loss_clip": 0.01613895, "auxiliary_loss_mlp": 0.01057799, "balance_loss_clip": 1.36996901, "balance_loss_mlp": 1.02704275, "epoch": 0.12229069592664964, "flos": 20057146427520.0, "grad_norm": 2.1885423244281026, "language_loss": 0.67713737, "learning_rate": 3.911314993142311e-06, "loss": 0.70385432, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.30786133, "step": 2034, "time_per_iteration": 2.859017848968506 }, { "auxiliary_loss_clip": 0.01592919, "auxiliary_loss_mlp": 0.01060032, "balance_loss_clip": 1.35861027, "balance_loss_mlp": 1.02670121, "epoch": 0.1223508191793176, "flos": 22284672595200.0, "grad_norm": 1.6189858144593374, "language_loss": 0.77124035, "learning_rate": 3.911200268044055e-06, "loss": 0.79776996, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.33325195, "step": 2035, "time_per_iteration": 2.859151601791382 }, { "auxiliary_loss_clip": 0.01619147, "auxiliary_loss_mlp": 0.01061547, "balance_loss_clip": 1.37405109, "balance_loss_mlp": 1.03107738, "epoch": 0.12241094243198557, "flos": 21295337639040.0, "grad_norm": 1.8193557370409685, "language_loss": 0.72322679, "learning_rate": 3.911085470472892e-06, "loss": 0.75003374, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.30444336, "step": 2036, "time_per_iteration": 2.8182883262634277 }, { "auxiliary_loss_clip": 0.015945, "auxiliary_loss_mlp": 0.01065439, "balance_loss_clip": 1.35581923, "balance_loss_mlp": 1.03277564, "epoch": 0.12247106568465355, "flos": 17390825758080.0, "grad_norm": 1.8572462155147782, "language_loss": 0.83913732, "learning_rate": 3.910970600433178e-06, "loss": 0.86573666, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.32641602, "step": 2037, "time_per_iteration": 4.244171619415283 }, { "auxiliary_loss_clip": 0.01611316, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.37069726, "balance_loss_mlp": 1.02428031, "epoch": 0.12253118893732151, "flos": 27054919342080.0, "grad_norm": 2.312552818845248, "language_loss": 0.80960524, "learning_rate": 3.910855657929267e-06, "loss": 0.83627093, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.30957031, "step": 2038, "time_per_iteration": 2.900172710418701 }, { "auxiliary_loss_clip": 0.01413415, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.26804149, "balance_loss_mlp": 1.00178552, "epoch": 0.12259131218998948, "flos": 53887179168000.0, "grad_norm": 0.8594070717713321, "language_loss": 0.58758175, "learning_rate": 3.910740642965518e-06, "loss": 0.61204278, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.30859375, "step": 2039, "time_per_iteration": 3.2369508743286133 }, { "auxiliary_loss_clip": 0.01602628, "auxiliary_loss_mlp": 0.01064672, "balance_loss_clip": 1.36275828, "balance_loss_mlp": 1.0319134, "epoch": 0.12265143544265744, "flos": 17900166182400.0, "grad_norm": 2.629760277663384, "language_loss": 0.82376063, "learning_rate": 3.910625555546292e-06, "loss": 0.85043359, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.32788086, "step": 2040, "time_per_iteration": 2.848524570465088 }, { "auxiliary_loss_clip": 0.01590687, "auxiliary_loss_mlp": 0.01053723, "balance_loss_clip": 1.35659456, "balance_loss_mlp": 1.02280021, "epoch": 0.12271155869532542, "flos": 21810243173760.0, "grad_norm": 2.068668669688981, "language_loss": 0.84131414, "learning_rate": 3.910510395675953e-06, "loss": 0.86775827, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.30932617, "step": 2041, "time_per_iteration": 2.849118709564209 }, { "auxiliary_loss_clip": 0.01633911, "auxiliary_loss_mlp": 0.01054426, "balance_loss_clip": 1.38674068, "balance_loss_mlp": 1.02378941, "epoch": 0.12277168194799339, "flos": 19838314736640.0, "grad_norm": 1.724008152216135, "language_loss": 0.68690473, "learning_rate": 3.9103951633588694e-06, "loss": 0.71378809, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.30615234, "step": 2042, "time_per_iteration": 2.8713154792785645 }, { "auxiliary_loss_clip": 0.01602858, "auxiliary_loss_mlp": 0.01057487, "balance_loss_clip": 1.36425996, "balance_loss_mlp": 1.02563477, "epoch": 0.12283180520066135, "flos": 23231223953280.0, "grad_norm": 1.7881255729665175, "language_loss": 0.82108468, "learning_rate": 3.910279858599409e-06, "loss": 0.84768808, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.31860352, "step": 2043, "time_per_iteration": 4.252284526824951 }, { "auxiliary_loss_clip": 0.01612123, "auxiliary_loss_mlp": 0.01052437, "balance_loss_clip": 1.3688798, "balance_loss_mlp": 1.02137101, "epoch": 0.12289192845332933, "flos": 18597770795520.0, "grad_norm": 1.8349287941531383, "language_loss": 0.81947672, "learning_rate": 3.910164481401946e-06, "loss": 0.84612226, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.31079102, "step": 2044, "time_per_iteration": 2.839778423309326 }, { "auxiliary_loss_clip": 0.01598773, "auxiliary_loss_mlp": 0.01056047, "balance_loss_clip": 1.35961199, "balance_loss_mlp": 1.02555346, "epoch": 0.1229520517059973, "flos": 25778966705280.0, "grad_norm": 1.9783434856162228, "language_loss": 0.7890442, "learning_rate": 3.910049031770853e-06, "loss": 0.81559241, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.3046875, "step": 2045, "time_per_iteration": 4.430351495742798 }, { "auxiliary_loss_clip": 0.01603623, "auxiliary_loss_mlp": 0.01055205, "balance_loss_clip": 1.36371553, "balance_loss_mlp": 1.02232718, "epoch": 0.12301217495866526, "flos": 20897064495360.0, "grad_norm": 2.1510176839202138, "language_loss": 0.69105494, "learning_rate": 3.90993350971051e-06, "loss": 0.7176432, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.32861328, "step": 2046, "time_per_iteration": 2.842721700668335 }, { "auxiliary_loss_clip": 0.01603081, "auxiliary_loss_mlp": 0.01054605, "balance_loss_clip": 1.36424696, "balance_loss_mlp": 1.02301407, "epoch": 0.12307229821133324, "flos": 22388003015040.0, "grad_norm": 2.3817370627726704, "language_loss": 0.74176931, "learning_rate": 3.909817915225297e-06, "loss": 0.76834619, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.31616211, "step": 2047, "time_per_iteration": 2.860633611679077 }, { "auxiliary_loss_clip": 0.01618564, "auxiliary_loss_mlp": 0.01057989, "balance_loss_clip": 1.37717104, "balance_loss_mlp": 1.02615976, "epoch": 0.1231324214640012, "flos": 23377745174400.0, "grad_norm": 2.4750048746598057, "language_loss": 0.77941227, "learning_rate": 3.909702248319597e-06, "loss": 0.80617785, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.31835938, "step": 2048, "time_per_iteration": 2.8816163539886475 }, { "auxiliary_loss_clip": 0.01598136, "auxiliary_loss_mlp": 0.01057161, "balance_loss_clip": 1.3599174, "balance_loss_mlp": 1.02738237, "epoch": 0.12319254471666917, "flos": 23777194682880.0, "grad_norm": 1.9893181066791714, "language_loss": 0.86376965, "learning_rate": 3.909586508997797e-06, "loss": 0.89032257, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29785156, "step": 2049, "time_per_iteration": 2.9355108737945557 }, { "auxiliary_loss_clip": 0.01614375, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 1.37123537, "balance_loss_mlp": 1.02166009, "epoch": 0.12325266796933713, "flos": 23560625232000.0, "grad_norm": 1.9791180563582706, "language_loss": 0.77364701, "learning_rate": 3.909470697264285e-06, "loss": 0.80032045, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.31298828, "step": 2050, "time_per_iteration": 2.8553547859191895 }, { "auxiliary_loss_clip": 0.01606519, "auxiliary_loss_mlp": 0.01051928, "balance_loss_clip": 1.36376858, "balance_loss_mlp": 1.02026641, "epoch": 0.12331279122200511, "flos": 24434232693120.0, "grad_norm": 2.548830750245577, "language_loss": 0.82759798, "learning_rate": 3.909354813123452e-06, "loss": 0.85418248, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.31640625, "step": 2051, "time_per_iteration": 2.9713497161865234 }, { "auxiliary_loss_clip": 0.01598831, "auxiliary_loss_mlp": 0.01054544, "balance_loss_clip": 1.36317182, "balance_loss_mlp": 1.02414525, "epoch": 0.12337291447467308, "flos": 25495380426240.0, "grad_norm": 1.7358271338958777, "language_loss": 0.80907071, "learning_rate": 3.909238856579693e-06, "loss": 0.83560443, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.30444336, "step": 2052, "time_per_iteration": 2.888572931289673 }, { "auxiliary_loss_clip": 0.01628642, "auxiliary_loss_mlp": 0.01051434, "balance_loss_clip": 1.3838551, "balance_loss_mlp": 1.02067828, "epoch": 0.12343303772734104, "flos": 23560444252800.0, "grad_norm": 2.2721944820748092, "language_loss": 0.7566151, "learning_rate": 3.909122827637406e-06, "loss": 0.78341585, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.30712891, "step": 2053, "time_per_iteration": 2.924380302429199 }, { "auxiliary_loss_clip": 0.01616202, "auxiliary_loss_mlp": 0.01058773, "balance_loss_clip": 1.37108541, "balance_loss_mlp": 1.02548957, "epoch": 0.12349316098000902, "flos": 47573600423040.0, "grad_norm": 2.0957633771191277, "language_loss": 0.75022447, "learning_rate": 3.909006726300991e-06, "loss": 0.7769742, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.33251953, "step": 2054, "time_per_iteration": 3.070680856704712 }, { "auxiliary_loss_clip": 0.01585666, "auxiliary_loss_mlp": 0.01052613, "balance_loss_clip": 1.35108411, "balance_loss_mlp": 1.02080774, "epoch": 0.12355328423267699, "flos": 25056812148480.0, "grad_norm": 1.9862088767527892, "language_loss": 0.85733199, "learning_rate": 3.908890552574849e-06, "loss": 0.8837148, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.31787109, "step": 2055, "time_per_iteration": 2.8612966537475586 }, { "auxiliary_loss_clip": 0.01613414, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.37337685, "balance_loss_mlp": 1.0290401, "epoch": 0.12361340748534495, "flos": 27720599109120.0, "grad_norm": 2.0199117849927846, "language_loss": 0.80013359, "learning_rate": 3.908774306463384e-06, "loss": 0.8268621, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.30419922, "step": 2056, "time_per_iteration": 2.9823648929595947 }, { "auxiliary_loss_clip": 0.01607443, "auxiliary_loss_mlp": 0.01060568, "balance_loss_clip": 1.36569667, "balance_loss_mlp": 1.02904892, "epoch": 0.12367353073801293, "flos": 26151739764480.0, "grad_norm": 2.35527119978301, "language_loss": 0.84369427, "learning_rate": 3.908657987971009e-06, "loss": 0.87037444, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.31542969, "step": 2057, "time_per_iteration": 2.8494343757629395 }, { "auxiliary_loss_clip": 0.01622938, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.37803745, "balance_loss_mlp": 1.02745605, "epoch": 0.1237336539906809, "flos": 25167065022720.0, "grad_norm": 1.494978850318772, "language_loss": 0.79206991, "learning_rate": 3.90854159710213e-06, "loss": 0.81890643, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.33251953, "step": 2058, "time_per_iteration": 2.8970115184783936 }, { "auxiliary_loss_clip": 0.01631329, "auxiliary_loss_mlp": 0.01057908, "balance_loss_clip": 1.3837471, "balance_loss_mlp": 1.02455389, "epoch": 0.12379377724334886, "flos": 15312987947520.0, "grad_norm": 3.4367903217694824, "language_loss": 0.8438639, "learning_rate": 3.9084251338611624e-06, "loss": 0.87075627, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.33349609, "step": 2059, "time_per_iteration": 2.8305108547210693 }, { "auxiliary_loss_clip": 0.01613191, "auxiliary_loss_mlp": 0.01055014, "balance_loss_clip": 1.36724842, "balance_loss_mlp": 1.02266085, "epoch": 0.12385390049601683, "flos": 21324593041920.0, "grad_norm": 2.2545793040938076, "language_loss": 0.82500726, "learning_rate": 3.908308598252523e-06, "loss": 0.85168928, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.32324219, "step": 2060, "time_per_iteration": 2.8443989753723145 }, { "auxiliary_loss_clip": 0.01617736, "auxiliary_loss_mlp": 0.01057003, "balance_loss_clip": 1.37302625, "balance_loss_mlp": 1.02300465, "epoch": 0.1239140237486848, "flos": 15123140190720.0, "grad_norm": 1.8555092895554524, "language_loss": 0.87633944, "learning_rate": 3.9081919902806306e-06, "loss": 0.90308684, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.33984375, "step": 2061, "time_per_iteration": 2.8386049270629883 }, { "auxiliary_loss_clip": 0.01601384, "auxiliary_loss_mlp": 0.0105633, "balance_loss_clip": 1.36383367, "balance_loss_mlp": 1.02361846, "epoch": 0.12397414700135277, "flos": 21985703084160.0, "grad_norm": 1.8263396253836652, "language_loss": 0.86061519, "learning_rate": 3.908075309949906e-06, "loss": 0.88719231, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.32714844, "step": 2062, "time_per_iteration": 2.7986247539520264 }, { "auxiliary_loss_clip": 0.01609011, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.36881161, "balance_loss_mlp": 1.02262974, "epoch": 0.12403427025402074, "flos": 13407126197760.0, "grad_norm": 1.8372462898799482, "language_loss": 0.80020833, "learning_rate": 3.907958557264774e-06, "loss": 0.82685375, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.32885742, "step": 2063, "time_per_iteration": 2.851421594619751 }, { "auxiliary_loss_clip": 0.0161179, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.37013507, "balance_loss_mlp": 1.02420235, "epoch": 0.12409439350668872, "flos": 15312716478720.0, "grad_norm": 1.9889517159580623, "language_loss": 0.80543524, "learning_rate": 3.907841732229663e-06, "loss": 0.83213866, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.34350586, "step": 2064, "time_per_iteration": 2.8137762546539307 }, { "auxiliary_loss_clip": 0.01607451, "auxiliary_loss_mlp": 0.01059783, "balance_loss_clip": 1.36713505, "balance_loss_mlp": 1.02511668, "epoch": 0.12415451675935668, "flos": 25020181843200.0, "grad_norm": 2.109343347154861, "language_loss": 0.93825656, "learning_rate": 3.907724834849002e-06, "loss": 0.96492887, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.34667969, "step": 2065, "time_per_iteration": 2.892699956893921 }, { "auxiliary_loss_clip": 0.01604085, "auxiliary_loss_mlp": 0.01049644, "balance_loss_clip": 1.36300373, "balance_loss_mlp": 1.01810098, "epoch": 0.12421464001202465, "flos": 23670199434240.0, "grad_norm": 2.0104308233958657, "language_loss": 0.82014525, "learning_rate": 3.907607865127225e-06, "loss": 0.84668249, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.31518555, "step": 2066, "time_per_iteration": 2.941105604171753 }, { "auxiliary_loss_clip": 0.01421239, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.2773118, "balance_loss_mlp": 1.00878036, "epoch": 0.12427476326469263, "flos": 65765152972800.0, "grad_norm": 0.886175785832063, "language_loss": 0.63356197, "learning_rate": 3.907490823068766e-06, "loss": 0.65812725, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 1.4375, "router_z_loss_mlp": 0.265625, "step": 2067, "time_per_iteration": 3.3501780033111572 }, { "auxiliary_loss_clip": 0.01616355, "auxiliary_loss_mlp": 0.01059055, "balance_loss_clip": 1.37255049, "balance_loss_mlp": 1.02653444, "epoch": 0.12433488651736059, "flos": 24546204869760.0, "grad_norm": 10.83668345769654, "language_loss": 0.94702333, "learning_rate": 3.907373708678063e-06, "loss": 0.97377741, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.32543945, "step": 2068, "time_per_iteration": 2.8932948112487793 }, { "auxiliary_loss_clip": 0.01608624, "auxiliary_loss_mlp": 0.01054359, "balance_loss_clip": 1.36742496, "balance_loss_mlp": 1.02374649, "epoch": 0.12439500977002856, "flos": 21041232986880.0, "grad_norm": 1.8852381052852825, "language_loss": 0.82342184, "learning_rate": 3.9072565219595596e-06, "loss": 0.8500517, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30615234, "step": 2069, "time_per_iteration": 2.8271608352661133 }, { "auxiliary_loss_clip": 0.01618944, "auxiliary_loss_mlp": 0.01062394, "balance_loss_clip": 1.3759048, "balance_loss_mlp": 1.02984965, "epoch": 0.12445513302269653, "flos": 26841109824000.0, "grad_norm": 1.4983882398942479, "language_loss": 0.77530909, "learning_rate": 3.907139262917696e-06, "loss": 0.80212247, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.32519531, "step": 2070, "time_per_iteration": 2.9832003116607666 }, { "auxiliary_loss_clip": 0.01621126, "auxiliary_loss_mlp": 0.01054236, "balance_loss_clip": 1.37846744, "balance_loss_mlp": 1.02173924, "epoch": 0.1245152562753645, "flos": 18377988963840.0, "grad_norm": 2.524178346039938, "language_loss": 0.82529175, "learning_rate": 3.907021931556922e-06, "loss": 0.85204536, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.32470703, "step": 2071, "time_per_iteration": 4.245940685272217 }, { "auxiliary_loss_clip": 0.01602707, "auxiliary_loss_mlp": 0.01062166, "balance_loss_clip": 1.36543334, "balance_loss_mlp": 1.02966928, "epoch": 0.12457537952803246, "flos": 33120573989760.0, "grad_norm": 1.7099306617010328, "language_loss": 0.79223078, "learning_rate": 3.906904527881684e-06, "loss": 0.81887954, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.32470703, "step": 2072, "time_per_iteration": 2.945786714553833 }, { "auxiliary_loss_clip": 0.01621866, "auxiliary_loss_mlp": 0.01060032, "balance_loss_clip": 1.37816036, "balance_loss_mlp": 1.02844179, "epoch": 0.12463550278070043, "flos": 22279605177600.0, "grad_norm": 5.450154722698608, "language_loss": 0.76755404, "learning_rate": 3.9067870518964355e-06, "loss": 0.79437298, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.31591797, "step": 2073, "time_per_iteration": 2.8799378871917725 }, { "auxiliary_loss_clip": 0.01597206, "auxiliary_loss_mlp": 0.01063244, "balance_loss_clip": 1.36097407, "balance_loss_mlp": 1.03015184, "epoch": 0.12469562603336841, "flos": 14685884012160.0, "grad_norm": 1.8862269880444142, "language_loss": 0.91217256, "learning_rate": 3.906669503605631e-06, "loss": 0.93877709, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.33081055, "step": 2074, "time_per_iteration": 2.8054869174957275 }, { "auxiliary_loss_clip": 0.01615172, "auxiliary_loss_mlp": 0.01054364, "balance_loss_clip": 1.36802602, "balance_loss_mlp": 1.02227283, "epoch": 0.12475574928603637, "flos": 24655552848000.0, "grad_norm": 2.3030832148173905, "language_loss": 0.84889865, "learning_rate": 3.906551883013728e-06, "loss": 0.87559402, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.32104492, "step": 2075, "time_per_iteration": 2.9112493991851807 }, { "auxiliary_loss_clip": 0.01623921, "auxiliary_loss_mlp": 0.0106625, "balance_loss_clip": 1.37806177, "balance_loss_mlp": 1.03139365, "epoch": 0.12481587253870434, "flos": 21773160420480.0, "grad_norm": 3.7049025124925286, "language_loss": 0.74916637, "learning_rate": 3.9064341901251865e-06, "loss": 0.77606809, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.34863281, "step": 2076, "time_per_iteration": 2.877589225769043 }, { "auxiliary_loss_clip": 0.01607246, "auxiliary_loss_mlp": 0.01051452, "balance_loss_clip": 1.36818254, "balance_loss_mlp": 1.02069616, "epoch": 0.12487599579137232, "flos": 21442085084160.0, "grad_norm": 1.8313329465103614, "language_loss": 0.76940942, "learning_rate": 3.906316424944469e-06, "loss": 0.79599637, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.30786133, "step": 2077, "time_per_iteration": 2.827270984649658 }, { "auxiliary_loss_clip": 0.01614529, "auxiliary_loss_mlp": 0.01060402, "balance_loss_clip": 1.37228096, "balance_loss_mlp": 1.02533042, "epoch": 0.12493611904404028, "flos": 16116320954880.0, "grad_norm": 3.3446208754129887, "language_loss": 0.83976728, "learning_rate": 3.906198587476043e-06, "loss": 0.86651659, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.35058594, "step": 2078, "time_per_iteration": 4.203848123550415 }, { "auxiliary_loss_clip": 0.01612547, "auxiliary_loss_mlp": 0.0105952, "balance_loss_clip": 1.36819601, "balance_loss_mlp": 1.02611768, "epoch": 0.12499624229670825, "flos": 21590325607680.0, "grad_norm": 1.7074101573953455, "language_loss": 0.76709235, "learning_rate": 3.906080677724374e-06, "loss": 0.79381305, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.33374023, "step": 2079, "time_per_iteration": 2.8585753440856934 }, { "auxiliary_loss_clip": 0.01635932, "auxiliary_loss_mlp": 0.01064232, "balance_loss_clip": 1.38797569, "balance_loss_mlp": 1.032022, "epoch": 0.1250563655493762, "flos": 25709099454720.0, "grad_norm": 2.0218307805584326, "language_loss": 0.85496271, "learning_rate": 3.905962695693935e-06, "loss": 0.88196439, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.32177734, "step": 2080, "time_per_iteration": 5.707514524459839 }, { "auxiliary_loss_clip": 0.01610978, "auxiliary_loss_mlp": 0.01050698, "balance_loss_clip": 1.36820436, "balance_loss_mlp": 1.01877403, "epoch": 0.12511648880204418, "flos": 16918206128640.0, "grad_norm": 2.083910855234556, "language_loss": 0.8624317, "learning_rate": 3.9058446413892e-06, "loss": 0.88904852, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.3190918, "step": 2081, "time_per_iteration": 2.7983481884002686 }, { "auxiliary_loss_clip": 0.01612807, "auxiliary_loss_mlp": 0.01056378, "balance_loss_clip": 1.37084818, "balance_loss_mlp": 1.0238812, "epoch": 0.12517661205471217, "flos": 17576691972480.0, "grad_norm": 1.6667009268914932, "language_loss": 0.77643979, "learning_rate": 3.905726514814646e-06, "loss": 0.80313158, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.32519531, "step": 2082, "time_per_iteration": 2.8548994064331055 }, { "auxiliary_loss_clip": 0.01648419, "auxiliary_loss_mlp": 0.01064546, "balance_loss_clip": 1.39516556, "balance_loss_mlp": 1.0305717, "epoch": 0.12523673530738014, "flos": 16042245937920.0, "grad_norm": 2.4157821951766203, "language_loss": 0.81551063, "learning_rate": 3.9056083159747495e-06, "loss": 0.84264028, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.33984375, "step": 2083, "time_per_iteration": 2.811779499053955 }, { "auxiliary_loss_clip": 0.01631434, "auxiliary_loss_mlp": 0.01058664, "balance_loss_clip": 1.38342071, "balance_loss_mlp": 1.02604818, "epoch": 0.1252968585600481, "flos": 18817190668800.0, "grad_norm": 2.2044581870548585, "language_loss": 0.91196978, "learning_rate": 3.9054900448739966e-06, "loss": 0.93887079, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.32617188, "step": 2084, "time_per_iteration": 2.8120594024658203 }, { "auxiliary_loss_clip": 0.01626564, "auxiliary_loss_mlp": 0.01057248, "balance_loss_clip": 1.38315868, "balance_loss_mlp": 1.02613425, "epoch": 0.12535698181271607, "flos": 27282980972160.0, "grad_norm": 1.7801604830099798, "language_loss": 0.81407249, "learning_rate": 3.905371701516869e-06, "loss": 0.84091055, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.31091309, "step": 2085, "time_per_iteration": 2.9167349338531494 }, { "auxiliary_loss_clip": 0.01608488, "auxiliary_loss_mlp": 0.01055148, "balance_loss_clip": 1.37053466, "balance_loss_mlp": 1.02479708, "epoch": 0.12541710506538403, "flos": 22064166846720.0, "grad_norm": 2.2382760727042843, "language_loss": 0.8869642, "learning_rate": 3.905253285907856e-06, "loss": 0.91360056, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.3034668, "step": 2086, "time_per_iteration": 2.8384876251220703 }, { "auxiliary_loss_clip": 0.01613716, "auxiliary_loss_mlp": 0.01056868, "balance_loss_clip": 1.3768326, "balance_loss_mlp": 1.02496779, "epoch": 0.125477228318052, "flos": 12610489420800.0, "grad_norm": 1.9929518909390866, "language_loss": 0.8774333, "learning_rate": 3.905134798051447e-06, "loss": 0.90413922, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.31884766, "step": 2087, "time_per_iteration": 2.861651659011841 }, { "auxiliary_loss_clip": 0.01637354, "auxiliary_loss_mlp": 0.01064078, "balance_loss_clip": 1.39350486, "balance_loss_mlp": 1.03210652, "epoch": 0.12553735157071996, "flos": 23889121614720.0, "grad_norm": 8.259835630822895, "language_loss": 0.75077236, "learning_rate": 3.905016237952136e-06, "loss": 0.77778673, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.31982422, "step": 2088, "time_per_iteration": 2.8551533222198486 }, { "auxiliary_loss_clip": 0.01404792, "auxiliary_loss_mlp": 0.01034445, "balance_loss_clip": 1.26363981, "balance_loss_mlp": 1.01079392, "epoch": 0.12559747482338796, "flos": 69952120012800.0, "grad_norm": 0.7616980179909959, "language_loss": 0.61779481, "learning_rate": 3.904897605614418e-06, "loss": 0.64218718, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.23632812, "step": 2089, "time_per_iteration": 3.3417320251464844 }, { "auxiliary_loss_clip": 0.01618893, "auxiliary_loss_mlp": 0.010514, "balance_loss_clip": 1.37842083, "balance_loss_mlp": 1.02071595, "epoch": 0.12565759807605592, "flos": 24290154691200.0, "grad_norm": 1.862208469107207, "language_loss": 0.79005522, "learning_rate": 3.904778901042793e-06, "loss": 0.81675816, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.30664062, "step": 2090, "time_per_iteration": 2.9206671714782715 }, { "auxiliary_loss_clip": 0.01396923, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.25551224, "balance_loss_mlp": 1.01258039, "epoch": 0.12571772132872389, "flos": 56477298314880.0, "grad_norm": 0.768343626443453, "language_loss": 0.59628248, "learning_rate": 3.90466012424176e-06, "loss": 0.62056255, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 1.4140625, "router_z_loss_mlp": 0.18457031, "step": 2091, "time_per_iteration": 3.2083234786987305 }, { "auxiliary_loss_clip": 0.01618453, "auxiliary_loss_mlp": 0.01057602, "balance_loss_clip": 1.37841034, "balance_loss_mlp": 1.02613103, "epoch": 0.12577784458139185, "flos": 41260944556800.0, "grad_norm": 1.8650488554404532, "language_loss": 0.64780772, "learning_rate": 3.904541275215825e-06, "loss": 0.6745683, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.31469727, "step": 2092, "time_per_iteration": 3.0515575408935547 }, { "auxiliary_loss_clip": 0.01642772, "auxiliary_loss_mlp": 0.01056566, "balance_loss_clip": 1.39556539, "balance_loss_mlp": 1.02516615, "epoch": 0.12583796783405982, "flos": 19764963636480.0, "grad_norm": 1.904035273090423, "language_loss": 0.81753719, "learning_rate": 3.904422353969493e-06, "loss": 0.84453058, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.3137207, "step": 2093, "time_per_iteration": 2.8275680541992188 }, { "auxiliary_loss_clip": 0.01623204, "auxiliary_loss_mlp": 0.01056547, "balance_loss_clip": 1.38554382, "balance_loss_mlp": 1.0238831, "epoch": 0.12589809108672778, "flos": 22612490305920.0, "grad_norm": 1.7726798261647871, "language_loss": 0.77535284, "learning_rate": 3.904303360507276e-06, "loss": 0.80215031, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.3269043, "step": 2094, "time_per_iteration": 2.926093101501465 }, { "auxiliary_loss_clip": 0.01613822, "auxiliary_loss_mlp": 0.01058494, "balance_loss_clip": 1.37693071, "balance_loss_mlp": 1.02645016, "epoch": 0.12595821433939577, "flos": 45238988517120.0, "grad_norm": 1.6652711529961535, "language_loss": 0.77948809, "learning_rate": 3.9041842948336835e-06, "loss": 0.80621123, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.32055664, "step": 2095, "time_per_iteration": 3.127448320388794 }, { "auxiliary_loss_clip": 0.01625804, "auxiliary_loss_mlp": 0.01049803, "balance_loss_clip": 1.38095188, "balance_loss_mlp": 1.01966643, "epoch": 0.12601833759206374, "flos": 14328041736960.0, "grad_norm": 2.3084912788328595, "language_loss": 0.84641945, "learning_rate": 3.904065156953232e-06, "loss": 0.8731755, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.30126953, "step": 2096, "time_per_iteration": 2.8092355728149414 }, { "auxiliary_loss_clip": 0.01637452, "auxiliary_loss_mlp": 0.01054167, "balance_loss_clip": 1.39316106, "balance_loss_mlp": 1.0242933, "epoch": 0.1260784608447317, "flos": 21298188061440.0, "grad_norm": 1.7994757369750805, "language_loss": 0.76743579, "learning_rate": 3.903945946870439e-06, "loss": 0.79435194, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.29833984, "step": 2097, "time_per_iteration": 2.9253997802734375 }, { "auxiliary_loss_clip": 0.01632158, "auxiliary_loss_mlp": 0.01059634, "balance_loss_clip": 1.38937211, "balance_loss_mlp": 1.02933121, "epoch": 0.12613858409739967, "flos": 26262807045120.0, "grad_norm": 2.1447727644620973, "language_loss": 0.88983321, "learning_rate": 3.9038266645898246e-06, "loss": 0.91675115, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.30297852, "step": 2098, "time_per_iteration": 2.8533358573913574 }, { "auxiliary_loss_clip": 0.01650087, "auxiliary_loss_mlp": 0.01062637, "balance_loss_clip": 1.39910543, "balance_loss_mlp": 1.03009272, "epoch": 0.12619870735006763, "flos": 21589963649280.0, "grad_norm": 1.733819333643832, "language_loss": 0.70768237, "learning_rate": 3.903707310115912e-06, "loss": 0.73480964, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.32519531, "step": 2099, "time_per_iteration": 2.911386489868164 }, { "auxiliary_loss_clip": 0.01636305, "auxiliary_loss_mlp": 0.01060933, "balance_loss_clip": 1.39084482, "balance_loss_mlp": 1.02853239, "epoch": 0.1262588306027356, "flos": 23377292726400.0, "grad_norm": 2.12675627010851, "language_loss": 0.83638668, "learning_rate": 3.903587883453228e-06, "loss": 0.86335909, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.32421875, "step": 2100, "time_per_iteration": 2.8673312664031982 }, { "auxiliary_loss_clip": 0.01645208, "auxiliary_loss_mlp": 0.01058839, "balance_loss_clip": 1.39914966, "balance_loss_mlp": 1.02724826, "epoch": 0.12631895385540357, "flos": 23959169844480.0, "grad_norm": 1.945000114730573, "language_loss": 0.81657124, "learning_rate": 3.903468384606302e-06, "loss": 0.84361172, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.31591797, "step": 2101, "time_per_iteration": 2.887495756149292 }, { "auxiliary_loss_clip": 0.01400923, "auxiliary_loss_mlp": 0.0106181, "balance_loss_clip": 1.25897551, "balance_loss_mlp": 1.04540658, "epoch": 0.12637907710807156, "flos": 70312586486400.0, "grad_norm": 0.717195269222542, "language_loss": 0.57042456, "learning_rate": 3.903348813579662e-06, "loss": 0.59505188, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.1640625, "step": 2102, "time_per_iteration": 3.414055585861206 }, { "auxiliary_loss_clip": 0.01629495, "auxiliary_loss_mlp": 0.01050772, "balance_loss_clip": 1.38639522, "balance_loss_mlp": 1.0211606, "epoch": 0.12643920036073952, "flos": 18924004938240.0, "grad_norm": 1.9850835855043087, "language_loss": 0.94781792, "learning_rate": 3.903229170377845e-06, "loss": 0.97462058, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.29602051, "step": 2103, "time_per_iteration": 2.8365235328674316 }, { "auxiliary_loss_clip": 0.01618084, "auxiliary_loss_mlp": 0.0104901, "balance_loss_clip": 1.38236177, "balance_loss_mlp": 1.02070975, "epoch": 0.1264993236134075, "flos": 27794040698880.0, "grad_norm": 1.5537164557002243, "language_loss": 0.78657699, "learning_rate": 3.903109455005387e-06, "loss": 0.81324792, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28295898, "step": 2104, "time_per_iteration": 2.9261186122894287 }, { "auxiliary_loss_clip": 0.01651426, "auxiliary_loss_mlp": 0.01058029, "balance_loss_clip": 1.40574932, "balance_loss_mlp": 1.02944279, "epoch": 0.12655944686607545, "flos": 24765624743040.0, "grad_norm": 1.6743515165320075, "language_loss": 0.8253082, "learning_rate": 3.902989667466828e-06, "loss": 0.85240281, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.28564453, "step": 2105, "time_per_iteration": 2.8930609226226807 }, { "auxiliary_loss_clip": 0.01656266, "auxiliary_loss_mlp": 0.01063911, "balance_loss_clip": 1.40566373, "balance_loss_mlp": 1.03396547, "epoch": 0.12661957011874342, "flos": 24143452490880.0, "grad_norm": 2.517018551895882, "language_loss": 0.84028685, "learning_rate": 3.90286980776671e-06, "loss": 0.86748862, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.29931641, "step": 2106, "time_per_iteration": 2.918806791305542 }, { "auxiliary_loss_clip": 0.01632457, "auxiliary_loss_mlp": 0.01060376, "balance_loss_clip": 1.39177752, "balance_loss_mlp": 1.03043044, "epoch": 0.12667969337141138, "flos": 24579803773440.0, "grad_norm": 1.7240629244913295, "language_loss": 0.7427426, "learning_rate": 3.902749875909578e-06, "loss": 0.7696709, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 2.40429688, "router_z_loss_mlp": 0.29931641, "step": 2107, "time_per_iteration": 4.348583698272705 }, { "auxiliary_loss_clip": 0.01628995, "auxiliary_loss_mlp": 0.01063398, "balance_loss_clip": 1.38919306, "balance_loss_mlp": 1.03428698, "epoch": 0.12673981662407935, "flos": 22970785029120.0, "grad_norm": 5.618936462235351, "language_loss": 0.80267894, "learning_rate": 3.90262987189998e-06, "loss": 0.82960296, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.29125977, "step": 2108, "time_per_iteration": 2.8623218536376953 }, { "auxiliary_loss_clip": 0.01642652, "auxiliary_loss_mlp": 0.01064101, "balance_loss_clip": 1.39747572, "balance_loss_mlp": 1.03270102, "epoch": 0.12679993987674734, "flos": 17283785264640.0, "grad_norm": 2.467222786626317, "language_loss": 0.77198344, "learning_rate": 3.902509795742467e-06, "loss": 0.79905093, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.31396484, "step": 2109, "time_per_iteration": 2.9067482948303223 }, { "auxiliary_loss_clip": 0.01622299, "auxiliary_loss_mlp": 0.01062975, "balance_loss_clip": 1.38525987, "balance_loss_mlp": 1.03264844, "epoch": 0.1268600631294153, "flos": 17284056733440.0, "grad_norm": 1.980992737430057, "language_loss": 0.83986288, "learning_rate": 3.902389647441592e-06, "loss": 0.86671567, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.30297852, "step": 2110, "time_per_iteration": 2.9029057025909424 }, { "auxiliary_loss_clip": 0.01637493, "auxiliary_loss_mlp": 0.01065234, "balance_loss_clip": 1.39228368, "balance_loss_mlp": 1.03675449, "epoch": 0.12692018638208327, "flos": 24071684958720.0, "grad_norm": 1.6025071207210315, "language_loss": 0.79657531, "learning_rate": 3.90226942700191e-06, "loss": 0.82360256, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.28503418, "step": 2111, "time_per_iteration": 2.860950231552124 }, { "auxiliary_loss_clip": 0.01654859, "auxiliary_loss_mlp": 0.01066714, "balance_loss_clip": 1.40322351, "balance_loss_mlp": 1.03622055, "epoch": 0.12698030963475124, "flos": 31844395128960.0, "grad_norm": 2.043379690302293, "language_loss": 0.78562099, "learning_rate": 3.902149134427982e-06, "loss": 0.81283671, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.30493164, "step": 2112, "time_per_iteration": 2.973675489425659 }, { "auxiliary_loss_clip": 0.01631738, "auxiliary_loss_mlp": 0.01069191, "balance_loss_clip": 1.39040613, "balance_loss_mlp": 1.0371474, "epoch": 0.1270404328874192, "flos": 25197542035200.0, "grad_norm": 2.2602935370931867, "language_loss": 0.86211932, "learning_rate": 3.902028769724367e-06, "loss": 0.88912857, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.32006836, "step": 2113, "time_per_iteration": 4.274584054946899 }, { "auxiliary_loss_clip": 0.01626175, "auxiliary_loss_mlp": 0.01065481, "balance_loss_clip": 1.38486576, "balance_loss_mlp": 1.03491616, "epoch": 0.12710055614008717, "flos": 16005796611840.0, "grad_norm": 2.451687823207901, "language_loss": 0.75314891, "learning_rate": 3.9019083328956315e-06, "loss": 0.78006542, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30566406, "step": 2114, "time_per_iteration": 2.7744691371917725 }, { "auxiliary_loss_clip": 0.0163325, "auxiliary_loss_mlp": 0.0106051, "balance_loss_clip": 1.39299345, "balance_loss_mlp": 1.02889562, "epoch": 0.12716067939275516, "flos": 15093975277440.0, "grad_norm": 2.219464555787657, "language_loss": 0.85105813, "learning_rate": 3.901787823946341e-06, "loss": 0.87799573, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.31640625, "step": 2115, "time_per_iteration": 5.621707916259766 }, { "auxiliary_loss_clip": 0.01636374, "auxiliary_loss_mlp": 0.01059506, "balance_loss_clip": 1.39444315, "balance_loss_mlp": 1.0302999, "epoch": 0.12722080264542313, "flos": 28378587260160.0, "grad_norm": 1.544136934676335, "language_loss": 0.8793608, "learning_rate": 3.901667242881065e-06, "loss": 0.90631956, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.29223633, "step": 2116, "time_per_iteration": 2.884709596633911 }, { "auxiliary_loss_clip": 0.01627702, "auxiliary_loss_mlp": 0.01054114, "balance_loss_clip": 1.3887639, "balance_loss_mlp": 1.02505064, "epoch": 0.1272809258980911, "flos": 32392718588160.0, "grad_norm": 1.862791447243837, "language_loss": 0.7174809, "learning_rate": 3.9015465897043775e-06, "loss": 0.74429905, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.29064941, "step": 2117, "time_per_iteration": 2.9759459495544434 }, { "auxiliary_loss_clip": 0.0163415, "auxiliary_loss_mlp": 0.01059417, "balance_loss_clip": 1.39170349, "balance_loss_mlp": 1.02863693, "epoch": 0.12734104915075906, "flos": 16043015099520.0, "grad_norm": 2.3986544726083583, "language_loss": 0.87461388, "learning_rate": 3.901425864420852e-06, "loss": 0.90154952, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.30786133, "step": 2118, "time_per_iteration": 2.7875070571899414 }, { "auxiliary_loss_clip": 0.0162712, "auxiliary_loss_mlp": 0.01050362, "balance_loss_clip": 1.38721049, "balance_loss_mlp": 1.02127552, "epoch": 0.12740117240342702, "flos": 18269591126400.0, "grad_norm": 2.0644264972240913, "language_loss": 0.88818753, "learning_rate": 3.901305067035068e-06, "loss": 0.91496235, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29125977, "step": 2119, "time_per_iteration": 2.9074556827545166 }, { "auxiliary_loss_clip": 0.01644144, "auxiliary_loss_mlp": 0.01055681, "balance_loss_clip": 1.40146899, "balance_loss_mlp": 1.02519894, "epoch": 0.127461295656095, "flos": 12127146773760.0, "grad_norm": 2.328997574418648, "language_loss": 0.89291978, "learning_rate": 3.901184197551605e-06, "loss": 0.91991794, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.30456543, "step": 2120, "time_per_iteration": 2.7332515716552734 }, { "auxiliary_loss_clip": 0.0162673, "auxiliary_loss_mlp": 0.0104853, "balance_loss_clip": 1.38562417, "balance_loss_mlp": 1.01872814, "epoch": 0.12752141890876295, "flos": 23159546910720.0, "grad_norm": 2.5059785140602795, "language_loss": 0.77517253, "learning_rate": 3.901063255975046e-06, "loss": 0.80192512, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.2980957, "step": 2121, "time_per_iteration": 2.8772099018096924 }, { "auxiliary_loss_clip": 0.01634002, "auxiliary_loss_mlp": 0.01056302, "balance_loss_clip": 1.39123988, "balance_loss_mlp": 1.02502179, "epoch": 0.12758154216143094, "flos": 21625870037760.0, "grad_norm": 2.1846583325754696, "language_loss": 0.84396738, "learning_rate": 3.900942242309978e-06, "loss": 0.87087047, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31298828, "step": 2122, "time_per_iteration": 2.813204526901245 }, { "auxiliary_loss_clip": 0.01658908, "auxiliary_loss_mlp": 0.01059865, "balance_loss_clip": 1.41334701, "balance_loss_mlp": 1.02820265, "epoch": 0.1276416654140989, "flos": 15933440897280.0, "grad_norm": 2.3287792807675896, "language_loss": 0.80762899, "learning_rate": 3.90082115656099e-06, "loss": 0.83481669, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.31640625, "step": 2123, "time_per_iteration": 2.8450820446014404 }, { "auxiliary_loss_clip": 0.01647326, "auxiliary_loss_mlp": 0.010608, "balance_loss_clip": 1.40568304, "balance_loss_mlp": 1.02877998, "epoch": 0.12770178866676687, "flos": 22392889453440.0, "grad_norm": 1.6073357798633612, "language_loss": 0.80703634, "learning_rate": 3.900699998732673e-06, "loss": 0.83411753, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.32006836, "step": 2124, "time_per_iteration": 2.8448784351348877 }, { "auxiliary_loss_clip": 0.01643872, "auxiliary_loss_mlp": 0.01053042, "balance_loss_clip": 1.39997065, "balance_loss_mlp": 1.02314484, "epoch": 0.12776191191943484, "flos": 21662364608640.0, "grad_norm": 2.05156327563068, "language_loss": 0.77346551, "learning_rate": 3.900578768829623e-06, "loss": 0.80043471, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.29882812, "step": 2125, "time_per_iteration": 2.87032413482666 }, { "auxiliary_loss_clip": 0.01644311, "auxiliary_loss_mlp": 0.01050621, "balance_loss_clip": 1.40218139, "balance_loss_mlp": 1.01996028, "epoch": 0.1278220351721028, "flos": 25745639270400.0, "grad_norm": 1.9608092027795136, "language_loss": 0.79620743, "learning_rate": 3.900457466856434e-06, "loss": 0.82315677, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.30639648, "step": 2126, "time_per_iteration": 2.937234878540039 }, { "auxiliary_loss_clip": 0.01656038, "auxiliary_loss_mlp": 0.01048206, "balance_loss_clip": 1.41027021, "balance_loss_mlp": 1.01938105, "epoch": 0.12788215842477077, "flos": 41257641686400.0, "grad_norm": 2.1356998171544808, "language_loss": 0.70240831, "learning_rate": 3.9003360928177085e-06, "loss": 0.7294507, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.28833008, "step": 2127, "time_per_iteration": 2.9747846126556396 }, { "auxiliary_loss_clip": 0.01400837, "auxiliary_loss_mlp": 0.01030895, "balance_loss_clip": 1.26420188, "balance_loss_mlp": 1.0126797, "epoch": 0.12794228167743876, "flos": 70911023201280.0, "grad_norm": 0.8779028274893623, "language_loss": 0.62926447, "learning_rate": 3.900214646718047e-06, "loss": 0.65358174, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.18261719, "step": 2128, "time_per_iteration": 3.4157347679138184 }, { "auxiliary_loss_clip": 0.01661456, "auxiliary_loss_mlp": 0.01055175, "balance_loss_clip": 1.41325331, "balance_loss_mlp": 1.02425194, "epoch": 0.12800240493010673, "flos": 16298386606080.0, "grad_norm": 2.2614201913971264, "language_loss": 0.79185015, "learning_rate": 3.900093128562056e-06, "loss": 0.81901646, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.30908203, "step": 2129, "time_per_iteration": 2.87589168548584 }, { "auxiliary_loss_clip": 0.01677079, "auxiliary_loss_mlp": 0.01063254, "balance_loss_clip": 1.42477942, "balance_loss_mlp": 1.03168774, "epoch": 0.1280625281827747, "flos": 20641331030400.0, "grad_norm": 2.1950831985251407, "language_loss": 0.80688375, "learning_rate": 3.899971538354343e-06, "loss": 0.83428705, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.31542969, "step": 2130, "time_per_iteration": 2.8276052474975586 }, { "auxiliary_loss_clip": 0.01663917, "auxiliary_loss_mlp": 0.01049147, "balance_loss_clip": 1.41508496, "balance_loss_mlp": 1.01905894, "epoch": 0.12812265143544266, "flos": 22648532428800.0, "grad_norm": 3.473482200945053, "language_loss": 0.72421002, "learning_rate": 3.899849876099518e-06, "loss": 0.75134063, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.30053711, "step": 2131, "time_per_iteration": 2.903099775314331 }, { "auxiliary_loss_clip": 0.01645776, "auxiliary_loss_mlp": 0.0105358, "balance_loss_clip": 1.40505612, "balance_loss_mlp": 1.02425408, "epoch": 0.12818277468811062, "flos": 34728054410880.0, "grad_norm": 2.0677663901682437, "language_loss": 0.73489857, "learning_rate": 3.899728141802197e-06, "loss": 0.76189214, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.29296875, "step": 2132, "time_per_iteration": 2.9577109813690186 }, { "auxiliary_loss_clip": 0.01634018, "auxiliary_loss_mlp": 0.01050213, "balance_loss_clip": 1.39928627, "balance_loss_mlp": 1.02101898, "epoch": 0.1282428979407786, "flos": 23122418912640.0, "grad_norm": 20.025020147764657, "language_loss": 0.82535523, "learning_rate": 3.8996063354669935e-06, "loss": 0.85219759, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29162598, "step": 2133, "time_per_iteration": 2.9344592094421387 }, { "auxiliary_loss_clip": 0.01660342, "auxiliary_loss_mlp": 0.0105712, "balance_loss_clip": 1.41056514, "balance_loss_mlp": 1.02610135, "epoch": 0.12830302119344655, "flos": 20896250088960.0, "grad_norm": 2.3522356312797217, "language_loss": 0.82245314, "learning_rate": 3.899484457098528e-06, "loss": 0.84962779, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.31030273, "step": 2134, "time_per_iteration": 2.827894926071167 }, { "auxiliary_loss_clip": 0.0166065, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.41603231, "balance_loss_mlp": 1.02182615, "epoch": 0.12836314444611455, "flos": 21407762263680.0, "grad_norm": 1.8214021585310218, "language_loss": 0.84393668, "learning_rate": 3.899362506701421e-06, "loss": 0.87107331, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.31201172, "step": 2135, "time_per_iteration": 2.9015562534332275 }, { "auxiliary_loss_clip": 0.01652724, "auxiliary_loss_mlp": 0.01056578, "balance_loss_clip": 1.4110781, "balance_loss_mlp": 1.02555943, "epoch": 0.1284232676987825, "flos": 13670370299520.0, "grad_norm": 2.205625632692994, "language_loss": 0.78079748, "learning_rate": 3.899240484280298e-06, "loss": 0.80789047, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.31005859, "step": 2136, "time_per_iteration": 2.8503968715667725 }, { "auxiliary_loss_clip": 0.01413471, "auxiliary_loss_mlp": 0.0103998, "balance_loss_clip": 1.2771771, "balance_loss_mlp": 1.02443528, "epoch": 0.12848339095145048, "flos": 60022927290240.0, "grad_norm": 0.8938063797840368, "language_loss": 0.59234917, "learning_rate": 3.899118389839785e-06, "loss": 0.6168837, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.15527344, "step": 2137, "time_per_iteration": 3.552185297012329 }, { "auxiliary_loss_clip": 0.01661881, "auxiliary_loss_mlp": 0.01056813, "balance_loss_clip": 1.41612601, "balance_loss_mlp": 1.02529454, "epoch": 0.12854351420411844, "flos": 13889337724800.0, "grad_norm": 3.312919840058935, "language_loss": 0.84773469, "learning_rate": 3.898996223384512e-06, "loss": 0.87492168, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.31518555, "step": 2138, "time_per_iteration": 2.888374090194702 }, { "auxiliary_loss_clip": 0.01670092, "auxiliary_loss_mlp": 0.01054141, "balance_loss_clip": 1.42143655, "balance_loss_mlp": 1.02314615, "epoch": 0.1286036374567864, "flos": 22648170470400.0, "grad_norm": 2.2801957617192827, "language_loss": 0.79686952, "learning_rate": 3.898873984919113e-06, "loss": 0.82411182, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.30981445, "step": 2139, "time_per_iteration": 2.861192464828491 }, { "auxiliary_loss_clip": 0.01653429, "auxiliary_loss_mlp": 0.01062905, "balance_loss_clip": 1.40953338, "balance_loss_mlp": 1.03179157, "epoch": 0.12866376070945437, "flos": 16333071384960.0, "grad_norm": 2.2597444541968983, "language_loss": 0.85780025, "learning_rate": 3.8987516744482215e-06, "loss": 0.88496351, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.31103516, "step": 2140, "time_per_iteration": 2.907869338989258 }, { "auxiliary_loss_clip": 0.01646678, "auxiliary_loss_mlp": 0.0104885, "balance_loss_clip": 1.40486908, "balance_loss_mlp": 1.02164674, "epoch": 0.12872388396212234, "flos": 11881955347200.0, "grad_norm": 1.877366993386417, "language_loss": 0.86969358, "learning_rate": 3.898629291976476e-06, "loss": 0.89664888, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.2722168, "step": 2141, "time_per_iteration": 2.818969488143921 }, { "auxiliary_loss_clip": 0.0168116, "auxiliary_loss_mlp": 0.01062141, "balance_loss_clip": 1.43166411, "balance_loss_mlp": 1.02981114, "epoch": 0.12878400721479033, "flos": 28378542015360.0, "grad_norm": 1.8534255426398376, "language_loss": 0.7010262, "learning_rate": 3.898506837508518e-06, "loss": 0.72845924, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.32348633, "step": 2142, "time_per_iteration": 4.313765525817871 }, { "auxiliary_loss_clip": 0.01679738, "auxiliary_loss_mlp": 0.01054939, "balance_loss_clip": 1.42919934, "balance_loss_mlp": 1.02361059, "epoch": 0.1288441304674583, "flos": 25896820705920.0, "grad_norm": 1.8872673997452314, "language_loss": 0.84482837, "learning_rate": 3.89838431104899e-06, "loss": 0.87217516, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.31323242, "step": 2143, "time_per_iteration": 2.8924560546875 }, { "auxiliary_loss_clip": 0.01683676, "auxiliary_loss_mlp": 0.01058418, "balance_loss_clip": 1.43367457, "balance_loss_mlp": 1.02832925, "epoch": 0.12890425372012626, "flos": 20823577660800.0, "grad_norm": 2.1884324994578424, "language_loss": 0.82754421, "learning_rate": 3.898261712602539e-06, "loss": 0.85496509, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.30078125, "step": 2144, "time_per_iteration": 2.905400276184082 }, { "auxiliary_loss_clip": 0.01663716, "auxiliary_loss_mlp": 0.01057458, "balance_loss_clip": 1.41558099, "balance_loss_mlp": 1.02496171, "epoch": 0.12896437697279423, "flos": 22576312448640.0, "grad_norm": 2.0666111985358246, "language_loss": 0.80337083, "learning_rate": 3.898139042173813e-06, "loss": 0.83058262, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.32470703, "step": 2145, "time_per_iteration": 2.8358120918273926 }, { "auxiliary_loss_clip": 0.01663341, "auxiliary_loss_mlp": 0.01057656, "balance_loss_clip": 1.41845322, "balance_loss_mlp": 1.0260179, "epoch": 0.1290245002254622, "flos": 17502752689920.0, "grad_norm": 2.0587666839129812, "language_loss": 0.84065914, "learning_rate": 3.898016299767465e-06, "loss": 0.86786908, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.31640625, "step": 2146, "time_per_iteration": 2.9032530784606934 }, { "auxiliary_loss_clip": 0.01675224, "auxiliary_loss_mlp": 0.01063984, "balance_loss_clip": 1.42948318, "balance_loss_mlp": 1.03179729, "epoch": 0.12908462347813016, "flos": 36328657622400.0, "grad_norm": 1.9352277580541777, "language_loss": 0.71953797, "learning_rate": 3.897893485388149e-06, "loss": 0.74693, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.32177734, "step": 2147, "time_per_iteration": 2.953518867492676 }, { "auxiliary_loss_clip": 0.01663753, "auxiliary_loss_mlp": 0.01057842, "balance_loss_clip": 1.41687703, "balance_loss_mlp": 1.02694261, "epoch": 0.12914474673079815, "flos": 22538958226560.0, "grad_norm": 2.070826211581388, "language_loss": 0.72703642, "learning_rate": 3.897770599040521e-06, "loss": 0.75425231, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.30859375, "step": 2148, "time_per_iteration": 4.31352972984314 }, { "auxiliary_loss_clip": 0.0165272, "auxiliary_loss_mlp": 0.01053122, "balance_loss_clip": 1.40921736, "balance_loss_mlp": 1.02467871, "epoch": 0.12920486998346611, "flos": 21482199239040.0, "grad_norm": 1.7283897742699852, "language_loss": 0.79671329, "learning_rate": 3.897647640729242e-06, "loss": 0.82377172, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.28466797, "step": 2149, "time_per_iteration": 2.9508273601531982 }, { "auxiliary_loss_clip": 0.0166817, "auxiliary_loss_mlp": 0.01057679, "balance_loss_clip": 1.41966498, "balance_loss_mlp": 1.02635098, "epoch": 0.12926499323613408, "flos": 27320108970240.0, "grad_norm": 2.773146694881116, "language_loss": 0.77161229, "learning_rate": 3.897524610458975e-06, "loss": 0.7988708, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 0.31347656, "step": 2150, "time_per_iteration": 5.824554681777954 }, { "auxiliary_loss_clip": 0.01669165, "auxiliary_loss_mlp": 0.01052803, "balance_loss_clip": 1.4202714, "balance_loss_mlp": 1.02394247, "epoch": 0.12932511648880204, "flos": 22101340089600.0, "grad_norm": 2.2231695450853355, "language_loss": 0.72308683, "learning_rate": 3.8974015082343835e-06, "loss": 0.75030649, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.28845215, "step": 2151, "time_per_iteration": 2.897841215133667 }, { "auxiliary_loss_clip": 0.01677673, "auxiliary_loss_mlp": 0.01052071, "balance_loss_clip": 1.43147624, "balance_loss_mlp": 1.02232814, "epoch": 0.12938523974147, "flos": 20312382199680.0, "grad_norm": 2.16190823645622, "language_loss": 0.85344219, "learning_rate": 3.897278334060137e-06, "loss": 0.88073957, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.29724121, "step": 2152, "time_per_iteration": 2.818711757659912 }, { "auxiliary_loss_clip": 0.01676267, "auxiliary_loss_mlp": 0.01056459, "balance_loss_clip": 1.43103302, "balance_loss_mlp": 1.02707398, "epoch": 0.12944536299413797, "flos": 19509320661120.0, "grad_norm": 6.404591517578225, "language_loss": 0.80300915, "learning_rate": 3.897155087940906e-06, "loss": 0.83033645, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.29382324, "step": 2153, "time_per_iteration": 2.906743288040161 }, { "auxiliary_loss_clip": 0.01666458, "auxiliary_loss_mlp": 0.01057693, "balance_loss_clip": 1.41959655, "balance_loss_mlp": 1.02653217, "epoch": 0.12950548624680594, "flos": 27719060785920.0, "grad_norm": 1.607278894727942, "language_loss": 0.81345147, "learning_rate": 3.897031769881364e-06, "loss": 0.840693, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.31152344, "step": 2154, "time_per_iteration": 2.86448335647583 }, { "auxiliary_loss_clip": 0.0166299, "auxiliary_loss_mlp": 0.01048907, "balance_loss_clip": 1.41845918, "balance_loss_mlp": 1.01826978, "epoch": 0.12956560949947393, "flos": 17574474977280.0, "grad_norm": 1.8849033281997272, "language_loss": 0.84774804, "learning_rate": 3.896908379886188e-06, "loss": 0.87486702, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.30639648, "step": 2155, "time_per_iteration": 2.8518054485321045 }, { "auxiliary_loss_clip": 0.01671121, "auxiliary_loss_mlp": 0.0105642, "balance_loss_clip": 1.42017663, "balance_loss_mlp": 1.02647495, "epoch": 0.1296257327521419, "flos": 20750452784640.0, "grad_norm": 2.4378881257668628, "language_loss": 0.77400887, "learning_rate": 3.896784917960055e-06, "loss": 0.80128425, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.29980469, "step": 2156, "time_per_iteration": 2.8601319789886475 }, { "auxiliary_loss_clip": 0.01666537, "auxiliary_loss_mlp": 0.01052203, "balance_loss_clip": 1.4217602, "balance_loss_mlp": 1.02344966, "epoch": 0.12968585600480986, "flos": 16403526817920.0, "grad_norm": 1.6750137313956903, "language_loss": 0.87515533, "learning_rate": 3.896661384107648e-06, "loss": 0.9023428, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.28808594, "step": 2157, "time_per_iteration": 2.8645756244659424 }, { "auxiliary_loss_clip": 0.01671601, "auxiliary_loss_mlp": 0.01056503, "balance_loss_clip": 1.41953635, "balance_loss_mlp": 1.02560413, "epoch": 0.12974597925747783, "flos": 28341187793280.0, "grad_norm": 4.010860267270154, "language_loss": 0.81924343, "learning_rate": 3.896537778333651e-06, "loss": 0.84652448, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.30908203, "step": 2158, "time_per_iteration": 2.92305588722229 }, { "auxiliary_loss_clip": 0.01679802, "auxiliary_loss_mlp": 0.01060013, "balance_loss_clip": 1.42805564, "balance_loss_mlp": 1.02899432, "epoch": 0.1298061025101458, "flos": 9689249692800.0, "grad_norm": 2.1952580099865515, "language_loss": 0.76318783, "learning_rate": 3.896414100642752e-06, "loss": 0.79058599, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.31005859, "step": 2159, "time_per_iteration": 2.8082902431488037 }, { "auxiliary_loss_clip": 0.01654883, "auxiliary_loss_mlp": 0.01049785, "balance_loss_clip": 1.41231084, "balance_loss_mlp": 1.0197444, "epoch": 0.12986622576281376, "flos": 27721096801920.0, "grad_norm": 1.8247818214995024, "language_loss": 0.83896816, "learning_rate": 3.89629035103964e-06, "loss": 0.86601484, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.30029297, "step": 2160, "time_per_iteration": 2.9445321559906006 }, { "auxiliary_loss_clip": 0.01646985, "auxiliary_loss_mlp": 0.01054104, "balance_loss_clip": 1.40744376, "balance_loss_mlp": 1.02413523, "epoch": 0.12992634901548175, "flos": 18810856396800.0, "grad_norm": 1.488998792657995, "language_loss": 0.82969612, "learning_rate": 3.896166529529008e-06, "loss": 0.85670698, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.29980469, "step": 2161, "time_per_iteration": 2.8418636322021484 }, { "auxiliary_loss_clip": 0.0166843, "auxiliary_loss_mlp": 0.01053073, "balance_loss_clip": 1.4221139, "balance_loss_mlp": 1.02224565, "epoch": 0.12998647226814972, "flos": 29138231773440.0, "grad_norm": 2.2169673703595696, "language_loss": 0.83763087, "learning_rate": 3.896042636115551e-06, "loss": 0.86484593, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.30810547, "step": 2162, "time_per_iteration": 2.91082501411438 }, { "auxiliary_loss_clip": 0.01661861, "auxiliary_loss_mlp": 0.01061619, "balance_loss_clip": 1.41247511, "balance_loss_mlp": 1.03060031, "epoch": 0.13004659552081768, "flos": 19583259943680.0, "grad_norm": 2.940988713033312, "language_loss": 0.74394357, "learning_rate": 3.895918670803968e-06, "loss": 0.77117836, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31030273, "step": 2163, "time_per_iteration": 2.8295228481292725 }, { "auxiliary_loss_clip": 0.01671608, "auxiliary_loss_mlp": 0.01059392, "balance_loss_clip": 1.42239141, "balance_loss_mlp": 1.02768254, "epoch": 0.13010671877348565, "flos": 22500563374080.0, "grad_norm": 2.783218457135964, "language_loss": 0.82418096, "learning_rate": 3.895794633598958e-06, "loss": 0.85149097, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.31665039, "step": 2164, "time_per_iteration": 2.869309186935425 }, { "auxiliary_loss_clip": 0.01671625, "auxiliary_loss_mlp": 0.01051001, "balance_loss_clip": 1.42148745, "balance_loss_mlp": 1.02112758, "epoch": 0.1301668420261536, "flos": 23888714411520.0, "grad_norm": 2.0251328738122054, "language_loss": 0.73176056, "learning_rate": 3.8956705245052256e-06, "loss": 0.75898683, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.29907227, "step": 2165, "time_per_iteration": 2.9003772735595703 }, { "auxiliary_loss_clip": 0.0169594, "auxiliary_loss_mlp": 0.01054281, "balance_loss_clip": 1.44019222, "balance_loss_mlp": 1.02235699, "epoch": 0.13022696527882158, "flos": 23160768520320.0, "grad_norm": 2.1938649059543223, "language_loss": 0.76345921, "learning_rate": 3.8955463435274765e-06, "loss": 0.79096144, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 2.55664062, "router_z_loss_mlp": 0.31933594, "step": 2166, "time_per_iteration": 2.8436427116394043 }, { "auxiliary_loss_clip": 0.01674011, "auxiliary_loss_mlp": 0.01062571, "balance_loss_clip": 1.42444944, "balance_loss_mlp": 1.0335083, "epoch": 0.13028708853148954, "flos": 26920342748160.0, "grad_norm": 1.7274862966768487, "language_loss": 0.8406989, "learning_rate": 3.895422090670421e-06, "loss": 0.86806476, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.29052734, "step": 2167, "time_per_iteration": 2.9636118412017822 }, { "auxiliary_loss_clip": 0.01687372, "auxiliary_loss_mlp": 0.0106593, "balance_loss_clip": 1.43953013, "balance_loss_mlp": 1.03350472, "epoch": 0.13034721178415754, "flos": 21261331532160.0, "grad_norm": 12.55163550913738, "language_loss": 0.84672737, "learning_rate": 3.89529776593877e-06, "loss": 0.87426037, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.32421875, "step": 2168, "time_per_iteration": 2.8446903228759766 }, { "auxiliary_loss_clip": 0.01684256, "auxiliary_loss_mlp": 0.01062946, "balance_loss_clip": 1.43317986, "balance_loss_mlp": 1.03114057, "epoch": 0.1304073350368255, "flos": 18775628680320.0, "grad_norm": 2.077894100124159, "language_loss": 0.81089401, "learning_rate": 3.8951733693372375e-06, "loss": 0.83836603, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.31787109, "step": 2169, "time_per_iteration": 2.9294979572296143 }, { "auxiliary_loss_clip": 0.01696296, "auxiliary_loss_mlp": 0.01052024, "balance_loss_clip": 1.44444036, "balance_loss_mlp": 1.02183986, "epoch": 0.13046745828949347, "flos": 28375963061760.0, "grad_norm": 2.284785154246083, "language_loss": 0.68972051, "learning_rate": 3.8950489008705406e-06, "loss": 0.71720374, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.30200195, "step": 2170, "time_per_iteration": 2.9302124977111816 }, { "auxiliary_loss_clip": 0.01678192, "auxiliary_loss_mlp": 0.01051462, "balance_loss_clip": 1.42814541, "balance_loss_mlp": 1.02146876, "epoch": 0.13052758154216143, "flos": 29616642737280.0, "grad_norm": 1.6425558998549556, "language_loss": 0.68182331, "learning_rate": 3.8949243605434e-06, "loss": 0.70911986, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.29980469, "step": 2171, "time_per_iteration": 2.923474073410034 }, { "auxiliary_loss_clip": 0.01680082, "auxiliary_loss_mlp": 0.01058753, "balance_loss_clip": 1.42955065, "balance_loss_mlp": 1.02798557, "epoch": 0.1305877047948294, "flos": 19400289396480.0, "grad_norm": 1.867756000167489, "language_loss": 0.73864174, "learning_rate": 3.894799748360537e-06, "loss": 0.76603007, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.30749512, "step": 2172, "time_per_iteration": 2.8557705879211426 }, { "auxiliary_loss_clip": 0.01666457, "auxiliary_loss_mlp": 0.01058209, "balance_loss_clip": 1.42362571, "balance_loss_mlp": 1.0287168, "epoch": 0.13064782804749736, "flos": 16882344984960.0, "grad_norm": 1.8194503484672866, "language_loss": 0.7670002, "learning_rate": 3.894675064326678e-06, "loss": 0.79424685, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.29455566, "step": 2173, "time_per_iteration": 2.820206880569458 }, { "auxiliary_loss_clip": 0.01680233, "auxiliary_loss_mlp": 0.0105648, "balance_loss_clip": 1.42724478, "balance_loss_mlp": 1.02493751, "epoch": 0.13070795130016533, "flos": 24509891278080.0, "grad_norm": 3.5695277333585893, "language_loss": 0.72582513, "learning_rate": 3.894550308446551e-06, "loss": 0.75319231, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.31494141, "step": 2174, "time_per_iteration": 2.908653736114502 }, { "auxiliary_loss_clip": 0.01425119, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.28536522, "balance_loss_mlp": 1.0155195, "epoch": 0.13076807455283332, "flos": 71086754580480.0, "grad_norm": 0.8526190183813145, "language_loss": 0.59141231, "learning_rate": 3.894425480724886e-06, "loss": 0.61599517, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 1.3984375, "router_z_loss_mlp": 0.17675781, "step": 2175, "time_per_iteration": 3.5488102436065674 }, { "auxiliary_loss_clip": 0.01673109, "auxiliary_loss_mlp": 0.01062642, "balance_loss_clip": 1.42248654, "balance_loss_mlp": 1.03291154, "epoch": 0.13082819780550128, "flos": 20273625388800.0, "grad_norm": 5.281326316603206, "language_loss": 0.81397521, "learning_rate": 3.894300581166417e-06, "loss": 0.84133267, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 2.5078125, "router_z_loss_mlp": 0.296875, "step": 2176, "time_per_iteration": 4.290143966674805 }, { "auxiliary_loss_clip": 0.01670321, "auxiliary_loss_mlp": 0.01069409, "balance_loss_clip": 1.420578, "balance_loss_mlp": 1.03383732, "epoch": 0.13088832105816925, "flos": 34216542236160.0, "grad_norm": 5.963752286514603, "language_loss": 0.75450838, "learning_rate": 3.894175609775881e-06, "loss": 0.78190571, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.35546875, "step": 2177, "time_per_iteration": 3.013387441635132 }, { "auxiliary_loss_clip": 0.01677081, "auxiliary_loss_mlp": 0.01070559, "balance_loss_clip": 1.43039322, "balance_loss_mlp": 1.03877783, "epoch": 0.13094844431083721, "flos": 17904011990400.0, "grad_norm": 1.9404430617681954, "language_loss": 0.83576047, "learning_rate": 3.894050566558015e-06, "loss": 0.86323684, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.31762695, "step": 2178, "time_per_iteration": 3.0000176429748535 }, { "auxiliary_loss_clip": 0.01671013, "auxiliary_loss_mlp": 0.01071914, "balance_loss_clip": 1.42280257, "balance_loss_mlp": 1.04282701, "epoch": 0.13100856756350518, "flos": 17320234590720.0, "grad_norm": 2.2237775612098654, "language_loss": 0.76001209, "learning_rate": 3.893925451517562e-06, "loss": 0.78744137, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.29052734, "step": 2179, "time_per_iteration": 2.962258815765381 }, { "auxiliary_loss_clip": 0.0167029, "auxiliary_loss_mlp": 0.01074989, "balance_loss_clip": 1.42477083, "balance_loss_mlp": 1.04511487, "epoch": 0.13106869081617314, "flos": 22210778557440.0, "grad_norm": 2.143252182243673, "language_loss": 0.85702217, "learning_rate": 3.893800264659266e-06, "loss": 0.88447499, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.29882812, "step": 2180, "time_per_iteration": 3.087979793548584 }, { "auxiliary_loss_clip": 0.01668468, "auxiliary_loss_mlp": 0.01083586, "balance_loss_clip": 1.42126429, "balance_loss_mlp": 1.05303311, "epoch": 0.13112881406884114, "flos": 21773341399680.0, "grad_norm": 2.1246501485878273, "language_loss": 0.90869927, "learning_rate": 3.8936750059878746e-06, "loss": 0.93621981, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.30578613, "step": 2181, "time_per_iteration": 2.83451509475708 }, { "auxiliary_loss_clip": 0.0168031, "auxiliary_loss_mlp": 0.01077112, "balance_loss_clip": 1.4284761, "balance_loss_mlp": 1.04592705, "epoch": 0.1311889373215091, "flos": 23341476827520.0, "grad_norm": 1.8219470839792329, "language_loss": 0.7020452, "learning_rate": 3.893549675508137e-06, "loss": 0.72961938, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.31152344, "step": 2182, "time_per_iteration": 2.8762240409851074 }, { "auxiliary_loss_clip": 0.01692515, "auxiliary_loss_mlp": 0.01076729, "balance_loss_clip": 1.43779349, "balance_loss_mlp": 1.04411364, "epoch": 0.13124906057417707, "flos": 21476498394240.0, "grad_norm": 5.007251636202205, "language_loss": 0.79953742, "learning_rate": 3.893424273224806e-06, "loss": 0.8272298, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.32641602, "step": 2183, "time_per_iteration": 4.395808696746826 }, { "auxiliary_loss_clip": 0.01670554, "auxiliary_loss_mlp": 0.01070114, "balance_loss_clip": 1.42289698, "balance_loss_mlp": 1.03878617, "epoch": 0.13130918382684503, "flos": 23264958591360.0, "grad_norm": 1.65536070981637, "language_loss": 0.86277682, "learning_rate": 3.893298799142636e-06, "loss": 0.89018351, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.31323242, "step": 2184, "time_per_iteration": 2.840151071548462 }, { "auxiliary_loss_clip": 0.01686953, "auxiliary_loss_mlp": 0.0107039, "balance_loss_clip": 1.4372921, "balance_loss_mlp": 1.03934765, "epoch": 0.131369307079513, "flos": 20860072231680.0, "grad_norm": 1.9195277516365847, "language_loss": 0.83438444, "learning_rate": 3.893173253266387e-06, "loss": 0.86195791, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.31054688, "step": 2185, "time_per_iteration": 5.720970392227173 }, { "auxiliary_loss_clip": 0.01679977, "auxiliary_loss_mlp": 0.01079559, "balance_loss_clip": 1.42787266, "balance_loss_mlp": 1.04443979, "epoch": 0.13142943033218096, "flos": 17867336440320.0, "grad_norm": 1.7468460621663786, "language_loss": 0.74012113, "learning_rate": 3.893047635600818e-06, "loss": 0.76771653, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.35131836, "step": 2186, "time_per_iteration": 2.8482980728149414 }, { "auxiliary_loss_clip": 0.01672699, "auxiliary_loss_mlp": 0.01067711, "balance_loss_clip": 1.42355955, "balance_loss_mlp": 1.03440392, "epoch": 0.13148955358484893, "flos": 21005824291200.0, "grad_norm": 2.3476334813928776, "language_loss": 0.81390864, "learning_rate": 3.892921946150693e-06, "loss": 0.84131277, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.33325195, "step": 2187, "time_per_iteration": 2.939473867416382 }, { "auxiliary_loss_clip": 0.0139705, "auxiliary_loss_mlp": 0.01042524, "balance_loss_clip": 1.25633216, "balance_loss_mlp": 1.02583456, "epoch": 0.13154967683751692, "flos": 70202831304960.0, "grad_norm": 0.8398198373172424, "language_loss": 0.59166908, "learning_rate": 3.892796184920778e-06, "loss": 0.61606479, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.16699219, "step": 2188, "time_per_iteration": 3.348147392272949 }, { "auxiliary_loss_clip": 0.01681324, "auxiliary_loss_mlp": 0.01066113, "balance_loss_clip": 1.43437457, "balance_loss_mlp": 1.0346421, "epoch": 0.1316098000901849, "flos": 20385869034240.0, "grad_norm": 1.718416462691601, "language_loss": 0.75070918, "learning_rate": 3.892670351915842e-06, "loss": 0.77818358, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.31445312, "step": 2189, "time_per_iteration": 2.8964555263519287 }, { "auxiliary_loss_clip": 0.01682458, "auxiliary_loss_mlp": 0.0105622, "balance_loss_clip": 1.43206394, "balance_loss_mlp": 1.02498674, "epoch": 0.13166992334285285, "flos": 23231359687680.0, "grad_norm": 2.0466837355810723, "language_loss": 0.73773986, "learning_rate": 3.892544447140657e-06, "loss": 0.76512665, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.31225586, "step": 2190, "time_per_iteration": 2.8651578426361084 }, { "auxiliary_loss_clip": 0.01669767, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.42175555, "balance_loss_mlp": 1.03456295, "epoch": 0.13173004659552082, "flos": 23341295848320.0, "grad_norm": 2.2119454856833625, "language_loss": 0.75608462, "learning_rate": 3.892418470599996e-06, "loss": 0.78343403, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.30566406, "step": 2191, "time_per_iteration": 2.8664255142211914 }, { "auxiliary_loss_clip": 0.01678812, "auxiliary_loss_mlp": 0.01059315, "balance_loss_clip": 1.42433667, "balance_loss_mlp": 1.02734351, "epoch": 0.13179016984818878, "flos": 21261376776960.0, "grad_norm": 2.046379818784184, "language_loss": 0.79937422, "learning_rate": 3.892292422298637e-06, "loss": 0.82675552, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.31958008, "step": 2192, "time_per_iteration": 2.877703905105591 }, { "auxiliary_loss_clip": 0.01690361, "auxiliary_loss_mlp": 0.01060155, "balance_loss_clip": 1.43742418, "balance_loss_mlp": 1.02618027, "epoch": 0.13185029310085675, "flos": 17785931765760.0, "grad_norm": 1.9960683765863096, "language_loss": 0.86217225, "learning_rate": 3.892166302241361e-06, "loss": 0.88967735, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 2.52929688, "router_z_loss_mlp": 0.33959961, "step": 2193, "time_per_iteration": 2.8532724380493164 }, { "auxiliary_loss_clip": 0.01398099, "auxiliary_loss_mlp": 0.0103231, "balance_loss_clip": 1.25543666, "balance_loss_mlp": 1.01695621, "epoch": 0.1319104163535247, "flos": 69884062554240.0, "grad_norm": 0.8143482546912102, "language_loss": 0.54079807, "learning_rate": 3.8920401104329475e-06, "loss": 0.5651021, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.15332031, "step": 2194, "time_per_iteration": 3.383409023284912 }, { "auxiliary_loss_clip": 0.01681369, "auxiliary_loss_mlp": 0.01061548, "balance_loss_clip": 1.43191123, "balance_loss_mlp": 1.02857459, "epoch": 0.1319705396061927, "flos": 25204283510400.0, "grad_norm": 1.642422156053264, "language_loss": 0.73202914, "learning_rate": 3.891913846878185e-06, "loss": 0.7594583, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.3293457, "step": 2195, "time_per_iteration": 2.8602216243743896 }, { "auxiliary_loss_clip": 0.01690039, "auxiliary_loss_mlp": 0.01058075, "balance_loss_clip": 1.43334174, "balance_loss_mlp": 1.02600741, "epoch": 0.13203066285886067, "flos": 20750136071040.0, "grad_norm": 1.5969099195726997, "language_loss": 0.79383564, "learning_rate": 3.891787511581859e-06, "loss": 0.82131672, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 2.5703125, "router_z_loss_mlp": 0.32080078, "step": 2196, "time_per_iteration": 2.8809094429016113 }, { "auxiliary_loss_clip": 0.01690226, "auxiliary_loss_mlp": 0.01065472, "balance_loss_clip": 1.43266702, "balance_loss_mlp": 1.03378665, "epoch": 0.13209078611152864, "flos": 22064528805120.0, "grad_norm": 1.9168544986567346, "language_loss": 0.76285124, "learning_rate": 3.89166110454876e-06, "loss": 0.79040825, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 2.57226562, "router_z_loss_mlp": 0.31677246, "step": 2197, "time_per_iteration": 2.877366781234741 }, { "auxiliary_loss_clip": 0.01710094, "auxiliary_loss_mlp": 0.01062606, "balance_loss_clip": 1.4497025, "balance_loss_mlp": 1.02872682, "epoch": 0.1321509093641966, "flos": 16292369047680.0, "grad_norm": 2.121745585866298, "language_loss": 0.81095088, "learning_rate": 3.891534625783685e-06, "loss": 0.83867788, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 2.60546875, "router_z_loss_mlp": 0.33886719, "step": 2198, "time_per_iteration": 2.793059825897217 }, { "auxiliary_loss_clip": 0.01675411, "auxiliary_loss_mlp": 0.01067863, "balance_loss_clip": 1.42110932, "balance_loss_mlp": 1.03386486, "epoch": 0.13221103261686457, "flos": 16991873942400.0, "grad_norm": 2.2277847034763516, "language_loss": 0.84749401, "learning_rate": 3.891408075291425e-06, "loss": 0.87492681, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 2.54492188, "router_z_loss_mlp": 0.34008789, "step": 2199, "time_per_iteration": 2.83859920501709 }, { "auxiliary_loss_clip": 0.01676103, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.42487001, "balance_loss_mlp": 1.0287323, "epoch": 0.13227115586953253, "flos": 34245073722240.0, "grad_norm": 1.639017068424933, "language_loss": 0.70688629, "learning_rate": 3.8912814530767826e-06, "loss": 0.73427582, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.34106445, "step": 2200, "time_per_iteration": 2.9393491744995117 }, { "auxiliary_loss_clip": 0.01662444, "auxiliary_loss_mlp": 0.01062167, "balance_loss_clip": 1.41594625, "balance_loss_mlp": 1.02847874, "epoch": 0.13233127912220052, "flos": 20714501151360.0, "grad_norm": 2.7658445640170486, "language_loss": 0.86538011, "learning_rate": 3.891154759144557e-06, "loss": 0.89262617, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.33666992, "step": 2201, "time_per_iteration": 2.8718395233154297 }, { "auxiliary_loss_clip": 0.01684257, "auxiliary_loss_mlp": 0.01056494, "balance_loss_clip": 1.42944264, "balance_loss_mlp": 1.02385485, "epoch": 0.1323914023748685, "flos": 25814239666560.0, "grad_norm": 1.816227252080514, "language_loss": 0.87703508, "learning_rate": 3.891027993499554e-06, "loss": 0.90444261, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.32641602, "step": 2202, "time_per_iteration": 2.9304707050323486 }, { "auxiliary_loss_clip": 0.01699439, "auxiliary_loss_mlp": 0.01061578, "balance_loss_clip": 1.44528806, "balance_loss_mlp": 1.02967763, "epoch": 0.13245152562753645, "flos": 21261422021760.0, "grad_norm": 1.9334356759875198, "language_loss": 0.73344547, "learning_rate": 3.89090115614658e-06, "loss": 0.76105565, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 2.54101562, "router_z_loss_mlp": 0.31860352, "step": 2203, "time_per_iteration": 2.86452317237854 }, { "auxiliary_loss_clip": 0.01682608, "auxiliary_loss_mlp": 0.01062988, "balance_loss_clip": 1.43185163, "balance_loss_mlp": 1.02929997, "epoch": 0.13251164888020442, "flos": 26621192257920.0, "grad_norm": 2.036001192286987, "language_loss": 0.75164723, "learning_rate": 3.890774247090444e-06, "loss": 0.77910316, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.33666992, "step": 2204, "time_per_iteration": 2.9218883514404297 }, { "auxiliary_loss_clip": 0.01684001, "auxiliary_loss_mlp": 0.01062396, "balance_loss_clip": 1.43226075, "balance_loss_mlp": 1.02949381, "epoch": 0.13257177213287238, "flos": 29838867788160.0, "grad_norm": 2.238147785848352, "language_loss": 0.79478085, "learning_rate": 3.89064726633596e-06, "loss": 0.82224488, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.32910156, "step": 2205, "time_per_iteration": 2.8755714893341064 }, { "auxiliary_loss_clip": 0.01668532, "auxiliary_loss_mlp": 0.01057353, "balance_loss_clip": 1.4202466, "balance_loss_mlp": 1.02535689, "epoch": 0.13263189538554035, "flos": 21298459530240.0, "grad_norm": 1.9028270300135417, "language_loss": 0.80102271, "learning_rate": 3.890520213887941e-06, "loss": 0.82828152, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.31982422, "step": 2206, "time_per_iteration": 2.8843042850494385 }, { "auxiliary_loss_clip": 0.01680659, "auxiliary_loss_mlp": 0.01061332, "balance_loss_clip": 1.42821443, "balance_loss_mlp": 1.02978921, "epoch": 0.13269201863820831, "flos": 16883068901760.0, "grad_norm": 1.9869978765794452, "language_loss": 0.75133812, "learning_rate": 3.890393089751208e-06, "loss": 0.77875799, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 2.5234375, "router_z_loss_mlp": 0.31518555, "step": 2207, "time_per_iteration": 2.794328451156616 }, { "auxiliary_loss_clip": 0.01659297, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.41533446, "balance_loss_mlp": 1.02037621, "epoch": 0.1327521418908763, "flos": 23779411678080.0, "grad_norm": 2.876504972094219, "language_loss": 0.85398322, "learning_rate": 3.890265893930578e-06, "loss": 0.8811028, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.32275391, "step": 2208, "time_per_iteration": 2.8406734466552734 }, { "auxiliary_loss_clip": 0.01652188, "auxiliary_loss_mlp": 0.01054228, "balance_loss_clip": 1.41390061, "balance_loss_mlp": 1.0228281, "epoch": 0.13281226514354427, "flos": 26516866452480.0, "grad_norm": 1.5230906910859807, "language_loss": 0.86054933, "learning_rate": 3.890138626430876e-06, "loss": 0.88761348, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.31420898, "step": 2209, "time_per_iteration": 2.881795883178711 }, { "auxiliary_loss_clip": 0.01677194, "auxiliary_loss_mlp": 0.01060504, "balance_loss_clip": 1.42701781, "balance_loss_mlp": 1.02450287, "epoch": 0.13287238839621224, "flos": 24509484074880.0, "grad_norm": 2.1293400123410406, "language_loss": 0.83495939, "learning_rate": 3.890011287256929e-06, "loss": 0.8623364, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.36010742, "step": 2210, "time_per_iteration": 2.8589446544647217 }, { "auxiliary_loss_clip": 0.01410452, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.26312733, "balance_loss_mlp": 1.01311207, "epoch": 0.1329325116488802, "flos": 67724820069120.0, "grad_norm": 0.7676414389135011, "language_loss": 0.58099425, "learning_rate": 3.889883876413563e-06, "loss": 0.60542834, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 0.19824219, "step": 2211, "time_per_iteration": 4.889142274856567 }, { "auxiliary_loss_clip": 0.01414147, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.26469052, "balance_loss_mlp": 1.00319469, "epoch": 0.13299263490154817, "flos": 72295328430720.0, "grad_norm": 0.7945020295737409, "language_loss": 0.55375969, "learning_rate": 3.889756393905611e-06, "loss": 0.57818681, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.25390625, "step": 2212, "time_per_iteration": 3.3749260902404785 }, { "auxiliary_loss_clip": 0.01676897, "auxiliary_loss_mlp": 0.01061147, "balance_loss_clip": 1.42343068, "balance_loss_mlp": 1.02726805, "epoch": 0.13305275815421613, "flos": 17940280337280.0, "grad_norm": 2.8677845111011604, "language_loss": 0.76297772, "learning_rate": 3.889628839737908e-06, "loss": 0.79035819, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 2.53515625, "router_z_loss_mlp": 0.33886719, "step": 2213, "time_per_iteration": 2.8619086742401123 }, { "auxiliary_loss_clip": 0.0165671, "auxiliary_loss_mlp": 0.0105635, "balance_loss_clip": 1.41372371, "balance_loss_mlp": 1.02530766, "epoch": 0.13311288140688413, "flos": 22349698652160.0, "grad_norm": 1.8675936190606444, "language_loss": 0.80350614, "learning_rate": 3.889501213915291e-06, "loss": 0.83063668, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.31054688, "step": 2214, "time_per_iteration": 2.9495420455932617 }, { "auxiliary_loss_clip": 0.01664263, "auxiliary_loss_mlp": 0.01057818, "balance_loss_clip": 1.41579175, "balance_loss_mlp": 1.02548814, "epoch": 0.1331730046595521, "flos": 31881523127040.0, "grad_norm": 2.1190203223969144, "language_loss": 0.70033765, "learning_rate": 3.889373516442597e-06, "loss": 0.72755849, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.32348633, "step": 2215, "time_per_iteration": 2.9240376949310303 }, { "auxiliary_loss_clip": 0.01663946, "auxiliary_loss_mlp": 0.01052852, "balance_loss_clip": 1.41442728, "balance_loss_mlp": 1.02130914, "epoch": 0.13323312791222006, "flos": 22577217344640.0, "grad_norm": 1.6538586919457354, "language_loss": 0.81904364, "learning_rate": 3.889245747324671e-06, "loss": 0.84621167, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.31542969, "step": 2216, "time_per_iteration": 2.945469856262207 }, { "auxiliary_loss_clip": 0.01669149, "auxiliary_loss_mlp": 0.01061474, "balance_loss_clip": 1.4211359, "balance_loss_mlp": 1.02809513, "epoch": 0.13329325116488802, "flos": 15093568074240.0, "grad_norm": 3.0655652030881173, "language_loss": 0.89233387, "learning_rate": 3.889117906566356e-06, "loss": 0.91964006, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.33349609, "step": 2217, "time_per_iteration": 2.9750473499298096 }, { "auxiliary_loss_clip": 0.01650362, "auxiliary_loss_mlp": 0.01062849, "balance_loss_clip": 1.40606153, "balance_loss_mlp": 1.02541733, "epoch": 0.133353374417556, "flos": 27465046623360.0, "grad_norm": 3.6468459193249894, "language_loss": 0.75821477, "learning_rate": 3.888989994172501e-06, "loss": 0.78534687, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.37451172, "step": 2218, "time_per_iteration": 4.425408363342285 }, { "auxiliary_loss_clip": 0.01667583, "auxiliary_loss_mlp": 0.01052221, "balance_loss_clip": 1.41787231, "balance_loss_mlp": 1.01862812, "epoch": 0.13341349767022395, "flos": 24104605190400.0, "grad_norm": 1.9672089722879005, "language_loss": 0.88079703, "learning_rate": 3.8888620101479565e-06, "loss": 0.9079951, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.3359375, "step": 2219, "time_per_iteration": 2.887049674987793 }, { "auxiliary_loss_clip": 0.01691852, "auxiliary_loss_mlp": 0.01068274, "balance_loss_clip": 1.44059873, "balance_loss_mlp": 1.03456175, "epoch": 0.13347362092289192, "flos": 24143814449280.0, "grad_norm": 1.4661998144325583, "language_loss": 0.78493011, "learning_rate": 3.888733954497574e-06, "loss": 0.81253147, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.33740234, "step": 2220, "time_per_iteration": 5.682001352310181 }, { "auxiliary_loss_clip": 0.01660942, "auxiliary_loss_mlp": 0.01060413, "balance_loss_clip": 1.41361594, "balance_loss_mlp": 1.02760613, "epoch": 0.1335337441755599, "flos": 18444236630400.0, "grad_norm": 2.721538810129121, "language_loss": 0.80234587, "learning_rate": 3.888605827226212e-06, "loss": 0.82955945, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 2.47460938, "router_z_loss_mlp": 0.328125, "step": 2221, "time_per_iteration": 3.0044450759887695 }, { "auxiliary_loss_clip": 0.01372753, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.22806633, "balance_loss_mlp": 1.00461757, "epoch": 0.13359386742822787, "flos": 50636221447680.0, "grad_norm": 0.9701914452700214, "language_loss": 0.69067019, "learning_rate": 3.8884776283387275e-06, "loss": 0.71466517, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.22167969, "step": 2222, "time_per_iteration": 3.234295129776001 }, { "auxiliary_loss_clip": 0.01668832, "auxiliary_loss_mlp": 0.01064124, "balance_loss_clip": 1.42110586, "balance_loss_mlp": 1.03217626, "epoch": 0.13365399068089584, "flos": 22787452523520.0, "grad_norm": 1.8615036882885343, "language_loss": 0.67828512, "learning_rate": 3.888349357839982e-06, "loss": 0.70561469, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.31933594, "step": 2223, "time_per_iteration": 2.870704412460327 }, { "auxiliary_loss_clip": 0.01674499, "auxiliary_loss_mlp": 0.01065433, "balance_loss_clip": 1.4228878, "balance_loss_mlp": 1.02983713, "epoch": 0.1337141139335638, "flos": 12539264826240.0, "grad_norm": 1.8214662679262708, "language_loss": 0.83334225, "learning_rate": 3.88822101573484e-06, "loss": 0.86074162, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.35595703, "step": 2224, "time_per_iteration": 2.8814492225646973 }, { "auxiliary_loss_clip": 0.01668246, "auxiliary_loss_mlp": 0.01063721, "balance_loss_clip": 1.41306114, "balance_loss_mlp": 1.03031802, "epoch": 0.13377423718623177, "flos": 23049203546880.0, "grad_norm": 2.3635478795185145, "language_loss": 0.6802423, "learning_rate": 3.888092602028167e-06, "loss": 0.70756197, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.33398438, "step": 2225, "time_per_iteration": 2.8472397327423096 }, { "auxiliary_loss_clip": 0.01668403, "auxiliary_loss_mlp": 0.01065517, "balance_loss_clip": 1.41924739, "balance_loss_mlp": 1.03232932, "epoch": 0.13383436043889974, "flos": 16224945016320.0, "grad_norm": 2.2150772197248263, "language_loss": 0.91311574, "learning_rate": 3.887964116724835e-06, "loss": 0.94045496, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.33203125, "step": 2226, "time_per_iteration": 2.8394641876220703 }, { "auxiliary_loss_clip": 0.01662154, "auxiliary_loss_mlp": 0.01066676, "balance_loss_clip": 1.41052139, "balance_loss_mlp": 1.03239179, "epoch": 0.1338944836915677, "flos": 24290290425600.0, "grad_norm": 17.462183445264127, "language_loss": 0.75271404, "learning_rate": 3.887835559829712e-06, "loss": 0.78000236, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.34301758, "step": 2227, "time_per_iteration": 2.850189447402954 }, { "auxiliary_loss_clip": 0.01663371, "auxiliary_loss_mlp": 0.01056091, "balance_loss_clip": 1.41227126, "balance_loss_mlp": 1.02378511, "epoch": 0.1339546069442357, "flos": 17606580802560.0, "grad_norm": 1.916135448850988, "language_loss": 0.86129403, "learning_rate": 3.8877069313476764e-06, "loss": 0.88848865, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.32275391, "step": 2228, "time_per_iteration": 2.844377040863037 }, { "auxiliary_loss_clip": 0.0164922, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.40495801, "balance_loss_mlp": 1.02342474, "epoch": 0.13401473019690366, "flos": 18999527788800.0, "grad_norm": 1.7417318010054608, "language_loss": 0.82190472, "learning_rate": 3.8875782312836054e-06, "loss": 0.84895563, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.32446289, "step": 2229, "time_per_iteration": 2.8503780364990234 }, { "auxiliary_loss_clip": 0.01669237, "auxiliary_loss_mlp": 0.01066364, "balance_loss_clip": 1.41879451, "balance_loss_mlp": 1.03336644, "epoch": 0.13407485344957162, "flos": 26955298995840.0, "grad_norm": 2.6417757035260596, "language_loss": 0.75545585, "learning_rate": 3.887449459642378e-06, "loss": 0.78281188, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.33007812, "step": 2230, "time_per_iteration": 2.907579183578491 }, { "auxiliary_loss_clip": 0.01656785, "auxiliary_loss_mlp": 0.01057601, "balance_loss_clip": 1.40785241, "balance_loss_mlp": 1.02498519, "epoch": 0.1341349767022396, "flos": 20349012504960.0, "grad_norm": 1.9170354954648896, "language_loss": 0.81355876, "learning_rate": 3.8873206164288785e-06, "loss": 0.84070265, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.32568359, "step": 2231, "time_per_iteration": 2.8563761711120605 }, { "auxiliary_loss_clip": 0.01679295, "auxiliary_loss_mlp": 0.01069893, "balance_loss_clip": 1.42350769, "balance_loss_mlp": 1.03482127, "epoch": 0.13419509995490755, "flos": 29874095504640.0, "grad_norm": 1.4958494686436283, "language_loss": 0.73531151, "learning_rate": 3.887191701647992e-06, "loss": 0.76280344, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.35058594, "step": 2232, "time_per_iteration": 2.955695390701294 }, { "auxiliary_loss_clip": 0.01679415, "auxiliary_loss_mlp": 0.01059809, "balance_loss_clip": 1.42565048, "balance_loss_mlp": 1.02671647, "epoch": 0.13425522320757552, "flos": 26954258365440.0, "grad_norm": 2.6218884210703304, "language_loss": 0.67043585, "learning_rate": 3.8870627153046066e-06, "loss": 0.69782805, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.33081055, "step": 2233, "time_per_iteration": 2.9068074226379395 }, { "auxiliary_loss_clip": 0.0165973, "auxiliary_loss_mlp": 0.01064774, "balance_loss_clip": 1.40951562, "balance_loss_mlp": 1.02986979, "epoch": 0.1343153464602435, "flos": 15785381352960.0, "grad_norm": 3.2636277228708157, "language_loss": 0.83347625, "learning_rate": 3.886933657403615e-06, "loss": 0.86072129, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.34887695, "step": 2234, "time_per_iteration": 2.8156683444976807 }, { "auxiliary_loss_clip": 0.01674627, "auxiliary_loss_mlp": 0.01060823, "balance_loss_clip": 1.42338347, "balance_loss_mlp": 1.02784967, "epoch": 0.13437546971291148, "flos": 24325020449280.0, "grad_norm": 2.543189045103409, "language_loss": 0.82392395, "learning_rate": 3.886804527949909e-06, "loss": 0.85127842, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.32983398, "step": 2235, "time_per_iteration": 2.8894009590148926 }, { "auxiliary_loss_clip": 0.01674732, "auxiliary_loss_mlp": 0.01065244, "balance_loss_clip": 1.42466021, "balance_loss_mlp": 1.03115058, "epoch": 0.13443559296557944, "flos": 26661758860800.0, "grad_norm": 2.854772423550955, "language_loss": 0.87054396, "learning_rate": 3.8866753269483864e-06, "loss": 0.89794374, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.34106445, "step": 2236, "time_per_iteration": 3.0925490856170654 }, { "auxiliary_loss_clip": 0.01676027, "auxiliary_loss_mlp": 0.01069359, "balance_loss_clip": 1.42150438, "balance_loss_mlp": 1.03273773, "epoch": 0.1344957162182474, "flos": 21805537714560.0, "grad_norm": 1.640106243547918, "language_loss": 0.78105074, "learning_rate": 3.886546054403946e-06, "loss": 0.80850458, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 2.54296875, "router_z_loss_mlp": 0.36572266, "step": 2237, "time_per_iteration": 2.886073589324951 }, { "auxiliary_loss_clip": 0.01680352, "auxiliary_loss_mlp": 0.01065802, "balance_loss_clip": 1.4279809, "balance_loss_mlp": 1.03230405, "epoch": 0.13455583947091537, "flos": 19875216510720.0, "grad_norm": 1.9295275703255441, "language_loss": 0.80783224, "learning_rate": 3.886416710321491e-06, "loss": 0.83529377, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 2.52539062, "router_z_loss_mlp": 0.33496094, "step": 2238, "time_per_iteration": 2.8664093017578125 }, { "auxiliary_loss_clip": 0.0165413, "auxiliary_loss_mlp": 0.01055559, "balance_loss_clip": 1.40627813, "balance_loss_mlp": 1.02227569, "epoch": 0.13461596272358334, "flos": 30859086960000.0, "grad_norm": 2.5617866330023746, "language_loss": 0.68794793, "learning_rate": 3.886287294705924e-06, "loss": 0.71504474, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.33276367, "step": 2239, "time_per_iteration": 2.922264337539673 }, { "auxiliary_loss_clip": 0.01667044, "auxiliary_loss_mlp": 0.01056813, "balance_loss_clip": 1.4162885, "balance_loss_mlp": 1.02233815, "epoch": 0.1346760859762513, "flos": 12501910604160.0, "grad_norm": 2.2735017952947607, "language_loss": 0.83779114, "learning_rate": 3.8861578075621555e-06, "loss": 0.86502969, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.34472656, "step": 2240, "time_per_iteration": 2.898653984069824 }, { "auxiliary_loss_clip": 0.01655942, "auxiliary_loss_mlp": 0.01052454, "balance_loss_clip": 1.40491486, "balance_loss_mlp": 1.01776409, "epoch": 0.1347362092289193, "flos": 21846240051840.0, "grad_norm": 1.6682507039330976, "language_loss": 0.79314375, "learning_rate": 3.886028248895093e-06, "loss": 0.82022774, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.34692383, "step": 2241, "time_per_iteration": 2.8611223697662354 }, { "auxiliary_loss_clip": 0.01658151, "auxiliary_loss_mlp": 0.0105429, "balance_loss_clip": 1.41353095, "balance_loss_mlp": 1.02193689, "epoch": 0.13479633248158726, "flos": 23518837019520.0, "grad_norm": 4.032841671550493, "language_loss": 0.84832186, "learning_rate": 3.88589861870965e-06, "loss": 0.87544626, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.32324219, "step": 2242, "time_per_iteration": 2.873443603515625 }, { "auxiliary_loss_clip": 0.01661343, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.41140091, "balance_loss_mlp": 1.02875507, "epoch": 0.13485645573425523, "flos": 29354484510720.0, "grad_norm": 2.7324749209159416, "language_loss": 0.66733652, "learning_rate": 3.885768917010744e-06, "loss": 0.69458604, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.34887695, "step": 2243, "time_per_iteration": 2.946354866027832 }, { "auxiliary_loss_clip": 0.01633454, "auxiliary_loss_mlp": 0.0105853, "balance_loss_clip": 1.39133239, "balance_loss_mlp": 1.02646255, "epoch": 0.1349165789869232, "flos": 28048552554240.0, "grad_norm": 1.3540645627905992, "language_loss": 0.73497415, "learning_rate": 3.8856391438032895e-06, "loss": 0.76189399, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.32055664, "step": 2244, "time_per_iteration": 2.9020535945892334 }, { "auxiliary_loss_clip": 0.01658021, "auxiliary_loss_mlp": 0.01057733, "balance_loss_clip": 1.41167855, "balance_loss_mlp": 1.02530789, "epoch": 0.13497670223959116, "flos": 22863201598080.0, "grad_norm": 1.6189596213389545, "language_loss": 0.87018055, "learning_rate": 3.88550929909221e-06, "loss": 0.89733815, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.32446289, "step": 2245, "time_per_iteration": 2.9105844497680664 }, { "auxiliary_loss_clip": 0.01651272, "auxiliary_loss_mlp": 0.01059619, "balance_loss_clip": 1.41018355, "balance_loss_mlp": 1.02504802, "epoch": 0.13503682549225912, "flos": 16512241368960.0, "grad_norm": 1.5945270584582532, "language_loss": 0.7970829, "learning_rate": 3.88537938288243e-06, "loss": 0.82419181, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.34570312, "step": 2246, "time_per_iteration": 4.309743881225586 }, { "auxiliary_loss_clip": 0.01393707, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.24254274, "balance_loss_mlp": 1.00510204, "epoch": 0.1350969487449271, "flos": 70786834928640.0, "grad_norm": 0.7861586810762631, "language_loss": 0.6058532, "learning_rate": 3.885249395178874e-06, "loss": 0.63006449, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 1.515625, "router_z_loss_mlp": 0.22363281, "step": 2247, "time_per_iteration": 3.4398880004882812 }, { "auxiliary_loss_clip": 0.01692989, "auxiliary_loss_mlp": 0.01064746, "balance_loss_clip": 1.43436265, "balance_loss_mlp": 1.02955556, "epoch": 0.13515707199759508, "flos": 23086376789760.0, "grad_norm": 2.1930484931520824, "language_loss": 0.82009423, "learning_rate": 3.885119335986473e-06, "loss": 0.84767151, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 2.5859375, "router_z_loss_mlp": 0.3515625, "step": 2248, "time_per_iteration": 2.9154210090637207 }, { "auxiliary_loss_clip": 0.01652188, "auxiliary_loss_mlp": 0.01051647, "balance_loss_clip": 1.40858138, "balance_loss_mlp": 1.0197227, "epoch": 0.13521719525026304, "flos": 23197036867200.0, "grad_norm": 2.3099519486092364, "language_loss": 0.77784169, "learning_rate": 3.884989205310157e-06, "loss": 0.80488002, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 2.4375, "router_z_loss_mlp": 0.3190918, "step": 2249, "time_per_iteration": 2.9171531200408936 }, { "auxiliary_loss_clip": 0.01650041, "auxiliary_loss_mlp": 0.01059806, "balance_loss_clip": 1.40473747, "balance_loss_mlp": 1.02699912, "epoch": 0.135277318502931, "flos": 24801802600320.0, "grad_norm": 1.4402641800808507, "language_loss": 0.85612535, "learning_rate": 3.884859003154862e-06, "loss": 0.88322377, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.32788086, "step": 2250, "time_per_iteration": 2.8820340633392334 }, { "auxiliary_loss_clip": 0.01665566, "auxiliary_loss_mlp": 0.01066516, "balance_loss_clip": 1.41424727, "balance_loss_mlp": 1.03046703, "epoch": 0.13533744175559898, "flos": 21918641011200.0, "grad_norm": 5.11872802917947, "language_loss": 0.83319372, "learning_rate": 3.884728729525524e-06, "loss": 0.86051452, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.36035156, "step": 2251, "time_per_iteration": 2.8914918899536133 }, { "auxiliary_loss_clip": 0.01660265, "auxiliary_loss_mlp": 0.0106473, "balance_loss_clip": 1.41006017, "balance_loss_mlp": 1.03261459, "epoch": 0.13539756500826694, "flos": 21220629194880.0, "grad_norm": 1.7919121606973332, "language_loss": 0.86674696, "learning_rate": 3.884598384427084e-06, "loss": 0.89399695, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 2.50390625, "router_z_loss_mlp": 0.32080078, "step": 2252, "time_per_iteration": 2.802342176437378 }, { "auxiliary_loss_clip": 0.01397023, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.24758983, "balance_loss_mlp": 1.00765324, "epoch": 0.1354576882609349, "flos": 63271170322560.0, "grad_norm": 0.7940009485906834, "language_loss": 0.61923593, "learning_rate": 3.884467967864485e-06, "loss": 0.6435011, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 1.5, "router_z_loss_mlp": 0.21875, "step": 2253, "time_per_iteration": 4.861313581466675 }, { "auxiliary_loss_clip": 0.01673203, "auxiliary_loss_mlp": 0.01074073, "balance_loss_clip": 1.42429209, "balance_loss_mlp": 1.04202974, "epoch": 0.1355178115136029, "flos": 25493796858240.0, "grad_norm": 1.7040865700206325, "language_loss": 0.90616304, "learning_rate": 3.884337479842671e-06, "loss": 0.93363583, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.32006836, "step": 2254, "time_per_iteration": 2.949700117111206 }, { "auxiliary_loss_clip": 0.01679305, "auxiliary_loss_mlp": 0.01077563, "balance_loss_clip": 1.42450094, "balance_loss_mlp": 1.04394603, "epoch": 0.13557793476627086, "flos": 21627091647360.0, "grad_norm": 1.7491077381864728, "language_loss": 0.86366153, "learning_rate": 3.884206920366591e-06, "loss": 0.89123023, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.33618164, "step": 2255, "time_per_iteration": 5.677891731262207 }, { "auxiliary_loss_clip": 0.01660898, "auxiliary_loss_mlp": 0.01082293, "balance_loss_clip": 1.41341424, "balance_loss_mlp": 1.04903388, "epoch": 0.13563805801893883, "flos": 24936922131840.0, "grad_norm": 3.4424764159190784, "language_loss": 0.76227313, "learning_rate": 3.884076289441196e-06, "loss": 0.78970504, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.33276367, "step": 2256, "time_per_iteration": 2.937514066696167 }, { "auxiliary_loss_clip": 0.01667709, "auxiliary_loss_mlp": 0.01077921, "balance_loss_clip": 1.41406727, "balance_loss_mlp": 1.04387486, "epoch": 0.1356981812716068, "flos": 14756701403520.0, "grad_norm": 1.8689864515134464, "language_loss": 0.8449868, "learning_rate": 3.88394558707144e-06, "loss": 0.87244308, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 2.5390625, "router_z_loss_mlp": 0.34033203, "step": 2257, "time_per_iteration": 2.8200438022613525 }, { "auxiliary_loss_clip": 0.01681505, "auxiliary_loss_mlp": 0.01079882, "balance_loss_clip": 1.42657995, "balance_loss_mlp": 1.04690862, "epoch": 0.13575830452427476, "flos": 11115840827520.0, "grad_norm": 2.5465193847240295, "language_loss": 0.83550143, "learning_rate": 3.883814813262277e-06, "loss": 0.86311531, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 2.55273438, "router_z_loss_mlp": 0.32983398, "step": 2258, "time_per_iteration": 2.8580639362335205 }, { "auxiliary_loss_clip": 0.01670677, "auxiliary_loss_mlp": 0.01071001, "balance_loss_clip": 1.41687489, "balance_loss_mlp": 1.03857636, "epoch": 0.13581842777694272, "flos": 17967363989760.0, "grad_norm": 3.1690223270620055, "language_loss": 0.84782302, "learning_rate": 3.883683968018669e-06, "loss": 0.87523979, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 2.53710938, "router_z_loss_mlp": 0.32421875, "step": 2259, "time_per_iteration": 2.7921218872070312 }, { "auxiliary_loss_clip": 0.01648263, "auxiliary_loss_mlp": 0.01081888, "balance_loss_clip": 1.40323055, "balance_loss_mlp": 1.05060768, "epoch": 0.1358785510296107, "flos": 22867499854080.0, "grad_norm": 3.877350175983871, "language_loss": 0.74765307, "learning_rate": 3.8835530513455755e-06, "loss": 0.77495456, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.31298828, "step": 2260, "time_per_iteration": 2.8964359760284424 }, { "auxiliary_loss_clip": 0.0164955, "auxiliary_loss_mlp": 0.01074562, "balance_loss_clip": 1.40301788, "balance_loss_mlp": 1.04344821, "epoch": 0.13593867428227868, "flos": 25750480464000.0, "grad_norm": 2.31451821812165, "language_loss": 0.76886541, "learning_rate": 3.883422063247961e-06, "loss": 0.79610658, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.3112793, "step": 2261, "time_per_iteration": 2.8680355548858643 }, { "auxiliary_loss_clip": 0.01654392, "auxiliary_loss_mlp": 0.01073179, "balance_loss_clip": 1.40667892, "balance_loss_mlp": 1.04068255, "epoch": 0.13599879753494665, "flos": 31261839338880.0, "grad_norm": 2.3535543114740345, "language_loss": 0.64015353, "learning_rate": 3.883291003730794e-06, "loss": 0.66742921, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.32519531, "step": 2262, "time_per_iteration": 2.9303812980651855 }, { "auxiliary_loss_clip": 0.01660415, "auxiliary_loss_mlp": 0.01064852, "balance_loss_clip": 1.41097331, "balance_loss_mlp": 1.03416777, "epoch": 0.1360589207876146, "flos": 23925616185600.0, "grad_norm": 2.7275256750757784, "language_loss": 0.84277189, "learning_rate": 3.883159872799043e-06, "loss": 0.87002462, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.30664062, "step": 2263, "time_per_iteration": 2.912876844406128 }, { "auxiliary_loss_clip": 0.01656477, "auxiliary_loss_mlp": 0.01079756, "balance_loss_clip": 1.40524697, "balance_loss_mlp": 1.04575753, "epoch": 0.13611904404028258, "flos": 19983659592960.0, "grad_norm": 7.81441838823987, "language_loss": 0.88546008, "learning_rate": 3.8830286704576815e-06, "loss": 0.91282248, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 2.51367188, "router_z_loss_mlp": 0.33984375, "step": 2264, "time_per_iteration": 2.870145082473755 }, { "auxiliary_loss_clip": 0.01657004, "auxiliary_loss_mlp": 0.01073734, "balance_loss_clip": 1.4040767, "balance_loss_mlp": 1.03856683, "epoch": 0.13617916729295054, "flos": 15348939580800.0, "grad_norm": 5.522414941165018, "language_loss": 0.72830987, "learning_rate": 3.882897396711683e-06, "loss": 0.75561726, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.35180664, "step": 2265, "time_per_iteration": 2.8204309940338135 }, { "auxiliary_loss_clip": 0.01635791, "auxiliary_loss_mlp": 0.01069001, "balance_loss_clip": 1.3912468, "balance_loss_mlp": 1.03567028, "epoch": 0.1362392905456185, "flos": 27462241445760.0, "grad_norm": 2.0237558501741666, "language_loss": 0.67840707, "learning_rate": 3.882766051566027e-06, "loss": 0.70545495, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.33325195, "step": 2266, "time_per_iteration": 2.8790993690490723 }, { "auxiliary_loss_clip": 0.01653075, "auxiliary_loss_mlp": 0.01073313, "balance_loss_clip": 1.40594435, "balance_loss_mlp": 1.03936195, "epoch": 0.1362994137982865, "flos": 25019729395200.0, "grad_norm": 1.581117339619779, "language_loss": 0.7757796, "learning_rate": 3.882634635025694e-06, "loss": 0.80304354, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.33935547, "step": 2267, "time_per_iteration": 2.9164888858795166 }, { "auxiliary_loss_clip": 0.0164442, "auxiliary_loss_mlp": 0.01068306, "balance_loss_clip": 1.39749694, "balance_loss_mlp": 1.03578568, "epoch": 0.13635953705095447, "flos": 20312698913280.0, "grad_norm": 2.277653474466681, "language_loss": 0.83072019, "learning_rate": 3.882503147095667e-06, "loss": 0.85784745, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.32495117, "step": 2268, "time_per_iteration": 2.82979679107666 }, { "auxiliary_loss_clip": 0.01642001, "auxiliary_loss_mlp": 0.01068814, "balance_loss_clip": 1.3992033, "balance_loss_mlp": 1.03526831, "epoch": 0.13641966030362243, "flos": 31371006337920.0, "grad_norm": 1.7735982068693383, "language_loss": 0.77372926, "learning_rate": 3.882371587780931e-06, "loss": 0.8008374, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.33569336, "step": 2269, "time_per_iteration": 2.9534313678741455 }, { "auxiliary_loss_clip": 0.01660239, "auxiliary_loss_mlp": 0.01065574, "balance_loss_clip": 1.41085374, "balance_loss_mlp": 1.03338742, "epoch": 0.1364797835562904, "flos": 20485896583680.0, "grad_norm": 1.8665080539816428, "language_loss": 0.81702566, "learning_rate": 3.882239957086477e-06, "loss": 0.84428376, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.32202148, "step": 2270, "time_per_iteration": 2.8775389194488525 }, { "auxiliary_loss_clip": 0.01639669, "auxiliary_loss_mlp": 0.01068205, "balance_loss_clip": 1.39274335, "balance_loss_mlp": 1.03430176, "epoch": 0.13653990680895836, "flos": 13086004717440.0, "grad_norm": 2.830614972113301, "language_loss": 0.7773959, "learning_rate": 3.882108255017295e-06, "loss": 0.80447465, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.33886719, "step": 2271, "time_per_iteration": 2.9800453186035156 }, { "auxiliary_loss_clip": 0.01635982, "auxiliary_loss_mlp": 0.01069304, "balance_loss_clip": 1.38929415, "balance_loss_mlp": 1.0356636, "epoch": 0.13660003006162633, "flos": 16955605595520.0, "grad_norm": 2.1387795377239236, "language_loss": 0.8202256, "learning_rate": 3.881976481578379e-06, "loss": 0.84727848, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.33666992, "step": 2272, "time_per_iteration": 2.921325445175171 }, { "auxiliary_loss_clip": 0.0137176, "auxiliary_loss_mlp": 0.01040989, "balance_loss_clip": 1.2307651, "balance_loss_mlp": 1.0183872, "epoch": 0.1366601533142943, "flos": 68714336004480.0, "grad_norm": 0.7019627995990968, "language_loss": 0.60813165, "learning_rate": 3.8818446367747255e-06, "loss": 0.63225913, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.22558594, "step": 2273, "time_per_iteration": 3.500204086303711 }, { "auxiliary_loss_clip": 0.01638454, "auxiliary_loss_mlp": 0.01052516, "balance_loss_clip": 1.39592695, "balance_loss_mlp": 1.02044916, "epoch": 0.13672027656696228, "flos": 19253451461760.0, "grad_norm": 1.6407824163487523, "language_loss": 0.78502727, "learning_rate": 3.881712720611336e-06, "loss": 0.81193703, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 2.42773438, "router_z_loss_mlp": 0.32055664, "step": 2274, "time_per_iteration": 2.8962368965148926 }, { "auxiliary_loss_clip": 0.01646454, "auxiliary_loss_mlp": 0.01060091, "balance_loss_clip": 1.39985061, "balance_loss_mlp": 1.02594924, "epoch": 0.13678039981963025, "flos": 24545571442560.0, "grad_norm": 1.8785823736114449, "language_loss": 0.79385293, "learning_rate": 3.881580733093211e-06, "loss": 0.82091832, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.34130859, "step": 2275, "time_per_iteration": 2.924734354019165 }, { "auxiliary_loss_clip": 0.01642125, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.39433694, "balance_loss_mlp": 1.01625824, "epoch": 0.13684052307229821, "flos": 15677797921920.0, "grad_norm": 3.1705923874028636, "language_loss": 0.8263731, "learning_rate": 3.881448674225356e-06, "loss": 0.85327667, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.31958008, "step": 2276, "time_per_iteration": 2.933454751968384 }, { "auxiliary_loss_clip": 0.01663632, "auxiliary_loss_mlp": 0.01060852, "balance_loss_clip": 1.4104178, "balance_loss_mlp": 1.02597153, "epoch": 0.13690064632496618, "flos": 28376008306560.0, "grad_norm": 2.2695959983170453, "language_loss": 0.72001362, "learning_rate": 3.881316544012779e-06, "loss": 0.74725854, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 2.53320312, "router_z_loss_mlp": 0.34863281, "step": 2277, "time_per_iteration": 2.898597240447998 }, { "auxiliary_loss_clip": 0.01659231, "auxiliary_loss_mlp": 0.01059157, "balance_loss_clip": 1.40751433, "balance_loss_mlp": 1.02620721, "epoch": 0.13696076957763414, "flos": 23415370865280.0, "grad_norm": 3.0364842659991287, "language_loss": 0.81212533, "learning_rate": 3.88118434246049e-06, "loss": 0.83930922, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.32958984, "step": 2278, "time_per_iteration": 2.833397626876831 }, { "auxiliary_loss_clip": 0.01646031, "auxiliary_loss_mlp": 0.01054383, "balance_loss_clip": 1.39948809, "balance_loss_mlp": 1.02238691, "epoch": 0.1370208928303021, "flos": 37210816350720.0, "grad_norm": 2.4041500475165125, "language_loss": 0.7602998, "learning_rate": 3.881052069573502e-06, "loss": 0.78730386, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.31982422, "step": 2279, "time_per_iteration": 3.051732063293457 }, { "auxiliary_loss_clip": 0.01656124, "auxiliary_loss_mlp": 0.01054392, "balance_loss_clip": 1.40606272, "balance_loss_mlp": 1.02134705, "epoch": 0.13708101608297008, "flos": 26986771393920.0, "grad_norm": 1.8178205147775817, "language_loss": 0.77806687, "learning_rate": 3.880919725356831e-06, "loss": 0.80517203, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 0.33056641, "step": 2280, "time_per_iteration": 2.9367496967315674 }, { "auxiliary_loss_clip": 0.01637707, "auxiliary_loss_mlp": 0.01053937, "balance_loss_clip": 1.39479423, "balance_loss_mlp": 1.02217996, "epoch": 0.13714113933563807, "flos": 32568314232960.0, "grad_norm": 2.2179777110813275, "language_loss": 0.806638, "learning_rate": 3.880787309815496e-06, "loss": 0.83355439, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.31787109, "step": 2281, "time_per_iteration": 4.357746124267578 }, { "auxiliary_loss_clip": 0.016762, "auxiliary_loss_mlp": 0.01062428, "balance_loss_clip": 1.42039394, "balance_loss_mlp": 1.03024113, "epoch": 0.13720126258830603, "flos": 16108312625280.0, "grad_norm": 1.6606055116247374, "language_loss": 0.84687352, "learning_rate": 3.880654822954518e-06, "loss": 0.87425983, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 2.55859375, "router_z_loss_mlp": 0.32177734, "step": 2282, "time_per_iteration": 2.9281721115112305 }, { "auxiliary_loss_clip": 0.01657257, "auxiliary_loss_mlp": 0.0105731, "balance_loss_clip": 1.41208744, "balance_loss_mlp": 1.02529025, "epoch": 0.137261385840974, "flos": 18962987973120.0, "grad_norm": 1.608301336826467, "language_loss": 0.74434549, "learning_rate": 3.8805222647789195e-06, "loss": 0.77149117, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.31982422, "step": 2283, "time_per_iteration": 2.8594882488250732 }, { "auxiliary_loss_clip": 0.01646385, "auxiliary_loss_mlp": 0.01058813, "balance_loss_clip": 1.40360332, "balance_loss_mlp": 1.02750826, "epoch": 0.13732150909364196, "flos": 23305932397440.0, "grad_norm": 2.4711927899980637, "language_loss": 0.86167693, "learning_rate": 3.880389635293729e-06, "loss": 0.88872886, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31298828, "step": 2284, "time_per_iteration": 2.8668174743652344 }, { "auxiliary_loss_clip": 0.01671103, "auxiliary_loss_mlp": 0.01067691, "balance_loss_clip": 1.41624928, "balance_loss_mlp": 1.03393114, "epoch": 0.13738163234630993, "flos": 29363216757120.0, "grad_norm": 2.858938210318736, "language_loss": 0.76481891, "learning_rate": 3.880256934503974e-06, "loss": 0.79220688, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 2.54882812, "router_z_loss_mlp": 0.33789062, "step": 2285, "time_per_iteration": 2.924229621887207 }, { "auxiliary_loss_clip": 0.01652349, "auxiliary_loss_mlp": 0.01059683, "balance_loss_clip": 1.40612292, "balance_loss_mlp": 1.02587509, "epoch": 0.1374417555989779, "flos": 26662573267200.0, "grad_norm": 1.63354201659485, "language_loss": 0.75588089, "learning_rate": 3.880124162414689e-06, "loss": 0.78300124, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 2.46679688, "router_z_loss_mlp": 0.33813477, "step": 2286, "time_per_iteration": 2.8651490211486816 }, { "auxiliary_loss_clip": 0.01669816, "auxiliary_loss_mlp": 0.01056692, "balance_loss_clip": 1.41990328, "balance_loss_mlp": 1.02283621, "epoch": 0.1375018788516459, "flos": 28415308055040.0, "grad_norm": 2.1169969417480305, "language_loss": 0.87405366, "learning_rate": 3.879991319030908e-06, "loss": 0.90131879, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.33862305, "step": 2287, "time_per_iteration": 2.8983330726623535 }, { "auxiliary_loss_clip": 0.01651091, "auxiliary_loss_mlp": 0.01059394, "balance_loss_clip": 1.40427899, "balance_loss_mlp": 1.02685022, "epoch": 0.13756200210431385, "flos": 37427612025600.0, "grad_norm": 1.8316114544046702, "language_loss": 0.69960749, "learning_rate": 3.879858404357666e-06, "loss": 0.72671229, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.32543945, "step": 2288, "time_per_iteration": 4.37264347076416 }, { "auxiliary_loss_clip": 0.01648408, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.40008843, "balance_loss_mlp": 1.02622271, "epoch": 0.13762212535698182, "flos": 22721295346560.0, "grad_norm": 2.6579113753426897, "language_loss": 0.88650221, "learning_rate": 3.879725418400005e-06, "loss": 0.9135974, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 2.48242188, "router_z_loss_mlp": 0.34863281, "step": 2289, "time_per_iteration": 2.8573062419891357 }, { "auxiliary_loss_clip": 0.01633658, "auxiliary_loss_mlp": 0.01051392, "balance_loss_clip": 1.39061356, "balance_loss_mlp": 1.02070785, "epoch": 0.13768224860964978, "flos": 23962879918080.0, "grad_norm": 1.7489766907309152, "language_loss": 0.75444436, "learning_rate": 3.879592361162969e-06, "loss": 0.78129482, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.30639648, "step": 2290, "time_per_iteration": 5.800458908081055 }, { "auxiliary_loss_clip": 0.0136763, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.22327924, "balance_loss_mlp": 1.00239408, "epoch": 0.13774237186231775, "flos": 63624777569280.0, "grad_norm": 0.7047354480376266, "language_loss": 0.5167712, "learning_rate": 3.8794592326516015e-06, "loss": 0.54072702, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 0.25585938, "step": 2291, "time_per_iteration": 3.4303178787231445 }, { "auxiliary_loss_clip": 0.01675091, "auxiliary_loss_mlp": 0.01054533, "balance_loss_clip": 1.42584825, "balance_loss_mlp": 1.02134526, "epoch": 0.1378024951149857, "flos": 24290018956800.0, "grad_norm": 1.9185490705451196, "language_loss": 0.72157234, "learning_rate": 3.879326032870952e-06, "loss": 0.74886858, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 2.4921875, "router_z_loss_mlp": 0.33178711, "step": 2292, "time_per_iteration": 2.906963348388672 }, { "auxiliary_loss_clip": 0.01659613, "auxiliary_loss_mlp": 0.01062348, "balance_loss_clip": 1.4115634, "balance_loss_mlp": 1.02782488, "epoch": 0.13786261836765368, "flos": 14028031595520.0, "grad_norm": 2.3395717324450906, "language_loss": 0.81437409, "learning_rate": 3.879192761826071e-06, "loss": 0.84159362, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 2.47851562, "router_z_loss_mlp": 0.34545898, "step": 2293, "time_per_iteration": 2.9694292545318604 }, { "auxiliary_loss_clip": 0.01674629, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.42215657, "balance_loss_mlp": 1.02541888, "epoch": 0.13792274162032167, "flos": 28890959086080.0, "grad_norm": 2.062802444330001, "language_loss": 0.80037081, "learning_rate": 3.879059419522011e-06, "loss": 0.8276825, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 2.52734375, "router_z_loss_mlp": 0.31103516, "step": 2294, "time_per_iteration": 3.0505993366241455 }, { "auxiliary_loss_clip": 0.01639186, "auxiliary_loss_mlp": 0.01054567, "balance_loss_clip": 1.39530706, "balance_loss_mlp": 1.02421641, "epoch": 0.13798286487298964, "flos": 21150761944320.0, "grad_norm": 2.1494785522713826, "language_loss": 0.81638575, "learning_rate": 3.878926005963831e-06, "loss": 0.84332329, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.3034668, "step": 2295, "time_per_iteration": 2.993037223815918 }, { "auxiliary_loss_clip": 0.01643949, "auxiliary_loss_mlp": 0.01058472, "balance_loss_clip": 1.39673519, "balance_loss_mlp": 1.02661967, "epoch": 0.1380429881256576, "flos": 22496491342080.0, "grad_norm": 2.167183569038902, "language_loss": 0.79739577, "learning_rate": 3.878792521156588e-06, "loss": 0.82441998, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.31860352, "step": 2296, "time_per_iteration": 2.994899034500122 }, { "auxiliary_loss_clip": 0.01645051, "auxiliary_loss_mlp": 0.01057628, "balance_loss_clip": 1.39886665, "balance_loss_mlp": 1.02625251, "epoch": 0.13810311137832557, "flos": 21402604356480.0, "grad_norm": 2.2192163318827203, "language_loss": 0.78922355, "learning_rate": 3.8786589651053446e-06, "loss": 0.81625032, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 2.46289062, "router_z_loss_mlp": 0.3137207, "step": 2297, "time_per_iteration": 3.0134103298187256 }, { "auxiliary_loss_clip": 0.01644474, "auxiliary_loss_mlp": 0.0106019, "balance_loss_clip": 1.40007186, "balance_loss_mlp": 1.02731204, "epoch": 0.13816323463099353, "flos": 25999698677760.0, "grad_norm": 2.1538941314988445, "language_loss": 0.69829965, "learning_rate": 3.878525337815164e-06, "loss": 0.72534633, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.32861328, "step": 2298, "time_per_iteration": 2.923275947570801 }, { "auxiliary_loss_clip": 0.01662359, "auxiliary_loss_mlp": 0.01067735, "balance_loss_clip": 1.41073513, "balance_loss_mlp": 1.03464246, "epoch": 0.1382233578836615, "flos": 19253541951360.0, "grad_norm": 1.7355539396019553, "language_loss": 0.88303435, "learning_rate": 3.878391639291116e-06, "loss": 0.9103353, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 2.51757812, "router_z_loss_mlp": 0.33056641, "step": 2299, "time_per_iteration": 2.9149131774902344 }, { "auxiliary_loss_clip": 0.01651722, "auxiliary_loss_mlp": 0.01072056, "balance_loss_clip": 1.40090823, "balance_loss_mlp": 1.0390116, "epoch": 0.1382834811363295, "flos": 25677310343040.0, "grad_norm": 5.734216485344874, "language_loss": 0.77082896, "learning_rate": 3.878257869538267e-06, "loss": 0.79806674, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 2.50976562, "router_z_loss_mlp": 0.33032227, "step": 2300, "time_per_iteration": 2.8860790729522705 }, { "auxiliary_loss_clip": 0.01634787, "auxiliary_loss_mlp": 0.01064214, "balance_loss_clip": 1.38969612, "balance_loss_mlp": 1.0308826, "epoch": 0.13834360438899745, "flos": 19791640085760.0, "grad_norm": 2.6183225809528055, "language_loss": 0.837901, "learning_rate": 3.878124028561692e-06, "loss": 0.86489105, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.33325195, "step": 2301, "time_per_iteration": 2.7982566356658936 }, { "auxiliary_loss_clip": 0.01619347, "auxiliary_loss_mlp": 0.01065431, "balance_loss_clip": 1.3763541, "balance_loss_mlp": 1.0302403, "epoch": 0.13840372764166542, "flos": 26663251939200.0, "grad_norm": 2.334312110046111, "language_loss": 0.86689854, "learning_rate": 3.877990116366466e-06, "loss": 0.89374626, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.35180664, "step": 2302, "time_per_iteration": 2.933627128601074 }, { "auxiliary_loss_clip": 0.01375083, "auxiliary_loss_mlp": 0.01096188, "balance_loss_clip": 1.23007524, "balance_loss_mlp": 1.0637629, "epoch": 0.13846385089433338, "flos": 70544086721280.0, "grad_norm": 0.7694075619459086, "language_loss": 0.65696061, "learning_rate": 3.877856132957667e-06, "loss": 0.68167329, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.32421875, "step": 2303, "time_per_iteration": 3.4920811653137207 }, { "auxiliary_loss_clip": 0.0163514, "auxiliary_loss_mlp": 0.01057899, "balance_loss_clip": 1.39082217, "balance_loss_mlp": 1.0250448, "epoch": 0.13852397414700135, "flos": 17357905526400.0, "grad_norm": 1.8102974722059613, "language_loss": 0.79097563, "learning_rate": 3.877722078340374e-06, "loss": 0.81790602, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.32836914, "step": 2304, "time_per_iteration": 2.86325740814209 }, { "auxiliary_loss_clip": 0.01642673, "auxiliary_loss_mlp": 0.01066193, "balance_loss_clip": 1.39341402, "balance_loss_mlp": 1.03412545, "epoch": 0.13858409739966931, "flos": 21553831036800.0, "grad_norm": 1.7465539057971116, "language_loss": 0.78547144, "learning_rate": 3.877587952519672e-06, "loss": 0.81256008, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 2.49023438, "router_z_loss_mlp": 0.32055664, "step": 2305, "time_per_iteration": 2.905118227005005 }, { "auxiliary_loss_clip": 0.01616813, "auxiliary_loss_mlp": 0.01053882, "balance_loss_clip": 1.37605703, "balance_loss_mlp": 1.02133751, "epoch": 0.13864422065233728, "flos": 21589873159680.0, "grad_norm": 1.8400297015789688, "language_loss": 0.88436997, "learning_rate": 3.877453755500647e-06, "loss": 0.9110769, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.32543945, "step": 2306, "time_per_iteration": 2.9000682830810547 }, { "auxiliary_loss_clip": 0.01372888, "auxiliary_loss_mlp": 0.01042115, "balance_loss_clip": 1.22738898, "balance_loss_mlp": 1.01312292, "epoch": 0.13870434390500527, "flos": 53392525320960.0, "grad_norm": 0.8759171136330309, "language_loss": 0.59178662, "learning_rate": 3.877319487288387e-06, "loss": 0.61593664, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 1.453125, "router_z_loss_mlp": 0.2890625, "step": 2307, "time_per_iteration": 3.4423067569732666 }, { "auxiliary_loss_clip": 0.01638553, "auxiliary_loss_mlp": 0.01055021, "balance_loss_clip": 1.38810611, "balance_loss_mlp": 1.022668, "epoch": 0.13876446715767324, "flos": 22576357693440.0, "grad_norm": 1.719251173169034, "language_loss": 0.81373048, "learning_rate": 3.877185147887984e-06, "loss": 0.84066623, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 2.50585938, "router_z_loss_mlp": 0.32348633, "step": 2308, "time_per_iteration": 2.871844530105591 }, { "auxiliary_loss_clip": 0.01635757, "auxiliary_loss_mlp": 0.01055701, "balance_loss_clip": 1.39072704, "balance_loss_mlp": 1.02201259, "epoch": 0.1388245904103412, "flos": 20715225068160.0, "grad_norm": 3.1599756978129134, "language_loss": 0.79424459, "learning_rate": 3.877050737304533e-06, "loss": 0.82115918, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 2.44921875, "router_z_loss_mlp": 0.33642578, "step": 2309, "time_per_iteration": 2.9168717861175537 }, { "auxiliary_loss_clip": 0.01650808, "auxiliary_loss_mlp": 0.01060165, "balance_loss_clip": 1.39875507, "balance_loss_mlp": 1.02213693, "epoch": 0.13888471366300917, "flos": 20563500695040.0, "grad_norm": 2.541335455070293, "language_loss": 0.69368148, "learning_rate": 3.876916255543129e-06, "loss": 0.72079116, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.38061523, "step": 2310, "time_per_iteration": 2.9068870544433594 }, { "auxiliary_loss_clip": 0.01627544, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.38296056, "balance_loss_mlp": 1.01898658, "epoch": 0.13894483691567713, "flos": 13844201397120.0, "grad_norm": 1.920746616908058, "language_loss": 0.84710497, "learning_rate": 3.8767817026088725e-06, "loss": 0.87389767, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.32739258, "step": 2311, "time_per_iteration": 2.7985341548919678 }, { "auxiliary_loss_clip": 0.01646788, "auxiliary_loss_mlp": 0.0105646, "balance_loss_clip": 1.39500189, "balance_loss_mlp": 1.02281928, "epoch": 0.1390049601683451, "flos": 28041087162240.0, "grad_norm": 2.790235069413957, "language_loss": 0.83126694, "learning_rate": 3.876647078506866e-06, "loss": 0.85829943, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.33642578, "step": 2312, "time_per_iteration": 2.9148976802825928 }, { "auxiliary_loss_clip": 0.01640634, "auxiliary_loss_mlp": 0.01055744, "balance_loss_clip": 1.38843, "balance_loss_mlp": 1.02229369, "epoch": 0.13906508342101306, "flos": 26767125296640.0, "grad_norm": 2.036075385292159, "language_loss": 0.87701476, "learning_rate": 3.876512383242215e-06, "loss": 0.90397859, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 2.51953125, "router_z_loss_mlp": 0.3347168, "step": 2313, "time_per_iteration": 2.88066029548645 }, { "auxiliary_loss_clip": 0.01630722, "auxiliary_loss_mlp": 0.01059798, "balance_loss_clip": 1.38329327, "balance_loss_mlp": 1.02703941, "epoch": 0.13912520667368106, "flos": 24545797666560.0, "grad_norm": 3.530057320834044, "language_loss": 0.81492472, "learning_rate": 3.876377616820024e-06, "loss": 0.84182996, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 2.4765625, "router_z_loss_mlp": 0.32763672, "step": 2314, "time_per_iteration": 2.9306485652923584 }, { "auxiliary_loss_clip": 0.01632271, "auxiliary_loss_mlp": 0.01052571, "balance_loss_clip": 1.38508272, "balance_loss_mlp": 1.01990771, "epoch": 0.13918532992634902, "flos": 19390652254080.0, "grad_norm": 3.2142751539982592, "language_loss": 0.86546069, "learning_rate": 3.876242779245409e-06, "loss": 0.89230907, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.32666016, "step": 2315, "time_per_iteration": 2.8000879287719727 }, { "auxiliary_loss_clip": 0.0161785, "auxiliary_loss_mlp": 0.0105618, "balance_loss_clip": 1.37289786, "balance_loss_mlp": 1.02289653, "epoch": 0.139245453179017, "flos": 21333461022720.0, "grad_norm": 9.465105356672096, "language_loss": 0.79370141, "learning_rate": 3.876107870523477e-06, "loss": 0.82044172, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 2.44726562, "router_z_loss_mlp": 0.33276367, "step": 2316, "time_per_iteration": 4.340923309326172 }, { "auxiliary_loss_clip": 0.01638209, "auxiliary_loss_mlp": 0.01066801, "balance_loss_clip": 1.39142644, "balance_loss_mlp": 1.03239703, "epoch": 0.13930557643168495, "flos": 19510135067520.0, "grad_norm": 1.7566421288694996, "language_loss": 0.78027683, "learning_rate": 3.875972890659349e-06, "loss": 0.80732691, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 2.46875, "router_z_loss_mlp": 0.34399414, "step": 2317, "time_per_iteration": 2.9390931129455566 }, { "auxiliary_loss_clip": 0.01627939, "auxiliary_loss_mlp": 0.01053554, "balance_loss_clip": 1.37960374, "balance_loss_mlp": 1.02100945, "epoch": 0.13936569968435292, "flos": 25421757857280.0, "grad_norm": 1.82452261010462, "language_loss": 0.8217653, "learning_rate": 3.875837839658139e-06, "loss": 0.8485803, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 2.48632812, "router_z_loss_mlp": 0.32519531, "step": 2318, "time_per_iteration": 2.9527127742767334 }, { "auxiliary_loss_clip": 0.01352149, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.21307516, "balance_loss_mlp": 1.02815557, "epoch": 0.13942582293702088, "flos": 70805430541440.0, "grad_norm": 0.8594569330444389, "language_loss": 0.59070164, "learning_rate": 3.87570271752497e-06, "loss": 0.6147269, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.22265625, "step": 2319, "time_per_iteration": 3.4224491119384766 }, { "auxiliary_loss_clip": 0.01642888, "auxiliary_loss_mlp": 0.01059322, "balance_loss_clip": 1.39111853, "balance_loss_mlp": 1.02572858, "epoch": 0.13948594618968888, "flos": 35604602784000.0, "grad_norm": 2.1918677684076338, "language_loss": 0.66975778, "learning_rate": 3.875567524264967e-06, "loss": 0.69677985, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 2.515625, "router_z_loss_mlp": 0.3359375, "step": 2320, "time_per_iteration": 3.0264053344726562 }, { "auxiliary_loss_clip": 0.01623993, "auxiliary_loss_mlp": 0.01056995, "balance_loss_clip": 1.38063312, "balance_loss_mlp": 1.02392662, "epoch": 0.13954606944235684, "flos": 21114946045440.0, "grad_norm": 1.8762078423529711, "language_loss": 0.71743512, "learning_rate": 3.875432259883256e-06, "loss": 0.74424505, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 2.43554688, "router_z_loss_mlp": 0.33081055, "step": 2321, "time_per_iteration": 2.8691439628601074 }, { "auxiliary_loss_clip": 0.0165045, "auxiliary_loss_mlp": 0.01068449, "balance_loss_clip": 1.40010047, "balance_loss_mlp": 1.0340457, "epoch": 0.1396061926950248, "flos": 25054866622080.0, "grad_norm": 1.7783384032489562, "language_loss": 0.87280405, "learning_rate": 3.875296924384965e-06, "loss": 0.89999306, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.34423828, "step": 2322, "time_per_iteration": 2.893277168273926 }, { "auxiliary_loss_clip": 0.0159457, "auxiliary_loss_mlp": 0.01064542, "balance_loss_clip": 1.3581804, "balance_loss_mlp": 1.03237975, "epoch": 0.13966631594769277, "flos": 37648253508480.0, "grad_norm": 1.7267943961018501, "language_loss": 0.68017215, "learning_rate": 3.875161517775226e-06, "loss": 0.70676327, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.32128906, "step": 2323, "time_per_iteration": 4.3679749965667725 }, { "auxiliary_loss_clip": 0.01664538, "auxiliary_loss_mlp": 0.01065105, "balance_loss_clip": 1.40908074, "balance_loss_mlp": 1.0327282, "epoch": 0.13972643920036074, "flos": 16699736396160.0, "grad_norm": 1.927920641441456, "language_loss": 0.90066803, "learning_rate": 3.875026040059175e-06, "loss": 0.92796439, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 2.5546875, "router_z_loss_mlp": 0.32373047, "step": 2324, "time_per_iteration": 2.8434062004089355 }, { "auxiliary_loss_clip": 0.01632375, "auxiliary_loss_mlp": 0.01071455, "balance_loss_clip": 1.38592601, "balance_loss_mlp": 1.03845787, "epoch": 0.1397865624530287, "flos": 23341431582720.0, "grad_norm": 2.1257148628097555, "language_loss": 0.73364228, "learning_rate": 3.8748904912419485e-06, "loss": 0.76068056, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 2.46484375, "router_z_loss_mlp": 0.32958984, "step": 2325, "time_per_iteration": 5.6795454025268555 }, { "auxiliary_loss_clip": 0.01649918, "auxiliary_loss_mlp": 0.01075518, "balance_loss_clip": 1.40032721, "balance_loss_mlp": 1.04366517, "epoch": 0.13984668570569667, "flos": 22787135809920.0, "grad_norm": 3.7035930695530275, "language_loss": 0.83274931, "learning_rate": 3.874754871328688e-06, "loss": 0.86000371, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 2.49804688, "router_z_loss_mlp": 0.31835938, "step": 2326, "time_per_iteration": 2.8711376190185547 }, { "auxiliary_loss_clip": 0.01614925, "auxiliary_loss_mlp": 0.01075556, "balance_loss_clip": 1.37575102, "balance_loss_mlp": 1.0432744, "epoch": 0.13990680895836466, "flos": 19474183434240.0, "grad_norm": 1.7635959704967323, "language_loss": 0.90110981, "learning_rate": 3.874619180324534e-06, "loss": 0.92801458, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.32275391, "step": 2327, "time_per_iteration": 3.0145423412323 }, { "auxiliary_loss_clip": 0.01628261, "auxiliary_loss_mlp": 0.01073342, "balance_loss_clip": 1.38491499, "balance_loss_mlp": 1.04280066, "epoch": 0.13996693221103262, "flos": 20313015626880.0, "grad_norm": 3.4510086391731516, "language_loss": 0.86017036, "learning_rate": 3.874483418234632e-06, "loss": 0.88718641, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 2.43359375, "router_z_loss_mlp": 0.30541992, "step": 2328, "time_per_iteration": 2.8392908573150635 }, { "auxiliary_loss_clip": 0.01633239, "auxiliary_loss_mlp": 0.01075743, "balance_loss_clip": 1.38645601, "balance_loss_mlp": 1.04224467, "epoch": 0.1400270554637006, "flos": 26629698280320.0, "grad_norm": 1.6475550783505535, "language_loss": 0.75199139, "learning_rate": 3.874347585064131e-06, "loss": 0.77908123, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 2.47070312, "router_z_loss_mlp": 0.33496094, "step": 2329, "time_per_iteration": 2.9233975410461426 }, { "auxiliary_loss_clip": 0.01652117, "auxiliary_loss_mlp": 0.01078985, "balance_loss_clip": 1.40495968, "balance_loss_mlp": 1.04779935, "epoch": 0.14008717871636855, "flos": 19401058558080.0, "grad_norm": 2.3462378288022814, "language_loss": 0.7931031, "learning_rate": 3.874211680818183e-06, "loss": 0.82041419, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.31152344, "step": 2330, "time_per_iteration": 2.835026264190674 }, { "auxiliary_loss_clip": 0.01632307, "auxiliary_loss_mlp": 0.01067094, "balance_loss_clip": 1.38814712, "balance_loss_mlp": 1.03583753, "epoch": 0.14014730196903652, "flos": 15312580744320.0, "grad_norm": 2.3779340487929557, "language_loss": 0.73317617, "learning_rate": 3.87407570550194e-06, "loss": 0.76017016, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.3125, "step": 2331, "time_per_iteration": 2.884646415710449 }, { "auxiliary_loss_clip": 0.01612648, "auxiliary_loss_mlp": 0.01070681, "balance_loss_clip": 1.3779223, "balance_loss_mlp": 1.03858995, "epoch": 0.14020742522170448, "flos": 14947996993920.0, "grad_norm": 1.7071936378706467, "language_loss": 0.73611492, "learning_rate": 3.873939659120557e-06, "loss": 0.76294822, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.32055664, "step": 2332, "time_per_iteration": 2.852910041809082 }, { "auxiliary_loss_clip": 0.013816, "auxiliary_loss_mlp": 0.01055754, "balance_loss_clip": 1.23955917, "balance_loss_mlp": 1.03381908, "epoch": 0.14026754847437245, "flos": 48848439922560.0, "grad_norm": 0.8318201338907145, "language_loss": 0.56159484, "learning_rate": 3.873803541679196e-06, "loss": 0.58596838, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 1.421875, "router_z_loss_mlp": 0.21972656, "step": 2333, "time_per_iteration": 3.272254228591919 }, { "auxiliary_loss_clip": 0.01616496, "auxiliary_loss_mlp": 0.01066817, "balance_loss_clip": 1.37790334, "balance_loss_mlp": 1.03577447, "epoch": 0.14032767172704044, "flos": 25783672164480.0, "grad_norm": 6.766443035333321, "language_loss": 0.83434403, "learning_rate": 3.873667353183016e-06, "loss": 0.86117721, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.30981445, "step": 2334, "time_per_iteration": 2.9175710678100586 }, { "auxiliary_loss_clip": 0.01612441, "auxiliary_loss_mlp": 0.01065029, "balance_loss_clip": 1.36926818, "balance_loss_mlp": 1.03238964, "epoch": 0.1403877949797084, "flos": 21226511018880.0, "grad_norm": 1.6988844538218988, "language_loss": 0.81767982, "learning_rate": 3.8735310936371825e-06, "loss": 0.84445453, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.32617188, "step": 2335, "time_per_iteration": 2.881754159927368 }, { "auxiliary_loss_clip": 0.01635649, "auxiliary_loss_mlp": 0.01058145, "balance_loss_clip": 1.38581192, "balance_loss_mlp": 1.02402771, "epoch": 0.14044791823237637, "flos": 22758378099840.0, "grad_norm": 1.9862901466903873, "language_loss": 0.82837868, "learning_rate": 3.873394763046862e-06, "loss": 0.85531664, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.34106445, "step": 2336, "time_per_iteration": 2.901790142059326 }, { "auxiliary_loss_clip": 0.01607115, "auxiliary_loss_mlp": 0.0105752, "balance_loss_clip": 1.3682971, "balance_loss_mlp": 1.02709782, "epoch": 0.14050804148504434, "flos": 22974268878720.0, "grad_norm": 1.6517150943130394, "language_loss": 0.81906366, "learning_rate": 3.873258361417225e-06, "loss": 0.84571004, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.30371094, "step": 2337, "time_per_iteration": 2.9112491607666016 }, { "auxiliary_loss_clip": 0.01624454, "auxiliary_loss_mlp": 0.01063361, "balance_loss_clip": 1.38274181, "balance_loss_mlp": 1.03241444, "epoch": 0.1405681647377123, "flos": 22210507088640.0, "grad_norm": 1.8362264257672276, "language_loss": 0.79956633, "learning_rate": 3.873121888753442e-06, "loss": 0.82644451, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.30957031, "step": 2338, "time_per_iteration": 2.862473487854004 }, { "auxiliary_loss_clip": 0.01640933, "auxiliary_loss_mlp": 0.01058551, "balance_loss_clip": 1.39533687, "balance_loss_mlp": 1.02522027, "epoch": 0.14062828799038027, "flos": 23743369555200.0, "grad_norm": 3.9264249806134552, "language_loss": 0.82020855, "learning_rate": 3.87298534506069e-06, "loss": 0.84720337, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 2.45703125, "router_z_loss_mlp": 0.33325195, "step": 2339, "time_per_iteration": 2.879000425338745 }, { "auxiliary_loss_clip": 0.01617225, "auxiliary_loss_mlp": 0.01057463, "balance_loss_clip": 1.37799239, "balance_loss_mlp": 1.02651584, "epoch": 0.14068841124304826, "flos": 39217022363520.0, "grad_norm": 1.6947342328099662, "language_loss": 0.66542888, "learning_rate": 3.872848730344146e-06, "loss": 0.69217575, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.30932617, "step": 2340, "time_per_iteration": 2.9939119815826416 }, { "auxiliary_loss_clip": 0.01605963, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.37140179, "balance_loss_mlp": 1.02441144, "epoch": 0.14074853449571623, "flos": 20201586387840.0, "grad_norm": 2.4732193786028627, "language_loss": 0.80035716, "learning_rate": 3.87271204460899e-06, "loss": 0.82696533, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.3046875, "step": 2341, "time_per_iteration": 2.8511369228363037 }, { "auxiliary_loss_clip": 0.01611395, "auxiliary_loss_mlp": 0.01058356, "balance_loss_clip": 1.37125754, "balance_loss_mlp": 1.02688491, "epoch": 0.1408086577483842, "flos": 18415071717120.0, "grad_norm": 2.006100483965079, "language_loss": 0.8335678, "learning_rate": 3.8725752878604066e-06, "loss": 0.86026525, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.31445312, "step": 2342, "time_per_iteration": 2.8208444118499756 }, { "auxiliary_loss_clip": 0.01609853, "auxiliary_loss_mlp": 0.01062728, "balance_loss_clip": 1.37654722, "balance_loss_mlp": 1.03094697, "epoch": 0.14086878100105216, "flos": 25275417615360.0, "grad_norm": 1.8388903944664412, "language_loss": 0.79156315, "learning_rate": 3.87243846010358e-06, "loss": 0.81828898, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.31762695, "step": 2343, "time_per_iteration": 2.875600576400757 }, { "auxiliary_loss_clip": 0.01406192, "auxiliary_loss_mlp": 0.01075144, "balance_loss_clip": 1.26912808, "balance_loss_mlp": 1.05435371, "epoch": 0.14092890425372012, "flos": 66008127404160.0, "grad_norm": 0.8398256394052303, "language_loss": 0.61598265, "learning_rate": 3.872301561343699e-06, "loss": 0.64079601, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.20800781, "step": 2344, "time_per_iteration": 3.286590814590454 }, { "auxiliary_loss_clip": 0.01599927, "auxiliary_loss_mlp": 0.01054911, "balance_loss_clip": 1.36303604, "balance_loss_mlp": 1.02539515, "epoch": 0.1409890275063881, "flos": 23705200926720.0, "grad_norm": 1.6145457035707937, "language_loss": 0.66377115, "learning_rate": 3.872164591585956e-06, "loss": 0.69031948, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.29492188, "step": 2345, "time_per_iteration": 2.8507354259490967 }, { "auxiliary_loss_clip": 0.01629398, "auxiliary_loss_mlp": 0.01068856, "balance_loss_clip": 1.380422, "balance_loss_mlp": 1.03645444, "epoch": 0.14104915075905605, "flos": 23633659618560.0, "grad_norm": 5.623152688799827, "language_loss": 0.75418937, "learning_rate": 3.8720275508355435e-06, "loss": 0.78117192, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 2.49414062, "router_z_loss_mlp": 0.32373047, "step": 2346, "time_per_iteration": 2.9753735065460205 }, { "auxiliary_loss_clip": 0.01617351, "auxiliary_loss_mlp": 0.01061118, "balance_loss_clip": 1.37582469, "balance_loss_mlp": 1.03012383, "epoch": 0.14110927401172405, "flos": 20604972193920.0, "grad_norm": 1.9414593574026042, "language_loss": 0.78218257, "learning_rate": 3.8718904390976585e-06, "loss": 0.80896729, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.30981445, "step": 2347, "time_per_iteration": 2.8268840312957764 }, { "auxiliary_loss_clip": 0.01617053, "auxiliary_loss_mlp": 0.01064168, "balance_loss_clip": 1.37634468, "balance_loss_mlp": 1.03434181, "epoch": 0.141169397264392, "flos": 28559657525760.0, "grad_norm": 2.2796688253939856, "language_loss": 0.77822477, "learning_rate": 3.8717532563775e-06, "loss": 0.80503696, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.2980957, "step": 2348, "time_per_iteration": 3.0220208168029785 }, { "auxiliary_loss_clip": 0.01609035, "auxiliary_loss_mlp": 0.01062174, "balance_loss_clip": 1.36837077, "balance_loss_mlp": 1.03167999, "epoch": 0.14122952051705998, "flos": 17101086186240.0, "grad_norm": 1.7163639182311072, "language_loss": 0.87789166, "learning_rate": 3.871616002680272e-06, "loss": 0.90460378, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.3046875, "step": 2349, "time_per_iteration": 2.894247055053711 }, { "auxiliary_loss_clip": 0.01610279, "auxiliary_loss_mlp": 0.01065742, "balance_loss_clip": 1.37187839, "balance_loss_mlp": 1.03248239, "epoch": 0.14128964376972794, "flos": 28957840179840.0, "grad_norm": 1.749751954119047, "language_loss": 0.89668572, "learning_rate": 3.871478678011177e-06, "loss": 0.92344594, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.33251953, "step": 2350, "time_per_iteration": 2.912184476852417 }, { "auxiliary_loss_clip": 0.01614644, "auxiliary_loss_mlp": 0.01054722, "balance_loss_clip": 1.37495542, "balance_loss_mlp": 1.02191591, "epoch": 0.1413497670223959, "flos": 18999165830400.0, "grad_norm": 1.6404993629008684, "language_loss": 0.82245195, "learning_rate": 3.871341282375423e-06, "loss": 0.84914565, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.328125, "step": 2351, "time_per_iteration": 4.283477306365967 }, { "auxiliary_loss_clip": 0.0162686, "auxiliary_loss_mlp": 0.01065809, "balance_loss_clip": 1.38428259, "balance_loss_mlp": 1.03405178, "epoch": 0.14140989027506387, "flos": 29874366973440.0, "grad_norm": 2.395217832178772, "language_loss": 0.85020399, "learning_rate": 3.871203815778219e-06, "loss": 0.87713069, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.31738281, "step": 2352, "time_per_iteration": 2.9445481300354004 }, { "auxiliary_loss_clip": 0.01392869, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.25452483, "balance_loss_mlp": 1.00222671, "epoch": 0.14147001352773186, "flos": 62109587836800.0, "grad_norm": 0.9085418427079294, "language_loss": 0.61959434, "learning_rate": 3.87106627822478e-06, "loss": 0.64380473, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.25976562, "step": 2353, "time_per_iteration": 3.305818796157837 }, { "auxiliary_loss_clip": 0.01594299, "auxiliary_loss_mlp": 0.01053789, "balance_loss_clip": 1.35818315, "balance_loss_mlp": 1.02329564, "epoch": 0.14153013678039983, "flos": 22027536541440.0, "grad_norm": 1.8057413114390988, "language_loss": 0.88154662, "learning_rate": 3.8709286697203196e-06, "loss": 0.90802753, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.3046875, "step": 2354, "time_per_iteration": 2.889789581298828 }, { "auxiliary_loss_clip": 0.01604279, "auxiliary_loss_mlp": 0.01067295, "balance_loss_clip": 1.36557984, "balance_loss_mlp": 1.03494215, "epoch": 0.1415902600330678, "flos": 19729826409600.0, "grad_norm": 1.9319551634847458, "language_loss": 0.75634724, "learning_rate": 3.870790990270057e-06, "loss": 0.78306299, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.32324219, "step": 2355, "time_per_iteration": 2.846365451812744 }, { "auxiliary_loss_clip": 0.01379029, "auxiliary_loss_mlp": 0.01050853, "balance_loss_clip": 1.24227977, "balance_loss_mlp": 1.03092134, "epoch": 0.14165038328573576, "flos": 65929482662400.0, "grad_norm": 0.691843487536347, "language_loss": 0.51918292, "learning_rate": 3.870653239879212e-06, "loss": 0.54348171, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 1.3671875, "router_z_loss_mlp": 0.19921875, "step": 2356, "time_per_iteration": 3.268338918685913 }, { "auxiliary_loss_clip": 0.01600487, "auxiliary_loss_mlp": 0.01067468, "balance_loss_clip": 1.36443973, "balance_loss_mlp": 1.03687882, "epoch": 0.14171050653840372, "flos": 12137688812160.0, "grad_norm": 2.92411820521841, "language_loss": 0.7202217, "learning_rate": 3.8705154185530095e-06, "loss": 0.74690127, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.30566406, "step": 2357, "time_per_iteration": 4.253419876098633 }, { "auxiliary_loss_clip": 0.01610081, "auxiliary_loss_mlp": 0.01076981, "balance_loss_clip": 1.3675406, "balance_loss_mlp": 1.04560494, "epoch": 0.1417706297910717, "flos": 20422182625920.0, "grad_norm": 2.022395953579635, "language_loss": 0.83696049, "learning_rate": 3.870377526296674e-06, "loss": 0.8638311, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31323242, "step": 2358, "time_per_iteration": 2.841686248779297 }, { "auxiliary_loss_clip": 0.01623141, "auxiliary_loss_mlp": 0.01075058, "balance_loss_clip": 1.37743437, "balance_loss_mlp": 1.04229939, "epoch": 0.14183075304373965, "flos": 22390265255040.0, "grad_norm": 2.3433211237524185, "language_loss": 0.73283213, "learning_rate": 3.870239563115436e-06, "loss": 0.75981414, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 2.45507812, "router_z_loss_mlp": 0.32739258, "step": 2359, "time_per_iteration": 2.8791491985321045 }, { "auxiliary_loss_clip": 0.01620303, "auxiliary_loss_mlp": 0.01085294, "balance_loss_clip": 1.37884593, "balance_loss_mlp": 1.05491948, "epoch": 0.14189087629640765, "flos": 21591140014080.0, "grad_norm": 2.9742112684483266, "language_loss": 0.77477169, "learning_rate": 3.870101529014526e-06, "loss": 0.80182767, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.30371094, "step": 2360, "time_per_iteration": 4.3731913566589355 }, { "auxiliary_loss_clip": 0.01604834, "auxiliary_loss_mlp": 0.01090343, "balance_loss_clip": 1.37089419, "balance_loss_mlp": 1.05872917, "epoch": 0.1419509995490756, "flos": 20017484720640.0, "grad_norm": 2.252993672458751, "language_loss": 0.822909, "learning_rate": 3.869963423999178e-06, "loss": 0.84986079, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.31616211, "step": 2361, "time_per_iteration": 2.8596935272216797 }, { "auxiliary_loss_clip": 0.01592287, "auxiliary_loss_mlp": 0.010972, "balance_loss_clip": 1.35875511, "balance_loss_mlp": 1.06425071, "epoch": 0.14201112280174358, "flos": 31954014576000.0, "grad_norm": 4.316711822545349, "language_loss": 0.75756466, "learning_rate": 3.86982524807463e-06, "loss": 0.78445947, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.32910156, "step": 2362, "time_per_iteration": 3.011115550994873 }, { "auxiliary_loss_clip": 0.01600215, "auxiliary_loss_mlp": 0.01093529, "balance_loss_clip": 1.36895275, "balance_loss_mlp": 1.06110382, "epoch": 0.14207124605441154, "flos": 41479142820480.0, "grad_norm": 2.1826705631937333, "language_loss": 0.75180179, "learning_rate": 3.869687001246122e-06, "loss": 0.77873921, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.32421875, "step": 2363, "time_per_iteration": 3.109760046005249 }, { "auxiliary_loss_clip": 0.01601792, "auxiliary_loss_mlp": 0.01096876, "balance_loss_clip": 1.36596751, "balance_loss_mlp": 1.06418872, "epoch": 0.1421313693070795, "flos": 31917655739520.0, "grad_norm": 2.2864797542817756, "language_loss": 0.73866224, "learning_rate": 3.8695486835188946e-06, "loss": 0.76564896, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.32666016, "step": 2364, "time_per_iteration": 2.9079737663269043 }, { "auxiliary_loss_clip": 0.01586114, "auxiliary_loss_mlp": 0.01093332, "balance_loss_clip": 1.35753059, "balance_loss_mlp": 1.06386375, "epoch": 0.14219149255974747, "flos": 26882717057280.0, "grad_norm": 2.387982881766686, "language_loss": 0.91494268, "learning_rate": 3.869410294898195e-06, "loss": 0.94173712, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29467773, "step": 2365, "time_per_iteration": 2.870861053466797 }, { "auxiliary_loss_clip": 0.01600975, "auxiliary_loss_mlp": 0.0107681, "balance_loss_clip": 1.36483407, "balance_loss_mlp": 1.04572022, "epoch": 0.14225161581241544, "flos": 27465996764160.0, "grad_norm": 1.6627374264620778, "language_loss": 0.66033643, "learning_rate": 3.869271835389268e-06, "loss": 0.6871143, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.31079102, "step": 2366, "time_per_iteration": 2.870525360107422 }, { "auxiliary_loss_clip": 0.01596831, "auxiliary_loss_mlp": 0.01081826, "balance_loss_clip": 1.36514783, "balance_loss_mlp": 1.05023527, "epoch": 0.14231173906508343, "flos": 10568603243520.0, "grad_norm": 2.0408473866747947, "language_loss": 0.81706887, "learning_rate": 3.8691333049973665e-06, "loss": 0.84385538, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.31542969, "step": 2367, "time_per_iteration": 2.840629816055298 }, { "auxiliary_loss_clip": 0.01620593, "auxiliary_loss_mlp": 0.01082158, "balance_loss_clip": 1.38275743, "balance_loss_mlp": 1.04861271, "epoch": 0.1423718623177514, "flos": 28371212357760.0, "grad_norm": 13.71021952936621, "language_loss": 0.84252203, "learning_rate": 3.868994703727742e-06, "loss": 0.86954951, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.33544922, "step": 2368, "time_per_iteration": 2.9260501861572266 }, { "auxiliary_loss_clip": 0.01612974, "auxiliary_loss_mlp": 0.01077798, "balance_loss_clip": 1.37518716, "balance_loss_mlp": 1.04775691, "epoch": 0.14243198557041936, "flos": 19363613846400.0, "grad_norm": 2.4487238844388406, "language_loss": 0.87895322, "learning_rate": 3.868856031585652e-06, "loss": 0.9058609, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.30053711, "step": 2369, "time_per_iteration": 2.8076565265655518 }, { "auxiliary_loss_clip": 0.01619131, "auxiliary_loss_mlp": 0.01069732, "balance_loss_clip": 1.37982154, "balance_loss_mlp": 1.03873801, "epoch": 0.14249210882308733, "flos": 28819417777920.0, "grad_norm": 1.4827280377032062, "language_loss": 0.7668035, "learning_rate": 3.868717288576354e-06, "loss": 0.79369217, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.30981445, "step": 2370, "time_per_iteration": 2.9657251834869385 }, { "auxiliary_loss_clip": 0.01594248, "auxiliary_loss_mlp": 0.01067187, "balance_loss_clip": 1.36166203, "balance_loss_mlp": 1.03616846, "epoch": 0.1425522320757553, "flos": 21844973197440.0, "grad_norm": 1.5559044979928462, "language_loss": 0.83770508, "learning_rate": 3.868578474705109e-06, "loss": 0.86431944, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.31005859, "step": 2371, "time_per_iteration": 2.863511323928833 }, { "auxiliary_loss_clip": 0.01607922, "auxiliary_loss_mlp": 0.01058515, "balance_loss_clip": 1.37283063, "balance_loss_mlp": 1.02728176, "epoch": 0.14261235532842326, "flos": 17320415569920.0, "grad_norm": 1.8955851715887884, "language_loss": 0.83669901, "learning_rate": 3.868439589977181e-06, "loss": 0.86336344, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.31201172, "step": 2372, "time_per_iteration": 2.8337435722351074 }, { "auxiliary_loss_clip": 0.01597041, "auxiliary_loss_mlp": 0.01063265, "balance_loss_clip": 1.362795, "balance_loss_mlp": 1.03141248, "epoch": 0.14267247858109125, "flos": 18815923814400.0, "grad_norm": 2.112139036569057, "language_loss": 0.86047512, "learning_rate": 3.868300634397836e-06, "loss": 0.88707817, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.31835938, "step": 2373, "time_per_iteration": 2.8205771446228027 }, { "auxiliary_loss_clip": 0.01597377, "auxiliary_loss_mlp": 0.01053792, "balance_loss_clip": 1.36404777, "balance_loss_mlp": 1.02458572, "epoch": 0.14273260183375922, "flos": 11364923306880.0, "grad_norm": 2.749592346777685, "language_loss": 0.86694163, "learning_rate": 3.8681616079723445e-06, "loss": 0.89345336, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.29223633, "step": 2374, "time_per_iteration": 2.808540105819702 }, { "auxiliary_loss_clip": 0.01612694, "auxiliary_loss_mlp": 0.01056301, "balance_loss_clip": 1.37529993, "balance_loss_mlp": 1.02554488, "epoch": 0.14279272508642718, "flos": 27580276425600.0, "grad_norm": 2.038114009381148, "language_loss": 0.79840702, "learning_rate": 3.868022510705977e-06, "loss": 0.82509696, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.30737305, "step": 2375, "time_per_iteration": 2.9454026222229004 }, { "auxiliary_loss_clip": 0.01596752, "auxiliary_loss_mlp": 0.0106175, "balance_loss_clip": 1.36423099, "balance_loss_mlp": 1.03120875, "epoch": 0.14285284833909515, "flos": 16261032384000.0, "grad_norm": 2.372152200894251, "language_loss": 0.78125644, "learning_rate": 3.867883342604009e-06, "loss": 0.80784148, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.30541992, "step": 2376, "time_per_iteration": 2.8123724460601807 }, { "auxiliary_loss_clip": 0.01608547, "auxiliary_loss_mlp": 0.0105849, "balance_loss_clip": 1.37433434, "balance_loss_mlp": 1.02773452, "epoch": 0.1429129715917631, "flos": 19765099370880.0, "grad_norm": 1.672735133373138, "language_loss": 0.94192588, "learning_rate": 3.867744103671717e-06, "loss": 0.96859622, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.30737305, "step": 2377, "time_per_iteration": 2.8993492126464844 }, { "auxiliary_loss_clip": 0.01593244, "auxiliary_loss_mlp": 0.01056885, "balance_loss_clip": 1.35885954, "balance_loss_mlp": 1.02512801, "epoch": 0.14297309484443108, "flos": 21145649281920.0, "grad_norm": 1.9915095951989057, "language_loss": 0.92629075, "learning_rate": 3.867604793914382e-06, "loss": 0.95279205, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.31738281, "step": 2378, "time_per_iteration": 2.8385934829711914 }, { "auxiliary_loss_clip": 0.01611593, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 1.37269735, "balance_loss_mlp": 1.01914608, "epoch": 0.14303321809709904, "flos": 23597165047680.0, "grad_norm": 1.6268927608668478, "language_loss": 0.7502929, "learning_rate": 3.8674654133372864e-06, "loss": 0.77691448, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.31420898, "step": 2379, "time_per_iteration": 2.8775177001953125 }, { "auxiliary_loss_clip": 0.01608942, "auxiliary_loss_mlp": 0.01059672, "balance_loss_clip": 1.37264907, "balance_loss_mlp": 1.02834356, "epoch": 0.14309334134976703, "flos": 15896810592000.0, "grad_norm": 2.1356605410274536, "language_loss": 0.80074501, "learning_rate": 3.867325961945714e-06, "loss": 0.82743108, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.31323242, "step": 2380, "time_per_iteration": 2.8050918579101562 }, { "auxiliary_loss_clip": 0.01614913, "auxiliary_loss_mlp": 0.0105147, "balance_loss_clip": 1.37609935, "balance_loss_mlp": 1.01904488, "epoch": 0.143153464602435, "flos": 16334112015360.0, "grad_norm": 6.7112238299017735, "language_loss": 0.89772177, "learning_rate": 3.867186439744955e-06, "loss": 0.92438561, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.32421875, "step": 2381, "time_per_iteration": 2.837433338165283 }, { "auxiliary_loss_clip": 0.01599519, "auxiliary_loss_mlp": 0.01058189, "balance_loss_clip": 1.36684406, "balance_loss_mlp": 1.02626514, "epoch": 0.14321358785510296, "flos": 17100859962240.0, "grad_norm": 3.2887071472440463, "language_loss": 0.78557754, "learning_rate": 3.867046846740299e-06, "loss": 0.81215465, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.3190918, "step": 2382, "time_per_iteration": 2.831972599029541 }, { "auxiliary_loss_clip": 0.01608142, "auxiliary_loss_mlp": 0.01056688, "balance_loss_clip": 1.37095809, "balance_loss_mlp": 1.02669466, "epoch": 0.14327371110777093, "flos": 26334076884480.0, "grad_norm": 6.409145745830681, "language_loss": 0.78877008, "learning_rate": 3.866907182937039e-06, "loss": 0.81541836, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.29980469, "step": 2383, "time_per_iteration": 2.8583290576934814 }, { "auxiliary_loss_clip": 0.01602983, "auxiliary_loss_mlp": 0.01058576, "balance_loss_clip": 1.36641049, "balance_loss_mlp": 1.02712882, "epoch": 0.1433338343604389, "flos": 18085308480000.0, "grad_norm": 2.215159205351764, "language_loss": 0.890674, "learning_rate": 3.866767448340471e-06, "loss": 0.91728956, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.31469727, "step": 2384, "time_per_iteration": 2.8275039196014404 }, { "auxiliary_loss_clip": 0.01613571, "auxiliary_loss_mlp": 0.01072719, "balance_loss_clip": 1.37281752, "balance_loss_mlp": 1.03996038, "epoch": 0.14339395761310686, "flos": 15531050476800.0, "grad_norm": 2.3934938586838603, "language_loss": 0.81163818, "learning_rate": 3.866627642955895e-06, "loss": 0.8385011, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.32739258, "step": 2385, "time_per_iteration": 2.8215246200561523 }, { "auxiliary_loss_clip": 0.01594399, "auxiliary_loss_mlp": 0.01063634, "balance_loss_clip": 1.35633123, "balance_loss_mlp": 1.02999365, "epoch": 0.14345408086577485, "flos": 28560562421760.0, "grad_norm": 2.2438886305930166, "language_loss": 0.76533985, "learning_rate": 3.866487766788612e-06, "loss": 0.79192019, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.33642578, "step": 2386, "time_per_iteration": 4.308884620666504 }, { "auxiliary_loss_clip": 0.01602323, "auxiliary_loss_mlp": 0.01059319, "balance_loss_clip": 1.36807406, "balance_loss_mlp": 1.02858698, "epoch": 0.14351420411844282, "flos": 20239709771520.0, "grad_norm": 2.020131010993685, "language_loss": 0.79871094, "learning_rate": 3.866347819843925e-06, "loss": 0.82532734, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.30712891, "step": 2387, "time_per_iteration": 2.8708927631378174 }, { "auxiliary_loss_clip": 0.01607053, "auxiliary_loss_mlp": 0.0106969, "balance_loss_clip": 1.37247372, "balance_loss_mlp": 1.03781295, "epoch": 0.14357432737111078, "flos": 19874537838720.0, "grad_norm": 1.9970171609362568, "language_loss": 0.83954918, "learning_rate": 3.866207802127143e-06, "loss": 0.86631662, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.31860352, "step": 2388, "time_per_iteration": 2.9311416149139404 }, { "auxiliary_loss_clip": 0.01610535, "auxiliary_loss_mlp": 0.0106912, "balance_loss_clip": 1.37482572, "balance_loss_mlp": 1.03865027, "epoch": 0.14363445062377875, "flos": 28268515365120.0, "grad_norm": 2.146347198167039, "language_loss": 0.84124887, "learning_rate": 3.866067713643573e-06, "loss": 0.86804545, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.3046875, "step": 2389, "time_per_iteration": 2.9113101959228516 }, { "auxiliary_loss_clip": 0.01632146, "auxiliary_loss_mlp": 0.01068372, "balance_loss_clip": 1.38858056, "balance_loss_mlp": 1.0363282, "epoch": 0.1436945738764467, "flos": 18195742333440.0, "grad_norm": 1.9013253007090372, "language_loss": 0.84623563, "learning_rate": 3.8659275543985285e-06, "loss": 0.87324083, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.3203125, "step": 2390, "time_per_iteration": 2.827235698699951 }, { "auxiliary_loss_clip": 0.01612364, "auxiliary_loss_mlp": 0.01071307, "balance_loss_clip": 1.37565136, "balance_loss_mlp": 1.04000306, "epoch": 0.14375469712911468, "flos": 27319294563840.0, "grad_norm": 1.5995499239569122, "language_loss": 0.75786912, "learning_rate": 3.865787324397324e-06, "loss": 0.78470588, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.31298828, "step": 2391, "time_per_iteration": 2.95215106010437 }, { "auxiliary_loss_clip": 0.01394926, "auxiliary_loss_mlp": 0.01024197, "balance_loss_clip": 1.26203871, "balance_loss_mlp": 1.01008272, "epoch": 0.14381482038178264, "flos": 56918309811840.0, "grad_norm": 1.1034181602899682, "language_loss": 0.61851257, "learning_rate": 3.865647023645277e-06, "loss": 0.64270383, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.14160156, "step": 2392, "time_per_iteration": 4.752182245254517 }, { "auxiliary_loss_clip": 0.01614418, "auxiliary_loss_mlp": 0.01059714, "balance_loss_clip": 1.37473547, "balance_loss_mlp": 1.02750349, "epoch": 0.14387494363445064, "flos": 14290099332480.0, "grad_norm": 2.249445640393279, "language_loss": 0.78363097, "learning_rate": 3.865506652147709e-06, "loss": 0.81037223, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 2.39648438, "router_z_loss_mlp": 0.32226562, "step": 2393, "time_per_iteration": 2.8432326316833496 }, { "auxiliary_loss_clip": 0.0159347, "auxiliary_loss_mlp": 0.01063395, "balance_loss_clip": 1.35851943, "balance_loss_mlp": 1.03278255, "epoch": 0.1439350668871186, "flos": 26772961875840.0, "grad_norm": 2.1132475218764113, "language_loss": 0.77863598, "learning_rate": 3.865366209909941e-06, "loss": 0.80520469, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.3059082, "step": 2394, "time_per_iteration": 4.298219442367554 }, { "auxiliary_loss_clip": 0.01594426, "auxiliary_loss_mlp": 0.01064728, "balance_loss_clip": 1.36001563, "balance_loss_mlp": 1.03387702, "epoch": 0.14399519013978657, "flos": 40715381030400.0, "grad_norm": 1.5343387424281707, "language_loss": 0.86661482, "learning_rate": 3.8652256969372994e-06, "loss": 0.89320636, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.30834961, "step": 2395, "time_per_iteration": 4.431222200393677 }, { "auxiliary_loss_clip": 0.01596243, "auxiliary_loss_mlp": 0.01052844, "balance_loss_clip": 1.36416352, "balance_loss_mlp": 1.02208769, "epoch": 0.14405531339245453, "flos": 20567029789440.0, "grad_norm": 1.6233470949404256, "language_loss": 0.83255321, "learning_rate": 3.865085113235113e-06, "loss": 0.85904408, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.30786133, "step": 2396, "time_per_iteration": 2.863414764404297 }, { "auxiliary_loss_clip": 0.01593356, "auxiliary_loss_mlp": 0.01050513, "balance_loss_clip": 1.36140323, "balance_loss_mlp": 1.01715803, "epoch": 0.1441154366451225, "flos": 19582309802880.0, "grad_norm": 2.156972783251991, "language_loss": 0.84967053, "learning_rate": 3.864944458808712e-06, "loss": 0.87610924, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.33349609, "step": 2397, "time_per_iteration": 2.841856002807617 }, { "auxiliary_loss_clip": 0.01602675, "auxiliary_loss_mlp": 0.01047348, "balance_loss_clip": 1.36672473, "balance_loss_mlp": 1.01849914, "epoch": 0.14417555989779046, "flos": 18525053122560.0, "grad_norm": 2.1261393892660925, "language_loss": 0.81257457, "learning_rate": 3.86480373366343e-06, "loss": 0.83907479, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28857422, "step": 2398, "time_per_iteration": 2.8316028118133545 }, { "auxiliary_loss_clip": 0.01592775, "auxiliary_loss_mlp": 0.01055806, "balance_loss_clip": 1.36033463, "balance_loss_mlp": 1.02347636, "epoch": 0.14423568315045843, "flos": 26042934723840.0, "grad_norm": 2.034103437813451, "language_loss": 0.664698, "learning_rate": 3.864662937804603e-06, "loss": 0.69118387, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.32275391, "step": 2399, "time_per_iteration": 2.901139974594116 }, { "auxiliary_loss_clip": 0.015951, "auxiliary_loss_mlp": 0.01044679, "balance_loss_clip": 1.36251307, "balance_loss_mlp": 1.0145669, "epoch": 0.14429580640312642, "flos": 21298821488640.0, "grad_norm": 2.5643106553571333, "language_loss": 0.835513, "learning_rate": 3.864522071237571e-06, "loss": 0.86191082, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.30126953, "step": 2400, "time_per_iteration": 2.841078042984009 }, { "auxiliary_loss_clip": 0.01605178, "auxiliary_loss_mlp": 0.01050308, "balance_loss_clip": 1.36731982, "balance_loss_mlp": 1.01876545, "epoch": 0.14435592965579438, "flos": 25638372552960.0, "grad_norm": 1.6554615050129031, "language_loss": 0.76015818, "learning_rate": 3.864381133967676e-06, "loss": 0.786713, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.31518555, "step": 2401, "time_per_iteration": 2.9077584743499756 }, { "auxiliary_loss_clip": 0.01586079, "auxiliary_loss_mlp": 0.01050051, "balance_loss_clip": 1.35593474, "balance_loss_mlp": 1.02020133, "epoch": 0.14441605290846235, "flos": 22975038040320.0, "grad_norm": 1.5019647851175775, "language_loss": 0.81528318, "learning_rate": 3.86424012600026e-06, "loss": 0.84164447, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.2980957, "step": 2402, "time_per_iteration": 2.8525078296661377 }, { "auxiliary_loss_clip": 0.01599304, "auxiliary_loss_mlp": 0.0105837, "balance_loss_clip": 1.36496878, "balance_loss_mlp": 1.02675557, "epoch": 0.14447617616113032, "flos": 17356548182400.0, "grad_norm": 2.1686336451362607, "language_loss": 0.85697448, "learning_rate": 3.864099047340673e-06, "loss": 0.88355124, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.31591797, "step": 2403, "time_per_iteration": 2.9182560443878174 }, { "auxiliary_loss_clip": 0.01593269, "auxiliary_loss_mlp": 0.01056956, "balance_loss_clip": 1.36167645, "balance_loss_mlp": 1.0249846, "epoch": 0.14453629941379828, "flos": 24070599083520.0, "grad_norm": 1.6363283321188002, "language_loss": 0.71530497, "learning_rate": 3.863957897994262e-06, "loss": 0.74180722, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.31958008, "step": 2404, "time_per_iteration": 2.9120168685913086 }, { "auxiliary_loss_clip": 0.01579333, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.34751749, "balance_loss_mlp": 1.02282739, "epoch": 0.14459642266646625, "flos": 14437887408000.0, "grad_norm": 2.0298217178332507, "language_loss": 0.74419796, "learning_rate": 3.863816677966381e-06, "loss": 0.7705214, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.30151367, "step": 2405, "time_per_iteration": 2.83675217628479 }, { "auxiliary_loss_clip": 0.01586212, "auxiliary_loss_mlp": 0.01056759, "balance_loss_clip": 1.35589576, "balance_loss_mlp": 1.0255022, "epoch": 0.14465654591913424, "flos": 9873713318400.0, "grad_norm": 2.0825444185469424, "language_loss": 0.74851823, "learning_rate": 3.863675387262386e-06, "loss": 0.77494794, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.3125, "step": 2406, "time_per_iteration": 2.946133613586426 }, { "auxiliary_loss_clip": 0.01590213, "auxiliary_loss_mlp": 0.01059567, "balance_loss_clip": 1.35655117, "balance_loss_mlp": 1.02802372, "epoch": 0.1447166691718022, "flos": 24983732517120.0, "grad_norm": 4.04976035652705, "language_loss": 0.76477027, "learning_rate": 3.8635340258876325e-06, "loss": 0.79126811, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.31542969, "step": 2407, "time_per_iteration": 2.867593288421631 }, { "auxiliary_loss_clip": 0.01580872, "auxiliary_loss_mlp": 0.01055166, "balance_loss_clip": 1.35105896, "balance_loss_mlp": 1.02421904, "epoch": 0.14477679242447017, "flos": 21918007584000.0, "grad_norm": 1.7706805467702038, "language_loss": 0.79922009, "learning_rate": 3.8633925938474826e-06, "loss": 0.82558054, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.30932617, "step": 2408, "time_per_iteration": 2.9143497943878174 }, { "auxiliary_loss_clip": 0.0158526, "auxiliary_loss_mlp": 0.0105749, "balance_loss_clip": 1.35634005, "balance_loss_mlp": 1.02516067, "epoch": 0.14483691567713813, "flos": 20750498029440.0, "grad_norm": 5.409536954959013, "language_loss": 0.83895898, "learning_rate": 3.863251091147299e-06, "loss": 0.86538649, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.32324219, "step": 2409, "time_per_iteration": 2.8502981662750244 }, { "auxiliary_loss_clip": 0.01600371, "auxiliary_loss_mlp": 0.01056002, "balance_loss_clip": 1.36257112, "balance_loss_mlp": 1.02605629, "epoch": 0.1448970389298061, "flos": 35421179788800.0, "grad_norm": 2.888511675735519, "language_loss": 0.76108909, "learning_rate": 3.863109517792446e-06, "loss": 0.78765285, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.29956055, "step": 2410, "time_per_iteration": 3.0120110511779785 }, { "auxiliary_loss_clip": 0.01581924, "auxiliary_loss_mlp": 0.0104922, "balance_loss_clip": 1.35067391, "balance_loss_mlp": 1.01954901, "epoch": 0.14495716218247406, "flos": 15422878863360.0, "grad_norm": 1.7346306939130545, "language_loss": 0.81896108, "learning_rate": 3.8629678737882945e-06, "loss": 0.84527254, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.29650879, "step": 2411, "time_per_iteration": 2.8616509437561035 }, { "auxiliary_loss_clip": 0.01577283, "auxiliary_loss_mlp": 0.01061336, "balance_loss_clip": 1.34692359, "balance_loss_mlp": 1.02862465, "epoch": 0.14501728543514203, "flos": 33706342160640.0, "grad_norm": 2.6051819251714283, "language_loss": 0.71400374, "learning_rate": 3.862826159140214e-06, "loss": 0.74038988, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.32714844, "step": 2412, "time_per_iteration": 2.929386854171753 }, { "auxiliary_loss_clip": 0.01575743, "auxiliary_loss_mlp": 0.01052976, "balance_loss_clip": 1.34697652, "balance_loss_mlp": 1.02152848, "epoch": 0.14507740868781002, "flos": 15604492066560.0, "grad_norm": 1.8761303436893757, "language_loss": 0.7789284, "learning_rate": 3.862684373853579e-06, "loss": 0.8052156, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.31469727, "step": 2413, "time_per_iteration": 2.852015495300293 }, { "auxiliary_loss_clip": 0.01370912, "auxiliary_loss_mlp": 0.01075062, "balance_loss_clip": 1.23643339, "balance_loss_mlp": 1.05589318, "epoch": 0.145137531940478, "flos": 66706229710080.0, "grad_norm": 0.9317366031918292, "language_loss": 0.58966589, "learning_rate": 3.8625425179337656e-06, "loss": 0.61412561, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.19140625, "step": 2414, "time_per_iteration": 3.2903523445129395 }, { "auxiliary_loss_clip": 0.01365527, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.23498487, "balance_loss_mlp": 1.01983476, "epoch": 0.14519765519314595, "flos": 67554518065920.0, "grad_norm": 0.8550514228093964, "language_loss": 0.62377703, "learning_rate": 3.862400591386154e-06, "loss": 0.64779377, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 0.16308594, "step": 2415, "time_per_iteration": 3.317106246948242 }, { "auxiliary_loss_clip": 0.01574332, "auxiliary_loss_mlp": 0.01059263, "balance_loss_clip": 1.34486985, "balance_loss_mlp": 1.02748179, "epoch": 0.14525777844581392, "flos": 17207040804480.0, "grad_norm": 2.144750038224124, "language_loss": 0.73410118, "learning_rate": 3.8622585942161245e-06, "loss": 0.76043713, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.31787109, "step": 2416, "time_per_iteration": 2.9066548347473145 }, { "auxiliary_loss_clip": 0.01361149, "auxiliary_loss_mlp": 0.01065733, "balance_loss_clip": 1.23172808, "balance_loss_mlp": 1.05095088, "epoch": 0.14531790169848188, "flos": 65436792324480.0, "grad_norm": 0.7329295152518381, "language_loss": 0.60450971, "learning_rate": 3.8621165264290635e-06, "loss": 0.62877846, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.14746094, "step": 2417, "time_per_iteration": 3.3311612606048584 }, { "auxiliary_loss_clip": 0.01587677, "auxiliary_loss_mlp": 0.01061117, "balance_loss_clip": 1.35425115, "balance_loss_mlp": 1.03174376, "epoch": 0.14537802495114985, "flos": 32575372421760.0, "grad_norm": 8.367554620212376, "language_loss": 0.80068278, "learning_rate": 3.861974388030356e-06, "loss": 0.82717073, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.29345703, "step": 2418, "time_per_iteration": 3.0119051933288574 }, { "auxiliary_loss_clip": 0.01556466, "auxiliary_loss_mlp": 0.01069577, "balance_loss_clip": 1.33280754, "balance_loss_mlp": 1.03996539, "epoch": 0.1454381482038178, "flos": 20235456760320.0, "grad_norm": 1.9943320614385767, "language_loss": 0.72661495, "learning_rate": 3.861832179025394e-06, "loss": 0.75287545, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.29577637, "step": 2419, "time_per_iteration": 2.874368906021118 }, { "auxiliary_loss_clip": 0.01569359, "auxiliary_loss_mlp": 0.01075027, "balance_loss_clip": 1.34072208, "balance_loss_mlp": 1.04560614, "epoch": 0.1454982714564858, "flos": 22903360997760.0, "grad_norm": 2.4676035133631777, "language_loss": 0.91476238, "learning_rate": 3.861689899419569e-06, "loss": 0.94120634, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29443359, "step": 2420, "time_per_iteration": 2.8443875312805176 }, { "auxiliary_loss_clip": 0.01580572, "auxiliary_loss_mlp": 0.0107745, "balance_loss_clip": 1.35306334, "balance_loss_mlp": 1.04898262, "epoch": 0.14555839470915377, "flos": 20239438302720.0, "grad_norm": 2.056867247746963, "language_loss": 0.83415735, "learning_rate": 3.861547549218276e-06, "loss": 0.86073756, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28466797, "step": 2421, "time_per_iteration": 4.289407253265381 }, { "auxiliary_loss_clip": 0.01583608, "auxiliary_loss_mlp": 0.01082025, "balance_loss_clip": 1.35238981, "balance_loss_mlp": 1.05293846, "epoch": 0.14561851796182174, "flos": 22246322987520.0, "grad_norm": 1.5365264888965833, "language_loss": 0.82462537, "learning_rate": 3.861405128426914e-06, "loss": 0.85128164, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29101562, "step": 2422, "time_per_iteration": 2.8514363765716553 }, { "auxiliary_loss_clip": 0.01364927, "auxiliary_loss_mlp": 0.01050362, "balance_loss_clip": 1.23624635, "balance_loss_mlp": 1.03596163, "epoch": 0.1456786412144897, "flos": 52666226225280.0, "grad_norm": 0.9183582729142833, "language_loss": 0.63465953, "learning_rate": 3.861262637050883e-06, "loss": 0.6588124, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.14355469, "step": 2423, "time_per_iteration": 3.37003755569458 }, { "auxiliary_loss_clip": 0.01581372, "auxiliary_loss_mlp": 0.0108397, "balance_loss_clip": 1.35357964, "balance_loss_mlp": 1.05590761, "epoch": 0.14573876446715767, "flos": 23232038359680.0, "grad_norm": 1.8448660604883096, "language_loss": 0.8257817, "learning_rate": 3.861120075095585e-06, "loss": 0.85243511, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.28076172, "step": 2424, "time_per_iteration": 2.9177470207214355 }, { "auxiliary_loss_clip": 0.01575357, "auxiliary_loss_mlp": 0.01081601, "balance_loss_clip": 1.34791541, "balance_loss_mlp": 1.05186987, "epoch": 0.14579888771982563, "flos": 18123612842880.0, "grad_norm": 2.3030985538441424, "language_loss": 0.79468012, "learning_rate": 3.860977442566429e-06, "loss": 0.8212496, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.29736328, "step": 2425, "time_per_iteration": 2.8158977031707764 }, { "auxiliary_loss_clip": 0.01583432, "auxiliary_loss_mlp": 0.01069328, "balance_loss_clip": 1.35475028, "balance_loss_mlp": 1.03990746, "epoch": 0.14585901097249362, "flos": 23010989673600.0, "grad_norm": 3.2397204750564756, "language_loss": 0.84608501, "learning_rate": 3.860834739468821e-06, "loss": 0.8726126, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.29394531, "step": 2426, "time_per_iteration": 2.8816447257995605 }, { "auxiliary_loss_clip": 0.01583948, "auxiliary_loss_mlp": 0.01057699, "balance_loss_clip": 1.35471892, "balance_loss_mlp": 1.03006601, "epoch": 0.1459191342251616, "flos": 21918821990400.0, "grad_norm": 1.90532483152315, "language_loss": 0.88048673, "learning_rate": 3.860691965808173e-06, "loss": 0.90690321, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27648926, "step": 2427, "time_per_iteration": 4.269078254699707 }, { "auxiliary_loss_clip": 0.01579121, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.34598458, "balance_loss_mlp": 1.02599406, "epoch": 0.14597925747782955, "flos": 14983631913600.0, "grad_norm": 2.7558737183506437, "language_loss": 0.68661225, "learning_rate": 3.8605491215899e-06, "loss": 0.71295905, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.29541016, "step": 2428, "time_per_iteration": 2.81520414352417 }, { "auxiliary_loss_clip": 0.01562188, "auxiliary_loss_mlp": 0.01058032, "balance_loss_clip": 1.33635032, "balance_loss_mlp": 1.02903974, "epoch": 0.14603938073049752, "flos": 21078677698560.0, "grad_norm": 5.233510220377944, "language_loss": 0.84185523, "learning_rate": 3.860406206819417e-06, "loss": 0.86805749, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.28955078, "step": 2429, "time_per_iteration": 4.34030294418335 }, { "auxiliary_loss_clip": 0.01566806, "auxiliary_loss_mlp": 0.01064581, "balance_loss_clip": 1.34114206, "balance_loss_mlp": 1.02886629, "epoch": 0.14609950398316549, "flos": 19874221125120.0, "grad_norm": 1.7173646408902759, "language_loss": 0.79886985, "learning_rate": 3.860263221502145e-06, "loss": 0.82518369, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.35742188, "step": 2430, "time_per_iteration": 4.372245788574219 }, { "auxiliary_loss_clip": 0.01591279, "auxiliary_loss_mlp": 0.01053677, "balance_loss_clip": 1.36054361, "balance_loss_mlp": 1.02451897, "epoch": 0.14615962723583345, "flos": 22428976821120.0, "grad_norm": 3.1586732169302643, "language_loss": 0.84933364, "learning_rate": 3.860120165643504e-06, "loss": 0.87578321, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.29150391, "step": 2431, "time_per_iteration": 2.9006173610687256 }, { "auxiliary_loss_clip": 0.01601984, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.36653161, "balance_loss_mlp": 1.0224607, "epoch": 0.14621975048850142, "flos": 22356440127360.0, "grad_norm": 1.8143097874824667, "language_loss": 0.8007319, "learning_rate": 3.859977039248921e-06, "loss": 0.82728148, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.30517578, "step": 2432, "time_per_iteration": 2.853865623474121 }, { "auxiliary_loss_clip": 0.01576944, "auxiliary_loss_mlp": 0.01053462, "balance_loss_clip": 1.34744596, "balance_loss_mlp": 1.0248518, "epoch": 0.1462798737411694, "flos": 24399819383040.0, "grad_norm": 1.9265081765371002, "language_loss": 0.81709933, "learning_rate": 3.859833842323822e-06, "loss": 0.8434034, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28637695, "step": 2433, "time_per_iteration": 2.9274325370788574 }, { "auxiliary_loss_clip": 0.01568038, "auxiliary_loss_mlp": 0.01052679, "balance_loss_clip": 1.34438705, "balance_loss_mlp": 1.0231626, "epoch": 0.14633999699383737, "flos": 19253994399360.0, "grad_norm": 1.9570749873944084, "language_loss": 0.79375911, "learning_rate": 3.859690574873638e-06, "loss": 0.81996632, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.29541016, "step": 2434, "time_per_iteration": 2.8409407138824463 }, { "auxiliary_loss_clip": 0.01343004, "auxiliary_loss_mlp": 0.01020744, "balance_loss_clip": 1.21680844, "balance_loss_mlp": 1.00329196, "epoch": 0.14640012024650534, "flos": 62690152855680.0, "grad_norm": 0.8585401572324304, "language_loss": 0.5845117, "learning_rate": 3.8595472369038e-06, "loss": 0.60814917, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.17480469, "step": 2435, "time_per_iteration": 3.321195363998413 }, { "auxiliary_loss_clip": 0.01555757, "auxiliary_loss_mlp": 0.01052843, "balance_loss_clip": 1.33559108, "balance_loss_mlp": 1.02342176, "epoch": 0.1464602434991733, "flos": 12283848074880.0, "grad_norm": 2.5635179615582615, "language_loss": 0.89512515, "learning_rate": 3.859403828419744e-06, "loss": 0.92121112, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.29443359, "step": 2436, "time_per_iteration": 2.836540460586548 }, { "auxiliary_loss_clip": 0.01585934, "auxiliary_loss_mlp": 0.01056023, "balance_loss_clip": 1.3560164, "balance_loss_mlp": 1.02774692, "epoch": 0.14652036675184127, "flos": 20931885008640.0, "grad_norm": 1.8102987678387317, "language_loss": 0.75726384, "learning_rate": 3.85926034942691e-06, "loss": 0.78368342, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.28295898, "step": 2437, "time_per_iteration": 2.8478307723999023 }, { "auxiliary_loss_clip": 0.0157737, "auxiliary_loss_mlp": 0.01049367, "balance_loss_clip": 1.34832513, "balance_loss_mlp": 1.01756191, "epoch": 0.14658049000450923, "flos": 27713721899520.0, "grad_norm": 2.544263970186643, "language_loss": 0.75034577, "learning_rate": 3.859116799930736e-06, "loss": 0.77661312, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.31787109, "step": 2438, "time_per_iteration": 2.9239120483398438 }, { "auxiliary_loss_clip": 0.01590221, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.36167932, "balance_loss_mlp": 1.0230062, "epoch": 0.14664061325717723, "flos": 24947328435840.0, "grad_norm": 2.29104043141979, "language_loss": 0.75217468, "learning_rate": 3.858973179936668e-06, "loss": 0.77859163, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28491211, "step": 2439, "time_per_iteration": 2.8377459049224854 }, { "auxiliary_loss_clip": 0.01574495, "auxiliary_loss_mlp": 0.01047685, "balance_loss_clip": 1.34895861, "balance_loss_mlp": 1.01953971, "epoch": 0.1467007365098452, "flos": 40312628651520.0, "grad_norm": 2.208045634929891, "language_loss": 0.75875598, "learning_rate": 3.85882948945015e-06, "loss": 0.78497779, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.28137207, "step": 2440, "time_per_iteration": 2.9897854328155518 }, { "auxiliary_loss_clip": 0.01560824, "auxiliary_loss_mlp": 0.01051798, "balance_loss_clip": 1.33978009, "balance_loss_mlp": 1.02267528, "epoch": 0.14676085976251316, "flos": 26552094168960.0, "grad_norm": 1.5289094849396603, "language_loss": 0.83531106, "learning_rate": 3.85868572847663e-06, "loss": 0.86143726, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.2911377, "step": 2441, "time_per_iteration": 2.8824429512023926 }, { "auxiliary_loss_clip": 0.01593841, "auxiliary_loss_mlp": 0.01049517, "balance_loss_clip": 1.36109829, "balance_loss_mlp": 1.01973855, "epoch": 0.14682098301518112, "flos": 23560579987200.0, "grad_norm": 4.927821074044505, "language_loss": 0.73849154, "learning_rate": 3.858541897021563e-06, "loss": 0.76492506, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.29772949, "step": 2442, "time_per_iteration": 2.905592679977417 }, { "auxiliary_loss_clip": 0.01606083, "auxiliary_loss_mlp": 0.01047369, "balance_loss_clip": 1.36716187, "balance_loss_mlp": 1.01747131, "epoch": 0.1468811062678491, "flos": 11657241832320.0, "grad_norm": 3.3918386416994726, "language_loss": 0.83834481, "learning_rate": 3.8583979950904e-06, "loss": 0.86487937, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.29882812, "step": 2443, "time_per_iteration": 2.887540817260742 }, { "auxiliary_loss_clip": 0.01572524, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.34579921, "balance_loss_mlp": 1.01852942, "epoch": 0.14694122952051705, "flos": 23012889955200.0, "grad_norm": 2.4454216051832858, "language_loss": 0.83900464, "learning_rate": 3.858254022688599e-06, "loss": 0.86521602, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.30102539, "step": 2444, "time_per_iteration": 2.8506932258605957 }, { "auxiliary_loss_clip": 0.01580207, "auxiliary_loss_mlp": 0.01050748, "balance_loss_clip": 1.35115623, "balance_loss_mlp": 1.02149463, "epoch": 0.14700135277318502, "flos": 26513563582080.0, "grad_norm": 3.999043060794391, "language_loss": 0.72562361, "learning_rate": 3.85810997982162e-06, "loss": 0.75193322, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29272461, "step": 2445, "time_per_iteration": 2.9209847450256348 }, { "auxiliary_loss_clip": 0.01346392, "auxiliary_loss_mlp": 0.01023744, "balance_loss_clip": 1.2227, "balance_loss_mlp": 1.0120616, "epoch": 0.147061476025853, "flos": 59477997191040.0, "grad_norm": 0.8312323359699864, "language_loss": 0.63300699, "learning_rate": 3.857965866494923e-06, "loss": 0.65670836, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.11669922, "step": 2446, "time_per_iteration": 3.293339490890503 }, { "auxiliary_loss_clip": 0.015799, "auxiliary_loss_mlp": 0.0105626, "balance_loss_clip": 1.35137892, "balance_loss_mlp": 1.02686274, "epoch": 0.14712159927852098, "flos": 28342907095680.0, "grad_norm": 1.5366549896560606, "language_loss": 0.75865173, "learning_rate": 3.857821682713975e-06, "loss": 0.78501332, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.29394531, "step": 2447, "time_per_iteration": 2.956453323364258 }, { "auxiliary_loss_clip": 0.01577446, "auxiliary_loss_mlp": 0.01055541, "balance_loss_clip": 1.35084093, "balance_loss_mlp": 1.02633488, "epoch": 0.14718172253118894, "flos": 27101367768960.0, "grad_norm": 1.9467285956481803, "language_loss": 0.86396098, "learning_rate": 3.857677428484242e-06, "loss": 0.89029086, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.29174805, "step": 2448, "time_per_iteration": 3.068194627761841 }, { "auxiliary_loss_clip": 0.01349095, "auxiliary_loss_mlp": 0.01058682, "balance_loss_clip": 1.22351503, "balance_loss_mlp": 1.04227912, "epoch": 0.1472418457838569, "flos": 66736344764160.0, "grad_norm": 0.7725532966744346, "language_loss": 0.56921339, "learning_rate": 3.857533103811195e-06, "loss": 0.59329116, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 1.2578125, "router_z_loss_mlp": 0.1640625, "step": 2449, "time_per_iteration": 3.2737629413604736 }, { "auxiliary_loss_clip": 0.01560233, "auxiliary_loss_mlp": 0.01057431, "balance_loss_clip": 1.3374387, "balance_loss_mlp": 1.02693677, "epoch": 0.14730196903652487, "flos": 19583124209280.0, "grad_norm": 2.422147013724452, "language_loss": 0.86359602, "learning_rate": 3.857388708700307e-06, "loss": 0.88977265, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.30493164, "step": 2450, "time_per_iteration": 2.8711159229278564 }, { "auxiliary_loss_clip": 0.01580493, "auxiliary_loss_mlp": 0.01053035, "balance_loss_clip": 1.3503201, "balance_loss_mlp": 1.02403164, "epoch": 0.14736209228919284, "flos": 16079690649600.0, "grad_norm": 1.8123697517463242, "language_loss": 0.76290739, "learning_rate": 3.857244243157052e-06, "loss": 0.78924263, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.28967285, "step": 2451, "time_per_iteration": 2.82853102684021 }, { "auxiliary_loss_clip": 0.01561362, "auxiliary_loss_mlp": 0.01053265, "balance_loss_clip": 1.34118867, "balance_loss_mlp": 1.0243454, "epoch": 0.1474222155418608, "flos": 23049836974080.0, "grad_norm": 1.9823887850699196, "language_loss": 0.82716966, "learning_rate": 3.85709970718691e-06, "loss": 0.85331589, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.28881836, "step": 2452, "time_per_iteration": 2.8779518604278564 }, { "auxiliary_loss_clip": 0.01573375, "auxiliary_loss_mlp": 0.01053269, "balance_loss_clip": 1.34931266, "balance_loss_mlp": 1.02486098, "epoch": 0.1474823387945288, "flos": 17027282638080.0, "grad_norm": 1.851489603150712, "language_loss": 0.74730116, "learning_rate": 3.856955100795361e-06, "loss": 0.77356762, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.28417969, "step": 2453, "time_per_iteration": 2.8405158519744873 }, { "auxiliary_loss_clip": 0.01584998, "auxiliary_loss_mlp": 0.01056789, "balance_loss_clip": 1.35608816, "balance_loss_mlp": 1.02777338, "epoch": 0.14754246204719676, "flos": 17904102480000.0, "grad_norm": 2.0879907031203224, "language_loss": 0.78202295, "learning_rate": 3.856810423987889e-06, "loss": 0.8084408, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29003906, "step": 2454, "time_per_iteration": 2.8011245727539062 }, { "auxiliary_loss_clip": 0.01568388, "auxiliary_loss_mlp": 0.01058141, "balance_loss_clip": 1.34540582, "balance_loss_mlp": 1.03007889, "epoch": 0.14760258529986472, "flos": 13086954858240.0, "grad_norm": 3.7247532349304953, "language_loss": 0.84016323, "learning_rate": 3.856665676769979e-06, "loss": 0.86642855, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.28076172, "step": 2455, "time_per_iteration": 2.8156449794769287 }, { "auxiliary_loss_clip": 0.01594737, "auxiliary_loss_mlp": 0.01058132, "balance_loss_clip": 1.36274457, "balance_loss_mlp": 1.02925968, "epoch": 0.1476627085525327, "flos": 30817615461120.0, "grad_norm": 1.8067855492741638, "language_loss": 0.84960383, "learning_rate": 3.85652085914712e-06, "loss": 0.87613249, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.28881836, "step": 2456, "time_per_iteration": 4.3665876388549805 }, { "auxiliary_loss_clip": 0.0156687, "auxiliary_loss_mlp": 0.01055587, "balance_loss_clip": 1.34509969, "balance_loss_mlp": 1.02659535, "epoch": 0.14772283180520066, "flos": 21699447361920.0, "grad_norm": 1.6570976695327901, "language_loss": 0.85331202, "learning_rate": 3.856375971124805e-06, "loss": 0.87953663, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.28955078, "step": 2457, "time_per_iteration": 2.893263101577759 }, { "auxiliary_loss_clip": 0.01561137, "auxiliary_loss_mlp": 0.01056159, "balance_loss_clip": 1.34075928, "balance_loss_mlp": 1.02590418, "epoch": 0.14778295505786862, "flos": 18779610222720.0, "grad_norm": 2.0235330563948737, "language_loss": 0.76298487, "learning_rate": 3.856231012708527e-06, "loss": 0.78915787, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.30273438, "step": 2458, "time_per_iteration": 2.8886444568634033 }, { "auxiliary_loss_clip": 0.01599291, "auxiliary_loss_mlp": 0.01060264, "balance_loss_clip": 1.36543226, "balance_loss_mlp": 1.03060508, "epoch": 0.1478430783105366, "flos": 22903903935360.0, "grad_norm": 2.072392561101042, "language_loss": 0.84571671, "learning_rate": 3.856085983903782e-06, "loss": 0.87231225, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.29602051, "step": 2459, "time_per_iteration": 2.815490245819092 }, { "auxiliary_loss_clip": 0.01565602, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.3433665, "balance_loss_mlp": 1.02362835, "epoch": 0.14790320156320458, "flos": 15093568074240.0, "grad_norm": 2.3041233555635605, "language_loss": 0.76149702, "learning_rate": 3.855940884716071e-06, "loss": 0.7876699, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.28076172, "step": 2460, "time_per_iteration": 2.8124966621398926 }, { "auxiliary_loss_clip": 0.01574041, "auxiliary_loss_mlp": 0.01053948, "balance_loss_clip": 1.34450173, "balance_loss_mlp": 1.02502751, "epoch": 0.14796332481587254, "flos": 26515825822080.0, "grad_norm": 1.6567060379051366, "language_loss": 0.82039517, "learning_rate": 3.855795715150896e-06, "loss": 0.84667504, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.28930664, "step": 2461, "time_per_iteration": 2.8879010677337646 }, { "auxiliary_loss_clip": 0.01567956, "auxiliary_loss_mlp": 0.010576, "balance_loss_clip": 1.34080243, "balance_loss_mlp": 1.02662992, "epoch": 0.1480234480685405, "flos": 17571262596480.0, "grad_norm": 2.4670100001332678, "language_loss": 0.68136472, "learning_rate": 3.855650475213761e-06, "loss": 0.70762032, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.30981445, "step": 2462, "time_per_iteration": 4.256190299987793 }, { "auxiliary_loss_clip": 0.01576922, "auxiliary_loss_mlp": 0.01055861, "balance_loss_clip": 1.34876978, "balance_loss_mlp": 1.02543879, "epoch": 0.14808357132120847, "flos": 53601584135040.0, "grad_norm": 1.754159694087517, "language_loss": 0.68145126, "learning_rate": 3.8555051649101745e-06, "loss": 0.70777905, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.30444336, "step": 2463, "time_per_iteration": 3.153669595718384 }, { "auxiliary_loss_clip": 0.01576067, "auxiliary_loss_mlp": 0.01053601, "balance_loss_clip": 1.34836817, "balance_loss_mlp": 1.02370334, "epoch": 0.14814369457387644, "flos": 19838857674240.0, "grad_norm": 2.018223381303197, "language_loss": 0.78488469, "learning_rate": 3.855359784245646e-06, "loss": 0.81118131, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.29882812, "step": 2464, "time_per_iteration": 5.678634166717529 }, { "auxiliary_loss_clip": 0.01565264, "auxiliary_loss_mlp": 0.01046345, "balance_loss_clip": 1.3419764, "balance_loss_mlp": 1.01853299, "epoch": 0.1482038178265444, "flos": 23925525696000.0, "grad_norm": 1.8028690473382691, "language_loss": 0.80604744, "learning_rate": 3.855214333225688e-06, "loss": 0.83216351, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.2779541, "step": 2465, "time_per_iteration": 2.90708327293396 }, { "auxiliary_loss_clip": 0.01594262, "auxiliary_loss_mlp": 0.01054808, "balance_loss_clip": 1.36080277, "balance_loss_mlp": 1.02257442, "epoch": 0.1482639410792124, "flos": 24181168671360.0, "grad_norm": 1.9008397545447315, "language_loss": 0.77257395, "learning_rate": 3.855068811855817e-06, "loss": 0.79906464, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.32226562, "step": 2466, "time_per_iteration": 2.842912435531616 }, { "auxiliary_loss_clip": 0.0137556, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.23823953, "balance_loss_mlp": 1.01764512, "epoch": 0.14832406433188036, "flos": 66219222234240.0, "grad_norm": 0.8151143362827461, "language_loss": 0.60201919, "learning_rate": 3.854923220141551e-06, "loss": 0.62613821, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 1.375, "router_z_loss_mlp": 0.18652344, "step": 2467, "time_per_iteration": 3.4398300647735596 }, { "auxiliary_loss_clip": 0.01583321, "auxiliary_loss_mlp": 0.0104779, "balance_loss_clip": 1.35543346, "balance_loss_mlp": 1.016891, "epoch": 0.14838418758454833, "flos": 25422300794880.0, "grad_norm": 2.2672539722125156, "language_loss": 0.88651955, "learning_rate": 3.85477755808841e-06, "loss": 0.91283065, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.30883789, "step": 2468, "time_per_iteration": 2.9043235778808594 }, { "auxiliary_loss_clip": 0.01588538, "auxiliary_loss_mlp": 0.01052995, "balance_loss_clip": 1.35529876, "balance_loss_mlp": 1.02152348, "epoch": 0.1484443108372163, "flos": 23299055187840.0, "grad_norm": 2.0019168657756117, "language_loss": 0.77245438, "learning_rate": 3.854631825701919e-06, "loss": 0.79886973, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.31469727, "step": 2469, "time_per_iteration": 2.849865674972534 }, { "auxiliary_loss_clip": 0.01573398, "auxiliary_loss_mlp": 0.01050516, "balance_loss_clip": 1.34565401, "balance_loss_mlp": 1.02083337, "epoch": 0.14850443408988426, "flos": 14655859447680.0, "grad_norm": 2.0674842589291527, "language_loss": 0.77087063, "learning_rate": 3.854486022987603e-06, "loss": 0.79710978, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.296875, "step": 2470, "time_per_iteration": 2.8888399600982666 }, { "auxiliary_loss_clip": 0.01569185, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.34607434, "balance_loss_mlp": 1.01779079, "epoch": 0.14856455734255222, "flos": 23558317747200.0, "grad_norm": 1.732708116492121, "language_loss": 0.73290253, "learning_rate": 3.8543401499509905e-06, "loss": 0.75906193, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.28955078, "step": 2471, "time_per_iteration": 2.8913724422454834 }, { "auxiliary_loss_clip": 0.01592351, "auxiliary_loss_mlp": 0.01053073, "balance_loss_clip": 1.35794723, "balance_loss_mlp": 1.02129173, "epoch": 0.1486246805952202, "flos": 18086349110400.0, "grad_norm": 1.7085807737250474, "language_loss": 0.90655851, "learning_rate": 3.854194206597615e-06, "loss": 0.93301278, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.31811523, "step": 2472, "time_per_iteration": 2.8207309246063232 }, { "auxiliary_loss_clip": 0.01575005, "auxiliary_loss_mlp": 0.01049289, "balance_loss_clip": 1.34886146, "balance_loss_mlp": 1.01901042, "epoch": 0.14868480384788818, "flos": 19362844684800.0, "grad_norm": 2.4397001065258848, "language_loss": 0.82335877, "learning_rate": 3.854048192933008e-06, "loss": 0.84960175, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.30322266, "step": 2473, "time_per_iteration": 2.7941675186157227 }, { "auxiliary_loss_clip": 0.01584387, "auxiliary_loss_mlp": 0.01054184, "balance_loss_clip": 1.35510969, "balance_loss_mlp": 1.02383351, "epoch": 0.14874492710055615, "flos": 22210326109440.0, "grad_norm": 2.1202060645004175, "language_loss": 0.79939437, "learning_rate": 3.853902108962709e-06, "loss": 0.82578009, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.30322266, "step": 2474, "time_per_iteration": 2.8389248847961426 }, { "auxiliary_loss_clip": 0.01593018, "auxiliary_loss_mlp": 0.01054813, "balance_loss_clip": 1.35854149, "balance_loss_mlp": 1.02429569, "epoch": 0.1488050503532241, "flos": 21112729050240.0, "grad_norm": 1.7330099272939044, "language_loss": 0.83658731, "learning_rate": 3.853755954692255e-06, "loss": 0.8630656, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.30541992, "step": 2475, "time_per_iteration": 2.8134829998016357 }, { "auxiliary_loss_clip": 0.01579512, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.35233831, "balance_loss_mlp": 1.02488708, "epoch": 0.14886517360589208, "flos": 12794183884800.0, "grad_norm": 1.726402244021351, "language_loss": 0.81743729, "learning_rate": 3.85360973012719e-06, "loss": 0.84379548, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.31420898, "step": 2476, "time_per_iteration": 2.816865921020508 }, { "auxiliary_loss_clip": 0.01562594, "auxiliary_loss_mlp": 0.01049872, "balance_loss_clip": 1.34267426, "balance_loss_mlp": 1.01890159, "epoch": 0.14892529685856004, "flos": 29034720374400.0, "grad_norm": 2.134126377427737, "language_loss": 0.7875638, "learning_rate": 3.853463435273058e-06, "loss": 0.81368846, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.30957031, "step": 2477, "time_per_iteration": 3.0155012607574463 }, { "auxiliary_loss_clip": 0.01366724, "auxiliary_loss_mlp": 0.01040245, "balance_loss_clip": 1.22768402, "balance_loss_mlp": 1.02002716, "epoch": 0.148985420111228, "flos": 61954198634880.0, "grad_norm": 0.8136519674463651, "language_loss": 0.60190082, "learning_rate": 3.853317070135407e-06, "loss": 0.62597048, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.20214844, "step": 2478, "time_per_iteration": 3.433900833129883 }, { "auxiliary_loss_clip": 0.01578561, "auxiliary_loss_mlp": 0.01055321, "balance_loss_clip": 1.35040665, "balance_loss_mlp": 1.02644849, "epoch": 0.149045543363896, "flos": 23925706675200.0, "grad_norm": 3.3462954398109206, "language_loss": 0.71683741, "learning_rate": 3.853170634719787e-06, "loss": 0.74317622, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.2890625, "step": 2479, "time_per_iteration": 2.8847897052764893 }, { "auxiliary_loss_clip": 0.01586486, "auxiliary_loss_mlp": 0.01055305, "balance_loss_clip": 1.3568666, "balance_loss_mlp": 1.02590847, "epoch": 0.14910566661656396, "flos": 23663548448640.0, "grad_norm": 1.5766715478820439, "language_loss": 0.81642568, "learning_rate": 3.853024129031751e-06, "loss": 0.84284353, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.29394531, "step": 2480, "time_per_iteration": 2.863640785217285 }, { "auxiliary_loss_clip": 0.01608447, "auxiliary_loss_mlp": 0.01059295, "balance_loss_clip": 1.37329602, "balance_loss_mlp": 1.03051805, "epoch": 0.14916578986923193, "flos": 20523296050560.0, "grad_norm": 2.4328650103556186, "language_loss": 0.84873092, "learning_rate": 3.852877553076854e-06, "loss": 0.87540829, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.2878418, "step": 2481, "time_per_iteration": 2.8146698474884033 }, { "auxiliary_loss_clip": 0.0158862, "auxiliary_loss_mlp": 0.01060844, "balance_loss_clip": 1.35699892, "balance_loss_mlp": 1.02925384, "epoch": 0.1492259131218999, "flos": 22501965962880.0, "grad_norm": 1.913244575723662, "language_loss": 0.78515172, "learning_rate": 3.8527309068606546e-06, "loss": 0.8116464, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.31591797, "step": 2482, "time_per_iteration": 2.8864920139312744 }, { "auxiliary_loss_clip": 0.01614341, "auxiliary_loss_mlp": 0.01062443, "balance_loss_clip": 1.37569475, "balance_loss_mlp": 1.03323638, "epoch": 0.14928603637456786, "flos": 23196403440000.0, "grad_norm": 2.1115634098474523, "language_loss": 0.81500131, "learning_rate": 3.852584190388713e-06, "loss": 0.8417691, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.29223633, "step": 2483, "time_per_iteration": 2.8445591926574707 }, { "auxiliary_loss_clip": 0.0159343, "auxiliary_loss_mlp": 0.01062421, "balance_loss_clip": 1.3684566, "balance_loss_mlp": 1.03361988, "epoch": 0.14934615962723582, "flos": 21663314749440.0, "grad_norm": 1.6152145951689327, "language_loss": 0.71160209, "learning_rate": 3.852437403666595e-06, "loss": 0.73816061, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.28808594, "step": 2484, "time_per_iteration": 2.895587682723999 }, { "auxiliary_loss_clip": 0.01599585, "auxiliary_loss_mlp": 0.01066277, "balance_loss_clip": 1.36653781, "balance_loss_mlp": 1.03604579, "epoch": 0.1494062828799038, "flos": 27019827360000.0, "grad_norm": 1.8859334464284532, "language_loss": 0.85205758, "learning_rate": 3.852290546699863e-06, "loss": 0.87871617, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.30249023, "step": 2485, "time_per_iteration": 2.8532397747039795 }, { "auxiliary_loss_clip": 0.01601726, "auxiliary_loss_mlp": 0.01066782, "balance_loss_clip": 1.36763835, "balance_loss_mlp": 1.03837395, "epoch": 0.14946640613257178, "flos": 21224791716480.0, "grad_norm": 1.9856483252174038, "language_loss": 0.86183703, "learning_rate": 3.8521436194940894e-06, "loss": 0.88852215, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.2845459, "step": 2486, "time_per_iteration": 2.85714054107666 }, { "auxiliary_loss_clip": 0.01586767, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.36299908, "balance_loss_mlp": 1.03172541, "epoch": 0.14952652938523975, "flos": 13378775690880.0, "grad_norm": 2.157492212902992, "language_loss": 0.75486183, "learning_rate": 3.851996622054842e-06, "loss": 0.78132296, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27648926, "step": 2487, "time_per_iteration": 2.8233225345611572 }, { "auxiliary_loss_clip": 0.01599413, "auxiliary_loss_mlp": 0.01064572, "balance_loss_clip": 1.36829543, "balance_loss_mlp": 1.03519833, "epoch": 0.1495866526379077, "flos": 35531342173440.0, "grad_norm": 1.854263091786796, "language_loss": 0.72683024, "learning_rate": 3.8518495543877e-06, "loss": 0.75347006, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29345703, "step": 2488, "time_per_iteration": 2.973961114883423 }, { "auxiliary_loss_clip": 0.01621543, "auxiliary_loss_mlp": 0.01065134, "balance_loss_clip": 1.38320971, "balance_loss_mlp": 1.03676164, "epoch": 0.14964677589057568, "flos": 17639953482240.0, "grad_norm": 3.3625234778704334, "language_loss": 0.71787453, "learning_rate": 3.851702416498235e-06, "loss": 0.74474126, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.28369141, "step": 2489, "time_per_iteration": 2.8675944805145264 }, { "auxiliary_loss_clip": 0.01618391, "auxiliary_loss_mlp": 0.01076761, "balance_loss_clip": 1.38224351, "balance_loss_mlp": 1.04733992, "epoch": 0.14970689914324364, "flos": 20192763651840.0, "grad_norm": 4.7721797399625885, "language_loss": 0.83096647, "learning_rate": 3.8515552083920295e-06, "loss": 0.85791796, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.29443359, "step": 2490, "time_per_iteration": 4.32551646232605 }, { "auxiliary_loss_clip": 0.01628426, "auxiliary_loss_mlp": 0.01079905, "balance_loss_clip": 1.39350033, "balance_loss_mlp": 1.05264211, "epoch": 0.1497670223959116, "flos": 37241791056000.0, "grad_norm": 2.438645370294875, "language_loss": 0.81092119, "learning_rate": 3.851407930074666e-06, "loss": 0.83800453, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27282715, "step": 2491, "time_per_iteration": 2.9879095554351807 }, { "auxiliary_loss_clip": 0.01613618, "auxiliary_loss_mlp": 0.01070407, "balance_loss_clip": 1.37878942, "balance_loss_mlp": 1.04101038, "epoch": 0.1498271456485796, "flos": 24464890684800.0, "grad_norm": 1.6562272764995425, "language_loss": 0.91951412, "learning_rate": 3.851260581551727e-06, "loss": 0.94635439, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29370117, "step": 2492, "time_per_iteration": 2.9076340198516846 }, { "auxiliary_loss_clip": 0.01612428, "auxiliary_loss_mlp": 0.01071034, "balance_loss_clip": 1.38002348, "balance_loss_mlp": 1.04211402, "epoch": 0.14988726890124757, "flos": 16262344483200.0, "grad_norm": 2.279238099326968, "language_loss": 0.82309246, "learning_rate": 3.851113162828802e-06, "loss": 0.84992707, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.28930664, "step": 2493, "time_per_iteration": 2.7941198348999023 }, { "auxiliary_loss_clip": 0.01622126, "auxiliary_loss_mlp": 0.0106894, "balance_loss_clip": 1.38497853, "balance_loss_mlp": 1.03947115, "epoch": 0.14994739215391553, "flos": 20675970564480.0, "grad_norm": 1.9662286675992946, "language_loss": 0.81213319, "learning_rate": 3.85096567391148e-06, "loss": 0.8390438, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.29492188, "step": 2494, "time_per_iteration": 2.8706235885620117 }, { "auxiliary_loss_clip": 0.01603992, "auxiliary_loss_mlp": 0.01064057, "balance_loss_clip": 1.37315524, "balance_loss_mlp": 1.03506505, "epoch": 0.1500075154065835, "flos": 70674844769280.0, "grad_norm": 1.799707891145228, "language_loss": 0.67484945, "learning_rate": 3.850818114805354e-06, "loss": 0.70152998, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2902832, "step": 2495, "time_per_iteration": 3.234606981277466 }, { "auxiliary_loss_clip": 0.01373944, "auxiliary_loss_mlp": 0.01064887, "balance_loss_clip": 1.23830998, "balance_loss_mlp": 1.04361999, "epoch": 0.15006763865925146, "flos": 68039697767040.0, "grad_norm": 0.8938968667786334, "language_loss": 0.59531236, "learning_rate": 3.850670485516019e-06, "loss": 0.61970061, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.21289062, "step": 2496, "time_per_iteration": 3.397343635559082 }, { "auxiliary_loss_clip": 0.01622583, "auxiliary_loss_mlp": 0.01054606, "balance_loss_clip": 1.38503027, "balance_loss_mlp": 1.02632964, "epoch": 0.15012776191191943, "flos": 18925498016640.0, "grad_norm": 1.8480056741254496, "language_loss": 0.66835529, "learning_rate": 3.850522786049075e-06, "loss": 0.69512719, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28295898, "step": 2497, "time_per_iteration": 4.300506591796875 }, { "auxiliary_loss_clip": 0.016113, "auxiliary_loss_mlp": 0.01058123, "balance_loss_clip": 1.3771522, "balance_loss_mlp": 1.03068137, "epoch": 0.1501878851645874, "flos": 23713209256320.0, "grad_norm": 1.5262253072980674, "language_loss": 0.76015729, "learning_rate": 3.850375016410121e-06, "loss": 0.78685153, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.2746582, "step": 2498, "time_per_iteration": 4.32857871055603 }, { "auxiliary_loss_clip": 0.01624837, "auxiliary_loss_mlp": 0.01055719, "balance_loss_clip": 1.38760042, "balance_loss_mlp": 1.02462912, "epoch": 0.15024800841725539, "flos": 20422227870720.0, "grad_norm": 2.1325572635186716, "language_loss": 0.72917664, "learning_rate": 3.850227176604761e-06, "loss": 0.75598216, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.3112793, "step": 2499, "time_per_iteration": 4.273524522781372 }, { "auxiliary_loss_clip": 0.01617872, "auxiliary_loss_mlp": 0.01052227, "balance_loss_clip": 1.38276005, "balance_loss_mlp": 1.0227828, "epoch": 0.15030813166992335, "flos": 31843264008960.0, "grad_norm": 1.8672666170380017, "language_loss": 0.72872341, "learning_rate": 3.850079266638601e-06, "loss": 0.75542444, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.29443359, "step": 2500, "time_per_iteration": 2.925649642944336 }, { "auxiliary_loss_clip": 0.01604748, "auxiliary_loss_mlp": 0.01052024, "balance_loss_clip": 1.37215281, "balance_loss_mlp": 1.0232712, "epoch": 0.15036825492259132, "flos": 35669990799360.0, "grad_norm": 1.972318449261161, "language_loss": 0.66499323, "learning_rate": 3.849931286517249e-06, "loss": 0.69156098, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.28735352, "step": 2501, "time_per_iteration": 2.989948034286499 }, { "auxiliary_loss_clip": 0.0160788, "auxiliary_loss_mlp": 0.01052206, "balance_loss_clip": 1.37502563, "balance_loss_mlp": 1.02266574, "epoch": 0.15042837817525928, "flos": 18846038868480.0, "grad_norm": 2.7061487107907083, "language_loss": 0.84778643, "learning_rate": 3.849783236246318e-06, "loss": 0.87438732, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.29589844, "step": 2502, "time_per_iteration": 2.8654980659484863 }, { "auxiliary_loss_clip": 0.01605688, "auxiliary_loss_mlp": 0.01056408, "balance_loss_clip": 1.37317264, "balance_loss_mlp": 1.02741647, "epoch": 0.15048850142792725, "flos": 19544684112000.0, "grad_norm": 1.8374711822035608, "language_loss": 0.77874076, "learning_rate": 3.849635115831421e-06, "loss": 0.80536175, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.28979492, "step": 2503, "time_per_iteration": 2.840341091156006 }, { "auxiliary_loss_clip": 0.01605196, "auxiliary_loss_mlp": 0.01050208, "balance_loss_clip": 1.37503326, "balance_loss_mlp": 1.02171719, "epoch": 0.1505486246805952, "flos": 22027581786240.0, "grad_norm": 1.7615489857848237, "language_loss": 0.86261505, "learning_rate": 3.849486925278176e-06, "loss": 0.88916916, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.2845459, "step": 2504, "time_per_iteration": 2.8831355571746826 }, { "auxiliary_loss_clip": 0.01593642, "auxiliary_loss_mlp": 0.01052581, "balance_loss_clip": 1.36484826, "balance_loss_mlp": 1.02373278, "epoch": 0.15060874793326318, "flos": 20753529431040.0, "grad_norm": 1.5854309510794777, "language_loss": 0.83310044, "learning_rate": 3.8493386645922e-06, "loss": 0.8595627, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.28857422, "step": 2505, "time_per_iteration": 2.8599815368652344 }, { "auxiliary_loss_clip": 0.01602427, "auxiliary_loss_mlp": 0.01049483, "balance_loss_clip": 1.37026608, "balance_loss_mlp": 1.01980019, "epoch": 0.15066887118593117, "flos": 16480542746880.0, "grad_norm": 2.2385796362695713, "language_loss": 0.77354944, "learning_rate": 3.849190333779117e-06, "loss": 0.8000685, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.29663086, "step": 2506, "time_per_iteration": 2.868196487426758 }, { "auxiliary_loss_clip": 0.0162217, "auxiliary_loss_mlp": 0.01052109, "balance_loss_clip": 1.38238823, "balance_loss_mlp": 1.02234256, "epoch": 0.15072899443859913, "flos": 19867343915520.0, "grad_norm": 2.973015422606169, "language_loss": 0.78165102, "learning_rate": 3.849041932844552e-06, "loss": 0.80839384, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 2.39453125, "router_z_loss_mlp": 0.29797363, "step": 2507, "time_per_iteration": 2.8609442710876465 }, { "auxiliary_loss_clip": 0.01601825, "auxiliary_loss_mlp": 0.01058746, "balance_loss_clip": 1.37402892, "balance_loss_mlp": 1.0287528, "epoch": 0.1507891176912671, "flos": 20785725745920.0, "grad_norm": 2.051840978766195, "language_loss": 0.70352858, "learning_rate": 3.848893461794131e-06, "loss": 0.73013425, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.30029297, "step": 2508, "time_per_iteration": 2.8914198875427246 }, { "auxiliary_loss_clip": 0.01625115, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.38872039, "balance_loss_mlp": 1.0262686, "epoch": 0.15084924094393506, "flos": 23597119802880.0, "grad_norm": 2.267611538489173, "language_loss": 0.78644097, "learning_rate": 3.8487449206334845e-06, "loss": 0.81324196, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28723145, "step": 2509, "time_per_iteration": 2.9159038066864014 }, { "auxiliary_loss_clip": 0.01631631, "auxiliary_loss_mlp": 0.01058989, "balance_loss_clip": 1.38951218, "balance_loss_mlp": 1.02792335, "epoch": 0.15090936419660303, "flos": 18919254234240.0, "grad_norm": 3.982505650285804, "language_loss": 0.83239502, "learning_rate": 3.848596309368246e-06, "loss": 0.85930121, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.3104248, "step": 2510, "time_per_iteration": 2.8010637760162354 }, { "auxiliary_loss_clip": 0.01610491, "auxiliary_loss_mlp": 0.01051064, "balance_loss_clip": 1.37485719, "balance_loss_mlp": 1.01990235, "epoch": 0.150969487449271, "flos": 17936434529280.0, "grad_norm": 2.0322452607826156, "language_loss": 0.7523759, "learning_rate": 3.8484476280040495e-06, "loss": 0.77899146, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.31152344, "step": 2511, "time_per_iteration": 2.8725454807281494 }, { "auxiliary_loss_clip": 0.01599533, "auxiliary_loss_mlp": 0.01051421, "balance_loss_clip": 1.36880207, "balance_loss_mlp": 1.02259648, "epoch": 0.151029610701939, "flos": 24253660120320.0, "grad_norm": 2.1215117536320194, "language_loss": 0.70300722, "learning_rate": 3.848298876546534e-06, "loss": 0.72951674, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28845215, "step": 2512, "time_per_iteration": 2.8383121490478516 }, { "auxiliary_loss_clip": 0.01598163, "auxiliary_loss_mlp": 0.01053415, "balance_loss_clip": 1.36750031, "balance_loss_mlp": 1.02559137, "epoch": 0.15108973395460695, "flos": 30274449909120.0, "grad_norm": 2.3107800456872636, "language_loss": 0.75363833, "learning_rate": 3.84815005500134e-06, "loss": 0.78015411, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.27819824, "step": 2513, "time_per_iteration": 3.0004611015319824 }, { "auxiliary_loss_clip": 0.0135552, "auxiliary_loss_mlp": 0.01021172, "balance_loss_clip": 1.22169447, "balance_loss_mlp": 1.0036248, "epoch": 0.15114985720727492, "flos": 60467241657600.0, "grad_norm": 0.875193166726578, "language_loss": 0.6494174, "learning_rate": 3.84800116337411e-06, "loss": 0.67318434, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.17578125, "step": 2514, "time_per_iteration": 3.2983970642089844 }, { "auxiliary_loss_clip": 0.01597282, "auxiliary_loss_mlp": 0.0104902, "balance_loss_clip": 1.3673358, "balance_loss_mlp": 1.02008843, "epoch": 0.15120998045994288, "flos": 20531349624960.0, "grad_norm": 2.4441521147422716, "language_loss": 0.74837184, "learning_rate": 3.8478522016704916e-06, "loss": 0.77483481, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28930664, "step": 2515, "time_per_iteration": 2.829244613647461 }, { "auxiliary_loss_clip": 0.01585766, "auxiliary_loss_mlp": 0.01044615, "balance_loss_clip": 1.35671401, "balance_loss_mlp": 1.01710105, "epoch": 0.15127010371261085, "flos": 21188840083200.0, "grad_norm": 2.140563342594692, "language_loss": 0.78517151, "learning_rate": 3.8477031698961325e-06, "loss": 0.81147528, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27514648, "step": 2516, "time_per_iteration": 2.871809959411621 }, { "auxiliary_loss_clip": 0.01357589, "auxiliary_loss_mlp": 0.01023917, "balance_loss_clip": 1.21955538, "balance_loss_mlp": 1.00417638, "epoch": 0.1513302269652788, "flos": 65349958273920.0, "grad_norm": 0.7381256305676883, "language_loss": 0.54631591, "learning_rate": 3.8475540680566835e-06, "loss": 0.570131, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.19726562, "step": 2517, "time_per_iteration": 3.304560661315918 }, { "auxiliary_loss_clip": 0.01594815, "auxiliary_loss_mlp": 0.01051012, "balance_loss_clip": 1.3648839, "balance_loss_mlp": 1.0225215, "epoch": 0.15139035021794678, "flos": 19145279848320.0, "grad_norm": 2.8061316718824196, "language_loss": 0.79389095, "learning_rate": 3.8474048961577995e-06, "loss": 0.82034922, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28491211, "step": 2518, "time_per_iteration": 2.8398139476776123 }, { "auxiliary_loss_clip": 0.01618596, "auxiliary_loss_mlp": 0.01059716, "balance_loss_clip": 1.38198149, "balance_loss_mlp": 1.031165, "epoch": 0.15145047347061477, "flos": 26589855594240.0, "grad_norm": 1.9058183080191238, "language_loss": 0.71580529, "learning_rate": 3.847255654205137e-06, "loss": 0.74258846, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.28552246, "step": 2519, "time_per_iteration": 2.8662071228027344 }, { "auxiliary_loss_clip": 0.01603511, "auxiliary_loss_mlp": 0.0104319, "balance_loss_clip": 1.3705864, "balance_loss_mlp": 1.01554501, "epoch": 0.15151059672328274, "flos": 20312608423680.0, "grad_norm": 1.9077953421611842, "language_loss": 0.80221963, "learning_rate": 3.847106342204354e-06, "loss": 0.82868671, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27624512, "step": 2520, "time_per_iteration": 2.944742441177368 }, { "auxiliary_loss_clip": 0.01609853, "auxiliary_loss_mlp": 0.01052924, "balance_loss_clip": 1.373945, "balance_loss_mlp": 1.02457631, "epoch": 0.1515707199759507, "flos": 27238025623680.0, "grad_norm": 1.7239883373332952, "language_loss": 0.76870579, "learning_rate": 3.846956960161114e-06, "loss": 0.79533356, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.28308105, "step": 2521, "time_per_iteration": 2.9181275367736816 }, { "auxiliary_loss_clip": 0.01603707, "auxiliary_loss_mlp": 0.01062517, "balance_loss_clip": 1.36643648, "balance_loss_mlp": 1.03414559, "epoch": 0.15163084322861867, "flos": 23598024698880.0, "grad_norm": 2.0711503737590036, "language_loss": 0.83304751, "learning_rate": 3.84680750808108e-06, "loss": 0.85970974, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.28393555, "step": 2522, "time_per_iteration": 2.8675577640533447 }, { "auxiliary_loss_clip": 0.0134799, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.21232629, "balance_loss_mlp": 1.01100564, "epoch": 0.15169096648128663, "flos": 66919360556160.0, "grad_norm": 0.828044025361081, "language_loss": 0.57985806, "learning_rate": 3.846657985969922e-06, "loss": 0.60370553, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.2578125, "step": 2523, "time_per_iteration": 3.4418258666992188 }, { "auxiliary_loss_clip": 0.01591048, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.36251783, "balance_loss_mlp": 1.02680027, "epoch": 0.1517510897339546, "flos": 29107211823360.0, "grad_norm": 7.081408362405522, "language_loss": 0.75517678, "learning_rate": 3.8465083938333066e-06, "loss": 0.78163397, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27880859, "step": 2524, "time_per_iteration": 2.9421255588531494 }, { "auxiliary_loss_clip": 0.01612422, "auxiliary_loss_mlp": 0.01059374, "balance_loss_clip": 1.37821436, "balance_loss_mlp": 1.03201509, "epoch": 0.1518112129866226, "flos": 18415931368320.0, "grad_norm": 1.9758242374627144, "language_loss": 0.7601856, "learning_rate": 3.8463587316769085e-06, "loss": 0.78690362, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27392578, "step": 2525, "time_per_iteration": 2.871349334716797 }, { "auxiliary_loss_clip": 0.0161856, "auxiliary_loss_mlp": 0.01062267, "balance_loss_clip": 1.38014829, "balance_loss_mlp": 1.03260744, "epoch": 0.15187133623929056, "flos": 19434340748160.0, "grad_norm": 1.6956979987806005, "language_loss": 0.81109738, "learning_rate": 3.846208999506402e-06, "loss": 0.83790565, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.29663086, "step": 2526, "time_per_iteration": 4.267542839050293 }, { "auxiliary_loss_clip": 0.0160476, "auxiliary_loss_mlp": 0.01051937, "balance_loss_clip": 1.37509894, "balance_loss_mlp": 1.02516246, "epoch": 0.15193145949195852, "flos": 17575063159680.0, "grad_norm": 1.5872837807902482, "language_loss": 0.85716796, "learning_rate": 3.846059197327466e-06, "loss": 0.88373482, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26782227, "step": 2527, "time_per_iteration": 2.932478427886963 }, { "auxiliary_loss_clip": 0.01615607, "auxiliary_loss_mlp": 0.0105715, "balance_loss_clip": 1.38213837, "balance_loss_mlp": 1.0288496, "epoch": 0.15199158274462649, "flos": 36190551934080.0, "grad_norm": 1.893735875856623, "language_loss": 0.70337021, "learning_rate": 3.845909325145779e-06, "loss": 0.73009777, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.28320312, "step": 2528, "time_per_iteration": 2.9962005615234375 }, { "auxiliary_loss_clip": 0.01600693, "auxiliary_loss_mlp": 0.01055812, "balance_loss_clip": 1.37083828, "balance_loss_mlp": 1.0283457, "epoch": 0.15205170599729445, "flos": 23083933570560.0, "grad_norm": 1.7599441703041991, "language_loss": 0.88067943, "learning_rate": 3.845759382967026e-06, "loss": 0.90724444, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.27429199, "step": 2529, "time_per_iteration": 2.877779960632324 }, { "auxiliary_loss_clip": 0.01598379, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.36963797, "balance_loss_mlp": 1.02368271, "epoch": 0.15211182924996242, "flos": 21918460032000.0, "grad_norm": 1.900061628596223, "language_loss": 0.84238911, "learning_rate": 3.845609370796893e-06, "loss": 0.86889184, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28222656, "step": 2530, "time_per_iteration": 2.8227410316467285 }, { "auxiliary_loss_clip": 0.01608967, "auxiliary_loss_mlp": 0.01059353, "balance_loss_clip": 1.37574029, "balance_loss_mlp": 1.03012323, "epoch": 0.15217195250263038, "flos": 13889518704000.0, "grad_norm": 1.9667472561004713, "language_loss": 0.81538713, "learning_rate": 3.845459288641066e-06, "loss": 0.84207034, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.29248047, "step": 2531, "time_per_iteration": 2.8331246376037598 }, { "auxiliary_loss_clip": 0.01602951, "auxiliary_loss_mlp": 0.0105516, "balance_loss_clip": 1.37220788, "balance_loss_mlp": 1.02752745, "epoch": 0.15223207575529837, "flos": 24546069135360.0, "grad_norm": 1.6903550646707037, "language_loss": 0.79937828, "learning_rate": 3.8453091365052394e-06, "loss": 0.82595944, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27624512, "step": 2532, "time_per_iteration": 4.296976804733276 }, { "auxiliary_loss_clip": 0.01607795, "auxiliary_loss_mlp": 0.01054614, "balance_loss_clip": 1.37674105, "balance_loss_mlp": 1.0247159, "epoch": 0.15229219900796634, "flos": 25567781385600.0, "grad_norm": 1.801418016022388, "language_loss": 0.88761526, "learning_rate": 3.845158914395105e-06, "loss": 0.91423935, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29907227, "step": 2533, "time_per_iteration": 4.333013534545898 }, { "auxiliary_loss_clip": 0.01601476, "auxiliary_loss_mlp": 0.01051406, "balance_loss_clip": 1.3673563, "balance_loss_mlp": 1.02081656, "epoch": 0.1523523222606343, "flos": 18225495429120.0, "grad_norm": 2.279287925521899, "language_loss": 0.80412591, "learning_rate": 3.84500862231636e-06, "loss": 0.83065474, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.30541992, "step": 2534, "time_per_iteration": 4.271279335021973 }, { "auxiliary_loss_clip": 0.0161644, "auxiliary_loss_mlp": 0.01054225, "balance_loss_clip": 1.37415671, "balance_loss_mlp": 1.02430367, "epoch": 0.15241244551330227, "flos": 13267617920640.0, "grad_norm": 2.4348671456241715, "language_loss": 0.78204334, "learning_rate": 3.844858260274702e-06, "loss": 0.80875003, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 2.41992188, "router_z_loss_mlp": 0.29907227, "step": 2535, "time_per_iteration": 2.8210387229919434 }, { "auxiliary_loss_clip": 0.01621277, "auxiliary_loss_mlp": 0.01054582, "balance_loss_clip": 1.38071632, "balance_loss_mlp": 1.02399278, "epoch": 0.15247256876597023, "flos": 19724125564800.0, "grad_norm": 2.158950239940822, "language_loss": 0.78930646, "learning_rate": 3.844707828275835e-06, "loss": 0.81606501, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.3059082, "step": 2536, "time_per_iteration": 2.8771963119506836 }, { "auxiliary_loss_clip": 0.01601775, "auxiliary_loss_mlp": 0.01044813, "balance_loss_clip": 1.3705709, "balance_loss_mlp": 1.01629794, "epoch": 0.1525326920186382, "flos": 20385507075840.0, "grad_norm": 2.631227871317481, "language_loss": 0.76710749, "learning_rate": 3.844557326325461e-06, "loss": 0.79357326, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28503418, "step": 2537, "time_per_iteration": 2.831979751586914 }, { "auxiliary_loss_clip": 0.01610423, "auxiliary_loss_mlp": 0.01058816, "balance_loss_clip": 1.37476563, "balance_loss_mlp": 1.02839398, "epoch": 0.15259281527130616, "flos": 13597969340160.0, "grad_norm": 3.3427726505628166, "language_loss": 0.78977305, "learning_rate": 3.8444067544292896e-06, "loss": 0.81646544, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.30395508, "step": 2538, "time_per_iteration": 2.8561840057373047 }, { "auxiliary_loss_clip": 0.01606132, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.37579846, "balance_loss_mlp": 1.02053642, "epoch": 0.15265293852397416, "flos": 22871571886080.0, "grad_norm": 2.0223592652886597, "language_loss": 0.90815651, "learning_rate": 3.844256112593029e-06, "loss": 0.93471807, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.29492188, "step": 2539, "time_per_iteration": 2.8530733585357666 }, { "auxiliary_loss_clip": 0.01620063, "auxiliary_loss_mlp": 0.01051825, "balance_loss_clip": 1.38335359, "balance_loss_mlp": 1.02321482, "epoch": 0.15271306177664212, "flos": 29249072830080.0, "grad_norm": 1.7946378020313043, "language_loss": 0.94076133, "learning_rate": 3.844105400822391e-06, "loss": 0.96748018, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28613281, "step": 2540, "time_per_iteration": 2.9425933361053467 }, { "auxiliary_loss_clip": 0.01613366, "auxiliary_loss_mlp": 0.0105178, "balance_loss_clip": 1.3796804, "balance_loss_mlp": 1.02357519, "epoch": 0.1527731850293101, "flos": 31258129265280.0, "grad_norm": 1.636194491730063, "language_loss": 0.76019597, "learning_rate": 3.843954619123092e-06, "loss": 0.78684747, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.2824707, "step": 2541, "time_per_iteration": 2.909701347351074 }, { "auxiliary_loss_clip": 0.01601418, "auxiliary_loss_mlp": 0.01060243, "balance_loss_clip": 1.36797404, "balance_loss_mlp": 1.02972567, "epoch": 0.15283330828197805, "flos": 22392120291840.0, "grad_norm": 1.589992988885785, "language_loss": 0.8223393, "learning_rate": 3.84380376750085e-06, "loss": 0.84895593, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.30493164, "step": 2542, "time_per_iteration": 2.8625752925872803 }, { "auxiliary_loss_clip": 0.01622406, "auxiliary_loss_mlp": 0.01054275, "balance_loss_clip": 1.38269484, "balance_loss_mlp": 1.02437687, "epoch": 0.15289343153464602, "flos": 25531060590720.0, "grad_norm": 3.143938363340901, "language_loss": 0.79537344, "learning_rate": 3.843652845961383e-06, "loss": 0.82214022, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.29882812, "step": 2543, "time_per_iteration": 2.9173190593719482 }, { "auxiliary_loss_clip": 0.01612847, "auxiliary_loss_mlp": 0.01049985, "balance_loss_clip": 1.3794055, "balance_loss_mlp": 1.02175641, "epoch": 0.15295355478731398, "flos": 22720028492160.0, "grad_norm": 1.9130884976258637, "language_loss": 0.87528157, "learning_rate": 3.843501854510416e-06, "loss": 0.90190995, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.2824707, "step": 2544, "time_per_iteration": 2.8403937816619873 }, { "auxiliary_loss_clip": 0.01641615, "auxiliary_loss_mlp": 0.0106493, "balance_loss_clip": 1.39800692, "balance_loss_mlp": 1.03419816, "epoch": 0.15301367803998198, "flos": 23260660335360.0, "grad_norm": 2.3069622516942787, "language_loss": 0.83658034, "learning_rate": 3.843350793153673e-06, "loss": 0.86364579, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.30712891, "step": 2545, "time_per_iteration": 2.8957438468933105 }, { "auxiliary_loss_clip": 0.01625337, "auxiliary_loss_mlp": 0.01070759, "balance_loss_clip": 1.38932848, "balance_loss_mlp": 1.04064715, "epoch": 0.15307380129264994, "flos": 25897635112320.0, "grad_norm": 2.043272240600342, "language_loss": 0.71978742, "learning_rate": 3.843199661896884e-06, "loss": 0.74674839, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.30090332, "step": 2546, "time_per_iteration": 2.917083263397217 }, { "auxiliary_loss_clip": 0.01618081, "auxiliary_loss_mlp": 0.01060333, "balance_loss_clip": 1.38059497, "balance_loss_mlp": 1.03177083, "epoch": 0.1531339245453179, "flos": 46989596799360.0, "grad_norm": 1.9186969118974935, "language_loss": 0.79109573, "learning_rate": 3.843048460745779e-06, "loss": 0.81787992, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.28515625, "step": 2547, "time_per_iteration": 3.1293866634368896 }, { "auxiliary_loss_clip": 0.0162461, "auxiliary_loss_mlp": 0.01068745, "balance_loss_clip": 1.38408995, "balance_loss_mlp": 1.03748846, "epoch": 0.15319404779798587, "flos": 35895654455040.0, "grad_norm": 1.98241495911055, "language_loss": 0.75145614, "learning_rate": 3.842897189706092e-06, "loss": 0.77838975, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.3125, "step": 2548, "time_per_iteration": 3.0342581272125244 }, { "auxiliary_loss_clip": 0.01606617, "auxiliary_loss_mlp": 0.01059911, "balance_loss_clip": 1.37318587, "balance_loss_mlp": 1.02979875, "epoch": 0.15325417105065384, "flos": 25675229082240.0, "grad_norm": 5.4083055913706595, "language_loss": 0.81362462, "learning_rate": 3.842745848783558e-06, "loss": 0.84028983, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.30078125, "step": 2549, "time_per_iteration": 2.9085745811462402 }, { "auxiliary_loss_clip": 0.01630905, "auxiliary_loss_mlp": 0.01053578, "balance_loss_clip": 1.39250612, "balance_loss_mlp": 1.02453828, "epoch": 0.1533142943033218, "flos": 18780153160320.0, "grad_norm": 2.158907651393464, "language_loss": 0.76123041, "learning_rate": 3.842594437983917e-06, "loss": 0.78807521, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.29003906, "step": 2550, "time_per_iteration": 2.8572804927825928 }, { "auxiliary_loss_clip": 0.0163423, "auxiliary_loss_mlp": 0.0104573, "balance_loss_clip": 1.39037204, "balance_loss_mlp": 1.01542664, "epoch": 0.15337441755598977, "flos": 23116446599040.0, "grad_norm": 2.274522318010894, "language_loss": 0.78222334, "learning_rate": 3.8424429573129115e-06, "loss": 0.8090229, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30273438, "step": 2551, "time_per_iteration": 2.8460793495178223 }, { "auxiliary_loss_clip": 0.01363331, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.22314072, "balance_loss_mlp": 1.0220269, "epoch": 0.15343454080865776, "flos": 59892422728320.0, "grad_norm": 0.9713160147500561, "language_loss": 0.56856275, "learning_rate": 3.842291406776283e-06, "loss": 0.59258044, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.1640625, "step": 2552, "time_per_iteration": 3.309238910675049 }, { "auxiliary_loss_clip": 0.01640009, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.398772, "balance_loss_mlp": 1.02129292, "epoch": 0.15349466406132573, "flos": 11918268938880.0, "grad_norm": 1.8297669066218012, "language_loss": 0.90286374, "learning_rate": 3.84213978637978e-06, "loss": 0.92979121, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 2.4140625, "router_z_loss_mlp": 0.31445312, "step": 2553, "time_per_iteration": 2.8197031021118164 }, { "auxiliary_loss_clip": 0.01658465, "auxiliary_loss_mlp": 0.01056023, "balance_loss_clip": 1.41397941, "balance_loss_mlp": 1.02438462, "epoch": 0.1535547873139937, "flos": 24107410368000.0, "grad_norm": 2.566335381084273, "language_loss": 0.79279679, "learning_rate": 3.841988096129152e-06, "loss": 0.81994164, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 2.4453125, "router_z_loss_mlp": 0.31665039, "step": 2554, "time_per_iteration": 2.896726131439209 }, { "auxiliary_loss_clip": 0.01654047, "auxiliary_loss_mlp": 0.01056247, "balance_loss_clip": 1.4095118, "balance_loss_mlp": 1.02437091, "epoch": 0.15361491056666166, "flos": 17575379873280.0, "grad_norm": 2.2512903173466223, "language_loss": 0.79565585, "learning_rate": 3.841836336030151e-06, "loss": 0.82275879, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 2.44335938, "router_z_loss_mlp": 0.31860352, "step": 2555, "time_per_iteration": 2.859384298324585 }, { "auxiliary_loss_clip": 0.01631185, "auxiliary_loss_mlp": 0.01049292, "balance_loss_clip": 1.39451671, "balance_loss_mlp": 1.01953769, "epoch": 0.15367503381932962, "flos": 25056902638080.0, "grad_norm": 1.4847337663701996, "language_loss": 0.7829572, "learning_rate": 3.8416845060885305e-06, "loss": 0.80976188, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.29760742, "step": 2556, "time_per_iteration": 2.873593330383301 }, { "auxiliary_loss_clip": 0.01632628, "auxiliary_loss_mlp": 0.01045278, "balance_loss_clip": 1.39608479, "balance_loss_mlp": 1.01392555, "epoch": 0.15373515707199759, "flos": 21517381710720.0, "grad_norm": 1.8732318244536432, "language_loss": 0.9135133, "learning_rate": 3.84153260631005e-06, "loss": 0.94029236, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.31347656, "step": 2557, "time_per_iteration": 2.8639166355133057 }, { "auxiliary_loss_clip": 0.01648201, "auxiliary_loss_mlp": 0.01050964, "balance_loss_clip": 1.40629637, "balance_loss_mlp": 1.01982677, "epoch": 0.15379528032466555, "flos": 26005851970560.0, "grad_norm": 1.9350034792994426, "language_loss": 0.71392351, "learning_rate": 3.841380636700468e-06, "loss": 0.74091518, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.31152344, "step": 2558, "time_per_iteration": 2.8799209594726562 }, { "auxiliary_loss_clip": 0.01649946, "auxiliary_loss_mlp": 0.0104901, "balance_loss_clip": 1.40728509, "balance_loss_mlp": 1.01782441, "epoch": 0.15385540357733354, "flos": 19285647776640.0, "grad_norm": 2.1118031545197593, "language_loss": 0.92297351, "learning_rate": 3.841228597265548e-06, "loss": 0.94996303, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 2.42578125, "router_z_loss_mlp": 0.31201172, "step": 2559, "time_per_iteration": 2.8617568016052246 }, { "auxiliary_loss_clip": 0.01659146, "auxiliary_loss_mlp": 0.01056209, "balance_loss_clip": 1.41513181, "balance_loss_mlp": 1.02387977, "epoch": 0.1539155268300015, "flos": 28561014869760.0, "grad_norm": 3.3760310849836648, "language_loss": 0.6536662, "learning_rate": 3.841076488011055e-06, "loss": 0.68081981, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 2.43945312, "router_z_loss_mlp": 0.32299805, "step": 2560, "time_per_iteration": 4.383769512176514 }, { "auxiliary_loss_clip": 0.01664383, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.4165231, "balance_loss_mlp": 1.02185559, "epoch": 0.15397565008266947, "flos": 23557774809600.0, "grad_norm": 1.5144029758520932, "language_loss": 0.88866031, "learning_rate": 3.8409243089427574e-06, "loss": 0.91585737, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.33496094, "step": 2561, "time_per_iteration": 2.88041615486145 }, { "auxiliary_loss_clip": 0.01631185, "auxiliary_loss_mlp": 0.01046782, "balance_loss_clip": 1.39907479, "balance_loss_mlp": 1.01671767, "epoch": 0.15403577333533744, "flos": 17138938101120.0, "grad_norm": 1.7497227769065473, "language_loss": 0.84260285, "learning_rate": 3.840772060066425e-06, "loss": 0.8693825, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.30102539, "step": 2562, "time_per_iteration": 2.9252164363861084 }, { "auxiliary_loss_clip": 0.01667845, "auxiliary_loss_mlp": 0.01057996, "balance_loss_clip": 1.41801012, "balance_loss_mlp": 1.02356863, "epoch": 0.1540958965880054, "flos": 17903423808000.0, "grad_norm": 1.6810025159445527, "language_loss": 0.75984597, "learning_rate": 3.840619741387832e-06, "loss": 0.78710437, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 2.50195312, "router_z_loss_mlp": 0.34423828, "step": 2563, "time_per_iteration": 2.8953909873962402 }, { "auxiliary_loss_clip": 0.01675115, "auxiliary_loss_mlp": 0.01047234, "balance_loss_clip": 1.42406607, "balance_loss_mlp": 1.01404631, "epoch": 0.15415601984067337, "flos": 32173841652480.0, "grad_norm": 2.7384725173102673, "language_loss": 0.77780342, "learning_rate": 3.8404673529127534e-06, "loss": 0.80502689, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 2.51171875, "router_z_loss_mlp": 0.33203125, "step": 2564, "time_per_iteration": 2.9364709854125977 }, { "auxiliary_loss_clip": 0.01646751, "auxiliary_loss_mlp": 0.01056328, "balance_loss_clip": 1.40505695, "balance_loss_mlp": 1.02557158, "epoch": 0.15421614309334136, "flos": 24035009408640.0, "grad_norm": 1.8395371730323034, "language_loss": 0.71759623, "learning_rate": 3.840314894646969e-06, "loss": 0.744627, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.30737305, "step": 2565, "time_per_iteration": 2.9317879676818848 }, { "auxiliary_loss_clip": 0.0164565, "auxiliary_loss_mlp": 0.01053375, "balance_loss_clip": 1.40488219, "balance_loss_mlp": 1.02157009, "epoch": 0.15427626634600933, "flos": 24396199799040.0, "grad_norm": 2.1125994759249704, "language_loss": 0.73441565, "learning_rate": 3.840162366596259e-06, "loss": 0.76140594, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.31787109, "step": 2566, "time_per_iteration": 2.8782708644866943 }, { "auxiliary_loss_clip": 0.0163274, "auxiliary_loss_mlp": 0.01051025, "balance_loss_clip": 1.39658511, "balance_loss_mlp": 1.01969671, "epoch": 0.1543363895986773, "flos": 23342019765120.0, "grad_norm": 1.7208219148838477, "language_loss": 0.86316633, "learning_rate": 3.840009768766408e-06, "loss": 0.89000404, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.31347656, "step": 2567, "time_per_iteration": 4.276357173919678 }, { "auxiliary_loss_clip": 0.01637526, "auxiliary_loss_mlp": 0.01056177, "balance_loss_clip": 1.40028715, "balance_loss_mlp": 1.02556372, "epoch": 0.15439651285134526, "flos": 24283503705600.0, "grad_norm": 3.011891565577649, "language_loss": 0.79423457, "learning_rate": 3.839857101163202e-06, "loss": 0.82117158, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.3059082, "step": 2568, "time_per_iteration": 2.8969149589538574 }, { "auxiliary_loss_clip": 0.01648715, "auxiliary_loss_mlp": 0.01049655, "balance_loss_clip": 1.4097954, "balance_loss_mlp": 1.01875579, "epoch": 0.15445663610401322, "flos": 22466512022400.0, "grad_norm": 1.760446726466699, "language_loss": 0.70999748, "learning_rate": 3.83970436379243e-06, "loss": 0.73698115, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.30883789, "step": 2569, "time_per_iteration": 5.690165281295776 }, { "auxiliary_loss_clip": 0.0164007, "auxiliary_loss_mlp": 0.01049718, "balance_loss_clip": 1.40339041, "balance_loss_mlp": 1.01951027, "epoch": 0.1545167593566812, "flos": 22058375512320.0, "grad_norm": 1.7697668783169271, "language_loss": 0.77817261, "learning_rate": 3.839551556659884e-06, "loss": 0.80507052, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.30175781, "step": 2570, "time_per_iteration": 2.8607263565063477 }, { "auxiliary_loss_clip": 0.01642218, "auxiliary_loss_mlp": 0.01048776, "balance_loss_clip": 1.40605521, "balance_loss_mlp": 1.01894951, "epoch": 0.15457688260934915, "flos": 19327752702720.0, "grad_norm": 2.4000825507321344, "language_loss": 0.7923736, "learning_rate": 3.839398679771359e-06, "loss": 0.81928355, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.29833984, "step": 2571, "time_per_iteration": 2.845846176147461 }, { "auxiliary_loss_clip": 0.01645298, "auxiliary_loss_mlp": 0.0105296, "balance_loss_clip": 1.40611112, "balance_loss_mlp": 1.0237056, "epoch": 0.15463700586201715, "flos": 24144628855680.0, "grad_norm": 1.8250865490177208, "language_loss": 0.83798134, "learning_rate": 3.839245733132652e-06, "loss": 0.86496389, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.29272461, "step": 2572, "time_per_iteration": 2.9069573879241943 }, { "auxiliary_loss_clip": 0.01646523, "auxiliary_loss_mlp": 0.01054361, "balance_loss_clip": 1.4043386, "balance_loss_mlp": 1.02446294, "epoch": 0.1546971291146851, "flos": 22431601019520.0, "grad_norm": 1.8138126767720473, "language_loss": 0.91531181, "learning_rate": 3.839092716749563e-06, "loss": 0.94232064, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 2.421875, "router_z_loss_mlp": 0.29931641, "step": 2573, "time_per_iteration": 2.816009759902954 }, { "auxiliary_loss_clip": 0.0163679, "auxiliary_loss_mlp": 0.01049755, "balance_loss_clip": 1.39792073, "balance_loss_mlp": 1.01918936, "epoch": 0.15475725236735308, "flos": 17539202016000.0, "grad_norm": 2.8787973916440524, "language_loss": 0.71869111, "learning_rate": 3.838939630627893e-06, "loss": 0.74555653, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.30566406, "step": 2574, "time_per_iteration": 2.8743228912353516 }, { "auxiliary_loss_clip": 0.01648084, "auxiliary_loss_mlp": 0.01052061, "balance_loss_clip": 1.40736151, "balance_loss_mlp": 1.02137685, "epoch": 0.15481737562002104, "flos": 22571109296640.0, "grad_norm": 1.7093769091606277, "language_loss": 0.8355512, "learning_rate": 3.838786474773448e-06, "loss": 0.86255264, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.30664062, "step": 2575, "time_per_iteration": 2.91850209236145 }, { "auxiliary_loss_clip": 0.01618323, "auxiliary_loss_mlp": 0.01053939, "balance_loss_clip": 1.38124096, "balance_loss_mlp": 1.0240413, "epoch": 0.154877498872689, "flos": 24911512536960.0, "grad_norm": 2.2171883783522297, "language_loss": 0.85930359, "learning_rate": 3.838633249192036e-06, "loss": 0.88602626, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.29833984, "step": 2576, "time_per_iteration": 2.93196177482605 }, { "auxiliary_loss_clip": 0.01621224, "auxiliary_loss_mlp": 0.01050313, "balance_loss_clip": 1.38507676, "balance_loss_mlp": 1.02184582, "epoch": 0.15493762212535697, "flos": 28159212631680.0, "grad_norm": 1.5179753885282103, "language_loss": 0.83058643, "learning_rate": 3.838479953889465e-06, "loss": 0.85730177, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.28491211, "step": 2577, "time_per_iteration": 2.9156129360198975 }, { "auxiliary_loss_clip": 0.016335, "auxiliary_loss_mlp": 0.01052816, "balance_loss_clip": 1.396119, "balance_loss_mlp": 1.02291846, "epoch": 0.15499774537802496, "flos": 25422074570880.0, "grad_norm": 2.1036529255703185, "language_loss": 0.78839374, "learning_rate": 3.8383265888715525e-06, "loss": 0.81525689, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.29907227, "step": 2578, "time_per_iteration": 2.8711061477661133 }, { "auxiliary_loss_clip": 0.0163146, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.38903952, "balance_loss_mlp": 1.02245867, "epoch": 0.15505786863069293, "flos": 22101883027200.0, "grad_norm": 2.175117819425214, "language_loss": 0.83617848, "learning_rate": 3.83817315414411e-06, "loss": 0.8630231, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.30493164, "step": 2579, "time_per_iteration": 2.898716926574707 }, { "auxiliary_loss_clip": 0.01643176, "auxiliary_loss_mlp": 0.01057406, "balance_loss_clip": 1.4018507, "balance_loss_mlp": 1.02781808, "epoch": 0.1551179918833609, "flos": 18926583891840.0, "grad_norm": 1.6545136227095534, "language_loss": 0.81470209, "learning_rate": 3.838019649712958e-06, "loss": 0.84170789, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.29541016, "step": 2580, "time_per_iteration": 2.8383419513702393 }, { "auxiliary_loss_clip": 0.01367625, "auxiliary_loss_mlp": 0.01069783, "balance_loss_clip": 1.22730815, "balance_loss_mlp": 1.05214047, "epoch": 0.15517811513602886, "flos": 66270376120320.0, "grad_norm": 0.8451640433253461, "language_loss": 0.58839059, "learning_rate": 3.8378660755839166e-06, "loss": 0.61276466, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 0.17675781, "step": 2581, "time_per_iteration": 3.509321928024292 }, { "auxiliary_loss_clip": 0.01636611, "auxiliary_loss_mlp": 0.01053532, "balance_loss_clip": 1.39536929, "balance_loss_mlp": 1.02384901, "epoch": 0.15523823838869683, "flos": 24030801642240.0, "grad_norm": 1.8538154073485402, "language_loss": 0.85848027, "learning_rate": 3.8377124317628095e-06, "loss": 0.88538164, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.29711914, "step": 2582, "time_per_iteration": 2.8957931995391846 }, { "auxiliary_loss_clip": 0.01640808, "auxiliary_loss_mlp": 0.01060677, "balance_loss_clip": 1.39889836, "balance_loss_mlp": 1.03011131, "epoch": 0.1552983616413648, "flos": 20494674074880.0, "grad_norm": 2.1890321762433214, "language_loss": 0.79397821, "learning_rate": 3.8375587182554625e-06, "loss": 0.82099301, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 2.41601562, "router_z_loss_mlp": 0.30541992, "step": 2583, "time_per_iteration": 2.997246265411377 }, { "auxiliary_loss_clip": 0.01647022, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.40578878, "balance_loss_mlp": 1.02166963, "epoch": 0.15535848489403276, "flos": 32136261206400.0, "grad_norm": 4.0253241972033, "language_loss": 0.76962703, "learning_rate": 3.837404935067705e-06, "loss": 0.7966153, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.30126953, "step": 2584, "time_per_iteration": 3.0020172595977783 }, { "auxiliary_loss_clip": 0.01642277, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.39979959, "balance_loss_mlp": 1.01943612, "epoch": 0.15541860814670075, "flos": 19107654157440.0, "grad_norm": 1.644858288043672, "language_loss": 0.76656365, "learning_rate": 3.837251082205368e-06, "loss": 0.79348123, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 2.4296875, "router_z_loss_mlp": 0.30053711, "step": 2585, "time_per_iteration": 2.8652732372283936 }, { "auxiliary_loss_clip": 0.01631761, "auxiliary_loss_mlp": 0.0104904, "balance_loss_clip": 1.39608371, "balance_loss_mlp": 1.02066827, "epoch": 0.1554787313993687, "flos": 19181502950400.0, "grad_norm": 2.0707799477094913, "language_loss": 0.62286085, "learning_rate": 3.837097159674286e-06, "loss": 0.64966881, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.28344727, "step": 2586, "time_per_iteration": 2.8492679595947266 }, { "auxiliary_loss_clip": 0.01636353, "auxiliary_loss_mlp": 0.01053844, "balance_loss_clip": 1.39523315, "balance_loss_mlp": 1.0229454, "epoch": 0.15553885465203668, "flos": 16152453567360.0, "grad_norm": 1.6819079601048255, "language_loss": 0.82534117, "learning_rate": 3.836943167480296e-06, "loss": 0.85224313, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.30908203, "step": 2587, "time_per_iteration": 2.8134825229644775 }, { "auxiliary_loss_clip": 0.0165496, "auxiliary_loss_mlp": 0.01056467, "balance_loss_clip": 1.40857446, "balance_loss_mlp": 1.02592576, "epoch": 0.15559897790470464, "flos": 25348678225920.0, "grad_norm": 2.761328846378903, "language_loss": 0.90401161, "learning_rate": 3.836789105629236e-06, "loss": 0.93112588, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 2.4609375, "router_z_loss_mlp": 0.30541992, "step": 2588, "time_per_iteration": 2.870312213897705 }, { "auxiliary_loss_clip": 0.01632087, "auxiliary_loss_mlp": 0.01053354, "balance_loss_clip": 1.3934195, "balance_loss_mlp": 1.02357531, "epoch": 0.1556591011573726, "flos": 23159139707520.0, "grad_norm": 2.1257687359530637, "language_loss": 0.66311598, "learning_rate": 3.83663497412695e-06, "loss": 0.68997037, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 2.38476562, "router_z_loss_mlp": 0.29785156, "step": 2589, "time_per_iteration": 2.8901350498199463 }, { "auxiliary_loss_clip": 0.01627282, "auxiliary_loss_mlp": 0.01051066, "balance_loss_clip": 1.38932514, "balance_loss_mlp": 1.0210017, "epoch": 0.15571922441004057, "flos": 25380919785600.0, "grad_norm": 1.776521864989682, "language_loss": 0.84029883, "learning_rate": 3.836480772979281e-06, "loss": 0.8670823, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.30053711, "step": 2590, "time_per_iteration": 2.872222423553467 }, { "auxiliary_loss_clip": 0.0163915, "auxiliary_loss_mlp": 0.01056081, "balance_loss_clip": 1.3979435, "balance_loss_mlp": 1.02601647, "epoch": 0.15577934766270854, "flos": 14509428716160.0, "grad_norm": 7.249075988758377, "language_loss": 0.81759346, "learning_rate": 3.836326502192077e-06, "loss": 0.84454572, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.30029297, "step": 2591, "time_per_iteration": 2.824805498123169 }, { "auxiliary_loss_clip": 0.0162729, "auxiliary_loss_mlp": 0.01053223, "balance_loss_clip": 1.39095283, "balance_loss_mlp": 1.02476799, "epoch": 0.15583947091537653, "flos": 37428562166400.0, "grad_norm": 2.1291681946592553, "language_loss": 0.66590863, "learning_rate": 3.836172161771189e-06, "loss": 0.69271374, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28479004, "step": 2592, "time_per_iteration": 2.9606101512908936 }, { "auxiliary_loss_clip": 0.01659898, "auxiliary_loss_mlp": 0.01054634, "balance_loss_clip": 1.41462398, "balance_loss_mlp": 1.02594066, "epoch": 0.1558995941680445, "flos": 21844611239040.0, "grad_norm": 2.3890943494258385, "language_loss": 0.83769858, "learning_rate": 3.836017751722467e-06, "loss": 0.8648439, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 2.45117188, "router_z_loss_mlp": 0.28674316, "step": 2593, "time_per_iteration": 2.856794834136963 }, { "auxiliary_loss_clip": 0.01633834, "auxiliary_loss_mlp": 0.01053213, "balance_loss_clip": 1.39889145, "balance_loss_mlp": 1.02493644, "epoch": 0.15595971742071246, "flos": 19802272613760.0, "grad_norm": 1.9298025127565106, "language_loss": 0.74484283, "learning_rate": 3.8358632720517695e-06, "loss": 0.77171326, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.28295898, "step": 2594, "time_per_iteration": 2.8282275199890137 }, { "auxiliary_loss_clip": 0.01609529, "auxiliary_loss_mlp": 0.01054237, "balance_loss_clip": 1.37908435, "balance_loss_mlp": 1.02526891, "epoch": 0.15601984067338043, "flos": 26732847720960.0, "grad_norm": 1.9759041152572763, "language_loss": 0.82317543, "learning_rate": 3.835708722764952e-06, "loss": 0.8498131, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.28955078, "step": 2595, "time_per_iteration": 2.907241106033325 }, { "auxiliary_loss_clip": 0.01638377, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.39898801, "balance_loss_mlp": 1.027632, "epoch": 0.1560799639260484, "flos": 18378441411840.0, "grad_norm": 1.7251041729597314, "language_loss": 0.88243955, "learning_rate": 3.835554103867876e-06, "loss": 0.9093917, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.29199219, "step": 2596, "time_per_iteration": 4.3701560497283936 }, { "auxiliary_loss_clip": 0.01624927, "auxiliary_loss_mlp": 0.01051928, "balance_loss_clip": 1.39265728, "balance_loss_mlp": 1.02424717, "epoch": 0.15614008717871636, "flos": 22608554008320.0, "grad_norm": 1.6739902056060947, "language_loss": 0.69295347, "learning_rate": 3.835399415366404e-06, "loss": 0.71972203, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27697754, "step": 2597, "time_per_iteration": 2.8819804191589355 }, { "auxiliary_loss_clip": 0.01600648, "auxiliary_loss_mlp": 0.01060049, "balance_loss_clip": 1.37275147, "balance_loss_mlp": 1.03115249, "epoch": 0.15620021043138435, "flos": 22756975511040.0, "grad_norm": 1.6257081396852486, "language_loss": 0.80769765, "learning_rate": 3.8352446572664035e-06, "loss": 0.83430469, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28881836, "step": 2598, "time_per_iteration": 2.8694796562194824 }, { "auxiliary_loss_clip": 0.01601019, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.37291944, "balance_loss_mlp": 1.02422535, "epoch": 0.15626033368405232, "flos": 13122635022720.0, "grad_norm": 2.004055032550572, "language_loss": 0.84066558, "learning_rate": 3.8350898295737405e-06, "loss": 0.8672061, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.2878418, "step": 2599, "time_per_iteration": 2.8526926040649414 }, { "auxiliary_loss_clip": 0.01629077, "auxiliary_loss_mlp": 0.01052619, "balance_loss_clip": 1.38870692, "balance_loss_mlp": 1.02324617, "epoch": 0.15632045693672028, "flos": 16481040439680.0, "grad_norm": 1.9435163368768915, "language_loss": 0.82507885, "learning_rate": 3.834934932294287e-06, "loss": 0.85189581, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.29345703, "step": 2600, "time_per_iteration": 2.8128502368927 }, { "auxiliary_loss_clip": 0.0160758, "auxiliary_loss_mlp": 0.01056959, "balance_loss_clip": 1.37216425, "balance_loss_mlp": 1.02689457, "epoch": 0.15638058018938825, "flos": 20860162721280.0, "grad_norm": 3.314469505972626, "language_loss": 0.88979423, "learning_rate": 3.834779965433917e-06, "loss": 0.91643959, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.30004883, "step": 2601, "time_per_iteration": 4.260806322097778 }, { "auxiliary_loss_clip": 0.01623239, "auxiliary_loss_mlp": 0.01069632, "balance_loss_clip": 1.38507938, "balance_loss_mlp": 1.03861356, "epoch": 0.1564407034420562, "flos": 21882372664320.0, "grad_norm": 1.6580691003442998, "language_loss": 0.79554772, "learning_rate": 3.834624928998508e-06, "loss": 0.82247645, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.31005859, "step": 2602, "time_per_iteration": 2.904654026031494 }, { "auxiliary_loss_clip": 0.01614628, "auxiliary_loss_mlp": 0.01060934, "balance_loss_clip": 1.37922549, "balance_loss_mlp": 1.03089309, "epoch": 0.15650082669472418, "flos": 21844385015040.0, "grad_norm": 1.937439385571446, "language_loss": 0.75048679, "learning_rate": 3.8344698229939376e-06, "loss": 0.77724242, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.30065918, "step": 2603, "time_per_iteration": 4.331468105316162 }, { "auxiliary_loss_clip": 0.01614298, "auxiliary_loss_mlp": 0.01053195, "balance_loss_clip": 1.38080955, "balance_loss_mlp": 1.02320218, "epoch": 0.15656094994739214, "flos": 13806439971840.0, "grad_norm": 3.102604255606653, "language_loss": 0.88840109, "learning_rate": 3.8343146474260865e-06, "loss": 0.91507608, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.30029297, "step": 2604, "time_per_iteration": 4.3682026863098145 }, { "auxiliary_loss_clip": 0.01622214, "auxiliary_loss_mlp": 0.0105485, "balance_loss_clip": 1.38359237, "balance_loss_mlp": 1.02585793, "epoch": 0.15662107320006013, "flos": 27319656522240.0, "grad_norm": 2.7819181130353225, "language_loss": 0.85959208, "learning_rate": 3.834159402300841e-06, "loss": 0.88636273, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.28979492, "step": 2605, "time_per_iteration": 2.950221061706543 }, { "auxiliary_loss_clip": 0.01638175, "auxiliary_loss_mlp": 0.01052516, "balance_loss_clip": 1.39072037, "balance_loss_mlp": 1.02226114, "epoch": 0.1566811964527281, "flos": 26695629233280.0, "grad_norm": 3.287559990298224, "language_loss": 0.74554956, "learning_rate": 3.834004087624087e-06, "loss": 0.77245653, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 2.47265625, "router_z_loss_mlp": 0.30224609, "step": 2606, "time_per_iteration": 2.907951593399048 }, { "auxiliary_loss_clip": 0.01620137, "auxiliary_loss_mlp": 0.01048257, "balance_loss_clip": 1.38536084, "balance_loss_mlp": 1.01974177, "epoch": 0.15674131970539606, "flos": 16111027313280.0, "grad_norm": 2.9467967731541864, "language_loss": 0.77702117, "learning_rate": 3.8338487034017145e-06, "loss": 0.80370522, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28540039, "step": 2607, "time_per_iteration": 2.7910096645355225 }, { "auxiliary_loss_clip": 0.01611903, "auxiliary_loss_mlp": 0.01049577, "balance_loss_clip": 1.37883162, "balance_loss_mlp": 1.02056122, "epoch": 0.15680144295806403, "flos": 19178471548800.0, "grad_norm": 1.7571131684443941, "language_loss": 0.8295809, "learning_rate": 3.833693249639615e-06, "loss": 0.85619569, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.29003906, "step": 2608, "time_per_iteration": 2.850051164627075 }, { "auxiliary_loss_clip": 0.0162538, "auxiliary_loss_mlp": 0.01054015, "balance_loss_clip": 1.385234, "balance_loss_mlp": 1.02297282, "epoch": 0.156861566210732, "flos": 20823125212800.0, "grad_norm": 2.1172031564850733, "language_loss": 0.73623216, "learning_rate": 3.833537726343684e-06, "loss": 0.76302612, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.31030273, "step": 2609, "time_per_iteration": 2.816152334213257 }, { "auxiliary_loss_clip": 0.01619397, "auxiliary_loss_mlp": 0.01048576, "balance_loss_clip": 1.38121343, "balance_loss_mlp": 1.01844001, "epoch": 0.15692168946339996, "flos": 20057598875520.0, "grad_norm": 2.002125679261872, "language_loss": 0.73359919, "learning_rate": 3.833382133519818e-06, "loss": 0.76027894, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.30102539, "step": 2610, "time_per_iteration": 2.8395426273345947 }, { "auxiliary_loss_clip": 0.01625978, "auxiliary_loss_mlp": 0.01052899, "balance_loss_clip": 1.38267303, "balance_loss_mlp": 1.02278638, "epoch": 0.15698181271606793, "flos": 21408078977280.0, "grad_norm": 1.69024868178728, "language_loss": 0.7370851, "learning_rate": 3.833226471173919e-06, "loss": 0.76387388, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 2.43164062, "router_z_loss_mlp": 0.30102539, "step": 2611, "time_per_iteration": 2.832233428955078 }, { "auxiliary_loss_clip": 0.01611017, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.37721848, "balance_loss_mlp": 1.02033663, "epoch": 0.15704193596873592, "flos": 20854416631680.0, "grad_norm": 2.5824780507557676, "language_loss": 0.71838188, "learning_rate": 3.833070739311887e-06, "loss": 0.74499607, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.30078125, "step": 2612, "time_per_iteration": 2.8446860313415527 }, { "auxiliary_loss_clip": 0.01617556, "auxiliary_loss_mlp": 0.01054512, "balance_loss_clip": 1.38031936, "balance_loss_mlp": 1.02466202, "epoch": 0.15710205922140388, "flos": 21773069930880.0, "grad_norm": 1.915964718607551, "language_loss": 0.77198845, "learning_rate": 3.83291493793963e-06, "loss": 0.79870909, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.29858398, "step": 2613, "time_per_iteration": 2.849599599838257 }, { "auxiliary_loss_clip": 0.01608759, "auxiliary_loss_mlp": 0.01052729, "balance_loss_clip": 1.37265515, "balance_loss_mlp": 1.02223551, "epoch": 0.15716218247407185, "flos": 25018145827200.0, "grad_norm": 1.591344254563311, "language_loss": 0.67076796, "learning_rate": 3.832759067063055e-06, "loss": 0.69738281, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.3046875, "step": 2614, "time_per_iteration": 2.8693268299102783 }, { "auxiliary_loss_clip": 0.01620297, "auxiliary_loss_mlp": 0.01054102, "balance_loss_clip": 1.38018143, "balance_loss_mlp": 1.02301192, "epoch": 0.1572223057267398, "flos": 20200952960640.0, "grad_norm": 2.02623135942265, "language_loss": 0.75767171, "learning_rate": 3.832603126688072e-06, "loss": 0.78441572, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.31079102, "step": 2615, "time_per_iteration": 2.8613414764404297 }, { "auxiliary_loss_clip": 0.01589078, "auxiliary_loss_mlp": 0.0105237, "balance_loss_clip": 1.3614471, "balance_loss_mlp": 1.02247202, "epoch": 0.15728242897940778, "flos": 20969284475520.0, "grad_norm": 1.8688301357663177, "language_loss": 0.73472822, "learning_rate": 3.832447116820594e-06, "loss": 0.76114267, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.29907227, "step": 2616, "time_per_iteration": 2.852351427078247 }, { "auxiliary_loss_clip": 0.01608666, "auxiliary_loss_mlp": 0.01048719, "balance_loss_clip": 1.37402534, "balance_loss_mlp": 1.0199182, "epoch": 0.15734255223207574, "flos": 23048343895680.0, "grad_norm": 3.121672885322192, "language_loss": 0.73382115, "learning_rate": 3.832291037466539e-06, "loss": 0.76039505, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.2878418, "step": 2617, "time_per_iteration": 2.9475014209747314 }, { "auxiliary_loss_clip": 0.01600495, "auxiliary_loss_mlp": 0.01050506, "balance_loss_clip": 1.36931086, "balance_loss_mlp": 1.02072811, "epoch": 0.15740267548474374, "flos": 20559157194240.0, "grad_norm": 2.0214347638481853, "language_loss": 0.75326204, "learning_rate": 3.8321348886318235e-06, "loss": 0.77977204, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.29785156, "step": 2618, "time_per_iteration": 3.0378968715667725 }, { "auxiliary_loss_clip": 0.01634777, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.39019561, "balance_loss_mlp": 1.02563429, "epoch": 0.1574627987374117, "flos": 22675208878080.0, "grad_norm": 2.326808137895127, "language_loss": 0.79776192, "learning_rate": 3.8319786703223695e-06, "loss": 0.82467473, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 2.44140625, "router_z_loss_mlp": 0.30859375, "step": 2619, "time_per_iteration": 2.874711036682129 }, { "auxiliary_loss_clip": 0.01599739, "auxiliary_loss_mlp": 0.01059076, "balance_loss_clip": 1.36874652, "balance_loss_mlp": 1.0295831, "epoch": 0.15752292199007967, "flos": 16809491577600.0, "grad_norm": 8.26704663131024, "language_loss": 0.77854824, "learning_rate": 3.831822382544101e-06, "loss": 0.80513638, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.29516602, "step": 2620, "time_per_iteration": 2.871803045272827 }, { "auxiliary_loss_clip": 0.01606977, "auxiliary_loss_mlp": 0.01059856, "balance_loss_clip": 1.37299037, "balance_loss_mlp": 1.02809858, "epoch": 0.15758304524274763, "flos": 29838189116160.0, "grad_norm": 1.7140126462016083, "language_loss": 0.72915179, "learning_rate": 3.831666025302944e-06, "loss": 0.75582016, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.31738281, "step": 2621, "time_per_iteration": 2.9047112464904785 }, { "auxiliary_loss_clip": 0.01617432, "auxiliary_loss_mlp": 0.01052878, "balance_loss_clip": 1.37913704, "balance_loss_mlp": 1.02307546, "epoch": 0.1576431684954156, "flos": 53595566576640.0, "grad_norm": 2.34860319127087, "language_loss": 0.73683274, "learning_rate": 3.831509598604828e-06, "loss": 0.76353586, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.29785156, "step": 2622, "time_per_iteration": 3.155513048171997 }, { "auxiliary_loss_clip": 0.01597266, "auxiliary_loss_mlp": 0.01052297, "balance_loss_clip": 1.36777747, "balance_loss_mlp": 1.02285194, "epoch": 0.15770329174808356, "flos": 20823351436800.0, "grad_norm": 1.5996322987874487, "language_loss": 0.8877691, "learning_rate": 3.831353102455684e-06, "loss": 0.91426474, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.29418945, "step": 2623, "time_per_iteration": 2.8539695739746094 }, { "auxiliary_loss_clip": 0.01597133, "auxiliary_loss_mlp": 0.01072343, "balance_loss_clip": 1.3657403, "balance_loss_mlp": 1.04115748, "epoch": 0.15776341500075153, "flos": 24985089861120.0, "grad_norm": 1.647105977284795, "language_loss": 0.82549006, "learning_rate": 3.831196536861448e-06, "loss": 0.85218489, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.31176758, "step": 2624, "time_per_iteration": 2.8981943130493164 }, { "auxiliary_loss_clip": 0.0160729, "auxiliary_loss_mlp": 0.01065559, "balance_loss_clip": 1.36984444, "balance_loss_mlp": 1.03535187, "epoch": 0.15782353825341952, "flos": 21917871849600.0, "grad_norm": 2.064116908777265, "language_loss": 0.81132948, "learning_rate": 3.831039901828054e-06, "loss": 0.83805799, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.30175781, "step": 2625, "time_per_iteration": 2.8564672470092773 }, { "auxiliary_loss_clip": 0.0160038, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.36514795, "balance_loss_mlp": 1.02885211, "epoch": 0.15788366150608749, "flos": 26188370069760.0, "grad_norm": 2.108462591272416, "language_loss": 0.81901252, "learning_rate": 3.830883197361445e-06, "loss": 0.84560597, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.30078125, "step": 2626, "time_per_iteration": 2.9134159088134766 }, { "auxiliary_loss_clip": 0.01612493, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.37884748, "balance_loss_mlp": 1.0263114, "epoch": 0.15794378475875545, "flos": 27721458760320.0, "grad_norm": 1.5806516177776577, "language_loss": 0.75221908, "learning_rate": 3.830726423467561e-06, "loss": 0.77891237, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.30517578, "step": 2627, "time_per_iteration": 2.9704458713531494 }, { "auxiliary_loss_clip": 0.01580814, "auxiliary_loss_mlp": 0.01063142, "balance_loss_clip": 1.35104084, "balance_loss_mlp": 1.03286278, "epoch": 0.15800390801142342, "flos": 12137688812160.0, "grad_norm": 2.0847956288084206, "language_loss": 0.86668396, "learning_rate": 3.830569580152348e-06, "loss": 0.89312351, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.30249023, "step": 2628, "time_per_iteration": 2.7923219203948975 }, { "auxiliary_loss_clip": 0.01588342, "auxiliary_loss_mlp": 0.01061353, "balance_loss_clip": 1.35676146, "balance_loss_mlp": 1.03038216, "epoch": 0.15806403126409138, "flos": 20714591640960.0, "grad_norm": 1.6580929155312911, "language_loss": 0.77814448, "learning_rate": 3.830412667421752e-06, "loss": 0.80464137, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.31005859, "step": 2629, "time_per_iteration": 2.8830883502960205 }, { "auxiliary_loss_clip": 0.01604155, "auxiliary_loss_mlp": 0.01062098, "balance_loss_clip": 1.36911631, "balance_loss_mlp": 1.0314852, "epoch": 0.15812415451675935, "flos": 17830660890240.0, "grad_norm": 2.004518686400378, "language_loss": 0.74859357, "learning_rate": 3.8302556852817245e-06, "loss": 0.77525616, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.30615234, "step": 2630, "time_per_iteration": 2.8465445041656494 }, { "auxiliary_loss_clip": 0.01616332, "auxiliary_loss_mlp": 0.01057651, "balance_loss_clip": 1.3761946, "balance_loss_mlp": 1.0271337, "epoch": 0.15818427776942734, "flos": 20093640998400.0, "grad_norm": 2.740947650616946, "language_loss": 0.85272038, "learning_rate": 3.8300986337382184e-06, "loss": 0.87946022, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.30517578, "step": 2631, "time_per_iteration": 4.258861303329468 }, { "auxiliary_loss_clip": 0.01591371, "auxiliary_loss_mlp": 0.01052354, "balance_loss_clip": 1.3582803, "balance_loss_mlp": 1.02102613, "epoch": 0.1582444010220953, "flos": 21224746471680.0, "grad_norm": 1.5943112649802726, "language_loss": 0.80922747, "learning_rate": 3.8299415127971895e-06, "loss": 0.83566475, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.31298828, "step": 2632, "time_per_iteration": 2.8729360103607178 }, { "auxiliary_loss_clip": 0.01609232, "auxiliary_loss_mlp": 0.01057632, "balance_loss_clip": 1.37338853, "balance_loss_mlp": 1.02620888, "epoch": 0.15830452427476327, "flos": 17867336440320.0, "grad_norm": 2.179945172219195, "language_loss": 0.84350991, "learning_rate": 3.829784322464594e-06, "loss": 0.87017858, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.31396484, "step": 2633, "time_per_iteration": 2.8289215564727783 }, { "auxiliary_loss_clip": 0.01613383, "auxiliary_loss_mlp": 0.01058635, "balance_loss_clip": 1.37582099, "balance_loss_mlp": 1.02890444, "epoch": 0.15836464752743123, "flos": 24545616687360.0, "grad_norm": 4.769344172342034, "language_loss": 0.7843591, "learning_rate": 3.829627062746394e-06, "loss": 0.81107932, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.29711914, "step": 2634, "time_per_iteration": 2.882767915725708 }, { "auxiliary_loss_clip": 0.01601833, "auxiliary_loss_mlp": 0.0106642, "balance_loss_clip": 1.36312985, "balance_loss_mlp": 1.03416193, "epoch": 0.1584247707800992, "flos": 20130497527680.0, "grad_norm": 1.806932623852216, "language_loss": 0.906142, "learning_rate": 3.829469733648552e-06, "loss": 0.93282455, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.32250977, "step": 2635, "time_per_iteration": 2.864513874053955 }, { "auxiliary_loss_clip": 0.01626635, "auxiliary_loss_mlp": 0.01057267, "balance_loss_clip": 1.38550043, "balance_loss_mlp": 1.02601039, "epoch": 0.15848489403276717, "flos": 20385642810240.0, "grad_norm": 2.3002295264599764, "language_loss": 0.77474111, "learning_rate": 3.829312335177034e-06, "loss": 0.80158013, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 2.41210938, "router_z_loss_mlp": 0.3125, "step": 2636, "time_per_iteration": 4.223943710327148 }, { "auxiliary_loss_clip": 0.01604533, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.36422682, "balance_loss_mlp": 1.02407885, "epoch": 0.15854501728543513, "flos": 39361055120640.0, "grad_norm": 2.4787139436363574, "language_loss": 0.73554635, "learning_rate": 3.82915486733781e-06, "loss": 0.7621572, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.32495117, "step": 2637, "time_per_iteration": 3.032208204269409 }, { "auxiliary_loss_clip": 0.01609362, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.3735292, "balance_loss_mlp": 1.02361941, "epoch": 0.15860514053810312, "flos": 24875198945280.0, "grad_norm": 2.0738940886622803, "language_loss": 0.79394639, "learning_rate": 3.82899733013685e-06, "loss": 0.82057691, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.30053711, "step": 2638, "time_per_iteration": 4.290754556655884 }, { "auxiliary_loss_clip": 0.0161002, "auxiliary_loss_mlp": 0.0105545, "balance_loss_clip": 1.37200952, "balance_loss_mlp": 1.02455139, "epoch": 0.1586652637907711, "flos": 26189003496960.0, "grad_norm": 1.698112253020446, "language_loss": 0.76742268, "learning_rate": 3.828839723580128e-06, "loss": 0.7940774, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.30908203, "step": 2639, "time_per_iteration": 4.3289453983306885 }, { "auxiliary_loss_clip": 0.01612791, "auxiliary_loss_mlp": 0.01059362, "balance_loss_clip": 1.37601399, "balance_loss_mlp": 1.02738976, "epoch": 0.15872538704343905, "flos": 19801548696960.0, "grad_norm": 1.7634147481983948, "language_loss": 0.81772196, "learning_rate": 3.82868204767362e-06, "loss": 0.8444435, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.31958008, "step": 2640, "time_per_iteration": 2.850390672683716 }, { "auxiliary_loss_clip": 0.01590862, "auxiliary_loss_mlp": 0.01054014, "balance_loss_clip": 1.36016381, "balance_loss_mlp": 1.02247167, "epoch": 0.15878551029610702, "flos": 28487030342400.0, "grad_norm": 3.074691249356448, "language_loss": 0.6780107, "learning_rate": 3.828524302423306e-06, "loss": 0.70445943, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.31542969, "step": 2641, "time_per_iteration": 2.9436044692993164 }, { "auxiliary_loss_clip": 0.01635535, "auxiliary_loss_mlp": 0.01058979, "balance_loss_clip": 1.38949549, "balance_loss_mlp": 1.02738881, "epoch": 0.15884563354877498, "flos": 24217301283840.0, "grad_norm": 2.214073619051438, "language_loss": 0.76988006, "learning_rate": 3.828366487835167e-06, "loss": 0.79682511, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 2.45898438, "router_z_loss_mlp": 0.31567383, "step": 2642, "time_per_iteration": 2.8773348331451416 }, { "auxiliary_loss_clip": 0.01601123, "auxiliary_loss_mlp": 0.01053487, "balance_loss_clip": 1.36640704, "balance_loss_mlp": 1.02325523, "epoch": 0.15890575680144295, "flos": 23959939006080.0, "grad_norm": 2.235749034698769, "language_loss": 0.71417439, "learning_rate": 3.828208603915186e-06, "loss": 0.74072051, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.30249023, "step": 2643, "time_per_iteration": 2.9342520236968994 }, { "auxiliary_loss_clip": 0.0160646, "auxiliary_loss_mlp": 0.01055976, "balance_loss_clip": 1.37334967, "balance_loss_mlp": 1.02481484, "epoch": 0.15896588005411091, "flos": 21224882206080.0, "grad_norm": 2.008854224017572, "language_loss": 0.79803884, "learning_rate": 3.828050650669353e-06, "loss": 0.82466316, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.3112793, "step": 2644, "time_per_iteration": 2.8819541931152344 }, { "auxiliary_loss_clip": 0.01611601, "auxiliary_loss_mlp": 0.01050496, "balance_loss_clip": 1.37814069, "balance_loss_mlp": 1.01971591, "epoch": 0.1590260033067789, "flos": 24362962853760.0, "grad_norm": 3.531467892318732, "language_loss": 0.83227396, "learning_rate": 3.827892628103657e-06, "loss": 0.85889488, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.30761719, "step": 2645, "time_per_iteration": 2.8480734825134277 }, { "auxiliary_loss_clip": 0.01612952, "auxiliary_loss_mlp": 0.01058966, "balance_loss_clip": 1.37552297, "balance_loss_mlp": 1.02768576, "epoch": 0.15908612655944687, "flos": 32061009824640.0, "grad_norm": 2.0677842008116176, "language_loss": 0.71077991, "learning_rate": 3.827734536224087e-06, "loss": 0.73749912, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.3125, "step": 2646, "time_per_iteration": 2.953688383102417 }, { "auxiliary_loss_clip": 0.01612105, "auxiliary_loss_mlp": 0.01055931, "balance_loss_clip": 1.37933159, "balance_loss_mlp": 1.02262425, "epoch": 0.15914624981211484, "flos": 17794347298560.0, "grad_norm": 2.6887426794725466, "language_loss": 0.64177781, "learning_rate": 3.827576375036642e-06, "loss": 0.66845822, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.33300781, "step": 2647, "time_per_iteration": 2.903324842453003 }, { "auxiliary_loss_clip": 0.01615079, "auxiliary_loss_mlp": 0.01048587, "balance_loss_clip": 1.38211095, "balance_loss_mlp": 1.01604295, "epoch": 0.1592063730647828, "flos": 17721493891200.0, "grad_norm": 1.9949579235957824, "language_loss": 0.90778589, "learning_rate": 3.827418144547318e-06, "loss": 0.93442255, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.32568359, "step": 2648, "time_per_iteration": 2.8464112281799316 }, { "auxiliary_loss_clip": 0.0159995, "auxiliary_loss_mlp": 0.01049785, "balance_loss_clip": 1.37163115, "balance_loss_mlp": 1.02050734, "epoch": 0.15926649631745077, "flos": 18811897027200.0, "grad_norm": 1.7659156693240317, "language_loss": 0.92583275, "learning_rate": 3.827259844762114e-06, "loss": 0.95233011, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.29272461, "step": 2649, "time_per_iteration": 2.8318769931793213 }, { "auxiliary_loss_clip": 0.01646041, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.39780092, "balance_loss_mlp": 1.02462816, "epoch": 0.15932661957011873, "flos": 17575108404480.0, "grad_norm": 2.475911996594256, "language_loss": 0.73907804, "learning_rate": 3.827101475687033e-06, "loss": 0.76611209, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 2.48046875, "router_z_loss_mlp": 0.32714844, "step": 2650, "time_per_iteration": 2.856846809387207 }, { "auxiliary_loss_clip": 0.01599089, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.37090087, "balance_loss_mlp": 1.01787508, "epoch": 0.15938674282278673, "flos": 13342235875200.0, "grad_norm": 9.06325930139845, "language_loss": 0.73372757, "learning_rate": 3.826943037328082e-06, "loss": 0.76020908, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.31176758, "step": 2651, "time_per_iteration": 2.8261358737945557 }, { "auxiliary_loss_clip": 0.01615088, "auxiliary_loss_mlp": 0.01051003, "balance_loss_clip": 1.38042283, "balance_loss_mlp": 1.01931739, "epoch": 0.1594468660754547, "flos": 22498210644480.0, "grad_norm": 2.2080079602833216, "language_loss": 0.81045806, "learning_rate": 3.8267845296912674e-06, "loss": 0.83711892, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.31689453, "step": 2652, "time_per_iteration": 2.842430591583252 }, { "auxiliary_loss_clip": 0.01594656, "auxiliary_loss_mlp": 0.01047915, "balance_loss_clip": 1.3647778, "balance_loss_mlp": 1.01947224, "epoch": 0.15950698932812266, "flos": 15014244660480.0, "grad_norm": 3.3576818225381397, "language_loss": 0.71258742, "learning_rate": 3.826625952782601e-06, "loss": 0.73901308, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28405762, "step": 2653, "time_per_iteration": 2.859027624130249 }, { "auxiliary_loss_clip": 0.01607326, "auxiliary_loss_mlp": 0.01046195, "balance_loss_clip": 1.37278712, "balance_loss_mlp": 1.01620197, "epoch": 0.15956711258079062, "flos": 30167499905280.0, "grad_norm": 2.3601181635757125, "language_loss": 0.78717971, "learning_rate": 3.826467306608095e-06, "loss": 0.81371486, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.30004883, "step": 2654, "time_per_iteration": 2.93780779838562 }, { "auxiliary_loss_clip": 0.01593966, "auxiliary_loss_mlp": 0.01050175, "balance_loss_clip": 1.3645823, "balance_loss_mlp": 1.02080154, "epoch": 0.1596272358334586, "flos": 21042545086080.0, "grad_norm": 1.7879940375890833, "language_loss": 0.82858014, "learning_rate": 3.826308591173765e-06, "loss": 0.85502148, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.29370117, "step": 2655, "time_per_iteration": 2.8724172115325928 }, { "auxiliary_loss_clip": 0.01603668, "auxiliary_loss_mlp": 0.01055927, "balance_loss_clip": 1.37008476, "balance_loss_mlp": 1.0262084, "epoch": 0.15968735908612655, "flos": 15276402887040.0, "grad_norm": 2.2824374618427776, "language_loss": 0.74793398, "learning_rate": 3.826149806485631e-06, "loss": 0.77452993, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.29699707, "step": 2656, "time_per_iteration": 2.8425276279449463 }, { "auxiliary_loss_clip": 0.01580695, "auxiliary_loss_mlp": 0.01047362, "balance_loss_clip": 1.35638452, "balance_loss_mlp": 1.01767898, "epoch": 0.15974748233879452, "flos": 52683745242240.0, "grad_norm": 1.8806308988735125, "language_loss": 0.78882951, "learning_rate": 3.825990952549713e-06, "loss": 0.81511009, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.296875, "step": 2657, "time_per_iteration": 3.185101270675659 }, { "auxiliary_loss_clip": 0.01593332, "auxiliary_loss_mlp": 0.01048499, "balance_loss_clip": 1.36397934, "balance_loss_mlp": 1.01979351, "epoch": 0.1598076055914625, "flos": 18742256000640.0, "grad_norm": 1.9121184795779922, "language_loss": 0.76022953, "learning_rate": 3.825832029372035e-06, "loss": 0.78664786, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28723145, "step": 2658, "time_per_iteration": 2.8831698894500732 }, { "auxiliary_loss_clip": 0.01600454, "auxiliary_loss_mlp": 0.01055891, "balance_loss_clip": 1.36806679, "balance_loss_mlp": 1.02372789, "epoch": 0.15986772884413047, "flos": 34362339540480.0, "grad_norm": 1.9309123985165926, "language_loss": 0.76653248, "learning_rate": 3.825673036958624e-06, "loss": 0.79309595, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.3215332, "step": 2659, "time_per_iteration": 2.9735920429229736 }, { "auxiliary_loss_clip": 0.01608684, "auxiliary_loss_mlp": 0.01054269, "balance_loss_clip": 1.3704617, "balance_loss_mlp": 1.02394271, "epoch": 0.15992785209679844, "flos": 22065252721920.0, "grad_norm": 2.092294779487242, "language_loss": 0.91538513, "learning_rate": 3.825513975315508e-06, "loss": 0.94201463, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.30322266, "step": 2660, "time_per_iteration": 2.889688730239868 }, { "auxiliary_loss_clip": 0.01611046, "auxiliary_loss_mlp": 0.01050055, "balance_loss_clip": 1.3747344, "balance_loss_mlp": 1.01994288, "epoch": 0.1599879753494664, "flos": 33077971370880.0, "grad_norm": 2.306836262570851, "language_loss": 0.79088128, "learning_rate": 3.82535484444872e-06, "loss": 0.81749225, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.30102539, "step": 2661, "time_per_iteration": 2.9748692512512207 }, { "auxiliary_loss_clip": 0.01587119, "auxiliary_loss_mlp": 0.01047929, "balance_loss_clip": 1.35695589, "balance_loss_mlp": 1.01750684, "epoch": 0.16004809860213437, "flos": 28049683674240.0, "grad_norm": 1.8473604986134236, "language_loss": 0.74831957, "learning_rate": 3.825195644364292e-06, "loss": 0.77467012, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.30419922, "step": 2662, "time_per_iteration": 2.98846697807312 }, { "auxiliary_loss_clip": 0.01607017, "auxiliary_loss_mlp": 0.01050391, "balance_loss_clip": 1.37133384, "balance_loss_mlp": 1.02061212, "epoch": 0.16010822185480234, "flos": 22789398049920.0, "grad_norm": 1.8779810872207148, "language_loss": 0.82742882, "learning_rate": 3.825036375068263e-06, "loss": 0.85400295, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.29760742, "step": 2663, "time_per_iteration": 2.8696935176849365 }, { "auxiliary_loss_clip": 0.01600454, "auxiliary_loss_mlp": 0.01046339, "balance_loss_clip": 1.36755121, "balance_loss_mlp": 1.01575017, "epoch": 0.16016834510747033, "flos": 20093595753600.0, "grad_norm": 3.015397025760737, "language_loss": 0.81480312, "learning_rate": 3.824877036566672e-06, "loss": 0.84127104, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.30566406, "step": 2664, "time_per_iteration": 2.8440945148468018 }, { "auxiliary_loss_clip": 0.0159103, "auxiliary_loss_mlp": 0.0105058, "balance_loss_clip": 1.35932112, "balance_loss_mlp": 1.02032495, "epoch": 0.1602284683601383, "flos": 21182460566400.0, "grad_norm": 1.6213041638617758, "language_loss": 0.9485153, "learning_rate": 3.824717628865561e-06, "loss": 0.97493136, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.30273438, "step": 2665, "time_per_iteration": 2.8728368282318115 }, { "auxiliary_loss_clip": 0.01590397, "auxiliary_loss_mlp": 0.01048334, "balance_loss_clip": 1.35747623, "balance_loss_mlp": 1.01698184, "epoch": 0.16028859161280626, "flos": 14655723713280.0, "grad_norm": 2.223637824848212, "language_loss": 0.8592754, "learning_rate": 3.824558151970974e-06, "loss": 0.88566267, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.3137207, "step": 2666, "time_per_iteration": 4.247317790985107 }, { "auxiliary_loss_clip": 0.01585727, "auxiliary_loss_mlp": 0.01053972, "balance_loss_clip": 1.35585093, "balance_loss_mlp": 1.02476621, "epoch": 0.16034871486547422, "flos": 20998630368000.0, "grad_norm": 1.9034036223601567, "language_loss": 0.82624924, "learning_rate": 3.8243986058889595e-06, "loss": 0.85264623, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.29199219, "step": 2667, "time_per_iteration": 2.845552682876587 }, { "auxiliary_loss_clip": 0.01591541, "auxiliary_loss_mlp": 0.01056965, "balance_loss_clip": 1.36273456, "balance_loss_mlp": 1.02613759, "epoch": 0.1604088381181422, "flos": 21407717018880.0, "grad_norm": 1.7699693423341676, "language_loss": 0.75165534, "learning_rate": 3.824238990625567e-06, "loss": 0.77814043, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.30859375, "step": 2668, "time_per_iteration": 2.896623373031616 }, { "auxiliary_loss_clip": 0.0158348, "auxiliary_loss_mlp": 0.01053431, "balance_loss_clip": 1.35352409, "balance_loss_mlp": 1.0241766, "epoch": 0.16046896137081015, "flos": 23887221333120.0, "grad_norm": 1.7203920087994096, "language_loss": 0.78023243, "learning_rate": 3.824079306186848e-06, "loss": 0.80660146, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.29248047, "step": 2669, "time_per_iteration": 2.8599913120269775 }, { "auxiliary_loss_clip": 0.01375579, "auxiliary_loss_mlp": 0.01057861, "balance_loss_clip": 1.24050307, "balance_loss_mlp": 1.03640366, "epoch": 0.16052908462347812, "flos": 59833396247040.0, "grad_norm": 0.8129268739191908, "language_loss": 0.55579388, "learning_rate": 3.823919552578861e-06, "loss": 0.58012831, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.21484375, "step": 2670, "time_per_iteration": 3.2614214420318604 }, { "auxiliary_loss_clip": 0.01592961, "auxiliary_loss_mlp": 0.01055774, "balance_loss_clip": 1.35915184, "balance_loss_mlp": 1.02544713, "epoch": 0.1605892078761461, "flos": 18305768983680.0, "grad_norm": 1.9794305651126223, "language_loss": 0.7896781, "learning_rate": 3.82375972980766e-06, "loss": 0.81616545, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.30322266, "step": 2671, "time_per_iteration": 4.320765495300293 }, { "auxiliary_loss_clip": 0.01593988, "auxiliary_loss_mlp": 0.01057038, "balance_loss_clip": 1.36140537, "balance_loss_mlp": 1.0261631, "epoch": 0.16064933112881408, "flos": 32173434449280.0, "grad_norm": 1.7858999089260297, "language_loss": 0.65871245, "learning_rate": 3.8235998378793086e-06, "loss": 0.68522269, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.30883789, "step": 2672, "time_per_iteration": 2.9302310943603516 }, { "auxiliary_loss_clip": 0.0159015, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.35478151, "balance_loss_mlp": 1.03353894, "epoch": 0.16070945438148204, "flos": 19838359981440.0, "grad_norm": 1.7102616335339451, "language_loss": 0.87586981, "learning_rate": 3.8234398767998675e-06, "loss": 0.90241611, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.30932617, "step": 2673, "time_per_iteration": 4.291688442230225 }, { "auxiliary_loss_clip": 0.01581296, "auxiliary_loss_mlp": 0.01062934, "balance_loss_clip": 1.35150528, "balance_loss_mlp": 1.03379941, "epoch": 0.16076957763415, "flos": 18921561719040.0, "grad_norm": 2.1323214392725607, "language_loss": 0.73599589, "learning_rate": 3.823279846575403e-06, "loss": 0.76243818, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.29125977, "step": 2674, "time_per_iteration": 4.266111612319946 }, { "auxiliary_loss_clip": 0.015852, "auxiliary_loss_mlp": 0.01071962, "balance_loss_clip": 1.35572255, "balance_loss_mlp": 1.04132557, "epoch": 0.16082970088681797, "flos": 16773223230720.0, "grad_norm": 1.5026744987343803, "language_loss": 0.85460067, "learning_rate": 3.823119747211986e-06, "loss": 0.88117224, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.30639648, "step": 2675, "time_per_iteration": 2.881798505783081 }, { "auxiliary_loss_clip": 0.01591788, "auxiliary_loss_mlp": 0.01062902, "balance_loss_clip": 1.35874534, "balance_loss_mlp": 1.03224111, "epoch": 0.16088982413948594, "flos": 35163138839040.0, "grad_norm": 1.7432678347691335, "language_loss": 0.83230954, "learning_rate": 3.822959578715685e-06, "loss": 0.85885644, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.30664062, "step": 2676, "time_per_iteration": 2.959638833999634 }, { "auxiliary_loss_clip": 0.01588409, "auxiliary_loss_mlp": 0.01068798, "balance_loss_clip": 1.36366427, "balance_loss_mlp": 1.03913903, "epoch": 0.1609499473921539, "flos": 18634446345600.0, "grad_norm": 1.687461807249356, "language_loss": 0.75042188, "learning_rate": 3.822799341092573e-06, "loss": 0.77699393, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.29638672, "step": 2677, "time_per_iteration": 2.864696741104126 }, { "auxiliary_loss_clip": 0.01596053, "auxiliary_loss_mlp": 0.01061102, "balance_loss_clip": 1.36756015, "balance_loss_mlp": 1.03284943, "epoch": 0.1610100706448219, "flos": 33158652128640.0, "grad_norm": 1.7158489402383736, "language_loss": 0.77605093, "learning_rate": 3.822639034348728e-06, "loss": 0.80262244, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28198242, "step": 2678, "time_per_iteration": 2.9880831241607666 }, { "auxiliary_loss_clip": 0.01579959, "auxiliary_loss_mlp": 0.01054422, "balance_loss_clip": 1.35108423, "balance_loss_mlp": 1.02461982, "epoch": 0.16107019389748986, "flos": 34689659558400.0, "grad_norm": 1.9616875181724773, "language_loss": 0.71030337, "learning_rate": 3.822478658490228e-06, "loss": 0.73664719, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29785156, "step": 2679, "time_per_iteration": 2.965186595916748 }, { "auxiliary_loss_clip": 0.01378908, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.23960102, "balance_loss_mlp": 1.01811433, "epoch": 0.16113031715015783, "flos": 65740585046400.0, "grad_norm": 0.7918206362109462, "language_loss": 0.51853359, "learning_rate": 3.822318213523154e-06, "loss": 0.54269826, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.19433594, "step": 2680, "time_per_iteration": 3.416412591934204 }, { "auxiliary_loss_clip": 0.01595864, "auxiliary_loss_mlp": 0.01057234, "balance_loss_clip": 1.36326814, "balance_loss_mlp": 1.02645421, "epoch": 0.1611904404028258, "flos": 20819324649600.0, "grad_norm": 1.6152444160079937, "language_loss": 0.81228745, "learning_rate": 3.8221576994535925e-06, "loss": 0.83881843, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.30761719, "step": 2681, "time_per_iteration": 2.8343873023986816 }, { "auxiliary_loss_clip": 0.01593064, "auxiliary_loss_mlp": 0.01059294, "balance_loss_clip": 1.36561704, "balance_loss_mlp": 1.03130364, "epoch": 0.16125056365549376, "flos": 27023944636800.0, "grad_norm": 1.7766701767194217, "language_loss": 0.70335263, "learning_rate": 3.821997116287627e-06, "loss": 0.72987622, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.28027344, "step": 2682, "time_per_iteration": 2.9119699001312256 }, { "auxiliary_loss_clip": 0.01585919, "auxiliary_loss_mlp": 0.01053084, "balance_loss_clip": 1.35715437, "balance_loss_mlp": 1.02459252, "epoch": 0.16131068690816172, "flos": 19284652391040.0, "grad_norm": 1.9559692945712817, "language_loss": 0.88658893, "learning_rate": 3.821836464031348e-06, "loss": 0.91297895, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28442383, "step": 2683, "time_per_iteration": 2.8799540996551514 }, { "auxiliary_loss_clip": 0.01592412, "auxiliary_loss_mlp": 0.0106876, "balance_loss_clip": 1.36380267, "balance_loss_mlp": 1.04014933, "epoch": 0.16137081016082971, "flos": 35352217434240.0, "grad_norm": 1.5974300914206567, "language_loss": 0.75010633, "learning_rate": 3.821675742690849e-06, "loss": 0.77671802, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28613281, "step": 2684, "time_per_iteration": 2.9951331615448 }, { "auxiliary_loss_clip": 0.01594362, "auxiliary_loss_mlp": 0.01061377, "balance_loss_clip": 1.3617897, "balance_loss_mlp": 1.03228974, "epoch": 0.16143093341349768, "flos": 34247154983040.0, "grad_norm": 1.7600596975420553, "language_loss": 0.7113508, "learning_rate": 3.821514952272223e-06, "loss": 0.73790818, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.29077148, "step": 2685, "time_per_iteration": 2.950395107269287 }, { "auxiliary_loss_clip": 0.01573704, "auxiliary_loss_mlp": 0.01055227, "balance_loss_clip": 1.35068476, "balance_loss_mlp": 1.0256151, "epoch": 0.16149105666616564, "flos": 28010338680960.0, "grad_norm": 2.247434190655626, "language_loss": 0.72669697, "learning_rate": 3.821354092781567e-06, "loss": 0.75298631, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.29589844, "step": 2686, "time_per_iteration": 2.895186424255371 }, { "auxiliary_loss_clip": 0.01582732, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.35291028, "balance_loss_mlp": 1.02384067, "epoch": 0.1615511799188336, "flos": 19430313960960.0, "grad_norm": 4.068738315098536, "language_loss": 0.82724714, "learning_rate": 3.821193164224981e-06, "loss": 0.85361373, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.30053711, "step": 2687, "time_per_iteration": 2.91774845123291 }, { "auxiliary_loss_clip": 0.01602428, "auxiliary_loss_mlp": 0.01055369, "balance_loss_clip": 1.36609113, "balance_loss_mlp": 1.02439833, "epoch": 0.16161130317150157, "flos": 22864694676480.0, "grad_norm": 2.391970464282639, "language_loss": 0.7313624, "learning_rate": 3.821032166608568e-06, "loss": 0.75794041, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.30981445, "step": 2688, "time_per_iteration": 2.8329267501831055 }, { "auxiliary_loss_clip": 0.01591972, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.36088467, "balance_loss_mlp": 1.02232516, "epoch": 0.16167142642416954, "flos": 26123389257600.0, "grad_norm": 1.659713035245241, "language_loss": 0.7688719, "learning_rate": 3.8208710999384325e-06, "loss": 0.79530597, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.29101562, "step": 2689, "time_per_iteration": 2.927610158920288 }, { "auxiliary_loss_clip": 0.01578839, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.3539412, "balance_loss_mlp": 1.01992774, "epoch": 0.1617315496768375, "flos": 22788809867520.0, "grad_norm": 1.733325902024169, "language_loss": 0.88225782, "learning_rate": 3.820709964220683e-06, "loss": 0.90853226, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.28710938, "step": 2690, "time_per_iteration": 2.9339325428009033 }, { "auxiliary_loss_clip": 0.01585746, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.35783434, "balance_loss_mlp": 1.0189867, "epoch": 0.1617916729295055, "flos": 22027265072640.0, "grad_norm": 1.5984491444769409, "language_loss": 0.88968188, "learning_rate": 3.8205487594614284e-06, "loss": 0.91601616, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.28735352, "step": 2691, "time_per_iteration": 2.8716928958892822 }, { "auxiliary_loss_clip": 0.01609192, "auxiliary_loss_mlp": 0.01054179, "balance_loss_clip": 1.37140656, "balance_loss_mlp": 1.02282691, "epoch": 0.16185179618217346, "flos": 23448064872960.0, "grad_norm": 1.8690730831954052, "language_loss": 0.83437407, "learning_rate": 3.820387485666784e-06, "loss": 0.86100775, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 2.37890625, "router_z_loss_mlp": 0.3137207, "step": 2692, "time_per_iteration": 2.908597230911255 }, { "auxiliary_loss_clip": 0.01607296, "auxiliary_loss_mlp": 0.01055683, "balance_loss_clip": 1.37011027, "balance_loss_mlp": 1.0241636, "epoch": 0.16191191943484143, "flos": 25677084119040.0, "grad_norm": 2.1541684727812345, "language_loss": 0.83114487, "learning_rate": 3.820226142842862e-06, "loss": 0.85777467, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.31518555, "step": 2693, "time_per_iteration": 2.8760287761688232 }, { "auxiliary_loss_clip": 0.01573365, "auxiliary_loss_mlp": 0.01051779, "balance_loss_clip": 1.34891105, "balance_loss_mlp": 1.02252543, "epoch": 0.1619720426875094, "flos": 23487862314240.0, "grad_norm": 1.501855241174925, "language_loss": 0.8533839, "learning_rate": 3.820064730995783e-06, "loss": 0.87963539, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.29272461, "step": 2694, "time_per_iteration": 2.876495838165283 }, { "auxiliary_loss_clip": 0.01593449, "auxiliary_loss_mlp": 0.01062834, "balance_loss_clip": 1.35982609, "balance_loss_mlp": 1.02914536, "epoch": 0.16203216594017736, "flos": 24144221652480.0, "grad_norm": 1.8835494212949921, "language_loss": 0.70133942, "learning_rate": 3.819903250131667e-06, "loss": 0.72790229, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.3371582, "step": 2695, "time_per_iteration": 2.869046688079834 }, { "auxiliary_loss_clip": 0.01605448, "auxiliary_loss_mlp": 0.01054915, "balance_loss_clip": 1.37189412, "balance_loss_mlp": 1.02437329, "epoch": 0.16209228919284532, "flos": 22349698652160.0, "grad_norm": 1.8630886311549049, "language_loss": 0.8382442, "learning_rate": 3.819741700256637e-06, "loss": 0.86484778, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.30541992, "step": 2696, "time_per_iteration": 2.847719430923462 }, { "auxiliary_loss_clip": 0.01609281, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.36838996, "balance_loss_mlp": 1.02780831, "epoch": 0.1621524124455133, "flos": 15823595226240.0, "grad_norm": 2.095555561456398, "language_loss": 0.90546346, "learning_rate": 3.8195800813768194e-06, "loss": 0.93215358, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 2.40820312, "router_z_loss_mlp": 0.3190918, "step": 2697, "time_per_iteration": 2.947194814682007 }, { "auxiliary_loss_clip": 0.01575833, "auxiliary_loss_mlp": 0.01048603, "balance_loss_clip": 1.35069537, "balance_loss_mlp": 1.01901484, "epoch": 0.16221253569818128, "flos": 30197886428160.0, "grad_norm": 1.638272167800017, "language_loss": 0.8192848, "learning_rate": 3.819418393498343e-06, "loss": 0.8455292, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.29589844, "step": 2698, "time_per_iteration": 2.9250917434692383 }, { "auxiliary_loss_clip": 0.01580149, "auxiliary_loss_mlp": 0.0105099, "balance_loss_clip": 1.35692525, "balance_loss_mlp": 1.02028179, "epoch": 0.16227265895084925, "flos": 24616162609920.0, "grad_norm": 1.5416982538881954, "language_loss": 0.78489995, "learning_rate": 3.819256636627339e-06, "loss": 0.81121135, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.30688477, "step": 2699, "time_per_iteration": 2.9487640857696533 }, { "auxiliary_loss_clip": 0.01588639, "auxiliary_loss_mlp": 0.01053927, "balance_loss_clip": 1.3591013, "balance_loss_mlp": 1.02500629, "epoch": 0.1623327822035172, "flos": 19582807495680.0, "grad_norm": 2.064988544353501, "language_loss": 0.87959051, "learning_rate": 3.81909481076994e-06, "loss": 0.90601611, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.2890625, "step": 2700, "time_per_iteration": 4.248523473739624 }, { "auxiliary_loss_clip": 0.01594848, "auxiliary_loss_mlp": 0.01054137, "balance_loss_clip": 1.3659507, "balance_loss_mlp": 1.02282083, "epoch": 0.16239290545618518, "flos": 26479557475200.0, "grad_norm": 1.5642403883383815, "language_loss": 0.81302071, "learning_rate": 3.818932915932284e-06, "loss": 0.83951056, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.31311035, "step": 2701, "time_per_iteration": 2.858170747756958 }, { "auxiliary_loss_clip": 0.01588956, "auxiliary_loss_mlp": 0.01053957, "balance_loss_clip": 1.35817051, "balance_loss_mlp": 1.02391624, "epoch": 0.16245302870885314, "flos": 15860497000320.0, "grad_norm": 1.500038689765602, "language_loss": 0.74944043, "learning_rate": 3.818770952120511e-06, "loss": 0.77586961, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.30053711, "step": 2702, "time_per_iteration": 2.863884449005127 }, { "auxiliary_loss_clip": 0.01598279, "auxiliary_loss_mlp": 0.01052962, "balance_loss_clip": 1.36484575, "balance_loss_mlp": 1.02263534, "epoch": 0.1625131519615211, "flos": 14764573998720.0, "grad_norm": 1.8711781713318716, "language_loss": 0.73938787, "learning_rate": 3.81860891934076e-06, "loss": 0.76590031, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.3034668, "step": 2703, "time_per_iteration": 2.7934072017669678 }, { "auxiliary_loss_clip": 0.01595509, "auxiliary_loss_mlp": 0.01055165, "balance_loss_clip": 1.36306775, "balance_loss_mlp": 1.02536297, "epoch": 0.1625732752141891, "flos": 28232247018240.0, "grad_norm": 1.7764492351537695, "language_loss": 0.715734, "learning_rate": 3.818446817599176e-06, "loss": 0.74224073, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2980957, "step": 2704, "time_per_iteration": 2.937492847442627 }, { "auxiliary_loss_clip": 0.0135888, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.22629595, "balance_loss_mlp": 1.00673795, "epoch": 0.16263339846685707, "flos": 67357928833920.0, "grad_norm": 0.7880556130178895, "language_loss": 0.53408283, "learning_rate": 3.818284646901907e-06, "loss": 0.55790687, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.16796875, "step": 2705, "time_per_iteration": 3.357480049133301 }, { "auxiliary_loss_clip": 0.01599516, "auxiliary_loss_mlp": 0.01053716, "balance_loss_clip": 1.36505532, "balance_loss_mlp": 1.02522492, "epoch": 0.16269352171952503, "flos": 14327227330560.0, "grad_norm": 2.185713522063839, "language_loss": 0.76696676, "learning_rate": 3.818122407255102e-06, "loss": 0.79349911, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28491211, "step": 2706, "time_per_iteration": 4.326091289520264 }, { "auxiliary_loss_clip": 0.01585112, "auxiliary_loss_mlp": 0.01050743, "balance_loss_clip": 1.3537432, "balance_loss_mlp": 1.0223, "epoch": 0.162753644972193, "flos": 28371619560960.0, "grad_norm": 1.9106819087472995, "language_loss": 0.74335587, "learning_rate": 3.817960098664914e-06, "loss": 0.76971447, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.28466797, "step": 2707, "time_per_iteration": 2.905479907989502 }, { "auxiliary_loss_clip": 0.01585845, "auxiliary_loss_mlp": 0.01056884, "balance_loss_clip": 1.35736537, "balance_loss_mlp": 1.02932262, "epoch": 0.16281376822486096, "flos": 19947346001280.0, "grad_norm": 2.5729147518875974, "language_loss": 0.85108483, "learning_rate": 3.817797721137495e-06, "loss": 0.8775121, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27587891, "step": 2708, "time_per_iteration": 4.251897573471069 }, { "auxiliary_loss_clip": 0.01603282, "auxiliary_loss_mlp": 0.0105321, "balance_loss_clip": 1.36826265, "balance_loss_mlp": 1.02262115, "epoch": 0.16287389147752893, "flos": 21261648245760.0, "grad_norm": 2.0650967786453687, "language_loss": 0.86887348, "learning_rate": 3.817635274679006e-06, "loss": 0.89543843, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.3059082, "step": 2709, "time_per_iteration": 4.352931499481201 }, { "auxiliary_loss_clip": 0.01604238, "auxiliary_loss_mlp": 0.0105051, "balance_loss_clip": 1.37168622, "balance_loss_mlp": 1.023736, "epoch": 0.1629340147301969, "flos": 19253994399360.0, "grad_norm": 2.241123526908354, "language_loss": 0.92233521, "learning_rate": 3.817472759295605e-06, "loss": 0.9488827, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26757812, "step": 2710, "time_per_iteration": 2.839712381362915 }, { "auxiliary_loss_clip": 0.01594448, "auxiliary_loss_mlp": 0.01054685, "balance_loss_clip": 1.36519611, "balance_loss_mlp": 1.02652752, "epoch": 0.16299413798286488, "flos": 21259250271360.0, "grad_norm": 2.2486898392510826, "language_loss": 0.82904702, "learning_rate": 3.817310174993453e-06, "loss": 0.85553837, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28173828, "step": 2711, "time_per_iteration": 2.8382749557495117 }, { "auxiliary_loss_clip": 0.01620201, "auxiliary_loss_mlp": 0.01057233, "balance_loss_clip": 1.38112998, "balance_loss_mlp": 1.02909994, "epoch": 0.16305426123553285, "flos": 18779836446720.0, "grad_norm": 2.115160502879445, "language_loss": 0.82627457, "learning_rate": 3.817147521778719e-06, "loss": 0.85304892, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.28149414, "step": 2712, "time_per_iteration": 2.850048542022705 }, { "auxiliary_loss_clip": 0.01609243, "auxiliary_loss_mlp": 0.01058863, "balance_loss_clip": 1.37273932, "balance_loss_mlp": 1.03175521, "epoch": 0.16311438448820081, "flos": 22096951344000.0, "grad_norm": 1.6347075971863712, "language_loss": 0.78146076, "learning_rate": 3.816984799657568e-06, "loss": 0.80814183, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.27148438, "step": 2713, "time_per_iteration": 2.8194034099578857 }, { "auxiliary_loss_clip": 0.0157939, "auxiliary_loss_mlp": 0.01053864, "balance_loss_clip": 1.35503006, "balance_loss_mlp": 1.02592182, "epoch": 0.16317450774086878, "flos": 16475249105280.0, "grad_norm": 2.1529777362633467, "language_loss": 0.80450857, "learning_rate": 3.8168220086361715e-06, "loss": 0.83084106, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.27954102, "step": 2714, "time_per_iteration": 2.847712516784668 }, { "auxiliary_loss_clip": 0.01603275, "auxiliary_loss_mlp": 0.01059504, "balance_loss_clip": 1.37555337, "balance_loss_mlp": 1.03239632, "epoch": 0.16323463099353674, "flos": 24363641525760.0, "grad_norm": 2.078578984082396, "language_loss": 0.79171741, "learning_rate": 3.816659148720702e-06, "loss": 0.81834519, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27148438, "step": 2715, "time_per_iteration": 2.907170295715332 }, { "auxiliary_loss_clip": 0.01598268, "auxiliary_loss_mlp": 0.01047684, "balance_loss_clip": 1.36796737, "balance_loss_mlp": 1.02087343, "epoch": 0.1632947542462047, "flos": 24911784005760.0, "grad_norm": 2.141270755157285, "language_loss": 0.82947874, "learning_rate": 3.816496219917336e-06, "loss": 0.85593826, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.26806641, "step": 2716, "time_per_iteration": 2.920557975769043 }, { "auxiliary_loss_clip": 0.01608257, "auxiliary_loss_mlp": 0.01054143, "balance_loss_clip": 1.37573314, "balance_loss_mlp": 1.02686739, "epoch": 0.1633548774988727, "flos": 24911150578560.0, "grad_norm": 1.6589996224188939, "language_loss": 0.87833977, "learning_rate": 3.816333222232251e-06, "loss": 0.90496373, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27282715, "step": 2717, "time_per_iteration": 2.873380184173584 }, { "auxiliary_loss_clip": 0.01594787, "auxiliary_loss_mlp": 0.01053976, "balance_loss_clip": 1.36740768, "balance_loss_mlp": 1.02686739, "epoch": 0.16341500075154067, "flos": 30452986465920.0, "grad_norm": 3.4529431227198617, "language_loss": 0.77940929, "learning_rate": 3.816170155671629e-06, "loss": 0.80589688, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27087402, "step": 2718, "time_per_iteration": 2.921759843826294 }, { "auxiliary_loss_clip": 0.01608118, "auxiliary_loss_mlp": 0.01055188, "balance_loss_clip": 1.37610018, "balance_loss_mlp": 1.02705431, "epoch": 0.16347512400420863, "flos": 22794963160320.0, "grad_norm": 1.79458183173792, "language_loss": 0.75323707, "learning_rate": 3.816007020241652e-06, "loss": 0.77987015, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.28125, "step": 2719, "time_per_iteration": 2.9134223461151123 }, { "auxiliary_loss_clip": 0.01596169, "auxiliary_loss_mlp": 0.01055139, "balance_loss_clip": 1.36702919, "balance_loss_mlp": 1.02787566, "epoch": 0.1635352472568766, "flos": 22642288646400.0, "grad_norm": 1.6900624207785235, "language_loss": 0.73370445, "learning_rate": 3.815843815948507e-06, "loss": 0.76021749, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27258301, "step": 2720, "time_per_iteration": 2.832446813583374 }, { "auxiliary_loss_clip": 0.01590967, "auxiliary_loss_mlp": 0.01053426, "balance_loss_clip": 1.36393452, "balance_loss_mlp": 1.02510142, "epoch": 0.16359537050954456, "flos": 15531321945600.0, "grad_norm": 2.0157200159257203, "language_loss": 0.76900476, "learning_rate": 3.8156805427983824e-06, "loss": 0.79544872, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.28295898, "step": 2721, "time_per_iteration": 2.861409902572632 }, { "auxiliary_loss_clip": 0.01602785, "auxiliary_loss_mlp": 0.01051729, "balance_loss_clip": 1.37004828, "balance_loss_mlp": 1.02456129, "epoch": 0.16365549376221253, "flos": 22100118480000.0, "grad_norm": 1.893099167812414, "language_loss": 0.79755056, "learning_rate": 3.8155172007974695e-06, "loss": 0.82409573, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27172852, "step": 2722, "time_per_iteration": 2.8554818630218506 }, { "auxiliary_loss_clip": 0.01603838, "auxiliary_loss_mlp": 0.01060137, "balance_loss_clip": 1.36861515, "balance_loss_mlp": 1.02938056, "epoch": 0.1637156170148805, "flos": 24070870552320.0, "grad_norm": 1.938939394878388, "language_loss": 0.86279172, "learning_rate": 3.8153537899519624e-06, "loss": 0.88943148, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.30761719, "step": 2723, "time_per_iteration": 2.9678895473480225 }, { "auxiliary_loss_clip": 0.01579694, "auxiliary_loss_mlp": 0.01045007, "balance_loss_clip": 1.35644579, "balance_loss_mlp": 1.01802969, "epoch": 0.1637757402675485, "flos": 26695991191680.0, "grad_norm": 1.9178513351905633, "language_loss": 0.71912241, "learning_rate": 3.815190310268058e-06, "loss": 0.74536943, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26965332, "step": 2724, "time_per_iteration": 2.8785109519958496 }, { "auxiliary_loss_clip": 0.01588206, "auxiliary_loss_mlp": 0.01051457, "balance_loss_clip": 1.3640275, "balance_loss_mlp": 1.02502787, "epoch": 0.16383586352021645, "flos": 16115461303680.0, "grad_norm": 1.7879332302423578, "language_loss": 0.71879905, "learning_rate": 3.815026761751955e-06, "loss": 0.74519569, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26428223, "step": 2725, "time_per_iteration": 2.8342082500457764 }, { "auxiliary_loss_clip": 0.01592833, "auxiliary_loss_mlp": 0.01042416, "balance_loss_clip": 1.36833858, "balance_loss_mlp": 1.016011, "epoch": 0.16389598677288442, "flos": 19173811334400.0, "grad_norm": 1.7683384916261575, "language_loss": 0.8869341, "learning_rate": 3.814863144409855e-06, "loss": 0.91328663, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26403809, "step": 2726, "time_per_iteration": 2.8463456630706787 }, { "auxiliary_loss_clip": 0.01593928, "auxiliary_loss_mlp": 0.01044843, "balance_loss_clip": 1.36755335, "balance_loss_mlp": 1.01823592, "epoch": 0.16395611002555238, "flos": 21516974507520.0, "grad_norm": 1.8115273600200583, "language_loss": 0.7472235, "learning_rate": 3.814699458247963e-06, "loss": 0.77361119, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26623535, "step": 2727, "time_per_iteration": 2.8463990688323975 }, { "auxiliary_loss_clip": 0.01588306, "auxiliary_loss_mlp": 0.01048298, "balance_loss_clip": 1.36570191, "balance_loss_mlp": 1.02326393, "epoch": 0.16401623327822035, "flos": 21480977629440.0, "grad_norm": 1.5288230464573362, "language_loss": 0.83873481, "learning_rate": 3.8145357032724855e-06, "loss": 0.86510086, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.25048828, "step": 2728, "time_per_iteration": 2.853574275970459 }, { "auxiliary_loss_clip": 0.01608897, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.3768611, "balance_loss_mlp": 1.02181005, "epoch": 0.1640763565308883, "flos": 13634192442240.0, "grad_norm": 2.021904478365285, "language_loss": 0.86451054, "learning_rate": 3.814371879489633e-06, "loss": 0.8910892, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27124023, "step": 2729, "time_per_iteration": 2.8147494792938232 }, { "auxiliary_loss_clip": 0.01597602, "auxiliary_loss_mlp": 0.01051486, "balance_loss_clip": 1.36969411, "balance_loss_mlp": 1.02363825, "epoch": 0.16413647978355628, "flos": 15460142595840.0, "grad_norm": 1.856458318423306, "language_loss": 0.73904687, "learning_rate": 3.814207986905616e-06, "loss": 0.76553774, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.27880859, "step": 2730, "time_per_iteration": 2.8573720455169678 }, { "auxiliary_loss_clip": 0.01603044, "auxiliary_loss_mlp": 0.01054538, "balance_loss_clip": 1.37058663, "balance_loss_mlp": 1.02514148, "epoch": 0.16419660303622427, "flos": 45894759672960.0, "grad_norm": 1.5285417413020395, "language_loss": 0.75369555, "learning_rate": 3.814044025526651e-06, "loss": 0.78027141, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29394531, "step": 2731, "time_per_iteration": 3.055819511413574 }, { "auxiliary_loss_clip": 0.01605237, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.37328148, "balance_loss_mlp": 1.02245712, "epoch": 0.16425672628889224, "flos": 18962083077120.0, "grad_norm": 2.096944475819252, "language_loss": 0.8103013, "learning_rate": 3.8138799953589548e-06, "loss": 0.83685136, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.27319336, "step": 2732, "time_per_iteration": 2.863966703414917 }, { "auxiliary_loss_clip": 0.01596423, "auxiliary_loss_mlp": 0.01051613, "balance_loss_clip": 1.36686659, "balance_loss_mlp": 1.02488601, "epoch": 0.1643168495415602, "flos": 24322803454080.0, "grad_norm": 1.8209462009688147, "language_loss": 0.70347261, "learning_rate": 3.8137158964087473e-06, "loss": 0.72995305, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26733398, "step": 2733, "time_per_iteration": 2.928468704223633 }, { "auxiliary_loss_clip": 0.01604449, "auxiliary_loss_mlp": 0.01060298, "balance_loss_clip": 1.37542129, "balance_loss_mlp": 1.03130674, "epoch": 0.16437697279422817, "flos": 26438583669120.0, "grad_norm": 1.669515368625835, "language_loss": 0.82145083, "learning_rate": 3.8135517286822508e-06, "loss": 0.8480984, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28955078, "step": 2734, "time_per_iteration": 2.9648277759552 }, { "auxiliary_loss_clip": 0.01595056, "auxiliary_loss_mlp": 0.01051337, "balance_loss_clip": 1.36549258, "balance_loss_mlp": 1.02403808, "epoch": 0.16443709604689613, "flos": 34545536311680.0, "grad_norm": 2.2059251172237246, "language_loss": 0.82975954, "learning_rate": 3.8133874921856914e-06, "loss": 0.85622346, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.27294922, "step": 2735, "time_per_iteration": 4.3453216552734375 }, { "auxiliary_loss_clip": 0.01584857, "auxiliary_loss_mlp": 0.01047989, "balance_loss_clip": 1.36327088, "balance_loss_mlp": 1.02187061, "epoch": 0.1644972192995641, "flos": 23268397196160.0, "grad_norm": 2.3714330018903627, "language_loss": 0.79278493, "learning_rate": 3.813223186925296e-06, "loss": 0.81911337, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26098633, "step": 2736, "time_per_iteration": 2.904789447784424 }, { "auxiliary_loss_clip": 0.01604648, "auxiliary_loss_mlp": 0.01060634, "balance_loss_clip": 1.37664306, "balance_loss_mlp": 1.03309703, "epoch": 0.1645573425522321, "flos": 26990843425920.0, "grad_norm": 2.026261997995536, "language_loss": 0.82117724, "learning_rate": 3.8130588129072964e-06, "loss": 0.84783012, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27563477, "step": 2737, "time_per_iteration": 2.9587416648864746 }, { "auxiliary_loss_clip": 0.01608238, "auxiliary_loss_mlp": 0.01052486, "balance_loss_clip": 1.37728405, "balance_loss_mlp": 1.0272851, "epoch": 0.16461746580490005, "flos": 28743759192960.0, "grad_norm": 1.7758283393732583, "language_loss": 0.88713461, "learning_rate": 3.8128943701379246e-06, "loss": 0.91374183, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.25170898, "step": 2738, "time_per_iteration": 2.9515221118927 }, { "auxiliary_loss_clip": 0.0160604, "auxiliary_loss_mlp": 0.01055116, "balance_loss_clip": 1.37575412, "balance_loss_mlp": 1.02800739, "epoch": 0.16467758905756802, "flos": 24939320106240.0, "grad_norm": 1.808680046642556, "language_loss": 0.72810495, "learning_rate": 3.8127298586234167e-06, "loss": 0.75471652, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27148438, "step": 2739, "time_per_iteration": 2.8563249111175537 }, { "auxiliary_loss_clip": 0.01600997, "auxiliary_loss_mlp": 0.01052333, "balance_loss_clip": 1.37344003, "balance_loss_mlp": 1.02562976, "epoch": 0.16473771231023598, "flos": 24837166051200.0, "grad_norm": 1.6378156737256593, "language_loss": 0.82714778, "learning_rate": 3.8125652783700104e-06, "loss": 0.85368109, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.26733398, "step": 2740, "time_per_iteration": 4.390120983123779 }, { "auxiliary_loss_clip": 0.01622298, "auxiliary_loss_mlp": 0.01069029, "balance_loss_clip": 1.38910198, "balance_loss_mlp": 1.04134881, "epoch": 0.16479783556290395, "flos": 39910871658240.0, "grad_norm": 2.177495743261992, "language_loss": 0.70371485, "learning_rate": 3.8124006293839475e-06, "loss": 0.73062813, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 2.33398438, "router_z_loss_mlp": 0.27685547, "step": 2741, "time_per_iteration": 3.0364878177642822 }, { "auxiliary_loss_clip": 0.01626802, "auxiliary_loss_mlp": 0.01063983, "balance_loss_clip": 1.3949697, "balance_loss_mlp": 1.03633881, "epoch": 0.16485795881557191, "flos": 19905422054400.0, "grad_norm": 1.773815238008752, "language_loss": 0.80618739, "learning_rate": 3.812235911671472e-06, "loss": 0.83309525, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27648926, "step": 2742, "time_per_iteration": 2.8167736530303955 }, { "auxiliary_loss_clip": 0.01616064, "auxiliary_loss_mlp": 0.01059975, "balance_loss_clip": 1.38725471, "balance_loss_mlp": 1.03225923, "epoch": 0.16491808206823988, "flos": 20565265242240.0, "grad_norm": 1.6991793314733272, "language_loss": 0.86206609, "learning_rate": 3.8120711252388274e-06, "loss": 0.88882649, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27709961, "step": 2743, "time_per_iteration": 5.794186353683472 }, { "auxiliary_loss_clip": 0.01601915, "auxiliary_loss_mlp": 0.01054882, "balance_loss_clip": 1.37516189, "balance_loss_mlp": 1.02928829, "epoch": 0.16497820532090787, "flos": 23810114914560.0, "grad_norm": 1.5287917868819372, "language_loss": 0.86309063, "learning_rate": 3.811906270092265e-06, "loss": 0.88965857, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.25634766, "step": 2744, "time_per_iteration": 2.83689284324646 }, { "auxiliary_loss_clip": 0.01589881, "auxiliary_loss_mlp": 0.01057203, "balance_loss_clip": 1.36994553, "balance_loss_mlp": 1.03156066, "epoch": 0.16503832857357584, "flos": 25493299165440.0, "grad_norm": 1.7879826868946753, "language_loss": 0.83612132, "learning_rate": 3.811741346238036e-06, "loss": 0.86259222, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25646973, "step": 2745, "time_per_iteration": 2.9046008586883545 }, { "auxiliary_loss_clip": 0.0161549, "auxiliary_loss_mlp": 0.0106651, "balance_loss_clip": 1.38643146, "balance_loss_mlp": 1.03931832, "epoch": 0.1650984518262438, "flos": 17685044565120.0, "grad_norm": 1.8826265246282474, "language_loss": 0.77842623, "learning_rate": 3.8115763536823923e-06, "loss": 0.80524623, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27172852, "step": 2746, "time_per_iteration": 2.894489049911499 }, { "auxiliary_loss_clip": 0.01599921, "auxiliary_loss_mlp": 0.01055336, "balance_loss_clip": 1.37301099, "balance_loss_mlp": 1.02934813, "epoch": 0.16515857507891177, "flos": 18707752200960.0, "grad_norm": 1.6605029303979764, "language_loss": 0.8120116, "learning_rate": 3.811411292431592e-06, "loss": 0.83856416, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26000977, "step": 2747, "time_per_iteration": 2.8995914459228516 }, { "auxiliary_loss_clip": 0.01617245, "auxiliary_loss_mlp": 0.01054455, "balance_loss_clip": 1.39019036, "balance_loss_mlp": 1.02838373, "epoch": 0.16521869833157973, "flos": 15018226202880.0, "grad_norm": 6.6997436522379195, "language_loss": 0.71750724, "learning_rate": 3.8112461624918945e-06, "loss": 0.74422419, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26074219, "step": 2748, "time_per_iteration": 2.8298449516296387 }, { "auxiliary_loss_clip": 0.0160504, "auxiliary_loss_mlp": 0.01059302, "balance_loss_clip": 1.37904882, "balance_loss_mlp": 1.03233707, "epoch": 0.1652788215842477, "flos": 22129690596480.0, "grad_norm": 2.3616412096320434, "language_loss": 0.89128625, "learning_rate": 3.811080963869561e-06, "loss": 0.91792971, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26953125, "step": 2749, "time_per_iteration": 2.833669424057007 }, { "auxiliary_loss_clip": 0.01602778, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.37270153, "balance_loss_mlp": 1.02160788, "epoch": 0.16533894483691566, "flos": 18342308799360.0, "grad_norm": 1.8462377256156128, "language_loss": 0.80379081, "learning_rate": 3.8109156965708557e-06, "loss": 0.83030057, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26623535, "step": 2750, "time_per_iteration": 2.877185106277466 }, { "auxiliary_loss_clip": 0.01598146, "auxiliary_loss_mlp": 0.01049827, "balance_loss_clip": 1.37143171, "balance_loss_mlp": 1.02219439, "epoch": 0.16539906808958366, "flos": 22392165536640.0, "grad_norm": 1.6527178372335332, "language_loss": 0.95802462, "learning_rate": 3.8107503606020455e-06, "loss": 0.9845044, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.27624512, "step": 2751, "time_per_iteration": 2.852721691131592 }, { "auxiliary_loss_clip": 0.01601404, "auxiliary_loss_mlp": 0.01050252, "balance_loss_clip": 1.3777492, "balance_loss_mlp": 1.02342963, "epoch": 0.16545919134225162, "flos": 22721385836160.0, "grad_norm": 2.9745700888473676, "language_loss": 0.71423185, "learning_rate": 3.8105849559693997e-06, "loss": 0.74074841, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26806641, "step": 2752, "time_per_iteration": 2.9071462154388428 }, { "auxiliary_loss_clip": 0.01363652, "auxiliary_loss_mlp": 0.01047124, "balance_loss_clip": 1.22961998, "balance_loss_mlp": 1.02538037, "epoch": 0.1655193145949196, "flos": 67833263151360.0, "grad_norm": 0.7816193247288641, "language_loss": 0.54139882, "learning_rate": 3.810419482679192e-06, "loss": 0.56550658, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.21777344, "step": 2753, "time_per_iteration": 3.4628214836120605 }, { "auxiliary_loss_clip": 0.01598749, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.37384295, "balance_loss_mlp": 1.01654935, "epoch": 0.16557943784758755, "flos": 24291195321600.0, "grad_norm": 2.117350844613627, "language_loss": 0.76228255, "learning_rate": 3.8102539407376954e-06, "loss": 0.78869784, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 2.25, "router_z_loss_mlp": 0.26257324, "step": 2754, "time_per_iteration": 2.8683784008026123 }, { "auxiliary_loss_clip": 0.01617654, "auxiliary_loss_mlp": 0.01054258, "balance_loss_clip": 1.38281131, "balance_loss_mlp": 1.02517056, "epoch": 0.16563956110025552, "flos": 20093098060800.0, "grad_norm": 2.049923216316248, "language_loss": 0.88161379, "learning_rate": 3.810088330151188e-06, "loss": 0.90833288, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.2902832, "step": 2755, "time_per_iteration": 2.913374662399292 }, { "auxiliary_loss_clip": 0.01600084, "auxiliary_loss_mlp": 0.01053088, "balance_loss_clip": 1.37415957, "balance_loss_mlp": 1.02689791, "epoch": 0.16569968435292348, "flos": 28045204439040.0, "grad_norm": 2.171444428920699, "language_loss": 0.74609953, "learning_rate": 3.80992265092595e-06, "loss": 0.77263129, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26184082, "step": 2756, "time_per_iteration": 2.9047093391418457 }, { "auxiliary_loss_clip": 0.01582794, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.3619163, "balance_loss_mlp": 1.02293825, "epoch": 0.16575980760559147, "flos": 26261449701120.0, "grad_norm": 1.525332380133221, "language_loss": 0.7650488, "learning_rate": 3.8097569030682636e-06, "loss": 0.79137075, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.2644043, "step": 2757, "time_per_iteration": 2.9518797397613525 }, { "auxiliary_loss_clip": 0.01598562, "auxiliary_loss_mlp": 0.01043807, "balance_loss_clip": 1.37401974, "balance_loss_mlp": 1.01742578, "epoch": 0.16581993085825944, "flos": 26955479975040.0, "grad_norm": 1.6351623188186124, "language_loss": 0.85588235, "learning_rate": 3.8095910865844137e-06, "loss": 0.88230598, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26403809, "step": 2758, "time_per_iteration": 2.9060680866241455 }, { "auxiliary_loss_clip": 0.01606054, "auxiliary_loss_mlp": 0.01050195, "balance_loss_clip": 1.37945616, "balance_loss_mlp": 1.02381361, "epoch": 0.1658800541109274, "flos": 21663812442240.0, "grad_norm": 1.9506120249273695, "language_loss": 0.80618447, "learning_rate": 3.809425201480689e-06, "loss": 0.83274698, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26379395, "step": 2759, "time_per_iteration": 2.8620259761810303 }, { "auxiliary_loss_clip": 0.01603685, "auxiliary_loss_mlp": 0.01051071, "balance_loss_clip": 1.37382078, "balance_loss_mlp": 1.02452302, "epoch": 0.16594017736359537, "flos": 16444138665600.0, "grad_norm": 2.0966080529369004, "language_loss": 0.7677834, "learning_rate": 3.8092592477633793e-06, "loss": 0.79433095, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26538086, "step": 2760, "time_per_iteration": 2.845219135284424 }, { "auxiliary_loss_clip": 0.01612188, "auxiliary_loss_mlp": 0.01053424, "balance_loss_clip": 1.38032818, "balance_loss_mlp": 1.02557635, "epoch": 0.16600030061626334, "flos": 22647356064000.0, "grad_norm": 1.6984243941316202, "language_loss": 0.74273872, "learning_rate": 3.8090932254387774e-06, "loss": 0.76939487, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.27856445, "step": 2761, "time_per_iteration": 2.850893497467041 }, { "auxiliary_loss_clip": 0.01605943, "auxiliary_loss_mlp": 0.01049256, "balance_loss_clip": 1.37816739, "balance_loss_mlp": 1.02121794, "epoch": 0.1660604238689313, "flos": 26407925677440.0, "grad_norm": 1.800612317312686, "language_loss": 0.89373171, "learning_rate": 3.8089271345131788e-06, "loss": 0.92028368, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.28027344, "step": 2762, "time_per_iteration": 2.905747413635254 }, { "auxiliary_loss_clip": 0.01609356, "auxiliary_loss_mlp": 0.01049375, "balance_loss_clip": 1.3762269, "balance_loss_mlp": 1.0181663, "epoch": 0.16612054712159927, "flos": 23050515646080.0, "grad_norm": 1.6678352312702054, "language_loss": 0.89313686, "learning_rate": 3.8087609749928822e-06, "loss": 0.91972417, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.31225586, "step": 2763, "time_per_iteration": 2.8344943523406982 }, { "auxiliary_loss_clip": 0.01383506, "auxiliary_loss_mlp": 0.01026317, "balance_loss_clip": 1.24781585, "balance_loss_mlp": 1.00915086, "epoch": 0.16618067037426726, "flos": 59272512716160.0, "grad_norm": 0.7818588704811839, "language_loss": 0.59703863, "learning_rate": 3.8085947468841885e-06, "loss": 0.62113678, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.171875, "step": 2764, "time_per_iteration": 3.3896310329437256 }, { "auxiliary_loss_clip": 0.01600294, "auxiliary_loss_mlp": 0.01054392, "balance_loss_clip": 1.37272048, "balance_loss_mlp": 1.02625847, "epoch": 0.16624079362693522, "flos": 27210263299200.0, "grad_norm": 3.0038514368067455, "language_loss": 0.8278631, "learning_rate": 3.808428450193401e-06, "loss": 0.85440993, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.28149414, "step": 2765, "time_per_iteration": 2.929936647415161 }, { "auxiliary_loss_clip": 0.01630398, "auxiliary_loss_mlp": 0.01048708, "balance_loss_clip": 1.39151156, "balance_loss_mlp": 1.01988316, "epoch": 0.1663009168796032, "flos": 10932010629120.0, "grad_norm": 2.2816297940319896, "language_loss": 0.71593374, "learning_rate": 3.8082620849268244e-06, "loss": 0.74272478, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.28796387, "step": 2766, "time_per_iteration": 2.7745814323425293 }, { "auxiliary_loss_clip": 0.01611286, "auxiliary_loss_mlp": 0.01047835, "balance_loss_clip": 1.38481927, "balance_loss_mlp": 1.02034569, "epoch": 0.16636104013227115, "flos": 17903333318400.0, "grad_norm": 2.1142283960677584, "language_loss": 0.90370381, "learning_rate": 3.808095651090769e-06, "loss": 0.93029505, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27453613, "step": 2767, "time_per_iteration": 2.8963611125946045 }, { "auxiliary_loss_clip": 0.01375673, "auxiliary_loss_mlp": 0.0102693, "balance_loss_clip": 1.24150515, "balance_loss_mlp": 1.00995469, "epoch": 0.16642116338493912, "flos": 66760641221760.0, "grad_norm": 0.6493214302939047, "language_loss": 0.53006876, "learning_rate": 3.8079291486915447e-06, "loss": 0.55409479, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.16992188, "step": 2768, "time_per_iteration": 3.4092953205108643 }, { "auxiliary_loss_clip": 0.01629141, "auxiliary_loss_mlp": 0.01046672, "balance_loss_clip": 1.39357114, "balance_loss_mlp": 1.01930106, "epoch": 0.16648128663760708, "flos": 19035388932480.0, "grad_norm": 2.221244132513017, "language_loss": 0.86572361, "learning_rate": 3.8077625777354667e-06, "loss": 0.89248168, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27380371, "step": 2769, "time_per_iteration": 2.875262975692749 }, { "auxiliary_loss_clip": 0.01378907, "auxiliary_loss_mlp": 0.01039843, "balance_loss_clip": 1.24386811, "balance_loss_mlp": 1.01924348, "epoch": 0.16654140989027508, "flos": 70165069879680.0, "grad_norm": 0.8256878176546514, "language_loss": 0.5762465, "learning_rate": 3.80759593822885e-06, "loss": 0.60043406, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.20605469, "step": 2770, "time_per_iteration": 4.558475017547607 }, { "auxiliary_loss_clip": 0.01371159, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.23561716, "balance_loss_mlp": 1.01177144, "epoch": 0.16660153314294304, "flos": 70300325145600.0, "grad_norm": 0.8669892154633343, "language_loss": 0.56280625, "learning_rate": 3.807429230178015e-06, "loss": 0.58684343, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.20800781, "step": 2771, "time_per_iteration": 3.1377315521240234 }, { "auxiliary_loss_clip": 0.0161457, "auxiliary_loss_mlp": 0.01058525, "balance_loss_clip": 1.38789535, "balance_loss_mlp": 1.02974844, "epoch": 0.166661656395611, "flos": 23085200424960.0, "grad_norm": 1.9914496041899008, "language_loss": 0.71804589, "learning_rate": 3.8072624535892817e-06, "loss": 0.74477684, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2878418, "step": 2772, "time_per_iteration": 2.93401837348938 }, { "auxiliary_loss_clip": 0.01599499, "auxiliary_loss_mlp": 0.01049049, "balance_loss_clip": 1.37349856, "balance_loss_mlp": 1.02048635, "epoch": 0.16672177964827897, "flos": 28378089567360.0, "grad_norm": 1.7744444628120086, "language_loss": 0.87364137, "learning_rate": 3.807095608468975e-06, "loss": 0.90012681, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28564453, "step": 2773, "time_per_iteration": 2.962458848953247 }, { "auxiliary_loss_clip": 0.016082, "auxiliary_loss_mlp": 0.01050883, "balance_loss_clip": 1.38043809, "balance_loss_mlp": 1.02253485, "epoch": 0.16678190290094694, "flos": 19098288483840.0, "grad_norm": 2.9035173147030573, "language_loss": 0.82424635, "learning_rate": 3.8069286948234224e-06, "loss": 0.85083717, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28320312, "step": 2774, "time_per_iteration": 2.8210043907165527 }, { "auxiliary_loss_clip": 0.0161443, "auxiliary_loss_mlp": 0.01052821, "balance_loss_clip": 1.38447618, "balance_loss_mlp": 1.02337623, "epoch": 0.1668420261536149, "flos": 21809202543360.0, "grad_norm": 2.6816967603304107, "language_loss": 0.84382415, "learning_rate": 3.806761712658952e-06, "loss": 0.87049669, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.29443359, "step": 2775, "time_per_iteration": 2.9129908084869385 }, { "auxiliary_loss_clip": 0.01601791, "auxiliary_loss_mlp": 0.01059828, "balance_loss_clip": 1.37349606, "balance_loss_mlp": 1.03233802, "epoch": 0.16690214940628287, "flos": 19071883503360.0, "grad_norm": 1.7854333428420543, "language_loss": 0.81832075, "learning_rate": 3.806594661981897e-06, "loss": 0.84493691, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27490234, "step": 2776, "time_per_iteration": 4.377525091171265 }, { "auxiliary_loss_clip": 0.01594496, "auxiliary_loss_mlp": 0.01051699, "balance_loss_clip": 1.37337494, "balance_loss_mlp": 1.02389908, "epoch": 0.16696227265895086, "flos": 18597861285120.0, "grad_norm": 1.776314486351668, "language_loss": 0.80625343, "learning_rate": 3.8064275427985906e-06, "loss": 0.83271539, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27832031, "step": 2777, "time_per_iteration": 2.8709166049957275 }, { "auxiliary_loss_clip": 0.01609522, "auxiliary_loss_mlp": 0.01055268, "balance_loss_clip": 1.38025033, "balance_loss_mlp": 1.02638376, "epoch": 0.16702239591161883, "flos": 23304394074240.0, "grad_norm": 1.7477294245594004, "language_loss": 0.85741556, "learning_rate": 3.806260355115371e-06, "loss": 0.88406348, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28894043, "step": 2778, "time_per_iteration": 4.250521183013916 }, { "auxiliary_loss_clip": 0.016164, "auxiliary_loss_mlp": 0.01053506, "balance_loss_clip": 1.38450801, "balance_loss_mlp": 1.02558708, "epoch": 0.1670825191642868, "flos": 24436313953920.0, "grad_norm": 1.9941612244790587, "language_loss": 0.75028366, "learning_rate": 3.8060930989385778e-06, "loss": 0.77698267, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27929688, "step": 2779, "time_per_iteration": 4.389432191848755 }, { "auxiliary_loss_clip": 0.01619454, "auxiliary_loss_mlp": 0.01056769, "balance_loss_clip": 1.38575304, "balance_loss_mlp": 1.02734804, "epoch": 0.16714264241695476, "flos": 26808868264320.0, "grad_norm": 1.9632521668421017, "language_loss": 0.66767561, "learning_rate": 3.805925774274554e-06, "loss": 0.69443786, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.29418945, "step": 2780, "time_per_iteration": 2.913623094558716 }, { "auxiliary_loss_clip": 0.01611811, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.38283253, "balance_loss_mlp": 1.02226758, "epoch": 0.16720276566962272, "flos": 21845335155840.0, "grad_norm": 2.2742941893791433, "language_loss": 0.79272687, "learning_rate": 3.805758381129643e-06, "loss": 0.81934893, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28125, "step": 2781, "time_per_iteration": 2.8073558807373047 }, { "auxiliary_loss_clip": 0.01616329, "auxiliary_loss_mlp": 0.01044999, "balance_loss_clip": 1.38484335, "balance_loss_mlp": 1.01754498, "epoch": 0.1672628889222907, "flos": 21480344202240.0, "grad_norm": 1.4941241268741021, "language_loss": 0.75631189, "learning_rate": 3.805590919510193e-06, "loss": 0.78292513, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.27441406, "step": 2782, "time_per_iteration": 2.857926607131958 }, { "auxiliary_loss_clip": 0.01634169, "auxiliary_loss_mlp": 0.01050465, "balance_loss_clip": 1.39194274, "balance_loss_mlp": 1.02248621, "epoch": 0.16732301217495865, "flos": 30786278797440.0, "grad_norm": 2.01513793046619, "language_loss": 0.68764257, "learning_rate": 3.8054233894225547e-06, "loss": 0.71448886, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.27966309, "step": 2783, "time_per_iteration": 2.880960464477539 }, { "auxiliary_loss_clip": 0.01610119, "auxiliary_loss_mlp": 0.01049322, "balance_loss_clip": 1.38040268, "balance_loss_mlp": 1.0232029, "epoch": 0.16738313542762664, "flos": 23484423709440.0, "grad_norm": 1.678104323982807, "language_loss": 0.70707327, "learning_rate": 3.805255790873081e-06, "loss": 0.73366761, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26135254, "step": 2784, "time_per_iteration": 2.884366035461426 }, { "auxiliary_loss_clip": 0.01609809, "auxiliary_loss_mlp": 0.01054782, "balance_loss_clip": 1.3765316, "balance_loss_mlp": 1.02497923, "epoch": 0.1674432586802946, "flos": 29801151607680.0, "grad_norm": 1.8626186891941043, "language_loss": 0.62335467, "learning_rate": 3.805088123868126e-06, "loss": 0.65000057, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.29785156, "step": 2785, "time_per_iteration": 2.8714518547058105 }, { "auxiliary_loss_clip": 0.0133823, "auxiliary_loss_mlp": 0.0105088, "balance_loss_clip": 1.20751691, "balance_loss_mlp": 1.03533506, "epoch": 0.16750338193296258, "flos": 66168900737280.0, "grad_norm": 0.8034414241294885, "language_loss": 0.589607, "learning_rate": 3.8049203884140492e-06, "loss": 0.61349815, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 0.15527344, "step": 2786, "time_per_iteration": 3.4300501346588135 }, { "auxiliary_loss_clip": 0.01623086, "auxiliary_loss_mlp": 0.01053165, "balance_loss_clip": 1.38767767, "balance_loss_mlp": 1.0249958, "epoch": 0.16756350518563054, "flos": 25706249032320.0, "grad_norm": 1.6358852934009838, "language_loss": 0.77025211, "learning_rate": 3.80475258451721e-06, "loss": 0.79701459, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28137207, "step": 2787, "time_per_iteration": 2.9150922298431396 }, { "auxiliary_loss_clip": 0.01624278, "auxiliary_loss_mlp": 0.01054225, "balance_loss_clip": 1.39066386, "balance_loss_mlp": 1.02712882, "epoch": 0.1676236284382985, "flos": 23845116407040.0, "grad_norm": 1.86449052251365, "language_loss": 0.78385091, "learning_rate": 3.804584712183972e-06, "loss": 0.81063592, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27111816, "step": 2788, "time_per_iteration": 2.8436031341552734 }, { "auxiliary_loss_clip": 0.01350726, "auxiliary_loss_mlp": 0.01065554, "balance_loss_clip": 1.21678388, "balance_loss_mlp": 1.04953194, "epoch": 0.16768375169096647, "flos": 59900367830400.0, "grad_norm": 0.8723305933151231, "language_loss": 0.59402454, "learning_rate": 3.8044167714207013e-06, "loss": 0.61818731, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.16015625, "step": 2789, "time_per_iteration": 3.2209463119506836 }, { "auxiliary_loss_clip": 0.01627909, "auxiliary_loss_mlp": 0.01069503, "balance_loss_clip": 1.39139032, "balance_loss_mlp": 1.03905678, "epoch": 0.16774387494363446, "flos": 38449821968640.0, "grad_norm": 1.4652685085799015, "language_loss": 0.71048516, "learning_rate": 3.804248762233765e-06, "loss": 0.7374593, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.30444336, "step": 2790, "time_per_iteration": 3.08988881111145 }, { "auxiliary_loss_clip": 0.01626645, "auxiliary_loss_mlp": 0.01068408, "balance_loss_clip": 1.39407873, "balance_loss_mlp": 1.0415504, "epoch": 0.16780399819630243, "flos": 22647763267200.0, "grad_norm": 1.8347936673068679, "language_loss": 0.80022144, "learning_rate": 3.8040806846295356e-06, "loss": 0.82717198, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.26855469, "step": 2791, "time_per_iteration": 2.8578810691833496 }, { "auxiliary_loss_clip": 0.01625309, "auxiliary_loss_mlp": 0.01070108, "balance_loss_clip": 1.39460397, "balance_loss_mlp": 1.04159367, "epoch": 0.1678641214489704, "flos": 32904683210880.0, "grad_norm": 1.6821492542686085, "language_loss": 0.72790956, "learning_rate": 3.8039125386143853e-06, "loss": 0.75486368, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28491211, "step": 2792, "time_per_iteration": 3.012328624725342 }, { "auxiliary_loss_clip": 0.01614637, "auxiliary_loss_mlp": 0.01064513, "balance_loss_clip": 1.38145101, "balance_loss_mlp": 1.03709435, "epoch": 0.16792424470163836, "flos": 19984564488960.0, "grad_norm": 1.890484521125794, "language_loss": 0.72768176, "learning_rate": 3.803744324194691e-06, "loss": 0.75447333, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27441406, "step": 2793, "time_per_iteration": 2.834085464477539 }, { "auxiliary_loss_clip": 0.01615966, "auxiliary_loss_mlp": 0.01063767, "balance_loss_clip": 1.38325179, "balance_loss_mlp": 1.03569293, "epoch": 0.16798436795430632, "flos": 19729554940800.0, "grad_norm": 1.8527835437950408, "language_loss": 0.78148866, "learning_rate": 3.803576041376831e-06, "loss": 0.80828601, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.28088379, "step": 2794, "time_per_iteration": 2.8562545776367188 }, { "auxiliary_loss_clip": 0.01617735, "auxiliary_loss_mlp": 0.01057524, "balance_loss_clip": 1.38319361, "balance_loss_mlp": 1.02918768, "epoch": 0.1680444912069743, "flos": 28114935955200.0, "grad_norm": 2.107896202389036, "language_loss": 0.72491789, "learning_rate": 3.803407690167187e-06, "loss": 0.75167048, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.2833252, "step": 2795, "time_per_iteration": 2.916116714477539 }, { "auxiliary_loss_clip": 0.01600953, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.37129819, "balance_loss_mlp": 1.03312504, "epoch": 0.16810461445964225, "flos": 18083724912000.0, "grad_norm": 1.9095803018111115, "language_loss": 0.84981704, "learning_rate": 3.803239270572142e-06, "loss": 0.87643456, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27661133, "step": 2796, "time_per_iteration": 2.8631703853607178 }, { "auxiliary_loss_clip": 0.01631265, "auxiliary_loss_mlp": 0.01060051, "balance_loss_clip": 1.3931191, "balance_loss_mlp": 1.03217983, "epoch": 0.16816473771231025, "flos": 23889257349120.0, "grad_norm": 2.156130284384972, "language_loss": 0.83095014, "learning_rate": 3.8030707825980838e-06, "loss": 0.85786331, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.27868652, "step": 2797, "time_per_iteration": 2.881422996520996 }, { "auxiliary_loss_clip": 0.01592851, "auxiliary_loss_mlp": 0.01050881, "balance_loss_clip": 1.36933303, "balance_loss_mlp": 1.02471471, "epoch": 0.1682248609649782, "flos": 22793786795520.0, "grad_norm": 1.409335606914219, "language_loss": 0.76224887, "learning_rate": 3.802902226251401e-06, "loss": 0.78868616, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26171875, "step": 2798, "time_per_iteration": 2.8413352966308594 }, { "auxiliary_loss_clip": 0.01623268, "auxiliary_loss_mlp": 0.01052766, "balance_loss_clip": 1.39022005, "balance_loss_mlp": 1.02570581, "epoch": 0.16828498421764618, "flos": 20715089333760.0, "grad_norm": 1.392369436209934, "language_loss": 0.80759901, "learning_rate": 3.8027336015384845e-06, "loss": 0.83435929, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27099609, "step": 2799, "time_per_iteration": 2.871710777282715 }, { "auxiliary_loss_clip": 0.0162851, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.39297366, "balance_loss_mlp": 1.01917887, "epoch": 0.16834510747031414, "flos": 29431636174080.0, "grad_norm": 2.0399396974007438, "language_loss": 0.71797371, "learning_rate": 3.8025649084657296e-06, "loss": 0.74473894, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.28808594, "step": 2800, "time_per_iteration": 2.903916358947754 }, { "auxiliary_loss_clip": 0.01619026, "auxiliary_loss_mlp": 0.01049481, "balance_loss_clip": 1.38622439, "balance_loss_mlp": 1.02126408, "epoch": 0.1684052307229821, "flos": 18153184959360.0, "grad_norm": 1.973732544480156, "language_loss": 0.847247, "learning_rate": 3.8023961470395326e-06, "loss": 0.87393206, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.28186035, "step": 2801, "time_per_iteration": 2.8598334789276123 }, { "auxiliary_loss_clip": 0.01630832, "auxiliary_loss_mlp": 0.01052073, "balance_loss_clip": 1.39588833, "balance_loss_mlp": 1.02515531, "epoch": 0.16846535397565007, "flos": 16581927640320.0, "grad_norm": 2.1392848212626583, "language_loss": 0.8443954, "learning_rate": 3.8022273172662933e-06, "loss": 0.87122446, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.26953125, "step": 2802, "time_per_iteration": 2.815573215484619 }, { "auxiliary_loss_clip": 0.01629104, "auxiliary_loss_mlp": 0.01057139, "balance_loss_clip": 1.39627147, "balance_loss_mlp": 1.02783751, "epoch": 0.16852547722831807, "flos": 30420971130240.0, "grad_norm": 1.4172072361107468, "language_loss": 0.82270634, "learning_rate": 3.802058419152413e-06, "loss": 0.84956872, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.29321289, "step": 2803, "time_per_iteration": 2.9673140048980713 }, { "auxiliary_loss_clip": 0.01616471, "auxiliary_loss_mlp": 0.01044616, "balance_loss_clip": 1.38573802, "balance_loss_mlp": 1.0167923, "epoch": 0.16858560048098603, "flos": 33519616295040.0, "grad_norm": 2.4350058750240655, "language_loss": 0.77881527, "learning_rate": 3.801889452704297e-06, "loss": 0.80542612, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.27832031, "step": 2804, "time_per_iteration": 2.990938901901245 }, { "auxiliary_loss_clip": 0.01326541, "auxiliary_loss_mlp": 0.01051316, "balance_loss_clip": 1.19291353, "balance_loss_mlp": 1.02747393, "epoch": 0.168645723733654, "flos": 67402069776000.0, "grad_norm": 0.873745274085285, "language_loss": 0.55485404, "learning_rate": 3.8017204179283526e-06, "loss": 0.57863265, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.23828125, "step": 2805, "time_per_iteration": 4.784535884857178 }, { "auxiliary_loss_clip": 0.01608137, "auxiliary_loss_mlp": 0.01047849, "balance_loss_clip": 1.38376284, "balance_loss_mlp": 1.02176571, "epoch": 0.16870584698632196, "flos": 21334546897920.0, "grad_norm": 1.904249226389861, "language_loss": 0.74102151, "learning_rate": 3.8015513148309892e-06, "loss": 0.76758146, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.26123047, "step": 2806, "time_per_iteration": 2.864797830581665 }, { "auxiliary_loss_clip": 0.01596721, "auxiliary_loss_mlp": 0.01049811, "balance_loss_clip": 1.37034154, "balance_loss_mlp": 1.0219872, "epoch": 0.16876597023898993, "flos": 20750498029440.0, "grad_norm": 1.7486331085808124, "language_loss": 0.71319467, "learning_rate": 3.80138214341862e-06, "loss": 0.73966002, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.2779541, "step": 2807, "time_per_iteration": 2.9024243354797363 }, { "auxiliary_loss_clip": 0.01606184, "auxiliary_loss_mlp": 0.01049983, "balance_loss_clip": 1.37690067, "balance_loss_mlp": 1.02142072, "epoch": 0.1688260934916579, "flos": 20313287095680.0, "grad_norm": 2.3214225518164673, "language_loss": 0.7161938, "learning_rate": 3.8012129036976587e-06, "loss": 0.74275547, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.28552246, "step": 2808, "time_per_iteration": 2.837751865386963 }, { "auxiliary_loss_clip": 0.01628368, "auxiliary_loss_mlp": 0.01049174, "balance_loss_clip": 1.3933022, "balance_loss_mlp": 1.02006352, "epoch": 0.16888621674432586, "flos": 20350686562560.0, "grad_norm": 2.161041544060095, "language_loss": 0.81397098, "learning_rate": 3.8010435956745236e-06, "loss": 0.8407464, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.29150391, "step": 2809, "time_per_iteration": 2.8653998374938965 }, { "auxiliary_loss_clip": 0.01628159, "auxiliary_loss_mlp": 0.0105088, "balance_loss_clip": 1.3919313, "balance_loss_mlp": 1.02324688, "epoch": 0.16894633999699385, "flos": 16251259507200.0, "grad_norm": 1.9906322657510123, "language_loss": 0.88974905, "learning_rate": 3.8008742193556358e-06, "loss": 0.91653949, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27661133, "step": 2810, "time_per_iteration": 2.8020527362823486 }, { "auxiliary_loss_clip": 0.01621038, "auxiliary_loss_mlp": 0.01057835, "balance_loss_clip": 1.38579369, "balance_loss_mlp": 1.03001189, "epoch": 0.16900646324966181, "flos": 19619754514560.0, "grad_norm": 1.7612844260987612, "language_loss": 0.93506283, "learning_rate": 3.800704774747416e-06, "loss": 0.9618516, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.27807617, "step": 2811, "time_per_iteration": 4.2328972816467285 }, { "auxiliary_loss_clip": 0.01620167, "auxiliary_loss_mlp": 0.01049526, "balance_loss_clip": 1.38676262, "balance_loss_mlp": 1.02253687, "epoch": 0.16906658650232978, "flos": 22028260458240.0, "grad_norm": 2.02111539014105, "language_loss": 0.79789007, "learning_rate": 3.800535261856291e-06, "loss": 0.82458705, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27026367, "step": 2812, "time_per_iteration": 2.869966983795166 }, { "auxiliary_loss_clip": 0.01623165, "auxiliary_loss_mlp": 0.01053876, "balance_loss_clip": 1.39141703, "balance_loss_mlp": 1.02750695, "epoch": 0.16912670975499774, "flos": 11770707087360.0, "grad_norm": 2.072021587414466, "language_loss": 0.7617147, "learning_rate": 3.8003656806886887e-06, "loss": 0.78848505, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26367188, "step": 2813, "time_per_iteration": 4.243016958236694 }, { "auxiliary_loss_clip": 0.01633459, "auxiliary_loss_mlp": 0.01056463, "balance_loss_clip": 1.39576721, "balance_loss_mlp": 1.02873504, "epoch": 0.1691868330076657, "flos": 17169234134400.0, "grad_norm": 4.409729988489119, "language_loss": 0.70997715, "learning_rate": 3.8001960312510396e-06, "loss": 0.73687631, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.27758789, "step": 2814, "time_per_iteration": 4.360554218292236 }, { "auxiliary_loss_clip": 0.01623903, "auxiliary_loss_mlp": 0.01057984, "balance_loss_clip": 1.39090264, "balance_loss_mlp": 1.03001738, "epoch": 0.16924695626033368, "flos": 22426081153920.0, "grad_norm": 1.7123896977945778, "language_loss": 0.63036692, "learning_rate": 3.800026313549776e-06, "loss": 0.65718585, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27954102, "step": 2815, "time_per_iteration": 2.8277604579925537 }, { "auxiliary_loss_clip": 0.01612454, "auxiliary_loss_mlp": 0.01058628, "balance_loss_clip": 1.38140047, "balance_loss_mlp": 1.03085256, "epoch": 0.16930707951300164, "flos": 25750797177600.0, "grad_norm": 1.531774173569356, "language_loss": 0.83058381, "learning_rate": 3.7998565275913342e-06, "loss": 0.85729462, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27783203, "step": 2816, "time_per_iteration": 2.9322831630706787 }, { "auxiliary_loss_clip": 0.01631358, "auxiliary_loss_mlp": 0.01056747, "balance_loss_clip": 1.39629841, "balance_loss_mlp": 1.02782726, "epoch": 0.16936720276566963, "flos": 22757201735040.0, "grad_norm": 1.9634597170648742, "language_loss": 0.89029086, "learning_rate": 3.799686673382153e-06, "loss": 0.9171719, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.28955078, "step": 2817, "time_per_iteration": 2.8612382411956787 }, { "auxiliary_loss_clip": 0.01605863, "auxiliary_loss_mlp": 0.01068504, "balance_loss_clip": 1.37635565, "balance_loss_mlp": 1.04003668, "epoch": 0.1694273260183376, "flos": 19583621902080.0, "grad_norm": 1.6186674899766902, "language_loss": 0.82643259, "learning_rate": 3.799516750928672e-06, "loss": 0.85317624, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.28491211, "step": 2818, "time_per_iteration": 2.8798255920410156 }, { "auxiliary_loss_clip": 0.01621188, "auxiliary_loss_mlp": 0.01057403, "balance_loss_clip": 1.38933063, "balance_loss_mlp": 1.0305804, "epoch": 0.16948744927100556, "flos": 12465280298880.0, "grad_norm": 2.731194865262301, "language_loss": 0.82945752, "learning_rate": 3.799346760237336e-06, "loss": 0.85624343, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.26818848, "step": 2819, "time_per_iteration": 2.819330930709839 }, { "auxiliary_loss_clip": 0.01347946, "auxiliary_loss_mlp": 0.01082433, "balance_loss_clip": 1.21149695, "balance_loss_mlp": 1.06402719, "epoch": 0.16954757252367353, "flos": 71319277463040.0, "grad_norm": 0.9517858556259979, "language_loss": 0.61326921, "learning_rate": 3.7991767013145902e-06, "loss": 0.637573, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.18359375, "step": 2820, "time_per_iteration": 3.2812106609344482 }, { "auxiliary_loss_clip": 0.01639983, "auxiliary_loss_mlp": 0.01062917, "balance_loss_clip": 1.40477633, "balance_loss_mlp": 1.03591573, "epoch": 0.1696076957763415, "flos": 29618497774080.0, "grad_norm": 2.4370234912406183, "language_loss": 0.79150999, "learning_rate": 3.7990065741668844e-06, "loss": 0.81853902, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26989746, "step": 2821, "time_per_iteration": 2.9248547554016113 }, { "auxiliary_loss_clip": 0.01633746, "auxiliary_loss_mlp": 0.01062869, "balance_loss_clip": 1.3996464, "balance_loss_mlp": 1.03440154, "epoch": 0.16966781902900946, "flos": 24399095466240.0, "grad_norm": 1.9089503889405433, "language_loss": 0.79614437, "learning_rate": 3.7988363788006685e-06, "loss": 0.82311058, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.28430176, "step": 2822, "time_per_iteration": 2.869173526763916 }, { "auxiliary_loss_clip": 0.01610703, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.3819344, "balance_loss_mlp": 1.02792811, "epoch": 0.16972794228167745, "flos": 23049022567680.0, "grad_norm": 1.773372201814986, "language_loss": 0.75966936, "learning_rate": 3.7986661152223967e-06, "loss": 0.78633153, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.27612305, "step": 2823, "time_per_iteration": 2.8967485427856445 }, { "auxiliary_loss_clip": 0.01643595, "auxiliary_loss_mlp": 0.01067333, "balance_loss_clip": 1.40696633, "balance_loss_mlp": 1.03860354, "epoch": 0.16978806553434542, "flos": 35243005190400.0, "grad_norm": 1.752924672250813, "language_loss": 0.60586309, "learning_rate": 3.7984957834385257e-06, "loss": 0.63297242, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.28723145, "step": 2824, "time_per_iteration": 2.9743940830230713 }, { "auxiliary_loss_clip": 0.01627224, "auxiliary_loss_mlp": 0.01059007, "balance_loss_clip": 1.39524722, "balance_loss_mlp": 1.03007519, "epoch": 0.16984818878701338, "flos": 32026822738560.0, "grad_norm": 2.332364146939632, "language_loss": 0.74103343, "learning_rate": 3.7983253834555144e-06, "loss": 0.7678957, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.28942871, "step": 2825, "time_per_iteration": 2.9573376178741455 }, { "auxiliary_loss_clip": 0.01666574, "auxiliary_loss_mlp": 0.01067032, "balance_loss_clip": 1.42101514, "balance_loss_mlp": 1.03937578, "epoch": 0.16990831203968135, "flos": 22828426329600.0, "grad_norm": 1.9742259902937975, "language_loss": 0.86451685, "learning_rate": 3.7981549152798245e-06, "loss": 0.89185292, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 2.453125, "router_z_loss_mlp": 0.2767334, "step": 2826, "time_per_iteration": 2.94449520111084 }, { "auxiliary_loss_clip": 0.01640563, "auxiliary_loss_mlp": 0.01060833, "balance_loss_clip": 1.40151918, "balance_loss_mlp": 1.03349829, "epoch": 0.1699684352923493, "flos": 23050017953280.0, "grad_norm": 1.725359448694524, "language_loss": 0.8317672, "learning_rate": 3.7979843789179196e-06, "loss": 0.85878122, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 2.38671875, "router_z_loss_mlp": 0.27331543, "step": 2827, "time_per_iteration": 2.8417346477508545 }, { "auxiliary_loss_clip": 0.01647723, "auxiliary_loss_mlp": 0.01060087, "balance_loss_clip": 1.40631211, "balance_loss_mlp": 1.03167963, "epoch": 0.17002855854501728, "flos": 21444075855360.0, "grad_norm": 1.7150798976900403, "language_loss": 0.74646032, "learning_rate": 3.797813774376267e-06, "loss": 0.77353841, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 2.41015625, "router_z_loss_mlp": 0.28405762, "step": 2828, "time_per_iteration": 2.9025771617889404 }, { "auxiliary_loss_clip": 0.01341808, "auxiliary_loss_mlp": 0.01017136, "balance_loss_clip": 1.20686519, "balance_loss_mlp": 1.00082779, "epoch": 0.17008868179768524, "flos": 71485253948160.0, "grad_norm": 0.7614167373342754, "language_loss": 0.56567609, "learning_rate": 3.797643101661336e-06, "loss": 0.58926558, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.16308594, "step": 2829, "time_per_iteration": 3.401987075805664 }, { "auxiliary_loss_clip": 0.01622315, "auxiliary_loss_mlp": 0.01059285, "balance_loss_clip": 1.38945925, "balance_loss_mlp": 1.03290427, "epoch": 0.17014880505035324, "flos": 24911195823360.0, "grad_norm": 1.748877294333387, "language_loss": 0.84664994, "learning_rate": 3.7974723607795983e-06, "loss": 0.87346596, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.26391602, "step": 2830, "time_per_iteration": 2.9089419841766357 }, { "auxiliary_loss_clip": 0.01640631, "auxiliary_loss_mlp": 0.010556, "balance_loss_clip": 1.40171957, "balance_loss_mlp": 1.02644145, "epoch": 0.1702089283030212, "flos": 29874593197440.0, "grad_norm": 2.4733871200220263, "language_loss": 0.79934978, "learning_rate": 3.797301551737529e-06, "loss": 0.82631207, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 0.29174805, "step": 2831, "time_per_iteration": 2.9880707263946533 }, { "auxiliary_loss_clip": 0.01647137, "auxiliary_loss_mlp": 0.01051107, "balance_loss_clip": 1.40727878, "balance_loss_mlp": 1.02371264, "epoch": 0.17026905155568917, "flos": 17751925658880.0, "grad_norm": 1.9290870453598357, "language_loss": 0.8064847, "learning_rate": 3.7971306745416044e-06, "loss": 0.83346713, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 2.3984375, "router_z_loss_mlp": 0.27392578, "step": 2832, "time_per_iteration": 2.889906644821167 }, { "auxiliary_loss_clip": 0.01623735, "auxiliary_loss_mlp": 0.01061064, "balance_loss_clip": 1.38978839, "balance_loss_mlp": 1.03341961, "epoch": 0.17032917480835713, "flos": 23159003973120.0, "grad_norm": 1.6185858591342732, "language_loss": 0.89880717, "learning_rate": 3.7969597291983046e-06, "loss": 0.92565513, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27648926, "step": 2833, "time_per_iteration": 2.9115099906921387 }, { "auxiliary_loss_clip": 0.01622766, "auxiliary_loss_mlp": 0.01058704, "balance_loss_clip": 1.3910799, "balance_loss_mlp": 1.0319773, "epoch": 0.1703892980610251, "flos": 39217112853120.0, "grad_norm": 2.090492320068467, "language_loss": 0.74131179, "learning_rate": 3.7967887157141115e-06, "loss": 0.76812649, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26733398, "step": 2834, "time_per_iteration": 3.008330821990967 }, { "auxiliary_loss_clip": 0.0162429, "auxiliary_loss_mlp": 0.01060576, "balance_loss_clip": 1.38863993, "balance_loss_mlp": 1.0328958, "epoch": 0.17044942131369306, "flos": 23049384526080.0, "grad_norm": 2.6425822801442584, "language_loss": 0.87634879, "learning_rate": 3.7966176340955106e-06, "loss": 0.90319741, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.27661133, "step": 2835, "time_per_iteration": 2.8872783184051514 }, { "auxiliary_loss_clip": 0.01632597, "auxiliary_loss_mlp": 0.0106033, "balance_loss_clip": 1.3909992, "balance_loss_mlp": 1.03213751, "epoch": 0.17050954456636103, "flos": 17063596229760.0, "grad_norm": 2.3010789670334524, "language_loss": 0.75271428, "learning_rate": 3.796446484348989e-06, "loss": 0.77964354, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 2.41796875, "router_z_loss_mlp": 0.28173828, "step": 2836, "time_per_iteration": 2.81679105758667 }, { "auxiliary_loss_clip": 0.01646348, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.40619111, "balance_loss_mlp": 1.02086174, "epoch": 0.17056966781902902, "flos": 16845759924480.0, "grad_norm": 2.1592794453561304, "language_loss": 0.80802172, "learning_rate": 3.796275266481036e-06, "loss": 0.83497494, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 2.40039062, "router_z_loss_mlp": 0.28100586, "step": 2837, "time_per_iteration": 2.832899570465088 }, { "auxiliary_loss_clip": 0.01612678, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.38628578, "balance_loss_mlp": 1.02611804, "epoch": 0.17062979107169698, "flos": 17721539136000.0, "grad_norm": 1.8882289705037207, "language_loss": 0.84764183, "learning_rate": 3.7961039804981456e-06, "loss": 0.87430799, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27832031, "step": 2838, "time_per_iteration": 2.9394476413726807 }, { "auxiliary_loss_clip": 0.0162409, "auxiliary_loss_mlp": 0.01052391, "balance_loss_clip": 1.3926059, "balance_loss_mlp": 1.02525949, "epoch": 0.17068991432436495, "flos": 22534343256960.0, "grad_norm": 1.6464511963149713, "language_loss": 0.94335306, "learning_rate": 3.795932626406812e-06, "loss": 0.97011787, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27124023, "step": 2839, "time_per_iteration": 4.3152430057525635 }, { "auxiliary_loss_clip": 0.01642902, "auxiliary_loss_mlp": 0.01055714, "balance_loss_clip": 1.40643537, "balance_loss_mlp": 1.02674603, "epoch": 0.17075003757703291, "flos": 25893427345920.0, "grad_norm": 1.8042351478714167, "language_loss": 0.84555954, "learning_rate": 3.7957612042135336e-06, "loss": 0.87254572, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28955078, "step": 2840, "time_per_iteration": 2.87135910987854 }, { "auxiliary_loss_clip": 0.01624254, "auxiliary_loss_mlp": 0.01056115, "balance_loss_clip": 1.39069688, "balance_loss_mlp": 1.02926922, "epoch": 0.17081016082970088, "flos": 20130361793280.0, "grad_norm": 2.0316283431778817, "language_loss": 0.77820551, "learning_rate": 3.79558971392481e-06, "loss": 0.80500925, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26855469, "step": 2841, "time_per_iteration": 2.852778673171997 }, { "auxiliary_loss_clip": 0.01636451, "auxiliary_loss_mlp": 0.01053294, "balance_loss_clip": 1.40133142, "balance_loss_mlp": 1.02606666, "epoch": 0.17087028408236885, "flos": 24947283191040.0, "grad_norm": 1.7500509155725317, "language_loss": 0.77878666, "learning_rate": 3.7954181555471443e-06, "loss": 0.80568409, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.2722168, "step": 2842, "time_per_iteration": 2.8538520336151123 }, { "auxiliary_loss_clip": 0.01618756, "auxiliary_loss_mlp": 0.01053739, "balance_loss_clip": 1.39110851, "balance_loss_mlp": 1.02489042, "epoch": 0.17093040733503684, "flos": 19066001679360.0, "grad_norm": 1.9276078649091348, "language_loss": 0.86907762, "learning_rate": 3.795246529087043e-06, "loss": 0.89580262, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.28833008, "step": 2843, "time_per_iteration": 2.868248462677002 }, { "auxiliary_loss_clip": 0.0163288, "auxiliary_loss_mlp": 0.0105591, "balance_loss_clip": 1.4012152, "balance_loss_mlp": 1.0295763, "epoch": 0.1709905305877048, "flos": 13086773879040.0, "grad_norm": 2.8605043197785722, "language_loss": 0.70111001, "learning_rate": 3.7950748345510126e-06, "loss": 0.7279979, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26330566, "step": 2844, "time_per_iteration": 2.8288321495056152 }, { "auxiliary_loss_clip": 0.01636478, "auxiliary_loss_mlp": 0.01054393, "balance_loss_clip": 1.40265238, "balance_loss_mlp": 1.02680826, "epoch": 0.17105065384037277, "flos": 19218947662080.0, "grad_norm": 2.184491945257295, "language_loss": 0.79141927, "learning_rate": 3.7949030719455646e-06, "loss": 0.81832796, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27600098, "step": 2845, "time_per_iteration": 2.8958675861358643 }, { "auxiliary_loss_clip": 0.01639165, "auxiliary_loss_mlp": 0.01050365, "balance_loss_clip": 1.40375876, "balance_loss_mlp": 1.02274394, "epoch": 0.17111077709304073, "flos": 18524374450560.0, "grad_norm": 2.177958445239607, "language_loss": 0.79524291, "learning_rate": 3.7947312412772127e-06, "loss": 0.82213825, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.27587891, "step": 2846, "time_per_iteration": 4.243098735809326 }, { "auxiliary_loss_clip": 0.01624325, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.39539266, "balance_loss_mlp": 1.02741957, "epoch": 0.1711709003457087, "flos": 25093578188160.0, "grad_norm": 1.76729089380069, "language_loss": 0.80911696, "learning_rate": 3.794559342552472e-06, "loss": 0.83591342, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.27856445, "step": 2847, "time_per_iteration": 2.876802921295166 }, { "auxiliary_loss_clip": 0.01639016, "auxiliary_loss_mlp": 0.01052588, "balance_loss_clip": 1.40262842, "balance_loss_mlp": 1.02526522, "epoch": 0.17123102359837666, "flos": 17575289383680.0, "grad_norm": 2.4383593959308842, "language_loss": 0.88338697, "learning_rate": 3.7943873757778614e-06, "loss": 0.910303, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.2734375, "step": 2848, "time_per_iteration": 4.2619102001190186 }, { "auxiliary_loss_clip": 0.0162222, "auxiliary_loss_mlp": 0.01047007, "balance_loss_clip": 1.3883481, "balance_loss_mlp": 1.01846826, "epoch": 0.17129114685104463, "flos": 26183890834560.0, "grad_norm": 1.810445930291935, "language_loss": 0.75843549, "learning_rate": 3.794215340959902e-06, "loss": 0.78512782, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28540039, "step": 2849, "time_per_iteration": 4.429996490478516 }, { "auxiliary_loss_clip": 0.01336644, "auxiliary_loss_mlp": 0.01024633, "balance_loss_clip": 1.20783257, "balance_loss_mlp": 1.01004148, "epoch": 0.17135127010371262, "flos": 69302275925760.0, "grad_norm": 0.7985522339736943, "language_loss": 0.57518601, "learning_rate": 3.7940432381051163e-06, "loss": 0.59879887, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 1.2890625, "router_z_loss_mlp": 0.14550781, "step": 2850, "time_per_iteration": 3.3634066581726074 }, { "auxiliary_loss_clip": 0.01618069, "auxiliary_loss_mlp": 0.01049314, "balance_loss_clip": 1.39044285, "balance_loss_mlp": 1.0233978, "epoch": 0.1714113933563806, "flos": 23560579987200.0, "grad_norm": 2.1162594894939364, "language_loss": 0.81841159, "learning_rate": 3.793871067220031e-06, "loss": 0.84508538, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25927734, "step": 2851, "time_per_iteration": 2.8618435859680176 }, { "auxiliary_loss_clip": 0.01619241, "auxiliary_loss_mlp": 0.01044214, "balance_loss_clip": 1.39160442, "balance_loss_mlp": 1.0185957, "epoch": 0.17147151660904855, "flos": 21152390757120.0, "grad_norm": 1.732074913186605, "language_loss": 0.94374502, "learning_rate": 3.7936988283111764e-06, "loss": 0.97037959, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25610352, "step": 2852, "time_per_iteration": 2.8610286712646484 }, { "auxiliary_loss_clip": 0.01646739, "auxiliary_loss_mlp": 0.01060638, "balance_loss_clip": 1.40904355, "balance_loss_mlp": 1.03283858, "epoch": 0.17153163986171652, "flos": 18634174876800.0, "grad_norm": 1.8005997772224238, "language_loss": 0.70013833, "learning_rate": 3.7935265213850817e-06, "loss": 0.72721213, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.2779541, "step": 2853, "time_per_iteration": 2.9042739868164062 }, { "auxiliary_loss_clip": 0.0163249, "auxiliary_loss_mlp": 0.01053343, "balance_loss_clip": 1.39740908, "balance_loss_mlp": 1.02587724, "epoch": 0.17159176311438448, "flos": 18232372638720.0, "grad_norm": 1.9651989038372577, "language_loss": 0.67544657, "learning_rate": 3.7933541464482815e-06, "loss": 0.7023049, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.2746582, "step": 2854, "time_per_iteration": 2.813811779022217 }, { "auxiliary_loss_clip": 0.01607327, "auxiliary_loss_mlp": 0.01054934, "balance_loss_clip": 1.37902641, "balance_loss_mlp": 1.02833879, "epoch": 0.17165188636705245, "flos": 20748281034240.0, "grad_norm": 1.4999436386006164, "language_loss": 0.89763689, "learning_rate": 3.7931817035073124e-06, "loss": 0.92425954, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26599121, "step": 2855, "time_per_iteration": 2.8968088626861572 }, { "auxiliary_loss_clip": 0.01621035, "auxiliary_loss_mlp": 0.01048449, "balance_loss_clip": 1.39150953, "balance_loss_mlp": 1.02199674, "epoch": 0.17171200961972044, "flos": 24910652885760.0, "grad_norm": 3.2177419471930815, "language_loss": 0.84161317, "learning_rate": 3.7930091925687134e-06, "loss": 0.86830795, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26452637, "step": 2856, "time_per_iteration": 2.875809907913208 }, { "auxiliary_loss_clip": 0.01631782, "auxiliary_loss_mlp": 0.01051543, "balance_loss_clip": 1.39961922, "balance_loss_mlp": 1.02365971, "epoch": 0.1717721328723884, "flos": 20166901608960.0, "grad_norm": 1.8378713394671833, "language_loss": 0.87111503, "learning_rate": 3.792836613639026e-06, "loss": 0.89794827, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.27868652, "step": 2857, "time_per_iteration": 2.8666584491729736 }, { "auxiliary_loss_clip": 0.01636696, "auxiliary_loss_mlp": 0.01056204, "balance_loss_clip": 1.40285432, "balance_loss_mlp": 1.02804708, "epoch": 0.17183225612505637, "flos": 23370506006400.0, "grad_norm": 1.9723516071370553, "language_loss": 0.78922021, "learning_rate": 3.7926639667247947e-06, "loss": 0.81614912, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 2.34375, "router_z_loss_mlp": 0.28198242, "step": 2858, "time_per_iteration": 2.875959873199463 }, { "auxiliary_loss_clip": 0.01647275, "auxiliary_loss_mlp": 0.01059591, "balance_loss_clip": 1.40510428, "balance_loss_mlp": 1.03043246, "epoch": 0.17189237937772434, "flos": 18123431863680.0, "grad_norm": 1.9171488850389518, "language_loss": 0.77805775, "learning_rate": 3.7924912518325663e-06, "loss": 0.80512643, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 2.42382812, "router_z_loss_mlp": 0.29150391, "step": 2859, "time_per_iteration": 2.8499233722686768 }, { "auxiliary_loss_clip": 0.01622195, "auxiliary_loss_mlp": 0.0105319, "balance_loss_clip": 1.3912468, "balance_loss_mlp": 1.02671373, "epoch": 0.1719525026303923, "flos": 23268849644160.0, "grad_norm": 1.8187726868356588, "language_loss": 0.7751565, "learning_rate": 3.7923184689688902e-06, "loss": 0.8019104, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.26477051, "step": 2860, "time_per_iteration": 2.903247117996216 }, { "auxiliary_loss_clip": 0.01633084, "auxiliary_loss_mlp": 0.01052044, "balance_loss_clip": 1.39671302, "balance_loss_mlp": 1.02530503, "epoch": 0.17201262588306027, "flos": 20819324649600.0, "grad_norm": 2.1701632797666126, "language_loss": 0.82243323, "learning_rate": 3.792145618140317e-06, "loss": 0.84928453, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.2677002, "step": 2861, "time_per_iteration": 2.834003210067749 }, { "auxiliary_loss_clip": 0.01634026, "auxiliary_loss_mlp": 0.01053526, "balance_loss_clip": 1.39978838, "balance_loss_mlp": 1.02728772, "epoch": 0.17207274913572823, "flos": 20385507075840.0, "grad_norm": 2.0616255801719494, "language_loss": 0.87142777, "learning_rate": 3.7919726993534038e-06, "loss": 0.89830327, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.26269531, "step": 2862, "time_per_iteration": 2.8595054149627686 }, { "auxiliary_loss_clip": 0.01617773, "auxiliary_loss_mlp": 0.01049987, "balance_loss_clip": 1.38941741, "balance_loss_mlp": 1.0230813, "epoch": 0.17213287238839622, "flos": 26809139733120.0, "grad_norm": 1.8477965123029554, "language_loss": 0.7916314, "learning_rate": 3.7917997126147054e-06, "loss": 0.81830895, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.26916504, "step": 2863, "time_per_iteration": 2.8680238723754883 }, { "auxiliary_loss_clip": 0.01619498, "auxiliary_loss_mlp": 0.01046168, "balance_loss_clip": 1.39095819, "balance_loss_mlp": 1.01840448, "epoch": 0.1721929956410642, "flos": 26041848848640.0, "grad_norm": 2.1053294688458384, "language_loss": 0.73797679, "learning_rate": 3.7916266579307823e-06, "loss": 0.76463342, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27758789, "step": 2864, "time_per_iteration": 2.9081218242645264 }, { "auxiliary_loss_clip": 0.01649655, "auxiliary_loss_mlp": 0.01049555, "balance_loss_clip": 1.41287899, "balance_loss_mlp": 1.02156425, "epoch": 0.17225311889373215, "flos": 22283224761600.0, "grad_norm": 1.754793282992755, "language_loss": 0.73624116, "learning_rate": 3.7914535353081973e-06, "loss": 0.7632333, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.28027344, "step": 2865, "time_per_iteration": 2.842339277267456 }, { "auxiliary_loss_clip": 0.01633465, "auxiliary_loss_mlp": 0.0104925, "balance_loss_clip": 1.40062165, "balance_loss_mlp": 1.0214262, "epoch": 0.17231324214640012, "flos": 21297599879040.0, "grad_norm": 2.250846319648036, "language_loss": 0.80163974, "learning_rate": 3.7912803447535145e-06, "loss": 0.82846689, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27783203, "step": 2866, "time_per_iteration": 2.8709352016448975 }, { "auxiliary_loss_clip": 0.01627577, "auxiliary_loss_mlp": 0.01046946, "balance_loss_clip": 1.3927834, "balance_loss_mlp": 1.01801372, "epoch": 0.17237336539906808, "flos": 19689712254720.0, "grad_norm": 1.6613665084014997, "language_loss": 0.80434775, "learning_rate": 3.7911070862733016e-06, "loss": 0.83109295, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28942871, "step": 2867, "time_per_iteration": 2.8786492347717285 }, { "auxiliary_loss_clip": 0.01627817, "auxiliary_loss_mlp": 0.01053462, "balance_loss_clip": 1.39518309, "balance_loss_mlp": 1.02470875, "epoch": 0.17243348865173605, "flos": 17538478099200.0, "grad_norm": 1.9220886547893297, "language_loss": 0.80312109, "learning_rate": 3.7909337598741276e-06, "loss": 0.82993388, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.28771973, "step": 2868, "time_per_iteration": 2.807342767715454 }, { "auxiliary_loss_clip": 0.01641906, "auxiliary_loss_mlp": 0.01048285, "balance_loss_clip": 1.40398574, "balance_loss_mlp": 1.01977038, "epoch": 0.17249361190440402, "flos": 18269002944000.0, "grad_norm": 2.5580785067046277, "language_loss": 0.84989595, "learning_rate": 3.7907603655625674e-06, "loss": 0.87679785, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 2.37695312, "router_z_loss_mlp": 0.28540039, "step": 2869, "time_per_iteration": 2.8509926795959473 }, { "auxiliary_loss_clip": 0.01633436, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.39994264, "balance_loss_mlp": 1.02221465, "epoch": 0.172553735157072, "flos": 21183139238400.0, "grad_norm": 1.817340716412683, "language_loss": 0.78106964, "learning_rate": 3.7905869033451932e-06, "loss": 0.80792129, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.29541016, "step": 2870, "time_per_iteration": 2.8495724201202393 }, { "auxiliary_loss_clip": 0.01619664, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.39453387, "balance_loss_mlp": 1.01152956, "epoch": 0.17261385840973997, "flos": 22283450985600.0, "grad_norm": 1.714521544714359, "language_loss": 0.78355873, "learning_rate": 3.7904133732285857e-06, "loss": 0.81012428, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.25378418, "step": 2871, "time_per_iteration": 2.9107699394226074 }, { "auxiliary_loss_clip": 0.01640803, "auxiliary_loss_mlp": 0.01056483, "balance_loss_clip": 1.40466702, "balance_loss_mlp": 1.02858853, "epoch": 0.17267398166240794, "flos": 27932236876800.0, "grad_norm": 2.110833919427104, "language_loss": 0.75499499, "learning_rate": 3.7902397752193228e-06, "loss": 0.78196782, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.27905273, "step": 2872, "time_per_iteration": 2.9394659996032715 }, { "auxiliary_loss_clip": 0.01625771, "auxiliary_loss_mlp": 0.01050417, "balance_loss_clip": 1.39689016, "balance_loss_mlp": 1.02298737, "epoch": 0.1727341049150759, "flos": 21955271316480.0, "grad_norm": 1.6396490380994089, "language_loss": 0.83021456, "learning_rate": 3.790066109323988e-06, "loss": 0.85697645, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27429199, "step": 2873, "time_per_iteration": 2.8937807083129883 }, { "auxiliary_loss_clip": 0.01627455, "auxiliary_loss_mlp": 0.01049929, "balance_loss_clip": 1.3965919, "balance_loss_mlp": 1.02066326, "epoch": 0.17279422816774387, "flos": 18115423534080.0, "grad_norm": 2.01797401827311, "language_loss": 0.75674272, "learning_rate": 3.7898923755491678e-06, "loss": 0.78351653, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.29284668, "step": 2874, "time_per_iteration": 4.301179647445679 }, { "auxiliary_loss_clip": 0.01636421, "auxiliary_loss_mlp": 0.010555, "balance_loss_clip": 1.40091753, "balance_loss_mlp": 1.02722394, "epoch": 0.17285435142041183, "flos": 21845516135040.0, "grad_norm": 2.1079780662817367, "language_loss": 0.81981921, "learning_rate": 3.7897185739014487e-06, "loss": 0.8467384, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 2.35742188, "router_z_loss_mlp": 0.28271484, "step": 2875, "time_per_iteration": 2.865319013595581 }, { "auxiliary_loss_clip": 0.01639776, "auxiliary_loss_mlp": 0.01056509, "balance_loss_clip": 1.40273213, "balance_loss_mlp": 1.02866232, "epoch": 0.17291447467307983, "flos": 18377400781440.0, "grad_norm": 2.317335940964352, "language_loss": 0.88980877, "learning_rate": 3.7895447043874217e-06, "loss": 0.91677165, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.27856445, "step": 2876, "time_per_iteration": 2.820697546005249 }, { "auxiliary_loss_clip": 0.01641822, "auxiliary_loss_mlp": 0.01052387, "balance_loss_clip": 1.40844154, "balance_loss_mlp": 1.02468276, "epoch": 0.1729745979257478, "flos": 18633858163200.0, "grad_norm": 2.042506142251443, "language_loss": 0.86009514, "learning_rate": 3.789370767013681e-06, "loss": 0.88703716, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.27697754, "step": 2877, "time_per_iteration": 2.8731465339660645 }, { "auxiliary_loss_clip": 0.01643229, "auxiliary_loss_mlp": 0.01053168, "balance_loss_clip": 1.4077388, "balance_loss_mlp": 1.02644074, "epoch": 0.17303472117841576, "flos": 23007370089600.0, "grad_norm": 2.585433965030499, "language_loss": 0.80597019, "learning_rate": 3.7891967617868204e-06, "loss": 0.8329342, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.26733398, "step": 2878, "time_per_iteration": 2.8330061435699463 }, { "auxiliary_loss_clip": 0.01641771, "auxiliary_loss_mlp": 0.01056844, "balance_loss_clip": 1.40820384, "balance_loss_mlp": 1.03016472, "epoch": 0.17309484443108372, "flos": 25674912368640.0, "grad_norm": 1.5165889813248241, "language_loss": 0.71685362, "learning_rate": 3.78902268871344e-06, "loss": 0.7438398, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.26660156, "step": 2879, "time_per_iteration": 2.906954050064087 }, { "auxiliary_loss_clip": 0.01633527, "auxiliary_loss_mlp": 0.01054234, "balance_loss_clip": 1.39746094, "balance_loss_mlp": 1.02567124, "epoch": 0.1731549676837517, "flos": 13560750852480.0, "grad_norm": 2.075902235316382, "language_loss": 0.84916943, "learning_rate": 3.78884854780014e-06, "loss": 0.87604707, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 2.359375, "router_z_loss_mlp": 0.28588867, "step": 2880, "time_per_iteration": 2.7952699661254883 }, { "auxiliary_loss_clip": 0.01650575, "auxiliary_loss_mlp": 0.01051124, "balance_loss_clip": 1.4116025, "balance_loss_mlp": 1.02381301, "epoch": 0.17321509093641965, "flos": 22867409364480.0, "grad_norm": 2.0587562863674034, "language_loss": 0.82571435, "learning_rate": 3.7886743390535236e-06, "loss": 0.85273135, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 2.39257812, "router_z_loss_mlp": 0.27355957, "step": 2881, "time_per_iteration": 4.250097036361694 }, { "auxiliary_loss_clip": 0.01631244, "auxiliary_loss_mlp": 0.01049813, "balance_loss_clip": 1.39733875, "balance_loss_mlp": 1.02471972, "epoch": 0.17327521418908762, "flos": 24363460546560.0, "grad_norm": 1.7693101890567067, "language_loss": 0.77868998, "learning_rate": 3.788500062480197e-06, "loss": 0.80550063, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.25097656, "step": 2882, "time_per_iteration": 2.892826795578003 }, { "auxiliary_loss_clip": 0.01636301, "auxiliary_loss_mlp": 0.01054038, "balance_loss_clip": 1.40472627, "balance_loss_mlp": 1.02726388, "epoch": 0.1733353374417556, "flos": 33117633077760.0, "grad_norm": 1.9491577240449371, "language_loss": 0.77310652, "learning_rate": 3.788325718086769e-06, "loss": 0.80000985, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.2677002, "step": 2883, "time_per_iteration": 4.383300065994263 }, { "auxiliary_loss_clip": 0.01625288, "auxiliary_loss_mlp": 0.01052051, "balance_loss_clip": 1.3917774, "balance_loss_mlp": 1.02434695, "epoch": 0.17339546069442358, "flos": 24399547914240.0, "grad_norm": 2.015922795578572, "language_loss": 0.86269844, "learning_rate": 3.7881513058798503e-06, "loss": 0.88947183, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.27709961, "step": 2884, "time_per_iteration": 4.3652215003967285 }, { "auxiliary_loss_clip": 0.01629365, "auxiliary_loss_mlp": 0.01048082, "balance_loss_clip": 1.39744365, "balance_loss_mlp": 1.022452, "epoch": 0.17345558394709154, "flos": 27465680050560.0, "grad_norm": 1.4877797200063771, "language_loss": 0.75731319, "learning_rate": 3.787976825866055e-06, "loss": 0.78408766, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.25646973, "step": 2885, "time_per_iteration": 2.9380688667297363 }, { "auxiliary_loss_clip": 0.01630786, "auxiliary_loss_mlp": 0.01047394, "balance_loss_clip": 1.40302038, "balance_loss_mlp": 1.02199006, "epoch": 0.1735157071997595, "flos": 24693178538880.0, "grad_norm": 1.4327145704245927, "language_loss": 0.7188493, "learning_rate": 3.7878022780519998e-06, "loss": 0.7456311, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.25427246, "step": 2886, "time_per_iteration": 2.932492971420288 }, { "auxiliary_loss_clip": 0.01628649, "auxiliary_loss_mlp": 0.01056137, "balance_loss_clip": 1.39567304, "balance_loss_mlp": 1.0283618, "epoch": 0.17357583045242747, "flos": 21698813934720.0, "grad_norm": 2.1183313962572847, "language_loss": 0.70203257, "learning_rate": 3.7876276624443024e-06, "loss": 0.72888041, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 2.33007812, "router_z_loss_mlp": 0.27783203, "step": 2887, "time_per_iteration": 2.8332271575927734 }, { "auxiliary_loss_clip": 0.01633627, "auxiliary_loss_mlp": 0.01053127, "balance_loss_clip": 1.40089309, "balance_loss_mlp": 1.02712786, "epoch": 0.17363595370509544, "flos": 15383669604480.0, "grad_norm": 1.6030740472724412, "language_loss": 0.85973579, "learning_rate": 3.787452979049585e-06, "loss": 0.88660336, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26013184, "step": 2888, "time_per_iteration": 2.8336009979248047 }, { "auxiliary_loss_clip": 0.01649746, "auxiliary_loss_mlp": 0.01054382, "balance_loss_clip": 1.41486764, "balance_loss_mlp": 1.02527082, "epoch": 0.1736960769577634, "flos": 23451458232960.0, "grad_norm": 2.203216711235482, "language_loss": 0.80506301, "learning_rate": 3.7872782278744718e-06, "loss": 0.83210427, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 2.34570312, "router_z_loss_mlp": 0.29101562, "step": 2889, "time_per_iteration": 2.886361837387085 }, { "auxiliary_loss_clip": 0.01625194, "auxiliary_loss_mlp": 0.01054048, "balance_loss_clip": 1.39716625, "balance_loss_mlp": 1.02667785, "epoch": 0.1737562002104314, "flos": 18596956389120.0, "grad_norm": 2.650369707809917, "language_loss": 0.85291713, "learning_rate": 3.7871034089255883e-06, "loss": 0.8797096, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.27392578, "step": 2890, "time_per_iteration": 2.8939504623413086 }, { "auxiliary_loss_clip": 0.01650691, "auxiliary_loss_mlp": 0.01050952, "balance_loss_clip": 1.41426754, "balance_loss_mlp": 1.02343893, "epoch": 0.17381632346309936, "flos": 16006701507840.0, "grad_norm": 2.6683810226414804, "language_loss": 0.83437383, "learning_rate": 3.7869285222095653e-06, "loss": 0.86139023, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.27539062, "step": 2891, "time_per_iteration": 2.871232032775879 }, { "auxiliary_loss_clip": 0.01659437, "auxiliary_loss_mlp": 0.01052409, "balance_loss_clip": 1.418648, "balance_loss_mlp": 1.02387071, "epoch": 0.17387644671576732, "flos": 13377463591680.0, "grad_norm": 2.157608811100299, "language_loss": 0.82369673, "learning_rate": 3.7867535677330334e-06, "loss": 0.85081518, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.28540039, "step": 2892, "time_per_iteration": 2.776482582092285 }, { "auxiliary_loss_clip": 0.01660172, "auxiliary_loss_mlp": 0.01058968, "balance_loss_clip": 1.42324495, "balance_loss_mlp": 1.03076267, "epoch": 0.1739365699684353, "flos": 26626938347520.0, "grad_norm": 1.6740093401938398, "language_loss": 0.75374609, "learning_rate": 3.786578545502627e-06, "loss": 0.78093755, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.28173828, "step": 2893, "time_per_iteration": 2.9211268424987793 }, { "auxiliary_loss_clip": 0.0164104, "auxiliary_loss_mlp": 0.01048002, "balance_loss_clip": 1.40517497, "balance_loss_mlp": 1.02102542, "epoch": 0.17399669322110325, "flos": 23378469091200.0, "grad_norm": 1.8835910226118764, "language_loss": 0.82699746, "learning_rate": 3.7864034555249828e-06, "loss": 0.85388792, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 2.36328125, "router_z_loss_mlp": 0.26928711, "step": 2894, "time_per_iteration": 2.839189291000366 }, { "auxiliary_loss_clip": 0.01648383, "auxiliary_loss_mlp": 0.01051619, "balance_loss_clip": 1.41346383, "balance_loss_mlp": 1.02117324, "epoch": 0.17405681647377122, "flos": 22064166846720.0, "grad_norm": 1.8884708948098978, "language_loss": 0.75275254, "learning_rate": 3.786228297806741e-06, "loss": 0.77975255, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.30444336, "step": 2895, "time_per_iteration": 2.8679981231689453 }, { "auxiliary_loss_clip": 0.0136592, "auxiliary_loss_mlp": 0.01036913, "balance_loss_clip": 1.23162127, "balance_loss_mlp": 1.0207963, "epoch": 0.1741169397264392, "flos": 61487008381440.0, "grad_norm": 0.877650649642171, "language_loss": 0.62851411, "learning_rate": 3.7860530723545435e-06, "loss": 0.65254241, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.16113281, "step": 2896, "time_per_iteration": 3.499652147293091 }, { "auxiliary_loss_clip": 0.01640483, "auxiliary_loss_mlp": 0.01050256, "balance_loss_clip": 1.40570939, "balance_loss_mlp": 1.02129984, "epoch": 0.17417706297910718, "flos": 27029419257600.0, "grad_norm": 1.6974573428065702, "language_loss": 0.76829791, "learning_rate": 3.785877779175034e-06, "loss": 0.7952053, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.28967285, "step": 2897, "time_per_iteration": 2.9249603748321533 }, { "auxiliary_loss_clip": 0.01624301, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.39589, "balance_loss_mlp": 1.01615369, "epoch": 0.17423718623177514, "flos": 33521426087040.0, "grad_norm": 1.6961086473796378, "language_loss": 0.69573861, "learning_rate": 3.7857024182748606e-06, "loss": 0.72241998, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27697754, "step": 2898, "time_per_iteration": 2.999910354614258 }, { "auxiliary_loss_clip": 0.01656325, "auxiliary_loss_mlp": 0.01048403, "balance_loss_clip": 1.41804075, "balance_loss_mlp": 1.02067542, "epoch": 0.1742973094844431, "flos": 27210037075200.0, "grad_norm": 2.342783761968962, "language_loss": 0.77579415, "learning_rate": 3.7855269896606717e-06, "loss": 0.80284148, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 2.38085938, "router_z_loss_mlp": 0.27746582, "step": 2899, "time_per_iteration": 2.897024393081665 }, { "auxiliary_loss_clip": 0.01621999, "auxiliary_loss_mlp": 0.01044264, "balance_loss_clip": 1.39324284, "balance_loss_mlp": 1.01648855, "epoch": 0.17435743273711107, "flos": 22720933388160.0, "grad_norm": 1.8292987891557757, "language_loss": 0.73842347, "learning_rate": 3.785351493339121e-06, "loss": 0.76508605, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.27746582, "step": 2900, "time_per_iteration": 2.9009578227996826 }, { "auxiliary_loss_clip": 0.01639036, "auxiliary_loss_mlp": 0.01050714, "balance_loss_clip": 1.40720308, "balance_loss_mlp": 1.02224684, "epoch": 0.17441755598977904, "flos": 41661479940480.0, "grad_norm": 1.4913788490848425, "language_loss": 0.70868152, "learning_rate": 3.785175929316863e-06, "loss": 0.73557901, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28466797, "step": 2901, "time_per_iteration": 3.035282850265503 }, { "auxiliary_loss_clip": 0.01652639, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.41750813, "balance_loss_mlp": 1.01467037, "epoch": 0.174477679242447, "flos": 26298396720000.0, "grad_norm": 1.6980890118681062, "language_loss": 0.76763457, "learning_rate": 3.7850002976005543e-06, "loss": 0.79458761, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.27978516, "step": 2902, "time_per_iteration": 2.892570734024048 }, { "auxiliary_loss_clip": 0.01643983, "auxiliary_loss_mlp": 0.01050635, "balance_loss_clip": 1.41057622, "balance_loss_mlp": 1.02228713, "epoch": 0.174537802495115, "flos": 17867426929920.0, "grad_norm": 4.941679178011067, "language_loss": 0.82878745, "learning_rate": 3.7848245981968558e-06, "loss": 0.85573363, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.28344727, "step": 2903, "time_per_iteration": 2.89803409576416 }, { "auxiliary_loss_clip": 0.01628102, "auxiliary_loss_mlp": 0.01046342, "balance_loss_clip": 1.39958954, "balance_loss_mlp": 1.01959109, "epoch": 0.17459792574778296, "flos": 16948049713920.0, "grad_norm": 2.0291559735126308, "language_loss": 0.74938911, "learning_rate": 3.784648831112429e-06, "loss": 0.77613354, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.2677002, "step": 2904, "time_per_iteration": 2.804347515106201 }, { "auxiliary_loss_clip": 0.01627131, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.39500189, "balance_loss_mlp": 1.02063894, "epoch": 0.17465804900045093, "flos": 25530924856320.0, "grad_norm": 1.7724340846560571, "language_loss": 0.65704334, "learning_rate": 3.7844729963539406e-06, "loss": 0.68378103, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26013184, "step": 2905, "time_per_iteration": 2.922879219055176 }, { "auxiliary_loss_clip": 0.01657209, "auxiliary_loss_mlp": 0.01052628, "balance_loss_clip": 1.41621935, "balance_loss_mlp": 1.02296829, "epoch": 0.1747181722531189, "flos": 24139244724480.0, "grad_norm": 1.8968010165143598, "language_loss": 0.80352318, "learning_rate": 3.7842970939280566e-06, "loss": 0.83062154, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 2.40625, "router_z_loss_mlp": 0.29663086, "step": 2906, "time_per_iteration": 2.8492133617401123 }, { "auxiliary_loss_clip": 0.01641726, "auxiliary_loss_mlp": 0.01061291, "balance_loss_clip": 1.40648675, "balance_loss_mlp": 1.03352702, "epoch": 0.17477829550578686, "flos": 17757762238080.0, "grad_norm": 1.8111513567333353, "language_loss": 0.82217312, "learning_rate": 3.784121123841449e-06, "loss": 0.84920329, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.27770996, "step": 2907, "time_per_iteration": 2.8808391094207764 }, { "auxiliary_loss_clip": 0.01636596, "auxiliary_loss_mlp": 0.01052954, "balance_loss_clip": 1.40230727, "balance_loss_mlp": 1.02584541, "epoch": 0.17483841875845482, "flos": 15385705620480.0, "grad_norm": 2.006332569175584, "language_loss": 0.83116955, "learning_rate": 3.7839450861007886e-06, "loss": 0.85806501, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.27124023, "step": 2908, "time_per_iteration": 2.8707082271575928 }, { "auxiliary_loss_clip": 0.0162986, "auxiliary_loss_mlp": 0.01059422, "balance_loss_clip": 1.39904976, "balance_loss_mlp": 1.03119349, "epoch": 0.17489854201112282, "flos": 17171586864000.0, "grad_norm": 2.318113387163912, "language_loss": 0.81748843, "learning_rate": 3.7837689807127518e-06, "loss": 0.84438127, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28186035, "step": 2909, "time_per_iteration": 4.289958715438843 }, { "auxiliary_loss_clip": 0.01642522, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.40884233, "balance_loss_mlp": 1.02910447, "epoch": 0.17495866526379078, "flos": 19764692167680.0, "grad_norm": 1.7836296017424638, "language_loss": 0.77106416, "learning_rate": 3.783592807684017e-06, "loss": 0.79805148, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 2.3359375, "router_z_loss_mlp": 0.27124023, "step": 2910, "time_per_iteration": 2.894209384918213 }, { "auxiliary_loss_clip": 0.01631241, "auxiliary_loss_mlp": 0.01051031, "balance_loss_clip": 1.40010786, "balance_loss_mlp": 1.02305245, "epoch": 0.17501878851645875, "flos": 28522620017280.0, "grad_norm": 10.751554962096531, "language_loss": 0.87556946, "learning_rate": 3.7834165670212645e-06, "loss": 0.90239215, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 0.27990723, "step": 2911, "time_per_iteration": 2.943964719772339 }, { "auxiliary_loss_clip": 0.01645683, "auxiliary_loss_mlp": 0.01049063, "balance_loss_clip": 1.41398501, "balance_loss_mlp": 1.02281296, "epoch": 0.1750789117691267, "flos": 17940189847680.0, "grad_norm": 2.0176788172800153, "language_loss": 0.90612984, "learning_rate": 3.7832402587311764e-06, "loss": 0.93307739, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 2.31640625, "router_z_loss_mlp": 0.26269531, "step": 2912, "time_per_iteration": 2.7971255779266357 }, { "auxiliary_loss_clip": 0.01656304, "auxiliary_loss_mlp": 0.01055331, "balance_loss_clip": 1.41879404, "balance_loss_mlp": 1.02848518, "epoch": 0.17513903502179468, "flos": 18268460006400.0, "grad_norm": 1.821388732532384, "language_loss": 0.74081528, "learning_rate": 3.783063882820439e-06, "loss": 0.76793158, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.26879883, "step": 2913, "time_per_iteration": 2.8408894538879395 }, { "auxiliary_loss_clip": 0.01634102, "auxiliary_loss_mlp": 0.01050385, "balance_loss_clip": 1.40582705, "balance_loss_mlp": 1.02382517, "epoch": 0.17519915827446264, "flos": 20714591640960.0, "grad_norm": 1.8280092042543734, "language_loss": 0.70523477, "learning_rate": 3.782887439295741e-06, "loss": 0.73207963, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26550293, "step": 2914, "time_per_iteration": 2.816277265548706 }, { "auxiliary_loss_clip": 0.01630143, "auxiliary_loss_mlp": 0.01050798, "balance_loss_clip": 1.40147889, "balance_loss_mlp": 1.02450085, "epoch": 0.1752592815271306, "flos": 20533521375360.0, "grad_norm": 2.410958784187916, "language_loss": 0.94405955, "learning_rate": 3.782710928163772e-06, "loss": 0.97086906, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26306152, "step": 2915, "time_per_iteration": 2.861619472503662 }, { "auxiliary_loss_clip": 0.01621858, "auxiliary_loss_mlp": 0.01055466, "balance_loss_clip": 1.3949188, "balance_loss_mlp": 1.02823877, "epoch": 0.1753194047797986, "flos": 21809293032960.0, "grad_norm": 1.7615862567716367, "language_loss": 0.81756401, "learning_rate": 3.782534349431226e-06, "loss": 0.84433722, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27246094, "step": 2916, "time_per_iteration": 4.355432033538818 }, { "auxiliary_loss_clip": 0.01643309, "auxiliary_loss_mlp": 0.01054005, "balance_loss_clip": 1.41060579, "balance_loss_mlp": 1.0269326, "epoch": 0.17537952803246656, "flos": 20678232804480.0, "grad_norm": 1.6689210587084746, "language_loss": 0.74754357, "learning_rate": 3.782357703104799e-06, "loss": 0.7745167, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.27075195, "step": 2917, "time_per_iteration": 2.8803858757019043 }, { "auxiliary_loss_clip": 0.01630025, "auxiliary_loss_mlp": 0.01051694, "balance_loss_clip": 1.40310526, "balance_loss_mlp": 1.02527761, "epoch": 0.17543965128513453, "flos": 23305570439040.0, "grad_norm": 1.9342664611213547, "language_loss": 0.77496624, "learning_rate": 3.7821809891911897e-06, "loss": 0.80178344, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26403809, "step": 2918, "time_per_iteration": 4.300485849380493 }, { "auxiliary_loss_clip": 0.01661164, "auxiliary_loss_mlp": 0.01053977, "balance_loss_clip": 1.42300403, "balance_loss_mlp": 1.02643955, "epoch": 0.1754997745378025, "flos": 29107438047360.0, "grad_norm": 2.034844487956608, "language_loss": 0.75399876, "learning_rate": 3.782004207697098e-06, "loss": 0.78115022, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 2.3828125, "router_z_loss_mlp": 0.27563477, "step": 2919, "time_per_iteration": 4.387455940246582 }, { "auxiliary_loss_clip": 0.01643606, "auxiliary_loss_mlp": 0.01057846, "balance_loss_clip": 1.40769362, "balance_loss_mlp": 1.03153682, "epoch": 0.17555989779047046, "flos": 30383797887360.0, "grad_norm": 1.720157431518989, "language_loss": 0.75575703, "learning_rate": 3.781827358629228e-06, "loss": 0.78277159, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.26306152, "step": 2920, "time_per_iteration": 2.927471160888672 }, { "auxiliary_loss_clip": 0.01622832, "auxiliary_loss_mlp": 0.0104534, "balance_loss_clip": 1.39551866, "balance_loss_mlp": 1.02085447, "epoch": 0.17562002104313842, "flos": 23296340499840.0, "grad_norm": 2.0757910658212024, "language_loss": 0.80180454, "learning_rate": 3.7816504419942873e-06, "loss": 0.8284862, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.24462891, "step": 2921, "time_per_iteration": 2.810398817062378 }, { "auxiliary_loss_clip": 0.0165343, "auxiliary_loss_mlp": 0.01053292, "balance_loss_clip": 1.41668248, "balance_loss_mlp": 1.02632642, "epoch": 0.1756801442958064, "flos": 24801259662720.0, "grad_norm": 1.5969847538369375, "language_loss": 0.88529325, "learning_rate": 3.7814734577989823e-06, "loss": 0.91236043, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.26989746, "step": 2922, "time_per_iteration": 2.945934772491455 }, { "auxiliary_loss_clip": 0.01635525, "auxiliary_loss_mlp": 0.01052364, "balance_loss_clip": 1.40031171, "balance_loss_mlp": 1.02563739, "epoch": 0.17574026754847438, "flos": 25781636148480.0, "grad_norm": 3.8933677402424056, "language_loss": 0.64108264, "learning_rate": 3.7812964060500253e-06, "loss": 0.66796154, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 2.3515625, "router_z_loss_mlp": 0.26757812, "step": 2923, "time_per_iteration": 2.9647111892700195 }, { "auxiliary_loss_clip": 0.01641889, "auxiliary_loss_mlp": 0.01047811, "balance_loss_clip": 1.40977192, "balance_loss_mlp": 1.02111959, "epoch": 0.17580039080114235, "flos": 17465398467840.0, "grad_norm": 2.703590365731881, "language_loss": 0.82282996, "learning_rate": 3.78111928675413e-06, "loss": 0.84972697, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26696777, "step": 2924, "time_per_iteration": 2.8882668018341064 }, { "auxiliary_loss_clip": 0.01651215, "auxiliary_loss_mlp": 0.01057336, "balance_loss_clip": 1.41392851, "balance_loss_mlp": 1.02919078, "epoch": 0.1758605140538103, "flos": 14872383653760.0, "grad_norm": 1.8390265932495413, "language_loss": 0.72686321, "learning_rate": 3.7809420999180126e-06, "loss": 0.75394869, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 2.37109375, "router_z_loss_mlp": 0.28149414, "step": 2925, "time_per_iteration": 2.7982075214385986 }, { "auxiliary_loss_clip": 0.01623006, "auxiliary_loss_mlp": 0.01051523, "balance_loss_clip": 1.39615619, "balance_loss_mlp": 1.02327085, "epoch": 0.17592063730647828, "flos": 23014971216000.0, "grad_norm": 1.622814802581916, "language_loss": 0.71985406, "learning_rate": 3.7807648455483934e-06, "loss": 0.74659932, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.28271484, "step": 2926, "time_per_iteration": 2.894348621368408 }, { "auxiliary_loss_clip": 0.01653045, "auxiliary_loss_mlp": 0.01046597, "balance_loss_clip": 1.41603494, "balance_loss_mlp": 1.01860631, "epoch": 0.17598076055914624, "flos": 20751448170240.0, "grad_norm": 1.7417307172847563, "language_loss": 0.86313659, "learning_rate": 3.7805875236519918e-06, "loss": 0.89013302, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 2.36914062, "router_z_loss_mlp": 0.2800293, "step": 2927, "time_per_iteration": 2.902040958404541 }, { "auxiliary_loss_clip": 0.01623528, "auxiliary_loss_mlp": 0.01046521, "balance_loss_clip": 1.39614916, "balance_loss_mlp": 1.02087879, "epoch": 0.1760408838118142, "flos": 34105112997120.0, "grad_norm": 3.294574193743095, "language_loss": 0.72379196, "learning_rate": 3.7804101342355336e-06, "loss": 0.75049245, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.25634766, "step": 2928, "time_per_iteration": 2.943513870239258 }, { "auxiliary_loss_clip": 0.01619244, "auxiliary_loss_mlp": 0.01045458, "balance_loss_clip": 1.3934741, "balance_loss_mlp": 1.01945829, "epoch": 0.1761010070644822, "flos": 24178499228160.0, "grad_norm": 1.9034731352695664, "language_loss": 0.83693314, "learning_rate": 3.780232677305744e-06, "loss": 0.86358017, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26037598, "step": 2929, "time_per_iteration": 2.8696253299713135 }, { "auxiliary_loss_clip": 0.01630605, "auxiliary_loss_mlp": 0.01047196, "balance_loss_clip": 1.40099072, "balance_loss_mlp": 1.02151799, "epoch": 0.17616113031715017, "flos": 26587502864640.0, "grad_norm": 1.698358140683655, "language_loss": 0.80336797, "learning_rate": 3.7800551528693535e-06, "loss": 0.83014596, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.25695801, "step": 2930, "time_per_iteration": 2.8695695400238037 }, { "auxiliary_loss_clip": 0.01642028, "auxiliary_loss_mlp": 0.01046963, "balance_loss_clip": 1.40805709, "balance_loss_mlp": 1.01892519, "epoch": 0.17622125356981813, "flos": 25677808035840.0, "grad_norm": 2.0377069883495555, "language_loss": 0.78336084, "learning_rate": 3.7798775609330927e-06, "loss": 0.81025082, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28027344, "step": 2931, "time_per_iteration": 2.900754690170288 }, { "auxiliary_loss_clip": 0.01627901, "auxiliary_loss_mlp": 0.0104158, "balance_loss_clip": 1.3988477, "balance_loss_mlp": 1.01548505, "epoch": 0.1762813768224861, "flos": 16517218296960.0, "grad_norm": 2.694097764737487, "language_loss": 0.76317978, "learning_rate": 3.779699901503696e-06, "loss": 0.78987461, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.26098633, "step": 2932, "time_per_iteration": 2.802534580230713 }, { "auxiliary_loss_clip": 0.01642127, "auxiliary_loss_mlp": 0.01046134, "balance_loss_clip": 1.40504503, "balance_loss_mlp": 1.01962233, "epoch": 0.17634150007515406, "flos": 11217542434560.0, "grad_norm": 2.2659570715040056, "language_loss": 0.91414493, "learning_rate": 3.7795221745879016e-06, "loss": 0.94102752, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 2.375, "router_z_loss_mlp": 0.26550293, "step": 2933, "time_per_iteration": 2.8136985301971436 }, { "auxiliary_loss_clip": 0.01621127, "auxiliary_loss_mlp": 0.01051061, "balance_loss_clip": 1.39368033, "balance_loss_mlp": 1.0239526, "epoch": 0.17640162332782203, "flos": 23670516147840.0, "grad_norm": 1.6739750940612477, "language_loss": 0.88702762, "learning_rate": 3.779344380192448e-06, "loss": 0.91374946, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 2.2734375, "router_z_loss_mlp": 0.27124023, "step": 2934, "time_per_iteration": 2.8906068801879883 }, { "auxiliary_loss_clip": 0.01617, "auxiliary_loss_mlp": 0.01044954, "balance_loss_clip": 1.39147055, "balance_loss_mlp": 1.01988423, "epoch": 0.17646174658049, "flos": 53815348408320.0, "grad_norm": 1.5956990718187216, "language_loss": 0.71835947, "learning_rate": 3.779166518324077e-06, "loss": 0.74497896, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25036621, "step": 2935, "time_per_iteration": 3.1136581897735596 }, { "auxiliary_loss_clip": 0.01656298, "auxiliary_loss_mlp": 0.01043461, "balance_loss_clip": 1.41606569, "balance_loss_mlp": 1.0168891, "epoch": 0.17652186983315798, "flos": 24254655505920.0, "grad_norm": 1.9671984420325368, "language_loss": 0.71587646, "learning_rate": 3.7789885889895325e-06, "loss": 0.74287403, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 2.40234375, "router_z_loss_mlp": 0.26574707, "step": 2936, "time_per_iteration": 2.9083011150360107 }, { "auxiliary_loss_clip": 0.01635802, "auxiliary_loss_mlp": 0.0104069, "balance_loss_clip": 1.40728438, "balance_loss_mlp": 1.01568007, "epoch": 0.17658199308582595, "flos": 27465815784960.0, "grad_norm": 3.6092565786113657, "language_loss": 0.72567838, "learning_rate": 3.7788105921955634e-06, "loss": 0.75244331, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.25012207, "step": 2937, "time_per_iteration": 2.8812270164489746 }, { "auxiliary_loss_clip": 0.01645452, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.40912163, "balance_loss_mlp": 1.02016497, "epoch": 0.17664211633849392, "flos": 22428750597120.0, "grad_norm": 2.5949458988276564, "language_loss": 0.77297288, "learning_rate": 3.7786325279489184e-06, "loss": 0.79989225, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 2.36523438, "router_z_loss_mlp": 0.26342773, "step": 2938, "time_per_iteration": 2.865772247314453 }, { "auxiliary_loss_clip": 0.01644154, "auxiliary_loss_mlp": 0.01053537, "balance_loss_clip": 1.40884066, "balance_loss_mlp": 1.02576137, "epoch": 0.17670223959116188, "flos": 24725239119360.0, "grad_norm": 2.187387106995236, "language_loss": 0.71918607, "learning_rate": 3.7784543962563495e-06, "loss": 0.74616301, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 2.35351562, "router_z_loss_mlp": 0.27783203, "step": 2939, "time_per_iteration": 2.8809831142425537 }, { "auxiliary_loss_clip": 0.01633497, "auxiliary_loss_mlp": 0.01049019, "balance_loss_clip": 1.40299845, "balance_loss_mlp": 1.02291214, "epoch": 0.17676236284382985, "flos": 22536876965760.0, "grad_norm": 2.083775139836246, "language_loss": 0.74803144, "learning_rate": 3.7782761971246115e-06, "loss": 0.77485657, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.2611084, "step": 2940, "time_per_iteration": 2.8164851665496826 }, { "auxiliary_loss_clip": 0.01655813, "auxiliary_loss_mlp": 0.01050521, "balance_loss_clip": 1.42109883, "balance_loss_mlp": 1.02278125, "epoch": 0.1768224860964978, "flos": 12392653115520.0, "grad_norm": 2.4586285286176697, "language_loss": 0.87024164, "learning_rate": 3.7780979305604616e-06, "loss": 0.89730501, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27746582, "step": 2941, "time_per_iteration": 2.8304901123046875 }, { "auxiliary_loss_clip": 0.01648675, "auxiliary_loss_mlp": 0.01056187, "balance_loss_clip": 1.41319454, "balance_loss_mlp": 1.02882874, "epoch": 0.1768826093491658, "flos": 24364184463360.0, "grad_norm": 2.1935046670056697, "language_loss": 0.77445686, "learning_rate": 3.7779195965706607e-06, "loss": 0.80150551, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 2.35546875, "router_z_loss_mlp": 0.27380371, "step": 2942, "time_per_iteration": 2.919424057006836 }, { "auxiliary_loss_clip": 0.01651599, "auxiliary_loss_mlp": 0.0105045, "balance_loss_clip": 1.4149276, "balance_loss_mlp": 1.02362752, "epoch": 0.17694273260183377, "flos": 23597662740480.0, "grad_norm": 1.680600467525539, "language_loss": 0.80957931, "learning_rate": 3.77774119516197e-06, "loss": 0.83659983, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 2.3671875, "router_z_loss_mlp": 0.26855469, "step": 2943, "time_per_iteration": 2.9306061267852783 }, { "auxiliary_loss_clip": 0.01661323, "auxiliary_loss_mlp": 0.01051633, "balance_loss_clip": 1.42393816, "balance_loss_mlp": 1.02545428, "epoch": 0.17700285585450173, "flos": 26772328448640.0, "grad_norm": 2.825912607285867, "language_loss": 0.82061505, "learning_rate": 3.777562726341155e-06, "loss": 0.84774458, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 2.37304688, "router_z_loss_mlp": 0.26184082, "step": 2944, "time_per_iteration": 4.327883005142212 }, { "auxiliary_loss_clip": 0.01626638, "auxiliary_loss_mlp": 0.01049496, "balance_loss_clip": 1.39399886, "balance_loss_mlp": 1.02352071, "epoch": 0.1770629791071697, "flos": 42791318559360.0, "grad_norm": 1.766634963712301, "language_loss": 0.7472102, "learning_rate": 3.7773841901149835e-06, "loss": 0.77397156, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.26013184, "step": 2945, "time_per_iteration": 3.016019821166992 }, { "auxiliary_loss_clip": 0.01648341, "auxiliary_loss_mlp": 0.01046474, "balance_loss_clip": 1.41502905, "balance_loss_mlp": 1.02086735, "epoch": 0.17712310235983766, "flos": 17353923984000.0, "grad_norm": 2.2989242289044394, "language_loss": 0.7939626, "learning_rate": 3.7772055864902256e-06, "loss": 0.82091069, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 2.33203125, "router_z_loss_mlp": 0.25585938, "step": 2946, "time_per_iteration": 2.829660177230835 }, { "auxiliary_loss_clip": 0.0163548, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.40610456, "balance_loss_mlp": 1.02868652, "epoch": 0.17718322561250563, "flos": 23889031125120.0, "grad_norm": 1.842446015105371, "language_loss": 0.7738834, "learning_rate": 3.7770269154736535e-06, "loss": 0.80078304, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.25793457, "step": 2947, "time_per_iteration": 2.955284833908081 }, { "auxiliary_loss_clip": 0.01636186, "auxiliary_loss_mlp": 0.01048579, "balance_loss_clip": 1.40516829, "balance_loss_mlp": 1.0218519, "epoch": 0.1772433488651736, "flos": 36480246261120.0, "grad_norm": 1.9503009720327058, "language_loss": 0.73518908, "learning_rate": 3.7768481770720424e-06, "loss": 0.76203674, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.26721191, "step": 2948, "time_per_iteration": 2.9634721279144287 }, { "auxiliary_loss_clip": 0.01609223, "auxiliary_loss_mlp": 0.01048962, "balance_loss_clip": 1.38399613, "balance_loss_mlp": 1.02355814, "epoch": 0.1773034721178416, "flos": 26695176785280.0, "grad_norm": 2.0575593281937725, "language_loss": 0.82462519, "learning_rate": 3.776669371292171e-06, "loss": 0.85120702, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.25415039, "step": 2949, "time_per_iteration": 2.907304286956787 }, { "auxiliary_loss_clip": 0.01380706, "auxiliary_loss_mlp": 0.01095307, "balance_loss_clip": 1.24239326, "balance_loss_mlp": 1.07442188, "epoch": 0.17736359537050955, "flos": 57146235707520.0, "grad_norm": 0.7702825210729315, "language_loss": 0.65063787, "learning_rate": 3.7764904981408186e-06, "loss": 0.67539799, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 1.3828125, "router_z_loss_mlp": 0.20898438, "step": 2950, "time_per_iteration": 4.854967355728149 }, { "auxiliary_loss_clip": 0.01611571, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.38550007, "balance_loss_mlp": 1.02273524, "epoch": 0.17742371862317752, "flos": 27209494137600.0, "grad_norm": 3.1502187614615953, "language_loss": 0.85082304, "learning_rate": 3.7763115576247686e-06, "loss": 0.87740755, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.24133301, "step": 2951, "time_per_iteration": 2.8940465450286865 }, { "auxiliary_loss_clip": 0.01628466, "auxiliary_loss_mlp": 0.01055359, "balance_loss_clip": 1.39576638, "balance_loss_mlp": 1.03028917, "epoch": 0.17748384187584548, "flos": 20969510699520.0, "grad_norm": 2.408494408044741, "language_loss": 0.81750107, "learning_rate": 3.776132549750806e-06, "loss": 0.84433931, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.25073242, "step": 2952, "time_per_iteration": 2.8993639945983887 }, { "auxiliary_loss_clip": 0.01626639, "auxiliary_loss_mlp": 0.01049495, "balance_loss_clip": 1.39694929, "balance_loss_mlp": 1.02126646, "epoch": 0.17754396512851345, "flos": 25020951004800.0, "grad_norm": 1.7956034835201298, "language_loss": 0.80914211, "learning_rate": 3.7759534745257194e-06, "loss": 0.83590353, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.28222656, "step": 2953, "time_per_iteration": 4.326072454452515 }, { "auxiliary_loss_clip": 0.01629756, "auxiliary_loss_mlp": 0.01048982, "balance_loss_clip": 1.39822245, "balance_loss_mlp": 1.02394819, "epoch": 0.1776040883811814, "flos": 32064312695040.0, "grad_norm": 5.316177094787945, "language_loss": 0.89274025, "learning_rate": 3.7757743319562994e-06, "loss": 0.91952765, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.25061035, "step": 2954, "time_per_iteration": 4.37099289894104 }, { "auxiliary_loss_clip": 0.01629607, "auxiliary_loss_mlp": 0.01056179, "balance_loss_clip": 1.39975619, "balance_loss_mlp": 1.02997637, "epoch": 0.17766421163384938, "flos": 21582588746880.0, "grad_norm": 1.7321218794579412, "language_loss": 0.86359781, "learning_rate": 3.7755951220493386e-06, "loss": 0.89045566, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26208496, "step": 2955, "time_per_iteration": 2.8261091709136963 }, { "auxiliary_loss_clip": 0.01616259, "auxiliary_loss_mlp": 0.01046224, "balance_loss_clip": 1.38821959, "balance_loss_mlp": 1.02156007, "epoch": 0.17772433488651737, "flos": 22429565003520.0, "grad_norm": 1.6816381996325576, "language_loss": 0.72227752, "learning_rate": 3.7754158448116327e-06, "loss": 0.74890232, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.24682617, "step": 2956, "time_per_iteration": 2.9160516262054443 }, { "auxiliary_loss_clip": 0.01604729, "auxiliary_loss_mlp": 0.01050459, "balance_loss_clip": 1.38040805, "balance_loss_mlp": 1.02438831, "epoch": 0.17778445813918534, "flos": 25640363324160.0, "grad_norm": 1.8372343765698587, "language_loss": 0.83878374, "learning_rate": 3.7752365002499795e-06, "loss": 0.86533558, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26086426, "step": 2957, "time_per_iteration": 2.8634071350097656 }, { "auxiliary_loss_clip": 0.01606613, "auxiliary_loss_mlp": 0.01045167, "balance_loss_clip": 1.38088655, "balance_loss_mlp": 1.01963234, "epoch": 0.1778445813918533, "flos": 25639639407360.0, "grad_norm": 1.569165147851712, "language_loss": 0.75876069, "learning_rate": 3.7750570883711807e-06, "loss": 0.78527844, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25537109, "step": 2958, "time_per_iteration": 2.899592638015747 }, { "auxiliary_loss_clip": 0.01632393, "auxiliary_loss_mlp": 0.01046973, "balance_loss_clip": 1.40318155, "balance_loss_mlp": 1.02145004, "epoch": 0.17790470464452127, "flos": 22355535231360.0, "grad_norm": 2.6932169538337267, "language_loss": 0.81959009, "learning_rate": 3.7748776091820397e-06, "loss": 0.84638381, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.25524902, "step": 2959, "time_per_iteration": 2.886813163757324 }, { "auxiliary_loss_clip": 0.01638388, "auxiliary_loss_mlp": 0.01045764, "balance_loss_clip": 1.40372586, "balance_loss_mlp": 1.01813138, "epoch": 0.17796482789718923, "flos": 18773909377920.0, "grad_norm": 1.8258093158987274, "language_loss": 0.5291487, "learning_rate": 3.774698062689362e-06, "loss": 0.55599022, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.27624512, "step": 2960, "time_per_iteration": 2.872018337249756 }, { "auxiliary_loss_clip": 0.0161701, "auxiliary_loss_mlp": 0.01048812, "balance_loss_clip": 1.38705218, "balance_loss_mlp": 1.02194238, "epoch": 0.1780249511498572, "flos": 23451458232960.0, "grad_norm": 1.796405615850173, "language_loss": 0.90128624, "learning_rate": 3.7745184488999548e-06, "loss": 0.92794448, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 2.30078125, "router_z_loss_mlp": 0.26867676, "step": 2961, "time_per_iteration": 2.9000484943389893 }, { "auxiliary_loss_clip": 0.01640946, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.40679646, "balance_loss_mlp": 1.02200675, "epoch": 0.1780850744025252, "flos": 23377835664000.0, "grad_norm": 1.5396357020657288, "language_loss": 0.8010301, "learning_rate": 3.774338767820631e-06, "loss": 0.82794011, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 2.34179688, "router_z_loss_mlp": 0.28063965, "step": 2962, "time_per_iteration": 2.850339889526367 }, { "auxiliary_loss_clip": 0.01631854, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.39933097, "balance_loss_mlp": 1.01647806, "epoch": 0.17814519765519315, "flos": 13779944501760.0, "grad_norm": 1.617243927754289, "language_loss": 0.76183093, "learning_rate": 3.774159019458203e-06, "loss": 0.7885828, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 2.32617188, "router_z_loss_mlp": 0.26867676, "step": 2963, "time_per_iteration": 2.890949249267578 }, { "auxiliary_loss_clip": 0.0165682, "auxiliary_loss_mlp": 0.01047661, "balance_loss_clip": 1.41783059, "balance_loss_mlp": 1.01921797, "epoch": 0.17820532090786112, "flos": 21985205391360.0, "grad_norm": 1.473427185578327, "language_loss": 0.7942754, "learning_rate": 3.7739792038194877e-06, "loss": 0.82132018, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 2.38867188, "router_z_loss_mlp": 0.28417969, "step": 2964, "time_per_iteration": 2.840341806411743 }, { "auxiliary_loss_clip": 0.01627709, "auxiliary_loss_mlp": 0.01050203, "balance_loss_clip": 1.3959794, "balance_loss_mlp": 1.02404881, "epoch": 0.17826544416052909, "flos": 24801576376320.0, "grad_norm": 1.5932770154028075, "language_loss": 0.82208145, "learning_rate": 3.7737993209113027e-06, "loss": 0.84886056, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26171875, "step": 2965, "time_per_iteration": 2.9253883361816406 }, { "auxiliary_loss_clip": 0.01629136, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.39808202, "balance_loss_mlp": 1.02271652, "epoch": 0.17832556741319705, "flos": 13887075484800.0, "grad_norm": 2.705253662378235, "language_loss": 0.95503563, "learning_rate": 3.7736193707404698e-06, "loss": 0.98181301, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.2590332, "step": 2966, "time_per_iteration": 3.0353846549987793 }, { "auxiliary_loss_clip": 0.01634378, "auxiliary_loss_mlp": 0.01048985, "balance_loss_clip": 1.40333652, "balance_loss_mlp": 1.02125669, "epoch": 0.17838569066586502, "flos": 36653805889920.0, "grad_norm": 2.467933445685487, "language_loss": 0.73162389, "learning_rate": 3.7734393533138127e-06, "loss": 0.75845754, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.27734375, "step": 2967, "time_per_iteration": 2.972776412963867 }, { "auxiliary_loss_clip": 0.01618129, "auxiliary_loss_mlp": 0.01051719, "balance_loss_clip": 1.39357269, "balance_loss_mlp": 1.02550554, "epoch": 0.17844581391853298, "flos": 18734745363840.0, "grad_norm": 1.9656983214698367, "language_loss": 0.77106416, "learning_rate": 3.773259268638157e-06, "loss": 0.79776269, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.2623291, "step": 2968, "time_per_iteration": 2.8052916526794434 }, { "auxiliary_loss_clip": 0.01624457, "auxiliary_loss_mlp": 0.01046865, "balance_loss_clip": 1.39600265, "balance_loss_mlp": 1.02067506, "epoch": 0.17850593717120097, "flos": 27388980835200.0, "grad_norm": 2.2786863643561426, "language_loss": 0.76668841, "learning_rate": 3.7730791167203333e-06, "loss": 0.7934016, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26171875, "step": 2969, "time_per_iteration": 2.8895046710968018 }, { "auxiliary_loss_clip": 0.01384487, "auxiliary_loss_mlp": 0.01019812, "balance_loss_clip": 1.25388145, "balance_loss_mlp": 0.99930811, "epoch": 0.17856606042386894, "flos": 67024998460800.0, "grad_norm": 0.8316687423556082, "language_loss": 0.69017637, "learning_rate": 3.772898897567171e-06, "loss": 0.71421933, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 0.20507812, "step": 2970, "time_per_iteration": 3.4533793926239014 }, { "auxiliary_loss_clip": 0.01621727, "auxiliary_loss_mlp": 0.01047074, "balance_loss_clip": 1.38884199, "balance_loss_mlp": 1.01958418, "epoch": 0.1786261836765369, "flos": 36990989274240.0, "grad_norm": 1.9521587244786585, "language_loss": 0.68928576, "learning_rate": 3.772718611185505e-06, "loss": 0.71597379, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 0.27526855, "step": 2971, "time_per_iteration": 2.982680559158325 }, { "auxiliary_loss_clip": 0.01617922, "auxiliary_loss_mlp": 0.01051552, "balance_loss_clip": 1.38717365, "balance_loss_mlp": 1.02251291, "epoch": 0.17868630692920487, "flos": 24835808707200.0, "grad_norm": 1.8815234562354384, "language_loss": 0.90671802, "learning_rate": 3.7725382575821717e-06, "loss": 0.93341279, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.2902832, "step": 2972, "time_per_iteration": 2.8624298572540283 }, { "auxiliary_loss_clip": 0.01621218, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.39070415, "balance_loss_mlp": 1.02699471, "epoch": 0.17874643018187283, "flos": 16990561843200.0, "grad_norm": 1.8729448739556978, "language_loss": 0.88928127, "learning_rate": 3.77235783676401e-06, "loss": 0.9160282, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.26489258, "step": 2973, "time_per_iteration": 2.8614211082458496 }, { "auxiliary_loss_clip": 0.0161896, "auxiliary_loss_mlp": 0.01048597, "balance_loss_clip": 1.38838482, "balance_loss_mlp": 1.02125001, "epoch": 0.1788065534345408, "flos": 21041956903680.0, "grad_norm": 8.615802125905443, "language_loss": 0.76767939, "learning_rate": 3.7721773487378615e-06, "loss": 0.79435498, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.27392578, "step": 2974, "time_per_iteration": 2.8256115913391113 }, { "auxiliary_loss_clip": 0.01616259, "auxiliary_loss_mlp": 0.01056347, "balance_loss_clip": 1.38759375, "balance_loss_mlp": 1.02801061, "epoch": 0.17886667668720876, "flos": 23998243368960.0, "grad_norm": 2.189199356447207, "language_loss": 0.76109707, "learning_rate": 3.7719967935105705e-06, "loss": 0.78782314, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.28320312, "step": 2975, "time_per_iteration": 2.863884925842285 }, { "auxiliary_loss_clip": 0.0160543, "auxiliary_loss_mlp": 0.0104719, "balance_loss_clip": 1.3807472, "balance_loss_mlp": 1.02067769, "epoch": 0.17892679993987676, "flos": 25750570953600.0, "grad_norm": 1.5029797385412778, "language_loss": 0.7392534, "learning_rate": 3.7718161710889833e-06, "loss": 0.76577961, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26513672, "step": 2976, "time_per_iteration": 2.878466844558716 }, { "auxiliary_loss_clip": 0.01589862, "auxiliary_loss_mlp": 0.01047731, "balance_loss_clip": 1.37133121, "balance_loss_mlp": 1.02322197, "epoch": 0.17898692319254472, "flos": 25709913861120.0, "grad_norm": 1.5082773337038489, "language_loss": 0.77882051, "learning_rate": 3.7716354814799495e-06, "loss": 0.8051964, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24511719, "step": 2977, "time_per_iteration": 2.975379705429077 }, { "auxiliary_loss_clip": 0.01623631, "auxiliary_loss_mlp": 0.01054665, "balance_loss_clip": 1.39691019, "balance_loss_mlp": 1.02721155, "epoch": 0.1790470464452127, "flos": 19327164520320.0, "grad_norm": 2.054767639507437, "language_loss": 0.80732775, "learning_rate": 3.7714547246903203e-06, "loss": 0.83411068, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2746582, "step": 2978, "time_per_iteration": 2.973639488220215 }, { "auxiliary_loss_clip": 0.01619872, "auxiliary_loss_mlp": 0.01059272, "balance_loss_clip": 1.39080286, "balance_loss_mlp": 1.02975559, "epoch": 0.17910716969788065, "flos": 30056342135040.0, "grad_norm": 1.4666729429733372, "language_loss": 0.77505744, "learning_rate": 3.7712739007269508e-06, "loss": 0.80184889, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 2.2890625, "router_z_loss_mlp": 0.29541016, "step": 2979, "time_per_iteration": 4.302835941314697 }, { "auxiliary_loss_clip": 0.01597727, "auxiliary_loss_mlp": 0.01059374, "balance_loss_clip": 1.37575638, "balance_loss_mlp": 1.03106165, "epoch": 0.17916729295054862, "flos": 19437236415360.0, "grad_norm": 1.77361566499179, "language_loss": 0.70758104, "learning_rate": 3.7710930095966976e-06, "loss": 0.73415208, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.28344727, "step": 2980, "time_per_iteration": 2.9176223278045654 }, { "auxiliary_loss_clip": 0.01613347, "auxiliary_loss_mlp": 0.01050449, "balance_loss_clip": 1.38462484, "balance_loss_mlp": 1.01902556, "epoch": 0.17922741620321658, "flos": 14619817324800.0, "grad_norm": 1.7247385515337654, "language_loss": 0.71766913, "learning_rate": 3.7709120513064196e-06, "loss": 0.74430716, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.31420898, "step": 2981, "time_per_iteration": 2.9716856479644775 }, { "auxiliary_loss_clip": 0.01630647, "auxiliary_loss_mlp": 0.01055466, "balance_loss_clip": 1.39822352, "balance_loss_mlp": 1.02723694, "epoch": 0.17928753945588458, "flos": 17174165817600.0, "grad_norm": 2.117777545824384, "language_loss": 0.82878584, "learning_rate": 3.7707310258629796e-06, "loss": 0.85564697, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.28234863, "step": 2982, "time_per_iteration": 2.853656768798828 }, { "auxiliary_loss_clip": 0.01596498, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.37164021, "balance_loss_mlp": 1.0243721, "epoch": 0.17934766270855254, "flos": 31408405804800.0, "grad_norm": 1.4726549271428797, "language_loss": 0.83827752, "learning_rate": 3.7705499332732413e-06, "loss": 0.86476624, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27990723, "step": 2983, "time_per_iteration": 3.0725090503692627 }, { "auxiliary_loss_clip": 0.01617942, "auxiliary_loss_mlp": 0.01052617, "balance_loss_clip": 1.38429534, "balance_loss_mlp": 1.02467477, "epoch": 0.1794077859612205, "flos": 20824075353600.0, "grad_norm": 2.0526427173994675, "language_loss": 0.86775917, "learning_rate": 3.7703687735440718e-06, "loss": 0.89446473, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 2.33789062, "router_z_loss_mlp": 0.27954102, "step": 2984, "time_per_iteration": 2.9185433387756348 }, { "auxiliary_loss_clip": 0.01621136, "auxiliary_loss_mlp": 0.01046714, "balance_loss_clip": 1.39019859, "balance_loss_mlp": 1.01891446, "epoch": 0.17946790921388847, "flos": 28998316293120.0, "grad_norm": 1.3893461191685796, "language_loss": 0.89761209, "learning_rate": 3.7701875466823416e-06, "loss": 0.92429054, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27783203, "step": 2985, "time_per_iteration": 2.9402990341186523 }, { "auxiliary_loss_clip": 0.01596829, "auxiliary_loss_mlp": 0.01044294, "balance_loss_clip": 1.37592387, "balance_loss_mlp": 1.01803195, "epoch": 0.17952803246655644, "flos": 20746290263040.0, "grad_norm": 1.9970787276600273, "language_loss": 0.71329236, "learning_rate": 3.770006252694922e-06, "loss": 0.7397036, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26269531, "step": 2986, "time_per_iteration": 4.266680479049683 }, { "auxiliary_loss_clip": 0.01596939, "auxiliary_loss_mlp": 0.01045282, "balance_loss_clip": 1.37221336, "balance_loss_mlp": 1.01729202, "epoch": 0.1795881557192244, "flos": 28267474734720.0, "grad_norm": 2.6105307330933787, "language_loss": 0.78426725, "learning_rate": 3.769824891588688e-06, "loss": 0.81068945, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2800293, "step": 2987, "time_per_iteration": 2.8936524391174316 }, { "auxiliary_loss_clip": 0.01625778, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.39434302, "balance_loss_mlp": 1.0201931, "epoch": 0.17964827897189237, "flos": 18561321469440.0, "grad_norm": 1.7373078006280476, "language_loss": 0.79623419, "learning_rate": 3.7696434633705164e-06, "loss": 0.82297164, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 2.31445312, "router_z_loss_mlp": 0.27783203, "step": 2988, "time_per_iteration": 5.822129726409912 }, { "auxiliary_loss_clip": 0.01383321, "auxiliary_loss_mlp": 0.01050976, "balance_loss_clip": 1.25470257, "balance_loss_mlp": 1.02818298, "epoch": 0.17970840222456036, "flos": 58191412026240.0, "grad_norm": 0.7665769679826688, "language_loss": 0.62763727, "learning_rate": 3.7694619680472875e-06, "loss": 0.65198028, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 1.2890625, "router_z_loss_mlp": 0.22753906, "step": 2989, "time_per_iteration": 3.3023786544799805 }, { "auxiliary_loss_clip": 0.01612458, "auxiliary_loss_mlp": 0.01049321, "balance_loss_clip": 1.38476038, "balance_loss_mlp": 1.02121186, "epoch": 0.17976852547722832, "flos": 20309893735680.0, "grad_norm": 1.8565261101538135, "language_loss": 0.7194463, "learning_rate": 3.7692804056258837e-06, "loss": 0.74606407, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.28100586, "step": 2990, "time_per_iteration": 2.8790295124053955 }, { "auxiliary_loss_clip": 0.01611332, "auxiliary_loss_mlp": 0.01052472, "balance_loss_clip": 1.38038731, "balance_loss_mlp": 1.02448201, "epoch": 0.1798286487298963, "flos": 39682086111360.0, "grad_norm": 2.0921015494584974, "language_loss": 0.70240724, "learning_rate": 3.7690987761131893e-06, "loss": 0.72904527, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 2.30859375, "router_z_loss_mlp": 0.28027344, "step": 2991, "time_per_iteration": 2.9883179664611816 }, { "auxiliary_loss_clip": 0.01600666, "auxiliary_loss_mlp": 0.01057478, "balance_loss_clip": 1.37411189, "balance_loss_mlp": 1.029917, "epoch": 0.17988877198256426, "flos": 25531286814720.0, "grad_norm": 2.011859847521587, "language_loss": 0.83643544, "learning_rate": 3.7689170795160924e-06, "loss": 0.86301684, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.27587891, "step": 2992, "time_per_iteration": 2.887079954147339 }, { "auxiliary_loss_clip": 0.01581138, "auxiliary_loss_mlp": 0.01049317, "balance_loss_clip": 1.36227262, "balance_loss_mlp": 1.02284026, "epoch": 0.17994889523523222, "flos": 18816873955200.0, "grad_norm": 1.9624032307797579, "language_loss": 0.83637226, "learning_rate": 3.7687353158414822e-06, "loss": 0.86267686, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26489258, "step": 2993, "time_per_iteration": 2.8448290824890137 }, { "auxiliary_loss_clip": 0.01606904, "auxiliary_loss_mlp": 0.01050248, "balance_loss_clip": 1.3782258, "balance_loss_mlp": 1.02312815, "epoch": 0.18000901848790019, "flos": 21113995904640.0, "grad_norm": 1.7209832391186357, "language_loss": 0.79388607, "learning_rate": 3.7685534850962517e-06, "loss": 0.82045758, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.27124023, "step": 2994, "time_per_iteration": 2.8345654010772705 }, { "auxiliary_loss_clip": 0.01616864, "auxiliary_loss_mlp": 0.01055484, "balance_loss_clip": 1.38722324, "balance_loss_mlp": 1.02803016, "epoch": 0.18006914174056818, "flos": 19655977616640.0, "grad_norm": 2.2088684044925695, "language_loss": 0.81640399, "learning_rate": 3.768371587287296e-06, "loss": 0.84312743, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27453613, "step": 2995, "time_per_iteration": 2.866931200027466 }, { "auxiliary_loss_clip": 0.01614502, "auxiliary_loss_mlp": 0.01049488, "balance_loss_clip": 1.3873806, "balance_loss_mlp": 1.02439463, "epoch": 0.18012926499323614, "flos": 19509230171520.0, "grad_norm": 1.5732920666537045, "language_loss": 0.85159761, "learning_rate": 3.768189622421512e-06, "loss": 0.87823755, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.25097656, "step": 2996, "time_per_iteration": 2.8279361724853516 }, { "auxiliary_loss_clip": 0.01591131, "auxiliary_loss_mlp": 0.01051631, "balance_loss_clip": 1.3711952, "balance_loss_mlp": 1.02519035, "epoch": 0.1801893882459041, "flos": 19474183434240.0, "grad_norm": 1.5162828933255263, "language_loss": 0.88859338, "learning_rate": 3.7680075905058006e-06, "loss": 0.915021, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.2644043, "step": 2997, "time_per_iteration": 2.891843557357788 }, { "auxiliary_loss_clip": 0.01612156, "auxiliary_loss_mlp": 0.01051717, "balance_loss_clip": 1.3797996, "balance_loss_mlp": 1.02421546, "epoch": 0.18024951149857207, "flos": 26881993140480.0, "grad_norm": 1.715242369542393, "language_loss": 0.86400568, "learning_rate": 3.7678254915470643e-06, "loss": 0.89064431, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.27478027, "step": 2998, "time_per_iteration": 2.875412702560425 }, { "auxiliary_loss_clip": 0.01595914, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.37371993, "balance_loss_mlp": 1.0191294, "epoch": 0.18030963475124004, "flos": 30238724499840.0, "grad_norm": 1.555222437926751, "language_loss": 0.85787821, "learning_rate": 3.7676433255522084e-06, "loss": 0.88429755, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.26928711, "step": 2999, "time_per_iteration": 2.9452645778656006 }, { "auxiliary_loss_clip": 0.01599154, "auxiliary_loss_mlp": 0.01047772, "balance_loss_clip": 1.37348986, "balance_loss_mlp": 1.01975775, "epoch": 0.180369758003908, "flos": 22317095134080.0, "grad_norm": 3.1077099751266144, "language_loss": 0.76249516, "learning_rate": 3.76746109252814e-06, "loss": 0.78896439, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28015137, "step": 3000, "time_per_iteration": 2.9183127880096436 }, { "auxiliary_loss_clip": 0.0158489, "auxiliary_loss_mlp": 0.01054641, "balance_loss_clip": 1.36469936, "balance_loss_mlp": 1.02654326, "epoch": 0.18042988125657597, "flos": 23742419414400.0, "grad_norm": 1.7054782225155767, "language_loss": 0.7210319, "learning_rate": 3.76727879248177e-06, "loss": 0.74742723, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.28100586, "step": 3001, "time_per_iteration": 2.851259231567383 }, { "auxiliary_loss_clip": 0.01607875, "auxiliary_loss_mlp": 0.01048931, "balance_loss_clip": 1.37844682, "balance_loss_mlp": 1.02139354, "epoch": 0.18049000450924396, "flos": 24103202601600.0, "grad_norm": 2.2414862607342507, "language_loss": 0.90090787, "learning_rate": 3.767096425420011e-06, "loss": 0.92747593, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.2755127, "step": 3002, "time_per_iteration": 3.0116467475891113 }, { "auxiliary_loss_clip": 0.01595342, "auxiliary_loss_mlp": 0.01046933, "balance_loss_clip": 1.37004817, "balance_loss_mlp": 1.01984906, "epoch": 0.18055012776191193, "flos": 22173152866560.0, "grad_norm": 2.0011063982639956, "language_loss": 0.81863594, "learning_rate": 3.7669139913497788e-06, "loss": 0.84505874, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27099609, "step": 3003, "time_per_iteration": 2.8391895294189453 }, { "auxiliary_loss_clip": 0.01595438, "auxiliary_loss_mlp": 0.01048747, "balance_loss_clip": 1.36877847, "balance_loss_mlp": 1.01992202, "epoch": 0.1806102510145799, "flos": 28925372396160.0, "grad_norm": 1.9514632307231041, "language_loss": 0.68379909, "learning_rate": 3.7667314902779907e-06, "loss": 0.71024096, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.28808594, "step": 3004, "time_per_iteration": 2.925210952758789 }, { "auxiliary_loss_clip": 0.01604791, "auxiliary_loss_mlp": 0.01043112, "balance_loss_clip": 1.37688792, "balance_loss_mlp": 1.01576591, "epoch": 0.18067037426724786, "flos": 19034936484480.0, "grad_norm": 1.668662036338473, "language_loss": 0.86156332, "learning_rate": 3.7665489222115677e-06, "loss": 0.88804239, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.2734375, "step": 3005, "time_per_iteration": 2.8200674057006836 }, { "auxiliary_loss_clip": 0.01585038, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.36439681, "balance_loss_mlp": 1.01903439, "epoch": 0.18073049751991582, "flos": 27465001378560.0, "grad_norm": 1.4200395099157528, "language_loss": 0.8350445, "learning_rate": 3.766366287157432e-06, "loss": 0.86135268, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26745605, "step": 3006, "time_per_iteration": 2.8907268047332764 }, { "auxiliary_loss_clip": 0.01595843, "auxiliary_loss_mlp": 0.01051116, "balance_loss_clip": 1.37060285, "balance_loss_mlp": 1.02229154, "epoch": 0.1807906207725838, "flos": 28740320588160.0, "grad_norm": 2.080788231387073, "language_loss": 0.78320837, "learning_rate": 3.7661835851225103e-06, "loss": 0.80967796, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.2878418, "step": 3007, "time_per_iteration": 2.9011638164520264 }, { "auxiliary_loss_clip": 0.01372586, "auxiliary_loss_mlp": 0.0106574, "balance_loss_clip": 1.2390368, "balance_loss_mlp": 1.03980052, "epoch": 0.18085074402525175, "flos": 64501190208000.0, "grad_norm": 0.8189027334258258, "language_loss": 0.56896174, "learning_rate": 3.7660008161137294e-06, "loss": 0.59334505, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 1.3359375, "router_z_loss_mlp": 0.25976562, "step": 3008, "time_per_iteration": 3.518815279006958 }, { "auxiliary_loss_clip": 0.01599811, "auxiliary_loss_mlp": 0.01052934, "balance_loss_clip": 1.37089169, "balance_loss_mlp": 1.02480125, "epoch": 0.18091086727791975, "flos": 23487364621440.0, "grad_norm": 1.6635814046776505, "language_loss": 0.69269216, "learning_rate": 3.765817980138021e-06, "loss": 0.71921957, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.28149414, "step": 3009, "time_per_iteration": 2.9329047203063965 }, { "auxiliary_loss_clip": 0.01601786, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.37486005, "balance_loss_mlp": 1.01493585, "epoch": 0.1809709905305877, "flos": 24181123426560.0, "grad_norm": 1.821955060163229, "language_loss": 0.770661, "learning_rate": 3.7656350772023177e-06, "loss": 0.79709041, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2623291, "step": 3010, "time_per_iteration": 2.8764238357543945 }, { "auxiliary_loss_clip": 0.01563878, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.34680688, "balance_loss_mlp": 1.0171833, "epoch": 0.18103111378325568, "flos": 21660373837440.0, "grad_norm": 1.7261735831896263, "language_loss": 0.69120538, "learning_rate": 3.7654521073135553e-06, "loss": 0.71727669, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26098633, "step": 3011, "time_per_iteration": 2.834564685821533 }, { "auxiliary_loss_clip": 0.0158813, "auxiliary_loss_mlp": 0.01048524, "balance_loss_clip": 1.36339331, "balance_loss_mlp": 1.02074838, "epoch": 0.18109123703592364, "flos": 53705095534080.0, "grad_norm": 1.6236451804044802, "language_loss": 0.7213937, "learning_rate": 3.7652690704786723e-06, "loss": 0.7477603, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27783203, "step": 3012, "time_per_iteration": 3.198803663253784 }, { "auxiliary_loss_clip": 0.0157766, "auxiliary_loss_mlp": 0.01048352, "balance_loss_clip": 1.35901237, "balance_loss_mlp": 1.02009976, "epoch": 0.1811513602885916, "flos": 35859159884160.0, "grad_norm": 2.1952698678065383, "language_loss": 0.63972127, "learning_rate": 3.765085966704609e-06, "loss": 0.66598141, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.28222656, "step": 3013, "time_per_iteration": 2.95149302482605 }, { "auxiliary_loss_clip": 0.01596709, "auxiliary_loss_mlp": 0.0104878, "balance_loss_clip": 1.37229824, "balance_loss_mlp": 1.02182722, "epoch": 0.18121148354125957, "flos": 23743098086400.0, "grad_norm": 1.6542926420986608, "language_loss": 0.768538, "learning_rate": 3.764902795998309e-06, "loss": 0.79499292, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26953125, "step": 3014, "time_per_iteration": 4.232020378112793 }, { "auxiliary_loss_clip": 0.01621062, "auxiliary_loss_mlp": 0.01046864, "balance_loss_clip": 1.38858461, "balance_loss_mlp": 1.01808667, "epoch": 0.18127160679392756, "flos": 28739415692160.0, "grad_norm": 2.6504177648213, "language_loss": 0.66336465, "learning_rate": 3.7647195583667184e-06, "loss": 0.69004393, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 2.32421875, "router_z_loss_mlp": 0.2878418, "step": 3015, "time_per_iteration": 2.9526240825653076 }, { "auxiliary_loss_clip": 0.01588193, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.36676002, "balance_loss_mlp": 1.01636672, "epoch": 0.18133173004659553, "flos": 20494674074880.0, "grad_norm": 1.6132220761490403, "language_loss": 0.78970933, "learning_rate": 3.764536253816785e-06, "loss": 0.81601655, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26196289, "step": 3016, "time_per_iteration": 2.9327392578125 }, { "auxiliary_loss_clip": 0.01616927, "auxiliary_loss_mlp": 0.01055377, "balance_loss_clip": 1.3875432, "balance_loss_mlp": 1.02869785, "epoch": 0.1813918532992635, "flos": 22861120337280.0, "grad_norm": 2.639956202503164, "language_loss": 0.84328079, "learning_rate": 3.7643528823554602e-06, "loss": 0.87000376, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.26647949, "step": 3017, "time_per_iteration": 2.880951404571533 }, { "auxiliary_loss_clip": 0.01589937, "auxiliary_loss_mlp": 0.01039348, "balance_loss_clip": 1.37006938, "balance_loss_mlp": 1.01376581, "epoch": 0.18145197655193146, "flos": 36078941715840.0, "grad_norm": 1.7469010677750592, "language_loss": 0.68389225, "learning_rate": 3.764169443989697e-06, "loss": 0.71018517, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25598145, "step": 3018, "time_per_iteration": 3.005589246749878 }, { "auxiliary_loss_clip": 0.0159934, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.37099195, "balance_loss_mlp": 1.01526237, "epoch": 0.18151209980459942, "flos": 24034421226240.0, "grad_norm": 2.2229472042761564, "language_loss": 0.77318674, "learning_rate": 3.7639859387264518e-06, "loss": 0.79959565, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.26318359, "step": 3019, "time_per_iteration": 2.8761940002441406 }, { "auxiliary_loss_clip": 0.01605896, "auxiliary_loss_mlp": 0.01051326, "balance_loss_clip": 1.37794065, "balance_loss_mlp": 1.02355015, "epoch": 0.1815722230572674, "flos": 23962246490880.0, "grad_norm": 1.9909950487827597, "language_loss": 0.82786679, "learning_rate": 3.7638023665726834e-06, "loss": 0.85443902, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.27746582, "step": 3020, "time_per_iteration": 4.235971450805664 }, { "auxiliary_loss_clip": 0.01594797, "auxiliary_loss_mlp": 0.01044329, "balance_loss_clip": 1.37069631, "balance_loss_mlp": 1.01848459, "epoch": 0.18163234630993536, "flos": 24396426023040.0, "grad_norm": 2.262727653193273, "language_loss": 0.79409486, "learning_rate": 3.763618727535352e-06, "loss": 0.82048607, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.25830078, "step": 3021, "time_per_iteration": 2.8747997283935547 }, { "auxiliary_loss_clip": 0.01573456, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.35263133, "balance_loss_mlp": 1.02111816, "epoch": 0.18169246956260335, "flos": 24692137908480.0, "grad_norm": 1.5309916704481707, "language_loss": 0.85972619, "learning_rate": 3.763435021621422e-06, "loss": 0.88595164, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.27954102, "step": 3022, "time_per_iteration": 2.898169755935669 }, { "auxiliary_loss_clip": 0.01590368, "auxiliary_loss_mlp": 0.01047169, "balance_loss_clip": 1.36484706, "balance_loss_mlp": 1.02044272, "epoch": 0.1817525928152713, "flos": 24253931589120.0, "grad_norm": 1.7296631339070057, "language_loss": 0.70935148, "learning_rate": 3.763251248837859e-06, "loss": 0.73572683, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26721191, "step": 3023, "time_per_iteration": 5.948983192443848 }, { "auxiliary_loss_clip": 0.0157593, "auxiliary_loss_mlp": 0.01045735, "balance_loss_clip": 1.35315979, "balance_loss_mlp": 1.02000976, "epoch": 0.18181271606793928, "flos": 16480768970880.0, "grad_norm": 1.6427241051333257, "language_loss": 0.75114805, "learning_rate": 3.7630674091916317e-06, "loss": 0.77736473, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25744629, "step": 3024, "time_per_iteration": 2.8479886054992676 }, { "auxiliary_loss_clip": 0.01581776, "auxiliary_loss_mlp": 0.01041632, "balance_loss_clip": 1.35910428, "balance_loss_mlp": 1.0165143, "epoch": 0.18187283932060724, "flos": 18588405121920.0, "grad_norm": 2.716885197252315, "language_loss": 0.89975691, "learning_rate": 3.7628835026897123e-06, "loss": 0.92599094, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25097656, "step": 3025, "time_per_iteration": 2.8938727378845215 }, { "auxiliary_loss_clip": 0.01585352, "auxiliary_loss_mlp": 0.01050309, "balance_loss_clip": 1.36393666, "balance_loss_mlp": 1.02416599, "epoch": 0.1819329625732752, "flos": 20276566300800.0, "grad_norm": 1.7875175134911327, "language_loss": 0.79645306, "learning_rate": 3.7626995293390735e-06, "loss": 0.82280964, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26147461, "step": 3026, "time_per_iteration": 2.8368606567382812 }, { "auxiliary_loss_clip": 0.0160756, "auxiliary_loss_mlp": 0.01045796, "balance_loss_clip": 1.38061786, "balance_loss_mlp": 1.01896167, "epoch": 0.18199308582594317, "flos": 25924583030400.0, "grad_norm": 1.7312307695549096, "language_loss": 0.76823127, "learning_rate": 3.762515489146692e-06, "loss": 0.79476482, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26806641, "step": 3027, "time_per_iteration": 2.8911874294281006 }, { "auxiliary_loss_clip": 0.01610078, "auxiliary_loss_mlp": 0.01052672, "balance_loss_clip": 1.37770486, "balance_loss_mlp": 1.02511096, "epoch": 0.18205320907861114, "flos": 15385931844480.0, "grad_norm": 2.07349572309412, "language_loss": 0.86137521, "learning_rate": 3.762331382119546e-06, "loss": 0.88800275, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 2.32226562, "router_z_loss_mlp": 0.27563477, "step": 3028, "time_per_iteration": 2.7945613861083984 }, { "auxiliary_loss_clip": 0.01600691, "auxiliary_loss_mlp": 0.01046652, "balance_loss_clip": 1.3737191, "balance_loss_mlp": 1.02019906, "epoch": 0.18211333233127913, "flos": 25633893317760.0, "grad_norm": 1.6982789691425906, "language_loss": 0.83440506, "learning_rate": 3.7621472082646183e-06, "loss": 0.86087847, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26452637, "step": 3029, "time_per_iteration": 2.88871431350708 }, { "auxiliary_loss_clip": 0.01613224, "auxiliary_loss_mlp": 0.01053521, "balance_loss_clip": 1.38581991, "balance_loss_mlp": 1.02662754, "epoch": 0.1821734555839471, "flos": 14984265340800.0, "grad_norm": 1.8990703094625885, "language_loss": 0.79456413, "learning_rate": 3.761962967588891e-06, "loss": 0.82123154, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26904297, "step": 3030, "time_per_iteration": 2.8022425174713135 }, { "auxiliary_loss_clip": 0.01601537, "auxiliary_loss_mlp": 0.01054784, "balance_loss_clip": 1.37310338, "balance_loss_mlp": 1.02797365, "epoch": 0.18223357883661506, "flos": 20203758138240.0, "grad_norm": 2.42246219127193, "language_loss": 0.86471856, "learning_rate": 3.761778660099352e-06, "loss": 0.89128178, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 2.28515625, "router_z_loss_mlp": 0.26818848, "step": 3031, "time_per_iteration": 2.833371639251709 }, { "auxiliary_loss_clip": 0.01602777, "auxiliary_loss_mlp": 0.01051704, "balance_loss_clip": 1.37483692, "balance_loss_mlp": 1.02472734, "epoch": 0.18229370208928303, "flos": 15240451253760.0, "grad_norm": 1.6932489490197948, "language_loss": 0.81326652, "learning_rate": 3.76159428580299e-06, "loss": 0.83981133, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27001953, "step": 3032, "time_per_iteration": 2.7804315090179443 }, { "auxiliary_loss_clip": 0.0161761, "auxiliary_loss_mlp": 0.01059246, "balance_loss_clip": 1.38292086, "balance_loss_mlp": 1.03330612, "epoch": 0.182353825341951, "flos": 23850636272640.0, "grad_norm": 1.9536347406283008, "language_loss": 0.82120931, "learning_rate": 3.761409844706795e-06, "loss": 0.84797788, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 2.34765625, "router_z_loss_mlp": 0.2598877, "step": 3033, "time_per_iteration": 2.872609853744507 }, { "auxiliary_loss_clip": 0.01342822, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.20859289, "balance_loss_mlp": 1.01655507, "epoch": 0.18241394859461896, "flos": 61217430007680.0, "grad_norm": 0.8938365965950319, "language_loss": 0.63502586, "learning_rate": 3.7612253368177625e-06, "loss": 0.6588695, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.25, "step": 3034, "time_per_iteration": 3.263434648513794 }, { "auxiliary_loss_clip": 0.0159771, "auxiliary_loss_mlp": 0.01049952, "balance_loss_clip": 1.37023187, "balance_loss_mlp": 1.0226295, "epoch": 0.18247407184728695, "flos": 18479057143680.0, "grad_norm": 1.9252464848018913, "language_loss": 0.81432569, "learning_rate": 3.7610407621428893e-06, "loss": 0.84080231, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27331543, "step": 3035, "time_per_iteration": 2.8233375549316406 }, { "auxiliary_loss_clip": 0.01577208, "auxiliary_loss_mlp": 0.01051572, "balance_loss_clip": 1.35646927, "balance_loss_mlp": 1.02501202, "epoch": 0.18253419509995492, "flos": 21804270860160.0, "grad_norm": 1.9406629030865525, "language_loss": 0.85605156, "learning_rate": 3.7608561206891735e-06, "loss": 0.88233942, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26586914, "step": 3036, "time_per_iteration": 2.9440064430236816 }, { "auxiliary_loss_clip": 0.01566036, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.35066855, "balance_loss_mlp": 1.01570296, "epoch": 0.18259431835262288, "flos": 20157490690560.0, "grad_norm": 1.87785190411536, "language_loss": 0.80992097, "learning_rate": 3.760671412463617e-06, "loss": 0.83600074, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26245117, "step": 3037, "time_per_iteration": 2.8726749420166016 }, { "auxiliary_loss_clip": 0.0158822, "auxiliary_loss_mlp": 0.01051862, "balance_loss_clip": 1.36233878, "balance_loss_mlp": 1.02322817, "epoch": 0.18265444160529085, "flos": 16990154640000.0, "grad_norm": 3.3174196948556665, "language_loss": 0.81417024, "learning_rate": 3.7604866374732246e-06, "loss": 0.84057105, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.28662109, "step": 3038, "time_per_iteration": 2.8423168659210205 }, { "auxiliary_loss_clip": 0.01576221, "auxiliary_loss_mlp": 0.01052155, "balance_loss_clip": 1.35638511, "balance_loss_mlp": 1.02566695, "epoch": 0.1827145648579588, "flos": 34436640781440.0, "grad_norm": 2.0546133293214526, "language_loss": 0.68376994, "learning_rate": 3.7603017957250023e-06, "loss": 0.71005368, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26489258, "step": 3039, "time_per_iteration": 2.9753291606903076 }, { "auxiliary_loss_clip": 0.01590552, "auxiliary_loss_mlp": 0.0105184, "balance_loss_clip": 1.36614609, "balance_loss_mlp": 1.02629352, "epoch": 0.18277468811062678, "flos": 53305465046400.0, "grad_norm": 1.7009241634455452, "language_loss": 0.74687195, "learning_rate": 3.7601168872259593e-06, "loss": 0.77329582, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25585938, "step": 3040, "time_per_iteration": 3.1346511840820312 }, { "auxiliary_loss_clip": 0.01584328, "auxiliary_loss_mlp": 0.01054913, "balance_loss_clip": 1.36361814, "balance_loss_mlp": 1.02613616, "epoch": 0.18283481136329474, "flos": 31663415352960.0, "grad_norm": 1.600453395815543, "language_loss": 0.61668408, "learning_rate": 3.7599319119831075e-06, "loss": 0.64307648, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.28808594, "step": 3041, "time_per_iteration": 2.9518566131591797 }, { "auxiliary_loss_clip": 0.01596128, "auxiliary_loss_mlp": 0.01051832, "balance_loss_clip": 1.37067115, "balance_loss_mlp": 1.02429461, "epoch": 0.18289493461596273, "flos": 53155188506880.0, "grad_norm": 1.4857973684734103, "language_loss": 0.60755801, "learning_rate": 3.7597468700034616e-06, "loss": 0.63403767, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.27539062, "step": 3042, "time_per_iteration": 3.129289388656616 }, { "auxiliary_loss_clip": 0.01590056, "auxiliary_loss_mlp": 0.01050955, "balance_loss_clip": 1.3668561, "balance_loss_mlp": 1.02275014, "epoch": 0.1829550578686307, "flos": 25599344273280.0, "grad_norm": 1.482706649275083, "language_loss": 0.88618481, "learning_rate": 3.7595617612940374e-06, "loss": 0.91259497, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.28222656, "step": 3043, "time_per_iteration": 2.871034860610962 }, { "auxiliary_loss_clip": 0.01589577, "auxiliary_loss_mlp": 0.01046097, "balance_loss_clip": 1.36364007, "balance_loss_mlp": 1.01882195, "epoch": 0.18301518112129866, "flos": 22611630654720.0, "grad_norm": 1.8556504998421095, "language_loss": 0.71551585, "learning_rate": 3.7593765858618552e-06, "loss": 0.74187255, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27294922, "step": 3044, "time_per_iteration": 2.83789324760437 }, { "auxiliary_loss_clip": 0.01620762, "auxiliary_loss_mlp": 0.01052758, "balance_loss_clip": 1.39048076, "balance_loss_mlp": 1.02423084, "epoch": 0.18307530437396663, "flos": 34033797912960.0, "grad_norm": 2.6027762387892523, "language_loss": 0.6581406, "learning_rate": 3.7591913437139365e-06, "loss": 0.68487585, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 2.30273438, "router_z_loss_mlp": 0.28527832, "step": 3045, "time_per_iteration": 2.992873191833496 }, { "auxiliary_loss_clip": 0.0160302, "auxiliary_loss_mlp": 0.01049151, "balance_loss_clip": 1.3807795, "balance_loss_mlp": 1.02334189, "epoch": 0.1831354276266346, "flos": 21287510288640.0, "grad_norm": 2.6264895227660583, "language_loss": 0.80077684, "learning_rate": 3.7590060348573066e-06, "loss": 0.82729852, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.2578125, "step": 3046, "time_per_iteration": 2.885042428970337 }, { "auxiliary_loss_clip": 0.01601826, "auxiliary_loss_mlp": 0.0105003, "balance_loss_clip": 1.37252033, "balance_loss_mlp": 1.02184868, "epoch": 0.18319555087930256, "flos": 21042861799680.0, "grad_norm": 1.7252790470988058, "language_loss": 0.79782617, "learning_rate": 3.7588206592989903e-06, "loss": 0.82434469, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 2.29101562, "router_z_loss_mlp": 0.28161621, "step": 3047, "time_per_iteration": 2.8199899196624756 }, { "auxiliary_loss_clip": 0.01589127, "auxiliary_loss_mlp": 0.01048802, "balance_loss_clip": 1.36805117, "balance_loss_mlp": 1.02219462, "epoch": 0.18325567413197055, "flos": 34395350261760.0, "grad_norm": 2.8035015328506825, "language_loss": 0.81619906, "learning_rate": 3.7586352170460194e-06, "loss": 0.84257835, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26635742, "step": 3048, "time_per_iteration": 2.9440200328826904 }, { "auxiliary_loss_clip": 0.01598112, "auxiliary_loss_mlp": 0.01048835, "balance_loss_clip": 1.37411475, "balance_loss_mlp": 1.02250195, "epoch": 0.18331579738463852, "flos": 20568206154240.0, "grad_norm": 1.9058231202847264, "language_loss": 0.87774944, "learning_rate": 3.758449708105424e-06, "loss": 0.90421903, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26367188, "step": 3049, "time_per_iteration": 4.247037887573242 }, { "auxiliary_loss_clip": 0.01623745, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.38744581, "balance_loss_mlp": 1.03025973, "epoch": 0.18337592063730648, "flos": 19616858847360.0, "grad_norm": 2.7706086819088855, "language_loss": 0.78558272, "learning_rate": 3.75826413248424e-06, "loss": 0.81239933, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 2.36132812, "router_z_loss_mlp": 0.27661133, "step": 3050, "time_per_iteration": 2.787618398666382 }, { "auxiliary_loss_clip": 0.01598559, "auxiliary_loss_mlp": 0.01055698, "balance_loss_clip": 1.37307954, "balance_loss_mlp": 1.02916241, "epoch": 0.18343604388997445, "flos": 20860841393280.0, "grad_norm": 2.213225992882047, "language_loss": 1.00510514, "learning_rate": 3.7580784901895035e-06, "loss": 1.03164768, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.265625, "step": 3051, "time_per_iteration": 2.9342830181121826 }, { "auxiliary_loss_clip": 0.01600912, "auxiliary_loss_mlp": 0.01059523, "balance_loss_clip": 1.37968016, "balance_loss_mlp": 1.0330112, "epoch": 0.1834961671426424, "flos": 24406379879040.0, "grad_norm": 1.4188719815068753, "language_loss": 0.87281567, "learning_rate": 3.7578927812282542e-06, "loss": 0.89942002, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26501465, "step": 3052, "time_per_iteration": 2.8994123935699463 }, { "auxiliary_loss_clip": 0.01590446, "auxiliary_loss_mlp": 0.01056849, "balance_loss_clip": 1.36940038, "balance_loss_mlp": 1.03183913, "epoch": 0.18355629039531038, "flos": 21261512511360.0, "grad_norm": 1.7822702781039785, "language_loss": 0.74749172, "learning_rate": 3.7577070056075356e-06, "loss": 0.77396464, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25, "step": 3053, "time_per_iteration": 2.835766315460205 }, { "auxiliary_loss_clip": 0.01621529, "auxiliary_loss_mlp": 0.01069215, "balance_loss_clip": 1.39365673, "balance_loss_mlp": 1.04053307, "epoch": 0.18361641364797834, "flos": 28667874384000.0, "grad_norm": 1.8770222838086152, "language_loss": 0.63112718, "learning_rate": 3.7575211633343902e-06, "loss": 0.65803462, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.28710938, "step": 3054, "time_per_iteration": 2.9282031059265137 }, { "auxiliary_loss_clip": 0.01600925, "auxiliary_loss_mlp": 0.01048639, "balance_loss_clip": 1.37507975, "balance_loss_mlp": 1.02399874, "epoch": 0.18367653690064634, "flos": 20927496263040.0, "grad_norm": 1.828336230725532, "language_loss": 0.79577047, "learning_rate": 3.7573352544158663e-06, "loss": 0.8222661, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.24658203, "step": 3055, "time_per_iteration": 2.8252155780792236 }, { "auxiliary_loss_clip": 0.01591882, "auxiliary_loss_mlp": 0.01069175, "balance_loss_clip": 1.37298083, "balance_loss_mlp": 1.04404533, "epoch": 0.1837366601533143, "flos": 28776905648640.0, "grad_norm": 1.7063745000853896, "language_loss": 0.70883799, "learning_rate": 3.757149278859014e-06, "loss": 0.73544854, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25109863, "step": 3056, "time_per_iteration": 4.451047420501709 }, { "auxiliary_loss_clip": 0.01593275, "auxiliary_loss_mlp": 0.01056205, "balance_loss_clip": 1.3692323, "balance_loss_mlp": 1.03180289, "epoch": 0.18379678340598227, "flos": 21261286287360.0, "grad_norm": 1.7862100494979685, "language_loss": 0.80993521, "learning_rate": 3.7569632366708842e-06, "loss": 0.83643007, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 2.23730469, "router_z_loss_mlp": 0.24414062, "step": 3057, "time_per_iteration": 4.323470115661621 }, { "auxiliary_loss_clip": 0.01633036, "auxiliary_loss_mlp": 0.01066663, "balance_loss_clip": 1.39809716, "balance_loss_mlp": 1.03910196, "epoch": 0.18385690665865023, "flos": 20459491603200.0, "grad_norm": 2.0278979092414064, "language_loss": 0.83253753, "learning_rate": 3.756777127858533e-06, "loss": 0.8595345, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 2.34960938, "router_z_loss_mlp": 0.27612305, "step": 3058, "time_per_iteration": 4.268897533416748 }, { "auxiliary_loss_clip": 0.01604676, "auxiliary_loss_mlp": 0.0105737, "balance_loss_clip": 1.37649333, "balance_loss_mlp": 1.03183579, "epoch": 0.1839170299113182, "flos": 26151694519680.0, "grad_norm": 2.289172897646812, "language_loss": 0.86897445, "learning_rate": 3.756590952429017e-06, "loss": 0.89559484, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 2.28320312, "router_z_loss_mlp": 0.2557373, "step": 3059, "time_per_iteration": 2.980544328689575 }, { "auxiliary_loss_clip": 0.01598213, "auxiliary_loss_mlp": 0.01050105, "balance_loss_clip": 1.37391472, "balance_loss_mlp": 1.02494025, "epoch": 0.18397715316398616, "flos": 31770274867200.0, "grad_norm": 1.5384141609670152, "language_loss": 0.73199058, "learning_rate": 3.756404710389396e-06, "loss": 0.75847375, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.25170898, "step": 3060, "time_per_iteration": 3.0302295684814453 }, { "auxiliary_loss_clip": 0.01592993, "auxiliary_loss_mlp": 0.01051649, "balance_loss_clip": 1.36662543, "balance_loss_mlp": 1.02480352, "epoch": 0.18403727641665413, "flos": 24623718491520.0, "grad_norm": 1.5342711444995145, "language_loss": 0.73482126, "learning_rate": 3.7562184017467323e-06, "loss": 0.76126766, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26843262, "step": 3061, "time_per_iteration": 2.931584358215332 }, { "auxiliary_loss_clip": 0.0160961, "auxiliary_loss_mlp": 0.01051453, "balance_loss_clip": 1.38447404, "balance_loss_mlp": 1.02643096, "epoch": 0.18409739966932212, "flos": 23450146133760.0, "grad_norm": 1.6370563509800602, "language_loss": 0.81887186, "learning_rate": 3.7560320265080906e-06, "loss": 0.84548247, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.25024414, "step": 3062, "time_per_iteration": 2.819403886795044 }, { "auxiliary_loss_clip": 0.01610298, "auxiliary_loss_mlp": 0.01053328, "balance_loss_clip": 1.3805604, "balance_loss_mlp": 1.02593398, "epoch": 0.18415752292199009, "flos": 21882236929920.0, "grad_norm": 1.8567265541757927, "language_loss": 0.747298, "learning_rate": 3.7558455846805383e-06, "loss": 0.77393425, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.27441406, "step": 3063, "time_per_iteration": 2.861109495162964 }, { "auxiliary_loss_clip": 0.01589805, "auxiliary_loss_mlp": 0.01042849, "balance_loss_clip": 1.36544287, "balance_loss_mlp": 1.0183754, "epoch": 0.18421764617465805, "flos": 25421576878080.0, "grad_norm": 1.6457789439200605, "language_loss": 0.66683215, "learning_rate": 3.7556590762711463e-06, "loss": 0.69315869, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.24475098, "step": 3064, "time_per_iteration": 2.8432421684265137 }, { "auxiliary_loss_clip": 0.01597441, "auxiliary_loss_mlp": 0.01049273, "balance_loss_clip": 1.37401342, "balance_loss_mlp": 1.02174747, "epoch": 0.18427776942732602, "flos": 27209403648000.0, "grad_norm": 2.013671463011374, "language_loss": 0.69988918, "learning_rate": 3.7554725012869853e-06, "loss": 0.72635639, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.27490234, "step": 3065, "time_per_iteration": 2.8677608966827393 }, { "auxiliary_loss_clip": 0.01611328, "auxiliary_loss_mlp": 0.01049372, "balance_loss_clip": 1.38190746, "balance_loss_mlp": 1.02231133, "epoch": 0.18433789267999398, "flos": 27863138787840.0, "grad_norm": 3.2088341141597363, "language_loss": 0.73982489, "learning_rate": 3.7552858597351318e-06, "loss": 0.76643187, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 2.29492188, "router_z_loss_mlp": 0.27062988, "step": 3066, "time_per_iteration": 2.869561195373535 }, { "auxiliary_loss_clip": 0.01600077, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.37322176, "balance_loss_mlp": 1.01608515, "epoch": 0.18439801593266195, "flos": 17865571893120.0, "grad_norm": 2.1749540738599484, "language_loss": 0.83962184, "learning_rate": 3.7550991516226622e-06, "loss": 0.86605358, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2701416, "step": 3067, "time_per_iteration": 2.826500415802002 }, { "auxiliary_loss_clip": 0.0134491, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.21193385, "balance_loss_mlp": 1.00531864, "epoch": 0.18445813918532994, "flos": 56418561285120.0, "grad_norm": 0.7960857806283291, "language_loss": 0.59891534, "learning_rate": 3.754912376956657e-06, "loss": 0.62266934, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.25195312, "step": 3068, "time_per_iteration": 3.22337007522583 }, { "auxiliary_loss_clip": 0.01589571, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.36847401, "balance_loss_mlp": 1.02222323, "epoch": 0.1845182624379979, "flos": 20966343563520.0, "grad_norm": 1.6228102970853626, "language_loss": 0.7708047, "learning_rate": 3.7547255357441987e-06, "loss": 0.7972014, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27880859, "step": 3069, "time_per_iteration": 2.852297067642212 }, { "auxiliary_loss_clip": 0.01607469, "auxiliary_loss_mlp": 0.01046422, "balance_loss_clip": 1.38000727, "balance_loss_mlp": 1.01863384, "epoch": 0.18457838569066587, "flos": 20494809809280.0, "grad_norm": 1.8623436679341008, "language_loss": 0.86134636, "learning_rate": 3.7545386279923718e-06, "loss": 0.88788533, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.27770996, "step": 3070, "time_per_iteration": 2.9017724990844727 }, { "auxiliary_loss_clip": 0.01607099, "auxiliary_loss_mlp": 0.01046627, "balance_loss_clip": 1.38117766, "balance_loss_mlp": 1.01903057, "epoch": 0.18463850894333383, "flos": 25020996249600.0, "grad_norm": 1.856812338504629, "language_loss": 0.78813571, "learning_rate": 3.754351653708265e-06, "loss": 0.81467301, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.27600098, "step": 3071, "time_per_iteration": 2.872328281402588 }, { "auxiliary_loss_clip": 0.01607673, "auxiliary_loss_mlp": 0.01050075, "balance_loss_clip": 1.37988663, "balance_loss_mlp": 1.02277577, "epoch": 0.1846986321960018, "flos": 16809446332800.0, "grad_norm": 2.1287905309281197, "language_loss": 0.79425293, "learning_rate": 3.7541646128989674e-06, "loss": 0.8208304, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27282715, "step": 3072, "time_per_iteration": 2.8471953868865967 }, { "auxiliary_loss_clip": 0.01586199, "auxiliary_loss_mlp": 0.01044691, "balance_loss_clip": 1.35995579, "balance_loss_mlp": 1.01803565, "epoch": 0.18475875544866976, "flos": 20824256332800.0, "grad_norm": 1.736123124276282, "language_loss": 0.8752318, "learning_rate": 3.7539775055715715e-06, "loss": 0.90154076, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.26696777, "step": 3073, "time_per_iteration": 2.812178373336792 }, { "auxiliary_loss_clip": 0.01601434, "auxiliary_loss_mlp": 0.01050143, "balance_loss_clip": 1.37336254, "balance_loss_mlp": 1.02303445, "epoch": 0.18481887870133773, "flos": 22611630654720.0, "grad_norm": 2.130435358591496, "language_loss": 0.93073797, "learning_rate": 3.7537903317331732e-06, "loss": 0.95725381, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27124023, "step": 3074, "time_per_iteration": 2.8817856311798096 }, { "auxiliary_loss_clip": 0.0159196, "auxiliary_loss_mlp": 0.01047226, "balance_loss_clip": 1.36770582, "balance_loss_mlp": 1.01911616, "epoch": 0.18487900195400572, "flos": 29470302495360.0, "grad_norm": 1.7090676629681654, "language_loss": 0.65549338, "learning_rate": 3.75360309139087e-06, "loss": 0.68188518, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.28149414, "step": 3075, "time_per_iteration": 2.915580987930298 }, { "auxiliary_loss_clip": 0.01586594, "auxiliary_loss_mlp": 0.01049849, "balance_loss_clip": 1.36525321, "balance_loss_mlp": 1.02483916, "epoch": 0.1849391252066737, "flos": 20637847180800.0, "grad_norm": 1.8422215436623135, "language_loss": 0.73748791, "learning_rate": 3.753415784551761e-06, "loss": 0.76385224, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25012207, "step": 3076, "time_per_iteration": 2.81750226020813 }, { "auxiliary_loss_clip": 0.01608004, "auxiliary_loss_mlp": 0.01049633, "balance_loss_clip": 1.38103187, "balance_loss_mlp": 1.02351427, "epoch": 0.18499924845934165, "flos": 14436620553600.0, "grad_norm": 2.2051118359075796, "language_loss": 0.81717765, "learning_rate": 3.7532284112229507e-06, "loss": 0.84375399, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26147461, "step": 3077, "time_per_iteration": 2.808136463165283 }, { "auxiliary_loss_clip": 0.01585913, "auxiliary_loss_mlp": 0.01047312, "balance_loss_clip": 1.36635745, "balance_loss_mlp": 1.02182472, "epoch": 0.18505937171200962, "flos": 23736899548800.0, "grad_norm": 1.7644228152478623, "language_loss": 0.79103899, "learning_rate": 3.7530409714115424e-06, "loss": 0.81737125, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25500488, "step": 3078, "time_per_iteration": 2.8228471279144287 }, { "auxiliary_loss_clip": 0.01589914, "auxiliary_loss_mlp": 0.01049566, "balance_loss_clip": 1.36853755, "balance_loss_mlp": 1.02393651, "epoch": 0.18511949496467758, "flos": 25968407258880.0, "grad_norm": 2.381254428289374, "language_loss": 0.78845322, "learning_rate": 3.7528534651246453e-06, "loss": 0.81484807, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.2565918, "step": 3079, "time_per_iteration": 2.8915460109710693 }, { "auxiliary_loss_clip": 0.01579392, "auxiliary_loss_mlp": 0.01048225, "balance_loss_clip": 1.35788453, "balance_loss_mlp": 1.02193975, "epoch": 0.18517961821734555, "flos": 42428997048960.0, "grad_norm": 2.1670813518155767, "language_loss": 0.82928956, "learning_rate": 3.752665892369369e-06, "loss": 0.85556567, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26281738, "step": 3080, "time_per_iteration": 3.0204455852508545 }, { "auxiliary_loss_clip": 0.01610891, "auxiliary_loss_mlp": 0.01054851, "balance_loss_clip": 1.37876654, "balance_loss_mlp": 1.02805293, "epoch": 0.18523974147001354, "flos": 24108224774400.0, "grad_norm": 2.346149553027522, "language_loss": 0.7522769, "learning_rate": 3.7524782531528266e-06, "loss": 0.77893436, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 2.3203125, "router_z_loss_mlp": 0.26831055, "step": 3081, "time_per_iteration": 2.852064371109009 }, { "auxiliary_loss_clip": 0.01595403, "auxiliary_loss_mlp": 0.01054909, "balance_loss_clip": 1.37214148, "balance_loss_mlp": 1.02870679, "epoch": 0.1852998647226815, "flos": 27385225516800.0, "grad_norm": 2.170746243683545, "language_loss": 0.72607875, "learning_rate": 3.7522905474821334e-06, "loss": 0.75258189, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26245117, "step": 3082, "time_per_iteration": 2.949664354324341 }, { "auxiliary_loss_clip": 0.01605714, "auxiliary_loss_mlp": 0.01047599, "balance_loss_clip": 1.37885666, "balance_loss_mlp": 1.02237391, "epoch": 0.18535998797534947, "flos": 18341901596160.0, "grad_norm": 2.0191752026535954, "language_loss": 0.7183038, "learning_rate": 3.752102775364407e-06, "loss": 0.74483693, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25219727, "step": 3083, "time_per_iteration": 2.7898952960968018 }, { "auxiliary_loss_clip": 0.01576542, "auxiliary_loss_mlp": 0.010496, "balance_loss_clip": 1.35810089, "balance_loss_mlp": 1.02507854, "epoch": 0.18542011122801744, "flos": 37858307708160.0, "grad_norm": 2.8464044840684415, "language_loss": 0.70341402, "learning_rate": 3.751914936806767e-06, "loss": 0.72967541, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24523926, "step": 3084, "time_per_iteration": 4.505472898483276 }, { "auxiliary_loss_clip": 0.01589471, "auxiliary_loss_mlp": 0.01045367, "balance_loss_clip": 1.36871672, "balance_loss_mlp": 1.02032113, "epoch": 0.1854802344806854, "flos": 25195506019200.0, "grad_norm": 1.5634431893791922, "language_loss": 0.78721899, "learning_rate": 3.7517270318163377e-06, "loss": 0.8135674, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25061035, "step": 3085, "time_per_iteration": 2.9087185859680176 }, { "auxiliary_loss_clip": 0.01577388, "auxiliary_loss_mlp": 0.01047763, "balance_loss_clip": 1.35466838, "balance_loss_mlp": 1.02353978, "epoch": 0.18554035773335337, "flos": 26695267274880.0, "grad_norm": 1.7348941432389504, "language_loss": 0.74432713, "learning_rate": 3.751539060400244e-06, "loss": 0.77057862, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24230957, "step": 3086, "time_per_iteration": 2.859485149383545 }, { "auxiliary_loss_clip": 0.01588021, "auxiliary_loss_mlp": 0.01049415, "balance_loss_clip": 1.36376607, "balance_loss_mlp": 1.02428532, "epoch": 0.18560048098602133, "flos": 22357254533760.0, "grad_norm": 2.200940108815574, "language_loss": 0.7042948, "learning_rate": 3.7513510225656132e-06, "loss": 0.73066914, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.25085449, "step": 3087, "time_per_iteration": 2.8246090412139893 }, { "auxiliary_loss_clip": 0.01595528, "auxiliary_loss_mlp": 0.0105542, "balance_loss_clip": 1.37228227, "balance_loss_mlp": 1.02883673, "epoch": 0.18566060423868933, "flos": 17757400279680.0, "grad_norm": 2.0853841219800264, "language_loss": 0.73787624, "learning_rate": 3.7511629183195764e-06, "loss": 0.76438582, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26574707, "step": 3088, "time_per_iteration": 2.8525400161743164 }, { "auxiliary_loss_clip": 0.0157282, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.35476875, "balance_loss_mlp": 1.02140141, "epoch": 0.1857207274913573, "flos": 24687025246080.0, "grad_norm": 1.7420464362943386, "language_loss": 0.92892414, "learning_rate": 3.7509747476692663e-06, "loss": 0.9551062, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.23986816, "step": 3089, "time_per_iteration": 2.8494317531585693 }, { "auxiliary_loss_clip": 0.01586337, "auxiliary_loss_mlp": 0.01047262, "balance_loss_clip": 1.36514401, "balance_loss_mlp": 1.02294302, "epoch": 0.18578085074402526, "flos": 28159619834880.0, "grad_norm": 4.026178448699983, "language_loss": 0.58596957, "learning_rate": 3.7507865106218176e-06, "loss": 0.61230558, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.2434082, "step": 3090, "time_per_iteration": 4.317478179931641 }, { "auxiliary_loss_clip": 0.01574362, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.35477138, "balance_loss_mlp": 1.01903987, "epoch": 0.18584097399669322, "flos": 23962563204480.0, "grad_norm": 1.9706383359133086, "language_loss": 0.82662368, "learning_rate": 3.7505982071843695e-06, "loss": 0.85280418, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24682617, "step": 3091, "time_per_iteration": 2.8373935222625732 }, { "auxiliary_loss_clip": 0.01596492, "auxiliary_loss_mlp": 0.01048154, "balance_loss_clip": 1.37011766, "balance_loss_mlp": 1.02236867, "epoch": 0.18590109724936119, "flos": 17210524654080.0, "grad_norm": 6.603371660398848, "language_loss": 0.8560307, "learning_rate": 3.7504098373640617e-06, "loss": 0.88247722, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.25805664, "step": 3092, "time_per_iteration": 2.8349838256835938 }, { "auxiliary_loss_clip": 0.01596332, "auxiliary_loss_mlp": 0.01049586, "balance_loss_clip": 1.36758494, "balance_loss_mlp": 1.02397943, "epoch": 0.18596122050202915, "flos": 17242585234560.0, "grad_norm": 2.0359302782008353, "language_loss": 0.94295681, "learning_rate": 3.750221401168038e-06, "loss": 0.9694159, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 2.28710938, "router_z_loss_mlp": 0.25622559, "step": 3093, "time_per_iteration": 5.73705792427063 }, { "auxiliary_loss_clip": 0.01587062, "auxiliary_loss_mlp": 0.01050137, "balance_loss_clip": 1.3635478, "balance_loss_mlp": 1.02428055, "epoch": 0.18602134375469712, "flos": 19028692702080.0, "grad_norm": 1.7760503156419998, "language_loss": 0.78137785, "learning_rate": 3.750032898603443e-06, "loss": 0.80774987, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.25830078, "step": 3094, "time_per_iteration": 2.8374452590942383 }, { "auxiliary_loss_clip": 0.01576762, "auxiliary_loss_mlp": 0.01048331, "balance_loss_clip": 1.35852695, "balance_loss_mlp": 1.02299929, "epoch": 0.1860814670073651, "flos": 50967459780480.0, "grad_norm": 1.5040545335937183, "language_loss": 0.7075128, "learning_rate": 3.749844329677425e-06, "loss": 0.73376375, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.25354004, "step": 3095, "time_per_iteration": 3.1230666637420654 }, { "auxiliary_loss_clip": 0.01605037, "auxiliary_loss_mlp": 0.01050539, "balance_loss_clip": 1.37517095, "balance_loss_mlp": 1.02395535, "epoch": 0.18614159026003307, "flos": 19400425130880.0, "grad_norm": 1.9666147567122005, "language_loss": 0.8160435, "learning_rate": 3.749655694397135e-06, "loss": 0.84259927, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.26586914, "step": 3096, "time_per_iteration": 2.849066734313965 }, { "auxiliary_loss_clip": 0.01593557, "auxiliary_loss_mlp": 0.01053632, "balance_loss_clip": 1.36806417, "balance_loss_mlp": 1.02838373, "epoch": 0.18620171351270104, "flos": 21808795340160.0, "grad_norm": 1.8990177457502415, "language_loss": 0.76614517, "learning_rate": 3.7494669927697255e-06, "loss": 0.79261708, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.25231934, "step": 3097, "time_per_iteration": 2.8417656421661377 }, { "auxiliary_loss_clip": 0.01580902, "auxiliary_loss_mlp": 0.01043567, "balance_loss_clip": 1.36206722, "balance_loss_mlp": 1.01881897, "epoch": 0.186261836765369, "flos": 16371601971840.0, "grad_norm": 2.5236827785577796, "language_loss": 0.67612875, "learning_rate": 3.749278224802352e-06, "loss": 0.70237345, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24755859, "step": 3098, "time_per_iteration": 2.8494958877563477 }, { "auxiliary_loss_clip": 0.01597372, "auxiliary_loss_mlp": 0.01053154, "balance_loss_clip": 1.36911786, "balance_loss_mlp": 1.02535391, "epoch": 0.18632196001803697, "flos": 23381093289600.0, "grad_norm": 1.6121854396080253, "language_loss": 0.70674115, "learning_rate": 3.7490893905021733e-06, "loss": 0.73324645, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.27807617, "step": 3099, "time_per_iteration": 2.896617889404297 }, { "auxiliary_loss_clip": 0.01588226, "auxiliary_loss_mlp": 0.0104522, "balance_loss_clip": 1.36664987, "balance_loss_mlp": 1.01929212, "epoch": 0.18638208327070493, "flos": 22502192186880.0, "grad_norm": 1.400517788288609, "language_loss": 0.72710669, "learning_rate": 3.7489004898763494e-06, "loss": 0.75344121, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.25927734, "step": 3100, "time_per_iteration": 2.82242488861084 }, { "auxiliary_loss_clip": 0.01585829, "auxiliary_loss_mlp": 0.01054425, "balance_loss_clip": 1.36110568, "balance_loss_mlp": 1.02787685, "epoch": 0.18644220652337293, "flos": 29176219422720.0, "grad_norm": 1.64665895939195, "language_loss": 0.80493307, "learning_rate": 3.7487115229320444e-06, "loss": 0.83133566, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26574707, "step": 3101, "time_per_iteration": 2.8969950675964355 }, { "auxiliary_loss_clip": 0.01570735, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.35500503, "balance_loss_mlp": 1.01510143, "epoch": 0.1865023297760409, "flos": 24254565016320.0, "grad_norm": 1.678994472610055, "language_loss": 0.77573967, "learning_rate": 3.7485224896764222e-06, "loss": 0.80184209, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24414062, "step": 3102, "time_per_iteration": 2.8357949256896973 }, { "auxiliary_loss_clip": 0.01583343, "auxiliary_loss_mlp": 0.01048053, "balance_loss_clip": 1.35898161, "balance_loss_mlp": 1.02325714, "epoch": 0.18656245302870886, "flos": 19136321377920.0, "grad_norm": 2.0220148303767194, "language_loss": 0.77671146, "learning_rate": 3.7483333901166525e-06, "loss": 0.80302548, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24816895, "step": 3103, "time_per_iteration": 2.8197343349456787 }, { "auxiliary_loss_clip": 0.01587597, "auxiliary_loss_mlp": 0.01048269, "balance_loss_clip": 1.36489046, "balance_loss_mlp": 1.02284133, "epoch": 0.18662257628137682, "flos": 17795252194560.0, "grad_norm": 1.5422258845471915, "language_loss": 0.80031168, "learning_rate": 3.7481442242599054e-06, "loss": 0.82667035, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.25463867, "step": 3104, "time_per_iteration": 2.8937339782714844 }, { "auxiliary_loss_clip": 0.01587057, "auxiliary_loss_mlp": 0.01046773, "balance_loss_clip": 1.36821222, "balance_loss_mlp": 1.02293134, "epoch": 0.1866826995340448, "flos": 24034964163840.0, "grad_norm": 1.9829630349458243, "language_loss": 0.86477798, "learning_rate": 3.747954992113354e-06, "loss": 0.89111626, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.23840332, "step": 3105, "time_per_iteration": 2.860485792160034 }, { "auxiliary_loss_clip": 0.01599053, "auxiliary_loss_mlp": 0.01050577, "balance_loss_clip": 1.37010515, "balance_loss_mlp": 1.02442288, "epoch": 0.18674282278671275, "flos": 26152554170880.0, "grad_norm": 2.2503152319466393, "language_loss": 0.88044596, "learning_rate": 3.7477656936841742e-06, "loss": 0.90694225, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 2.29296875, "router_z_loss_mlp": 0.26171875, "step": 3106, "time_per_iteration": 2.9390101432800293 }, { "auxiliary_loss_clip": 0.01611999, "auxiliary_loss_mlp": 0.01050351, "balance_loss_clip": 1.38261056, "balance_loss_mlp": 1.02333808, "epoch": 0.18680294603938072, "flos": 19209627233280.0, "grad_norm": 1.7627480877800392, "language_loss": 0.79199135, "learning_rate": 3.7475763289795445e-06, "loss": 0.81861484, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 2.296875, "router_z_loss_mlp": 0.2701416, "step": 3107, "time_per_iteration": 2.808230400085449 }, { "auxiliary_loss_clip": 0.01583091, "auxiliary_loss_mlp": 0.01049799, "balance_loss_clip": 1.35752559, "balance_loss_mlp": 1.0237515, "epoch": 0.1868630692920487, "flos": 28555675983360.0, "grad_norm": 2.0043511386521944, "language_loss": 0.75405759, "learning_rate": 3.7473868980066446e-06, "loss": 0.78038651, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.26049805, "step": 3108, "time_per_iteration": 2.871408700942993 }, { "auxiliary_loss_clip": 0.015852, "auxiliary_loss_mlp": 0.01046015, "balance_loss_clip": 1.36164904, "balance_loss_mlp": 1.01958632, "epoch": 0.18692319254471668, "flos": 17246838245760.0, "grad_norm": 1.8943349607537183, "language_loss": 0.75499725, "learning_rate": 3.747197400772658e-06, "loss": 0.78130937, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26428223, "step": 3109, "time_per_iteration": 2.7994892597198486 }, { "auxiliary_loss_clip": 0.01585231, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.36222601, "balance_loss_mlp": 1.01905417, "epoch": 0.18698331579738464, "flos": 23195543788800.0, "grad_norm": 1.4826598194296665, "language_loss": 0.85746121, "learning_rate": 3.747007837284772e-06, "loss": 0.88375998, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25610352, "step": 3110, "time_per_iteration": 2.8691396713256836 }, { "auxiliary_loss_clip": 0.01597003, "auxiliary_loss_mlp": 0.01046001, "balance_loss_clip": 1.3732183, "balance_loss_mlp": 1.01950121, "epoch": 0.1870434390500526, "flos": 25526762334720.0, "grad_norm": 1.4901486294504438, "language_loss": 0.85208362, "learning_rate": 3.7468182075501737e-06, "loss": 0.87851369, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26489258, "step": 3111, "time_per_iteration": 2.9198365211486816 }, { "auxiliary_loss_clip": 0.01590754, "auxiliary_loss_mlp": 0.01043847, "balance_loss_clip": 1.36846089, "balance_loss_mlp": 1.01894462, "epoch": 0.18710356230272057, "flos": 19510406536320.0, "grad_norm": 1.826205982519557, "language_loss": 0.77759337, "learning_rate": 3.7466285115760536e-06, "loss": 0.80393934, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.24914551, "step": 3112, "time_per_iteration": 2.850834608078003 }, { "auxiliary_loss_clip": 0.0158572, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.36319566, "balance_loss_mlp": 1.02132416, "epoch": 0.18716368555538854, "flos": 26772147469440.0, "grad_norm": 2.4476659706045685, "language_loss": 0.65902406, "learning_rate": 3.7464387493696046e-06, "loss": 0.6853388, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.2442627, "step": 3113, "time_per_iteration": 2.931129217147827 }, { "auxiliary_loss_clip": 0.01593778, "auxiliary_loss_mlp": 0.01049287, "balance_loss_clip": 1.36923981, "balance_loss_mlp": 1.02267981, "epoch": 0.1872238088080565, "flos": 25200166233600.0, "grad_norm": 2.2963519187591355, "language_loss": 0.83063322, "learning_rate": 3.746248920938024e-06, "loss": 0.85706389, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26635742, "step": 3114, "time_per_iteration": 2.8817389011383057 }, { "auxiliary_loss_clip": 0.01591971, "auxiliary_loss_mlp": 0.01054448, "balance_loss_clip": 1.36721897, "balance_loss_mlp": 1.02776921, "epoch": 0.1872839320607245, "flos": 24145036058880.0, "grad_norm": 2.1944214557027775, "language_loss": 0.5835095, "learning_rate": 3.74605902628851e-06, "loss": 0.60997367, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2668457, "step": 3115, "time_per_iteration": 2.9038774967193604 }, { "auxiliary_loss_clip": 0.01598096, "auxiliary_loss_mlp": 0.01050945, "balance_loss_clip": 1.37683654, "balance_loss_mlp": 1.02523112, "epoch": 0.18734405531339246, "flos": 21182415321600.0, "grad_norm": 1.590893337288736, "language_loss": 0.72453189, "learning_rate": 3.745869065428261e-06, "loss": 0.75102222, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25732422, "step": 3116, "time_per_iteration": 2.837353229522705 }, { "auxiliary_loss_clip": 0.01571462, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.35603535, "balance_loss_mlp": 1.01668644, "epoch": 0.18740417856606043, "flos": 17246431042560.0, "grad_norm": 3.7191052120659966, "language_loss": 0.80037874, "learning_rate": 3.7456790383644833e-06, "loss": 0.82649457, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.23449707, "step": 3117, "time_per_iteration": 2.9050958156585693 }, { "auxiliary_loss_clip": 0.01569233, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.35454333, "balance_loss_mlp": 1.01805079, "epoch": 0.1874643018187284, "flos": 32569807311360.0, "grad_norm": 1.547163060619897, "language_loss": 0.84943068, "learning_rate": 3.745488945104381e-06, "loss": 0.8755604, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25708008, "step": 3118, "time_per_iteration": 2.978087902069092 }, { "auxiliary_loss_clip": 0.01590921, "auxiliary_loss_mlp": 0.01049839, "balance_loss_clip": 1.36653662, "balance_loss_mlp": 1.02572274, "epoch": 0.18752442507139636, "flos": 23268532930560.0, "grad_norm": 2.840494128987577, "language_loss": 0.77771026, "learning_rate": 3.7452987856551636e-06, "loss": 0.80411792, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.24133301, "step": 3119, "time_per_iteration": 4.25692081451416 }, { "auxiliary_loss_clip": 0.01581536, "auxiliary_loss_mlp": 0.01046367, "balance_loss_clip": 1.36002707, "balance_loss_mlp": 1.02160728, "epoch": 0.18758454832406432, "flos": 21770581466880.0, "grad_norm": 1.7316178218629545, "language_loss": 0.830284, "learning_rate": 3.7451085600240406e-06, "loss": 0.85656303, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.24780273, "step": 3120, "time_per_iteration": 2.9192965030670166 }, { "auxiliary_loss_clip": 0.01580911, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.36139274, "balance_loss_mlp": 1.01882529, "epoch": 0.1876446715767323, "flos": 29582229427200.0, "grad_norm": 1.8069756972226652, "language_loss": 0.85813129, "learning_rate": 3.7449182682182263e-06, "loss": 0.88437104, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.24243164, "step": 3121, "time_per_iteration": 2.9062294960021973 }, { "auxiliary_loss_clip": 0.0159161, "auxiliary_loss_mlp": 0.01039883, "balance_loss_clip": 1.37297118, "balance_loss_mlp": 1.01574278, "epoch": 0.18770479482940028, "flos": 30353139895680.0, "grad_norm": 1.6043620037311233, "language_loss": 0.71786362, "learning_rate": 3.744727910244937e-06, "loss": 0.74417853, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24169922, "step": 3122, "time_per_iteration": 2.929694652557373 }, { "auxiliary_loss_clip": 0.01588379, "auxiliary_loss_mlp": 0.01047755, "balance_loss_clip": 1.36917233, "balance_loss_mlp": 1.02278054, "epoch": 0.18776491808206824, "flos": 14473522327680.0, "grad_norm": 3.3403560020744787, "language_loss": 0.72174466, "learning_rate": 3.7445374861113905e-06, "loss": 0.74810606, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.25, "step": 3123, "time_per_iteration": 2.8495266437530518 }, { "auxiliary_loss_clip": 0.01576781, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.3606987, "balance_loss_mlp": 1.01804841, "epoch": 0.1878250413347362, "flos": 24509212606080.0, "grad_norm": 1.8164713711841194, "language_loss": 0.75616777, "learning_rate": 3.7443469958248066e-06, "loss": 0.78235763, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.24145508, "step": 3124, "time_per_iteration": 2.869356155395508 }, { "auxiliary_loss_clip": 0.01584355, "auxiliary_loss_mlp": 0.01048333, "balance_loss_clip": 1.36234462, "balance_loss_mlp": 1.02159429, "epoch": 0.18788516458740417, "flos": 39800845008000.0, "grad_norm": 2.015170282472192, "language_loss": 0.81808943, "learning_rate": 3.7441564393924106e-06, "loss": 0.84441632, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26745605, "step": 3125, "time_per_iteration": 4.401974678039551 }, { "auxiliary_loss_clip": 0.01359135, "auxiliary_loss_mlp": 0.01033646, "balance_loss_clip": 1.22696829, "balance_loss_mlp": 1.01380944, "epoch": 0.18794528784007214, "flos": 64728102735360.0, "grad_norm": 0.9735989894225896, "language_loss": 0.63697881, "learning_rate": 3.7439658168214273e-06, "loss": 0.66090661, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 1.3203125, "router_z_loss_mlp": 0.19824219, "step": 3126, "time_per_iteration": 3.4352986812591553 }, { "auxiliary_loss_clip": 0.01581972, "auxiliary_loss_mlp": 0.01048275, "balance_loss_clip": 1.36365223, "balance_loss_mlp": 1.02289557, "epoch": 0.1880054110927401, "flos": 28633958766720.0, "grad_norm": 1.651650625777953, "language_loss": 0.82102275, "learning_rate": 3.7437751281190857e-06, "loss": 0.84732521, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25390625, "step": 3127, "time_per_iteration": 4.402543783187866 }, { "auxiliary_loss_clip": 0.01350581, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.217453, "balance_loss_mlp": 1.00549686, "epoch": 0.1880655343454081, "flos": 64519994062080.0, "grad_norm": 0.7685551272780244, "language_loss": 0.6197114, "learning_rate": 3.7435843732926164e-06, "loss": 0.64355254, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.28125, "step": 3128, "time_per_iteration": 4.892225503921509 }, { "auxiliary_loss_clip": 0.01589486, "auxiliary_loss_mlp": 0.01047048, "balance_loss_clip": 1.36585546, "balance_loss_mlp": 1.02097726, "epoch": 0.18812565759807606, "flos": 32137211347200.0, "grad_norm": 2.1490249533292336, "language_loss": 0.72626084, "learning_rate": 3.7433935523492536e-06, "loss": 0.75262618, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26098633, "step": 3129, "time_per_iteration": 3.0445449352264404 }, { "auxiliary_loss_clip": 0.01585369, "auxiliary_loss_mlp": 0.01046369, "balance_loss_clip": 1.36416221, "balance_loss_mlp": 1.02117991, "epoch": 0.18818578085074403, "flos": 20632508294400.0, "grad_norm": 1.9181718491171293, "language_loss": 0.85652661, "learning_rate": 3.7432026652962314e-06, "loss": 0.88284397, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.25195312, "step": 3130, "time_per_iteration": 2.950244903564453 }, { "auxiliary_loss_clip": 0.01570039, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.3491683, "balance_loss_mlp": 1.01974893, "epoch": 0.188245904103412, "flos": 28852338009600.0, "grad_norm": 1.853391730323631, "language_loss": 0.77780187, "learning_rate": 3.7430117121407897e-06, "loss": 0.80394685, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24731445, "step": 3131, "time_per_iteration": 2.9085958003997803 }, { "auxiliary_loss_clip": 0.01564238, "auxiliary_loss_mlp": 0.01047548, "balance_loss_clip": 1.34777474, "balance_loss_mlp": 1.02071357, "epoch": 0.18830602735607996, "flos": 29431274215680.0, "grad_norm": 1.7790181437957948, "language_loss": 0.82812393, "learning_rate": 3.74282069289017e-06, "loss": 0.85424167, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.26794434, "step": 3132, "time_per_iteration": 2.923907995223999 }, { "auxiliary_loss_clip": 0.01602914, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.376122, "balance_loss_mlp": 1.02125299, "epoch": 0.18836615060874792, "flos": 28883719918080.0, "grad_norm": 2.0463172809136254, "language_loss": 0.8052392, "learning_rate": 3.742629607551614e-06, "loss": 0.83173716, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.25646973, "step": 3133, "time_per_iteration": 2.923058032989502 }, { "auxiliary_loss_clip": 0.01568736, "auxiliary_loss_mlp": 0.01050731, "balance_loss_clip": 1.34821379, "balance_loss_mlp": 1.02437353, "epoch": 0.18842627386141592, "flos": 22611947368320.0, "grad_norm": 1.9597009988494538, "language_loss": 0.83933455, "learning_rate": 3.7424384561323698e-06, "loss": 0.86552918, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26318359, "step": 3134, "time_per_iteration": 2.870147466659546 }, { "auxiliary_loss_clip": 0.01575129, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.35624349, "balance_loss_mlp": 1.02464628, "epoch": 0.18848639711408388, "flos": 24584825946240.0, "grad_norm": 1.4995475234490754, "language_loss": 0.83689392, "learning_rate": 3.742247238639684e-06, "loss": 0.86314881, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25744629, "step": 3135, "time_per_iteration": 3.0020697116851807 }, { "auxiliary_loss_clip": 0.01572072, "auxiliary_loss_mlp": 0.01046891, "balance_loss_clip": 1.35194159, "balance_loss_mlp": 1.02126086, "epoch": 0.18854652036675185, "flos": 34180002420480.0, "grad_norm": 1.8433972534520813, "language_loss": 0.79523027, "learning_rate": 3.7420559550808083e-06, "loss": 0.82141984, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.25598145, "step": 3136, "time_per_iteration": 2.9804697036743164 }, { "auxiliary_loss_clip": 0.01583548, "auxiliary_loss_mlp": 0.01051345, "balance_loss_clip": 1.36327267, "balance_loss_mlp": 1.02534521, "epoch": 0.1886066436194198, "flos": 24209157219840.0, "grad_norm": 1.9329436979961732, "language_loss": 0.82266217, "learning_rate": 3.741864605462996e-06, "loss": 0.84901112, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.26000977, "step": 3137, "time_per_iteration": 2.935488224029541 }, { "auxiliary_loss_clip": 0.01586322, "auxiliary_loss_mlp": 0.01046991, "balance_loss_clip": 1.36569595, "balance_loss_mlp": 1.02052689, "epoch": 0.18866676687208778, "flos": 21260879084160.0, "grad_norm": 1.6190350428345992, "language_loss": 0.81924915, "learning_rate": 3.741673189793504e-06, "loss": 0.84558231, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26501465, "step": 3138, "time_per_iteration": 2.8867318630218506 }, { "auxiliary_loss_clip": 0.0159655, "auxiliary_loss_mlp": 0.01058622, "balance_loss_clip": 1.37180257, "balance_loss_mlp": 1.03146625, "epoch": 0.18872689012475574, "flos": 37323602933760.0, "grad_norm": 1.6560756618210744, "language_loss": 0.64399552, "learning_rate": 3.7414817080795896e-06, "loss": 0.67054725, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2713623, "step": 3139, "time_per_iteration": 2.998769760131836 }, { "auxiliary_loss_clip": 0.01572381, "auxiliary_loss_mlp": 0.01047252, "balance_loss_clip": 1.35099769, "balance_loss_mlp": 1.02051353, "epoch": 0.1887870133774237, "flos": 21662138384640.0, "grad_norm": 3.024390013021514, "language_loss": 0.72151548, "learning_rate": 3.741290160328514e-06, "loss": 0.74771184, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 2.21191406, "router_z_loss_mlp": 0.26745605, "step": 3140, "time_per_iteration": 2.889688491821289 }, { "auxiliary_loss_clip": 0.01572464, "auxiliary_loss_mlp": 0.01051052, "balance_loss_clip": 1.34947443, "balance_loss_mlp": 1.02344322, "epoch": 0.1888471366300917, "flos": 15933169428480.0, "grad_norm": 3.4574888421783543, "language_loss": 0.88838363, "learning_rate": 3.7410985465475412e-06, "loss": 0.91461879, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27612305, "step": 3141, "time_per_iteration": 2.806288719177246 }, { "auxiliary_loss_clip": 0.01590465, "auxiliary_loss_mlp": 0.01050153, "balance_loss_clip": 1.3641001, "balance_loss_mlp": 1.02395034, "epoch": 0.18890725988275966, "flos": 18561230979840.0, "grad_norm": 1.9605831349115537, "language_loss": 0.78162777, "learning_rate": 3.7409068667439378e-06, "loss": 0.80803394, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26208496, "step": 3142, "time_per_iteration": 2.8683576583862305 }, { "auxiliary_loss_clip": 0.01573383, "auxiliary_loss_mlp": 0.01045915, "balance_loss_clip": 1.35593534, "balance_loss_mlp": 1.01989186, "epoch": 0.18896738313542763, "flos": 28852790457600.0, "grad_norm": 1.625834258340113, "language_loss": 0.79664505, "learning_rate": 3.740715120924971e-06, "loss": 0.82283807, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.26025391, "step": 3143, "time_per_iteration": 2.8949053287506104 }, { "auxiliary_loss_clip": 0.01574516, "auxiliary_loss_mlp": 0.01049702, "balance_loss_clip": 1.3529563, "balance_loss_mlp": 1.02239108, "epoch": 0.1890275063880956, "flos": 22421285205120.0, "grad_norm": 2.28951866884211, "language_loss": 0.72816223, "learning_rate": 3.740523309097912e-06, "loss": 0.75440443, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.27331543, "step": 3144, "time_per_iteration": 2.8312060832977295 }, { "auxiliary_loss_clip": 0.01579794, "auxiliary_loss_mlp": 0.01054268, "balance_loss_clip": 1.35553062, "balance_loss_mlp": 1.02830446, "epoch": 0.18908762964076356, "flos": 24254700750720.0, "grad_norm": 2.2119200425956254, "language_loss": 0.7517854, "learning_rate": 3.7403314312700356e-06, "loss": 0.77812606, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.25976562, "step": 3145, "time_per_iteration": 2.903623342514038 }, { "auxiliary_loss_clip": 0.01560943, "auxiliary_loss_mlp": 0.01045631, "balance_loss_clip": 1.34371567, "balance_loss_mlp": 1.02084708, "epoch": 0.18914775289343153, "flos": 16991738208000.0, "grad_norm": 2.7309401302308483, "language_loss": 0.77655154, "learning_rate": 3.740139487448616e-06, "loss": 0.80261731, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24804688, "step": 3146, "time_per_iteration": 2.843816041946411 }, { "auxiliary_loss_clip": 0.0158161, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.35920358, "balance_loss_mlp": 1.01435471, "epoch": 0.1892078761460995, "flos": 21553831036800.0, "grad_norm": 1.7162968386169448, "language_loss": 0.79957664, "learning_rate": 3.7399474776409326e-06, "loss": 0.82579428, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.25793457, "step": 3147, "time_per_iteration": 2.886833667755127 }, { "auxiliary_loss_clip": 0.01571583, "auxiliary_loss_mlp": 0.01046008, "balance_loss_clip": 1.35192704, "balance_loss_mlp": 1.02018762, "epoch": 0.18926799939876748, "flos": 23011532611200.0, "grad_norm": 2.199921705597021, "language_loss": 0.6775803, "learning_rate": 3.739755401854267e-06, "loss": 0.70375621, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25830078, "step": 3148, "time_per_iteration": 2.8633103370666504 }, { "auxiliary_loss_clip": 0.01575107, "auxiliary_loss_mlp": 0.01046339, "balance_loss_clip": 1.35085726, "balance_loss_mlp": 1.0196718, "epoch": 0.18932812265143545, "flos": 22283134272000.0, "grad_norm": 2.900369567083367, "language_loss": 0.77392524, "learning_rate": 3.739563260095902e-06, "loss": 0.80013967, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.26696777, "step": 3149, "time_per_iteration": 2.8908467292785645 }, { "auxiliary_loss_clip": 0.0155615, "auxiliary_loss_mlp": 0.01052308, "balance_loss_clip": 1.34022152, "balance_loss_mlp": 1.02607048, "epoch": 0.1893882459041034, "flos": 18633586694400.0, "grad_norm": 2.048909697756744, "language_loss": 0.82001507, "learning_rate": 3.7393710523731245e-06, "loss": 0.84609967, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26245117, "step": 3150, "time_per_iteration": 2.8559305667877197 }, { "auxiliary_loss_clip": 0.01585718, "auxiliary_loss_mlp": 0.01050019, "balance_loss_clip": 1.36180663, "balance_loss_mlp": 1.02283883, "epoch": 0.18944836915677138, "flos": 22903180018560.0, "grad_norm": 2.198466313895822, "language_loss": 0.86300385, "learning_rate": 3.7391787786932215e-06, "loss": 0.88936126, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 2.24023438, "router_z_loss_mlp": 0.27148438, "step": 3151, "time_per_iteration": 2.900867462158203 }, { "auxiliary_loss_clip": 0.01567729, "auxiliary_loss_mlp": 0.01050215, "balance_loss_clip": 1.34872818, "balance_loss_mlp": 1.02293992, "epoch": 0.18950849240943934, "flos": 26807646654720.0, "grad_norm": 1.657777748678977, "language_loss": 0.75840318, "learning_rate": 3.7389864390634857e-06, "loss": 0.78458261, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27270508, "step": 3152, "time_per_iteration": 2.9809913635253906 }, { "auxiliary_loss_clip": 0.01572855, "auxiliary_loss_mlp": 0.01057012, "balance_loss_clip": 1.35191989, "balance_loss_mlp": 1.02996349, "epoch": 0.1895686156621073, "flos": 24981606011520.0, "grad_norm": 1.8178600894816515, "language_loss": 0.76181173, "learning_rate": 3.738794033491209e-06, "loss": 0.78811038, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27050781, "step": 3153, "time_per_iteration": 2.898897886276245 }, { "auxiliary_loss_clip": 0.01575594, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.35347247, "balance_loss_mlp": 1.02941132, "epoch": 0.1896287389147753, "flos": 21954547399680.0, "grad_norm": 3.4844041477462473, "language_loss": 0.80967152, "learning_rate": 3.7386015619836887e-06, "loss": 0.83597612, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.25463867, "step": 3154, "time_per_iteration": 4.2570414543151855 }, { "auxiliary_loss_clip": 0.0159768, "auxiliary_loss_mlp": 0.01053287, "balance_loss_clip": 1.36994326, "balance_loss_mlp": 1.02614307, "epoch": 0.18968886216744327, "flos": 18186150435840.0, "grad_norm": 2.2987345238783825, "language_loss": 0.73777759, "learning_rate": 3.738409024548223e-06, "loss": 0.76428723, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 2.27734375, "router_z_loss_mlp": 0.2713623, "step": 3155, "time_per_iteration": 2.8232548236846924 }, { "auxiliary_loss_clip": 0.01562849, "auxiliary_loss_mlp": 0.01053507, "balance_loss_clip": 1.34331608, "balance_loss_mlp": 1.02717364, "epoch": 0.18974898542011123, "flos": 20422182625920.0, "grad_norm": 1.8368039108003942, "language_loss": 0.74944377, "learning_rate": 3.7382164211921136e-06, "loss": 0.77560735, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26342773, "step": 3156, "time_per_iteration": 2.885035753250122 }, { "auxiliary_loss_clip": 0.01586772, "auxiliary_loss_mlp": 0.01050511, "balance_loss_clip": 1.36095393, "balance_loss_mlp": 1.02477384, "epoch": 0.1898091086727792, "flos": 23994850008960.0, "grad_norm": 2.046962983755719, "language_loss": 0.69333756, "learning_rate": 3.7380237519226623e-06, "loss": 0.71971041, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.25756836, "step": 3157, "time_per_iteration": 2.909311056137085 }, { "auxiliary_loss_clip": 0.01583055, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.36011314, "balance_loss_mlp": 1.02121854, "epoch": 0.18986923192544716, "flos": 27648741087360.0, "grad_norm": 1.6996184345048786, "language_loss": 0.81410092, "learning_rate": 3.737831016747176e-06, "loss": 0.84040648, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26306152, "step": 3158, "time_per_iteration": 2.900313377380371 }, { "auxiliary_loss_clip": 0.01609863, "auxiliary_loss_mlp": 0.01050496, "balance_loss_clip": 1.37944078, "balance_loss_mlp": 1.02221954, "epoch": 0.18992935517811513, "flos": 25495199447040.0, "grad_norm": 1.6809999893446657, "language_loss": 0.72908485, "learning_rate": 3.737638215672964e-06, "loss": 0.75568849, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 2.30664062, "router_z_loss_mlp": 0.28271484, "step": 3159, "time_per_iteration": 2.8512120246887207 }, { "auxiliary_loss_clip": 0.01584003, "auxiliary_loss_mlp": 0.01047339, "balance_loss_clip": 1.36071897, "balance_loss_mlp": 1.02058816, "epoch": 0.1899894784307831, "flos": 17429763548160.0, "grad_norm": 2.1022971506225487, "language_loss": 0.85579586, "learning_rate": 3.7374453487073366e-06, "loss": 0.88210928, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26757812, "step": 3160, "time_per_iteration": 4.165325403213501 }, { "auxiliary_loss_clip": 0.01571868, "auxiliary_loss_mlp": 0.01051803, "balance_loss_clip": 1.35514867, "balance_loss_mlp": 1.02582741, "epoch": 0.19004960168345109, "flos": 27504165392640.0, "grad_norm": 1.6844320090357818, "language_loss": 0.74285394, "learning_rate": 3.7372524158576074e-06, "loss": 0.76909065, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.2598877, "step": 3161, "time_per_iteration": 2.8833272457122803 }, { "auxiliary_loss_clip": 0.01575602, "auxiliary_loss_mlp": 0.01042444, "balance_loss_clip": 1.35613811, "balance_loss_mlp": 1.0168494, "epoch": 0.19010972493611905, "flos": 38668925128320.0, "grad_norm": 2.071936983361805, "language_loss": 0.81926751, "learning_rate": 3.7370594171310926e-06, "loss": 0.84544796, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.25598145, "step": 3162, "time_per_iteration": 3.012822151184082 }, { "auxiliary_loss_clip": 0.01585004, "auxiliary_loss_mlp": 0.01042653, "balance_loss_clip": 1.36145067, "balance_loss_mlp": 1.01525879, "epoch": 0.19016984818878702, "flos": 19254084888960.0, "grad_norm": 3.3685948786342044, "language_loss": 0.76738483, "learning_rate": 3.73686635253511e-06, "loss": 0.79366142, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27416992, "step": 3163, "time_per_iteration": 5.8205296993255615 }, { "auxiliary_loss_clip": 0.01586474, "auxiliary_loss_mlp": 0.01051437, "balance_loss_clip": 1.36879122, "balance_loss_mlp": 1.02481723, "epoch": 0.19022997144145498, "flos": 37610944531200.0, "grad_norm": 1.7011990888707766, "language_loss": 0.75612962, "learning_rate": 3.736673222076982e-06, "loss": 0.78250873, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26611328, "step": 3164, "time_per_iteration": 3.0000579357147217 }, { "auxiliary_loss_clip": 0.01594567, "auxiliary_loss_mlp": 0.0104712, "balance_loss_clip": 1.37244916, "balance_loss_mlp": 1.01960695, "epoch": 0.19029009469412295, "flos": 61551021070080.0, "grad_norm": 1.4890671270348457, "language_loss": 0.67479306, "learning_rate": 3.7364800257640313e-06, "loss": 0.7012099, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 2.22363281, "router_z_loss_mlp": 0.27514648, "step": 3165, "time_per_iteration": 3.2436959743499756 }, { "auxiliary_loss_clip": 0.01587906, "auxiliary_loss_mlp": 0.01046314, "balance_loss_clip": 1.36904025, "balance_loss_mlp": 1.01727414, "epoch": 0.1903502179467909, "flos": 13962960293760.0, "grad_norm": 2.08591175565006, "language_loss": 0.75765777, "learning_rate": 3.7362867636035835e-06, "loss": 0.78399992, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.29040527, "step": 3166, "time_per_iteration": 2.8089373111724854 }, { "auxiliary_loss_clip": 0.01358795, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.22600949, "balance_loss_mlp": 1.00079811, "epoch": 0.1904103411994589, "flos": 66931983855360.0, "grad_norm": 0.8028460833196479, "language_loss": 0.505198, "learning_rate": 3.736093435602968e-06, "loss": 0.52907622, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.28320312, "step": 3167, "time_per_iteration": 3.372403621673584 }, { "auxiliary_loss_clip": 0.0156611, "auxiliary_loss_mlp": 0.01050964, "balance_loss_clip": 1.35044646, "balance_loss_mlp": 1.02459502, "epoch": 0.19047046445212687, "flos": 21918912480000.0, "grad_norm": 1.8639202814268288, "language_loss": 0.75131738, "learning_rate": 3.7359000417695156e-06, "loss": 0.77748811, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26391602, "step": 3168, "time_per_iteration": 2.822801113128662 }, { "auxiliary_loss_clip": 0.01358899, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.22566354, "balance_loss_mlp": 1.00235391, "epoch": 0.19053058770479483, "flos": 59280593552640.0, "grad_norm": 0.8927012824871449, "language_loss": 0.60137618, "learning_rate": 3.73570658211056e-06, "loss": 0.6252557, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.26757812, "step": 3169, "time_per_iteration": 3.2251923084259033 }, { "auxiliary_loss_clip": 0.01602767, "auxiliary_loss_mlp": 0.01047079, "balance_loss_clip": 1.37611556, "balance_loss_mlp": 1.02037573, "epoch": 0.1905907109574628, "flos": 23961839287680.0, "grad_norm": 1.672524400730476, "language_loss": 0.79639339, "learning_rate": 3.735513056633436e-06, "loss": 0.82289183, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.26721191, "step": 3170, "time_per_iteration": 2.870544195175171 }, { "auxiliary_loss_clip": 0.01582535, "auxiliary_loss_mlp": 0.01045175, "balance_loss_clip": 1.36352396, "balance_loss_mlp": 1.01812673, "epoch": 0.19065083421013077, "flos": 20821722624000.0, "grad_norm": 1.762526474556542, "language_loss": 0.79549479, "learning_rate": 3.7353194653454834e-06, "loss": 0.82177192, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27038574, "step": 3171, "time_per_iteration": 2.871091365814209 }, { "auxiliary_loss_clip": 0.01613734, "auxiliary_loss_mlp": 0.01045354, "balance_loss_clip": 1.38215482, "balance_loss_mlp": 1.01849663, "epoch": 0.19071095746279873, "flos": 31297429013760.0, "grad_norm": 2.1491517315433035, "language_loss": 0.80146438, "learning_rate": 3.7351258082540426e-06, "loss": 0.82805526, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 2.31835938, "router_z_loss_mlp": 0.26879883, "step": 3172, "time_per_iteration": 3.0304644107818604 }, { "auxiliary_loss_clip": 0.01583974, "auxiliary_loss_mlp": 0.01050931, "balance_loss_clip": 1.36377609, "balance_loss_mlp": 1.02518141, "epoch": 0.1907710807154667, "flos": 14364310083840.0, "grad_norm": 1.9593912954730246, "language_loss": 0.8177948, "learning_rate": 3.7349320853664576e-06, "loss": 0.84414387, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.25756836, "step": 3173, "time_per_iteration": 2.82743501663208 }, { "auxiliary_loss_clip": 0.01598949, "auxiliary_loss_mlp": 0.01048347, "balance_loss_clip": 1.37270784, "balance_loss_mlp": 1.02264535, "epoch": 0.1908312039681347, "flos": 26918442466560.0, "grad_norm": 5.630213003996178, "language_loss": 0.79590869, "learning_rate": 3.7347382966900735e-06, "loss": 0.82238162, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.25708008, "step": 3174, "time_per_iteration": 2.914959192276001 }, { "auxiliary_loss_clip": 0.01605777, "auxiliary_loss_mlp": 0.01048846, "balance_loss_clip": 1.38010073, "balance_loss_mlp": 1.02207136, "epoch": 0.19089132722080265, "flos": 14501013183360.0, "grad_norm": 2.189019942797107, "language_loss": 0.82303667, "learning_rate": 3.7345444422322395e-06, "loss": 0.84958285, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 2.25585938, "router_z_loss_mlp": 0.26782227, "step": 3175, "time_per_iteration": 2.836819887161255 }, { "auxiliary_loss_clip": 0.01607444, "auxiliary_loss_mlp": 0.01055218, "balance_loss_clip": 1.38012362, "balance_loss_mlp": 1.02863431, "epoch": 0.19095145047347062, "flos": 13960471829760.0, "grad_norm": 2.0221176055758874, "language_loss": 0.87212706, "learning_rate": 3.7343505220003067e-06, "loss": 0.89875364, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.26611328, "step": 3176, "time_per_iteration": 2.8335506916046143 }, { "auxiliary_loss_clip": 0.0161666, "auxiliary_loss_mlp": 0.01045417, "balance_loss_clip": 1.38599944, "balance_loss_mlp": 1.01696157, "epoch": 0.19101157372613858, "flos": 25312862327040.0, "grad_norm": 1.9335281878751003, "language_loss": 0.83261561, "learning_rate": 3.7341565360016285e-06, "loss": 0.85923642, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 2.3046875, "router_z_loss_mlp": 0.28491211, "step": 3177, "time_per_iteration": 2.8637807369232178 }, { "auxiliary_loss_clip": 0.01594735, "auxiliary_loss_mlp": 0.01042105, "balance_loss_clip": 1.3718338, "balance_loss_mlp": 1.01593864, "epoch": 0.19107169697880655, "flos": 20567889440640.0, "grad_norm": 2.0585533787105343, "language_loss": 0.767537, "learning_rate": 3.73396248424356e-06, "loss": 0.79390544, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26196289, "step": 3178, "time_per_iteration": 2.8514297008514404 }, { "auxiliary_loss_clip": 0.01604265, "auxiliary_loss_mlp": 0.01041751, "balance_loss_clip": 1.37830174, "balance_loss_mlp": 1.0154295, "epoch": 0.19113182023147451, "flos": 22173198111360.0, "grad_norm": 1.7623080237531925, "language_loss": 0.82116759, "learning_rate": 3.7337683667334606e-06, "loss": 0.84762776, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.26379395, "step": 3179, "time_per_iteration": 2.9932940006256104 }, { "auxiliary_loss_clip": 0.01604155, "auxiliary_loss_mlp": 0.01044849, "balance_loss_clip": 1.37981856, "balance_loss_mlp": 1.01846862, "epoch": 0.19119194348414248, "flos": 18589355262720.0, "grad_norm": 2.473546059181213, "language_loss": 0.80593711, "learning_rate": 3.733574183478691e-06, "loss": 0.83242714, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 2.2421875, "router_z_loss_mlp": 0.26379395, "step": 3180, "time_per_iteration": 2.796189069747925 }, { "auxiliary_loss_clip": 0.01593418, "auxiliary_loss_mlp": 0.01047769, "balance_loss_clip": 1.37156522, "balance_loss_mlp": 1.02149546, "epoch": 0.19125206673681047, "flos": 19036112849280.0, "grad_norm": 2.0358117694384505, "language_loss": 0.80586076, "learning_rate": 3.733379934486615e-06, "loss": 0.83227265, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26269531, "step": 3181, "time_per_iteration": 2.809083938598633 }, { "auxiliary_loss_clip": 0.01600307, "auxiliary_loss_mlp": 0.01048199, "balance_loss_clip": 1.37592936, "balance_loss_mlp": 1.0215317, "epoch": 0.19131218998947844, "flos": 21700352257920.0, "grad_norm": 2.2364548236052313, "language_loss": 0.74678075, "learning_rate": 3.7331856197645973e-06, "loss": 0.77326578, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26696777, "step": 3182, "time_per_iteration": 2.817657709121704 }, { "auxiliary_loss_clip": 0.01590158, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.36903358, "balance_loss_mlp": 1.01670861, "epoch": 0.1913723132421464, "flos": 18451747267200.0, "grad_norm": 1.846974170174953, "language_loss": 0.66717911, "learning_rate": 3.7329912393200084e-06, "loss": 0.69350958, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26196289, "step": 3183, "time_per_iteration": 2.8341546058654785 }, { "auxiliary_loss_clip": 0.01612259, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.38235044, "balance_loss_mlp": 1.01837778, "epoch": 0.19143243649481437, "flos": 27170963550720.0, "grad_norm": 1.4884668406026045, "language_loss": 0.73681599, "learning_rate": 3.7327967931602173e-06, "loss": 0.7633872, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 2.29882812, "router_z_loss_mlp": 0.26513672, "step": 3184, "time_per_iteration": 2.8783516883850098 }, { "auxiliary_loss_clip": 0.01603129, "auxiliary_loss_mlp": 0.01050011, "balance_loss_clip": 1.37638557, "balance_loss_mlp": 1.02008915, "epoch": 0.19149255974748233, "flos": 21727164441600.0, "grad_norm": 1.772221773319966, "language_loss": 0.89248055, "learning_rate": 3.732602281292598e-06, "loss": 0.91901195, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.29882812, "step": 3185, "time_per_iteration": 2.7932181358337402 }, { "auxiliary_loss_clip": 0.01585889, "auxiliary_loss_mlp": 0.01040624, "balance_loss_clip": 1.36216378, "balance_loss_mlp": 1.01405263, "epoch": 0.1915526830001503, "flos": 22972821045120.0, "grad_norm": 1.8431155412777869, "language_loss": 0.73857898, "learning_rate": 3.7324077037245267e-06, "loss": 0.76484406, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26574707, "step": 3186, "time_per_iteration": 2.9249303340911865 }, { "auxiliary_loss_clip": 0.01608186, "auxiliary_loss_mlp": 0.01049689, "balance_loss_clip": 1.3814677, "balance_loss_mlp": 1.02224731, "epoch": 0.1916128062528183, "flos": 26151920743680.0, "grad_norm": 1.8240701997055138, "language_loss": 0.84880418, "learning_rate": 3.7322130604633825e-06, "loss": 0.8753829, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.2746582, "step": 3187, "time_per_iteration": 2.8734729290008545 }, { "auxiliary_loss_clip": 0.01351451, "auxiliary_loss_mlp": 0.01042981, "balance_loss_clip": 1.2171154, "balance_loss_mlp": 1.0181855, "epoch": 0.19167292950548626, "flos": 54953303829120.0, "grad_norm": 0.8802834394269181, "language_loss": 0.55876303, "learning_rate": 3.732018351516544e-06, "loss": 0.58270729, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.24707031, "step": 3188, "time_per_iteration": 4.797913312911987 }, { "auxiliary_loss_clip": 0.01596319, "auxiliary_loss_mlp": 0.01051803, "balance_loss_clip": 1.37255955, "balance_loss_mlp": 1.02521968, "epoch": 0.19173305275815422, "flos": 29947853808000.0, "grad_norm": 1.5767004351929843, "language_loss": 0.7047919, "learning_rate": 3.731823576891397e-06, "loss": 0.73127306, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26574707, "step": 3189, "time_per_iteration": 2.9395627975463867 }, { "auxiliary_loss_clip": 0.01582261, "auxiliary_loss_mlp": 0.01043256, "balance_loss_clip": 1.36426187, "balance_loss_mlp": 1.01753068, "epoch": 0.1917931760108222, "flos": 24762819565440.0, "grad_norm": 1.8111492738173383, "language_loss": 0.74900985, "learning_rate": 3.7316287365953266e-06, "loss": 0.7752651, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25756836, "step": 3190, "time_per_iteration": 2.8578155040740967 }, { "auxiliary_loss_clip": 0.01586477, "auxiliary_loss_mlp": 0.01049947, "balance_loss_clip": 1.36669457, "balance_loss_mlp": 1.02397144, "epoch": 0.19185329926349015, "flos": 18852825588480.0, "grad_norm": 2.287782808313809, "language_loss": 0.85813051, "learning_rate": 3.73143383063572e-06, "loss": 0.88449478, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.25976562, "step": 3191, "time_per_iteration": 2.8653664588928223 }, { "auxiliary_loss_clip": 0.01587691, "auxiliary_loss_mlp": 0.01044797, "balance_loss_clip": 1.36862862, "balance_loss_mlp": 1.01860666, "epoch": 0.19191342251615812, "flos": 22095955958400.0, "grad_norm": 2.0842952667217585, "language_loss": 0.90604949, "learning_rate": 3.73123885901997e-06, "loss": 0.93237436, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26184082, "step": 3192, "time_per_iteration": 2.844270706176758 }, { "auxiliary_loss_clip": 0.01607469, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.37988842, "balance_loss_mlp": 1.02453244, "epoch": 0.19197354576882608, "flos": 22208968765440.0, "grad_norm": 1.9865084382874445, "language_loss": 0.75272024, "learning_rate": 3.7310438217554687e-06, "loss": 0.77932155, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.28173828, "step": 3193, "time_per_iteration": 2.8871216773986816 }, { "auxiliary_loss_clip": 0.01599558, "auxiliary_loss_mlp": 0.0104935, "balance_loss_clip": 1.37238002, "balance_loss_mlp": 1.02190816, "epoch": 0.19203366902149407, "flos": 24905992671360.0, "grad_norm": 1.7028475204125064, "language_loss": 0.75825262, "learning_rate": 3.730848718849612e-06, "loss": 0.7847417, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.27441406, "step": 3194, "time_per_iteration": 2.9212520122528076 }, { "auxiliary_loss_clip": 0.01333945, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.19878316, "balance_loss_mlp": 1.00963616, "epoch": 0.19209379227416204, "flos": 68445481547520.0, "grad_norm": 0.7951739055743999, "language_loss": 0.68538529, "learning_rate": 3.7306535503097985e-06, "loss": 0.70906907, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 1.3515625, "router_z_loss_mlp": 0.24804688, "step": 3195, "time_per_iteration": 4.729078769683838 }, { "auxiliary_loss_clip": 0.01597646, "auxiliary_loss_mlp": 0.01054015, "balance_loss_clip": 1.374331, "balance_loss_mlp": 1.027336, "epoch": 0.19215391552683, "flos": 22065524190720.0, "grad_norm": 2.09716449351901, "language_loss": 0.7462787, "learning_rate": 3.730458316143429e-06, "loss": 0.77279532, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.2668457, "step": 3196, "time_per_iteration": 2.8374783992767334 }, { "auxiliary_loss_clip": 0.01601126, "auxiliary_loss_mlp": 0.01049476, "balance_loss_clip": 1.37838554, "balance_loss_mlp": 1.02263069, "epoch": 0.19221403877949797, "flos": 20312608423680.0, "grad_norm": 1.7391405017110613, "language_loss": 0.84154677, "learning_rate": 3.7302630163579068e-06, "loss": 0.86805272, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26831055, "step": 3197, "time_per_iteration": 2.8432765007019043 }, { "auxiliary_loss_clip": 0.01593632, "auxiliary_loss_mlp": 0.01049425, "balance_loss_clip": 1.36735654, "balance_loss_mlp": 1.02193511, "epoch": 0.19227416203216594, "flos": 23195724768000.0, "grad_norm": 2.488101655320581, "language_loss": 0.81374812, "learning_rate": 3.7300676509606373e-06, "loss": 0.84017873, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 2.26367188, "router_z_loss_mlp": 0.27490234, "step": 3198, "time_per_iteration": 4.30849552154541 }, { "auxiliary_loss_clip": 0.01602188, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.37634027, "balance_loss_mlp": 1.01913261, "epoch": 0.1923342852848339, "flos": 25787879930880.0, "grad_norm": 3.119331425499335, "language_loss": 0.79682016, "learning_rate": 3.729872219959029e-06, "loss": 0.82331121, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.27770996, "step": 3199, "time_per_iteration": 4.211105585098267 }, { "auxiliary_loss_clip": 0.01593932, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.370785, "balance_loss_mlp": 1.01815772, "epoch": 0.19239440853750187, "flos": 17137083064320.0, "grad_norm": 2.1531842717003657, "language_loss": 0.85675061, "learning_rate": 3.7296767233604934e-06, "loss": 0.88313019, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25878906, "step": 3200, "time_per_iteration": 2.8035526275634766 }, { "auxiliary_loss_clip": 0.01594119, "auxiliary_loss_mlp": 0.01055649, "balance_loss_clip": 1.37127519, "balance_loss_mlp": 1.02897024, "epoch": 0.19245453179016986, "flos": 16443640972800.0, "grad_norm": 2.040986317692293, "language_loss": 0.80025339, "learning_rate": 3.729481161172443e-06, "loss": 0.82675099, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26696777, "step": 3201, "time_per_iteration": 2.8297629356384277 }, { "auxiliary_loss_clip": 0.01591571, "auxiliary_loss_mlp": 0.01046334, "balance_loss_clip": 1.36688066, "balance_loss_mlp": 1.01865339, "epoch": 0.19251465504283782, "flos": 20239981240320.0, "grad_norm": 2.21565989526193, "language_loss": 0.7046659, "learning_rate": 3.7292855334022927e-06, "loss": 0.73104495, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27734375, "step": 3202, "time_per_iteration": 2.8338985443115234 }, { "auxiliary_loss_clip": 0.01576039, "auxiliary_loss_mlp": 0.0104723, "balance_loss_clip": 1.35748434, "balance_loss_mlp": 1.01944244, "epoch": 0.1925747782955058, "flos": 19473957210240.0, "grad_norm": 1.7830657217416914, "language_loss": 0.91912627, "learning_rate": 3.7290898400574627e-06, "loss": 0.94535899, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.27783203, "step": 3203, "time_per_iteration": 2.8469111919403076 }, { "auxiliary_loss_clip": 0.01599124, "auxiliary_loss_mlp": 0.01054967, "balance_loss_clip": 1.37198436, "balance_loss_mlp": 1.02838349, "epoch": 0.19263490154817375, "flos": 17794618767360.0, "grad_norm": 2.349201103598365, "language_loss": 0.83755696, "learning_rate": 3.7288940811453725e-06, "loss": 0.86409789, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 2.27148438, "router_z_loss_mlp": 0.26635742, "step": 3204, "time_per_iteration": 2.830904483795166 }, { "auxiliary_loss_clip": 0.01600775, "auxiliary_loss_mlp": 0.01045722, "balance_loss_clip": 1.3787117, "balance_loss_mlp": 1.01926982, "epoch": 0.19269502480084172, "flos": 17465986650240.0, "grad_norm": 1.8727635968072005, "language_loss": 0.76045895, "learning_rate": 3.7286982566734454e-06, "loss": 0.78692389, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.26464844, "step": 3205, "time_per_iteration": 2.8485400676727295 }, { "auxiliary_loss_clip": 0.01615377, "auxiliary_loss_mlp": 0.01051843, "balance_loss_clip": 1.38800359, "balance_loss_mlp": 1.02496076, "epoch": 0.19275514805350968, "flos": 21516838773120.0, "grad_norm": 2.7750057895904674, "language_loss": 0.8462128, "learning_rate": 3.728502366649107e-06, "loss": 0.87288499, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 2.27539062, "router_z_loss_mlp": 0.2689209, "step": 3206, "time_per_iteration": 2.8545782566070557 }, { "auxiliary_loss_clip": 0.01320873, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.18672788, "balance_loss_mlp": 1.0119921, "epoch": 0.19281527130617768, "flos": 47720184871680.0, "grad_norm": 0.8620259903875853, "language_loss": 0.6063863, "learning_rate": 3.728306411079786e-06, "loss": 0.62992096, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.20605469, "step": 3207, "time_per_iteration": 3.2039854526519775 }, { "auxiliary_loss_clip": 0.01596403, "auxiliary_loss_mlp": 0.01060979, "balance_loss_clip": 1.36947465, "balance_loss_mlp": 1.03458595, "epoch": 0.19287539455884564, "flos": 11808559002240.0, "grad_norm": 2.8521803347963894, "language_loss": 0.76094502, "learning_rate": 3.7281103899729125e-06, "loss": 0.78751886, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.26367188, "step": 3208, "time_per_iteration": 2.8869776725769043 }, { "auxiliary_loss_clip": 0.01619295, "auxiliary_loss_mlp": 0.01068733, "balance_loss_clip": 1.39156532, "balance_loss_mlp": 1.04157662, "epoch": 0.1929355178115136, "flos": 20641195296000.0, "grad_norm": 1.9126386356728184, "language_loss": 0.61829174, "learning_rate": 3.7279143033359195e-06, "loss": 0.645172, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 2.27929688, "router_z_loss_mlp": 0.27197266, "step": 3209, "time_per_iteration": 2.841313362121582 }, { "auxiliary_loss_clip": 0.01602133, "auxiliary_loss_mlp": 0.01066796, "balance_loss_clip": 1.375139, "balance_loss_mlp": 1.04103506, "epoch": 0.19299564106418157, "flos": 40822873971840.0, "grad_norm": 1.8264743848593412, "language_loss": 0.80984139, "learning_rate": 3.727718151176243e-06, "loss": 0.83653069, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.2578125, "step": 3210, "time_per_iteration": 3.0539228916168213 }, { "auxiliary_loss_clip": 0.01584488, "auxiliary_loss_mlp": 0.0106464, "balance_loss_clip": 1.3660655, "balance_loss_mlp": 1.03872442, "epoch": 0.19305576431684954, "flos": 11368090442880.0, "grad_norm": 1.9813870026955729, "language_loss": 0.84017658, "learning_rate": 3.7275219335013217e-06, "loss": 0.86666787, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2590332, "step": 3211, "time_per_iteration": 2.8057193756103516 }, { "auxiliary_loss_clip": 0.01319548, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.18636191, "balance_loss_mlp": 1.0087055, "epoch": 0.1931158875695175, "flos": 54536073114240.0, "grad_norm": 0.9769680721396009, "language_loss": 0.63659501, "learning_rate": 3.7273256503185953e-06, "loss": 0.66016364, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 1.328125, "router_z_loss_mlp": 0.28515625, "step": 3212, "time_per_iteration": 3.239260196685791 }, { "auxiliary_loss_clip": 0.0159117, "auxiliary_loss_mlp": 0.01063672, "balance_loss_clip": 1.37036574, "balance_loss_mlp": 1.03859043, "epoch": 0.19317601082218547, "flos": 19837907533440.0, "grad_norm": 1.5492710082922068, "language_loss": 0.77521461, "learning_rate": 3.7271293016355074e-06, "loss": 0.801763, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25109863, "step": 3213, "time_per_iteration": 2.8692479133605957 }, { "auxiliary_loss_clip": 0.01612785, "auxiliary_loss_mlp": 0.01067722, "balance_loss_clip": 1.38442326, "balance_loss_mlp": 1.04146004, "epoch": 0.19323613407485346, "flos": 13159944000000.0, "grad_norm": 4.360238204682326, "language_loss": 0.72277731, "learning_rate": 3.726932887459503e-06, "loss": 0.74958235, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 2.28125, "router_z_loss_mlp": 0.26281738, "step": 3214, "time_per_iteration": 2.8058691024780273 }, { "auxiliary_loss_clip": 0.01596984, "auxiliary_loss_mlp": 0.01078921, "balance_loss_clip": 1.37240791, "balance_loss_mlp": 1.05035853, "epoch": 0.19329625732752143, "flos": 14035225518720.0, "grad_norm": 2.4609375825808555, "language_loss": 0.76370353, "learning_rate": 3.72673640779803e-06, "loss": 0.79046255, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.2857666, "step": 3215, "time_per_iteration": 2.935652732849121 }, { "auxiliary_loss_clip": 0.01591032, "auxiliary_loss_mlp": 0.01073795, "balance_loss_clip": 1.37166977, "balance_loss_mlp": 1.04678226, "epoch": 0.1933563805801894, "flos": 23452453618560.0, "grad_norm": 1.687377028570692, "language_loss": 0.88767099, "learning_rate": 3.72653986265854e-06, "loss": 0.91431922, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.27001953, "step": 3216, "time_per_iteration": 2.852292776107788 }, { "auxiliary_loss_clip": 0.01593834, "auxiliary_loss_mlp": 0.0107024, "balance_loss_clip": 1.37295151, "balance_loss_mlp": 1.04552794, "epoch": 0.19341650383285736, "flos": 20494990788480.0, "grad_norm": 1.8937768693185784, "language_loss": 0.80887246, "learning_rate": 3.726343252048485e-06, "loss": 0.83551323, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.24719238, "step": 3217, "time_per_iteration": 2.855722665786743 }, { "auxiliary_loss_clip": 0.01627515, "auxiliary_loss_mlp": 0.01068219, "balance_loss_clip": 1.39379144, "balance_loss_mlp": 1.04010987, "epoch": 0.19347662708552532, "flos": 17867517419520.0, "grad_norm": 2.8794496926170363, "language_loss": 0.64319146, "learning_rate": 3.7261465759753206e-06, "loss": 0.67014873, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 2.33984375, "router_z_loss_mlp": 0.28125, "step": 3218, "time_per_iteration": 2.773016929626465 }, { "auxiliary_loss_clip": 0.01610255, "auxiliary_loss_mlp": 0.01069544, "balance_loss_clip": 1.38459754, "balance_loss_mlp": 1.04298401, "epoch": 0.1935367503381933, "flos": 18196466250240.0, "grad_norm": 1.6043076997911772, "language_loss": 0.80759645, "learning_rate": 3.7259498344465053e-06, "loss": 0.83439445, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 2.2578125, "router_z_loss_mlp": 0.26611328, "step": 3219, "time_per_iteration": 2.803114414215088 }, { "auxiliary_loss_clip": 0.01588709, "auxiliary_loss_mlp": 0.01058174, "balance_loss_clip": 1.3689568, "balance_loss_mlp": 1.03180504, "epoch": 0.19359687359086128, "flos": 15964596581760.0, "grad_norm": 2.425071376728323, "language_loss": 0.86856198, "learning_rate": 3.7257530274694993e-06, "loss": 0.8950308, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26367188, "step": 3220, "time_per_iteration": 2.8387198448181152 }, { "auxiliary_loss_clip": 0.01568554, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.35427785, "balance_loss_mlp": 1.0254643, "epoch": 0.19365699684352924, "flos": 21225108430080.0, "grad_norm": 2.284279702136911, "language_loss": 0.84565628, "learning_rate": 3.725556155051766e-06, "loss": 0.87184894, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25244141, "step": 3221, "time_per_iteration": 2.7967641353607178 }, { "auxiliary_loss_clip": 0.01587566, "auxiliary_loss_mlp": 0.0106314, "balance_loss_clip": 1.3699975, "balance_loss_mlp": 1.03747439, "epoch": 0.1937171200961972, "flos": 17319963121920.0, "grad_norm": 2.011883242117417, "language_loss": 0.87132215, "learning_rate": 3.7253592172007702e-06, "loss": 0.89782917, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.2565918, "step": 3222, "time_per_iteration": 2.91170597076416 }, { "auxiliary_loss_clip": 0.01591441, "auxiliary_loss_mlp": 0.01051263, "balance_loss_clip": 1.36776328, "balance_loss_mlp": 1.02514386, "epoch": 0.19377724334886517, "flos": 22645682006400.0, "grad_norm": 1.6817588034285855, "language_loss": 0.78897297, "learning_rate": 3.72516221392398e-06, "loss": 0.81540006, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26135254, "step": 3223, "time_per_iteration": 2.830341339111328 }, { "auxiliary_loss_clip": 0.01589622, "auxiliary_loss_mlp": 0.0105731, "balance_loss_clip": 1.37039995, "balance_loss_mlp": 1.03164411, "epoch": 0.19383736660153314, "flos": 15084111911040.0, "grad_norm": 1.8272496385227819, "language_loss": 0.76433623, "learning_rate": 3.7249651452288653e-06, "loss": 0.79080552, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25683594, "step": 3224, "time_per_iteration": 4.192519187927246 }, { "auxiliary_loss_clip": 0.01589658, "auxiliary_loss_mlp": 0.01049856, "balance_loss_clip": 1.36779058, "balance_loss_mlp": 1.02306986, "epoch": 0.1938974898542011, "flos": 47136525223680.0, "grad_norm": 2.3436830097037857, "language_loss": 0.72352314, "learning_rate": 3.7247680111229e-06, "loss": 0.74991834, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.26794434, "step": 3225, "time_per_iteration": 3.0543878078460693 }, { "auxiliary_loss_clip": 0.01582704, "auxiliary_loss_mlp": 0.01043167, "balance_loss_clip": 1.36156082, "balance_loss_mlp": 1.01797855, "epoch": 0.19395761310686907, "flos": 25823695829760.0, "grad_norm": 2.1239632362484535, "language_loss": 0.70306295, "learning_rate": 3.7245708116135585e-06, "loss": 0.72932172, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.25195312, "step": 3226, "time_per_iteration": 2.8952291011810303 }, { "auxiliary_loss_clip": 0.01576765, "auxiliary_loss_mlp": 0.01045953, "balance_loss_clip": 1.36182499, "balance_loss_mlp": 1.0184629, "epoch": 0.19401773635953706, "flos": 23050017953280.0, "grad_norm": 1.773626252894541, "language_loss": 0.77144039, "learning_rate": 3.7243735467083193e-06, "loss": 0.7976675, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27453613, "step": 3227, "time_per_iteration": 2.8513972759246826 }, { "auxiliary_loss_clip": 0.01581917, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.36153793, "balance_loss_mlp": 1.01813149, "epoch": 0.19407785961220503, "flos": 15928282990080.0, "grad_norm": 2.0070380154588463, "language_loss": 0.70349103, "learning_rate": 3.724176216414662e-06, "loss": 0.72976053, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2689209, "step": 3228, "time_per_iteration": 2.7930748462677 }, { "auxiliary_loss_clip": 0.01572314, "auxiliary_loss_mlp": 0.01043931, "balance_loss_clip": 1.35513091, "balance_loss_mlp": 1.01770544, "epoch": 0.194137982864873, "flos": 25932817584000.0, "grad_norm": 1.8911257198333322, "language_loss": 0.74871588, "learning_rate": 3.72397882074007e-06, "loss": 0.77487838, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.26220703, "step": 3229, "time_per_iteration": 2.905484914779663 }, { "auxiliary_loss_clip": 0.01580427, "auxiliary_loss_mlp": 0.01050576, "balance_loss_clip": 1.36251915, "balance_loss_mlp": 1.02398014, "epoch": 0.19419810611754096, "flos": 13269201488640.0, "grad_norm": 1.802127469998507, "language_loss": 0.66891289, "learning_rate": 3.7237813596920285e-06, "loss": 0.69522291, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26574707, "step": 3230, "time_per_iteration": 2.8357434272766113 }, { "auxiliary_loss_clip": 0.01567465, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.35409427, "balance_loss_mlp": 1.01609755, "epoch": 0.19425822937020892, "flos": 15713975779200.0, "grad_norm": 3.8864357563038427, "language_loss": 0.82241488, "learning_rate": 3.7235838332780254e-06, "loss": 0.84852892, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.27856445, "step": 3231, "time_per_iteration": 4.33530592918396 }, { "auxiliary_loss_clip": 0.01582729, "auxiliary_loss_mlp": 0.01043303, "balance_loss_clip": 1.36430192, "balance_loss_mlp": 1.01598072, "epoch": 0.1943183526228769, "flos": 23113958135040.0, "grad_norm": 2.0536648829359048, "language_loss": 0.87550187, "learning_rate": 3.72338624150555e-06, "loss": 0.90176225, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2734375, "step": 3232, "time_per_iteration": 2.9359803199768066 }, { "auxiliary_loss_clip": 0.01572505, "auxiliary_loss_mlp": 0.01046496, "balance_loss_clip": 1.35708928, "balance_loss_mlp": 1.0189352, "epoch": 0.19437847587554485, "flos": 24722071983360.0, "grad_norm": 1.6786380220459685, "language_loss": 0.85871375, "learning_rate": 3.723188584382096e-06, "loss": 0.88490373, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27587891, "step": 3233, "time_per_iteration": 4.296800851821899 }, { "auxiliary_loss_clip": 0.0158688, "auxiliary_loss_mlp": 0.01049253, "balance_loss_clip": 1.36515856, "balance_loss_mlp": 1.02225184, "epoch": 0.19443859912821285, "flos": 23127486330240.0, "grad_norm": 1.6079227148715274, "language_loss": 0.89754075, "learning_rate": 3.722990861915158e-06, "loss": 0.92390203, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2701416, "step": 3234, "time_per_iteration": 2.8633687496185303 }, { "auxiliary_loss_clip": 0.01574593, "auxiliary_loss_mlp": 0.01044705, "balance_loss_clip": 1.35291719, "balance_loss_mlp": 1.01762033, "epoch": 0.1944987223808808, "flos": 15092391709440.0, "grad_norm": 2.2392622915992964, "language_loss": 0.7960093, "learning_rate": 3.722793074112234e-06, "loss": 0.82220232, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27099609, "step": 3235, "time_per_iteration": 2.793816089630127 }, { "auxiliary_loss_clip": 0.01575111, "auxiliary_loss_mlp": 0.01041642, "balance_loss_clip": 1.35941243, "balance_loss_mlp": 1.01546383, "epoch": 0.19455884563354878, "flos": 17135228027520.0, "grad_norm": 1.8324927368184571, "language_loss": 0.80006051, "learning_rate": 3.7225952209808233e-06, "loss": 0.82622802, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26171875, "step": 3236, "time_per_iteration": 2.837857246398926 }, { "auxiliary_loss_clip": 0.01567549, "auxiliary_loss_mlp": 0.01045271, "balance_loss_clip": 1.35373664, "balance_loss_mlp": 1.01832926, "epoch": 0.19461896888621674, "flos": 20202988976640.0, "grad_norm": 1.9527497248702526, "language_loss": 0.77429587, "learning_rate": 3.72239730252843e-06, "loss": 0.80042404, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.26977539, "step": 3237, "time_per_iteration": 2.8135523796081543 }, { "auxiliary_loss_clip": 0.01581638, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.36006284, "balance_loss_mlp": 1.01767969, "epoch": 0.1946790921388847, "flos": 25312274144640.0, "grad_norm": 1.923058663366835, "language_loss": 0.76085436, "learning_rate": 3.7221993187625583e-06, "loss": 0.78711486, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26745605, "step": 3238, "time_per_iteration": 2.8834903240203857 }, { "auxiliary_loss_clip": 0.01556076, "auxiliary_loss_mlp": 0.01042301, "balance_loss_clip": 1.34154749, "balance_loss_mlp": 1.01540732, "epoch": 0.19473921539155267, "flos": 20202988976640.0, "grad_norm": 1.802520751952537, "language_loss": 0.74851322, "learning_rate": 3.7220012696907155e-06, "loss": 0.77449703, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2689209, "step": 3239, "time_per_iteration": 2.794116735458374 }, { "auxiliary_loss_clip": 0.01571223, "auxiliary_loss_mlp": 0.01044349, "balance_loss_clip": 1.35656118, "balance_loss_mlp": 1.01582253, "epoch": 0.19479933864422067, "flos": 20897562188160.0, "grad_norm": 1.6813017588338317, "language_loss": 0.74472404, "learning_rate": 3.721803155320412e-06, "loss": 0.77087975, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.28491211, "step": 3240, "time_per_iteration": 2.854987859725952 }, { "auxiliary_loss_clip": 0.01573688, "auxiliary_loss_mlp": 0.01043678, "balance_loss_clip": 1.3580128, "balance_loss_mlp": 1.01628375, "epoch": 0.19485946189688863, "flos": 23305570439040.0, "grad_norm": 1.9677566399792343, "language_loss": 0.67736065, "learning_rate": 3.7216049756591606e-06, "loss": 0.70353425, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27416992, "step": 3241, "time_per_iteration": 2.8206679821014404 }, { "auxiliary_loss_clip": 0.01580424, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.36328077, "balance_loss_mlp": 1.01864934, "epoch": 0.1949195851495566, "flos": 23305525194240.0, "grad_norm": 1.4361777861871758, "language_loss": 0.83968264, "learning_rate": 3.7214067307144754e-06, "loss": 0.86594945, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.27587891, "step": 3242, "time_per_iteration": 2.869105100631714 }, { "auxiliary_loss_clip": 0.01327075, "auxiliary_loss_mlp": 0.01066762, "balance_loss_clip": 1.19077694, "balance_loss_mlp": 1.04177594, "epoch": 0.19497970840222456, "flos": 64993925790720.0, "grad_norm": 0.8324449489093395, "language_loss": 0.57583505, "learning_rate": 3.721208420493875e-06, "loss": 0.59977341, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.25, "step": 3243, "time_per_iteration": 3.376127004623413 }, { "auxiliary_loss_clip": 0.01579206, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.36200571, "balance_loss_mlp": 1.02267623, "epoch": 0.19503983165489253, "flos": 19653805866240.0, "grad_norm": 1.7547136988143541, "language_loss": 0.84563404, "learning_rate": 3.7210100450048784e-06, "loss": 0.87193465, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.28149414, "step": 3244, "time_per_iteration": 2.869676351547241 }, { "auxiliary_loss_clip": 0.0158971, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.37021792, "balance_loss_mlp": 1.03003407, "epoch": 0.1950999549075605, "flos": 21151802574720.0, "grad_norm": 2.0642628983726325, "language_loss": 0.77685481, "learning_rate": 3.7208116042550088e-06, "loss": 0.80334228, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.28967285, "step": 3245, "time_per_iteration": 2.9199533462524414 }, { "auxiliary_loss_clip": 0.01567959, "auxiliary_loss_mlp": 0.01046076, "balance_loss_clip": 1.34968722, "balance_loss_mlp": 1.01775146, "epoch": 0.19516007816022846, "flos": 20894123583360.0, "grad_norm": 2.0565330467835152, "language_loss": 0.85669935, "learning_rate": 3.7206130982517906e-06, "loss": 0.88283968, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2833252, "step": 3246, "time_per_iteration": 2.9189541339874268 }, { "auxiliary_loss_clip": 0.015785, "auxiliary_loss_mlp": 0.01053579, "balance_loss_clip": 1.35883546, "balance_loss_mlp": 1.02721024, "epoch": 0.19522020141289645, "flos": 16919472983040.0, "grad_norm": 2.5872328865770227, "language_loss": 0.77856827, "learning_rate": 3.7204145270027514e-06, "loss": 0.80488908, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26342773, "step": 3247, "time_per_iteration": 2.8074357509613037 }, { "auxiliary_loss_clip": 0.01564071, "auxiliary_loss_mlp": 0.01047897, "balance_loss_clip": 1.34780395, "balance_loss_mlp": 1.02122951, "epoch": 0.19528032466556441, "flos": 26736241080960.0, "grad_norm": 1.595723199207394, "language_loss": 0.76442903, "learning_rate": 3.720215890515421e-06, "loss": 0.79054868, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 2.1640625, "router_z_loss_mlp": 0.26696777, "step": 3248, "time_per_iteration": 2.8972721099853516 }, { "auxiliary_loss_clip": 0.0157143, "auxiliary_loss_mlp": 0.01049355, "balance_loss_clip": 1.35244513, "balance_loss_mlp": 1.02148438, "epoch": 0.19534044791823238, "flos": 21042680820480.0, "grad_norm": 1.7930897151430056, "language_loss": 0.7994222, "learning_rate": 3.7200171887973316e-06, "loss": 0.82563007, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27856445, "step": 3249, "time_per_iteration": 2.823413610458374 }, { "auxiliary_loss_clip": 0.01564447, "auxiliary_loss_mlp": 0.01048886, "balance_loss_clip": 1.34753895, "balance_loss_mlp": 1.02058578, "epoch": 0.19540057117090034, "flos": 22353680194560.0, "grad_norm": 1.4867276700605674, "language_loss": 0.73765576, "learning_rate": 3.7198184218560176e-06, "loss": 0.76378906, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.28283691, "step": 3250, "time_per_iteration": 2.8628592491149902 }, { "auxiliary_loss_clip": 0.01560029, "auxiliary_loss_mlp": 0.01044637, "balance_loss_clip": 1.34588027, "balance_loss_mlp": 1.01677799, "epoch": 0.1954606944235683, "flos": 20310934366080.0, "grad_norm": 1.8631908430314594, "language_loss": 0.80786932, "learning_rate": 3.719619589699017e-06, "loss": 0.83391601, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.27868652, "step": 3251, "time_per_iteration": 2.808612823486328 }, { "auxiliary_loss_clip": 0.01570814, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.35213494, "balance_loss_mlp": 1.01608562, "epoch": 0.19552081767623627, "flos": 17355552796800.0, "grad_norm": 2.1548846916450066, "language_loss": 0.84364057, "learning_rate": 3.7194206923338695e-06, "loss": 0.86979014, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.28039551, "step": 3252, "time_per_iteration": 2.8758645057678223 }, { "auxiliary_loss_clip": 0.01589967, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.36461067, "balance_loss_mlp": 1.02660477, "epoch": 0.19558094092890424, "flos": 31990101943680.0, "grad_norm": 1.7733686687297636, "language_loss": 0.73826563, "learning_rate": 3.719221729768117e-06, "loss": 0.76472658, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 2.25195312, "router_z_loss_mlp": 0.29528809, "step": 3253, "time_per_iteration": 2.9374611377716064 }, { "auxiliary_loss_clip": 0.01586863, "auxiliary_loss_mlp": 0.01051653, "balance_loss_clip": 1.36211753, "balance_loss_mlp": 1.0239253, "epoch": 0.19564106418157223, "flos": 22277931120000.0, "grad_norm": 1.711652845036668, "language_loss": 0.77712566, "learning_rate": 3.7190227020093037e-06, "loss": 0.80351084, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.27709961, "step": 3254, "time_per_iteration": 2.841430902481079 }, { "auxiliary_loss_clip": 0.01324157, "auxiliary_loss_mlp": 0.01055939, "balance_loss_clip": 1.19419456, "balance_loss_mlp": 1.02942657, "epoch": 0.1957011874342402, "flos": 54388330283520.0, "grad_norm": 0.7666990244990247, "language_loss": 0.55396461, "learning_rate": 3.7188236090649774e-06, "loss": 0.57776558, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.265625, "step": 3255, "time_per_iteration": 3.313453197479248 }, { "auxiliary_loss_clip": 0.01582981, "auxiliary_loss_mlp": 0.01048807, "balance_loss_clip": 1.36220741, "balance_loss_mlp": 1.02118611, "epoch": 0.19576131068690816, "flos": 16514910812160.0, "grad_norm": 2.3783829668126395, "language_loss": 0.72555542, "learning_rate": 3.718624450942688e-06, "loss": 0.75187337, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27624512, "step": 3256, "time_per_iteration": 2.8090317249298096 }, { "auxiliary_loss_clip": 0.01569907, "auxiliary_loss_mlp": 0.01049409, "balance_loss_clip": 1.35294616, "balance_loss_mlp": 1.02326643, "epoch": 0.19582143393957613, "flos": 14727626979840.0, "grad_norm": 2.4476956177306817, "language_loss": 0.81569719, "learning_rate": 3.718425227649987e-06, "loss": 0.84189039, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.26123047, "step": 3257, "time_per_iteration": 2.859912872314453 }, { "auxiliary_loss_clip": 0.01587281, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.36866498, "balance_loss_mlp": 1.02142203, "epoch": 0.1958815571922441, "flos": 24436132974720.0, "grad_norm": 2.1871293228393953, "language_loss": 0.76089621, "learning_rate": 3.7182259391944292e-06, "loss": 0.78726602, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.2824707, "step": 3258, "time_per_iteration": 4.422747850418091 }, { "auxiliary_loss_clip": 0.01589469, "auxiliary_loss_mlp": 0.0104634, "balance_loss_clip": 1.36576915, "balance_loss_mlp": 1.01839721, "epoch": 0.19594168044491206, "flos": 24911150578560.0, "grad_norm": 1.6158583556339396, "language_loss": 0.75236058, "learning_rate": 3.7180265855835714e-06, "loss": 0.77871871, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27978516, "step": 3259, "time_per_iteration": 2.86773681640625 }, { "auxiliary_loss_clip": 0.01591987, "auxiliary_loss_mlp": 0.01051956, "balance_loss_clip": 1.36578357, "balance_loss_mlp": 1.02322626, "epoch": 0.19600180369758005, "flos": 12064473446400.0, "grad_norm": 2.355805514697153, "language_loss": 0.79099214, "learning_rate": 3.7178271668249735e-06, "loss": 0.81743151, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28735352, "step": 3260, "time_per_iteration": 2.8117856979370117 }, { "auxiliary_loss_clip": 0.01591244, "auxiliary_loss_mlp": 0.01049733, "balance_loss_clip": 1.3690486, "balance_loss_mlp": 1.0219574, "epoch": 0.19606192695024802, "flos": 20859981742080.0, "grad_norm": 1.8854894405731202, "language_loss": 0.83283174, "learning_rate": 3.7176276829261975e-06, "loss": 0.85924155, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27783203, "step": 3261, "time_per_iteration": 2.8003501892089844 }, { "auxiliary_loss_clip": 0.01589941, "auxiliary_loss_mlp": 0.01051255, "balance_loss_clip": 1.37095082, "balance_loss_mlp": 1.02183378, "epoch": 0.19612205020291598, "flos": 28487211321600.0, "grad_norm": 1.6839341576947946, "language_loss": 0.77180433, "learning_rate": 3.717428133894807e-06, "loss": 0.79821628, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.29394531, "step": 3262, "time_per_iteration": 2.8650102615356445 }, { "auxiliary_loss_clip": 0.01575305, "auxiliary_loss_mlp": 0.01051269, "balance_loss_clip": 1.35811329, "balance_loss_mlp": 1.023839, "epoch": 0.19618217345558395, "flos": 25567555161600.0, "grad_norm": 1.9578037795642376, "language_loss": 0.87068266, "learning_rate": 3.71722851973837e-06, "loss": 0.8969484, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.27453613, "step": 3263, "time_per_iteration": 2.905756950378418 }, { "auxiliary_loss_clip": 0.0158448, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.36397469, "balance_loss_mlp": 1.01961923, "epoch": 0.1962422967082519, "flos": 25275191391360.0, "grad_norm": 2.022196669350236, "language_loss": 0.75336623, "learning_rate": 3.717028840464455e-06, "loss": 0.77966851, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26135254, "step": 3264, "time_per_iteration": 2.844958543777466 }, { "auxiliary_loss_clip": 0.01575382, "auxiliary_loss_mlp": 0.01051938, "balance_loss_clip": 1.35946536, "balance_loss_mlp": 1.02401876, "epoch": 0.19630241996091988, "flos": 18816692976000.0, "grad_norm": 2.5296978800359, "language_loss": 0.8015877, "learning_rate": 3.7168290960806344e-06, "loss": 0.82786095, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.27929688, "step": 3265, "time_per_iteration": 2.938898801803589 }, { "auxiliary_loss_clip": 0.01316181, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.18841136, "balance_loss_mlp": 1.00674081, "epoch": 0.19636254321358784, "flos": 62347675829760.0, "grad_norm": 0.8272363273666344, "language_loss": 0.53571332, "learning_rate": 3.716629286594483e-06, "loss": 0.55918288, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.24023438, "step": 3266, "time_per_iteration": 4.7649595737457275 }, { "auxiliary_loss_clip": 0.01593469, "auxiliary_loss_mlp": 0.01061476, "balance_loss_clip": 1.36798525, "balance_loss_mlp": 1.0335815, "epoch": 0.19642266646625584, "flos": 21079220636160.0, "grad_norm": 1.8769965550384446, "language_loss": 0.80970526, "learning_rate": 3.7164294120135767e-06, "loss": 0.83625472, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 2.25390625, "router_z_loss_mlp": 0.27880859, "step": 3267, "time_per_iteration": 2.8376312255859375 }, { "auxiliary_loss_clip": 0.01570926, "auxiliary_loss_mlp": 0.01049045, "balance_loss_clip": 1.35274935, "balance_loss_mlp": 1.02205658, "epoch": 0.1964827897189238, "flos": 14546918672640.0, "grad_norm": 2.1806490223785886, "language_loss": 0.8751868, "learning_rate": 3.7162294723454953e-06, "loss": 0.9013865, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.26953125, "step": 3268, "time_per_iteration": 5.63650107383728 }, { "auxiliary_loss_clip": 0.01575541, "auxiliary_loss_mlp": 0.01048013, "balance_loss_clip": 1.36006308, "balance_loss_mlp": 1.02163184, "epoch": 0.19654291297159177, "flos": 19253949154560.0, "grad_norm": 2.3611376335432577, "language_loss": 0.70229471, "learning_rate": 3.7160294675978197e-06, "loss": 0.72853029, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26379395, "step": 3269, "time_per_iteration": 2.8611230850219727 }, { "auxiliary_loss_clip": 0.01593569, "auxiliary_loss_mlp": 0.01053334, "balance_loss_clip": 1.37081456, "balance_loss_mlp": 1.02429438, "epoch": 0.19660303622425973, "flos": 25786748810880.0, "grad_norm": 1.7618721242590047, "language_loss": 0.81536609, "learning_rate": 3.715829397778135e-06, "loss": 0.84183514, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.29052734, "step": 3270, "time_per_iteration": 2.880014181137085 }, { "auxiliary_loss_clip": 0.01575099, "auxiliary_loss_mlp": 0.01050469, "balance_loss_clip": 1.35694432, "balance_loss_mlp": 1.02353978, "epoch": 0.1966631594769277, "flos": 20604881704320.0, "grad_norm": 1.9367749141975046, "language_loss": 0.85599506, "learning_rate": 3.715629262894028e-06, "loss": 0.88225067, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26940918, "step": 3271, "time_per_iteration": 2.8191463947296143 }, { "auxiliary_loss_clip": 0.01578483, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.3633523, "balance_loss_mlp": 1.01991296, "epoch": 0.19672328272959566, "flos": 23633704863360.0, "grad_norm": 1.7973831789291228, "language_loss": 0.80932856, "learning_rate": 3.715429062953087e-06, "loss": 0.83558118, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26843262, "step": 3272, "time_per_iteration": 2.870612859725952 }, { "auxiliary_loss_clip": 0.01593214, "auxiliary_loss_mlp": 0.01049045, "balance_loss_clip": 1.37231028, "balance_loss_mlp": 1.02098322, "epoch": 0.19678340598226365, "flos": 23120925834240.0, "grad_norm": 1.7185000993549815, "language_loss": 0.81942904, "learning_rate": 3.7152287979629043e-06, "loss": 0.84585154, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.28039551, "step": 3273, "time_per_iteration": 2.965625762939453 }, { "auxiliary_loss_clip": 0.0158643, "auxiliary_loss_mlp": 0.01046986, "balance_loss_clip": 1.36639011, "balance_loss_mlp": 1.02099788, "epoch": 0.19684352923493162, "flos": 24545480952960.0, "grad_norm": 1.7118373648642495, "language_loss": 0.79090285, "learning_rate": 3.7150284679310735e-06, "loss": 0.81723702, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26025391, "step": 3274, "time_per_iteration": 2.835944890975952 }, { "auxiliary_loss_clip": 0.01598243, "auxiliary_loss_mlp": 0.01048098, "balance_loss_clip": 1.37688828, "balance_loss_mlp": 1.02068019, "epoch": 0.19690365248759958, "flos": 21805854428160.0, "grad_norm": 3.299325342709566, "language_loss": 0.82618499, "learning_rate": 3.7148280728651914e-06, "loss": 0.85264844, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.27416992, "step": 3275, "time_per_iteration": 2.74751615524292 }, { "auxiliary_loss_clip": 0.01584286, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.3653636, "balance_loss_mlp": 1.02134037, "epoch": 0.19696377574026755, "flos": 19064463356160.0, "grad_norm": 1.8626873822115226, "language_loss": 0.81949317, "learning_rate": 3.7146276127728563e-06, "loss": 0.84582984, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.28076172, "step": 3276, "time_per_iteration": 2.842057466506958 }, { "auxiliary_loss_clip": 0.01595761, "auxiliary_loss_mlp": 0.01044811, "balance_loss_clip": 1.37639832, "balance_loss_mlp": 1.01797724, "epoch": 0.19702389899293551, "flos": 22831005283200.0, "grad_norm": 1.8039233631257374, "language_loss": 0.90372175, "learning_rate": 3.7144270876616713e-06, "loss": 0.9301275, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26843262, "step": 3277, "time_per_iteration": 2.803459882736206 }, { "auxiliary_loss_clip": 0.01595908, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.36929929, "balance_loss_mlp": 1.02751923, "epoch": 0.19708402224560348, "flos": 22904763586560.0, "grad_norm": 3.299260808621769, "language_loss": 0.64157486, "learning_rate": 3.714226497539239e-06, "loss": 0.66808897, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 0.27966309, "step": 3278, "time_per_iteration": 2.829946994781494 }, { "auxiliary_loss_clip": 0.01588959, "auxiliary_loss_mlp": 0.01055691, "balance_loss_clip": 1.36745572, "balance_loss_mlp": 1.02876127, "epoch": 0.19714414549827144, "flos": 25672559639040.0, "grad_norm": 1.83437975091346, "language_loss": 0.74868053, "learning_rate": 3.714025842413166e-06, "loss": 0.77512705, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.26928711, "step": 3279, "time_per_iteration": 2.8498685359954834 }, { "auxiliary_loss_clip": 0.01574778, "auxiliary_loss_mlp": 0.01045624, "balance_loss_clip": 1.35542917, "balance_loss_mlp": 1.01863539, "epoch": 0.19720426875093944, "flos": 23926611571200.0, "grad_norm": 1.559209505308059, "language_loss": 0.83174062, "learning_rate": 3.713825122291061e-06, "loss": 0.85794461, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.2701416, "step": 3280, "time_per_iteration": 2.814842939376831 }, { "auxiliary_loss_clip": 0.01586388, "auxiliary_loss_mlp": 0.01046851, "balance_loss_clip": 1.36675549, "balance_loss_mlp": 1.01998186, "epoch": 0.1972643920036074, "flos": 13890061641600.0, "grad_norm": 1.8944552797378194, "language_loss": 0.78216481, "learning_rate": 3.713624337180536e-06, "loss": 0.80849719, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.26867676, "step": 3281, "time_per_iteration": 2.8537449836730957 }, { "auxiliary_loss_clip": 0.0157253, "auxiliary_loss_mlp": 0.01043535, "balance_loss_clip": 1.35994291, "balance_loss_mlp": 1.01842988, "epoch": 0.19732451525627537, "flos": 19872728046720.0, "grad_norm": 1.66284930184393, "language_loss": 0.8089689, "learning_rate": 3.7134234870892045e-06, "loss": 0.83512956, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25085449, "step": 3282, "time_per_iteration": 2.8340342044830322 }, { "auxiliary_loss_clip": 0.01586392, "auxiliary_loss_mlp": 0.01049159, "balance_loss_clip": 1.36637807, "balance_loss_mlp": 1.02239609, "epoch": 0.19738463850894333, "flos": 24984365944320.0, "grad_norm": 1.9907781379727785, "language_loss": 0.72700632, "learning_rate": 3.7132225720246826e-06, "loss": 0.75336182, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26806641, "step": 3283, "time_per_iteration": 2.86289644241333 }, { "auxiliary_loss_clip": 0.01585751, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.36573052, "balance_loss_mlp": 1.02341986, "epoch": 0.1974447617616113, "flos": 18377943719040.0, "grad_norm": 1.6910251332203223, "language_loss": 0.79795992, "learning_rate": 3.7130215919945886e-06, "loss": 0.82431155, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26013184, "step": 3284, "time_per_iteration": 2.8177490234375 }, { "auxiliary_loss_clip": 0.01597004, "auxiliary_loss_mlp": 0.01050511, "balance_loss_clip": 1.37327623, "balance_loss_mlp": 1.02414227, "epoch": 0.19750488501427926, "flos": 22903134773760.0, "grad_norm": 1.8992745504546273, "language_loss": 0.86891937, "learning_rate": 3.7128205470065445e-06, "loss": 0.89539456, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.26379395, "step": 3285, "time_per_iteration": 2.860649347305298 }, { "auxiliary_loss_clip": 0.01570705, "auxiliary_loss_mlp": 0.01049647, "balance_loss_clip": 1.35573959, "balance_loss_mlp": 1.02404141, "epoch": 0.19756500826694723, "flos": 21881467768320.0, "grad_norm": 1.895361939845148, "language_loss": 0.89118326, "learning_rate": 3.712619437068174e-06, "loss": 0.91738677, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25610352, "step": 3286, "time_per_iteration": 2.8846237659454346 }, { "auxiliary_loss_clip": 0.01605014, "auxiliary_loss_mlp": 0.01049584, "balance_loss_clip": 1.38119519, "balance_loss_mlp": 1.02137959, "epoch": 0.19762513151961522, "flos": 15167416867200.0, "grad_norm": 2.414848410490199, "language_loss": 0.7892282, "learning_rate": 3.712418262187102e-06, "loss": 0.8157742, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.28222656, "step": 3287, "time_per_iteration": 2.866270065307617 }, { "auxiliary_loss_clip": 0.016037, "auxiliary_loss_mlp": 0.01048567, "balance_loss_clip": 1.38001096, "balance_loss_mlp": 1.02132785, "epoch": 0.1976852547722832, "flos": 16987032748800.0, "grad_norm": 1.8352889834187307, "language_loss": 0.82701433, "learning_rate": 3.7122170223709584e-06, "loss": 0.8535369, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 2.23632812, "router_z_loss_mlp": 0.27258301, "step": 3288, "time_per_iteration": 2.877490997314453 }, { "auxiliary_loss_clip": 0.01570923, "auxiliary_loss_mlp": 0.01051048, "balance_loss_clip": 1.35718226, "balance_loss_mlp": 1.02378476, "epoch": 0.19774537802495115, "flos": 20312563178880.0, "grad_norm": 1.6694984093507008, "language_loss": 0.73968446, "learning_rate": 3.712015717627374e-06, "loss": 0.76590419, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 2.13769531, "router_z_loss_mlp": 0.27258301, "step": 3289, "time_per_iteration": 2.854494094848633 }, { "auxiliary_loss_clip": 0.01581319, "auxiliary_loss_mlp": 0.01051385, "balance_loss_clip": 1.36480486, "balance_loss_mlp": 1.02475357, "epoch": 0.19780550127761912, "flos": 27246984094080.0, "grad_norm": 1.6480766040507855, "language_loss": 0.80345517, "learning_rate": 3.7118143479639813e-06, "loss": 0.82978225, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.26647949, "step": 3290, "time_per_iteration": 2.9184963703155518 }, { "auxiliary_loss_clip": 0.01357308, "auxiliary_loss_mlp": 0.01031009, "balance_loss_clip": 1.22887802, "balance_loss_mlp": 1.00564098, "epoch": 0.19786562453028708, "flos": 63584527680000.0, "grad_norm": 0.8989204677889219, "language_loss": 0.6041075, "learning_rate": 3.711612913388418e-06, "loss": 0.62799072, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.25390625, "step": 3291, "time_per_iteration": 3.396043300628662 }, { "auxiliary_loss_clip": 0.01597661, "auxiliary_loss_mlp": 0.01051569, "balance_loss_clip": 1.37280846, "balance_loss_mlp": 1.02405596, "epoch": 0.19792574778295505, "flos": 26297853782400.0, "grad_norm": 1.7428547501556686, "language_loss": 0.82499373, "learning_rate": 3.7114114139083204e-06, "loss": 0.85148609, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.27539062, "step": 3292, "time_per_iteration": 2.8627769947052 }, { "auxiliary_loss_clip": 0.01573328, "auxiliary_loss_mlp": 0.0104617, "balance_loss_clip": 1.3595103, "balance_loss_mlp": 1.02057528, "epoch": 0.19798587103562304, "flos": 19947662714880.0, "grad_norm": 2.796477354993951, "language_loss": 0.82360756, "learning_rate": 3.7112098495313313e-06, "loss": 0.84980249, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25610352, "step": 3293, "time_per_iteration": 4.233789920806885 }, { "auxiliary_loss_clip": 0.01619639, "auxiliary_loss_mlp": 0.01056208, "balance_loss_clip": 1.38883793, "balance_loss_mlp": 1.02912331, "epoch": 0.198045994288291, "flos": 20129818855680.0, "grad_norm": 1.7274948888138428, "language_loss": 0.62711805, "learning_rate": 3.711008220265093e-06, "loss": 0.65387648, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 2.31054688, "router_z_loss_mlp": 0.27099609, "step": 3294, "time_per_iteration": 2.975546360015869 }, { "auxiliary_loss_clip": 0.01583735, "auxiliary_loss_mlp": 0.01056071, "balance_loss_clip": 1.36488271, "balance_loss_mlp": 1.03127575, "epoch": 0.19810611754095897, "flos": 17976639173760.0, "grad_norm": 1.7586315843173386, "language_loss": 0.88180828, "learning_rate": 3.710806526117251e-06, "loss": 0.90820634, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24804688, "step": 3295, "time_per_iteration": 2.785015344619751 }, { "auxiliary_loss_clip": 0.01576409, "auxiliary_loss_mlp": 0.01056151, "balance_loss_clip": 1.36092722, "balance_loss_mlp": 1.03110468, "epoch": 0.19816624079362694, "flos": 15093160871040.0, "grad_norm": 2.141233259771441, "language_loss": 0.82214653, "learning_rate": 3.7106047670954544e-06, "loss": 0.84847206, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.25012207, "step": 3296, "time_per_iteration": 2.8380656242370605 }, { "auxiliary_loss_clip": 0.01600071, "auxiliary_loss_mlp": 0.01058223, "balance_loss_clip": 1.3772676, "balance_loss_mlp": 1.03068531, "epoch": 0.1982263640462949, "flos": 24911195823360.0, "grad_norm": 1.69564701735555, "language_loss": 0.68706948, "learning_rate": 3.710402943207354e-06, "loss": 0.71365243, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 2.22265625, "router_z_loss_mlp": 0.27563477, "step": 3297, "time_per_iteration": 2.8697750568389893 }, { "auxiliary_loss_clip": 0.01580398, "auxiliary_loss_mlp": 0.01049208, "balance_loss_clip": 1.3647027, "balance_loss_mlp": 1.0252347, "epoch": 0.19828648729896287, "flos": 20385914279040.0, "grad_norm": 2.062370779515307, "language_loss": 0.82210505, "learning_rate": 3.7102010544606016e-06, "loss": 0.84840113, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23986816, "step": 3298, "time_per_iteration": 2.858400583267212 }, { "auxiliary_loss_clip": 0.01609899, "auxiliary_loss_mlp": 0.01059131, "balance_loss_clip": 1.38383722, "balance_loss_mlp": 1.0323329, "epoch": 0.19834661055163083, "flos": 18889320159360.0, "grad_norm": 1.9131701166903587, "language_loss": 0.86167145, "learning_rate": 3.7099991008628544e-06, "loss": 0.88836181, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 2.26171875, "router_z_loss_mlp": 0.26794434, "step": 3299, "time_per_iteration": 2.8041765689849854 }, { "auxiliary_loss_clip": 0.01342641, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.21271873, "balance_loss_mlp": 1.00937176, "epoch": 0.19840673380429882, "flos": 60289790976000.0, "grad_norm": 0.7618795897688294, "language_loss": 0.53281176, "learning_rate": 3.7097970824217706e-06, "loss": 0.55655408, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.22265625, "step": 3300, "time_per_iteration": 3.3363473415374756 }, { "auxiliary_loss_clip": 0.01601072, "auxiliary_loss_mlp": 0.01067574, "balance_loss_clip": 1.38106489, "balance_loss_mlp": 1.04000044, "epoch": 0.1984668570569668, "flos": 19911665836800.0, "grad_norm": 1.764988500880052, "language_loss": 0.74175435, "learning_rate": 3.7095949991450093e-06, "loss": 0.76844084, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.27563477, "step": 3301, "time_per_iteration": 4.391502857208252 }, { "auxiliary_loss_clip": 0.01600292, "auxiliary_loss_mlp": 0.01056735, "balance_loss_clip": 1.37992895, "balance_loss_mlp": 1.03219008, "epoch": 0.19852698030963475, "flos": 15638679152640.0, "grad_norm": 3.00413769759054, "language_loss": 0.90051693, "learning_rate": 3.709392851040235e-06, "loss": 0.92708719, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.2454834, "step": 3302, "time_per_iteration": 4.369137763977051 }, { "auxiliary_loss_clip": 0.01602047, "auxiliary_loss_mlp": 0.01062443, "balance_loss_clip": 1.38206446, "balance_loss_mlp": 1.03780222, "epoch": 0.19858710356230272, "flos": 43158571752960.0, "grad_norm": 3.0994433223930735, "language_loss": 0.74771023, "learning_rate": 3.709190638115111e-06, "loss": 0.77435517, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24633789, "step": 3303, "time_per_iteration": 4.514683723449707 }, { "auxiliary_loss_clip": 0.01610283, "auxiliary_loss_mlp": 0.01066139, "balance_loss_clip": 1.38752294, "balance_loss_mlp": 1.03919792, "epoch": 0.19864722681497068, "flos": 35155356733440.0, "grad_norm": 1.8128670174043895, "language_loss": 0.76257706, "learning_rate": 3.7089883603773084e-06, "loss": 0.78934133, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.26928711, "step": 3304, "time_per_iteration": 2.94378399848938 }, { "auxiliary_loss_clip": 0.01594818, "auxiliary_loss_mlp": 0.01059994, "balance_loss_clip": 1.37661719, "balance_loss_mlp": 1.03579473, "epoch": 0.19870735006763865, "flos": 19435336133760.0, "grad_norm": 1.8040168864481487, "language_loss": 0.87150466, "learning_rate": 3.7087860178344955e-06, "loss": 0.89805281, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2421875, "step": 3305, "time_per_iteration": 2.8326048851013184 }, { "auxiliary_loss_clip": 0.01606519, "auxiliary_loss_mlp": 0.01059601, "balance_loss_clip": 1.38179207, "balance_loss_mlp": 1.03501987, "epoch": 0.19876747332030664, "flos": 23557141382400.0, "grad_norm": 1.601841815749736, "language_loss": 0.69354033, "learning_rate": 3.7085836104943445e-06, "loss": 0.72020149, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 2.24609375, "router_z_loss_mlp": 0.24597168, "step": 3306, "time_per_iteration": 2.9686787128448486 }, { "auxiliary_loss_clip": 0.0159914, "auxiliary_loss_mlp": 0.01056396, "balance_loss_clip": 1.37847805, "balance_loss_mlp": 1.03223228, "epoch": 0.1988275965729746, "flos": 19839129143040.0, "grad_norm": 1.6280183443263112, "language_loss": 0.7763176, "learning_rate": 3.7083811383645332e-06, "loss": 0.80287302, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.24169922, "step": 3307, "time_per_iteration": 3.003679037094116 }, { "auxiliary_loss_clip": 0.01601747, "auxiliary_loss_mlp": 0.01049487, "balance_loss_clip": 1.3816005, "balance_loss_mlp": 1.02548993, "epoch": 0.19888771982564257, "flos": 23524085416320.0, "grad_norm": 1.7467661581858134, "language_loss": 0.76948971, "learning_rate": 3.708178601452737e-06, "loss": 0.79600203, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.23986816, "step": 3308, "time_per_iteration": 2.930572509765625 }, { "auxiliary_loss_clip": 0.01591086, "auxiliary_loss_mlp": 0.01047848, "balance_loss_clip": 1.37043917, "balance_loss_mlp": 1.02141953, "epoch": 0.19894784307831054, "flos": 18159654965760.0, "grad_norm": 1.781612265836147, "language_loss": 0.77083975, "learning_rate": 3.7079759997666374e-06, "loss": 0.79722911, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26452637, "step": 3309, "time_per_iteration": 2.790924072265625 }, { "auxiliary_loss_clip": 0.01571177, "auxiliary_loss_mlp": 0.01051761, "balance_loss_clip": 1.35825253, "balance_loss_mlp": 1.02594018, "epoch": 0.1990079663309785, "flos": 24285720700800.0, "grad_norm": 1.7273304963489382, "language_loss": 0.88641691, "learning_rate": 3.707773333313917e-06, "loss": 0.91264635, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25793457, "step": 3310, "time_per_iteration": 2.8617002964019775 }, { "auxiliary_loss_clip": 0.01585834, "auxiliary_loss_mlp": 0.01042228, "balance_loss_clip": 1.36844134, "balance_loss_mlp": 1.01674092, "epoch": 0.19906808958364647, "flos": 34911205937280.0, "grad_norm": 2.271604908780493, "language_loss": 0.65788293, "learning_rate": 3.70757060210226e-06, "loss": 0.68416357, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25512695, "step": 3311, "time_per_iteration": 2.946624279022217 }, { "auxiliary_loss_clip": 0.01587747, "auxiliary_loss_mlp": 0.01049179, "balance_loss_clip": 1.36673045, "balance_loss_mlp": 1.0222857, "epoch": 0.19912821283631443, "flos": 24035597591040.0, "grad_norm": 2.287732287187971, "language_loss": 0.75920779, "learning_rate": 3.707367806139355e-06, "loss": 0.78557706, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.26916504, "step": 3312, "time_per_iteration": 2.867140293121338 }, { "auxiliary_loss_clip": 0.0159813, "auxiliary_loss_mlp": 0.01047008, "balance_loss_clip": 1.37907028, "balance_loss_mlp": 1.02085376, "epoch": 0.19918833608898243, "flos": 19866981957120.0, "grad_norm": 1.9764003994313841, "language_loss": 0.84555376, "learning_rate": 3.7071649454328915e-06, "loss": 0.87200516, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26135254, "step": 3313, "time_per_iteration": 2.8573191165924072 }, { "auxiliary_loss_clip": 0.01585958, "auxiliary_loss_mlp": 0.01042945, "balance_loss_clip": 1.36775017, "balance_loss_mlp": 1.01809025, "epoch": 0.1992484593416504, "flos": 29107664271360.0, "grad_norm": 2.022588819567331, "language_loss": 0.82256997, "learning_rate": 3.7069620199905625e-06, "loss": 0.84885901, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.2487793, "step": 3314, "time_per_iteration": 2.892676830291748 }, { "auxiliary_loss_clip": 0.01562121, "auxiliary_loss_mlp": 0.01048627, "balance_loss_clip": 1.3487289, "balance_loss_mlp": 1.02244854, "epoch": 0.19930858259431836, "flos": 23305479949440.0, "grad_norm": 1.6198228547806657, "language_loss": 0.886774, "learning_rate": 3.7067590298200627e-06, "loss": 0.91288149, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.26171875, "step": 3315, "time_per_iteration": 2.8754024505615234 }, { "auxiliary_loss_clip": 0.01588824, "auxiliary_loss_mlp": 0.0104716, "balance_loss_clip": 1.36892164, "balance_loss_mlp": 1.02220988, "epoch": 0.19936870584698632, "flos": 25390918886400.0, "grad_norm": 1.5243603334168598, "language_loss": 0.72640395, "learning_rate": 3.7065559749290892e-06, "loss": 0.75276375, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24951172, "step": 3316, "time_per_iteration": 2.88999342918396 }, { "auxiliary_loss_clip": 0.0133288, "auxiliary_loss_mlp": 0.01047002, "balance_loss_clip": 1.19430089, "balance_loss_mlp": 1.02201545, "epoch": 0.1994288290996543, "flos": 62202167976960.0, "grad_norm": 0.8398729977474604, "language_loss": 0.66383195, "learning_rate": 3.706352855325342e-06, "loss": 0.68763077, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 1.390625, "router_z_loss_mlp": 0.25, "step": 3317, "time_per_iteration": 3.430983304977417 }, { "auxiliary_loss_clip": 0.0160353, "auxiliary_loss_mlp": 0.01051441, "balance_loss_clip": 1.38019681, "balance_loss_mlp": 1.02533436, "epoch": 0.19948895235232225, "flos": 19035117463680.0, "grad_norm": 1.929866835525955, "language_loss": 0.75623906, "learning_rate": 3.7061496710165233e-06, "loss": 0.78278875, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26098633, "step": 3318, "time_per_iteration": 2.819131374359131 }, { "auxiliary_loss_clip": 0.01588628, "auxiliary_loss_mlp": 0.01044774, "balance_loss_clip": 1.37260079, "balance_loss_mlp": 1.01933527, "epoch": 0.19954907560499022, "flos": 37829911956480.0, "grad_norm": 1.8023782455180206, "language_loss": 0.8049854, "learning_rate": 3.7059464220103385e-06, "loss": 0.83131945, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25488281, "step": 3319, "time_per_iteration": 2.964843511581421 }, { "auxiliary_loss_clip": 0.01600713, "auxiliary_loss_mlp": 0.01049664, "balance_loss_clip": 1.38024092, "balance_loss_mlp": 1.02309251, "epoch": 0.1996091988576582, "flos": 49582611613440.0, "grad_norm": 1.998896533093924, "language_loss": 0.77414382, "learning_rate": 3.7057431083144945e-06, "loss": 0.80064756, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.26574707, "step": 3320, "time_per_iteration": 3.0875680446624756 }, { "auxiliary_loss_clip": 0.01599697, "auxiliary_loss_mlp": 0.010512, "balance_loss_clip": 1.38045859, "balance_loss_mlp": 1.02505779, "epoch": 0.19966932211032618, "flos": 22645455782400.0, "grad_norm": 1.4908708700039546, "language_loss": 0.80924153, "learning_rate": 3.705539729936701e-06, "loss": 0.83575046, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26135254, "step": 3321, "time_per_iteration": 2.8509926795959473 }, { "auxiliary_loss_clip": 0.01333701, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.19938636, "balance_loss_mlp": 1.00489962, "epoch": 0.19972944536299414, "flos": 54108273098880.0, "grad_norm": 0.8735789687098219, "language_loss": 0.65390891, "learning_rate": 3.7053362868846696e-06, "loss": 0.67751616, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.22167969, "step": 3322, "time_per_iteration": 3.1475648880004883 }, { "auxiliary_loss_clip": 0.01334422, "auxiliary_loss_mlp": 0.01022758, "balance_loss_clip": 1.20032489, "balance_loss_mlp": 1.00139618, "epoch": 0.1997895686156621, "flos": 69381355132800.0, "grad_norm": 0.7856044536088392, "language_loss": 0.57146972, "learning_rate": 3.7051327791661153e-06, "loss": 0.59504151, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.21386719, "step": 3323, "time_per_iteration": 3.4460341930389404 }, { "auxiliary_loss_clip": 0.01588804, "auxiliary_loss_mlp": 0.01064371, "balance_loss_clip": 1.37481916, "balance_loss_mlp": 1.03943205, "epoch": 0.19984969186833007, "flos": 18561366714240.0, "grad_norm": 1.892816342301302, "language_loss": 0.81916547, "learning_rate": 3.7049292067887555e-06, "loss": 0.84569722, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24951172, "step": 3324, "time_per_iteration": 2.8029603958129883 }, { "auxiliary_loss_clip": 0.01602037, "auxiliary_loss_mlp": 0.01068454, "balance_loss_clip": 1.38360667, "balance_loss_mlp": 1.04241884, "epoch": 0.19990981512099804, "flos": 26440167237120.0, "grad_norm": 1.4669472397419587, "language_loss": 0.54415047, "learning_rate": 3.7047255697603092e-06, "loss": 0.57085538, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.26049805, "step": 3325, "time_per_iteration": 2.9312896728515625 }, { "auxiliary_loss_clip": 0.0160502, "auxiliary_loss_mlp": 0.01074724, "balance_loss_clip": 1.38558745, "balance_loss_mlp": 1.0495584, "epoch": 0.19996993837366603, "flos": 16334609708160.0, "grad_norm": 1.9647414134266359, "language_loss": 0.86422229, "learning_rate": 3.7045218680884984e-06, "loss": 0.8910197, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25158691, "step": 3326, "time_per_iteration": 2.879739999771118 }, { "auxiliary_loss_clip": 0.01584698, "auxiliary_loss_mlp": 0.01065265, "balance_loss_clip": 1.37037134, "balance_loss_mlp": 1.0396229, "epoch": 0.200030061626334, "flos": 20852380615680.0, "grad_norm": 1.8704057608302813, "language_loss": 0.72588658, "learning_rate": 3.7043181017810476e-06, "loss": 0.75238621, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25598145, "step": 3327, "time_per_iteration": 2.973954439163208 }, { "auxiliary_loss_clip": 0.01602212, "auxiliary_loss_mlp": 0.01077083, "balance_loss_clip": 1.38121891, "balance_loss_mlp": 1.05033231, "epoch": 0.20009018487900196, "flos": 23771493838080.0, "grad_norm": 1.8698855012982523, "language_loss": 0.77425569, "learning_rate": 3.7041142708456833e-06, "loss": 0.80104858, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26757812, "step": 3328, "time_per_iteration": 4.272123336791992 }, { "auxiliary_loss_clip": 0.01567568, "auxiliary_loss_mlp": 0.01067463, "balance_loss_clip": 1.35751426, "balance_loss_mlp": 1.04337108, "epoch": 0.20015030813166992, "flos": 28123396732800.0, "grad_norm": 1.6536606501837847, "language_loss": 0.69739771, "learning_rate": 3.7039103752901353e-06, "loss": 0.72374797, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.2409668, "step": 3329, "time_per_iteration": 2.949007034301758 }, { "auxiliary_loss_clip": 0.01593675, "auxiliary_loss_mlp": 0.01068403, "balance_loss_clip": 1.3756876, "balance_loss_mlp": 1.03907752, "epoch": 0.2002104313843379, "flos": 26078162440320.0, "grad_norm": 1.5945467520206502, "language_loss": 0.82213843, "learning_rate": 3.7037064151221353e-06, "loss": 0.84875911, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.29333496, "step": 3330, "time_per_iteration": 2.862922191619873 }, { "auxiliary_loss_clip": 0.01588819, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.36899567, "balance_loss_mlp": 1.03683913, "epoch": 0.20027055463700585, "flos": 22977028811520.0, "grad_norm": 3.8702071395681266, "language_loss": 0.76899815, "learning_rate": 3.703502390349417e-06, "loss": 0.79552037, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 2.1953125, "router_z_loss_mlp": 0.26550293, "step": 3331, "time_per_iteration": 2.867485284805298 }, { "auxiliary_loss_clip": 0.01608316, "auxiliary_loss_mlp": 0.01079765, "balance_loss_clip": 1.38770103, "balance_loss_mlp": 1.05210829, "epoch": 0.20033067788967382, "flos": 17174618265600.0, "grad_norm": 1.9192987625308315, "language_loss": 0.80208671, "learning_rate": 3.7032983009797176e-06, "loss": 0.82896751, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27661133, "step": 3332, "time_per_iteration": 2.809671640396118 }, { "auxiliary_loss_clip": 0.01329291, "auxiliary_loss_mlp": 0.01059762, "balance_loss_clip": 1.19492495, "balance_loss_mlp": 1.0351572, "epoch": 0.2003908011423418, "flos": 60851805626880.0, "grad_norm": 0.9533613491468712, "language_loss": 0.62021172, "learning_rate": 3.703094147020776e-06, "loss": 0.64410228, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 1.34375, "router_z_loss_mlp": 0.24511719, "step": 3333, "time_per_iteration": 3.2643260955810547 }, { "auxiliary_loss_clip": 0.01586205, "auxiliary_loss_mlp": 0.01056286, "balance_loss_clip": 1.36522865, "balance_loss_mlp": 1.03001237, "epoch": 0.20045092439500978, "flos": 24216713101440.0, "grad_norm": 1.7806025653426771, "language_loss": 0.82160246, "learning_rate": 3.7028899284803334e-06, "loss": 0.84802735, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26281738, "step": 3334, "time_per_iteration": 2.874650478363037 }, { "auxiliary_loss_clip": 0.01588854, "auxiliary_loss_mlp": 0.01059805, "balance_loss_clip": 1.36777544, "balance_loss_mlp": 1.03278041, "epoch": 0.20051104764767774, "flos": 29399530348800.0, "grad_norm": 2.4537902393964823, "language_loss": 0.75845802, "learning_rate": 3.702685645366134e-06, "loss": 0.78494465, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27062988, "step": 3335, "time_per_iteration": 4.2998998165130615 }, { "auxiliary_loss_clip": 0.01586759, "auxiliary_loss_mlp": 0.0105666, "balance_loss_clip": 1.36848915, "balance_loss_mlp": 1.0301832, "epoch": 0.2005711709003457, "flos": 23524311640320.0, "grad_norm": 2.0922648465740235, "language_loss": 0.81007087, "learning_rate": 3.7024812976859243e-06, "loss": 0.83650506, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26525879, "step": 3336, "time_per_iteration": 2.887704849243164 }, { "auxiliary_loss_clip": 0.0158974, "auxiliary_loss_mlp": 0.01051289, "balance_loss_clip": 1.36758733, "balance_loss_mlp": 1.02351367, "epoch": 0.20063129415301367, "flos": 22532261996160.0, "grad_norm": 3.5389855101489682, "language_loss": 0.79080951, "learning_rate": 3.7022768854474532e-06, "loss": 0.81721985, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 2.22070312, "router_z_loss_mlp": 0.27746582, "step": 3337, "time_per_iteration": 4.26805567741394 }, { "auxiliary_loss_clip": 0.01586676, "auxiliary_loss_mlp": 0.01045316, "balance_loss_clip": 1.36757278, "balance_loss_mlp": 1.01870894, "epoch": 0.20069141740568164, "flos": 25969357399680.0, "grad_norm": 1.9834392692861786, "language_loss": 0.70029432, "learning_rate": 3.7020724086584724e-06, "loss": 0.72661424, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26611328, "step": 3338, "time_per_iteration": 4.306506156921387 }, { "auxiliary_loss_clip": 0.01582696, "auxiliary_loss_mlp": 0.01048834, "balance_loss_clip": 1.36461306, "balance_loss_mlp": 1.02289438, "epoch": 0.2007515406583496, "flos": 24801395397120.0, "grad_norm": 2.0088450045011372, "language_loss": 0.70591664, "learning_rate": 3.701867867326735e-06, "loss": 0.73223197, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25927734, "step": 3339, "time_per_iteration": 2.9037976264953613 }, { "auxiliary_loss_clip": 0.01593626, "auxiliary_loss_mlp": 0.01042267, "balance_loss_clip": 1.37043238, "balance_loss_mlp": 1.01513481, "epoch": 0.2008116639110176, "flos": 37939802872320.0, "grad_norm": 2.222300680736885, "language_loss": 0.673666, "learning_rate": 3.7016632614599974e-06, "loss": 0.7000249, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.27124023, "step": 3340, "time_per_iteration": 3.0024187564849854 }, { "auxiliary_loss_clip": 0.01586252, "auxiliary_loss_mlp": 0.0104539, "balance_loss_clip": 1.36362672, "balance_loss_mlp": 1.01806736, "epoch": 0.20087178716368556, "flos": 20750226560640.0, "grad_norm": 2.1512200938767165, "language_loss": 0.74818164, "learning_rate": 3.701458591066019e-06, "loss": 0.77449805, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 2.22460938, "router_z_loss_mlp": 0.2734375, "step": 3341, "time_per_iteration": 2.9285974502563477 }, { "auxiliary_loss_clip": 0.01564848, "auxiliary_loss_mlp": 0.01047288, "balance_loss_clip": 1.35291338, "balance_loss_mlp": 1.02157474, "epoch": 0.20093191041635353, "flos": 23852898512640.0, "grad_norm": 1.771185986963292, "language_loss": 0.73245651, "learning_rate": 3.70125385615256e-06, "loss": 0.75857782, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.25744629, "step": 3342, "time_per_iteration": 2.849736452102661 }, { "auxiliary_loss_clip": 0.01582418, "auxiliary_loss_mlp": 0.01044066, "balance_loss_clip": 1.36224377, "balance_loss_mlp": 1.01788759, "epoch": 0.2009920336690215, "flos": 21800334562560.0, "grad_norm": 1.7792059932016684, "language_loss": 0.7330147, "learning_rate": 3.701049056727384e-06, "loss": 0.75927961, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26220703, "step": 3343, "time_per_iteration": 2.8210582733154297 }, { "auxiliary_loss_clip": 0.01574326, "auxiliary_loss_mlp": 0.01053009, "balance_loss_clip": 1.355986, "balance_loss_mlp": 1.0248754, "epoch": 0.20105215692168946, "flos": 26370345231360.0, "grad_norm": 2.0180269688608314, "language_loss": 0.82461232, "learning_rate": 3.7008441927982574e-06, "loss": 0.85088563, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.28173828, "step": 3344, "time_per_iteration": 2.861100673675537 }, { "auxiliary_loss_clip": 0.01586797, "auxiliary_loss_mlp": 0.01048644, "balance_loss_clip": 1.36431682, "balance_loss_mlp": 1.02103496, "epoch": 0.20111228017435742, "flos": 18816602486400.0, "grad_norm": 14.210659737902155, "language_loss": 0.84576428, "learning_rate": 3.700639264372948e-06, "loss": 0.87211871, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27587891, "step": 3345, "time_per_iteration": 2.9493563175201416 }, { "auxiliary_loss_clip": 0.01552171, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.34472752, "balance_loss_mlp": 1.02090049, "epoch": 0.20117240342702541, "flos": 19984745468160.0, "grad_norm": 2.2528793433826606, "language_loss": 0.68853819, "learning_rate": 3.7004342714592283e-06, "loss": 0.71452844, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.2598877, "step": 3346, "time_per_iteration": 2.8190698623657227 }, { "auxiliary_loss_clip": 0.01566376, "auxiliary_loss_mlp": 0.01050645, "balance_loss_clip": 1.35100985, "balance_loss_mlp": 1.02322721, "epoch": 0.20123252667969338, "flos": 23151493336320.0, "grad_norm": 2.0325933506550817, "language_loss": 0.75358665, "learning_rate": 3.70022921406487e-06, "loss": 0.77975684, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27416992, "step": 3347, "time_per_iteration": 2.85384202003479 }, { "auxiliary_loss_clip": 0.01575192, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.36037636, "balance_loss_mlp": 1.02203774, "epoch": 0.20129264993236134, "flos": 23232038359680.0, "grad_norm": 1.5476273844590394, "language_loss": 0.87424833, "learning_rate": 3.70002409219765e-06, "loss": 0.90046597, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.2454834, "step": 3348, "time_per_iteration": 2.8500397205352783 }, { "auxiliary_loss_clip": 0.01561283, "auxiliary_loss_mlp": 0.01049103, "balance_loss_clip": 1.35008824, "balance_loss_mlp": 1.02117264, "epoch": 0.2013527731850293, "flos": 21881422523520.0, "grad_norm": 1.6204012455760757, "language_loss": 0.72134966, "learning_rate": 3.699818905865346e-06, "loss": 0.74745357, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.27905273, "step": 3349, "time_per_iteration": 2.827141046524048 }, { "auxiliary_loss_clip": 0.01560071, "auxiliary_loss_mlp": 0.01047, "balance_loss_clip": 1.34725988, "balance_loss_mlp": 1.01906896, "epoch": 0.20141289643769728, "flos": 18049854539520.0, "grad_norm": 1.6261137327040167, "language_loss": 0.72691935, "learning_rate": 3.6996136550757377e-06, "loss": 0.75299007, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.27929688, "step": 3350, "time_per_iteration": 2.8394765853881836 }, { "auxiliary_loss_clip": 0.01562639, "auxiliary_loss_mlp": 0.01045775, "balance_loss_clip": 1.3454603, "balance_loss_mlp": 1.01871431, "epoch": 0.20147301969036524, "flos": 23961975022080.0, "grad_norm": 2.7163386080977507, "language_loss": 0.77451253, "learning_rate": 3.69940833983661e-06, "loss": 0.80059659, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.27062988, "step": 3351, "time_per_iteration": 2.8969576358795166 }, { "auxiliary_loss_clip": 0.01585412, "auxiliary_loss_mlp": 0.01046356, "balance_loss_clip": 1.36264205, "balance_loss_mlp": 1.01891398, "epoch": 0.2015331429430332, "flos": 25598213153280.0, "grad_norm": 1.468398291704413, "language_loss": 0.81827581, "learning_rate": 3.699202960155748e-06, "loss": 0.84459341, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.27416992, "step": 3352, "time_per_iteration": 2.9369630813598633 }, { "auxiliary_loss_clip": 0.01568057, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 1.35160506, "balance_loss_mlp": 1.01830745, "epoch": 0.2015932661957012, "flos": 26736422060160.0, "grad_norm": 3.909242091666632, "language_loss": 0.81689525, "learning_rate": 3.6989975160409396e-06, "loss": 0.84302485, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26599121, "step": 3353, "time_per_iteration": 2.8768374919891357 }, { "auxiliary_loss_clip": 0.01568384, "auxiliary_loss_mlp": 0.01042793, "balance_loss_clip": 1.3553865, "balance_loss_mlp": 1.01772332, "epoch": 0.20165338944836916, "flos": 15641665309440.0, "grad_norm": 1.6819665675367104, "language_loss": 0.91336012, "learning_rate": 3.6987920074999747e-06, "loss": 0.9394719, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25073242, "step": 3354, "time_per_iteration": 2.879809617996216 }, { "auxiliary_loss_clip": 0.01334869, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.20712066, "balance_loss_mlp": 1.00572836, "epoch": 0.20171351270103713, "flos": 57939524369280.0, "grad_norm": 0.8542660202662613, "language_loss": 0.55932653, "learning_rate": 3.6985864345406465e-06, "loss": 0.58296621, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.23339844, "step": 3355, "time_per_iteration": 3.3341612815856934 }, { "auxiliary_loss_clip": 0.01555245, "auxiliary_loss_mlp": 0.01056335, "balance_loss_clip": 1.3441633, "balance_loss_mlp": 1.03142083, "epoch": 0.2017736359537051, "flos": 20824301577600.0, "grad_norm": 1.5353229483779283, "language_loss": 0.84946835, "learning_rate": 3.698380797170751e-06, "loss": 0.87558424, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.24902344, "step": 3356, "time_per_iteration": 2.9603211879730225 }, { "auxiliary_loss_clip": 0.01596698, "auxiliary_loss_mlp": 0.01051274, "balance_loss_clip": 1.36980689, "balance_loss_mlp": 1.02364111, "epoch": 0.20183375920637306, "flos": 17100814717440.0, "grad_norm": 5.12005868688605, "language_loss": 0.70976049, "learning_rate": 3.698175095398085e-06, "loss": 0.73624015, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 2.26757812, "router_z_loss_mlp": 0.27612305, "step": 3357, "time_per_iteration": 2.899057149887085 }, { "auxiliary_loss_clip": 0.01574057, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.35492134, "balance_loss_mlp": 1.0194521, "epoch": 0.20189388245904102, "flos": 18670624202880.0, "grad_norm": 1.82743947712233, "language_loss": 0.73174816, "learning_rate": 3.6979693292304493e-06, "loss": 0.75794333, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.2598877, "step": 3358, "time_per_iteration": 2.7870848178863525 }, { "auxiliary_loss_clip": 0.01541686, "auxiliary_loss_mlp": 0.01053729, "balance_loss_clip": 1.3343184, "balance_loss_mlp": 1.02879024, "epoch": 0.20195400571170902, "flos": 16805555280000.0, "grad_norm": 1.6989074160663957, "language_loss": 0.84124184, "learning_rate": 3.6977634986756463e-06, "loss": 0.86719596, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24926758, "step": 3359, "time_per_iteration": 2.8331081867218018 }, { "auxiliary_loss_clip": 0.01326239, "auxiliary_loss_mlp": 0.01051141, "balance_loss_clip": 1.19755876, "balance_loss_mlp": 1.03120935, "epoch": 0.20201412896437698, "flos": 67202765856000.0, "grad_norm": 0.7756399794665564, "language_loss": 0.59048927, "learning_rate": 3.697557603741482e-06, "loss": 0.61426306, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.19921875, "step": 3360, "time_per_iteration": 3.3646411895751953 }, { "auxiliary_loss_clip": 0.01575638, "auxiliary_loss_mlp": 0.01049979, "balance_loss_clip": 1.35691047, "balance_loss_mlp": 1.02301383, "epoch": 0.20207425221704495, "flos": 21335044590720.0, "grad_norm": 2.1923864211094988, "language_loss": 0.64366353, "learning_rate": 3.697351644435763e-06, "loss": 0.66991973, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26953125, "step": 3361, "time_per_iteration": 2.8059589862823486 }, { "auxiliary_loss_clip": 0.01563754, "auxiliary_loss_mlp": 0.01050612, "balance_loss_clip": 1.35095561, "balance_loss_mlp": 1.02488685, "epoch": 0.2021343754697129, "flos": 22537012700160.0, "grad_norm": 2.1454466725536627, "language_loss": 0.7700336, "learning_rate": 3.6971456207662993e-06, "loss": 0.79617727, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25732422, "step": 3362, "time_per_iteration": 2.8746564388275146 }, { "auxiliary_loss_clip": 0.01563777, "auxiliary_loss_mlp": 0.01048195, "balance_loss_clip": 1.3508513, "balance_loss_mlp": 1.02297008, "epoch": 0.20219449872238088, "flos": 19072200216960.0, "grad_norm": 1.6759195322879317, "language_loss": 0.77539647, "learning_rate": 3.6969395327409035e-06, "loss": 0.80151612, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25231934, "step": 3363, "time_per_iteration": 4.254496812820435 }, { "auxiliary_loss_clip": 0.01563041, "auxiliary_loss_mlp": 0.0104686, "balance_loss_clip": 1.34997082, "balance_loss_mlp": 1.02181375, "epoch": 0.20225462197504884, "flos": 24727637093760.0, "grad_norm": 1.4112282268537386, "language_loss": 0.76006937, "learning_rate": 3.696733380367391e-06, "loss": 0.7861684, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25048828, "step": 3364, "time_per_iteration": 2.881741523742676 }, { "auxiliary_loss_clip": 0.01581925, "auxiliary_loss_mlp": 0.01052476, "balance_loss_clip": 1.36285067, "balance_loss_mlp": 1.02559435, "epoch": 0.2023147452277168, "flos": 22028531927040.0, "grad_norm": 1.8897377600214003, "language_loss": 0.72265816, "learning_rate": 3.6965271636535783e-06, "loss": 0.74900222, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26879883, "step": 3365, "time_per_iteration": 2.8943593502044678 }, { "auxiliary_loss_clip": 0.01569021, "auxiliary_loss_mlp": 0.01050553, "balance_loss_clip": 1.35467625, "balance_loss_mlp": 1.02433896, "epoch": 0.2023748684803848, "flos": 17753418737280.0, "grad_norm": 1.806542196539308, "language_loss": 0.86498231, "learning_rate": 3.696320882607286e-06, "loss": 0.89117801, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26257324, "step": 3366, "time_per_iteration": 2.8551669120788574 }, { "auxiliary_loss_clip": 0.0156845, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.35550869, "balance_loss_mlp": 1.01659298, "epoch": 0.20243499173305277, "flos": 31151314995840.0, "grad_norm": 1.5525971625876667, "language_loss": 0.70110261, "learning_rate": 3.696114537236335e-06, "loss": 0.72721326, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.26013184, "step": 3367, "time_per_iteration": 2.920126438140869 }, { "auxiliary_loss_clip": 0.01581005, "auxiliary_loss_mlp": 0.01044249, "balance_loss_clip": 1.36041212, "balance_loss_mlp": 1.01616359, "epoch": 0.20249511498572073, "flos": 33852410933760.0, "grad_norm": 1.8250431160483767, "language_loss": 0.69113064, "learning_rate": 3.6959081275485512e-06, "loss": 0.71738315, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.28063965, "step": 3368, "time_per_iteration": 2.926661491394043 }, { "auxiliary_loss_clip": 0.01561295, "auxiliary_loss_mlp": 0.01048439, "balance_loss_clip": 1.34946942, "balance_loss_mlp": 1.02196252, "epoch": 0.2025552382383887, "flos": 21225651367680.0, "grad_norm": 1.5039366155660086, "language_loss": 0.78484261, "learning_rate": 3.6957016535517615e-06, "loss": 0.81093991, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.26464844, "step": 3369, "time_per_iteration": 2.895616292953491 }, { "auxiliary_loss_clip": 0.01570482, "auxiliary_loss_mlp": 0.01057808, "balance_loss_clip": 1.35147834, "balance_loss_mlp": 1.03131962, "epoch": 0.20261536149105666, "flos": 14655497489280.0, "grad_norm": 3.791362517507997, "language_loss": 0.67005777, "learning_rate": 3.695495115253795e-06, "loss": 0.69634068, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26513672, "step": 3370, "time_per_iteration": 2.795042037963867 }, { "auxiliary_loss_clip": 0.01313703, "auxiliary_loss_mlp": 0.01058501, "balance_loss_clip": 1.19372869, "balance_loss_mlp": 1.03694785, "epoch": 0.20267548474372463, "flos": 66814265589120.0, "grad_norm": 0.6895483383539973, "language_loss": 0.5820756, "learning_rate": 3.6952885126624834e-06, "loss": 0.60579765, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.21582031, "step": 3371, "time_per_iteration": 4.844992399215698 }, { "auxiliary_loss_clip": 0.01557455, "auxiliary_loss_mlp": 0.01050549, "balance_loss_clip": 1.34516788, "balance_loss_mlp": 1.02361929, "epoch": 0.2027356079963926, "flos": 24691459236480.0, "grad_norm": 1.524627567641125, "language_loss": 0.92225122, "learning_rate": 3.6950818457856617e-06, "loss": 0.94833124, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.26940918, "step": 3372, "time_per_iteration": 4.343122243881226 }, { "auxiliary_loss_clip": 0.01573193, "auxiliary_loss_mlp": 0.01053591, "balance_loss_clip": 1.35607803, "balance_loss_mlp": 1.0251472, "epoch": 0.20279573124906058, "flos": 26403129728640.0, "grad_norm": 1.695656185109376, "language_loss": 0.79044497, "learning_rate": 3.694875114631167e-06, "loss": 0.81671274, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.28442383, "step": 3373, "time_per_iteration": 4.210289001464844 }, { "auxiliary_loss_clip": 0.01551678, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.34427392, "balance_loss_mlp": 1.02112961, "epoch": 0.20285585450172855, "flos": 33812432513280.0, "grad_norm": 1.7280921838951797, "language_loss": 0.72238815, "learning_rate": 3.6946683192068377e-06, "loss": 0.74838948, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.27294922, "step": 3374, "time_per_iteration": 2.946377992630005 }, { "auxiliary_loss_clip": 0.01313018, "auxiliary_loss_mlp": 0.010288, "balance_loss_clip": 1.19479227, "balance_loss_mlp": 1.00247908, "epoch": 0.20291597775439651, "flos": 71197324168320.0, "grad_norm": 0.9735946020547792, "language_loss": 0.62473059, "learning_rate": 3.694461459520516e-06, "loss": 0.64814878, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.26367188, "step": 3375, "time_per_iteration": 3.3196218013763428 }, { "auxiliary_loss_clip": 0.01553456, "auxiliary_loss_mlp": 0.01048339, "balance_loss_clip": 1.34194064, "balance_loss_mlp": 1.02199411, "epoch": 0.20297610100706448, "flos": 19502171982720.0, "grad_norm": 1.577872935996985, "language_loss": 0.83048964, "learning_rate": 3.6942545355800463e-06, "loss": 0.85650754, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.26367188, "step": 3376, "time_per_iteration": 2.822425127029419 }, { "auxiliary_loss_clip": 0.01573252, "auxiliary_loss_mlp": 0.01050424, "balance_loss_clip": 1.35553503, "balance_loss_mlp": 1.02236223, "epoch": 0.20303622425973245, "flos": 25054640398080.0, "grad_norm": 1.919334084219983, "language_loss": 0.82510549, "learning_rate": 3.6940475473932743e-06, "loss": 0.85134226, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.28051758, "step": 3377, "time_per_iteration": 2.8600258827209473 }, { "auxiliary_loss_clip": 0.01568752, "auxiliary_loss_mlp": 0.01056629, "balance_loss_clip": 1.35478139, "balance_loss_mlp": 1.03036773, "epoch": 0.2030963475124004, "flos": 21989729871360.0, "grad_norm": 1.8155610280813599, "language_loss": 0.78310513, "learning_rate": 3.69384049496805e-06, "loss": 0.80935895, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26281738, "step": 3378, "time_per_iteration": 2.8825161457061768 }, { "auxiliary_loss_clip": 0.01559555, "auxiliary_loss_mlp": 0.01049681, "balance_loss_clip": 1.34288204, "balance_loss_mlp": 1.02190542, "epoch": 0.2031564707650684, "flos": 19509863598720.0, "grad_norm": 1.8209140951875677, "language_loss": 0.81249887, "learning_rate": 3.6936333783122242e-06, "loss": 0.83859122, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.27770996, "step": 3379, "time_per_iteration": 2.8480472564697266 }, { "auxiliary_loss_clip": 0.01546424, "auxiliary_loss_mlp": 0.0105419, "balance_loss_clip": 1.33819032, "balance_loss_mlp": 1.028512, "epoch": 0.20321659401773637, "flos": 22757246979840.0, "grad_norm": 1.6918580032740358, "language_loss": 0.87753922, "learning_rate": 3.6934261974336505e-06, "loss": 0.90354538, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.25708008, "step": 3380, "time_per_iteration": 2.8253355026245117 }, { "auxiliary_loss_clip": 0.01566852, "auxiliary_loss_mlp": 0.01050224, "balance_loss_clip": 1.35207546, "balance_loss_mlp": 1.02402163, "epoch": 0.20327671727040433, "flos": 22466104819200.0, "grad_norm": 1.708777914703145, "language_loss": 0.76351237, "learning_rate": 3.693218952340186e-06, "loss": 0.78968316, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 2.14550781, "router_z_loss_mlp": 0.26208496, "step": 3381, "time_per_iteration": 2.870901107788086 }, { "auxiliary_loss_clip": 0.01576797, "auxiliary_loss_mlp": 0.01053446, "balance_loss_clip": 1.35819292, "balance_loss_mlp": 1.026564, "epoch": 0.2033368405230723, "flos": 19544005440000.0, "grad_norm": 1.8255344953339325, "language_loss": 0.80834186, "learning_rate": 3.6930116430396895e-06, "loss": 0.83464426, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26855469, "step": 3382, "time_per_iteration": 2.830556631088257 }, { "auxiliary_loss_clip": 0.01579837, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.36190856, "balance_loss_mlp": 1.02097702, "epoch": 0.20339696377574026, "flos": 13817389213440.0, "grad_norm": 1.7896673548708524, "language_loss": 0.81402779, "learning_rate": 3.6928042695400214e-06, "loss": 0.84031701, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.28100586, "step": 3383, "time_per_iteration": 2.8066368103027344 }, { "auxiliary_loss_clip": 0.01568814, "auxiliary_loss_mlp": 0.01046383, "balance_loss_clip": 1.35407245, "balance_loss_mlp": 1.01970446, "epoch": 0.20345708702840823, "flos": 20349057749760.0, "grad_norm": 1.8452084591068358, "language_loss": 0.75707567, "learning_rate": 3.6925968318490464e-06, "loss": 0.78322762, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.26672363, "step": 3384, "time_per_iteration": 2.803147792816162 }, { "auxiliary_loss_clip": 0.01592791, "auxiliary_loss_mlp": 0.0105075, "balance_loss_clip": 1.36652792, "balance_loss_mlp": 1.022259, "epoch": 0.2035172102810762, "flos": 20342180540160.0, "grad_norm": 2.720252133128083, "language_loss": 0.78308415, "learning_rate": 3.6923893299746293e-06, "loss": 0.80951959, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 2.25976562, "router_z_loss_mlp": 0.28479004, "step": 3385, "time_per_iteration": 2.77998423576355 }, { "auxiliary_loss_clip": 0.0157207, "auxiliary_loss_mlp": 0.01060598, "balance_loss_clip": 1.35660267, "balance_loss_mlp": 1.03284597, "epoch": 0.2035773335337442, "flos": 23341341093120.0, "grad_norm": 1.8022732265513657, "language_loss": 0.70314646, "learning_rate": 3.692181763924639e-06, "loss": 0.72947311, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.27770996, "step": 3386, "time_per_iteration": 2.8637287616729736 }, { "auxiliary_loss_clip": 0.01579987, "auxiliary_loss_mlp": 0.01056308, "balance_loss_clip": 1.36350214, "balance_loss_mlp": 1.02920032, "epoch": 0.20363745678641215, "flos": 28342137934080.0, "grad_norm": 1.3274381715266834, "language_loss": 0.81804127, "learning_rate": 3.691974133706947e-06, "loss": 0.84440422, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.27148438, "step": 3387, "time_per_iteration": 2.885659694671631 }, { "auxiliary_loss_clip": 0.01555458, "auxiliary_loss_mlp": 0.01049114, "balance_loss_clip": 1.34636378, "balance_loss_mlp": 1.02216101, "epoch": 0.20369758003908012, "flos": 18924728855040.0, "grad_norm": 2.395837630485103, "language_loss": 0.81010854, "learning_rate": 3.6917664393294262e-06, "loss": 0.83615428, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.26965332, "step": 3388, "time_per_iteration": 2.8331298828125 }, { "auxiliary_loss_clip": 0.01581407, "auxiliary_loss_mlp": 0.01050164, "balance_loss_clip": 1.36288714, "balance_loss_mlp": 1.02331781, "epoch": 0.20375770329174808, "flos": 19215373322880.0, "grad_norm": 1.627191866807711, "language_loss": 0.72680414, "learning_rate": 3.6915586807999527e-06, "loss": 0.75311983, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.26818848, "step": 3389, "time_per_iteration": 2.800248622894287 }, { "auxiliary_loss_clip": 0.01570444, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.35702658, "balance_loss_mlp": 1.02119267, "epoch": 0.20381782654441605, "flos": 19400470375680.0, "grad_norm": 1.713563831960336, "language_loss": 0.88053715, "learning_rate": 3.691350858126404e-06, "loss": 0.90672815, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.27453613, "step": 3390, "time_per_iteration": 2.814483642578125 }, { "auxiliary_loss_clip": 0.01562017, "auxiliary_loss_mlp": 0.01045176, "balance_loss_clip": 1.35012078, "balance_loss_mlp": 1.01794863, "epoch": 0.203877949797084, "flos": 24838659129600.0, "grad_norm": 1.9458072995582858, "language_loss": 0.73194063, "learning_rate": 3.691142971316662e-06, "loss": 0.75801253, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.27246094, "step": 3391, "time_per_iteration": 2.8968234062194824 }, { "auxiliary_loss_clip": 0.01564363, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.35157716, "balance_loss_mlp": 1.02329135, "epoch": 0.20393807304975198, "flos": 18012590807040.0, "grad_norm": 2.2514636244515254, "language_loss": 0.88136178, "learning_rate": 3.6909350203786086e-06, "loss": 0.90750277, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.26452637, "step": 3392, "time_per_iteration": 2.8103151321411133 }, { "auxiliary_loss_clip": 0.01575677, "auxiliary_loss_mlp": 0.01050656, "balance_loss_clip": 1.35782075, "balance_loss_mlp": 1.02252245, "epoch": 0.20399819630241997, "flos": 24217663242240.0, "grad_norm": 1.430969129202825, "language_loss": 0.81655252, "learning_rate": 3.69072700532013e-06, "loss": 0.84281582, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.28149414, "step": 3393, "time_per_iteration": 2.9099128246307373 }, { "auxiliary_loss_clip": 0.01563193, "auxiliary_loss_mlp": 0.01042828, "balance_loss_clip": 1.34995019, "balance_loss_mlp": 1.01629186, "epoch": 0.20405831955508794, "flos": 20786856865920.0, "grad_norm": 1.6902799845061478, "language_loss": 0.86766255, "learning_rate": 3.6905189261491137e-06, "loss": 0.89372271, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.26501465, "step": 3394, "time_per_iteration": 2.833338975906372 }, { "auxiliary_loss_clip": 0.01562853, "auxiliary_loss_mlp": 0.01047036, "balance_loss_clip": 1.34879136, "balance_loss_mlp": 1.02104819, "epoch": 0.2041184428077559, "flos": 15495325067520.0, "grad_norm": 2.173526952963263, "language_loss": 0.85175169, "learning_rate": 3.69031078287345e-06, "loss": 0.87785053, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.26025391, "step": 3395, "time_per_iteration": 2.8285036087036133 }, { "auxiliary_loss_clip": 0.01576541, "auxiliary_loss_mlp": 0.01045667, "balance_loss_clip": 1.35749245, "balance_loss_mlp": 1.0185231, "epoch": 0.20417856606042387, "flos": 15594854924160.0, "grad_norm": 1.8811318074710468, "language_loss": 0.85199505, "learning_rate": 3.690102575501033e-06, "loss": 0.8782171, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.2713623, "step": 3396, "time_per_iteration": 2.806762933731079 }, { "auxiliary_loss_clip": 0.01568357, "auxiliary_loss_mlp": 0.01047175, "balance_loss_clip": 1.3544575, "balance_loss_mlp": 1.01997185, "epoch": 0.20423868931309183, "flos": 24289883222400.0, "grad_norm": 1.6999120911642334, "language_loss": 0.77907741, "learning_rate": 3.6898943040397556e-06, "loss": 0.80523276, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.27209473, "step": 3397, "time_per_iteration": 2.865786075592041 }, { "auxiliary_loss_clip": 0.0156347, "auxiliary_loss_mlp": 0.01045203, "balance_loss_clip": 1.34764361, "balance_loss_mlp": 1.0178566, "epoch": 0.2042988125657598, "flos": 18621551577600.0, "grad_norm": 2.3538477056925227, "language_loss": 0.88582194, "learning_rate": 3.689685968497518e-06, "loss": 0.91190863, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.2734375, "step": 3398, "time_per_iteration": 4.220952272415161 }, { "auxiliary_loss_clip": 0.01580934, "auxiliary_loss_mlp": 0.01045578, "balance_loss_clip": 1.36369407, "balance_loss_mlp": 1.01838589, "epoch": 0.2043589358184278, "flos": 17858377969920.0, "grad_norm": 1.7882190994765854, "language_loss": 0.78763294, "learning_rate": 3.6894775688822186e-06, "loss": 0.81389809, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.27185059, "step": 3399, "time_per_iteration": 2.88181734085083 }, { "auxiliary_loss_clip": 0.01573484, "auxiliary_loss_mlp": 0.01048196, "balance_loss_clip": 1.35576153, "balance_loss_mlp": 1.02090871, "epoch": 0.20441905907109575, "flos": 21445478444160.0, "grad_norm": 1.870615310201243, "language_loss": 0.77180648, "learning_rate": 3.6892691052017603e-06, "loss": 0.79802328, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.27282715, "step": 3400, "time_per_iteration": 2.8886966705322266 }, { "auxiliary_loss_clip": 0.01554146, "auxiliary_loss_mlp": 0.01046168, "balance_loss_clip": 1.34287536, "balance_loss_mlp": 1.0205853, "epoch": 0.20447918232376372, "flos": 27718789317120.0, "grad_norm": 1.6491158864075368, "language_loss": 0.806054, "learning_rate": 3.6890605774640487e-06, "loss": 0.83205712, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.25610352, "step": 3401, "time_per_iteration": 2.8712358474731445 }, { "auxiliary_loss_clip": 0.01568978, "auxiliary_loss_mlp": 0.010455, "balance_loss_clip": 1.35155797, "balance_loss_mlp": 1.01866579, "epoch": 0.20453930557643168, "flos": 30537874990080.0, "grad_norm": 1.8592813987203245, "language_loss": 0.70438313, "learning_rate": 3.688851985676991e-06, "loss": 0.73052794, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26818848, "step": 3402, "time_per_iteration": 2.9641377925872803 }, { "auxiliary_loss_clip": 0.0157107, "auxiliary_loss_mlp": 0.01047197, "balance_loss_clip": 1.35342193, "balance_loss_mlp": 1.01899242, "epoch": 0.20459942882909965, "flos": 18996632121600.0, "grad_norm": 2.2130724212321238, "language_loss": 0.82180291, "learning_rate": 3.688643329848496e-06, "loss": 0.84798557, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.28222656, "step": 3403, "time_per_iteration": 2.833972930908203 }, { "auxiliary_loss_clip": 0.01571017, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.35335696, "balance_loss_mlp": 1.02057588, "epoch": 0.20465955208176762, "flos": 20348605301760.0, "grad_norm": 4.097731364701076, "language_loss": 0.85115016, "learning_rate": 3.6884346099864772e-06, "loss": 0.87733513, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.26916504, "step": 3404, "time_per_iteration": 2.8034298419952393 }, { "auxiliary_loss_clip": 0.01568347, "auxiliary_loss_mlp": 0.01055846, "balance_loss_clip": 1.34937835, "balance_loss_mlp": 1.02958453, "epoch": 0.20471967533443558, "flos": 21261105308160.0, "grad_norm": 1.8670021321033277, "language_loss": 0.8656106, "learning_rate": 3.6882258260988487e-06, "loss": 0.8918525, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26269531, "step": 3405, "time_per_iteration": 2.8429055213928223 }, { "auxiliary_loss_clip": 0.01564625, "auxiliary_loss_mlp": 0.01051034, "balance_loss_clip": 1.34766698, "balance_loss_mlp": 1.02498651, "epoch": 0.20477979858710357, "flos": 14509654940160.0, "grad_norm": 2.23537153822877, "language_loss": 0.85337508, "learning_rate": 3.6880169781935276e-06, "loss": 0.87953162, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.26074219, "step": 3406, "time_per_iteration": 4.2840211391448975 }, { "auxiliary_loss_clip": 0.01565572, "auxiliary_loss_mlp": 0.01053449, "balance_loss_clip": 1.35008192, "balance_loss_mlp": 1.0268774, "epoch": 0.20483992183977154, "flos": 11407616415360.0, "grad_norm": 1.9386880301435354, "language_loss": 0.6910395, "learning_rate": 3.6878080662784336e-06, "loss": 0.71722972, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.265625, "step": 3407, "time_per_iteration": 5.625287294387817 }, { "auxiliary_loss_clip": 0.01560028, "auxiliary_loss_mlp": 0.01048817, "balance_loss_clip": 1.3459909, "balance_loss_mlp": 1.0229249, "epoch": 0.2049000450924395, "flos": 19069033080960.0, "grad_norm": 1.911111672445809, "language_loss": 0.85104221, "learning_rate": 3.6875990903614886e-06, "loss": 0.87713063, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.2590332, "step": 3408, "time_per_iteration": 2.885241746902466 }, { "auxiliary_loss_clip": 0.01580582, "auxiliary_loss_mlp": 0.01063523, "balance_loss_clip": 1.35981679, "balance_loss_mlp": 1.03555703, "epoch": 0.20496016834510747, "flos": 14582689326720.0, "grad_norm": 2.494459554719174, "language_loss": 0.65775156, "learning_rate": 3.6873900504506166e-06, "loss": 0.6841926, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 2.20996094, "router_z_loss_mlp": 0.27990723, "step": 3409, "time_per_iteration": 2.82252836227417 }, { "auxiliary_loss_clip": 0.0158383, "auxiliary_loss_mlp": 0.01066613, "balance_loss_clip": 1.36650383, "balance_loss_mlp": 1.04020858, "epoch": 0.20502029159777543, "flos": 22136613050880.0, "grad_norm": 1.309368188669326, "language_loss": 0.81072164, "learning_rate": 3.687180946553745e-06, "loss": 0.83722609, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.26416016, "step": 3410, "time_per_iteration": 2.962275981903076 }, { "auxiliary_loss_clip": 0.0157824, "auxiliary_loss_mlp": 0.01071021, "balance_loss_clip": 1.36184382, "balance_loss_mlp": 1.04531956, "epoch": 0.2050804148504434, "flos": 25377164467200.0, "grad_norm": 1.9124316837304327, "language_loss": 0.76939702, "learning_rate": 3.686971778678803e-06, "loss": 0.79588962, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25695801, "step": 3411, "time_per_iteration": 2.903496026992798 }, { "auxiliary_loss_clip": 0.01580842, "auxiliary_loss_mlp": 0.01075601, "balance_loss_clip": 1.36636209, "balance_loss_mlp": 1.04893434, "epoch": 0.2051405381031114, "flos": 23629904300160.0, "grad_norm": 1.8779360452222869, "language_loss": 0.7509191, "learning_rate": 3.686762546833722e-06, "loss": 0.77748346, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26696777, "step": 3412, "time_per_iteration": 2.900313377380371 }, { "auxiliary_loss_clip": 0.01584663, "auxiliary_loss_mlp": 0.01076936, "balance_loss_clip": 1.36348057, "balance_loss_mlp": 1.05055451, "epoch": 0.20520066135577936, "flos": 19572989374080.0, "grad_norm": 1.9818385946795365, "language_loss": 0.78916478, "learning_rate": 3.6865532510264362e-06, "loss": 0.81578082, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.26416016, "step": 3413, "time_per_iteration": 2.8083720207214355 }, { "auxiliary_loss_clip": 0.01578856, "auxiliary_loss_mlp": 0.01077696, "balance_loss_clip": 1.36824775, "balance_loss_mlp": 1.05132675, "epoch": 0.20526078460844732, "flos": 17685406523520.0, "grad_norm": 1.829279381207386, "language_loss": 0.85364246, "learning_rate": 3.6863438912648823e-06, "loss": 0.88020802, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.26367188, "step": 3414, "time_per_iteration": 2.8397960662841797 }, { "auxiliary_loss_clip": 0.01581591, "auxiliary_loss_mlp": 0.01064901, "balance_loss_clip": 1.36653423, "balance_loss_mlp": 1.04014087, "epoch": 0.2053209078611153, "flos": 21508558974720.0, "grad_norm": 1.8043040873657037, "language_loss": 0.81929553, "learning_rate": 3.6861344675569986e-06, "loss": 0.84576046, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24755859, "step": 3415, "time_per_iteration": 2.8536548614501953 }, { "auxiliary_loss_clip": 0.01590431, "auxiliary_loss_mlp": 0.01067713, "balance_loss_clip": 1.3775692, "balance_loss_mlp": 1.04314435, "epoch": 0.20538103111378325, "flos": 25673826493440.0, "grad_norm": 2.508462868875211, "language_loss": 0.74312854, "learning_rate": 3.6859249799107275e-06, "loss": 0.76970994, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.24584961, "step": 3416, "time_per_iteration": 2.909085988998413 }, { "auxiliary_loss_clip": 0.01587066, "auxiliary_loss_mlp": 0.01065772, "balance_loss_clip": 1.36920357, "balance_loss_mlp": 1.04066622, "epoch": 0.20544115436645122, "flos": 23159365931520.0, "grad_norm": 2.074122135644117, "language_loss": 0.79561067, "learning_rate": 3.6857154283340115e-06, "loss": 0.82213902, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.25109863, "step": 3417, "time_per_iteration": 2.881777763366699 }, { "auxiliary_loss_clip": 0.015977, "auxiliary_loss_mlp": 0.01066917, "balance_loss_clip": 1.37863374, "balance_loss_mlp": 1.0402503, "epoch": 0.20550127761911918, "flos": 19399746458880.0, "grad_norm": 2.0769245581817093, "language_loss": 0.88567913, "learning_rate": 3.685505812834798e-06, "loss": 0.91232526, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26647949, "step": 3418, "time_per_iteration": 2.7952401638031006 }, { "auxiliary_loss_clip": 0.0159254, "auxiliary_loss_mlp": 0.01058128, "balance_loss_clip": 1.37403691, "balance_loss_mlp": 1.03175867, "epoch": 0.20556140087178718, "flos": 22903360997760.0, "grad_norm": 2.072394255276017, "language_loss": 0.63064229, "learning_rate": 3.685296133421035e-06, "loss": 0.65714896, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26391602, "step": 3419, "time_per_iteration": 2.841963291168213 }, { "auxiliary_loss_clip": 0.01621516, "auxiliary_loss_mlp": 0.01065689, "balance_loss_clip": 1.39964283, "balance_loss_mlp": 1.0388906, "epoch": 0.20562152412445514, "flos": 19798517295360.0, "grad_norm": 1.6759174243060417, "language_loss": 0.86828744, "learning_rate": 3.685086390100674e-06, "loss": 0.8951596, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 2.21679688, "router_z_loss_mlp": 0.26818848, "step": 3420, "time_per_iteration": 2.834890127182007 }, { "auxiliary_loss_clip": 0.01586799, "auxiliary_loss_mlp": 0.01058655, "balance_loss_clip": 1.3695966, "balance_loss_mlp": 1.03284669, "epoch": 0.2056816473771231, "flos": 31513998464640.0, "grad_norm": 2.1598218163010587, "language_loss": 0.72670245, "learning_rate": 3.684876582881668e-06, "loss": 0.75315696, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25817871, "step": 3421, "time_per_iteration": 3.0526740550994873 }, { "auxiliary_loss_clip": 0.01588247, "auxiliary_loss_mlp": 0.01048246, "balance_loss_clip": 1.3733784, "balance_loss_mlp": 1.02134025, "epoch": 0.20574177062979107, "flos": 23268804399360.0, "grad_norm": 1.97278585463943, "language_loss": 0.72680414, "learning_rate": 3.6846667117719732e-06, "loss": 0.75316912, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 2.14746094, "router_z_loss_mlp": 0.26916504, "step": 3422, "time_per_iteration": 2.877342939376831 }, { "auxiliary_loss_clip": 0.0133716, "auxiliary_loss_mlp": 0.01046688, "balance_loss_clip": 1.21383905, "balance_loss_mlp": 1.01884103, "epoch": 0.20580189388245904, "flos": 70341389441280.0, "grad_norm": 0.7601778547504336, "language_loss": 0.55680513, "learning_rate": 3.684456776779548e-06, "loss": 0.58064365, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.27929688, "step": 3423, "time_per_iteration": 3.4115304946899414 }, { "auxiliary_loss_clip": 0.01599721, "auxiliary_loss_mlp": 0.01047493, "balance_loss_clip": 1.37859416, "balance_loss_mlp": 1.02086139, "epoch": 0.205862017135127, "flos": 30750191429760.0, "grad_norm": 2.094599541194899, "language_loss": 0.72404921, "learning_rate": 3.684246777912353e-06, "loss": 0.75052136, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.26660156, "step": 3424, "time_per_iteration": 2.9402096271514893 }, { "auxiliary_loss_clip": 0.01586837, "auxiliary_loss_mlp": 0.01047708, "balance_loss_clip": 1.37233555, "balance_loss_mlp": 1.020087, "epoch": 0.20592214038779497, "flos": 21334456408320.0, "grad_norm": 1.4247170936094284, "language_loss": 0.7572099, "learning_rate": 3.684036715178351e-06, "loss": 0.78355527, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.27624512, "step": 3425, "time_per_iteration": 2.832782030105591 }, { "auxiliary_loss_clip": 0.01568186, "auxiliary_loss_mlp": 0.01049307, "balance_loss_clip": 1.35779631, "balance_loss_mlp": 1.02252054, "epoch": 0.20598226364046296, "flos": 22901505960960.0, "grad_norm": 1.70594571267528, "language_loss": 0.89215666, "learning_rate": 3.683826588585508e-06, "loss": 0.91833162, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 2.10449219, "router_z_loss_mlp": 0.26818848, "step": 3426, "time_per_iteration": 2.865995168685913 }, { "auxiliary_loss_clip": 0.01568928, "auxiliary_loss_mlp": 0.01045632, "balance_loss_clip": 1.35639298, "balance_loss_mlp": 1.0184164, "epoch": 0.20604238689313092, "flos": 23889121614720.0, "grad_norm": 1.4613035675479986, "language_loss": 0.78380322, "learning_rate": 3.6836163981417926e-06, "loss": 0.8099488, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.27185059, "step": 3427, "time_per_iteration": 2.8604531288146973 }, { "auxiliary_loss_clip": 0.01587445, "auxiliary_loss_mlp": 0.01043859, "balance_loss_clip": 1.36824512, "balance_loss_mlp": 1.01675141, "epoch": 0.2061025101457989, "flos": 22501604004480.0, "grad_norm": 1.4534748991415538, "language_loss": 0.74764431, "learning_rate": 3.683406143855174e-06, "loss": 0.77395743, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27087402, "step": 3428, "time_per_iteration": 2.8366570472717285 }, { "auxiliary_loss_clip": 0.0159098, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.37088776, "balance_loss_mlp": 1.0191741, "epoch": 0.20616263339846685, "flos": 22787904971520.0, "grad_norm": 1.705467179096323, "language_loss": 0.74469471, "learning_rate": 3.6831958257336256e-06, "loss": 0.77107674, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.28015137, "step": 3429, "time_per_iteration": 2.875523090362549 }, { "auxiliary_loss_clip": 0.01596922, "auxiliary_loss_mlp": 0.0104943, "balance_loss_clip": 1.37611783, "balance_loss_mlp": 1.0215584, "epoch": 0.20622275665113482, "flos": 20890820712960.0, "grad_norm": 1.670757551701038, "language_loss": 0.85881221, "learning_rate": 3.6829854437851237e-06, "loss": 0.88527572, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.27880859, "step": 3430, "time_per_iteration": 2.8344533443450928 }, { "auxiliary_loss_clip": 0.01576353, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.35923457, "balance_loss_mlp": 1.0204041, "epoch": 0.20628287990380278, "flos": 19363840070400.0, "grad_norm": 1.6153644499145472, "language_loss": 0.70356554, "learning_rate": 3.6827749980176444e-06, "loss": 0.72980845, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.27563477, "step": 3431, "time_per_iteration": 2.83349347114563 }, { "auxiliary_loss_clip": 0.01329041, "auxiliary_loss_mlp": 0.01079151, "balance_loss_clip": 1.19957983, "balance_loss_mlp": 1.04691648, "epoch": 0.20634300315647078, "flos": 71547474827520.0, "grad_norm": 0.8294178462842249, "language_loss": 0.6026237, "learning_rate": 3.6825644884391693e-06, "loss": 0.62670565, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.32226562, "step": 3432, "time_per_iteration": 3.505629301071167 }, { "auxiliary_loss_clip": 0.01582415, "auxiliary_loss_mlp": 0.01047693, "balance_loss_clip": 1.36637437, "balance_loss_mlp": 1.02001262, "epoch": 0.20640312640913874, "flos": 21733453468800.0, "grad_norm": 1.5153538888255014, "language_loss": 0.73550439, "learning_rate": 3.682353915057679e-06, "loss": 0.76180547, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.27685547, "step": 3433, "time_per_iteration": 4.274772882461548 }, { "auxiliary_loss_clip": 0.01588292, "auxiliary_loss_mlp": 0.01045972, "balance_loss_clip": 1.36679935, "balance_loss_mlp": 1.01769543, "epoch": 0.2064632496618067, "flos": 20563364960640.0, "grad_norm": 2.122072724626134, "language_loss": 0.87259561, "learning_rate": 3.6821432778811604e-06, "loss": 0.89893824, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.28295898, "step": 3434, "time_per_iteration": 2.8769097328186035 }, { "auxiliary_loss_clip": 0.01581808, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.36162806, "balance_loss_mlp": 1.0241735, "epoch": 0.20652337291447467, "flos": 29834750511360.0, "grad_norm": 1.626771597166887, "language_loss": 0.70639431, "learning_rate": 3.6819325769176004e-06, "loss": 0.73273045, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27661133, "step": 3435, "time_per_iteration": 2.933743953704834 }, { "auxiliary_loss_clip": 0.01570664, "auxiliary_loss_mlp": 0.0105372, "balance_loss_clip": 1.35717177, "balance_loss_mlp": 1.02346456, "epoch": 0.20658349616714264, "flos": 26224819395840.0, "grad_norm": 1.656085285368096, "language_loss": 0.90389282, "learning_rate": 3.681721812174988e-06, "loss": 0.93013668, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.30249023, "step": 3436, "time_per_iteration": 2.8787035942077637 }, { "auxiliary_loss_clip": 0.01575209, "auxiliary_loss_mlp": 0.01049943, "balance_loss_clip": 1.35999656, "balance_loss_mlp": 1.02097559, "epoch": 0.2066436194198106, "flos": 26005399522560.0, "grad_norm": 1.8394290682720231, "language_loss": 0.77946204, "learning_rate": 3.6815109836613163e-06, "loss": 0.80571353, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.28955078, "step": 3437, "time_per_iteration": 2.900306224822998 }, { "auxiliary_loss_clip": 0.01578671, "auxiliary_loss_mlp": 0.01049961, "balance_loss_clip": 1.36131191, "balance_loss_mlp": 1.0226624, "epoch": 0.20670374267247857, "flos": 21370996224000.0, "grad_norm": 2.1893917164752366, "language_loss": 0.78904295, "learning_rate": 3.6813000913845795e-06, "loss": 0.81532925, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.27294922, "step": 3438, "time_per_iteration": 2.8084795475006104 }, { "auxiliary_loss_clip": 0.01325891, "auxiliary_loss_mlp": 0.01052426, "balance_loss_clip": 1.19715881, "balance_loss_mlp": 1.02267098, "epoch": 0.20676386592514656, "flos": 66414046919040.0, "grad_norm": 0.8534668033017735, "language_loss": 0.67208529, "learning_rate": 3.6810891353527747e-06, "loss": 0.69586849, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.296875, "step": 3439, "time_per_iteration": 3.3585543632507324 }, { "auxiliary_loss_clip": 0.01581094, "auxiliary_loss_mlp": 0.01049264, "balance_loss_clip": 1.36050439, "balance_loss_mlp": 1.02198887, "epoch": 0.20682398917781453, "flos": 17283423306240.0, "grad_norm": 1.937195905097359, "language_loss": 0.8508023, "learning_rate": 3.6808781155739014e-06, "loss": 0.87710583, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.27319336, "step": 3440, "time_per_iteration": 2.8208167552948 }, { "auxiliary_loss_clip": 0.01592484, "auxiliary_loss_mlp": 0.01049226, "balance_loss_clip": 1.37389922, "balance_loss_mlp": 1.02135491, "epoch": 0.2068841124304825, "flos": 18086349110400.0, "grad_norm": 2.482453324957035, "language_loss": 0.85666394, "learning_rate": 3.6806670320559614e-06, "loss": 0.88308102, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.27880859, "step": 3441, "time_per_iteration": 4.186328887939453 }, { "auxiliary_loss_clip": 0.0157685, "auxiliary_loss_mlp": 0.01047439, "balance_loss_clip": 1.36308777, "balance_loss_mlp": 1.01931798, "epoch": 0.20694423568315046, "flos": 27359092005120.0, "grad_norm": 1.6412561347328074, "language_loss": 0.86323321, "learning_rate": 3.680455884806959e-06, "loss": 0.88947612, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.28161621, "step": 3442, "time_per_iteration": 4.448321580886841 }, { "auxiliary_loss_clip": 0.01584089, "auxiliary_loss_mlp": 0.01047397, "balance_loss_clip": 1.36305642, "balance_loss_mlp": 1.01939511, "epoch": 0.20700435893581842, "flos": 20239438302720.0, "grad_norm": 1.9501647463495295, "language_loss": 0.74013257, "learning_rate": 3.6802446738349014e-06, "loss": 0.76644742, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 2.20898438, "router_z_loss_mlp": 0.28015137, "step": 3443, "time_per_iteration": 2.82975697517395 }, { "auxiliary_loss_clip": 0.01563841, "auxiliary_loss_mlp": 0.0104767, "balance_loss_clip": 1.35025549, "balance_loss_mlp": 1.02116942, "epoch": 0.2070644821884864, "flos": 20640788092800.0, "grad_norm": 1.7762047370447032, "language_loss": 0.86638212, "learning_rate": 3.680033399147797e-06, "loss": 0.89249718, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.26501465, "step": 3444, "time_per_iteration": 2.8425731658935547 }, { "auxiliary_loss_clip": 0.01315696, "auxiliary_loss_mlp": 0.01038195, "balance_loss_clip": 1.19226766, "balance_loss_mlp": 1.00882173, "epoch": 0.20712460544115438, "flos": 65970592202880.0, "grad_norm": 0.7043319373230057, "language_loss": 0.57186842, "learning_rate": 3.6798220607536585e-06, "loss": 0.59540731, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.29296875, "step": 3445, "time_per_iteration": 3.3234875202178955 }, { "auxiliary_loss_clip": 0.01566762, "auxiliary_loss_mlp": 0.01050476, "balance_loss_clip": 1.3539362, "balance_loss_mlp": 1.02334428, "epoch": 0.20718472869382235, "flos": 19434747951360.0, "grad_norm": 1.6958014434502875, "language_loss": 0.78874898, "learning_rate": 3.6796106586604987e-06, "loss": 0.81492138, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.27172852, "step": 3446, "time_per_iteration": 2.9626452922821045 }, { "auxiliary_loss_clip": 0.01602171, "auxiliary_loss_mlp": 0.0105754, "balance_loss_clip": 1.37538815, "balance_loss_mlp": 1.02921557, "epoch": 0.2072448519464903, "flos": 24509846033280.0, "grad_norm": 2.4479927321522728, "language_loss": 0.63686377, "learning_rate": 3.679399192876334e-06, "loss": 0.66346085, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 2.26953125, "router_z_loss_mlp": 0.28283691, "step": 3447, "time_per_iteration": 2.8674073219299316 }, { "auxiliary_loss_clip": 0.0156301, "auxiliary_loss_mlp": 0.01047311, "balance_loss_clip": 1.34550107, "balance_loss_mlp": 1.01990485, "epoch": 0.20730497519915828, "flos": 23086014831360.0, "grad_norm": 1.6221661041949098, "language_loss": 0.87681437, "learning_rate": 3.679187663409184e-06, "loss": 0.90291756, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.27404785, "step": 3448, "time_per_iteration": 2.855940818786621 }, { "auxiliary_loss_clip": 0.01566543, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.34997296, "balance_loss_mlp": 1.01995826, "epoch": 0.20736509845182624, "flos": 21078858677760.0, "grad_norm": 2.6909934959531654, "language_loss": 0.76208544, "learning_rate": 3.6789760702670696e-06, "loss": 0.78822494, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.27478027, "step": 3449, "time_per_iteration": 2.8234074115753174 }, { "auxiliary_loss_clip": 0.01582788, "auxiliary_loss_mlp": 0.01051529, "balance_loss_clip": 1.36156487, "balance_loss_mlp": 1.02363348, "epoch": 0.2074252217044942, "flos": 17640632154240.0, "grad_norm": 1.756368339629202, "language_loss": 0.77057946, "learning_rate": 3.6787644134580134e-06, "loss": 0.79692256, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 2.21289062, "router_z_loss_mlp": 0.27880859, "step": 3450, "time_per_iteration": 2.84580659866333 }, { "auxiliary_loss_clip": 0.0157278, "auxiliary_loss_mlp": 0.01045852, "balance_loss_clip": 1.35498309, "balance_loss_mlp": 1.01903009, "epoch": 0.20748534495716217, "flos": 23556643689600.0, "grad_norm": 1.6468851500814907, "language_loss": 0.83355117, "learning_rate": 3.6785526929900436e-06, "loss": 0.85973746, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.26831055, "step": 3451, "time_per_iteration": 2.8549022674560547 }, { "auxiliary_loss_clip": 0.01311624, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.18950224, "balance_loss_mlp": 0.99329203, "epoch": 0.20754546820983016, "flos": 52277346017280.0, "grad_norm": 0.8066930990930274, "language_loss": 0.56667078, "learning_rate": 3.6783409088711875e-06, "loss": 0.59006512, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.34570312, "step": 3452, "time_per_iteration": 3.279888868331909 }, { "auxiliary_loss_clip": 0.01570087, "auxiliary_loss_mlp": 0.01054446, "balance_loss_clip": 1.35325408, "balance_loss_mlp": 1.02707577, "epoch": 0.20760559146249813, "flos": 20422046891520.0, "grad_norm": 2.691345957880228, "language_loss": 0.89589322, "learning_rate": 3.6781290611094755e-06, "loss": 0.92213857, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.27416992, "step": 3453, "time_per_iteration": 2.8707380294799805 }, { "auxiliary_loss_clip": 0.01581392, "auxiliary_loss_mlp": 0.01049587, "balance_loss_clip": 1.36344481, "balance_loss_mlp": 1.02133489, "epoch": 0.2076657147151661, "flos": 23196584419200.0, "grad_norm": 1.5410929440294723, "language_loss": 0.81685019, "learning_rate": 3.6779171497129407e-06, "loss": 0.84315991, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 2.18066406, "router_z_loss_mlp": 0.28271484, "step": 3454, "time_per_iteration": 2.872332811355591 }, { "auxiliary_loss_clip": 0.01570322, "auxiliary_loss_mlp": 0.01051253, "balance_loss_clip": 1.35353553, "balance_loss_mlp": 1.02483582, "epoch": 0.20772583796783406, "flos": 18301561217280.0, "grad_norm": 2.7930499586468382, "language_loss": 0.78942257, "learning_rate": 3.6777051746896202e-06, "loss": 0.8156383, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.26416016, "step": 3455, "time_per_iteration": 2.9198086261749268 }, { "auxiliary_loss_clip": 0.01562658, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.34996319, "balance_loss_mlp": 1.02287877, "epoch": 0.20778596122050202, "flos": 17611693464960.0, "grad_norm": 1.6471282649710617, "language_loss": 0.81703985, "learning_rate": 3.6774931360475516e-06, "loss": 0.84315097, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25610352, "step": 3456, "time_per_iteration": 2.846808671951294 }, { "auxiliary_loss_clip": 0.01582619, "auxiliary_loss_mlp": 0.01048933, "balance_loss_clip": 1.36470819, "balance_loss_mlp": 1.02239668, "epoch": 0.20784608447317, "flos": 23816132472960.0, "grad_norm": 1.5406226566986498, "language_loss": 0.79434925, "learning_rate": 3.6772810337947745e-06, "loss": 0.82066482, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26538086, "step": 3457, "time_per_iteration": 2.9080049991607666 }, { "auxiliary_loss_clip": 0.01578715, "auxiliary_loss_mlp": 0.01058672, "balance_loss_clip": 1.3605057, "balance_loss_mlp": 1.03123021, "epoch": 0.20790620772583795, "flos": 17648188035840.0, "grad_norm": 1.7295085509806147, "language_loss": 0.84657764, "learning_rate": 3.677068867939333e-06, "loss": 0.87295163, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.27441406, "step": 3458, "time_per_iteration": 2.8262007236480713 }, { "auxiliary_loss_clip": 0.01576111, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.36307859, "balance_loss_mlp": 1.01808119, "epoch": 0.20796633097850595, "flos": 27685145168640.0, "grad_norm": 1.849088510950195, "language_loss": 0.76521945, "learning_rate": 3.676856638489272e-06, "loss": 0.79141802, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25695801, "step": 3459, "time_per_iteration": 2.8809902667999268 }, { "auxiliary_loss_clip": 0.01571354, "auxiliary_loss_mlp": 0.01046575, "balance_loss_clip": 1.35959804, "balance_loss_mlp": 1.01991987, "epoch": 0.2080264542311739, "flos": 19255170764160.0, "grad_norm": 1.7809797590188388, "language_loss": 0.77902198, "learning_rate": 3.6766443454526382e-06, "loss": 0.80520129, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.26660156, "step": 3460, "time_per_iteration": 2.873784303665161 }, { "auxiliary_loss_clip": 0.01566899, "auxiliary_loss_mlp": 0.01040981, "balance_loss_clip": 1.3518126, "balance_loss_mlp": 1.01524377, "epoch": 0.20808657748384188, "flos": 27537130869120.0, "grad_norm": 1.9552043096463112, "language_loss": 0.76221961, "learning_rate": 3.6764319888374836e-06, "loss": 0.78829849, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.25744629, "step": 3461, "time_per_iteration": 2.917376756668091 }, { "auxiliary_loss_clip": 0.01580365, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.35902429, "balance_loss_mlp": 1.01729083, "epoch": 0.20814670073650984, "flos": 26918306732160.0, "grad_norm": 1.8356328498650532, "language_loss": 0.89310598, "learning_rate": 3.6762195686518604e-06, "loss": 0.91934019, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.25769043, "step": 3462, "time_per_iteration": 2.887219190597534 }, { "auxiliary_loss_clip": 0.01304387, "auxiliary_loss_mlp": 0.01039173, "balance_loss_clip": 1.18221819, "balance_loss_mlp": 1.00941837, "epoch": 0.2082068239891778, "flos": 70206948581760.0, "grad_norm": 0.7629100914793379, "language_loss": 0.59075403, "learning_rate": 3.6760070849038226e-06, "loss": 0.61418962, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.296875, "step": 3463, "time_per_iteration": 3.4774398803710938 }, { "auxiliary_loss_clip": 0.0157642, "auxiliary_loss_mlp": 0.01050762, "balance_loss_clip": 1.3556124, "balance_loss_mlp": 1.02329624, "epoch": 0.20826694724184577, "flos": 24618786808320.0, "grad_norm": 2.4439582278070344, "language_loss": 0.68087924, "learning_rate": 3.675794537601429e-06, "loss": 0.70715117, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 2.2109375, "router_z_loss_mlp": 0.27490234, "step": 3464, "time_per_iteration": 2.8441696166992188 }, { "auxiliary_loss_clip": 0.01573408, "auxiliary_loss_mlp": 0.0105048, "balance_loss_clip": 1.35481811, "balance_loss_mlp": 1.02408743, "epoch": 0.20832707049451377, "flos": 12899640810240.0, "grad_norm": 1.956848906998702, "language_loss": 0.85159594, "learning_rate": 3.6755819267527373e-06, "loss": 0.8778348, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.26403809, "step": 3465, "time_per_iteration": 2.8777337074279785 }, { "auxiliary_loss_clip": 0.01583619, "auxiliary_loss_mlp": 0.01048306, "balance_loss_clip": 1.36592889, "balance_loss_mlp": 1.02142465, "epoch": 0.20838719374718173, "flos": 22208606807040.0, "grad_norm": 2.34575015798552, "language_loss": 0.8317886, "learning_rate": 3.6753692523658113e-06, "loss": 0.85810781, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.2689209, "step": 3466, "time_per_iteration": 2.8336503505706787 }, { "auxiliary_loss_clip": 0.01562946, "auxiliary_loss_mlp": 0.01043095, "balance_loss_clip": 1.35179138, "balance_loss_mlp": 1.01823974, "epoch": 0.2084473169998497, "flos": 15167009664000.0, "grad_norm": 1.8280312526848845, "language_loss": 0.83002418, "learning_rate": 3.675156514448716e-06, "loss": 0.85608464, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24853516, "step": 3467, "time_per_iteration": 2.8921115398406982 }, { "auxiliary_loss_clip": 0.01546349, "auxiliary_loss_mlp": 0.01046472, "balance_loss_clip": 1.34109533, "balance_loss_mlp": 1.02165294, "epoch": 0.20850744025251766, "flos": 17465624691840.0, "grad_norm": 1.7567668956238731, "language_loss": 0.82468915, "learning_rate": 3.674943713009518e-06, "loss": 0.85061741, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.24816895, "step": 3468, "time_per_iteration": 4.346414089202881 }, { "auxiliary_loss_clip": 0.01591818, "auxiliary_loss_mlp": 0.01053599, "balance_loss_clip": 1.370435, "balance_loss_mlp": 1.02607393, "epoch": 0.20856756350518563, "flos": 25709008965120.0, "grad_norm": 1.9841593939856008, "language_loss": 0.90477729, "learning_rate": 3.6747308480562856e-06, "loss": 0.9312315, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 2.21484375, "router_z_loss_mlp": 0.2755127, "step": 3469, "time_per_iteration": 2.8687009811401367 }, { "auxiliary_loss_clip": 0.01587589, "auxiliary_loss_mlp": 0.01051727, "balance_loss_clip": 1.37218702, "balance_loss_mlp": 1.02677608, "epoch": 0.2086276867578536, "flos": 37903263056640.0, "grad_norm": 2.192599147486033, "language_loss": 0.77558178, "learning_rate": 3.674517919597092e-06, "loss": 0.80197495, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24951172, "step": 3470, "time_per_iteration": 3.006735324859619 }, { "auxiliary_loss_clip": 0.01568449, "auxiliary_loss_mlp": 0.01049197, "balance_loss_clip": 1.35650826, "balance_loss_mlp": 1.02180254, "epoch": 0.20868781001052156, "flos": 25568098099200.0, "grad_norm": 1.7531097033802499, "language_loss": 0.76274133, "learning_rate": 3.674304927640011e-06, "loss": 0.78891778, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.27416992, "step": 3471, "time_per_iteration": 2.876418352127075 }, { "auxiliary_loss_clip": 0.0160164, "auxiliary_loss_mlp": 0.01057172, "balance_loss_clip": 1.37876439, "balance_loss_mlp": 1.02970624, "epoch": 0.20874793326318955, "flos": 27541248145920.0, "grad_norm": 2.0526749794347046, "language_loss": 0.76658249, "learning_rate": 3.67409187219312e-06, "loss": 0.79317063, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 2.2265625, "router_z_loss_mlp": 0.27490234, "step": 3472, "time_per_iteration": 2.9014899730682373 }, { "auxiliary_loss_clip": 0.01574532, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.36149693, "balance_loss_mlp": 1.02019358, "epoch": 0.20880805651585752, "flos": 18557249437440.0, "grad_norm": 2.1459270630503733, "language_loss": 0.85590672, "learning_rate": 3.6738787532644966e-06, "loss": 0.88209903, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.24523926, "step": 3473, "time_per_iteration": 2.845736265182495 }, { "auxiliary_loss_clip": 0.01301817, "auxiliary_loss_mlp": 0.01031173, "balance_loss_clip": 1.17915881, "balance_loss_mlp": 1.00618637, "epoch": 0.20886817976852548, "flos": 65977605146880.0, "grad_norm": 0.8837869229572406, "language_loss": 0.63720304, "learning_rate": 3.6736655708622235e-06, "loss": 0.66053295, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.25, "step": 3474, "time_per_iteration": 3.3063149452209473 }, { "auxiliary_loss_clip": 0.01594132, "auxiliary_loss_mlp": 0.01047849, "balance_loss_clip": 1.37707818, "balance_loss_mlp": 1.021945, "epoch": 0.20892830302119345, "flos": 36553054423680.0, "grad_norm": 2.204921392941378, "language_loss": 0.71433204, "learning_rate": 3.6734523249943844e-06, "loss": 0.74075186, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2590332, "step": 3475, "time_per_iteration": 4.432063579559326 }, { "auxiliary_loss_clip": 0.01597499, "auxiliary_loss_mlp": 0.01050006, "balance_loss_clip": 1.38082993, "balance_loss_mlp": 1.02563941, "epoch": 0.2089884262738614, "flos": 20965800625920.0, "grad_norm": 1.4856491517583763, "language_loss": 0.71122444, "learning_rate": 3.673239015669065e-06, "loss": 0.73769945, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24353027, "step": 3476, "time_per_iteration": 2.982536792755127 }, { "auxiliary_loss_clip": 0.01582436, "auxiliary_loss_mlp": 0.01055747, "balance_loss_clip": 1.36985803, "balance_loss_mlp": 1.02961612, "epoch": 0.20904854952652938, "flos": 22794329733120.0, "grad_norm": 1.7864167679467173, "language_loss": 0.90042526, "learning_rate": 3.6730256428943544e-06, "loss": 0.92680717, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.26135254, "step": 3477, "time_per_iteration": 5.613992929458618 }, { "auxiliary_loss_clip": 0.01579775, "auxiliary_loss_mlp": 0.01055212, "balance_loss_clip": 1.36698556, "balance_loss_mlp": 1.03245544, "epoch": 0.20910867277919734, "flos": 27313457984640.0, "grad_norm": 2.40986115101093, "language_loss": 0.68918264, "learning_rate": 3.672812206678344e-06, "loss": 0.71553254, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.22753906, "step": 3478, "time_per_iteration": 2.8909878730773926 }, { "auxiliary_loss_clip": 0.01601871, "auxiliary_loss_mlp": 0.01058144, "balance_loss_clip": 1.38493633, "balance_loss_mlp": 1.03291917, "epoch": 0.20916879603186533, "flos": 14327317820160.0, "grad_norm": 2.0379919708209533, "language_loss": 0.85422349, "learning_rate": 3.672598707029127e-06, "loss": 0.88082361, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.25231934, "step": 3479, "time_per_iteration": 2.8117213249206543 }, { "auxiliary_loss_clip": 0.01622109, "auxiliary_loss_mlp": 0.01064644, "balance_loss_clip": 1.40176225, "balance_loss_mlp": 1.03883505, "epoch": 0.2092289192845333, "flos": 22283134272000.0, "grad_norm": 2.229480018969664, "language_loss": 0.75258768, "learning_rate": 3.6723851439548003e-06, "loss": 0.77945518, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.25817871, "step": 3480, "time_per_iteration": 2.8442554473876953 }, { "auxiliary_loss_clip": 0.01595128, "auxiliary_loss_mlp": 0.0106301, "balance_loss_clip": 1.37992084, "balance_loss_mlp": 1.03796363, "epoch": 0.20928904253720126, "flos": 14839101463680.0, "grad_norm": 2.326773163435039, "language_loss": 0.77308244, "learning_rate": 3.67217151746346e-06, "loss": 0.79966378, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.25073242, "step": 3481, "time_per_iteration": 2.9033396244049072 }, { "auxiliary_loss_clip": 0.01600601, "auxiliary_loss_mlp": 0.01067667, "balance_loss_clip": 1.38445234, "balance_loss_mlp": 1.04357517, "epoch": 0.20934916578986923, "flos": 23269799784960.0, "grad_norm": 1.696646483573241, "language_loss": 0.85875076, "learning_rate": 3.671957827563209e-06, "loss": 0.88543344, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.2409668, "step": 3482, "time_per_iteration": 2.9559326171875 }, { "auxiliary_loss_clip": 0.01614426, "auxiliary_loss_mlp": 0.01058478, "balance_loss_clip": 1.39730644, "balance_loss_mlp": 1.03408766, "epoch": 0.2094092890425372, "flos": 32026958472960.0, "grad_norm": 2.5832225196608833, "language_loss": 0.73108906, "learning_rate": 3.6717440742621494e-06, "loss": 0.7578181, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24401855, "step": 3483, "time_per_iteration": 2.978679656982422 }, { "auxiliary_loss_clip": 0.01632857, "auxiliary_loss_mlp": 0.01071987, "balance_loss_clip": 1.41006529, "balance_loss_mlp": 1.04694116, "epoch": 0.20946941229520516, "flos": 20020108919040.0, "grad_norm": 1.9102037208420837, "language_loss": 0.75814462, "learning_rate": 3.6715302575683865e-06, "loss": 0.78519309, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.25085449, "step": 3484, "time_per_iteration": 2.9299166202545166 }, { "auxiliary_loss_clip": 0.01622443, "auxiliary_loss_mlp": 0.01059284, "balance_loss_clip": 1.40380394, "balance_loss_mlp": 1.03469157, "epoch": 0.20952953554787315, "flos": 30752996607360.0, "grad_norm": 1.5675497283726032, "language_loss": 0.71051073, "learning_rate": 3.6713163774900292e-06, "loss": 0.73732793, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24609375, "step": 3485, "time_per_iteration": 3.113206148147583 }, { "auxiliary_loss_clip": 0.01633383, "auxiliary_loss_mlp": 0.01074511, "balance_loss_clip": 1.40995705, "balance_loss_mlp": 1.04799855, "epoch": 0.20958965880054112, "flos": 27059670046080.0, "grad_norm": 1.8766670558984113, "language_loss": 0.83890104, "learning_rate": 3.6711024340351875e-06, "loss": 0.86598003, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.26525879, "step": 3486, "time_per_iteration": 2.933392286300659 }, { "auxiliary_loss_clip": 0.01610553, "auxiliary_loss_mlp": 0.01065502, "balance_loss_clip": 1.3928839, "balance_loss_mlp": 1.04189825, "epoch": 0.20964978205320908, "flos": 34217401887360.0, "grad_norm": 1.7770383573052646, "language_loss": 0.88052475, "learning_rate": 3.6708884272119737e-06, "loss": 0.90728533, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 2.17578125, "router_z_loss_mlp": 0.23596191, "step": 3487, "time_per_iteration": 2.9854443073272705 }, { "auxiliary_loss_clip": 0.01608429, "auxiliary_loss_mlp": 0.01063088, "balance_loss_clip": 1.39152384, "balance_loss_mlp": 1.03925824, "epoch": 0.20970990530587705, "flos": 23487907559040.0, "grad_norm": 2.815392010081029, "language_loss": 0.73144519, "learning_rate": 3.670674357028504e-06, "loss": 0.75816035, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.23828125, "step": 3488, "time_per_iteration": 2.88979172706604 }, { "auxiliary_loss_clip": 0.01610514, "auxiliary_loss_mlp": 0.01066098, "balance_loss_clip": 1.39491677, "balance_loss_mlp": 1.04220796, "epoch": 0.209770028558545, "flos": 18560597552640.0, "grad_norm": 8.835210202889051, "language_loss": 0.81599128, "learning_rate": 3.6704602234928945e-06, "loss": 0.84275734, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.23901367, "step": 3489, "time_per_iteration": 2.8530938625335693 }, { "auxiliary_loss_clip": 0.01622772, "auxiliary_loss_mlp": 0.01059693, "balance_loss_clip": 1.40302742, "balance_loss_mlp": 1.03513539, "epoch": 0.20983015181121298, "flos": 21627136892160.0, "grad_norm": 1.9792191495799596, "language_loss": 0.73740691, "learning_rate": 3.670246026613266e-06, "loss": 0.76423156, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.24536133, "step": 3490, "time_per_iteration": 2.8296351432800293 }, { "auxiliary_loss_clip": 0.01612006, "auxiliary_loss_mlp": 0.01056729, "balance_loss_clip": 1.40089011, "balance_loss_mlp": 1.03332853, "epoch": 0.20989027506388094, "flos": 16622584732800.0, "grad_norm": 2.567326456785195, "language_loss": 0.71212959, "learning_rate": 3.6700317663977415e-06, "loss": 0.73881692, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.23376465, "step": 3491, "time_per_iteration": 2.8325035572052 }, { "auxiliary_loss_clip": 0.01621829, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.40133989, "balance_loss_mlp": 1.02786076, "epoch": 0.20995039831654894, "flos": 23225930311680.0, "grad_norm": 3.621408117576884, "language_loss": 0.80587894, "learning_rate": 3.669817442854444e-06, "loss": 0.83262461, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 0.24902344, "step": 3492, "time_per_iteration": 2.8666093349456787 }, { "auxiliary_loss_clip": 0.01606245, "auxiliary_loss_mlp": 0.01047689, "balance_loss_clip": 1.39156723, "balance_loss_mlp": 1.02370358, "epoch": 0.2100105215692169, "flos": 18155854402560.0, "grad_norm": 1.8439052769180586, "language_loss": 0.87604666, "learning_rate": 3.669603055991502e-06, "loss": 0.90258598, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2401123, "step": 3493, "time_per_iteration": 2.825314998626709 }, { "auxiliary_loss_clip": 0.01588111, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.3790226, "balance_loss_mlp": 1.02106023, "epoch": 0.21007064482188487, "flos": 15970252181760.0, "grad_norm": 1.6635366680287995, "language_loss": 0.7081722, "learning_rate": 3.6693886058170455e-06, "loss": 0.73449856, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.23461914, "step": 3494, "time_per_iteration": 2.8576161861419678 }, { "auxiliary_loss_clip": 0.01638385, "auxiliary_loss_mlp": 0.01046959, "balance_loss_clip": 1.41564059, "balance_loss_mlp": 1.02281868, "epoch": 0.21013076807455283, "flos": 32247192752640.0, "grad_norm": 1.6782030846400946, "language_loss": 0.79556483, "learning_rate": 3.6691740923392053e-06, "loss": 0.82241827, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 2.23046875, "router_z_loss_mlp": 0.24145508, "step": 3495, "time_per_iteration": 2.9335973262786865 }, { "auxiliary_loss_clip": 0.01619292, "auxiliary_loss_mlp": 0.0105216, "balance_loss_clip": 1.40227365, "balance_loss_mlp": 1.02739978, "epoch": 0.2101908913272208, "flos": 23707282187520.0, "grad_norm": 1.5612574830074857, "language_loss": 0.77909982, "learning_rate": 3.668959515566116e-06, "loss": 0.80581439, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.24743652, "step": 3496, "time_per_iteration": 2.9463846683502197 }, { "auxiliary_loss_clip": 0.01624157, "auxiliary_loss_mlp": 0.01049636, "balance_loss_clip": 1.40591598, "balance_loss_mlp": 1.02474523, "epoch": 0.21025101457988876, "flos": 20385416586240.0, "grad_norm": 5.154272264854059, "language_loss": 0.82637465, "learning_rate": 3.668744875505915e-06, "loss": 0.85311264, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24914551, "step": 3497, "time_per_iteration": 2.8832790851593018 }, { "auxiliary_loss_clip": 0.01631579, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.40826488, "balance_loss_mlp": 1.02305448, "epoch": 0.21031113783255675, "flos": 25786658321280.0, "grad_norm": 3.9267061128959067, "language_loss": 0.68484813, "learning_rate": 3.668530172166741e-06, "loss": 0.71165615, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 2.23242188, "router_z_loss_mlp": 0.26184082, "step": 3498, "time_per_iteration": 2.9090375900268555 }, { "auxiliary_loss_clip": 0.0163645, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.41190791, "balance_loss_mlp": 1.02144802, "epoch": 0.21037126108522472, "flos": 22028396192640.0, "grad_norm": 1.9131055775479922, "language_loss": 0.82330751, "learning_rate": 3.6683154055567352e-06, "loss": 0.85014945, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 2.24804688, "router_z_loss_mlp": 0.26318359, "step": 3499, "time_per_iteration": 2.8462469577789307 }, { "auxiliary_loss_clip": 0.01594628, "auxiliary_loss_mlp": 0.0104597, "balance_loss_clip": 1.38113487, "balance_loss_mlp": 1.0221045, "epoch": 0.21043138433789269, "flos": 25344425214720.0, "grad_norm": 1.5147462120788404, "language_loss": 0.791022, "learning_rate": 3.668100575684043e-06, "loss": 0.81742799, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.23864746, "step": 3500, "time_per_iteration": 2.860731363296509 }, { "auxiliary_loss_clip": 0.01608012, "auxiliary_loss_mlp": 0.01042763, "balance_loss_clip": 1.39095616, "balance_loss_mlp": 1.01824152, "epoch": 0.21049150759056065, "flos": 25567600406400.0, "grad_norm": 1.490225138089557, "language_loss": 0.75115263, "learning_rate": 3.6678856825568094e-06, "loss": 0.77766037, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.2454834, "step": 3501, "time_per_iteration": 2.8910651206970215 }, { "auxiliary_loss_clip": 0.01597083, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.38256061, "balance_loss_mlp": 1.01805973, "epoch": 0.21055163084322862, "flos": 24505502532480.0, "grad_norm": 1.8392569566153247, "language_loss": 0.76333463, "learning_rate": 3.667670726183183e-06, "loss": 0.789747, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.26098633, "step": 3502, "time_per_iteration": 2.8671019077301025 }, { "auxiliary_loss_clip": 0.01595, "auxiliary_loss_mlp": 0.01038743, "balance_loss_clip": 1.38082588, "balance_loss_mlp": 1.01441216, "epoch": 0.21061175409589658, "flos": 25750028016000.0, "grad_norm": 1.9442301028509381, "language_loss": 0.77892268, "learning_rate": 3.667455706571316e-06, "loss": 0.80526006, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24328613, "step": 3503, "time_per_iteration": 4.338297367095947 }, { "auxiliary_loss_clip": 0.01626891, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.40275407, "balance_loss_mlp": 1.0259409, "epoch": 0.21067187734856455, "flos": 18998396668800.0, "grad_norm": 2.137116509951048, "language_loss": 0.80015254, "learning_rate": 3.6672406237293617e-06, "loss": 0.8269453, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 2.24414062, "router_z_loss_mlp": 0.26477051, "step": 3504, "time_per_iteration": 2.8929390907287598 }, { "auxiliary_loss_clip": 0.0162298, "auxiliary_loss_mlp": 0.0104858, "balance_loss_clip": 1.39946198, "balance_loss_mlp": 1.02252054, "epoch": 0.21073200060123254, "flos": 24692183153280.0, "grad_norm": 1.4355710442292404, "language_loss": 0.77748561, "learning_rate": 3.6670254776654754e-06, "loss": 0.80420119, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 2.23828125, "router_z_loss_mlp": 0.26074219, "step": 3505, "time_per_iteration": 2.8696703910827637 }, { "auxiliary_loss_clip": 0.01588001, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.37700319, "balance_loss_mlp": 1.02318537, "epoch": 0.2107921238539005, "flos": 28561286338560.0, "grad_norm": 1.6056810631882448, "language_loss": 0.63848633, "learning_rate": 3.6668102683878163e-06, "loss": 0.66484594, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24780273, "step": 3506, "time_per_iteration": 2.9022254943847656 }, { "auxiliary_loss_clip": 0.0160299, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.38715816, "balance_loss_mlp": 1.02067029, "epoch": 0.21085224710656847, "flos": 25897092174720.0, "grad_norm": 1.6276633356893873, "language_loss": 0.82981437, "learning_rate": 3.6665949959045443e-06, "loss": 0.85629654, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.24584961, "step": 3507, "time_per_iteration": 2.932844877243042 }, { "auxiliary_loss_clip": 0.01592165, "auxiliary_loss_mlp": 0.01045425, "balance_loss_clip": 1.37726188, "balance_loss_mlp": 1.02062917, "epoch": 0.21091237035923643, "flos": 14984627299200.0, "grad_norm": 2.3467543470847296, "language_loss": 0.76895618, "learning_rate": 3.666379660223824e-06, "loss": 0.79533207, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.2479248, "step": 3508, "time_per_iteration": 2.837642192840576 }, { "auxiliary_loss_clip": 0.01604906, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.38636589, "balance_loss_mlp": 1.01686049, "epoch": 0.2109724936119044, "flos": 16371149523840.0, "grad_norm": 3.2327358738596232, "language_loss": 0.86295873, "learning_rate": 3.6661642613538192e-06, "loss": 0.88942564, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 2.18554688, "router_z_loss_mlp": 0.24914551, "step": 3509, "time_per_iteration": 2.8366899490356445 }, { "auxiliary_loss_clip": 0.01607103, "auxiliary_loss_mlp": 0.01044944, "balance_loss_clip": 1.38645077, "balance_loss_mlp": 1.01882565, "epoch": 0.21103261686457236, "flos": 31514631891840.0, "grad_norm": 1.7805922693776797, "language_loss": 0.68937206, "learning_rate": 3.6659487993026987e-06, "loss": 0.71589255, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26123047, "step": 3510, "time_per_iteration": 4.338866233825684 }, { "auxiliary_loss_clip": 0.01613645, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.39160967, "balance_loss_mlp": 1.01603127, "epoch": 0.21109274011724033, "flos": 27355381931520.0, "grad_norm": 1.7580826776592287, "language_loss": 0.73476881, "learning_rate": 3.6657332740786327e-06, "loss": 0.76130617, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 2.21875, "router_z_loss_mlp": 0.24072266, "step": 3511, "time_per_iteration": 2.8831405639648438 }, { "auxiliary_loss_clip": 0.01625148, "auxiliary_loss_mlp": 0.01049113, "balance_loss_clip": 1.40165734, "balance_loss_mlp": 1.02258945, "epoch": 0.21115286336990832, "flos": 17828986832640.0, "grad_norm": 2.9789023990048475, "language_loss": 0.71046019, "learning_rate": 3.665517685689794e-06, "loss": 0.73720276, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 2.234375, "router_z_loss_mlp": 0.26525879, "step": 3512, "time_per_iteration": 4.243883371353149 }, { "auxiliary_loss_clip": 0.01609222, "auxiliary_loss_mlp": 0.01048406, "balance_loss_clip": 1.39012742, "balance_loss_mlp": 1.02355075, "epoch": 0.2112129866225763, "flos": 27209222668800.0, "grad_norm": 2.4406001463792824, "language_loss": 0.74434853, "learning_rate": 3.6653020341443584e-06, "loss": 0.77092481, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.24865723, "step": 3513, "time_per_iteration": 2.941117286682129 }, { "auxiliary_loss_clip": 0.01585409, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.37246287, "balance_loss_mlp": 1.01802838, "epoch": 0.21127310987524425, "flos": 23741469273600.0, "grad_norm": 1.710699049417334, "language_loss": 0.74827337, "learning_rate": 3.665086319450502e-06, "loss": 0.77455992, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25219727, "step": 3514, "time_per_iteration": 2.8781352043151855 }, { "auxiliary_loss_clip": 0.01612381, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.39207721, "balance_loss_mlp": 1.0184691, "epoch": 0.21133323312791222, "flos": 18341584882560.0, "grad_norm": 1.7471150855919921, "language_loss": 0.77663159, "learning_rate": 3.6648705416164062e-06, "loss": 0.80318594, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.24584961, "step": 3515, "time_per_iteration": 2.8819997310638428 }, { "auxiliary_loss_clip": 0.01613535, "auxiliary_loss_mlp": 0.01045932, "balance_loss_clip": 1.39507246, "balance_loss_mlp": 1.02051663, "epoch": 0.21139335638058018, "flos": 17940189847680.0, "grad_norm": 1.7204826934822057, "language_loss": 0.69107807, "learning_rate": 3.6646547006502518e-06, "loss": 0.71767271, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.25439453, "step": 3516, "time_per_iteration": 2.824127674102783 }, { "auxiliary_loss_clip": 0.01596081, "auxiliary_loss_mlp": 0.01052561, "balance_loss_clip": 1.37701166, "balance_loss_mlp": 1.02561975, "epoch": 0.21145347963324815, "flos": 24582925664640.0, "grad_norm": 1.6369528189905638, "language_loss": 0.85576469, "learning_rate": 3.664438796560225e-06, "loss": 0.88225114, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.26965332, "step": 3517, "time_per_iteration": 2.863694667816162 }, { "auxiliary_loss_clip": 0.01592952, "auxiliary_loss_mlp": 0.0104105, "balance_loss_clip": 1.37450719, "balance_loss_mlp": 1.01637423, "epoch": 0.21151360288591614, "flos": 35859657576960.0, "grad_norm": 1.7023532740344416, "language_loss": 0.64183241, "learning_rate": 3.664222829354512e-06, "loss": 0.66817242, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.24707031, "step": 3518, "time_per_iteration": 2.924001932144165 }, { "auxiliary_loss_clip": 0.01596346, "auxiliary_loss_mlp": 0.01048961, "balance_loss_clip": 1.38104558, "balance_loss_mlp": 1.02408171, "epoch": 0.2115737261385841, "flos": 24651526060800.0, "grad_norm": 1.7994553508380786, "language_loss": 0.90247154, "learning_rate": 3.664006799041303e-06, "loss": 0.92892456, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.24902344, "step": 3519, "time_per_iteration": 2.8809826374053955 }, { "auxiliary_loss_clip": 0.01603211, "auxiliary_loss_mlp": 0.01051682, "balance_loss_clip": 1.3835243, "balance_loss_mlp": 1.02477658, "epoch": 0.21163384939125207, "flos": 25237384721280.0, "grad_norm": 1.7498774942476862, "language_loss": 0.82515794, "learning_rate": 3.6637907056287886e-06, "loss": 0.85170686, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26904297, "step": 3520, "time_per_iteration": 2.8607378005981445 }, { "auxiliary_loss_clip": 0.01574699, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.3629936, "balance_loss_mlp": 1.01882255, "epoch": 0.21169397264392004, "flos": 26078614888320.0, "grad_norm": 1.6677683836674955, "language_loss": 0.76997483, "learning_rate": 3.6635745491251642e-06, "loss": 0.79616594, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.25610352, "step": 3521, "time_per_iteration": 2.859482526779175 }, { "auxiliary_loss_clip": 0.01584429, "auxiliary_loss_mlp": 0.01049864, "balance_loss_clip": 1.3694905, "balance_loss_mlp": 1.0253787, "epoch": 0.211754095896588, "flos": 23117577719040.0, "grad_norm": 4.116591803440022, "language_loss": 0.767694, "learning_rate": 3.663358329538626e-06, "loss": 0.79403692, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.24499512, "step": 3522, "time_per_iteration": 2.9286365509033203 }, { "auxiliary_loss_clip": 0.01592193, "auxiliary_loss_mlp": 0.0105444, "balance_loss_clip": 1.37383842, "balance_loss_mlp": 1.02671206, "epoch": 0.21181421914925597, "flos": 27932372611200.0, "grad_norm": 2.3299385700694266, "language_loss": 0.7130096, "learning_rate": 3.663142046877374e-06, "loss": 0.73947585, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 2.18359375, "router_z_loss_mlp": 0.27734375, "step": 3523, "time_per_iteration": 2.867607593536377 }, { "auxiliary_loss_clip": 0.0160601, "auxiliary_loss_mlp": 0.01046652, "balance_loss_clip": 1.38733816, "balance_loss_mlp": 1.02070022, "epoch": 0.21187434240192393, "flos": 17137445022720.0, "grad_norm": 2.1416928345858004, "language_loss": 0.78649628, "learning_rate": 3.6629257011496085e-06, "loss": 0.81302285, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.25952148, "step": 3524, "time_per_iteration": 2.828070878982544 }, { "auxiliary_loss_clip": 0.01609367, "auxiliary_loss_mlp": 0.01043666, "balance_loss_clip": 1.3862679, "balance_loss_mlp": 1.01763105, "epoch": 0.21193446565459192, "flos": 22357797471360.0, "grad_norm": 1.8285900966656392, "language_loss": 0.82587856, "learning_rate": 3.6627092923635338e-06, "loss": 0.85240889, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 2.22851562, "router_z_loss_mlp": 0.26037598, "step": 3525, "time_per_iteration": 2.864640712738037 }, { "auxiliary_loss_clip": 0.01595902, "auxiliary_loss_mlp": 0.01043031, "balance_loss_clip": 1.38002491, "balance_loss_mlp": 1.01767492, "epoch": 0.2119945889072599, "flos": 27210896726400.0, "grad_norm": 1.7822670062951536, "language_loss": 0.75673151, "learning_rate": 3.662492820527356e-06, "loss": 0.78312081, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.25341797, "step": 3526, "time_per_iteration": 2.8851542472839355 }, { "auxiliary_loss_clip": 0.0160026, "auxiliary_loss_mlp": 0.0104083, "balance_loss_clip": 1.38109815, "balance_loss_mlp": 1.01473475, "epoch": 0.21205471215992786, "flos": 21000575894400.0, "grad_norm": 1.74345902127393, "language_loss": 0.78210163, "learning_rate": 3.662276285649284e-06, "loss": 0.80851257, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26123047, "step": 3527, "time_per_iteration": 2.9474925994873047 }, { "auxiliary_loss_clip": 0.01601548, "auxiliary_loss_mlp": 0.01047434, "balance_loss_clip": 1.38534713, "balance_loss_mlp": 1.02114844, "epoch": 0.21211483541259582, "flos": 20787535537920.0, "grad_norm": 1.5656628358249758, "language_loss": 0.79128903, "learning_rate": 3.662059687737528e-06, "loss": 0.81777883, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26306152, "step": 3528, "time_per_iteration": 3.0328245162963867 }, { "auxiliary_loss_clip": 0.01586257, "auxiliary_loss_mlp": 0.01050214, "balance_loss_clip": 1.37186694, "balance_loss_mlp": 1.02514458, "epoch": 0.21217495866526379, "flos": 18999437299200.0, "grad_norm": 1.6608645527152934, "language_loss": 0.8202008, "learning_rate": 3.6618430268003024e-06, "loss": 0.84656549, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25085449, "step": 3529, "time_per_iteration": 2.961333751678467 }, { "auxiliary_loss_clip": 0.01600479, "auxiliary_loss_mlp": 0.01054678, "balance_loss_clip": 1.37992382, "balance_loss_mlp": 1.0279876, "epoch": 0.21223508191793175, "flos": 20676920705280.0, "grad_norm": 2.3878362373181052, "language_loss": 0.78481841, "learning_rate": 3.6616263028458235e-06, "loss": 0.81137002, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26721191, "step": 3530, "time_per_iteration": 3.006397247314453 }, { "auxiliary_loss_clip": 0.01594714, "auxiliary_loss_mlp": 0.0104719, "balance_loss_clip": 1.37973571, "balance_loss_mlp": 1.02150011, "epoch": 0.21229520517059972, "flos": 21626458220160.0, "grad_norm": 4.772064312878051, "language_loss": 0.84751761, "learning_rate": 3.661409515882308e-06, "loss": 0.87393665, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25671387, "step": 3531, "time_per_iteration": 2.8433279991149902 }, { "auxiliary_loss_clip": 0.01605111, "auxiliary_loss_mlp": 0.01047232, "balance_loss_clip": 1.3848902, "balance_loss_mlp": 1.01986182, "epoch": 0.2123553284232677, "flos": 13999092906240.0, "grad_norm": 2.363788024184872, "language_loss": 0.74933153, "learning_rate": 3.661192665917977e-06, "loss": 0.77585495, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 2.20117188, "router_z_loss_mlp": 0.27392578, "step": 3532, "time_per_iteration": 2.843433380126953 }, { "auxiliary_loss_clip": 0.01592506, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.37631714, "balance_loss_mlp": 1.01908875, "epoch": 0.21241545167593567, "flos": 18306040452480.0, "grad_norm": 1.7630882461947641, "language_loss": 0.74979436, "learning_rate": 3.660975752961054e-06, "loss": 0.7761628, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25256348, "step": 3533, "time_per_iteration": 2.822777032852173 }, { "auxiliary_loss_clip": 0.01610223, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.38916183, "balance_loss_mlp": 1.02020168, "epoch": 0.21247557492860364, "flos": 34726063639680.0, "grad_norm": 2.24192967123436, "language_loss": 0.72468746, "learning_rate": 3.6607587770197634e-06, "loss": 0.75124407, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.25219727, "step": 3534, "time_per_iteration": 3.0029408931732178 }, { "auxiliary_loss_clip": 0.0160068, "auxiliary_loss_mlp": 0.01047396, "balance_loss_clip": 1.38212395, "balance_loss_mlp": 1.02266049, "epoch": 0.2125356981812716, "flos": 22063804888320.0, "grad_norm": 1.8614097300370627, "language_loss": 0.7315675, "learning_rate": 3.6605417381023346e-06, "loss": 0.75804824, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.24731445, "step": 3535, "time_per_iteration": 2.874300479888916 }, { "auxiliary_loss_clip": 0.01598754, "auxiliary_loss_mlp": 0.01052757, "balance_loss_clip": 1.3824966, "balance_loss_mlp": 1.02684128, "epoch": 0.21259582143393957, "flos": 28560019484160.0, "grad_norm": 2.0171326098589946, "language_loss": 0.71203214, "learning_rate": 3.660324636216996e-06, "loss": 0.73854727, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25915527, "step": 3536, "time_per_iteration": 2.9106850624084473 }, { "auxiliary_loss_clip": 0.01599832, "auxiliary_loss_mlp": 0.01048906, "balance_loss_clip": 1.38078785, "balance_loss_mlp": 1.02378881, "epoch": 0.21265594468660753, "flos": 20130588017280.0, "grad_norm": 1.7489473813769747, "language_loss": 0.88243389, "learning_rate": 3.660107471371981e-06, "loss": 0.90892136, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.25109863, "step": 3537, "time_per_iteration": 2.888338088989258 }, { "auxiliary_loss_clip": 0.01588907, "auxiliary_loss_mlp": 0.01048494, "balance_loss_clip": 1.37448454, "balance_loss_mlp": 1.02347195, "epoch": 0.21271606793927553, "flos": 23086603013760.0, "grad_norm": 1.6665571130565497, "language_loss": 0.81713498, "learning_rate": 3.659890243575524e-06, "loss": 0.84350896, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25024414, "step": 3538, "time_per_iteration": 4.356805086135864 }, { "auxiliary_loss_clip": 0.01583812, "auxiliary_loss_mlp": 0.0105418, "balance_loss_clip": 1.36951447, "balance_loss_mlp": 1.02901459, "epoch": 0.2127761911919435, "flos": 26397383639040.0, "grad_norm": 1.6562930289070181, "language_loss": 0.88398004, "learning_rate": 3.659672952835863e-06, "loss": 0.91035998, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25195312, "step": 3539, "time_per_iteration": 2.89227557182312 }, { "auxiliary_loss_clip": 0.01593713, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.37602496, "balance_loss_mlp": 1.03110075, "epoch": 0.21283631444461146, "flos": 20237447531520.0, "grad_norm": 2.033168966038709, "language_loss": 0.59025502, "learning_rate": 3.659455599161237e-06, "loss": 0.61676574, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.26281738, "step": 3540, "time_per_iteration": 2.8385040760040283 }, { "auxiliary_loss_clip": 0.01592498, "auxiliary_loss_mlp": 0.01049495, "balance_loss_clip": 1.37539446, "balance_loss_mlp": 1.02462816, "epoch": 0.21289643769727942, "flos": 13524980198400.0, "grad_norm": 2.314725713029893, "language_loss": 0.77262658, "learning_rate": 3.659238182559888e-06, "loss": 0.79904652, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.24841309, "step": 3541, "time_per_iteration": 2.9411773681640625 }, { "auxiliary_loss_clip": 0.01587378, "auxiliary_loss_mlp": 0.01050375, "balance_loss_clip": 1.37440419, "balance_loss_mlp": 1.02535343, "epoch": 0.2129565609499474, "flos": 24837799478400.0, "grad_norm": 2.3855973139699027, "language_loss": 0.7042101, "learning_rate": 3.6590207030400615e-06, "loss": 0.73058766, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25036621, "step": 3542, "time_per_iteration": 2.939868688583374 }, { "auxiliary_loss_clip": 0.0159153, "auxiliary_loss_mlp": 0.0104937, "balance_loss_clip": 1.37846398, "balance_loss_mlp": 1.02480125, "epoch": 0.21301668420261535, "flos": 23669656496640.0, "grad_norm": 1.7017816288046186, "language_loss": 0.76849282, "learning_rate": 3.658803160610004e-06, "loss": 0.79490185, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24572754, "step": 3543, "time_per_iteration": 2.861057996749878 }, { "auxiliary_loss_clip": 0.01589556, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.37518907, "balance_loss_mlp": 1.02264011, "epoch": 0.21307680745528332, "flos": 16371511482240.0, "grad_norm": 1.8250785780890129, "language_loss": 0.67554438, "learning_rate": 3.6585855552779634e-06, "loss": 0.70192486, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25891113, "step": 3544, "time_per_iteration": 2.8570363521575928 }, { "auxiliary_loss_clip": 0.01577763, "auxiliary_loss_mlp": 0.0105253, "balance_loss_clip": 1.364959, "balance_loss_mlp": 1.02726972, "epoch": 0.2131369307079513, "flos": 19108423319040.0, "grad_norm": 1.5934270387805176, "language_loss": 0.71300721, "learning_rate": 3.6583678870521934e-06, "loss": 0.73931015, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25280762, "step": 3545, "time_per_iteration": 2.8272645473480225 }, { "auxiliary_loss_clip": 0.01603803, "auxiliary_loss_mlp": 0.01051534, "balance_loss_clip": 1.38572693, "balance_loss_mlp": 1.02722692, "epoch": 0.21319705396061928, "flos": 30384340824960.0, "grad_norm": 1.749022083145176, "language_loss": 0.73032147, "learning_rate": 3.658150155940946e-06, "loss": 0.75687486, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.24304199, "step": 3546, "time_per_iteration": 4.282550811767578 }, { "auxiliary_loss_clip": 0.01609174, "auxiliary_loss_mlp": 0.01054754, "balance_loss_clip": 1.39090848, "balance_loss_mlp": 1.0291841, "epoch": 0.21325717721328724, "flos": 21764609153280.0, "grad_norm": 1.8629614973496167, "language_loss": 0.80669463, "learning_rate": 3.657932361952479e-06, "loss": 0.83333397, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.25585938, "step": 3547, "time_per_iteration": 4.317594528198242 }, { "auxiliary_loss_clip": 0.01582162, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.36335969, "balance_loss_mlp": 1.02033627, "epoch": 0.2133173004659552, "flos": 28742628072960.0, "grad_norm": 3.0369105610627534, "language_loss": 0.76639795, "learning_rate": 3.6577145050950504e-06, "loss": 0.79267395, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 0.25109863, "step": 3548, "time_per_iteration": 4.370636701583862 }, { "auxiliary_loss_clip": 0.01600879, "auxiliary_loss_mlp": 0.01056087, "balance_loss_clip": 1.38006163, "balance_loss_mlp": 1.02959895, "epoch": 0.21337742371862317, "flos": 16845895658880.0, "grad_norm": 1.9232872134587669, "language_loss": 0.74692184, "learning_rate": 3.657496585376922e-06, "loss": 0.7734915, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.26513672, "step": 3549, "time_per_iteration": 2.852412700653076 }, { "auxiliary_loss_clip": 0.01598084, "auxiliary_loss_mlp": 0.01049061, "balance_loss_clip": 1.38273752, "balance_loss_mlp": 1.0244925, "epoch": 0.21343754697129114, "flos": 24435409057920.0, "grad_norm": 1.6190791493447203, "language_loss": 0.8183859, "learning_rate": 3.657278602806357e-06, "loss": 0.84485739, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.24560547, "step": 3550, "time_per_iteration": 2.844397783279419 }, { "auxiliary_loss_clip": 0.01560779, "auxiliary_loss_mlp": 0.0104666, "balance_loss_clip": 1.35312152, "balance_loss_mlp": 1.02341402, "epoch": 0.21349767022395913, "flos": 19286281203840.0, "grad_norm": 1.581824519582713, "language_loss": 0.88346559, "learning_rate": 3.657060557391621e-06, "loss": 0.90954, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.23242188, "step": 3551, "time_per_iteration": 2.8811280727386475 }, { "auxiliary_loss_clip": 0.01576837, "auxiliary_loss_mlp": 0.01046372, "balance_loss_clip": 1.36579359, "balance_loss_mlp": 1.02129006, "epoch": 0.2135577934766271, "flos": 17356412448000.0, "grad_norm": 1.782918639996052, "language_loss": 0.84069085, "learning_rate": 3.656842449140983e-06, "loss": 0.86692297, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25109863, "step": 3552, "time_per_iteration": 2.860135316848755 }, { "auxiliary_loss_clip": 0.01574883, "auxiliary_loss_mlp": 0.01053348, "balance_loss_clip": 1.36339283, "balance_loss_mlp": 1.02887464, "epoch": 0.21361791672929506, "flos": 24066843765120.0, "grad_norm": 1.7336998963771433, "language_loss": 0.77323949, "learning_rate": 3.656624278062713e-06, "loss": 0.7995218, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24475098, "step": 3553, "time_per_iteration": 2.8982932567596436 }, { "auxiliary_loss_clip": 0.01555221, "auxiliary_loss_mlp": 0.01051444, "balance_loss_clip": 1.3449403, "balance_loss_mlp": 1.0281744, "epoch": 0.21367803998196302, "flos": 22172066991360.0, "grad_norm": 2.2388048075564924, "language_loss": 0.73559529, "learning_rate": 3.6564060441650843e-06, "loss": 0.76166189, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23278809, "step": 3554, "time_per_iteration": 2.911975860595703 }, { "auxiliary_loss_clip": 0.01589831, "auxiliary_loss_mlp": 0.0104841, "balance_loss_clip": 1.37454462, "balance_loss_mlp": 1.02289927, "epoch": 0.213738163234631, "flos": 20896702536960.0, "grad_norm": 1.8149443956057874, "language_loss": 0.68795967, "learning_rate": 3.6561877474563724e-06, "loss": 0.714342, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25524902, "step": 3555, "time_per_iteration": 2.8772294521331787 }, { "auxiliary_loss_clip": 0.01574632, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.36138713, "balance_loss_mlp": 1.021523, "epoch": 0.21379828648729896, "flos": 28414946096640.0, "grad_norm": 1.7479400012686521, "language_loss": 0.6647988, "learning_rate": 3.6559693879448553e-06, "loss": 0.69100738, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.24694824, "step": 3556, "time_per_iteration": 2.9523372650146484 }, { "auxiliary_loss_clip": 0.01561764, "auxiliary_loss_mlp": 0.01049355, "balance_loss_clip": 1.35057902, "balance_loss_mlp": 1.0235461, "epoch": 0.21385840973996692, "flos": 25489543847040.0, "grad_norm": 1.9068913549539808, "language_loss": 0.73414379, "learning_rate": 3.6557509656388125e-06, "loss": 0.76025498, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.25805664, "step": 3557, "time_per_iteration": 2.905759811401367 }, { "auxiliary_loss_clip": 0.01578956, "auxiliary_loss_mlp": 0.01046947, "balance_loss_clip": 1.36361456, "balance_loss_mlp": 1.01857519, "epoch": 0.2139185329926349, "flos": 28085906776320.0, "grad_norm": 1.7161115953673018, "language_loss": 0.68552649, "learning_rate": 3.655532480546528e-06, "loss": 0.7117855, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.28369141, "step": 3558, "time_per_iteration": 2.9231388568878174 }, { "auxiliary_loss_clip": 0.01577167, "auxiliary_loss_mlp": 0.01046343, "balance_loss_clip": 1.35813653, "balance_loss_mlp": 1.01981878, "epoch": 0.21397865624530288, "flos": 19617899477760.0, "grad_norm": 1.7470466078190239, "language_loss": 0.81468153, "learning_rate": 3.655313932676286e-06, "loss": 0.84091657, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26550293, "step": 3559, "time_per_iteration": 2.830726146697998 }, { "auxiliary_loss_clip": 0.01556485, "auxiliary_loss_mlp": 0.01043462, "balance_loss_clip": 1.34616137, "balance_loss_mlp": 1.0195483, "epoch": 0.21403877949797084, "flos": 24692318887680.0, "grad_norm": 1.600755865953125, "language_loss": 0.68583149, "learning_rate": 3.655095322036373e-06, "loss": 0.71183097, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23925781, "step": 3560, "time_per_iteration": 2.883713722229004 }, { "auxiliary_loss_clip": 0.01570302, "auxiliary_loss_mlp": 0.01045478, "balance_loss_clip": 1.35480165, "balance_loss_mlp": 1.01993132, "epoch": 0.2140989027506388, "flos": 19869651400320.0, "grad_norm": 2.473314289049543, "language_loss": 0.73953652, "learning_rate": 3.65487664863508e-06, "loss": 0.76569432, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.2557373, "step": 3561, "time_per_iteration": 2.8458545207977295 }, { "auxiliary_loss_clip": 0.01574025, "auxiliary_loss_mlp": 0.01050214, "balance_loss_clip": 1.36071134, "balance_loss_mlp": 1.02391624, "epoch": 0.21415902600330677, "flos": 19144646421120.0, "grad_norm": 2.154303731695337, "language_loss": 0.79401881, "learning_rate": 3.654657912480698e-06, "loss": 0.82026118, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.26306152, "step": 3562, "time_per_iteration": 2.82829213142395 }, { "auxiliary_loss_clip": 0.01560826, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.35008192, "balance_loss_mlp": 1.02317238, "epoch": 0.21421914925597474, "flos": 22282636579200.0, "grad_norm": 1.6952815700046682, "language_loss": 0.85330123, "learning_rate": 3.6544391135815237e-06, "loss": 0.87940633, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.26513672, "step": 3563, "time_per_iteration": 2.8837172985076904 }, { "auxiliary_loss_clip": 0.01570874, "auxiliary_loss_mlp": 0.0104563, "balance_loss_clip": 1.36050844, "balance_loss_mlp": 1.02046537, "epoch": 0.2142792725086427, "flos": 33888136343040.0, "grad_norm": 1.3707779395693789, "language_loss": 0.77472192, "learning_rate": 3.6542202519458507e-06, "loss": 0.80088699, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.25158691, "step": 3564, "time_per_iteration": 2.9383113384246826 }, { "auxiliary_loss_clip": 0.01564279, "auxiliary_loss_mlp": 0.01043722, "balance_loss_clip": 1.35457253, "balance_loss_mlp": 1.01822305, "epoch": 0.2143393957613107, "flos": 19868475035520.0, "grad_norm": 1.6287056030957199, "language_loss": 0.89388496, "learning_rate": 3.654001327581981e-06, "loss": 0.91996491, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.25500488, "step": 3565, "time_per_iteration": 2.8876523971557617 }, { "auxiliary_loss_clip": 0.01368994, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.247293, "balance_loss_mlp": 1.00660086, "epoch": 0.21439951901397866, "flos": 68559896943360.0, "grad_norm": 0.8671009271569773, "language_loss": 0.52261698, "learning_rate": 3.653782340498215e-06, "loss": 0.54662091, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.24707031, "step": 3566, "time_per_iteration": 3.303924798965454 }, { "auxiliary_loss_clip": 0.01537463, "auxiliary_loss_mlp": 0.01039977, "balance_loss_clip": 1.33453131, "balance_loss_mlp": 1.01447773, "epoch": 0.21445964226664663, "flos": 19692562677120.0, "grad_norm": 1.9501930004331203, "language_loss": 0.67926174, "learning_rate": 3.6535632907028566e-06, "loss": 0.70503616, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.25500488, "step": 3567, "time_per_iteration": 2.8587489128112793 }, { "auxiliary_loss_clip": 0.01553285, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.34777725, "balance_loss_mlp": 1.01784134, "epoch": 0.2145197655193146, "flos": 31120883228160.0, "grad_norm": 1.4523267130627369, "language_loss": 0.75046784, "learning_rate": 3.6533441782042126e-06, "loss": 0.77642649, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.24743652, "step": 3568, "time_per_iteration": 2.989978313446045 }, { "auxiliary_loss_clip": 0.01556425, "auxiliary_loss_mlp": 0.01048688, "balance_loss_clip": 1.34894931, "balance_loss_mlp": 1.02359462, "epoch": 0.21457988877198256, "flos": 20130452282880.0, "grad_norm": 1.6910397335315217, "language_loss": 0.79047072, "learning_rate": 3.6531250030105917e-06, "loss": 0.81652182, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.25073242, "step": 3569, "time_per_iteration": 2.8444364070892334 }, { "auxiliary_loss_clip": 0.01584858, "auxiliary_loss_mlp": 0.01043062, "balance_loss_clip": 1.36536837, "balance_loss_mlp": 1.01634753, "epoch": 0.21464001202465052, "flos": 18597227857920.0, "grad_norm": 2.263610051622323, "language_loss": 0.71189207, "learning_rate": 3.6529057651303053e-06, "loss": 0.73817122, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 2.19726562, "router_z_loss_mlp": 0.26745605, "step": 3570, "time_per_iteration": 2.8805501461029053 }, { "auxiliary_loss_clip": 0.01570285, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.35344148, "balance_loss_mlp": 1.02192903, "epoch": 0.21470013527731852, "flos": 21845154176640.0, "grad_norm": 2.398553477349351, "language_loss": 0.80674201, "learning_rate": 3.6526864645716666e-06, "loss": 0.83291638, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.25219727, "step": 3571, "time_per_iteration": 2.9900119304656982 }, { "auxiliary_loss_clip": 0.01579013, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.36509371, "balance_loss_mlp": 1.02140498, "epoch": 0.21476025852998648, "flos": 17612191157760.0, "grad_norm": 1.9521905860463338, "language_loss": 0.84714627, "learning_rate": 3.652467101342991e-06, "loss": 0.87340295, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25244141, "step": 3572, "time_per_iteration": 2.864039659500122 }, { "auxiliary_loss_clip": 0.01580178, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.36275697, "balance_loss_mlp": 1.02117074, "epoch": 0.21482038178265445, "flos": 24839202067200.0, "grad_norm": 2.378261763657597, "language_loss": 0.6655215, "learning_rate": 3.652247675452598e-06, "loss": 0.69178045, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.24560547, "step": 3573, "time_per_iteration": 4.325469970703125 }, { "auxiliary_loss_clip": 0.01554092, "auxiliary_loss_mlp": 0.0104586, "balance_loss_clip": 1.34754837, "balance_loss_mlp": 1.02074218, "epoch": 0.2148805050353224, "flos": 23268623420160.0, "grad_norm": 2.643786979905976, "language_loss": 0.76062584, "learning_rate": 3.652028186908807e-06, "loss": 0.78662527, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.25109863, "step": 3574, "time_per_iteration": 2.901113748550415 }, { "auxiliary_loss_clip": 0.01571102, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 1.35715091, "balance_loss_mlp": 1.02355266, "epoch": 0.21494062828799038, "flos": 21330112907520.0, "grad_norm": 2.0122282900967563, "language_loss": 0.73819768, "learning_rate": 3.6518086357199416e-06, "loss": 0.76440662, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.26208496, "step": 3575, "time_per_iteration": 2.828514575958252 }, { "auxiliary_loss_clip": 0.01560336, "auxiliary_loss_mlp": 0.01051157, "balance_loss_clip": 1.35096502, "balance_loss_mlp": 1.02624261, "epoch": 0.21500075154065834, "flos": 18852554119680.0, "grad_norm": 1.5782074065758862, "language_loss": 0.68976164, "learning_rate": 3.6515890218943277e-06, "loss": 0.71587658, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.24926758, "step": 3576, "time_per_iteration": 2.867933750152588 }, { "auxiliary_loss_clip": 0.01578948, "auxiliary_loss_mlp": 0.01051997, "balance_loss_clip": 1.36154783, "balance_loss_mlp": 1.02637887, "epoch": 0.2150608747933263, "flos": 18451113840000.0, "grad_norm": 1.927781035100966, "language_loss": 0.90569955, "learning_rate": 3.651369345440292e-06, "loss": 0.93200898, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.25610352, "step": 3577, "time_per_iteration": 2.813812017440796 }, { "auxiliary_loss_clip": 0.01356938, "auxiliary_loss_mlp": 0.01072759, "balance_loss_clip": 1.23683333, "balance_loss_mlp": 1.04777241, "epoch": 0.2151209980459943, "flos": 66628037416320.0, "grad_norm": 0.8051943400803414, "language_loss": 0.56147867, "learning_rate": 3.6511496063661654e-06, "loss": 0.58577561, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.24902344, "step": 3578, "time_per_iteration": 3.331730604171753 }, { "auxiliary_loss_clip": 0.01577319, "auxiliary_loss_mlp": 0.01049485, "balance_loss_clip": 1.36441755, "balance_loss_mlp": 1.02607203, "epoch": 0.21518112129866226, "flos": 21584805742080.0, "grad_norm": 1.61923645566343, "language_loss": 0.89354634, "learning_rate": 3.6509298046802807e-06, "loss": 0.91981447, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.23425293, "step": 3579, "time_per_iteration": 2.8296000957489014 }, { "auxiliary_loss_clip": 0.01574991, "auxiliary_loss_mlp": 0.0104757, "balance_loss_clip": 1.35909092, "balance_loss_mlp": 1.02106953, "epoch": 0.21524124455133023, "flos": 20057236917120.0, "grad_norm": 1.5972009738400594, "language_loss": 0.79194653, "learning_rate": 3.650709940390972e-06, "loss": 0.8181721, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26538086, "step": 3580, "time_per_iteration": 2.8478307723999023 }, { "auxiliary_loss_clip": 0.01558759, "auxiliary_loss_mlp": 0.01050606, "balance_loss_clip": 1.34753847, "balance_loss_mlp": 1.02579808, "epoch": 0.2153013678039982, "flos": 23962472714880.0, "grad_norm": 1.6999756918307452, "language_loss": 0.74477464, "learning_rate": 3.6504900135065775e-06, "loss": 0.77086824, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24780273, "step": 3581, "time_per_iteration": 4.280422925949097 }, { "auxiliary_loss_clip": 0.01562229, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.35068679, "balance_loss_mlp": 1.02005708, "epoch": 0.21536149105666616, "flos": 20604564990720.0, "grad_norm": 2.5211134782884175, "language_loss": 0.73372829, "learning_rate": 3.6502700240354357e-06, "loss": 0.75981718, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.26623535, "step": 3582, "time_per_iteration": 5.679165601730347 }, { "auxiliary_loss_clip": 0.0156574, "auxiliary_loss_mlp": 0.01047945, "balance_loss_clip": 1.35311937, "balance_loss_mlp": 1.02130198, "epoch": 0.21542161430933413, "flos": 12867806453760.0, "grad_norm": 2.918641567440283, "language_loss": 0.85631639, "learning_rate": 3.650049971985889e-06, "loss": 0.8824532, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.26696777, "step": 3583, "time_per_iteration": 2.934835195541382 }, { "auxiliary_loss_clip": 0.01571211, "auxiliary_loss_mlp": 0.01049923, "balance_loss_clip": 1.35461533, "balance_loss_mlp": 1.024984, "epoch": 0.21548173756200212, "flos": 26115018969600.0, "grad_norm": 2.6796054145273347, "language_loss": 0.84107769, "learning_rate": 3.6498298573662824e-06, "loss": 0.86728907, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24963379, "step": 3584, "time_per_iteration": 2.876629114151001 }, { "auxiliary_loss_clip": 0.01567144, "auxiliary_loss_mlp": 0.01050702, "balance_loss_clip": 1.35678363, "balance_loss_mlp": 1.02521467, "epoch": 0.21554186081467008, "flos": 22173922028160.0, "grad_norm": 1.7771126255102319, "language_loss": 0.91799068, "learning_rate": 3.6496096801849625e-06, "loss": 0.94416916, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.25524902, "step": 3585, "time_per_iteration": 2.942531108856201 }, { "auxiliary_loss_clip": 0.01556402, "auxiliary_loss_mlp": 0.01045142, "balance_loss_clip": 1.34506893, "balance_loss_mlp": 1.02048957, "epoch": 0.21560198406733805, "flos": 22977164545920.0, "grad_norm": 1.7406462751215042, "language_loss": 0.75575858, "learning_rate": 3.649389440450277e-06, "loss": 0.78177404, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.24658203, "step": 3586, "time_per_iteration": 2.912104606628418 }, { "auxiliary_loss_clip": 0.01576726, "auxiliary_loss_mlp": 0.01053799, "balance_loss_clip": 1.36188221, "balance_loss_mlp": 1.02681041, "epoch": 0.215662107320006, "flos": 22794374977920.0, "grad_norm": 1.6189818566684133, "language_loss": 0.83512032, "learning_rate": 3.6491691381705804e-06, "loss": 0.86142558, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26977539, "step": 3587, "time_per_iteration": 2.9209725856781006 }, { "auxiliary_loss_clip": 0.01562479, "auxiliary_loss_mlp": 0.01047027, "balance_loss_clip": 1.34976196, "balance_loss_mlp": 1.02149212, "epoch": 0.21572223057267398, "flos": 30896260202880.0, "grad_norm": 2.9519668658711704, "language_loss": 0.77104509, "learning_rate": 3.648948773354224e-06, "loss": 0.79714024, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25524902, "step": 3588, "time_per_iteration": 2.916823387145996 }, { "auxiliary_loss_clip": 0.01570061, "auxiliary_loss_mlp": 0.01045348, "balance_loss_clip": 1.35588777, "balance_loss_mlp": 1.01962328, "epoch": 0.21578235382534194, "flos": 26922785967360.0, "grad_norm": 1.7227345585824707, "language_loss": 0.81842542, "learning_rate": 3.6487283460095643e-06, "loss": 0.84457946, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 2.13964844, "router_z_loss_mlp": 0.25732422, "step": 3589, "time_per_iteration": 2.910130023956299 }, { "auxiliary_loss_clip": 0.01568073, "auxiliary_loss_mlp": 0.01042674, "balance_loss_clip": 1.3561877, "balance_loss_mlp": 1.0180577, "epoch": 0.2158424770780099, "flos": 24436404443520.0, "grad_norm": 2.296552286858966, "language_loss": 0.73864567, "learning_rate": 3.648507856144961e-06, "loss": 0.76475316, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.24633789, "step": 3590, "time_per_iteration": 2.9115357398986816 }, { "auxiliary_loss_clip": 0.01564487, "auxiliary_loss_mlp": 0.01048482, "balance_loss_clip": 1.3495059, "balance_loss_mlp": 1.02181458, "epoch": 0.2159026003306779, "flos": 23960165230080.0, "grad_norm": 1.7384292984470875, "language_loss": 0.85507435, "learning_rate": 3.648287303768775e-06, "loss": 0.88120401, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26660156, "step": 3591, "time_per_iteration": 2.8779680728912354 }, { "auxiliary_loss_clip": 0.01572831, "auxiliary_loss_mlp": 0.01054148, "balance_loss_clip": 1.35691798, "balance_loss_mlp": 1.02477455, "epoch": 0.21596272358334587, "flos": 30052179613440.0, "grad_norm": 2.294141451608414, "language_loss": 0.69446993, "learning_rate": 3.6480666888893686e-06, "loss": 0.72073972, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.29394531, "step": 3592, "time_per_iteration": 3.005282402038574 }, { "auxiliary_loss_clip": 0.0157025, "auxiliary_loss_mlp": 0.01051779, "balance_loss_clip": 1.35537124, "balance_loss_mlp": 1.0246824, "epoch": 0.21602284683601383, "flos": 20385959523840.0, "grad_norm": 2.4961006470982134, "language_loss": 0.85901427, "learning_rate": 3.647846011515108e-06, "loss": 0.88523459, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.27124023, "step": 3593, "time_per_iteration": 2.812030553817749 }, { "auxiliary_loss_clip": 0.01564725, "auxiliary_loss_mlp": 0.01045844, "balance_loss_clip": 1.35045171, "balance_loss_mlp": 1.01850939, "epoch": 0.2160829700886818, "flos": 20787128334720.0, "grad_norm": 2.3192501397241148, "language_loss": 0.7676338, "learning_rate": 3.6476252716543625e-06, "loss": 0.7937395, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 2.14355469, "router_z_loss_mlp": 0.27319336, "step": 3594, "time_per_iteration": 2.858562469482422 }, { "auxiliary_loss_clip": 0.01541905, "auxiliary_loss_mlp": 0.01052174, "balance_loss_clip": 1.33315253, "balance_loss_mlp": 1.02381468, "epoch": 0.21614309334134976, "flos": 22319945556480.0, "grad_norm": 1.4681669366861925, "language_loss": 0.80517483, "learning_rate": 3.6474044693155007e-06, "loss": 0.8311156, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.28356934, "step": 3595, "time_per_iteration": 2.876054286956787 }, { "auxiliary_loss_clip": 0.01560411, "auxiliary_loss_mlp": 0.01044241, "balance_loss_clip": 1.34365726, "balance_loss_mlp": 1.01611984, "epoch": 0.21620321659401773, "flos": 19618940108160.0, "grad_norm": 1.7686373818494592, "language_loss": 0.79415727, "learning_rate": 3.647183604506897e-06, "loss": 0.82020372, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 2.16796875, "router_z_loss_mlp": 0.28137207, "step": 3596, "time_per_iteration": 2.8432259559631348 }, { "auxiliary_loss_clip": 0.01542106, "auxiliary_loss_mlp": 0.01048202, "balance_loss_clip": 1.33244753, "balance_loss_mlp": 1.02040279, "epoch": 0.2162633398466857, "flos": 18854092442880.0, "grad_norm": 1.7124125220120825, "language_loss": 0.84406114, "learning_rate": 3.6469626772369253e-06, "loss": 0.86996424, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.27783203, "step": 3597, "time_per_iteration": 2.949237108230591 }, { "auxiliary_loss_clip": 0.01551093, "auxiliary_loss_mlp": 0.01045736, "balance_loss_clip": 1.33647919, "balance_loss_mlp": 1.01842523, "epoch": 0.21632346309935369, "flos": 18777619451520.0, "grad_norm": 1.9652168595141264, "language_loss": 0.8142364, "learning_rate": 3.6467416875139642e-06, "loss": 0.84020472, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.27307129, "step": 3598, "time_per_iteration": 2.8100430965423584 }, { "auxiliary_loss_clip": 0.01555826, "auxiliary_loss_mlp": 0.01055847, "balance_loss_clip": 1.33930945, "balance_loss_mlp": 1.02586579, "epoch": 0.21638358635202165, "flos": 26335660452480.0, "grad_norm": 1.850066971157753, "language_loss": 0.82858479, "learning_rate": 3.6465206353463934e-06, "loss": 0.85470152, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.29968262, "step": 3599, "time_per_iteration": 2.920530319213867 }, { "auxiliary_loss_clip": 0.01554111, "auxiliary_loss_mlp": 0.01052029, "balance_loss_clip": 1.34192872, "balance_loss_mlp": 1.0248853, "epoch": 0.21644370960468962, "flos": 20750633763840.0, "grad_norm": 2.1092629461700745, "language_loss": 0.77308524, "learning_rate": 3.6462995207425947e-06, "loss": 0.79914665, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.27172852, "step": 3600, "time_per_iteration": 2.810134172439575 }, { "auxiliary_loss_clip": 0.01542805, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.3324182, "balance_loss_mlp": 1.01886368, "epoch": 0.21650383285735758, "flos": 23963015652480.0, "grad_norm": 3.5468157250702594, "language_loss": 0.81333619, "learning_rate": 3.6460783437109533e-06, "loss": 0.83922029, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.26745605, "step": 3601, "time_per_iteration": 2.883336305618286 }, { "auxiliary_loss_clip": 0.01549092, "auxiliary_loss_mlp": 0.01053414, "balance_loss_clip": 1.33372653, "balance_loss_mlp": 1.0253756, "epoch": 0.21656395611002555, "flos": 23706603515520.0, "grad_norm": 2.490618195569459, "language_loss": 0.84998697, "learning_rate": 3.6458571042598565e-06, "loss": 0.87601203, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.28076172, "step": 3602, "time_per_iteration": 2.842400074005127 }, { "auxiliary_loss_clip": 0.015648, "auxiliary_loss_mlp": 0.01051603, "balance_loss_clip": 1.34993076, "balance_loss_mlp": 1.02103758, "epoch": 0.2166240793626935, "flos": 20675337137280.0, "grad_norm": 1.8413025666392315, "language_loss": 0.76029509, "learning_rate": 3.645635802397693e-06, "loss": 0.78645909, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 2.14941406, "router_z_loss_mlp": 0.30566406, "step": 3603, "time_per_iteration": 2.8345208168029785 }, { "auxiliary_loss_clip": 0.01544753, "auxiliary_loss_mlp": 0.0105387, "balance_loss_clip": 1.33798039, "balance_loss_mlp": 1.02490258, "epoch": 0.2166842026153615, "flos": 21590506586880.0, "grad_norm": 1.918004274854854, "language_loss": 0.75438344, "learning_rate": 3.645414438132855e-06, "loss": 0.7803697, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.29003906, "step": 3604, "time_per_iteration": 2.881065845489502 }, { "auxiliary_loss_clip": 0.01538934, "auxiliary_loss_mlp": 0.01049836, "balance_loss_clip": 1.33085799, "balance_loss_mlp": 1.02153563, "epoch": 0.21674432586802947, "flos": 25640996751360.0, "grad_norm": 1.6446844282423814, "language_loss": 0.80805838, "learning_rate": 3.6451930114737366e-06, "loss": 0.83394611, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.28320312, "step": 3605, "time_per_iteration": 2.868868112564087 }, { "auxiliary_loss_clip": 0.01347993, "auxiliary_loss_mlp": 0.01241698, "balance_loss_clip": 1.22505736, "balance_loss_mlp": 1.20431352, "epoch": 0.21680444912069743, "flos": 56444694796800.0, "grad_norm": 0.7745646414400693, "language_loss": 0.58451337, "learning_rate": 3.6449715224287347e-06, "loss": 0.61041027, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.37304688, "step": 3606, "time_per_iteration": 3.4698266983032227 }, { "auxiliary_loss_clip": 0.01560145, "auxiliary_loss_mlp": 0.01055158, "balance_loss_clip": 1.34326971, "balance_loss_mlp": 1.02597594, "epoch": 0.2168645723733654, "flos": 23889619307520.0, "grad_norm": 1.9694989565320489, "language_loss": 0.742378, "learning_rate": 3.644749971006248e-06, "loss": 0.76853102, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.29199219, "step": 3607, "time_per_iteration": 4.312813758850098 }, { "auxiliary_loss_clip": 0.01557829, "auxiliary_loss_mlp": 0.01062482, "balance_loss_clip": 1.34220386, "balance_loss_mlp": 1.0330615, "epoch": 0.21692469562603336, "flos": 16954926923520.0, "grad_norm": 1.9709374938909772, "language_loss": 0.78174067, "learning_rate": 3.6445283572146765e-06, "loss": 0.80794382, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.29394531, "step": 3608, "time_per_iteration": 2.8579771518707275 }, { "auxiliary_loss_clip": 0.0156179, "auxiliary_loss_mlp": 0.01064434, "balance_loss_clip": 1.34636974, "balance_loss_mlp": 1.03680146, "epoch": 0.21698481887870133, "flos": 25129937024640.0, "grad_norm": 2.0227403527813856, "language_loss": 0.75246203, "learning_rate": 3.6443066810624255e-06, "loss": 0.77872425, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27685547, "step": 3609, "time_per_iteration": 2.8933165073394775 }, { "auxiliary_loss_clip": 0.01553657, "auxiliary_loss_mlp": 0.01079384, "balance_loss_clip": 1.34022963, "balance_loss_mlp": 1.04886651, "epoch": 0.2170449421313693, "flos": 17903650032000.0, "grad_norm": 2.0838356384336363, "language_loss": 0.89748889, "learning_rate": 3.6440849425579e-06, "loss": 0.9238193, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.30517578, "step": 3610, "time_per_iteration": 2.891892910003662 }, { "auxiliary_loss_clip": 0.01548629, "auxiliary_loss_mlp": 0.0108281, "balance_loss_clip": 1.33856845, "balance_loss_mlp": 1.05379438, "epoch": 0.2171050653840373, "flos": 22648803897600.0, "grad_norm": 1.6808863240937366, "language_loss": 0.7883575, "learning_rate": 3.6438631417095095e-06, "loss": 0.81467187, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2902832, "step": 3611, "time_per_iteration": 2.909013271331787 }, { "auxiliary_loss_clip": 0.01546404, "auxiliary_loss_mlp": 0.01098888, "balance_loss_clip": 1.3364383, "balance_loss_mlp": 1.07139826, "epoch": 0.21716518863670525, "flos": 19509365905920.0, "grad_norm": 3.0529351993419005, "language_loss": 0.64275777, "learning_rate": 3.6436412785256637e-06, "loss": 0.66921067, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.27478027, "step": 3612, "time_per_iteration": 2.8417248725891113 }, { "auxiliary_loss_clip": 0.01554507, "auxiliary_loss_mlp": 0.01094298, "balance_loss_clip": 1.34069204, "balance_loss_mlp": 1.06674862, "epoch": 0.21722531188937322, "flos": 19801503452160.0, "grad_norm": 1.7351041988385476, "language_loss": 0.7726289, "learning_rate": 3.643419353014776e-06, "loss": 0.79911703, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.27514648, "step": 3613, "time_per_iteration": 2.8021726608276367 }, { "auxiliary_loss_clip": 0.01551739, "auxiliary_loss_mlp": 0.01101072, "balance_loss_clip": 1.34172082, "balance_loss_mlp": 1.07374895, "epoch": 0.21728543514204118, "flos": 13342281120000.0, "grad_norm": 1.9598516872721383, "language_loss": 0.72838777, "learning_rate": 3.643197365185261e-06, "loss": 0.75491589, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.27331543, "step": 3614, "time_per_iteration": 2.9022440910339355 }, { "auxiliary_loss_clip": 0.01549668, "auxiliary_loss_mlp": 0.01105539, "balance_loss_clip": 1.33733869, "balance_loss_mlp": 1.07773948, "epoch": 0.21734555839470915, "flos": 15240315519360.0, "grad_norm": 1.5754842326616403, "language_loss": 0.73954558, "learning_rate": 3.6429753150455378e-06, "loss": 0.76609766, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.27807617, "step": 3615, "time_per_iteration": 2.807662010192871 }, { "auxiliary_loss_clip": 0.01581337, "auxiliary_loss_mlp": 0.01089343, "balance_loss_clip": 1.36232305, "balance_loss_mlp": 1.06228209, "epoch": 0.2174056816473771, "flos": 19983388124160.0, "grad_norm": 2.251969703188635, "language_loss": 0.9106493, "learning_rate": 3.6427532026040263e-06, "loss": 0.93735611, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27087402, "step": 3616, "time_per_iteration": 4.206519365310669 }, { "auxiliary_loss_clip": 0.01576884, "auxiliary_loss_mlp": 0.01085695, "balance_loss_clip": 1.36186445, "balance_loss_mlp": 1.05849111, "epoch": 0.21746580490004508, "flos": 16695076181760.0, "grad_norm": 1.9325679661898285, "language_loss": 0.82048261, "learning_rate": 3.642531027869148e-06, "loss": 0.84710848, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.27209473, "step": 3617, "time_per_iteration": 4.319268703460693 }, { "auxiliary_loss_clip": 0.01571902, "auxiliary_loss_mlp": 0.01086563, "balance_loss_clip": 1.35742414, "balance_loss_mlp": 1.06011057, "epoch": 0.21752592815271307, "flos": 25782450554880.0, "grad_norm": 1.7090524231480697, "language_loss": 0.76580828, "learning_rate": 3.642308790849329e-06, "loss": 0.79239297, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.26489258, "step": 3618, "time_per_iteration": 4.363492727279663 }, { "auxiliary_loss_clip": 0.01568508, "auxiliary_loss_mlp": 0.01082781, "balance_loss_clip": 1.35338068, "balance_loss_mlp": 1.05581605, "epoch": 0.21758605140538104, "flos": 11262045335040.0, "grad_norm": 2.032160888929606, "language_loss": 0.69966739, "learning_rate": 3.642086491552996e-06, "loss": 0.72618032, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26977539, "step": 3619, "time_per_iteration": 2.7954177856445312 }, { "auxiliary_loss_clip": 0.01570055, "auxiliary_loss_mlp": 0.01078063, "balance_loss_clip": 1.35500097, "balance_loss_mlp": 1.05139542, "epoch": 0.217646174658049, "flos": 19251415445760.0, "grad_norm": 1.5822913790636493, "language_loss": 0.78680336, "learning_rate": 3.641864129988579e-06, "loss": 0.81328452, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2668457, "step": 3620, "time_per_iteration": 2.852590560913086 }, { "auxiliary_loss_clip": 0.01554469, "auxiliary_loss_mlp": 0.01061546, "balance_loss_clip": 1.34600282, "balance_loss_mlp": 1.03515327, "epoch": 0.21770629791071697, "flos": 21955226071680.0, "grad_norm": 1.4695835619853503, "language_loss": 0.80876571, "learning_rate": 3.641641706164509e-06, "loss": 0.83492589, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.26391602, "step": 3621, "time_per_iteration": 2.9717319011688232 }, { "auxiliary_loss_clip": 0.01560484, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.34750366, "balance_loss_mlp": 1.02687073, "epoch": 0.21776642116338493, "flos": 24947645149440.0, "grad_norm": 1.654074014635563, "language_loss": 0.88439906, "learning_rate": 3.641419220089221e-06, "loss": 0.91054988, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 2.12988281, "router_z_loss_mlp": 0.27746582, "step": 3622, "time_per_iteration": 2.8861684799194336 }, { "auxiliary_loss_clip": 0.01586252, "auxiliary_loss_mlp": 0.01054647, "balance_loss_clip": 1.36733961, "balance_loss_mlp": 1.02753866, "epoch": 0.2178265444160529, "flos": 17830163197440.0, "grad_norm": 1.8246283818750202, "language_loss": 0.78141761, "learning_rate": 3.641196671771152e-06, "loss": 0.80782658, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27075195, "step": 3623, "time_per_iteration": 2.8534955978393555 }, { "auxiliary_loss_clip": 0.01573518, "auxiliary_loss_mlp": 0.01057151, "balance_loss_clip": 1.35740018, "balance_loss_mlp": 1.02871919, "epoch": 0.2178866676687209, "flos": 17721991584000.0, "grad_norm": 1.8972716539453516, "language_loss": 0.85494578, "learning_rate": 3.640974061218741e-06, "loss": 0.88125247, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.28405762, "step": 3624, "time_per_iteration": 2.8529136180877686 }, { "auxiliary_loss_clip": 0.01567822, "auxiliary_loss_mlp": 0.01057711, "balance_loss_clip": 1.35203028, "balance_loss_mlp": 1.02893424, "epoch": 0.21794679092138886, "flos": 16954519720320.0, "grad_norm": 2.2119978052108022, "language_loss": 0.79330331, "learning_rate": 3.640751388440429e-06, "loss": 0.81955862, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.2878418, "step": 3625, "time_per_iteration": 2.8427653312683105 }, { "auxiliary_loss_clip": 0.01344556, "auxiliary_loss_mlp": 0.01029507, "balance_loss_clip": 1.21580672, "balance_loss_mlp": 0.99956173, "epoch": 0.21800691417405682, "flos": 63748314432000.0, "grad_norm": 0.8196046102317003, "language_loss": 0.6073935, "learning_rate": 3.64052865344466e-06, "loss": 0.63113415, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 1.2890625, "router_z_loss_mlp": 0.29882812, "step": 3626, "time_per_iteration": 3.525327682495117 }, { "auxiliary_loss_clip": 0.01561687, "auxiliary_loss_mlp": 0.01058128, "balance_loss_clip": 1.34565854, "balance_loss_mlp": 1.02775371, "epoch": 0.21806703742672479, "flos": 21626367730560.0, "grad_norm": 2.439441746122754, "language_loss": 0.91610467, "learning_rate": 3.6403058562398795e-06, "loss": 0.94230282, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.30371094, "step": 3627, "time_per_iteration": 2.869502067565918 }, { "auxiliary_loss_clip": 0.01560913, "auxiliary_loss_mlp": 0.01046255, "balance_loss_clip": 1.3495152, "balance_loss_mlp": 1.01733494, "epoch": 0.21812716067939275, "flos": 19364428252800.0, "grad_norm": 1.567586291762664, "language_loss": 0.74360406, "learning_rate": 3.6400829968345365e-06, "loss": 0.76967579, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.28930664, "step": 3628, "time_per_iteration": 2.878415584564209 }, { "auxiliary_loss_clip": 0.01543001, "auxiliary_loss_mlp": 0.01048683, "balance_loss_clip": 1.33244967, "balance_loss_mlp": 1.01890445, "epoch": 0.21818728393206072, "flos": 23558001033600.0, "grad_norm": 1.9045103106186956, "language_loss": 0.78829753, "learning_rate": 3.6398600752370826e-06, "loss": 0.81421435, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.29785156, "step": 3629, "time_per_iteration": 2.8743019104003906 }, { "auxiliary_loss_clip": 0.01550203, "auxiliary_loss_mlp": 0.01053615, "balance_loss_clip": 1.34075618, "balance_loss_mlp": 1.0257082, "epoch": 0.21824740718472868, "flos": 30237502890240.0, "grad_norm": 1.5831673489698208, "language_loss": 0.72456503, "learning_rate": 3.63963709145597e-06, "loss": 0.7506032, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.27893066, "step": 3630, "time_per_iteration": 2.971954822540283 }, { "auxiliary_loss_clip": 0.01534599, "auxiliary_loss_mlp": 0.01048106, "balance_loss_clip": 1.33023024, "balance_loss_mlp": 1.02110541, "epoch": 0.21830753043739667, "flos": 26144274372480.0, "grad_norm": 1.6850470883380413, "language_loss": 0.77442551, "learning_rate": 3.6394140454996544e-06, "loss": 0.8002525, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.27001953, "step": 3631, "time_per_iteration": 2.911485195159912 }, { "auxiliary_loss_clip": 0.01567206, "auxiliary_loss_mlp": 0.01046896, "balance_loss_clip": 1.35222077, "balance_loss_mlp": 1.01933479, "epoch": 0.21836765369006464, "flos": 21729381436800.0, "grad_norm": 2.875348684553046, "language_loss": 0.76117909, "learning_rate": 3.639190937376594e-06, "loss": 0.78732014, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.27587891, "step": 3632, "time_per_iteration": 2.832697629928589 }, { "auxiliary_loss_clip": 0.01535879, "auxiliary_loss_mlp": 0.01048474, "balance_loss_clip": 1.32951164, "balance_loss_mlp": 1.02198565, "epoch": 0.2184277769427326, "flos": 19947029287680.0, "grad_norm": 8.445167202456247, "language_loss": 0.85058415, "learning_rate": 3.638967767095249e-06, "loss": 0.87642765, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.26513672, "step": 3633, "time_per_iteration": 2.858062982559204 }, { "auxiliary_loss_clip": 0.01553551, "auxiliary_loss_mlp": 0.01053779, "balance_loss_clip": 1.34263575, "balance_loss_mlp": 1.02547848, "epoch": 0.21848790019540057, "flos": 20350098380160.0, "grad_norm": 2.0142021819604117, "language_loss": 0.82031989, "learning_rate": 3.6387445346640823e-06, "loss": 0.84639323, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.28344727, "step": 3634, "time_per_iteration": 2.857800006866455 }, { "auxiliary_loss_clip": 0.01557966, "auxiliary_loss_mlp": 0.01047886, "balance_loss_clip": 1.34459472, "balance_loss_mlp": 1.0199194, "epoch": 0.21854802344806853, "flos": 15459328189440.0, "grad_norm": 1.7712082112106888, "language_loss": 0.76205093, "learning_rate": 3.638521240091558e-06, "loss": 0.78810942, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.27941895, "step": 3635, "time_per_iteration": 2.8408913612365723 }, { "auxiliary_loss_clip": 0.01534712, "auxiliary_loss_mlp": 0.01055022, "balance_loss_clip": 1.32823944, "balance_loss_mlp": 1.02715123, "epoch": 0.2186081467007365, "flos": 16327958722560.0, "grad_norm": 3.144301965428661, "language_loss": 0.89419532, "learning_rate": 3.6382978833861445e-06, "loss": 0.92009264, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.27905273, "step": 3636, "time_per_iteration": 2.794177770614624 }, { "auxiliary_loss_clip": 0.01554667, "auxiliary_loss_mlp": 0.01051753, "balance_loss_clip": 1.34286618, "balance_loss_mlp": 1.02354836, "epoch": 0.2186682699534045, "flos": 21699221137920.0, "grad_norm": 2.2392941826777686, "language_loss": 0.76917398, "learning_rate": 3.638074464556311e-06, "loss": 0.7952382, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.28198242, "step": 3637, "time_per_iteration": 2.8681886196136475 }, { "auxiliary_loss_clip": 0.01574191, "auxiliary_loss_mlp": 0.01047752, "balance_loss_clip": 1.35809314, "balance_loss_mlp": 1.01933229, "epoch": 0.21872839320607246, "flos": 17745727121280.0, "grad_norm": 2.56757592520503, "language_loss": 0.91048813, "learning_rate": 3.63785098361053e-06, "loss": 0.9367075, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.28442383, "step": 3638, "time_per_iteration": 2.7905232906341553 }, { "auxiliary_loss_clip": 0.01558089, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.34548569, "balance_loss_mlp": 1.0253675, "epoch": 0.21878851645874042, "flos": 18659222513280.0, "grad_norm": 4.039486675225097, "language_loss": 0.90264744, "learning_rate": 3.637627440557275e-06, "loss": 0.92876899, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.28723145, "step": 3639, "time_per_iteration": 2.7648720741271973 }, { "auxiliary_loss_clip": 0.01555462, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.3452878, "balance_loss_mlp": 1.01486623, "epoch": 0.2188486397114084, "flos": 25568414812800.0, "grad_norm": 1.6636724489399652, "language_loss": 0.80248511, "learning_rate": 3.637403835405024e-06, "loss": 0.82845807, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.26977539, "step": 3640, "time_per_iteration": 2.8911850452423096 }, { "auxiliary_loss_clip": 0.01566267, "auxiliary_loss_mlp": 0.01054612, "balance_loss_clip": 1.35282707, "balance_loss_mlp": 1.0251193, "epoch": 0.21890876296407635, "flos": 17900347161600.0, "grad_norm": 1.9284827349117362, "language_loss": 0.73597336, "learning_rate": 3.637180168162255e-06, "loss": 0.76218218, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.29516602, "step": 3641, "time_per_iteration": 2.816746950149536 }, { "auxiliary_loss_clip": 0.01569276, "auxiliary_loss_mlp": 0.01052224, "balance_loss_clip": 1.35795927, "balance_loss_mlp": 1.02425742, "epoch": 0.21896888621674432, "flos": 17758124196480.0, "grad_norm": 2.166661236949752, "language_loss": 0.82277179, "learning_rate": 3.63695643883745e-06, "loss": 0.84898674, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.27978516, "step": 3642, "time_per_iteration": 4.281655550003052 }, { "auxiliary_loss_clip": 0.01572777, "auxiliary_loss_mlp": 0.01049048, "balance_loss_clip": 1.35734725, "balance_loss_mlp": 1.01978195, "epoch": 0.21902900946941228, "flos": 23086603013760.0, "grad_norm": 1.8649919917963906, "language_loss": 0.72853112, "learning_rate": 3.6367326474390928e-06, "loss": 0.75474942, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.29284668, "step": 3643, "time_per_iteration": 2.873709201812744 }, { "auxiliary_loss_clip": 0.0156781, "auxiliary_loss_mlp": 0.01048579, "balance_loss_clip": 1.35548282, "balance_loss_mlp": 1.02135158, "epoch": 0.21908913272208028, "flos": 48195546451200.0, "grad_norm": 1.8979379737637807, "language_loss": 0.68783379, "learning_rate": 3.6365087939756696e-06, "loss": 0.71399772, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.27233887, "step": 3644, "time_per_iteration": 3.078646659851074 }, { "auxiliary_loss_clip": 0.01590829, "auxiliary_loss_mlp": 0.01051936, "balance_loss_clip": 1.37001014, "balance_loss_mlp": 1.02363539, "epoch": 0.21914925597474824, "flos": 22246911169920.0, "grad_norm": 2.6831741740501145, "language_loss": 0.79479009, "learning_rate": 3.636284878455669e-06, "loss": 0.82121772, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 2.20703125, "router_z_loss_mlp": 0.28295898, "step": 3645, "time_per_iteration": 2.8720359802246094 }, { "auxiliary_loss_clip": 0.01557569, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.34948862, "balance_loss_mlp": 1.02481925, "epoch": 0.2192093792274162, "flos": 22135300951680.0, "grad_norm": 1.6963448253196738, "language_loss": 0.83317339, "learning_rate": 3.636060900887582e-06, "loss": 0.8592757, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.27880859, "step": 3646, "time_per_iteration": 2.8609414100646973 }, { "auxiliary_loss_clip": 0.01561116, "auxiliary_loss_mlp": 0.01048159, "balance_loss_clip": 1.35315347, "balance_loss_mlp": 1.02008533, "epoch": 0.21926950248008417, "flos": 15677933656320.0, "grad_norm": 1.8496548428486472, "language_loss": 0.84131402, "learning_rate": 3.635836861279901e-06, "loss": 0.86740673, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.28112793, "step": 3647, "time_per_iteration": 2.8129632472991943 }, { "auxiliary_loss_clip": 0.01557128, "auxiliary_loss_mlp": 0.01054944, "balance_loss_clip": 1.34731174, "balance_loss_mlp": 1.02704859, "epoch": 0.21932962573275214, "flos": 30274449909120.0, "grad_norm": 1.728672460955334, "language_loss": 0.73199528, "learning_rate": 3.635612759641123e-06, "loss": 0.75811601, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.27929688, "step": 3648, "time_per_iteration": 2.9695637226104736 }, { "auxiliary_loss_clip": 0.0157683, "auxiliary_loss_mlp": 0.01054595, "balance_loss_clip": 1.3591783, "balance_loss_mlp": 1.0272963, "epoch": 0.2193897489854201, "flos": 10787299200000.0, "grad_norm": 2.279313432773206, "language_loss": 0.74590254, "learning_rate": 3.635388595979745e-06, "loss": 0.7722168, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27319336, "step": 3649, "time_per_iteration": 2.8115382194519043 }, { "auxiliary_loss_clip": 0.0155767, "auxiliary_loss_mlp": 0.010464, "balance_loss_clip": 1.34889829, "balance_loss_mlp": 1.01960182, "epoch": 0.21944987223808807, "flos": 19142112712320.0, "grad_norm": 1.8622976473207737, "language_loss": 0.87135887, "learning_rate": 3.635164370304267e-06, "loss": 0.8973996, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.26806641, "step": 3650, "time_per_iteration": 2.8774044513702393 }, { "auxiliary_loss_clip": 0.01569695, "auxiliary_loss_mlp": 0.01047039, "balance_loss_clip": 1.35647726, "balance_loss_mlp": 1.01957369, "epoch": 0.21950999549075606, "flos": 22721385836160.0, "grad_norm": 2.09268474080192, "language_loss": 0.84587395, "learning_rate": 3.6349400826231927e-06, "loss": 0.87204129, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.27478027, "step": 3651, "time_per_iteration": 4.382604122161865 }, { "auxiliary_loss_clip": 0.01575232, "auxiliary_loss_mlp": 0.0104918, "balance_loss_clip": 1.3614924, "balance_loss_mlp": 1.02177382, "epoch": 0.21957011874342403, "flos": 10568467509120.0, "grad_norm": 2.158004919971209, "language_loss": 0.75310826, "learning_rate": 3.634715732945027e-06, "loss": 0.77935237, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.27416992, "step": 3652, "time_per_iteration": 4.16953706741333 }, { "auxiliary_loss_clip": 0.01363819, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.22806644, "balance_loss_mlp": 1.00178778, "epoch": 0.219630241996092, "flos": 65774591153280.0, "grad_norm": 0.7441403570779467, "language_loss": 0.51660192, "learning_rate": 3.6344913212782764e-06, "loss": 0.54060513, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 1.359375, "router_z_loss_mlp": 0.34765625, "step": 3653, "time_per_iteration": 4.77081561088562 }, { "auxiliary_loss_clip": 0.01599535, "auxiliary_loss_mlp": 0.01049809, "balance_loss_clip": 1.38431334, "balance_loss_mlp": 1.02268863, "epoch": 0.21969036524875996, "flos": 23707146453120.0, "grad_norm": 2.1890048283841548, "language_loss": 0.76188982, "learning_rate": 3.6342668476314514e-06, "loss": 0.78838325, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.27124023, "step": 3654, "time_per_iteration": 2.857222557067871 }, { "auxiliary_loss_clip": 0.01592434, "auxiliary_loss_mlp": 0.01054607, "balance_loss_clip": 1.37447405, "balance_loss_mlp": 1.02646172, "epoch": 0.21975048850142792, "flos": 19649009917440.0, "grad_norm": 2.0203538279675284, "language_loss": 0.73486674, "learning_rate": 3.634042312013064e-06, "loss": 0.76133716, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 2.1796875, "router_z_loss_mlp": 0.28137207, "step": 3655, "time_per_iteration": 2.878793954849243 }, { "auxiliary_loss_clip": 0.01561552, "auxiliary_loss_mlp": 0.01046115, "balance_loss_clip": 1.35129702, "balance_loss_mlp": 1.01937628, "epoch": 0.21981061175409589, "flos": 22456920124800.0, "grad_norm": 1.6227779920421104, "language_loss": 0.8116287, "learning_rate": 3.6338177144316276e-06, "loss": 0.83770537, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.26757812, "step": 3656, "time_per_iteration": 3.042649745941162 }, { "auxiliary_loss_clip": 0.01584677, "auxiliary_loss_mlp": 0.0104784, "balance_loss_clip": 1.37143278, "balance_loss_mlp": 1.02128053, "epoch": 0.21987073500676388, "flos": 18160333637760.0, "grad_norm": 1.9518946221750473, "language_loss": 0.86256379, "learning_rate": 3.63359305489566e-06, "loss": 0.88888896, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.26574707, "step": 3657, "time_per_iteration": 2.9165942668914795 }, { "auxiliary_loss_clip": 0.01579089, "auxiliary_loss_mlp": 0.01047062, "balance_loss_clip": 1.3630259, "balance_loss_mlp": 1.02028775, "epoch": 0.21993085825943184, "flos": 25636246047360.0, "grad_norm": 1.5225178479224017, "language_loss": 0.81132483, "learning_rate": 3.6333683334136803e-06, "loss": 0.83758634, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26745605, "step": 3658, "time_per_iteration": 2.8570139408111572 }, { "auxiliary_loss_clip": 0.01338338, "auxiliary_loss_mlp": 0.0104454, "balance_loss_clip": 1.20915735, "balance_loss_mlp": 1.01268721, "epoch": 0.2199909815120998, "flos": 70959236175360.0, "grad_norm": 0.7863018379968482, "language_loss": 0.58287382, "learning_rate": 3.6331435499942095e-06, "loss": 0.60670257, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.31835938, "step": 3659, "time_per_iteration": 3.439894676208496 }, { "auxiliary_loss_clip": 0.01564439, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.35254359, "balance_loss_mlp": 1.02305555, "epoch": 0.22005110476476777, "flos": 21553514323200.0, "grad_norm": 2.3620239272260974, "language_loss": 0.75152677, "learning_rate": 3.632918704645772e-06, "loss": 0.7776674, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.26574707, "step": 3660, "time_per_iteration": 2.8405041694641113 }, { "auxiliary_loss_clip": 0.01574071, "auxiliary_loss_mlp": 0.01044586, "balance_loss_clip": 1.35928833, "balance_loss_mlp": 1.01769257, "epoch": 0.22011122801743574, "flos": 22064664539520.0, "grad_norm": 1.4876682536621197, "language_loss": 0.81491292, "learning_rate": 3.632693797376893e-06, "loss": 0.84109944, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.2689209, "step": 3661, "time_per_iteration": 2.8389694690704346 }, { "auxiliary_loss_clip": 0.01554399, "auxiliary_loss_mlp": 0.01044477, "balance_loss_clip": 1.34467947, "balance_loss_mlp": 1.01856089, "epoch": 0.2201713512701037, "flos": 26698796369280.0, "grad_norm": 1.6593231991851383, "language_loss": 0.74362397, "learning_rate": 3.632468828196102e-06, "loss": 0.76961267, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2590332, "step": 3662, "time_per_iteration": 2.941701889038086 }, { "auxiliary_loss_clip": 0.01540209, "auxiliary_loss_mlp": 0.0105043, "balance_loss_clip": 1.33290553, "balance_loss_mlp": 1.02494359, "epoch": 0.22023147452277167, "flos": 22171931256960.0, "grad_norm": 1.5128658993929442, "language_loss": 0.79960454, "learning_rate": 3.632243797111929e-06, "loss": 0.82551098, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.25512695, "step": 3663, "time_per_iteration": 2.846776247024536 }, { "auxiliary_loss_clip": 0.01584774, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.36585748, "balance_loss_mlp": 1.02891183, "epoch": 0.22029159777543966, "flos": 22532126261760.0, "grad_norm": 1.6002012756667423, "language_loss": 0.81034684, "learning_rate": 3.632018704132908e-06, "loss": 0.83675867, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 2.18945312, "router_z_loss_mlp": 0.27514648, "step": 3664, "time_per_iteration": 2.879239797592163 }, { "auxiliary_loss_clip": 0.01575674, "auxiliary_loss_mlp": 0.01048834, "balance_loss_clip": 1.3564049, "balance_loss_mlp": 1.02179718, "epoch": 0.22035172102810763, "flos": 13050279308160.0, "grad_norm": 2.654225571970054, "language_loss": 0.7869646, "learning_rate": 3.6317935492675742e-06, "loss": 0.81320971, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.27050781, "step": 3665, "time_per_iteration": 2.8054111003875732 }, { "auxiliary_loss_clip": 0.0155607, "auxiliary_loss_mlp": 0.01049548, "balance_loss_clip": 1.34342301, "balance_loss_mlp": 1.02398992, "epoch": 0.2204118442807756, "flos": 12173142752640.0, "grad_norm": 2.3829151101585824, "language_loss": 0.98858523, "learning_rate": 3.631568332524466e-06, "loss": 1.0146414, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.2557373, "step": 3666, "time_per_iteration": 2.7684144973754883 }, { "auxiliary_loss_clip": 0.01544255, "auxiliary_loss_mlp": 0.01053444, "balance_loss_clip": 1.33312631, "balance_loss_mlp": 1.02540648, "epoch": 0.22047196753344356, "flos": 40122056977920.0, "grad_norm": 1.6755184617301044, "language_loss": 0.81650007, "learning_rate": 3.631343053912122e-06, "loss": 0.84247708, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.28039551, "step": 3667, "time_per_iteration": 3.074862241744995 }, { "auxiliary_loss_clip": 0.0157822, "auxiliary_loss_mlp": 0.0105323, "balance_loss_clip": 1.35922194, "balance_loss_mlp": 1.02551436, "epoch": 0.22053209078611152, "flos": 20710067160960.0, "grad_norm": 1.62350957704438, "language_loss": 0.78158367, "learning_rate": 3.631117713439087e-06, "loss": 0.80789816, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 2.19335938, "router_z_loss_mlp": 0.27722168, "step": 3668, "time_per_iteration": 2.8252410888671875 }, { "auxiliary_loss_clip": 0.01538559, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.32895494, "balance_loss_mlp": 1.02033305, "epoch": 0.2205922140387795, "flos": 24726732197760.0, "grad_norm": 1.4755785125719523, "language_loss": 0.72163588, "learning_rate": 3.630892311113904e-06, "loss": 0.74749386, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.26916504, "step": 3669, "time_per_iteration": 2.996981143951416 }, { "auxiliary_loss_clip": 0.01544059, "auxiliary_loss_mlp": 0.01045837, "balance_loss_clip": 1.33235455, "balance_loss_mlp": 1.01936007, "epoch": 0.22065233729144745, "flos": 23487726579840.0, "grad_norm": 1.7339900196806786, "language_loss": 0.86347997, "learning_rate": 3.6306668469451215e-06, "loss": 0.88937891, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.26477051, "step": 3670, "time_per_iteration": 2.840299367904663 }, { "auxiliary_loss_clip": 0.01561917, "auxiliary_loss_mlp": 0.01054283, "balance_loss_clip": 1.34628165, "balance_loss_mlp": 1.0278182, "epoch": 0.22071246054411545, "flos": 35239068892800.0, "grad_norm": 1.7642298718453084, "language_loss": 0.77511561, "learning_rate": 3.6304413209412886e-06, "loss": 0.80127764, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26489258, "step": 3671, "time_per_iteration": 2.9431350231170654 }, { "auxiliary_loss_clip": 0.01548795, "auxiliary_loss_mlp": 0.01047736, "balance_loss_clip": 1.33756471, "balance_loss_mlp": 1.02166545, "epoch": 0.2207725837967834, "flos": 18159383496960.0, "grad_norm": 1.9276752036350355, "language_loss": 0.81306082, "learning_rate": 3.6302157331109573e-06, "loss": 0.83902615, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.26098633, "step": 3672, "time_per_iteration": 2.77610445022583 }, { "auxiliary_loss_clip": 0.01549402, "auxiliary_loss_mlp": 0.01049845, "balance_loss_clip": 1.33903682, "balance_loss_mlp": 1.02502525, "epoch": 0.22083270704945138, "flos": 20487887354880.0, "grad_norm": 1.8814614766587932, "language_loss": 0.74478024, "learning_rate": 3.629990083462682e-06, "loss": 0.7707727, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24768066, "step": 3673, "time_per_iteration": 2.8167409896850586 }, { "auxiliary_loss_clip": 0.01545796, "auxiliary_loss_mlp": 0.01045756, "balance_loss_clip": 1.33597517, "balance_loss_mlp": 1.0188024, "epoch": 0.22089283030211934, "flos": 34137671270400.0, "grad_norm": 2.3003851577677925, "language_loss": 0.77533615, "learning_rate": 3.6297643720050203e-06, "loss": 0.80125165, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.26916504, "step": 3674, "time_per_iteration": 2.9564268589019775 }, { "auxiliary_loss_clip": 0.01550146, "auxiliary_loss_mlp": 0.01053364, "balance_loss_clip": 1.33844984, "balance_loss_mlp": 1.02612519, "epoch": 0.2209529535547873, "flos": 18086258620800.0, "grad_norm": 1.8842058019719292, "language_loss": 0.76367682, "learning_rate": 3.6295385987465293e-06, "loss": 0.78971195, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.27233887, "step": 3675, "time_per_iteration": 2.8900671005249023 }, { "auxiliary_loss_clip": 0.01553198, "auxiliary_loss_mlp": 0.01053109, "balance_loss_clip": 1.34019494, "balance_loss_mlp": 1.02656066, "epoch": 0.22101307680745527, "flos": 27246803114880.0, "grad_norm": 1.7013513683409005, "language_loss": 0.81008852, "learning_rate": 3.629312763695772e-06, "loss": 0.8361516, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.26550293, "step": 3676, "time_per_iteration": 2.9181697368621826 }, { "auxiliary_loss_clip": 0.01547327, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.33278227, "balance_loss_mlp": 1.02136374, "epoch": 0.22107320006012326, "flos": 16551495872640.0, "grad_norm": 2.0143616168436695, "language_loss": 0.77178288, "learning_rate": 3.6290868668613107e-06, "loss": 0.79772872, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2590332, "step": 3677, "time_per_iteration": 4.333882093429565 }, { "auxiliary_loss_clip": 0.0154144, "auxiliary_loss_mlp": 0.01046929, "balance_loss_clip": 1.3289547, "balance_loss_mlp": 1.02090549, "epoch": 0.22113332331279123, "flos": 22064619294720.0, "grad_norm": 2.069746614824054, "language_loss": 0.84606987, "learning_rate": 3.628860908251712e-06, "loss": 0.87195355, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.26049805, "step": 3678, "time_per_iteration": 2.8162953853607178 }, { "auxiliary_loss_clip": 0.01550182, "auxiliary_loss_mlp": 0.01052458, "balance_loss_clip": 1.34009469, "balance_loss_mlp": 1.02700698, "epoch": 0.2211934465654592, "flos": 26623499742720.0, "grad_norm": 1.7727009447070683, "language_loss": 0.90151638, "learning_rate": 3.6286348878755452e-06, "loss": 0.92754275, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2545166, "step": 3679, "time_per_iteration": 2.8922386169433594 }, { "auxiliary_loss_clip": 0.01592638, "auxiliary_loss_mlp": 0.0105132, "balance_loss_clip": 1.37444603, "balance_loss_mlp": 1.02470016, "epoch": 0.22125356981812716, "flos": 16368434835840.0, "grad_norm": 2.066163582602331, "language_loss": 0.87748849, "learning_rate": 3.6284088057413803e-06, "loss": 0.90392816, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.26623535, "step": 3680, "time_per_iteration": 2.822935104370117 }, { "auxiliary_loss_clip": 0.01540964, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.33385611, "balance_loss_mlp": 1.01829159, "epoch": 0.22131369307079513, "flos": 21660645306240.0, "grad_norm": 1.8352116791114714, "language_loss": 0.81804973, "learning_rate": 3.6281826618577894e-06, "loss": 0.84389764, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.25537109, "step": 3681, "time_per_iteration": 2.8633689880371094 }, { "auxiliary_loss_clip": 0.01524854, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.321908, "balance_loss_mlp": 1.02166021, "epoch": 0.2213738163234631, "flos": 19618940108160.0, "grad_norm": 2.638627396821017, "language_loss": 0.81246483, "learning_rate": 3.62795645623335e-06, "loss": 0.83818614, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.25646973, "step": 3682, "time_per_iteration": 2.8272817134857178 }, { "auxiliary_loss_clip": 0.01548082, "auxiliary_loss_mlp": 0.01049493, "balance_loss_clip": 1.33675599, "balance_loss_mlp": 1.02300453, "epoch": 0.22143393957613106, "flos": 23633976332160.0, "grad_norm": 1.5819038677764574, "language_loss": 0.78053868, "learning_rate": 3.627730188876638e-06, "loss": 0.80651438, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.26513672, "step": 3683, "time_per_iteration": 2.8330986499786377 }, { "auxiliary_loss_clip": 0.01552038, "auxiliary_loss_mlp": 0.01052489, "balance_loss_clip": 1.33509672, "balance_loss_mlp": 1.02542806, "epoch": 0.22149406282879905, "flos": 26189139231360.0, "grad_norm": 2.3624098391980493, "language_loss": 0.73754454, "learning_rate": 3.627503859796234e-06, "loss": 0.76358986, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.27075195, "step": 3684, "time_per_iteration": 2.8971121311187744 }, { "auxiliary_loss_clip": 0.01533122, "auxiliary_loss_mlp": 0.01054824, "balance_loss_clip": 1.32338214, "balance_loss_mlp": 1.02735806, "epoch": 0.221554186081467, "flos": 14547054407040.0, "grad_norm": 2.4743531564303898, "language_loss": 0.81101257, "learning_rate": 3.6272774690007207e-06, "loss": 0.83689207, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.2746582, "step": 3685, "time_per_iteration": 2.81502103805542 }, { "auxiliary_loss_clip": 0.01518332, "auxiliary_loss_mlp": 0.01061509, "balance_loss_clip": 1.31386423, "balance_loss_mlp": 1.03423405, "epoch": 0.22161430933413498, "flos": 22248811451520.0, "grad_norm": 1.4117014025950925, "language_loss": 0.87684786, "learning_rate": 3.6270510164986823e-06, "loss": 0.9026463, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.27319336, "step": 3686, "time_per_iteration": 4.306074380874634 }, { "auxiliary_loss_clip": 0.01525819, "auxiliary_loss_mlp": 0.01070648, "balance_loss_clip": 1.31715465, "balance_loss_mlp": 1.04084587, "epoch": 0.22167443258680294, "flos": 23486821683840.0, "grad_norm": 1.827666874056116, "language_loss": 0.79183149, "learning_rate": 3.626824502298707e-06, "loss": 0.81779623, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.29785156, "step": 3687, "time_per_iteration": 4.319127559661865 }, { "auxiliary_loss_clip": 0.015522, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.33489823, "balance_loss_mlp": 1.03523254, "epoch": 0.2217345558394709, "flos": 23231359687680.0, "grad_norm": 2.341458328658131, "language_loss": 0.86218399, "learning_rate": 3.626597926409383e-06, "loss": 0.88835514, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.29663086, "step": 3688, "time_per_iteration": 4.248947620391846 }, { "auxiliary_loss_clip": 0.01542868, "auxiliary_loss_mlp": 0.01056624, "balance_loss_clip": 1.32886028, "balance_loss_mlp": 1.028157, "epoch": 0.22179467909213887, "flos": 20020651856640.0, "grad_norm": 1.7384778086244979, "language_loss": 0.82426679, "learning_rate": 3.6263712888393027e-06, "loss": 0.85026169, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.28466797, "step": 3689, "time_per_iteration": 2.8132243156433105 }, { "auxiliary_loss_clip": 0.01522264, "auxiliary_loss_mlp": 0.01074015, "balance_loss_clip": 1.3145597, "balance_loss_mlp": 1.04499984, "epoch": 0.22185480234480687, "flos": 19692607921920.0, "grad_norm": 1.6242320643922146, "language_loss": 0.71671546, "learning_rate": 3.626144589597061e-06, "loss": 0.74267828, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.2902832, "step": 3690, "time_per_iteration": 2.8555452823638916 }, { "auxiliary_loss_clip": 0.01532952, "auxiliary_loss_mlp": 0.01062989, "balance_loss_clip": 1.3188808, "balance_loss_mlp": 1.0328052, "epoch": 0.22191492559747483, "flos": 21991403928960.0, "grad_norm": 1.6744829651121327, "language_loss": 0.73484838, "learning_rate": 3.6259178286912528e-06, "loss": 0.76080775, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.30175781, "step": 3691, "time_per_iteration": 2.8474085330963135 }, { "auxiliary_loss_clip": 0.01517481, "auxiliary_loss_mlp": 0.01068514, "balance_loss_clip": 1.30975842, "balance_loss_mlp": 1.03594601, "epoch": 0.2219750488501428, "flos": 23232536052480.0, "grad_norm": 1.9043043595744231, "language_loss": 0.72802353, "learning_rate": 3.625691006130477e-06, "loss": 0.75388348, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.32568359, "step": 3692, "time_per_iteration": 2.8302063941955566 }, { "auxiliary_loss_clip": 0.01531918, "auxiliary_loss_mlp": 0.01060433, "balance_loss_clip": 1.31971872, "balance_loss_mlp": 1.03034472, "epoch": 0.22203517210281076, "flos": 22463390131200.0, "grad_norm": 2.07943778477618, "language_loss": 0.87677133, "learning_rate": 3.6254641219233362e-06, "loss": 0.90269482, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.30102539, "step": 3693, "time_per_iteration": 2.8835949897766113 }, { "auxiliary_loss_clip": 0.01508168, "auxiliary_loss_mlp": 0.01055709, "balance_loss_clip": 1.30488992, "balance_loss_mlp": 1.02645516, "epoch": 0.22209529535547873, "flos": 17572981898880.0, "grad_norm": 2.265172959033575, "language_loss": 0.8597343, "learning_rate": 3.6252371760784325e-06, "loss": 0.88537312, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.29223633, "step": 3694, "time_per_iteration": 2.807481288909912 }, { "auxiliary_loss_clip": 0.01538653, "auxiliary_loss_mlp": 0.01049744, "balance_loss_clip": 1.32070518, "balance_loss_mlp": 1.02025175, "epoch": 0.2221554186081467, "flos": 21478579655040.0, "grad_norm": 2.1405591901886263, "language_loss": 0.70808375, "learning_rate": 3.6250101686043725e-06, "loss": 0.73396766, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 2.18164062, "router_z_loss_mlp": 0.29443359, "step": 3695, "time_per_iteration": 2.8378751277923584 }, { "auxiliary_loss_clip": 0.01504991, "auxiliary_loss_mlp": 0.01052052, "balance_loss_clip": 1.30282021, "balance_loss_mlp": 1.0232029, "epoch": 0.22221554186081466, "flos": 27685099923840.0, "grad_norm": 1.4591319382742496, "language_loss": 0.72429812, "learning_rate": 3.6247830995097637e-06, "loss": 0.74986857, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.28857422, "step": 3696, "time_per_iteration": 2.880035400390625 }, { "auxiliary_loss_clip": 0.01521868, "auxiliary_loss_mlp": 0.01051763, "balance_loss_clip": 1.31282651, "balance_loss_mlp": 1.02060127, "epoch": 0.22227566511348265, "flos": 25970217050880.0, "grad_norm": 1.6236684126671401, "language_loss": 0.88405144, "learning_rate": 3.624555968803217e-06, "loss": 0.90978765, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.31152344, "step": 3697, "time_per_iteration": 2.925234079360962 }, { "auxiliary_loss_clip": 0.01512653, "auxiliary_loss_mlp": 0.01047719, "balance_loss_clip": 1.31021452, "balance_loss_mlp": 1.01962113, "epoch": 0.22233578836615062, "flos": 39218515441920.0, "grad_norm": 1.625401334914006, "language_loss": 0.66932386, "learning_rate": 3.624328776493346e-06, "loss": 0.69492757, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.28112793, "step": 3698, "time_per_iteration": 3.032222032546997 }, { "auxiliary_loss_clip": 0.01531077, "auxiliary_loss_mlp": 0.01052922, "balance_loss_clip": 1.32219326, "balance_loss_mlp": 1.02526593, "epoch": 0.22239591161881858, "flos": 36297592427520.0, "grad_norm": 1.745801909224781, "language_loss": 0.8371138, "learning_rate": 3.6241015225887637e-06, "loss": 0.86295378, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.27685547, "step": 3699, "time_per_iteration": 3.0708842277526855 }, { "auxiliary_loss_clip": 0.01518083, "auxiliary_loss_mlp": 0.01052441, "balance_loss_clip": 1.3117578, "balance_loss_mlp": 1.0244267, "epoch": 0.22245603487148655, "flos": 19729192982400.0, "grad_norm": 1.4674400596607815, "language_loss": 0.8061806, "learning_rate": 3.62387420709809e-06, "loss": 0.83188581, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.2800293, "step": 3700, "time_per_iteration": 2.8814496994018555 }, { "auxiliary_loss_clip": 0.01524298, "auxiliary_loss_mlp": 0.01050346, "balance_loss_clip": 1.31281376, "balance_loss_mlp": 1.02230823, "epoch": 0.2225161581241545, "flos": 46296742890240.0, "grad_norm": 1.8739201316578045, "language_loss": 0.73064053, "learning_rate": 3.623646830029943e-06, "loss": 0.756387, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.28051758, "step": 3701, "time_per_iteration": 3.0360448360443115 }, { "auxiliary_loss_clip": 0.01533225, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.32457376, "balance_loss_mlp": 1.02780616, "epoch": 0.22257628137682248, "flos": 23706739249920.0, "grad_norm": 1.7040491240413036, "language_loss": 0.81475317, "learning_rate": 3.6234193913929454e-06, "loss": 0.84065485, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.29125977, "step": 3702, "time_per_iteration": 2.8876004219055176 }, { "auxiliary_loss_clip": 0.01504455, "auxiliary_loss_mlp": 0.01061735, "balance_loss_clip": 1.30291271, "balance_loss_mlp": 1.03279126, "epoch": 0.22263640462949044, "flos": 19363297132800.0, "grad_norm": 1.9879889272703812, "language_loss": 0.79133493, "learning_rate": 3.623191891195723e-06, "loss": 0.81699681, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.28881836, "step": 3703, "time_per_iteration": 2.903937816619873 }, { "auxiliary_loss_clip": 0.01534618, "auxiliary_loss_mlp": 0.01066673, "balance_loss_clip": 1.32198787, "balance_loss_mlp": 1.03675115, "epoch": 0.22269652788215843, "flos": 20785499521920.0, "grad_norm": 2.027828373196379, "language_loss": 0.76006395, "learning_rate": 3.6229643294469005e-06, "loss": 0.78607684, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.29907227, "step": 3704, "time_per_iteration": 2.8594677448272705 }, { "auxiliary_loss_clip": 0.01509844, "auxiliary_loss_mlp": 0.01073549, "balance_loss_clip": 1.30759084, "balance_loss_mlp": 1.04626155, "epoch": 0.2227566511348264, "flos": 47975583640320.0, "grad_norm": 1.7312814155415173, "language_loss": 0.65309811, "learning_rate": 3.6227367061551074e-06, "loss": 0.67893195, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.27331543, "step": 3705, "time_per_iteration": 3.0836493968963623 }, { "auxiliary_loss_clip": 0.01327022, "auxiliary_loss_mlp": 0.01080512, "balance_loss_clip": 1.20234537, "balance_loss_mlp": 1.04598927, "epoch": 0.22281677438749437, "flos": 66244903297920.0, "grad_norm": 1.4423330771618499, "language_loss": 0.6528728, "learning_rate": 3.6225090213289766e-06, "loss": 0.67694807, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.34570312, "step": 3706, "time_per_iteration": 3.2226431369781494 }, { "auxiliary_loss_clip": 0.01529484, "auxiliary_loss_mlp": 0.01068963, "balance_loss_clip": 1.31984937, "balance_loss_mlp": 1.04134202, "epoch": 0.22287689764016233, "flos": 21881603502720.0, "grad_norm": 2.049113596674448, "language_loss": 0.81535149, "learning_rate": 3.622281274977141e-06, "loss": 0.84133589, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.27648926, "step": 3707, "time_per_iteration": 2.856508493423462 }, { "auxiliary_loss_clip": 0.01515275, "auxiliary_loss_mlp": 0.01070343, "balance_loss_clip": 1.31057656, "balance_loss_mlp": 1.04116035, "epoch": 0.2229370208928303, "flos": 27683878314240.0, "grad_norm": 2.0399772428031526, "language_loss": 0.79370618, "learning_rate": 3.6220534671082367e-06, "loss": 0.81956238, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.29199219, "step": 3708, "time_per_iteration": 2.914273738861084 }, { "auxiliary_loss_clip": 0.01538146, "auxiliary_loss_mlp": 0.01069448, "balance_loss_clip": 1.32525301, "balance_loss_mlp": 1.0413624, "epoch": 0.22299714414549826, "flos": 30167002212480.0, "grad_norm": 1.8900729925800335, "language_loss": 0.81362772, "learning_rate": 3.6218255977309024e-06, "loss": 0.83970368, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.28063965, "step": 3709, "time_per_iteration": 2.8960230350494385 }, { "auxiliary_loss_clip": 0.01539169, "auxiliary_loss_mlp": 0.01067588, "balance_loss_clip": 1.32792854, "balance_loss_mlp": 1.03916883, "epoch": 0.22305726739816625, "flos": 23152036273920.0, "grad_norm": 1.7383643031380074, "language_loss": 0.69278044, "learning_rate": 3.6215976668537787e-06, "loss": 0.71884799, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.28393555, "step": 3710, "time_per_iteration": 2.89029598236084 }, { "auxiliary_loss_clip": 0.01529957, "auxiliary_loss_mlp": 0.01062262, "balance_loss_clip": 1.31907535, "balance_loss_mlp": 1.03349674, "epoch": 0.22311739065083422, "flos": 19181457705600.0, "grad_norm": 1.983189018332301, "language_loss": 0.91576064, "learning_rate": 3.6213696744855096e-06, "loss": 0.94168282, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.28747559, "step": 3711, "time_per_iteration": 2.7950806617736816 }, { "auxiliary_loss_clip": 0.01533214, "auxiliary_loss_mlp": 0.01063422, "balance_loss_clip": 1.32236636, "balance_loss_mlp": 1.03501415, "epoch": 0.22317751390350218, "flos": 13623514669440.0, "grad_norm": 2.7561736582618495, "language_loss": 0.90810961, "learning_rate": 3.6211416206347395e-06, "loss": 0.93407601, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.28430176, "step": 3712, "time_per_iteration": 4.2227325439453125 }, { "auxiliary_loss_clip": 0.01515393, "auxiliary_loss_mlp": 0.01061958, "balance_loss_clip": 1.3127768, "balance_loss_mlp": 1.0322032, "epoch": 0.22323763715617015, "flos": 11034617132160.0, "grad_norm": 3.117126213710573, "language_loss": 0.76435262, "learning_rate": 3.620913505310117e-06, "loss": 0.79012609, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.29724121, "step": 3713, "time_per_iteration": 2.7940170764923096 }, { "auxiliary_loss_clip": 0.01528658, "auxiliary_loss_mlp": 0.0105681, "balance_loss_clip": 1.32027483, "balance_loss_mlp": 1.02696049, "epoch": 0.22329776040883811, "flos": 41365677565440.0, "grad_norm": 2.011605690957797, "language_loss": 0.63568819, "learning_rate": 3.6206853285202917e-06, "loss": 0.66154277, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.29833984, "step": 3714, "time_per_iteration": 3.0150184631347656 }, { "auxiliary_loss_clip": 0.01530868, "auxiliary_loss_mlp": 0.01046693, "balance_loss_clip": 1.3231349, "balance_loss_mlp": 1.01910758, "epoch": 0.22335788366150608, "flos": 25130479962240.0, "grad_norm": 4.880643617434365, "language_loss": 0.80048847, "learning_rate": 3.6204570902739164e-06, "loss": 0.82626408, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.27587891, "step": 3715, "time_per_iteration": 2.8704912662506104 }, { "auxiliary_loss_clip": 0.01526097, "auxiliary_loss_mlp": 0.01051503, "balance_loss_clip": 1.31759357, "balance_loss_mlp": 1.02320325, "epoch": 0.22341800691417404, "flos": 16992326390400.0, "grad_norm": 1.7505197049793304, "language_loss": 0.78563833, "learning_rate": 3.620228790579645e-06, "loss": 0.81141424, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.28271484, "step": 3716, "time_per_iteration": 2.8237810134887695 }, { "auxiliary_loss_clip": 0.01539449, "auxiliary_loss_mlp": 0.01052589, "balance_loss_clip": 1.32963288, "balance_loss_mlp": 1.02354908, "epoch": 0.22347813016684204, "flos": 14144754476160.0, "grad_norm": 2.04961603196267, "language_loss": 0.80359203, "learning_rate": 3.6200004294461367e-06, "loss": 0.82951236, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.29077148, "step": 3717, "time_per_iteration": 2.8803696632385254 }, { "auxiliary_loss_clip": 0.01540288, "auxiliary_loss_mlp": 0.01050632, "balance_loss_clip": 1.32624543, "balance_loss_mlp": 1.02226031, "epoch": 0.22353825341951, "flos": 23592731057280.0, "grad_norm": 1.8158578987619551, "language_loss": 0.6810385, "learning_rate": 3.6197720068820497e-06, "loss": 0.70694768, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.28369141, "step": 3718, "time_per_iteration": 2.984872341156006 }, { "auxiliary_loss_clip": 0.01532644, "auxiliary_loss_mlp": 0.01051017, "balance_loss_clip": 1.3212806, "balance_loss_mlp": 1.02138114, "epoch": 0.22359837667217797, "flos": 29835926876160.0, "grad_norm": 1.4559992634706882, "language_loss": 0.8173362, "learning_rate": 3.619543522896045e-06, "loss": 0.84317279, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.29638672, "step": 3719, "time_per_iteration": 2.9412968158721924 }, { "auxiliary_loss_clip": 0.01551766, "auxiliary_loss_mlp": 0.01048242, "balance_loss_clip": 1.33589101, "balance_loss_mlp": 1.01907194, "epoch": 0.22365849992484593, "flos": 17612055423360.0, "grad_norm": 2.232010442755566, "language_loss": 0.87658888, "learning_rate": 3.6193149774967885e-06, "loss": 0.90258896, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.29162598, "step": 3720, "time_per_iteration": 4.215553522109985 }, { "auxiliary_loss_clip": 0.01532494, "auxiliary_loss_mlp": 0.01045035, "balance_loss_clip": 1.32721972, "balance_loss_mlp": 1.01535177, "epoch": 0.2237186231775139, "flos": 22721204856960.0, "grad_norm": 2.1636039356843293, "language_loss": 0.75079429, "learning_rate": 3.619086370692945e-06, "loss": 0.7765696, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.296875, "step": 3721, "time_per_iteration": 2.8520843982696533 }, { "auxiliary_loss_clip": 0.0155088, "auxiliary_loss_mlp": 0.01054266, "balance_loss_clip": 1.33496654, "balance_loss_mlp": 1.02541721, "epoch": 0.22377874643018186, "flos": 13379590097280.0, "grad_norm": 11.694820842464114, "language_loss": 0.7990551, "learning_rate": 3.6188577024931844e-06, "loss": 0.82510662, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.28881836, "step": 3722, "time_per_iteration": 4.309507369995117 }, { "auxiliary_loss_clip": 0.01538341, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.32884693, "balance_loss_mlp": 1.02120066, "epoch": 0.22383886968284986, "flos": 17904011990400.0, "grad_norm": 2.126990282034098, "language_loss": 0.83411515, "learning_rate": 3.618628972906178e-06, "loss": 0.85998666, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.27600098, "step": 3723, "time_per_iteration": 4.243016719818115 }, { "auxiliary_loss_clip": 0.01547693, "auxiliary_loss_mlp": 0.01049168, "balance_loss_clip": 1.33448362, "balance_loss_mlp": 1.02045083, "epoch": 0.22389899293551782, "flos": 23889845531520.0, "grad_norm": 1.7704934336216331, "language_loss": 0.8582375, "learning_rate": 3.6184001819405984e-06, "loss": 0.88420612, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.28723145, "step": 3724, "time_per_iteration": 2.870314598083496 }, { "auxiliary_loss_clip": 0.01537887, "auxiliary_loss_mlp": 0.01049965, "balance_loss_clip": 1.32806158, "balance_loss_mlp": 1.02195072, "epoch": 0.2239591161881858, "flos": 27284066847360.0, "grad_norm": 2.0011244910583916, "language_loss": 0.80182076, "learning_rate": 3.618171329605121e-06, "loss": 0.8276993, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.28051758, "step": 3725, "time_per_iteration": 2.8906025886535645 }, { "auxiliary_loss_clip": 0.01535217, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.3256222, "balance_loss_mlp": 1.0187875, "epoch": 0.22401923944085375, "flos": 22247046904320.0, "grad_norm": 2.549225671169216, "language_loss": 0.77984941, "learning_rate": 3.6179424159084254e-06, "loss": 0.80567551, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.28613281, "step": 3726, "time_per_iteration": 2.824796676635742 }, { "auxiliary_loss_clip": 0.01569909, "auxiliary_loss_mlp": 0.0105241, "balance_loss_clip": 1.34903967, "balance_loss_mlp": 1.02239263, "epoch": 0.22407936269352172, "flos": 12060265680000.0, "grad_norm": 4.2315692056733285, "language_loss": 0.74257636, "learning_rate": 3.6177134408591914e-06, "loss": 0.76879954, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 2.20507812, "router_z_loss_mlp": 0.30029297, "step": 3727, "time_per_iteration": 2.855062246322632 }, { "auxiliary_loss_clip": 0.01550058, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.33492005, "balance_loss_mlp": 1.02039528, "epoch": 0.22413948594618968, "flos": 19362754195200.0, "grad_norm": 2.076897817432705, "language_loss": 0.87962246, "learning_rate": 3.6174844044661013e-06, "loss": 0.90562737, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.30029297, "step": 3728, "time_per_iteration": 2.8009896278381348 }, { "auxiliary_loss_clip": 0.01544985, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.33253193, "balance_loss_mlp": 1.02024961, "epoch": 0.22419960919885765, "flos": 24180535244160.0, "grad_norm": 3.12615415539506, "language_loss": 0.8177073, "learning_rate": 3.6172553067378406e-06, "loss": 0.84366357, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 2.12402344, "router_z_loss_mlp": 0.30371094, "step": 3729, "time_per_iteration": 2.8567426204681396 }, { "auxiliary_loss_clip": 0.0153295, "auxiliary_loss_mlp": 0.01055105, "balance_loss_clip": 1.32444918, "balance_loss_mlp": 1.02718639, "epoch": 0.22425973245152564, "flos": 27389885731200.0, "grad_norm": 1.6469958283355879, "language_loss": 0.87690419, "learning_rate": 3.6170261476830964e-06, "loss": 0.90278471, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.27905273, "step": 3730, "time_per_iteration": 2.886948347091675 }, { "auxiliary_loss_clip": 0.01540269, "auxiliary_loss_mlp": 0.0104761, "balance_loss_clip": 1.33250141, "balance_loss_mlp": 1.01888108, "epoch": 0.2243198557041936, "flos": 13743630910080.0, "grad_norm": 1.8795527029717858, "language_loss": 0.7484799, "learning_rate": 3.616796927310559e-06, "loss": 0.77435869, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.28710938, "step": 3731, "time_per_iteration": 2.8478658199310303 }, { "auxiliary_loss_clip": 0.01564747, "auxiliary_loss_mlp": 0.01046847, "balance_loss_clip": 1.35088468, "balance_loss_mlp": 1.01941705, "epoch": 0.22437997895686157, "flos": 19538576064000.0, "grad_norm": 2.8598084193124893, "language_loss": 0.76624972, "learning_rate": 3.6165676456289195e-06, "loss": 0.79236567, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.27416992, "step": 3732, "time_per_iteration": 2.8200795650482178 }, { "auxiliary_loss_clip": 0.01542485, "auxiliary_loss_mlp": 0.01053702, "balance_loss_clip": 1.33263206, "balance_loss_mlp": 1.02506781, "epoch": 0.22444010220952954, "flos": 23706784494720.0, "grad_norm": 1.6276970271148343, "language_loss": 0.89356577, "learning_rate": 3.616338302646873e-06, "loss": 0.91952759, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.28662109, "step": 3733, "time_per_iteration": 2.858466625213623 }, { "auxiliary_loss_clip": 0.01544236, "auxiliary_loss_mlp": 0.01052947, "balance_loss_clip": 1.33369946, "balance_loss_mlp": 1.02207148, "epoch": 0.2245002254621975, "flos": 22393160922240.0, "grad_norm": 1.5147335682843577, "language_loss": 0.85221756, "learning_rate": 3.6161088983731166e-06, "loss": 0.87818933, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.30883789, "step": 3734, "time_per_iteration": 2.8464205265045166 }, { "auxiliary_loss_clip": 0.015403, "auxiliary_loss_mlp": 0.01059528, "balance_loss_clip": 1.33037925, "balance_loss_mlp": 1.0311327, "epoch": 0.22456034871486547, "flos": 26953398714240.0, "grad_norm": 1.922770622506954, "language_loss": 0.77510822, "learning_rate": 3.6158794328163482e-06, "loss": 0.80110651, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.28393555, "step": 3735, "time_per_iteration": 2.876671552658081 }, { "auxiliary_loss_clip": 0.01531448, "auxiliary_loss_mlp": 0.01050451, "balance_loss_clip": 1.32588935, "balance_loss_mlp": 1.02112556, "epoch": 0.22462047196753343, "flos": 28994651464320.0, "grad_norm": 1.6350834856147414, "language_loss": 0.85378397, "learning_rate": 3.6156499059852702e-06, "loss": 0.87960291, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.29284668, "step": 3736, "time_per_iteration": 2.954570770263672 }, { "auxiliary_loss_clip": 0.01556441, "auxiliary_loss_mlp": 0.01045969, "balance_loss_clip": 1.34338951, "balance_loss_mlp": 1.01709652, "epoch": 0.22468059522020142, "flos": 20020968570240.0, "grad_norm": 1.669360671691999, "language_loss": 0.87051022, "learning_rate": 3.615420317888586e-06, "loss": 0.89653432, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.28881836, "step": 3737, "time_per_iteration": 2.85334849357605 }, { "auxiliary_loss_clip": 0.01552174, "auxiliary_loss_mlp": 0.01057983, "balance_loss_clip": 1.33818412, "balance_loss_mlp": 1.0264163, "epoch": 0.2247407184728694, "flos": 29326043514240.0, "grad_norm": 2.4424319613994316, "language_loss": 0.79944551, "learning_rate": 3.6151906685350006e-06, "loss": 0.8255471, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.31567383, "step": 3738, "time_per_iteration": 2.961815118789673 }, { "auxiliary_loss_clip": 0.01545414, "auxiliary_loss_mlp": 0.01051719, "balance_loss_clip": 1.33292365, "balance_loss_mlp": 1.02363324, "epoch": 0.22480084172553735, "flos": 22320714718080.0, "grad_norm": 1.6782565651325472, "language_loss": 0.77466273, "learning_rate": 3.614960957933224e-06, "loss": 0.80063403, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.28076172, "step": 3739, "time_per_iteration": 2.836958885192871 }, { "auxiliary_loss_clip": 0.01534717, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.32427025, "balance_loss_mlp": 1.01985335, "epoch": 0.22486096497820532, "flos": 25601742247680.0, "grad_norm": 4.571255832643259, "language_loss": 0.75497323, "learning_rate": 3.6147311860919655e-06, "loss": 0.78084058, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.3215332, "step": 3740, "time_per_iteration": 2.8794212341308594 }, { "auxiliary_loss_clip": 0.01526577, "auxiliary_loss_mlp": 0.01047601, "balance_loss_clip": 1.31819129, "balance_loss_mlp": 1.01880002, "epoch": 0.22492108823087328, "flos": 17648866707840.0, "grad_norm": 1.887467425843772, "language_loss": 0.77047449, "learning_rate": 3.614501353019939e-06, "loss": 0.79621625, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.28796387, "step": 3741, "time_per_iteration": 2.8430638313293457 }, { "auxiliary_loss_clip": 0.0153776, "auxiliary_loss_mlp": 0.01047233, "balance_loss_clip": 1.33054364, "balance_loss_mlp": 1.01881313, "epoch": 0.22498121148354125, "flos": 16043331813120.0, "grad_norm": 1.715347611607743, "language_loss": 0.88517159, "learning_rate": 3.6142714587258592e-06, "loss": 0.91102153, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.28381348, "step": 3742, "time_per_iteration": 2.911713123321533 }, { "auxiliary_loss_clip": 0.01521661, "auxiliary_loss_mlp": 0.01053344, "balance_loss_clip": 1.31659913, "balance_loss_mlp": 1.02294612, "epoch": 0.22504133473620924, "flos": 24034195002240.0, "grad_norm": 2.4258360730147785, "language_loss": 0.8243317, "learning_rate": 3.614041503218444e-06, "loss": 0.85008168, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.30419922, "step": 3743, "time_per_iteration": 3.0027434825897217 }, { "auxiliary_loss_clip": 0.01534911, "auxiliary_loss_mlp": 0.0104612, "balance_loss_clip": 1.3240521, "balance_loss_mlp": 1.01736689, "epoch": 0.2251014579888772, "flos": 16772725537920.0, "grad_norm": 2.069787498989708, "language_loss": 0.64260972, "learning_rate": 3.6138114865064134e-06, "loss": 0.66841996, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.28735352, "step": 3744, "time_per_iteration": 2.8116295337677 }, { "auxiliary_loss_clip": 0.01521203, "auxiliary_loss_mlp": 0.01050282, "balance_loss_clip": 1.31002629, "balance_loss_mlp": 1.0218153, "epoch": 0.22516158124154517, "flos": 13999047661440.0, "grad_norm": 3.0721771725421743, "language_loss": 0.76982129, "learning_rate": 3.613581408598489e-06, "loss": 0.79553616, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.28466797, "step": 3745, "time_per_iteration": 2.823042392730713 }, { "auxiliary_loss_clip": 0.01525826, "auxiliary_loss_mlp": 0.01048829, "balance_loss_clip": 1.31792748, "balance_loss_mlp": 1.02054048, "epoch": 0.22522170449421314, "flos": 14397547029120.0, "grad_norm": 1.8942995207489424, "language_loss": 0.81737006, "learning_rate": 3.6133512695033965e-06, "loss": 0.84311658, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.28295898, "step": 3746, "time_per_iteration": 4.264824628829956 }, { "auxiliary_loss_clip": 0.01529026, "auxiliary_loss_mlp": 0.0105502, "balance_loss_clip": 1.31770778, "balance_loss_mlp": 1.02620721, "epoch": 0.2252818277468811, "flos": 23816087228160.0, "grad_norm": 2.2278963911297613, "language_loss": 0.87581521, "learning_rate": 3.613121069229862e-06, "loss": 0.90165555, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.28820801, "step": 3747, "time_per_iteration": 2.819066286087036 }, { "auxiliary_loss_clip": 0.01532456, "auxiliary_loss_mlp": 0.01047654, "balance_loss_clip": 1.32214904, "balance_loss_mlp": 1.01928186, "epoch": 0.22534195099954907, "flos": 24728903948160.0, "grad_norm": 1.8023029524812735, "language_loss": 0.77460176, "learning_rate": 3.6128908077866145e-06, "loss": 0.80040288, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.28417969, "step": 3748, "time_per_iteration": 2.922307252883911 }, { "auxiliary_loss_clip": 0.01526722, "auxiliary_loss_mlp": 0.01050875, "balance_loss_clip": 1.31868196, "balance_loss_mlp": 1.02147841, "epoch": 0.22540207425221703, "flos": 21042092638080.0, "grad_norm": 1.9816409818737228, "language_loss": 0.80710018, "learning_rate": 3.6126604851823864e-06, "loss": 0.83287609, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.29418945, "step": 3749, "time_per_iteration": 2.8502230644226074 }, { "auxiliary_loss_clip": 0.01512205, "auxiliary_loss_mlp": 0.01048275, "balance_loss_clip": 1.30581737, "balance_loss_mlp": 1.01959372, "epoch": 0.22546219750488503, "flos": 19399655969280.0, "grad_norm": 1.525089940026836, "language_loss": 0.80334485, "learning_rate": 3.6124301014259108e-06, "loss": 0.82894957, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.28686523, "step": 3750, "time_per_iteration": 2.8381307125091553 }, { "auxiliary_loss_clip": 0.01539488, "auxiliary_loss_mlp": 0.01055856, "balance_loss_clip": 1.32651067, "balance_loss_mlp": 1.02750778, "epoch": 0.225522320757553, "flos": 25203695328000.0, "grad_norm": 5.416841267111513, "language_loss": 0.83306801, "learning_rate": 3.6121996565259244e-06, "loss": 0.85902143, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.28356934, "step": 3751, "time_per_iteration": 2.883805751800537 }, { "auxiliary_loss_clip": 0.01537812, "auxiliary_loss_mlp": 0.01054101, "balance_loss_clip": 1.33054662, "balance_loss_mlp": 1.02563381, "epoch": 0.22558244401022096, "flos": 17171405884800.0, "grad_norm": 2.5850435224094546, "language_loss": 0.84657884, "learning_rate": 3.611969150491165e-06, "loss": 0.87249798, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.28466797, "step": 3752, "time_per_iteration": 2.7997288703918457 }, { "auxiliary_loss_clip": 0.01519272, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.3126328, "balance_loss_mlp": 1.01533091, "epoch": 0.22564256726288892, "flos": 15239229644160.0, "grad_norm": 1.7513019381399806, "language_loss": 0.79261029, "learning_rate": 3.611738583330375e-06, "loss": 0.81821269, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.25634766, "step": 3753, "time_per_iteration": 2.832828998565674 }, { "auxiliary_loss_clip": 0.0152849, "auxiliary_loss_mlp": 0.01045703, "balance_loss_clip": 1.32181883, "balance_loss_mlp": 1.01767659, "epoch": 0.2257026905155569, "flos": 34581940392960.0, "grad_norm": 1.7818187463908335, "language_loss": 0.79554057, "learning_rate": 3.611507955052295e-06, "loss": 0.82128251, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.28039551, "step": 3754, "time_per_iteration": 2.9197747707366943 }, { "auxiliary_loss_clip": 0.01521631, "auxiliary_loss_mlp": 0.01048182, "balance_loss_clip": 1.31670439, "balance_loss_mlp": 1.01995289, "epoch": 0.22576281376822485, "flos": 19948069918080.0, "grad_norm": 1.7790457126556039, "language_loss": 0.7103247, "learning_rate": 3.6112772656656727e-06, "loss": 0.73602283, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.28234863, "step": 3755, "time_per_iteration": 4.252835512161255 }, { "auxiliary_loss_clip": 0.0153807, "auxiliary_loss_mlp": 0.01053668, "balance_loss_clip": 1.32546556, "balance_loss_mlp": 1.02601171, "epoch": 0.22582293702089282, "flos": 24611140437120.0, "grad_norm": 2.280353507914846, "language_loss": 0.7817024, "learning_rate": 3.6110465151792547e-06, "loss": 0.80761975, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.27685547, "step": 3756, "time_per_iteration": 2.9065020084381104 }, { "auxiliary_loss_clip": 0.01549891, "auxiliary_loss_mlp": 0.01043932, "balance_loss_clip": 1.33700562, "balance_loss_mlp": 1.01653814, "epoch": 0.2258830602735608, "flos": 23044769556480.0, "grad_norm": 13.18143871256427, "language_loss": 0.83606517, "learning_rate": 3.6108157036017916e-06, "loss": 0.86200339, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.27404785, "step": 3757, "time_per_iteration": 4.252346515655518 }, { "auxiliary_loss_clip": 0.01543328, "auxiliary_loss_mlp": 0.01043795, "balance_loss_clip": 1.33022404, "balance_loss_mlp": 1.01597166, "epoch": 0.22594318352622877, "flos": 22167180552960.0, "grad_norm": 2.3032334805131, "language_loss": 0.73818803, "learning_rate": 3.6105848309420358e-06, "loss": 0.76405931, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.27832031, "step": 3758, "time_per_iteration": 4.20307469367981 }, { "auxiliary_loss_clip": 0.01532222, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.32114172, "balance_loss_mlp": 1.02086139, "epoch": 0.22600330677889674, "flos": 20603886318720.0, "grad_norm": 2.182321432903683, "language_loss": 0.77273977, "learning_rate": 3.6103538972087412e-06, "loss": 0.79855049, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.27978516, "step": 3759, "time_per_iteration": 2.8294897079467773 }, { "auxiliary_loss_clip": 0.01539222, "auxiliary_loss_mlp": 0.01049388, "balance_loss_clip": 1.32678676, "balance_loss_mlp": 1.02207673, "epoch": 0.2260634300315647, "flos": 35672479263360.0, "grad_norm": 1.711283843432931, "language_loss": 0.78981626, "learning_rate": 3.6101229024106655e-06, "loss": 0.81570238, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.2734375, "step": 3760, "time_per_iteration": 2.969528913497925 }, { "auxiliary_loss_clip": 0.0129122, "auxiliary_loss_mlp": 0.01043986, "balance_loss_clip": 1.16764665, "balance_loss_mlp": 1.01461303, "epoch": 0.22612355328423267, "flos": 72117226339200.0, "grad_norm": 0.954599524700344, "language_loss": 0.60130632, "learning_rate": 3.609891846556569e-06, "loss": 0.62465835, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.29296875, "step": 3761, "time_per_iteration": 3.2857141494750977 }, { "auxiliary_loss_clip": 0.01551985, "auxiliary_loss_mlp": 0.01055522, "balance_loss_clip": 1.33575988, "balance_loss_mlp": 1.02957034, "epoch": 0.22618367653690064, "flos": 22794013019520.0, "grad_norm": 2.9064264919473635, "language_loss": 0.77778077, "learning_rate": 3.609660729655211e-06, "loss": 0.80385584, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.25927734, "step": 3762, "time_per_iteration": 2.872756004333496 }, { "auxiliary_loss_clip": 0.0155121, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.33875978, "balance_loss_mlp": 1.0177803, "epoch": 0.22624379978956863, "flos": 20457772300800.0, "grad_norm": 3.42500249029195, "language_loss": 0.79914749, "learning_rate": 3.6094295517153573e-06, "loss": 0.82510543, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.26831055, "step": 3763, "time_per_iteration": 2.823897123336792 }, { "auxiliary_loss_clip": 0.01553856, "auxiliary_loss_mlp": 0.01054136, "balance_loss_clip": 1.33790994, "balance_loss_mlp": 1.02649117, "epoch": 0.2263039230422366, "flos": 17503431361920.0, "grad_norm": 1.713214065018996, "language_loss": 0.92705798, "learning_rate": 3.6091983127457743e-06, "loss": 0.95313793, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.27661133, "step": 3764, "time_per_iteration": 2.8070759773254395 }, { "auxiliary_loss_clip": 0.01534094, "auxiliary_loss_mlp": 0.01053869, "balance_loss_clip": 1.32784247, "balance_loss_mlp": 1.02797627, "epoch": 0.22636404629490456, "flos": 28341459262080.0, "grad_norm": 1.6772547502973378, "language_loss": 0.75946146, "learning_rate": 3.6089670127552293e-06, "loss": 0.78534108, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.25878906, "step": 3765, "time_per_iteration": 2.8975167274475098 }, { "auxiliary_loss_clip": 0.01540107, "auxiliary_loss_mlp": 0.01055675, "balance_loss_clip": 1.33289647, "balance_loss_mlp": 1.02965176, "epoch": 0.22642416954757252, "flos": 17496916110720.0, "grad_norm": 1.88371499543693, "language_loss": 0.91106641, "learning_rate": 3.608735651752494e-06, "loss": 0.93702424, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.26013184, "step": 3766, "time_per_iteration": 2.7890732288360596 }, { "auxiliary_loss_clip": 0.01533339, "auxiliary_loss_mlp": 0.01054196, "balance_loss_clip": 1.33021259, "balance_loss_mlp": 1.02726626, "epoch": 0.2264842928002405, "flos": 24393982803840.0, "grad_norm": 1.5401573321021198, "language_loss": 0.7571919, "learning_rate": 3.6085042297463417e-06, "loss": 0.78306723, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.26940918, "step": 3767, "time_per_iteration": 2.8692991733551025 }, { "auxiliary_loss_clip": 0.01553821, "auxiliary_loss_mlp": 0.01061337, "balance_loss_clip": 1.34168208, "balance_loss_mlp": 1.03459883, "epoch": 0.22654441605290845, "flos": 19840531731840.0, "grad_norm": 1.4938241348761294, "language_loss": 0.72595799, "learning_rate": 3.6082727467455477e-06, "loss": 0.75210965, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.26733398, "step": 3768, "time_per_iteration": 2.849514961242676 }, { "auxiliary_loss_clip": 0.01539211, "auxiliary_loss_mlp": 0.01060941, "balance_loss_clip": 1.33475351, "balance_loss_mlp": 1.03492975, "epoch": 0.22660453930557642, "flos": 27466177743360.0, "grad_norm": 1.7611297152614769, "language_loss": 0.78781283, "learning_rate": 3.6080412027588905e-06, "loss": 0.8138144, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26013184, "step": 3769, "time_per_iteration": 2.9468629360198975 }, { "auxiliary_loss_clip": 0.01538642, "auxiliary_loss_mlp": 0.01060281, "balance_loss_clip": 1.32757461, "balance_loss_mlp": 1.03390026, "epoch": 0.2266646625582444, "flos": 23998786306560.0, "grad_norm": 2.6881760447460725, "language_loss": 0.70133084, "learning_rate": 3.6078095977951488e-06, "loss": 0.72732002, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.26391602, "step": 3770, "time_per_iteration": 2.8608558177948 }, { "auxiliary_loss_clip": 0.01552444, "auxiliary_loss_mlp": 0.01062649, "balance_loss_clip": 1.34342861, "balance_loss_mlp": 1.03808022, "epoch": 0.22672478581091238, "flos": 26038364999040.0, "grad_norm": 1.6779546908087195, "language_loss": 0.80978119, "learning_rate": 3.6075779318631067e-06, "loss": 0.83593214, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.24584961, "step": 3771, "time_per_iteration": 2.8651278018951416 }, { "auxiliary_loss_clip": 0.01529995, "auxiliary_loss_mlp": 0.01057275, "balance_loss_clip": 1.32688236, "balance_loss_mlp": 1.03189528, "epoch": 0.22678490906358034, "flos": 23852310330240.0, "grad_norm": 1.4786098606571236, "language_loss": 0.79192054, "learning_rate": 3.6073462049715486e-06, "loss": 0.81779325, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.25378418, "step": 3772, "time_per_iteration": 2.885185480117798 }, { "auxiliary_loss_clip": 0.0129708, "auxiliary_loss_mlp": 0.01034669, "balance_loss_clip": 1.17372942, "balance_loss_mlp": 1.01187575, "epoch": 0.2268450323162483, "flos": 65082343409280.0, "grad_norm": 0.8540738676136503, "language_loss": 0.54510063, "learning_rate": 3.607114417129261e-06, "loss": 0.56841815, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.22753906, "step": 3773, "time_per_iteration": 3.4410598278045654 }, { "auxiliary_loss_clip": 0.01530119, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.32599497, "balance_loss_mlp": 1.02421844, "epoch": 0.22690515556891627, "flos": 22535655356160.0, "grad_norm": 1.652865665104445, "language_loss": 0.71300256, "learning_rate": 3.6068825683450334e-06, "loss": 0.7387948, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.24902344, "step": 3774, "time_per_iteration": 2.8518316745758057 }, { "auxiliary_loss_clip": 0.0153746, "auxiliary_loss_mlp": 0.01049573, "balance_loss_clip": 1.33238626, "balance_loss_mlp": 1.02494383, "epoch": 0.22696527882158424, "flos": 18232598862720.0, "grad_norm": 1.8973620743449378, "language_loss": 0.74713433, "learning_rate": 3.606650658627658e-06, "loss": 0.77300471, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.24633789, "step": 3775, "time_per_iteration": 2.814811944961548 }, { "auxiliary_loss_clip": 0.01515778, "auxiliary_loss_mlp": 0.0104858, "balance_loss_clip": 1.31195414, "balance_loss_mlp": 1.02357018, "epoch": 0.22702540207425223, "flos": 17028051799680.0, "grad_norm": 1.804581600827397, "language_loss": 0.8418473, "learning_rate": 3.606418687985928e-06, "loss": 0.86749089, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.25012207, "step": 3776, "time_per_iteration": 2.8440778255462646 }, { "auxiliary_loss_clip": 0.01535956, "auxiliary_loss_mlp": 0.01049977, "balance_loss_clip": 1.32706952, "balance_loss_mlp": 1.02407324, "epoch": 0.2270855253269202, "flos": 21335949486720.0, "grad_norm": 1.939496226483114, "language_loss": 0.83524668, "learning_rate": 3.606186656428641e-06, "loss": 0.86110598, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2590332, "step": 3777, "time_per_iteration": 2.85398530960083 }, { "auxiliary_loss_clip": 0.01541398, "auxiliary_loss_mlp": 0.01047962, "balance_loss_clip": 1.33302307, "balance_loss_mlp": 1.02247524, "epoch": 0.22714564857958816, "flos": 23561032435200.0, "grad_norm": 2.4028679252884175, "language_loss": 0.74043608, "learning_rate": 3.6059545639645955e-06, "loss": 0.76632971, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.25500488, "step": 3778, "time_per_iteration": 2.890629529953003 }, { "auxiliary_loss_clip": 0.0153561, "auxiliary_loss_mlp": 0.01041788, "balance_loss_clip": 1.32413578, "balance_loss_mlp": 1.01676619, "epoch": 0.22720577183225613, "flos": 25999743922560.0, "grad_norm": 2.1086742078639182, "language_loss": 0.65628982, "learning_rate": 3.605722410602591e-06, "loss": 0.68206376, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.25, "step": 3779, "time_per_iteration": 2.8753128051757812 }, { "auxiliary_loss_clip": 0.01521602, "auxiliary_loss_mlp": 0.01046344, "balance_loss_clip": 1.3178333, "balance_loss_mlp": 1.02120304, "epoch": 0.2272658950849241, "flos": 20823939619200.0, "grad_norm": 1.8408528013001781, "language_loss": 0.71956348, "learning_rate": 3.6054901963514323e-06, "loss": 0.74524289, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.25170898, "step": 3780, "time_per_iteration": 2.8263442516326904 }, { "auxiliary_loss_clip": 0.01526005, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.32105732, "balance_loss_mlp": 1.01834726, "epoch": 0.22732601833759206, "flos": 23918422262400.0, "grad_norm": 1.615438358259811, "language_loss": 0.90308058, "learning_rate": 3.6052579212199246e-06, "loss": 0.92879373, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26977539, "step": 3781, "time_per_iteration": 4.281549453735352 }, { "auxiliary_loss_clip": 0.01523221, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.31571126, "balance_loss_mlp": 1.0185982, "epoch": 0.22738614159026002, "flos": 15932852714880.0, "grad_norm": 2.024344874085057, "language_loss": 0.75779092, "learning_rate": 3.6050255852168753e-06, "loss": 0.78346407, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.25524902, "step": 3782, "time_per_iteration": 2.845776319503784 }, { "auxiliary_loss_clip": 0.01508731, "auxiliary_loss_mlp": 0.01046256, "balance_loss_clip": 1.30632675, "balance_loss_mlp": 1.0225209, "epoch": 0.22744626484292801, "flos": 24215808205440.0, "grad_norm": 3.1051367821642604, "language_loss": 0.83631891, "learning_rate": 3.604793188351095e-06, "loss": 0.86186874, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.23730469, "step": 3783, "time_per_iteration": 2.897343397140503 }, { "auxiliary_loss_clip": 0.01515332, "auxiliary_loss_mlp": 0.01047023, "balance_loss_clip": 1.30939543, "balance_loss_mlp": 1.02172649, "epoch": 0.22750638809559598, "flos": 24802662251520.0, "grad_norm": 1.6407136672388425, "language_loss": 0.76511174, "learning_rate": 3.6045607306313964e-06, "loss": 0.7907353, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.2532959, "step": 3784, "time_per_iteration": 2.879348039627075 }, { "auxiliary_loss_clip": 0.01519427, "auxiliary_loss_mlp": 0.01045311, "balance_loss_clip": 1.31431389, "balance_loss_mlp": 1.0195384, "epoch": 0.22756651134826394, "flos": 22246594456320.0, "grad_norm": 1.54643732950845, "language_loss": 0.71536744, "learning_rate": 3.604328212066594e-06, "loss": 0.74101484, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.25756836, "step": 3785, "time_per_iteration": 2.9326696395874023 }, { "auxiliary_loss_clip": 0.01283158, "auxiliary_loss_mlp": 0.01030409, "balance_loss_clip": 1.16936898, "balance_loss_mlp": 1.00980949, "epoch": 0.2276266346009319, "flos": 62739677928960.0, "grad_norm": 0.8190006572904106, "language_loss": 0.61987746, "learning_rate": 3.6040956326655047e-06, "loss": 0.64301312, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.20605469, "step": 3786, "time_per_iteration": 3.3358380794525146 }, { "auxiliary_loss_clip": 0.01527153, "auxiliary_loss_mlp": 0.01044492, "balance_loss_clip": 1.3194809, "balance_loss_mlp": 1.01879096, "epoch": 0.22768675785359987, "flos": 18620375212800.0, "grad_norm": 2.4930385852486303, "language_loss": 0.88609755, "learning_rate": 3.6038629924369486e-06, "loss": 0.91181397, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.25744629, "step": 3787, "time_per_iteration": 2.819997787475586 }, { "auxiliary_loss_clip": 0.01514061, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.31169367, "balance_loss_mlp": 1.0261215, "epoch": 0.22774688110626784, "flos": 26881495447680.0, "grad_norm": 1.258954494919232, "language_loss": 0.7344901, "learning_rate": 3.6036302913897474e-06, "loss": 0.7601248, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23278809, "step": 3788, "time_per_iteration": 2.9018099308013916 }, { "auxiliary_loss_clip": 0.01517587, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.31217909, "balance_loss_mlp": 1.01520729, "epoch": 0.2278070043589358, "flos": 15560396369280.0, "grad_norm": 2.2962587955421405, "language_loss": 0.69134581, "learning_rate": 3.6033975295327243e-06, "loss": 0.71692216, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.24853516, "step": 3789, "time_per_iteration": 2.7884645462036133 }, { "auxiliary_loss_clip": 0.01520704, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.31639504, "balance_loss_mlp": 1.01914287, "epoch": 0.2278671276116038, "flos": 22426397867520.0, "grad_norm": 1.9281422678271343, "language_loss": 0.77082491, "learning_rate": 3.6031647068747065e-06, "loss": 0.79647386, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.25036621, "step": 3790, "time_per_iteration": 4.3906683921813965 }, { "auxiliary_loss_clip": 0.0150886, "auxiliary_loss_mlp": 0.01045698, "balance_loss_clip": 1.30742908, "balance_loss_mlp": 1.02067637, "epoch": 0.22792725086427176, "flos": 20641104806400.0, "grad_norm": 1.8882639228446665, "language_loss": 0.91806614, "learning_rate": 3.602931823424522e-06, "loss": 0.94361162, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.25012207, "step": 3791, "time_per_iteration": 2.8177073001861572 }, { "auxiliary_loss_clip": 0.01516961, "auxiliary_loss_mlp": 0.01041766, "balance_loss_clip": 1.30892777, "balance_loss_mlp": 1.0182817, "epoch": 0.22798737411693973, "flos": 31440918833280.0, "grad_norm": 2.338137298927301, "language_loss": 0.83888352, "learning_rate": 3.6026988791910026e-06, "loss": 0.86447084, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.23486328, "step": 3792, "time_per_iteration": 4.303308010101318 }, { "auxiliary_loss_clip": 0.01299784, "auxiliary_loss_mlp": 0.0105803, "balance_loss_clip": 1.18557048, "balance_loss_mlp": 1.03695345, "epoch": 0.2280474973696077, "flos": 52420564368000.0, "grad_norm": 1.1502437050314154, "language_loss": 0.65722501, "learning_rate": 3.602465874182981e-06, "loss": 0.68080318, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.2109375, "step": 3793, "time_per_iteration": 4.536312818527222 }, { "auxiliary_loss_clip": 0.01554919, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.33965635, "balance_loss_mlp": 1.02343822, "epoch": 0.22810762062227566, "flos": 26407292250240.0, "grad_norm": 2.7371352098440367, "language_loss": 0.78309703, "learning_rate": 3.602232808409293e-06, "loss": 0.809129, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 2.15527344, "router_z_loss_mlp": 0.24853516, "step": 3794, "time_per_iteration": 2.9299237728118896 }, { "auxiliary_loss_clip": 0.01529926, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.32226872, "balance_loss_mlp": 1.01984262, "epoch": 0.22816774387494362, "flos": 25641403954560.0, "grad_norm": 2.6955563051320888, "language_loss": 0.81579369, "learning_rate": 3.6019996818787755e-06, "loss": 0.84152907, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.23791504, "step": 3795, "time_per_iteration": 2.8729264736175537 }, { "auxiliary_loss_clip": 0.01511653, "auxiliary_loss_mlp": 0.01052372, "balance_loss_clip": 1.30880344, "balance_loss_mlp": 1.02720702, "epoch": 0.22822786712761162, "flos": 22460765932800.0, "grad_norm": 1.8015410200842634, "language_loss": 0.77992463, "learning_rate": 3.6017664946002704e-06, "loss": 0.80556488, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.25170898, "step": 3796, "time_per_iteration": 2.9117472171783447 }, { "auxiliary_loss_clip": 0.01518918, "auxiliary_loss_mlp": 0.0104227, "balance_loss_clip": 1.31307411, "balance_loss_mlp": 1.01817763, "epoch": 0.22828799038027958, "flos": 12209954037120.0, "grad_norm": 2.374112670172366, "language_loss": 0.96527576, "learning_rate": 3.6015332465826188e-06, "loss": 0.99088758, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.24084473, "step": 3797, "time_per_iteration": 2.956239938735962 }, { "auxiliary_loss_clip": 0.01511443, "auxiliary_loss_mlp": 0.01051763, "balance_loss_clip": 1.30681276, "balance_loss_mlp": 1.02715802, "epoch": 0.22834811363294755, "flos": 22094508124800.0, "grad_norm": 1.5635256360694203, "language_loss": 0.8264519, "learning_rate": 3.601299937834666e-06, "loss": 0.85208392, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.24597168, "step": 3798, "time_per_iteration": 2.912991762161255 }, { "auxiliary_loss_clip": 0.01515723, "auxiliary_loss_mlp": 0.01043174, "balance_loss_clip": 1.30821931, "balance_loss_mlp": 1.01783037, "epoch": 0.2284082368856155, "flos": 24870674465280.0, "grad_norm": 2.555309979074055, "language_loss": 0.79915035, "learning_rate": 3.6010665683652596e-06, "loss": 0.82473934, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.25354004, "step": 3799, "time_per_iteration": 2.892906904220581 }, { "auxiliary_loss_clip": 0.01511826, "auxiliary_loss_mlp": 0.01050382, "balance_loss_clip": 1.30658972, "balance_loss_mlp": 1.02520478, "epoch": 0.22846836013828348, "flos": 23302810506240.0, "grad_norm": 1.4736745245906775, "language_loss": 0.75538671, "learning_rate": 3.6008331381832484e-06, "loss": 0.78100872, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.25170898, "step": 3800, "time_per_iteration": 2.865077257156372 }, { "auxiliary_loss_clip": 0.01506285, "auxiliary_loss_mlp": 0.01046678, "balance_loss_clip": 1.30257344, "balance_loss_mlp": 1.02226424, "epoch": 0.22852848339095144, "flos": 27427420932480.0, "grad_norm": 3.8275672034762995, "language_loss": 0.64806861, "learning_rate": 3.600599647297484e-06, "loss": 0.67359817, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.24438477, "step": 3801, "time_per_iteration": 2.9472155570983887 }, { "auxiliary_loss_clip": 0.01507876, "auxiliary_loss_mlp": 0.01044503, "balance_loss_clip": 1.30689156, "balance_loss_mlp": 1.02093506, "epoch": 0.2285886066436194, "flos": 26331859889280.0, "grad_norm": 1.5824162300255884, "language_loss": 0.82478034, "learning_rate": 3.60036609571682e-06, "loss": 0.85030413, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.23571777, "step": 3802, "time_per_iteration": 2.8514151573181152 }, { "auxiliary_loss_clip": 0.01519531, "auxiliary_loss_mlp": 0.01050161, "balance_loss_clip": 1.31310093, "balance_loss_mlp": 1.02579427, "epoch": 0.2286487298962874, "flos": 29728614913920.0, "grad_norm": 1.6975647854672962, "language_loss": 0.80000943, "learning_rate": 3.600132483450114e-06, "loss": 0.82570636, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.24365234, "step": 3803, "time_per_iteration": 2.894819498062134 }, { "auxiliary_loss_clip": 0.01523605, "auxiliary_loss_mlp": 0.01047469, "balance_loss_clip": 1.31516576, "balance_loss_mlp": 1.02104068, "epoch": 0.22870885314895537, "flos": 21296875962240.0, "grad_norm": 1.623900769825681, "language_loss": 0.86255479, "learning_rate": 3.5998988105062235e-06, "loss": 0.88826549, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.26428223, "step": 3804, "time_per_iteration": 2.826904773712158 }, { "auxiliary_loss_clip": 0.01520119, "auxiliary_loss_mlp": 0.01043168, "balance_loss_clip": 1.3101176, "balance_loss_mlp": 1.01821721, "epoch": 0.22876897640162333, "flos": 14947680280320.0, "grad_norm": 2.1463268173569667, "language_loss": 0.77824879, "learning_rate": 3.59966507689401e-06, "loss": 0.80388165, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24963379, "step": 3805, "time_per_iteration": 2.832686424255371 }, { "auxiliary_loss_clip": 0.01535721, "auxiliary_loss_mlp": 0.01051242, "balance_loss_clip": 1.32509029, "balance_loss_mlp": 1.02555239, "epoch": 0.2288290996542913, "flos": 18123024660480.0, "grad_norm": 2.8446736156739196, "language_loss": 0.805453, "learning_rate": 3.5994312826223363e-06, "loss": 0.83132261, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.25708008, "step": 3806, "time_per_iteration": 2.849250316619873 }, { "auxiliary_loss_clip": 0.0152286, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.31543541, "balance_loss_mlp": 1.02667928, "epoch": 0.22888922290695926, "flos": 39869173935360.0, "grad_norm": 2.0760826761741153, "language_loss": 0.7103883, "learning_rate": 3.5991974277000684e-06, "loss": 0.73613811, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.2545166, "step": 3807, "time_per_iteration": 3.0215892791748047 }, { "auxiliary_loss_clip": 0.01529003, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.31840932, "balance_loss_mlp": 1.02702379, "epoch": 0.22894934615962723, "flos": 23414104010880.0, "grad_norm": 2.438672231817149, "language_loss": 0.66775322, "learning_rate": 3.5989635121360733e-06, "loss": 0.6935733, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.2598877, "step": 3808, "time_per_iteration": 2.8938193321228027 }, { "auxiliary_loss_clip": 0.0153659, "auxiliary_loss_mlp": 0.01054024, "balance_loss_clip": 1.32715023, "balance_loss_mlp": 1.02870417, "epoch": 0.22900946941229522, "flos": 18852146916480.0, "grad_norm": 1.7358063062615237, "language_loss": 0.76403511, "learning_rate": 3.598729535939222e-06, "loss": 0.78994119, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.25341797, "step": 3809, "time_per_iteration": 2.8038406372070312 }, { "auxiliary_loss_clip": 0.014995, "auxiliary_loss_mlp": 0.01051466, "balance_loss_clip": 1.29721606, "balance_loss_mlp": 1.02633667, "epoch": 0.22906959266496318, "flos": 22939584099840.0, "grad_norm": 1.710742637321474, "language_loss": 0.82467496, "learning_rate": 3.5984954991183862e-06, "loss": 0.85018462, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.25170898, "step": 3810, "time_per_iteration": 2.878061532974243 }, { "auxiliary_loss_clip": 0.01515901, "auxiliary_loss_mlp": 0.01043397, "balance_loss_clip": 1.31073105, "balance_loss_mlp": 1.01882839, "epoch": 0.22912971591763115, "flos": 19363975804800.0, "grad_norm": 1.8663052651893375, "language_loss": 0.79596245, "learning_rate": 3.598261401682441e-06, "loss": 0.82155538, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.24560547, "step": 3811, "time_per_iteration": 2.868255615234375 }, { "auxiliary_loss_clip": 0.01508975, "auxiliary_loss_mlp": 0.01044566, "balance_loss_clip": 1.30397236, "balance_loss_mlp": 1.01962721, "epoch": 0.22918983917029911, "flos": 19941916625280.0, "grad_norm": 1.7012535544235612, "language_loss": 0.83682203, "learning_rate": 3.5980272436402632e-06, "loss": 0.86235744, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24926758, "step": 3812, "time_per_iteration": 2.8483266830444336 }, { "auxiliary_loss_clip": 0.01535754, "auxiliary_loss_mlp": 0.01050638, "balance_loss_clip": 1.32394958, "balance_loss_mlp": 1.0255568, "epoch": 0.22924996242296708, "flos": 16699600661760.0, "grad_norm": 2.3808113378939275, "language_loss": 0.85594612, "learning_rate": 3.5977930250007324e-06, "loss": 0.88181007, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 2.12207031, "router_z_loss_mlp": 0.25097656, "step": 3813, "time_per_iteration": 2.8966779708862305 }, { "auxiliary_loss_clip": 0.01512091, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.30658317, "balance_loss_mlp": 1.02247357, "epoch": 0.22931008567563504, "flos": 33049032681600.0, "grad_norm": 1.7129411647859194, "language_loss": 0.71444857, "learning_rate": 3.5975587457727298e-06, "loss": 0.74004507, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.25061035, "step": 3814, "time_per_iteration": 2.961540937423706 }, { "auxiliary_loss_clip": 0.01500098, "auxiliary_loss_mlp": 0.01046431, "balance_loss_clip": 1.29717398, "balance_loss_mlp": 1.02113485, "epoch": 0.229370208928303, "flos": 23340979134720.0, "grad_norm": 5.390795405642149, "language_loss": 0.68171167, "learning_rate": 3.597324405965139e-06, "loss": 0.70717704, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.25317383, "step": 3815, "time_per_iteration": 2.9316020011901855 }, { "auxiliary_loss_clip": 0.01515603, "auxiliary_loss_mlp": 0.01053511, "balance_loss_clip": 1.30957651, "balance_loss_mlp": 1.02616417, "epoch": 0.229430332180971, "flos": 28627850718720.0, "grad_norm": 1.7854511675695954, "language_loss": 0.83857799, "learning_rate": 3.597090005586848e-06, "loss": 0.86426908, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.27368164, "step": 3816, "time_per_iteration": 4.373397588729858 }, { "auxiliary_loss_clip": 0.01517791, "auxiliary_loss_mlp": 0.01047058, "balance_loss_clip": 1.31253672, "balance_loss_mlp": 1.02018785, "epoch": 0.22949045543363897, "flos": 17247064469760.0, "grad_norm": 2.429173079461321, "language_loss": 0.89164776, "learning_rate": 3.596855544646742e-06, "loss": 0.91729623, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.26879883, "step": 3817, "time_per_iteration": 2.8765721321105957 }, { "auxiliary_loss_clip": 0.01520426, "auxiliary_loss_mlp": 0.01048622, "balance_loss_clip": 1.31274569, "balance_loss_mlp": 1.02190697, "epoch": 0.22955057868630693, "flos": 27500636298240.0, "grad_norm": 1.7909923140204635, "language_loss": 0.75933719, "learning_rate": 3.5966210231537154e-06, "loss": 0.78502768, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.26708984, "step": 3818, "time_per_iteration": 2.987168788909912 }, { "auxiliary_loss_clip": 0.01512737, "auxiliary_loss_mlp": 0.01047348, "balance_loss_clip": 1.30730176, "balance_loss_mlp": 1.0201925, "epoch": 0.2296107019389749, "flos": 23486685949440.0, "grad_norm": 2.483988088044078, "language_loss": 0.75828874, "learning_rate": 3.596386441116659e-06, "loss": 0.78388965, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.27148438, "step": 3819, "time_per_iteration": 2.922292470932007 }, { "auxiliary_loss_clip": 0.01526321, "auxiliary_loss_mlp": 0.01048675, "balance_loss_clip": 1.31853902, "balance_loss_mlp": 1.0209825, "epoch": 0.22967082519164286, "flos": 31297655237760.0, "grad_norm": 1.6556528879471908, "language_loss": 0.81615829, "learning_rate": 3.5961517985444684e-06, "loss": 0.84190828, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.27697754, "step": 3820, "time_per_iteration": 3.0351929664611816 }, { "auxiliary_loss_clip": 0.01531671, "auxiliary_loss_mlp": 0.01052838, "balance_loss_clip": 1.31968915, "balance_loss_mlp": 1.02363181, "epoch": 0.22973094844431083, "flos": 14649887134080.0, "grad_norm": 1.932972429633902, "language_loss": 0.70239294, "learning_rate": 3.595917095446042e-06, "loss": 0.72823805, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.29223633, "step": 3821, "time_per_iteration": 2.9400064945220947 }, { "auxiliary_loss_clip": 0.01509055, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.30304527, "balance_loss_mlp": 1.02013373, "epoch": 0.2297910716969788, "flos": 22835032070400.0, "grad_norm": 1.5739326809067737, "language_loss": 0.83673239, "learning_rate": 3.5956823318302796e-06, "loss": 0.8623023, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.27807617, "step": 3822, "time_per_iteration": 2.9545109272003174 }, { "auxiliary_loss_clip": 0.01523612, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.31916595, "balance_loss_mlp": 1.01588404, "epoch": 0.2298511949496468, "flos": 23049294036480.0, "grad_norm": 2.367886776627778, "language_loss": 0.67082816, "learning_rate": 3.5954475077060833e-06, "loss": 0.69649816, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.27539062, "step": 3823, "time_per_iteration": 2.942615032196045 }, { "auxiliary_loss_clip": 0.0134908, "auxiliary_loss_mlp": 0.01063075, "balance_loss_clip": 1.22748172, "balance_loss_mlp": 1.03331995, "epoch": 0.22991131820231475, "flos": 66920989368960.0, "grad_norm": 0.7953184186567328, "language_loss": 0.56801695, "learning_rate": 3.595212623082357e-06, "loss": 0.59213847, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.296875, "step": 3824, "time_per_iteration": 3.438488721847534 }, { "auxiliary_loss_clip": 0.01508972, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 1.3066628, "balance_loss_mlp": 1.02246809, "epoch": 0.22997144145498272, "flos": 17894284358400.0, "grad_norm": 1.9630434992854373, "language_loss": 0.73693883, "learning_rate": 3.594977677968009e-06, "loss": 0.76252645, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.27319336, "step": 3825, "time_per_iteration": 4.256927728652954 }, { "auxiliary_loss_clip": 0.01526873, "auxiliary_loss_mlp": 0.0104937, "balance_loss_clip": 1.32095647, "balance_loss_mlp": 1.02190459, "epoch": 0.23003156470765068, "flos": 24686482308480.0, "grad_norm": 1.9875408202496487, "language_loss": 0.89175379, "learning_rate": 3.5947426723719473e-06, "loss": 0.91751611, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.27490234, "step": 3826, "time_per_iteration": 2.8776278495788574 }, { "auxiliary_loss_clip": 0.01534695, "auxiliary_loss_mlp": 0.01050533, "balance_loss_clip": 1.32122338, "balance_loss_mlp": 1.02274561, "epoch": 0.23009168796031865, "flos": 15822328371840.0, "grad_norm": 5.547488682335743, "language_loss": 0.8277427, "learning_rate": 3.594507606303083e-06, "loss": 0.85359496, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2779541, "step": 3827, "time_per_iteration": 4.230492830276489 }, { "auxiliary_loss_clip": 0.01511908, "auxiliary_loss_mlp": 0.01048459, "balance_loss_clip": 1.30848145, "balance_loss_mlp": 1.02096939, "epoch": 0.2301518112129866, "flos": 16220465781120.0, "grad_norm": 2.0575405309738617, "language_loss": 0.87915051, "learning_rate": 3.5942724797703314e-06, "loss": 0.90475416, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 2.03613281, "router_z_loss_mlp": 0.27539062, "step": 3828, "time_per_iteration": 4.276114225387573 }, { "auxiliary_loss_clip": 0.0150788, "auxiliary_loss_mlp": 0.01049189, "balance_loss_clip": 1.3014195, "balance_loss_mlp": 1.02106702, "epoch": 0.2302119344656546, "flos": 20605334152320.0, "grad_norm": 2.047180537134642, "language_loss": 0.72018778, "learning_rate": 3.594037292782607e-06, "loss": 0.74575847, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.28088379, "step": 3829, "time_per_iteration": 2.8294689655303955 }, { "auxiliary_loss_clip": 0.01507556, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.30606651, "balance_loss_mlp": 1.02102351, "epoch": 0.23027205771832257, "flos": 26808099102720.0, "grad_norm": 1.7332053064842587, "language_loss": 0.85712552, "learning_rate": 3.5938020453488293e-06, "loss": 0.88267088, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25976562, "step": 3830, "time_per_iteration": 2.9193902015686035 }, { "auxiliary_loss_clip": 0.01506904, "auxiliary_loss_mlp": 0.01054061, "balance_loss_clip": 1.30358028, "balance_loss_mlp": 1.02647638, "epoch": 0.23033218097099054, "flos": 43889006108160.0, "grad_norm": 1.6201235904048665, "language_loss": 0.67711276, "learning_rate": 3.5935667374779177e-06, "loss": 0.70272243, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.27587891, "step": 3831, "time_per_iteration": 3.027104139328003 }, { "auxiliary_loss_clip": 0.01524282, "auxiliary_loss_mlp": 0.01056283, "balance_loss_clip": 1.31652498, "balance_loss_mlp": 1.02891254, "epoch": 0.2303923042236585, "flos": 26078795867520.0, "grad_norm": 2.1144973810420087, "language_loss": 0.76967245, "learning_rate": 3.5933313691787957e-06, "loss": 0.79547811, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.27331543, "step": 3832, "time_per_iteration": 2.9248671531677246 }, { "auxiliary_loss_clip": 0.01522181, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.31523895, "balance_loss_mlp": 1.02192974, "epoch": 0.23045242747632647, "flos": 18305542759680.0, "grad_norm": 1.8935753615826372, "language_loss": 0.8856473, "learning_rate": 3.593095940460389e-06, "loss": 0.91136813, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.27954102, "step": 3833, "time_per_iteration": 2.8466296195983887 }, { "auxiliary_loss_clip": 0.01522867, "auxiliary_loss_mlp": 0.01049713, "balance_loss_clip": 1.31313682, "balance_loss_mlp": 1.02248573, "epoch": 0.23051255072899443, "flos": 25531558283520.0, "grad_norm": 1.5652774458094958, "language_loss": 0.75684839, "learning_rate": 3.592860451331624e-06, "loss": 0.78257418, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2722168, "step": 3834, "time_per_iteration": 2.932950735092163 }, { "auxiliary_loss_clip": 0.01503223, "auxiliary_loss_mlp": 0.0105698, "balance_loss_clip": 1.29984784, "balance_loss_mlp": 1.03044438, "epoch": 0.2305726739816624, "flos": 21225108430080.0, "grad_norm": 1.7411795032767157, "language_loss": 0.87098718, "learning_rate": 3.592624901801432e-06, "loss": 0.89658916, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.26586914, "step": 3835, "time_per_iteration": 2.88079833984375 }, { "auxiliary_loss_clip": 0.01528339, "auxiliary_loss_mlp": 0.01051853, "balance_loss_clip": 1.31744885, "balance_loss_mlp": 1.02585387, "epoch": 0.2306327972343304, "flos": 23341522072320.0, "grad_norm": 1.9472584044208716, "language_loss": 0.83300984, "learning_rate": 3.5923892918787432e-06, "loss": 0.8588118, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.26013184, "step": 3836, "time_per_iteration": 2.9056904315948486 }, { "auxiliary_loss_clip": 0.01525412, "auxiliary_loss_mlp": 0.01051228, "balance_loss_clip": 1.319682, "balance_loss_mlp": 1.02493072, "epoch": 0.23069292048699835, "flos": 20676196788480.0, "grad_norm": 1.7099337264441898, "language_loss": 0.80473101, "learning_rate": 3.5921536215724934e-06, "loss": 0.83049744, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.26318359, "step": 3837, "time_per_iteration": 2.9525349140167236 }, { "auxiliary_loss_clip": 0.01340646, "auxiliary_loss_mlp": 0.01067775, "balance_loss_clip": 1.21964765, "balance_loss_mlp": 1.03782964, "epoch": 0.23075304373966632, "flos": 70482798000000.0, "grad_norm": 0.9035344452207386, "language_loss": 0.65402484, "learning_rate": 3.5919178908916184e-06, "loss": 0.67810905, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 1.2109375, "router_z_loss_mlp": 0.29882812, "step": 3838, "time_per_iteration": 3.3570077419281006 }, { "auxiliary_loss_clip": 0.01508644, "auxiliary_loss_mlp": 0.01053544, "balance_loss_clip": 1.30378556, "balance_loss_mlp": 1.02808106, "epoch": 0.23081316699233428, "flos": 16626702009600.0, "grad_norm": 2.1265525965954333, "language_loss": 0.76633221, "learning_rate": 3.591682099845058e-06, "loss": 0.79195404, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.25500488, "step": 3839, "time_per_iteration": 2.8439126014709473 }, { "auxiliary_loss_clip": 0.01542412, "auxiliary_loss_mlp": 0.01055568, "balance_loss_clip": 1.33032274, "balance_loss_mlp": 1.03016436, "epoch": 0.23087329024500225, "flos": 13306012773120.0, "grad_norm": 1.7578290286056804, "language_loss": 0.70466173, "learning_rate": 3.591446248441752e-06, "loss": 0.7306416, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25402832, "step": 3840, "time_per_iteration": 3.1429245471954346 }, { "auxiliary_loss_clip": 0.01525479, "auxiliary_loss_mlp": 0.01054055, "balance_loss_clip": 1.32076955, "balance_loss_mlp": 1.02767372, "epoch": 0.23093341349767021, "flos": 17794799746560.0, "grad_norm": 1.992562560846032, "language_loss": 0.80357003, "learning_rate": 3.591210336690645e-06, "loss": 0.82936543, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.26379395, "step": 3841, "time_per_iteration": 2.8717758655548096 }, { "auxiliary_loss_clip": 0.01518106, "auxiliary_loss_mlp": 0.01054344, "balance_loss_clip": 1.31247532, "balance_loss_mlp": 1.02948904, "epoch": 0.23099353675033818, "flos": 23998695816960.0, "grad_norm": 1.8843729536880263, "language_loss": 0.8358109, "learning_rate": 3.590974364600683e-06, "loss": 0.86153537, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.24865723, "step": 3842, "time_per_iteration": 3.116657257080078 }, { "auxiliary_loss_clip": 0.01511554, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.307024, "balance_loss_mlp": 1.02655613, "epoch": 0.23105366000300617, "flos": 36009753137280.0, "grad_norm": 1.473702479008749, "language_loss": 0.6686241, "learning_rate": 3.5907383321808135e-06, "loss": 0.69427025, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26513672, "step": 3843, "time_per_iteration": 3.087411642074585 }, { "auxiliary_loss_clip": 0.01505543, "auxiliary_loss_mlp": 0.01057126, "balance_loss_clip": 1.30546463, "balance_loss_mlp": 1.03017235, "epoch": 0.23111378325567414, "flos": 31256952900480.0, "grad_norm": 1.7955261084913654, "language_loss": 0.78057277, "learning_rate": 3.590502239439987e-06, "loss": 0.80619943, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.26928711, "step": 3844, "time_per_iteration": 2.9715194702148438 }, { "auxiliary_loss_clip": 0.01520963, "auxiliary_loss_mlp": 0.01063751, "balance_loss_clip": 1.31561768, "balance_loss_mlp": 1.03650022, "epoch": 0.2311739065083421, "flos": 19217545073280.0, "grad_norm": 1.572769212965825, "language_loss": 0.78657776, "learning_rate": 3.590266086387156e-06, "loss": 0.8124249, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.27270508, "step": 3845, "time_per_iteration": 2.8821213245391846 }, { "auxiliary_loss_clip": 0.01490689, "auxiliary_loss_mlp": 0.01049309, "balance_loss_clip": 1.29299068, "balance_loss_mlp": 1.02426314, "epoch": 0.23123402976101007, "flos": 23368650969600.0, "grad_norm": 2.201840776289063, "language_loss": 0.7737121, "learning_rate": 3.590029873031276e-06, "loss": 0.79911208, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.25061035, "step": 3846, "time_per_iteration": 2.803565502166748 }, { "auxiliary_loss_clip": 0.0152686, "auxiliary_loss_mlp": 0.0105939, "balance_loss_clip": 1.32012355, "balance_loss_mlp": 1.03409362, "epoch": 0.23129415301367803, "flos": 13743268951680.0, "grad_norm": 1.8799741490335318, "language_loss": 0.70656127, "learning_rate": 3.589793599381304e-06, "loss": 0.73242378, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.25305176, "step": 3847, "time_per_iteration": 2.795222759246826 }, { "auxiliary_loss_clip": 0.01322603, "auxiliary_loss_mlp": 0.01051413, "balance_loss_clip": 1.20194364, "balance_loss_mlp": 1.02862024, "epoch": 0.231354276266346, "flos": 69767882611200.0, "grad_norm": 0.7821371794875414, "language_loss": 0.61053944, "learning_rate": 3.589557265446198e-06, "loss": 0.63427961, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.22753906, "step": 3848, "time_per_iteration": 3.2593278884887695 }, { "auxiliary_loss_clip": 0.01512643, "auxiliary_loss_mlp": 0.01055466, "balance_loss_clip": 1.30643034, "balance_loss_mlp": 1.0303123, "epoch": 0.231414399519014, "flos": 18843188446080.0, "grad_norm": 2.0091925360634932, "language_loss": 0.79245031, "learning_rate": 3.589320871234923e-06, "loss": 0.81813145, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.25146484, "step": 3849, "time_per_iteration": 2.8323168754577637 }, { "auxiliary_loss_clip": 0.01528527, "auxiliary_loss_mlp": 0.01050524, "balance_loss_clip": 1.32004154, "balance_loss_mlp": 1.02430964, "epoch": 0.23147452277168196, "flos": 36148627987200.0, "grad_norm": 1.8826138668491732, "language_loss": 0.72611177, "learning_rate": 3.5890844167564405e-06, "loss": 0.75190228, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.26245117, "step": 3850, "time_per_iteration": 2.9565370082855225 }, { "auxiliary_loss_clip": 0.01519966, "auxiliary_loss_mlp": 0.01050478, "balance_loss_clip": 1.31555581, "balance_loss_mlp": 1.02496696, "epoch": 0.23153464602434992, "flos": 20822491785600.0, "grad_norm": 2.0293828935096756, "language_loss": 0.77586699, "learning_rate": 3.588847902019718e-06, "loss": 0.80157137, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.25512695, "step": 3851, "time_per_iteration": 4.262117624282837 }, { "auxiliary_loss_clip": 0.01516017, "auxiliary_loss_mlp": 0.01047738, "balance_loss_clip": 1.31168652, "balance_loss_mlp": 1.02157211, "epoch": 0.2315947692770179, "flos": 19949110548480.0, "grad_norm": 1.4657336424047718, "language_loss": 0.70489842, "learning_rate": 3.588611327033723e-06, "loss": 0.73053598, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.26184082, "step": 3852, "time_per_iteration": 2.9430229663848877 }, { "auxiliary_loss_clip": 0.01528416, "auxiliary_loss_mlp": 0.01042778, "balance_loss_clip": 1.32009053, "balance_loss_mlp": 1.01844776, "epoch": 0.23165489252968585, "flos": 12862150853760.0, "grad_norm": 2.041903088619222, "language_loss": 0.6842013, "learning_rate": 3.588374691807428e-06, "loss": 0.70991325, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.24353027, "step": 3853, "time_per_iteration": 2.8629841804504395 }, { "auxiliary_loss_clip": 0.01540725, "auxiliary_loss_mlp": 0.010431, "balance_loss_clip": 1.33196604, "balance_loss_mlp": 1.01767302, "epoch": 0.23171501578235382, "flos": 30640571982720.0, "grad_norm": 3.388508573080948, "language_loss": 0.81241018, "learning_rate": 3.5881379963498053e-06, "loss": 0.83824843, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.25415039, "step": 3854, "time_per_iteration": 2.954526901245117 }, { "auxiliary_loss_clip": 0.0154411, "auxiliary_loss_mlp": 0.01050566, "balance_loss_clip": 1.32860231, "balance_loss_mlp": 1.02392244, "epoch": 0.23177513903502178, "flos": 23853531939840.0, "grad_norm": 1.9414652408464406, "language_loss": 0.66988856, "learning_rate": 3.587901240669831e-06, "loss": 0.69583535, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 2.15429688, "router_z_loss_mlp": 0.26623535, "step": 3855, "time_per_iteration": 2.9186127185821533 }, { "auxiliary_loss_clip": 0.01518805, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.31045258, "balance_loss_mlp": 1.01808548, "epoch": 0.23183526228768978, "flos": 29582319916800.0, "grad_norm": 1.8180448679859638, "language_loss": 0.72504663, "learning_rate": 3.5876644247764815e-06, "loss": 0.75067973, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.26416016, "step": 3856, "time_per_iteration": 3.006605386734009 }, { "auxiliary_loss_clip": 0.01520522, "auxiliary_loss_mlp": 0.01045066, "balance_loss_clip": 1.31537807, "balance_loss_mlp": 1.02040207, "epoch": 0.23189538554035774, "flos": 34471823253120.0, "grad_norm": 1.533476206159904, "language_loss": 0.77935898, "learning_rate": 3.5874275486787387e-06, "loss": 0.80501485, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.2467041, "step": 3857, "time_per_iteration": 3.0112574100494385 }, { "auxiliary_loss_clip": 0.0154004, "auxiliary_loss_mlp": 0.01050712, "balance_loss_clip": 1.3279686, "balance_loss_mlp": 1.02219737, "epoch": 0.2319555087930257, "flos": 18012726541440.0, "grad_norm": 2.3248480762763193, "language_loss": 0.92222828, "learning_rate": 3.587190612385584e-06, "loss": 0.94813573, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.28491211, "step": 3858, "time_per_iteration": 2.9500911235809326 }, { "auxiliary_loss_clip": 0.01514306, "auxiliary_loss_mlp": 0.01041804, "balance_loss_clip": 1.31179309, "balance_loss_mlp": 1.01609063, "epoch": 0.23201563204569367, "flos": 23152941169920.0, "grad_norm": 1.8404635649070624, "language_loss": 0.77873868, "learning_rate": 3.5869536159060026e-06, "loss": 0.80429971, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.25720215, "step": 3859, "time_per_iteration": 2.8936314582824707 }, { "auxiliary_loss_clip": 0.01509283, "auxiliary_loss_mlp": 0.01045465, "balance_loss_clip": 1.30481267, "balance_loss_mlp": 1.01987076, "epoch": 0.23207575529836164, "flos": 20677780356480.0, "grad_norm": 1.9977278197932442, "language_loss": 0.85560262, "learning_rate": 3.58671655924898e-06, "loss": 0.88115013, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25585938, "step": 3860, "time_per_iteration": 4.295123100280762 }, { "auxiliary_loss_clip": 0.01520325, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.31433797, "balance_loss_mlp": 1.01252258, "epoch": 0.2321358785510296, "flos": 16480904705280.0, "grad_norm": 3.053601436795869, "language_loss": 0.84179431, "learning_rate": 3.586479442423508e-06, "loss": 0.86739469, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.27172852, "step": 3861, "time_per_iteration": 2.883608102798462 }, { "auxiliary_loss_clip": 0.01515988, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.31002855, "balance_loss_mlp": 1.0193435, "epoch": 0.2321960018036976, "flos": 21626277240960.0, "grad_norm": 1.506432260095734, "language_loss": 0.86830103, "learning_rate": 3.586242265438576e-06, "loss": 0.89391226, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.25793457, "step": 3862, "time_per_iteration": 4.443707227706909 }, { "auxiliary_loss_clip": 0.01505306, "auxiliary_loss_mlp": 0.01047366, "balance_loss_clip": 1.30348217, "balance_loss_mlp": 1.02099681, "epoch": 0.23225612505636556, "flos": 22281188745600.0, "grad_norm": 1.4662452042329233, "language_loss": 0.75485778, "learning_rate": 3.5860050283031773e-06, "loss": 0.78038448, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.26379395, "step": 3863, "time_per_iteration": 4.290844440460205 }, { "auxiliary_loss_clip": 0.01505752, "auxiliary_loss_mlp": 0.01046673, "balance_loss_clip": 1.30383539, "balance_loss_mlp": 1.02109075, "epoch": 0.23231624830903352, "flos": 17060248114560.0, "grad_norm": 1.8252468242558253, "language_loss": 0.75243795, "learning_rate": 3.58576773102631e-06, "loss": 0.77796221, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.25598145, "step": 3864, "time_per_iteration": 2.8840951919555664 }, { "auxiliary_loss_clip": 0.01507766, "auxiliary_loss_mlp": 0.01042518, "balance_loss_clip": 1.30466986, "balance_loss_mlp": 1.01550508, "epoch": 0.2323763715617015, "flos": 34652938763520.0, "grad_norm": 2.8617960873854584, "language_loss": 0.71255785, "learning_rate": 3.5855303736169714e-06, "loss": 0.73806071, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.27026367, "step": 3865, "time_per_iteration": 2.9787814617156982 }, { "auxiliary_loss_clip": 0.01538096, "auxiliary_loss_mlp": 0.01042825, "balance_loss_clip": 1.32416153, "balance_loss_mlp": 1.01613438, "epoch": 0.23243649481436945, "flos": 25561356624000.0, "grad_norm": 1.7503839206954328, "language_loss": 0.95924795, "learning_rate": 3.5852929560841617e-06, "loss": 0.98505712, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.26721191, "step": 3866, "time_per_iteration": 2.8740692138671875 }, { "auxiliary_loss_clip": 0.01520179, "auxiliary_loss_mlp": 0.01044245, "balance_loss_clip": 1.3147558, "balance_loss_mlp": 1.01896095, "epoch": 0.23249661806703742, "flos": 20492683303680.0, "grad_norm": 2.4076170663905625, "language_loss": 0.74594247, "learning_rate": 3.5850554784368846e-06, "loss": 0.77158678, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.25268555, "step": 3867, "time_per_iteration": 2.8725171089172363 }, { "auxiliary_loss_clip": 0.01521784, "auxiliary_loss_mlp": 0.01045289, "balance_loss_clip": 1.31389415, "balance_loss_mlp": 1.01815712, "epoch": 0.23255674131970538, "flos": 20386457216640.0, "grad_norm": 2.068177945725956, "language_loss": 0.83336735, "learning_rate": 3.584817940684145e-06, "loss": 0.859038, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.27148438, "step": 3868, "time_per_iteration": 2.846949338912964 }, { "auxiliary_loss_clip": 0.01501407, "auxiliary_loss_mlp": 0.01045827, "balance_loss_clip": 1.29997933, "balance_loss_mlp": 1.01917171, "epoch": 0.23261686457237338, "flos": 17064998818560.0, "grad_norm": 1.668636103915524, "language_loss": 0.74540198, "learning_rate": 3.58458034283495e-06, "loss": 0.77087432, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.2668457, "step": 3869, "time_per_iteration": 2.874662160873413 }, { "auxiliary_loss_clip": 0.01508485, "auxiliary_loss_mlp": 0.01046403, "balance_loss_clip": 1.30452538, "balance_loss_mlp": 1.02057028, "epoch": 0.23267698782504134, "flos": 29182056001920.0, "grad_norm": 1.6687691059617609, "language_loss": 0.80637872, "learning_rate": 3.5843426848983097e-06, "loss": 0.8319276, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.25793457, "step": 3870, "time_per_iteration": 2.980065107345581 }, { "auxiliary_loss_clip": 0.01530835, "auxiliary_loss_mlp": 0.01055081, "balance_loss_clip": 1.32002449, "balance_loss_mlp": 1.02885449, "epoch": 0.2327371110777093, "flos": 21183501196800.0, "grad_norm": 1.7768967071235278, "language_loss": 0.72552371, "learning_rate": 3.5841049668832357e-06, "loss": 0.75138283, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.26257324, "step": 3871, "time_per_iteration": 2.8243227005004883 }, { "auxiliary_loss_clip": 0.01521553, "auxiliary_loss_mlp": 0.01055138, "balance_loss_clip": 1.31240928, "balance_loss_mlp": 1.02836394, "epoch": 0.23279723433037727, "flos": 24873796356480.0, "grad_norm": 2.409060755990464, "language_loss": 0.70464933, "learning_rate": 3.5838671887987433e-06, "loss": 0.73041618, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.26806641, "step": 3872, "time_per_iteration": 2.886056900024414 }, { "auxiliary_loss_clip": 0.01539028, "auxiliary_loss_mlp": 0.01045185, "balance_loss_clip": 1.32599556, "balance_loss_mlp": 1.0181129, "epoch": 0.23285735758304524, "flos": 38814903411840.0, "grad_norm": 1.4904964757839079, "language_loss": 0.78840417, "learning_rate": 3.5836293506538474e-06, "loss": 0.8142463, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.27087402, "step": 3873, "time_per_iteration": 3.005896806716919 }, { "auxiliary_loss_clip": 0.01322267, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.20188689, "balance_loss_mlp": 1.00606585, "epoch": 0.2329174808357132, "flos": 53972293916160.0, "grad_norm": 0.8598714885059744, "language_loss": 0.60612535, "learning_rate": 3.5833914524575687e-06, "loss": 0.6296891, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.28125, "step": 3874, "time_per_iteration": 3.312131881713867 }, { "auxiliary_loss_clip": 0.01512598, "auxiliary_loss_mlp": 0.01045006, "balance_loss_clip": 1.30679631, "balance_loss_mlp": 1.01800489, "epoch": 0.23297760408838117, "flos": 21226149060480.0, "grad_norm": 2.4342135270417207, "language_loss": 0.81927443, "learning_rate": 3.583153494218927e-06, "loss": 0.84485054, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.2701416, "step": 3875, "time_per_iteration": 2.8949317932128906 }, { "auxiliary_loss_clip": 0.01510687, "auxiliary_loss_mlp": 0.01043459, "balance_loss_clip": 1.30493569, "balance_loss_mlp": 1.01860344, "epoch": 0.23303772734104916, "flos": 28414810362240.0, "grad_norm": 1.5287518477438544, "language_loss": 0.61862868, "learning_rate": 3.5829154759469464e-06, "loss": 0.64417017, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.24865723, "step": 3876, "time_per_iteration": 2.9404914379119873 }, { "auxiliary_loss_clip": 0.01529996, "auxiliary_loss_mlp": 0.01044619, "balance_loss_clip": 1.3212018, "balance_loss_mlp": 1.01814222, "epoch": 0.23309785059371713, "flos": 24324975204480.0, "grad_norm": 1.5914533168067075, "language_loss": 0.71841818, "learning_rate": 3.5826773976506523e-06, "loss": 0.74416435, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.26477051, "step": 3877, "time_per_iteration": 2.925236701965332 }, { "auxiliary_loss_clip": 0.01515952, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.30835915, "balance_loss_mlp": 1.02092886, "epoch": 0.2331579738463851, "flos": 16000231501440.0, "grad_norm": 2.4030564407106763, "language_loss": 0.82014352, "learning_rate": 3.582439259339073e-06, "loss": 0.84579027, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.27807617, "step": 3878, "time_per_iteration": 2.9005205631256104 }, { "auxiliary_loss_clip": 0.01530361, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.31811285, "balance_loss_mlp": 1.02404833, "epoch": 0.23321809709905306, "flos": 36440086861440.0, "grad_norm": 1.7109184154819705, "language_loss": 0.75481313, "learning_rate": 3.5822010610212374e-06, "loss": 0.78063798, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.28027344, "step": 3879, "time_per_iteration": 3.0409903526306152 }, { "auxiliary_loss_clip": 0.01515412, "auxiliary_loss_mlp": 0.01043645, "balance_loss_clip": 1.30699086, "balance_loss_mlp": 1.01697803, "epoch": 0.23327822035172102, "flos": 21334727877120.0, "grad_norm": 2.2963304996056704, "language_loss": 0.90426576, "learning_rate": 3.5819628027061795e-06, "loss": 0.9298563, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.2668457, "step": 3880, "time_per_iteration": 2.864959239959717 }, { "auxiliary_loss_clip": 0.01530436, "auxiliary_loss_mlp": 0.01048886, "balance_loss_clip": 1.31790876, "balance_loss_mlp": 1.02343524, "epoch": 0.233338343604389, "flos": 19181231481600.0, "grad_norm": 1.592694136361489, "language_loss": 0.72640562, "learning_rate": 3.5817244844029334e-06, "loss": 0.75219887, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25476074, "step": 3881, "time_per_iteration": 2.8695712089538574 }, { "auxiliary_loss_clip": 0.01521284, "auxiliary_loss_mlp": 0.01049481, "balance_loss_clip": 1.31443143, "balance_loss_mlp": 1.02449441, "epoch": 0.23339846685705698, "flos": 26920116524160.0, "grad_norm": 2.527193248406329, "language_loss": 0.69068158, "learning_rate": 3.581486106120537e-06, "loss": 0.71638918, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.24963379, "step": 3882, "time_per_iteration": 2.949002265930176 }, { "auxiliary_loss_clip": 0.01538943, "auxiliary_loss_mlp": 0.01052866, "balance_loss_clip": 1.32790172, "balance_loss_mlp": 1.0267477, "epoch": 0.23345859010972494, "flos": 32355907303680.0, "grad_norm": 1.9182576662426278, "language_loss": 0.77711183, "learning_rate": 3.5812476678680287e-06, "loss": 0.80302989, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.26123047, "step": 3883, "time_per_iteration": 3.0050430297851562 }, { "auxiliary_loss_clip": 0.01309034, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.19264674, "balance_loss_mlp": 1.0038718, "epoch": 0.2335187133623929, "flos": 58517709396480.0, "grad_norm": 0.7757320585735973, "language_loss": 0.59073198, "learning_rate": 3.58100916965445e-06, "loss": 0.61412811, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 0.26757812, "step": 3884, "time_per_iteration": 3.5351359844207764 }, { "auxiliary_loss_clip": 0.0153637, "auxiliary_loss_mlp": 0.010479, "balance_loss_clip": 1.32326639, "balance_loss_mlp": 1.02263927, "epoch": 0.23357883661506088, "flos": 24513963310080.0, "grad_norm": 1.7390223326734005, "language_loss": 0.81264067, "learning_rate": 3.5807706114888455e-06, "loss": 0.83848333, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25280762, "step": 3885, "time_per_iteration": 2.894704818725586 }, { "auxiliary_loss_clip": 0.0152452, "auxiliary_loss_mlp": 0.01049873, "balance_loss_clip": 1.31652927, "balance_loss_mlp": 1.02469563, "epoch": 0.23363895986772884, "flos": 18956653701120.0, "grad_norm": 2.379114922718705, "language_loss": 0.88544381, "learning_rate": 3.580531993380261e-06, "loss": 0.91118765, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.25195312, "step": 3886, "time_per_iteration": 2.8602209091186523 }, { "auxiliary_loss_clip": 0.01540652, "auxiliary_loss_mlp": 0.01051055, "balance_loss_clip": 1.32988954, "balance_loss_mlp": 1.02459025, "epoch": 0.2336990831203968, "flos": 31699502720640.0, "grad_norm": 1.8140613898382878, "language_loss": 0.74040115, "learning_rate": 3.5802933153377445e-06, "loss": 0.7663182, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.26489258, "step": 3887, "time_per_iteration": 4.324591398239136 }, { "auxiliary_loss_clip": 0.01537951, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.32627845, "balance_loss_mlp": 1.01971626, "epoch": 0.23375920637306477, "flos": 27721368270720.0, "grad_norm": 1.7353037166693437, "language_loss": 0.85340917, "learning_rate": 3.5800545773703475e-06, "loss": 0.87923193, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24633789, "step": 3888, "time_per_iteration": 2.945384979248047 }, { "auxiliary_loss_clip": 0.01530975, "auxiliary_loss_mlp": 0.010518, "balance_loss_clip": 1.32305694, "balance_loss_mlp": 1.02640879, "epoch": 0.23381932962573276, "flos": 17684501627520.0, "grad_norm": 2.2237801328897477, "language_loss": 0.8927865, "learning_rate": 3.5798157794871225e-06, "loss": 0.91861415, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.25390625, "step": 3889, "time_per_iteration": 2.8776543140411377 }, { "auxiliary_loss_clip": 0.01525122, "auxiliary_loss_mlp": 0.01043992, "balance_loss_clip": 1.31505775, "balance_loss_mlp": 1.02006698, "epoch": 0.23387945287840073, "flos": 14398587659520.0, "grad_norm": 2.5782118886322762, "language_loss": 0.78585315, "learning_rate": 3.579576921697125e-06, "loss": 0.8115443, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23901367, "step": 3890, "time_per_iteration": 2.887502670288086 }, { "auxiliary_loss_clip": 0.01531216, "auxiliary_loss_mlp": 0.01049046, "balance_loss_clip": 1.32173514, "balance_loss_mlp": 1.02299917, "epoch": 0.2339395761310687, "flos": 46114224791040.0, "grad_norm": 1.673112527744224, "language_loss": 0.74351966, "learning_rate": 3.579338004009412e-06, "loss": 0.76932234, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.26025391, "step": 3891, "time_per_iteration": 3.1023240089416504 }, { "auxiliary_loss_clip": 0.01522513, "auxiliary_loss_mlp": 0.01046857, "balance_loss_clip": 1.31765425, "balance_loss_mlp": 1.02183449, "epoch": 0.23399969938373666, "flos": 22392075047040.0, "grad_norm": 1.5751556357755865, "language_loss": 0.83465695, "learning_rate": 3.5790990264330433e-06, "loss": 0.86035061, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.25024414, "step": 3892, "time_per_iteration": 2.985633134841919 }, { "auxiliary_loss_clip": 0.01544975, "auxiliary_loss_mlp": 0.01043555, "balance_loss_clip": 1.33311391, "balance_loss_mlp": 1.01846159, "epoch": 0.23405982263640462, "flos": 43524377112960.0, "grad_norm": 1.5379876767542195, "language_loss": 0.65531898, "learning_rate": 3.578859988977082e-06, "loss": 0.68120426, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25097656, "step": 3893, "time_per_iteration": 3.0850818157196045 }, { "auxiliary_loss_clip": 0.01529881, "auxiliary_loss_mlp": 0.01045905, "balance_loss_clip": 1.32521558, "balance_loss_mlp": 1.01991749, "epoch": 0.2341199458890726, "flos": 22574819370240.0, "grad_norm": 2.094694898739344, "language_loss": 0.80124891, "learning_rate": 3.5786208916505916e-06, "loss": 0.82700682, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.2598877, "step": 3894, "time_per_iteration": 2.950762987136841 }, { "auxiliary_loss_clip": 0.01520726, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 1.31567121, "balance_loss_mlp": 1.01592541, "epoch": 0.23418006914174055, "flos": 25646064168960.0, "grad_norm": 1.9694474495115317, "language_loss": 0.82453299, "learning_rate": 3.5783817344626383e-06, "loss": 0.85014904, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24963379, "step": 3895, "time_per_iteration": 4.411352872848511 }, { "auxiliary_loss_clip": 0.0152709, "auxiliary_loss_mlp": 0.01039986, "balance_loss_clip": 1.3207159, "balance_loss_mlp": 1.01379621, "epoch": 0.23424019239440855, "flos": 13551973361280.0, "grad_norm": 3.452224374386368, "language_loss": 0.81507087, "learning_rate": 3.578142517422292e-06, "loss": 0.84074163, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.26208496, "step": 3896, "time_per_iteration": 4.28142786026001 }, { "auxiliary_loss_clip": 0.01535632, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.32405353, "balance_loss_mlp": 1.02305627, "epoch": 0.2343003156470765, "flos": 22429791227520.0, "grad_norm": 2.1273465984258713, "language_loss": 0.84026778, "learning_rate": 3.577903240538623e-06, "loss": 0.8661139, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.2590332, "step": 3897, "time_per_iteration": 2.8864715099334717 }, { "auxiliary_loss_clip": 0.01546992, "auxiliary_loss_mlp": 0.01047647, "balance_loss_clip": 1.33330107, "balance_loss_mlp": 1.0209564, "epoch": 0.23436043889974448, "flos": 14798308636800.0, "grad_norm": 1.6713852780429568, "language_loss": 0.79789764, "learning_rate": 3.577663903820705e-06, "loss": 0.82384402, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.2668457, "step": 3898, "time_per_iteration": 4.402770757675171 }, { "auxiliary_loss_clip": 0.0151621, "auxiliary_loss_mlp": 0.01045622, "balance_loss_clip": 1.31384015, "balance_loss_mlp": 1.02021849, "epoch": 0.23442056215241244, "flos": 22975852446720.0, "grad_norm": 2.110013659373313, "language_loss": 0.74920154, "learning_rate": 3.577424507277614e-06, "loss": 0.77481985, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.25402832, "step": 3899, "time_per_iteration": 3.0654308795928955 }, { "auxiliary_loss_clip": 0.0153304, "auxiliary_loss_mlp": 0.01040166, "balance_loss_clip": 1.32414353, "balance_loss_mlp": 1.01507282, "epoch": 0.2344806854050804, "flos": 23081218882560.0, "grad_norm": 1.5543533618030059, "language_loss": 0.7661137, "learning_rate": 3.5771850509184277e-06, "loss": 0.79184574, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.25109863, "step": 3900, "time_per_iteration": 2.9907684326171875 }, { "auxiliary_loss_clip": 0.01517686, "auxiliary_loss_mlp": 0.01045357, "balance_loss_clip": 1.31121171, "balance_loss_mlp": 1.01964366, "epoch": 0.23454080865774837, "flos": 16335967052160.0, "grad_norm": 2.518979459363364, "language_loss": 0.67658055, "learning_rate": 3.5769455347522256e-06, "loss": 0.70221102, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.25720215, "step": 3901, "time_per_iteration": 2.8332109451293945 }, { "auxiliary_loss_clip": 0.01313409, "auxiliary_loss_mlp": 0.01041212, "balance_loss_clip": 1.19439197, "balance_loss_mlp": 1.02232969, "epoch": 0.23460093191041637, "flos": 67789800881280.0, "grad_norm": 0.7693293793050607, "language_loss": 0.58197355, "learning_rate": 3.576705958788091e-06, "loss": 0.60551977, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.18847656, "step": 3902, "time_per_iteration": 3.3710453510284424 }, { "auxiliary_loss_clip": 0.01521177, "auxiliary_loss_mlp": 0.01049879, "balance_loss_clip": 1.31593668, "balance_loss_mlp": 1.0222342, "epoch": 0.23466105516308433, "flos": 20085542179200.0, "grad_norm": 1.9462754592324514, "language_loss": 0.81076485, "learning_rate": 3.576466323035108e-06, "loss": 0.83647537, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.27648926, "step": 3903, "time_per_iteration": 2.8782460689544678 }, { "auxiliary_loss_clip": 0.01517306, "auxiliary_loss_mlp": 0.0104538, "balance_loss_clip": 1.31115782, "balance_loss_mlp": 1.01976192, "epoch": 0.2347211784157523, "flos": 24546566828160.0, "grad_norm": 2.0615651854189228, "language_loss": 0.83592987, "learning_rate": 3.5762266275023645e-06, "loss": 0.86155677, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.25622559, "step": 3904, "time_per_iteration": 2.8895552158355713 }, { "auxiliary_loss_clip": 0.01521769, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.31571901, "balance_loss_mlp": 1.02487326, "epoch": 0.23478130166842026, "flos": 23815453800960.0, "grad_norm": 4.189836373182601, "language_loss": 0.72241211, "learning_rate": 3.57598687219895e-06, "loss": 0.74813336, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.25512695, "step": 3905, "time_per_iteration": 2.8616042137145996 }, { "auxiliary_loss_clip": 0.01512216, "auxiliary_loss_mlp": 0.01041888, "balance_loss_clip": 1.30953181, "balance_loss_mlp": 1.01690221, "epoch": 0.23484142492108823, "flos": 24103247846400.0, "grad_norm": 1.6131483515808744, "language_loss": 0.7199837, "learning_rate": 3.5757470571339543e-06, "loss": 0.74552476, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.24987793, "step": 3906, "time_per_iteration": 2.9244043827056885 }, { "auxiliary_loss_clip": 0.01533829, "auxiliary_loss_mlp": 0.01042888, "balance_loss_clip": 1.32046556, "balance_loss_mlp": 1.01710343, "epoch": 0.2349015481737562, "flos": 29107709516160.0, "grad_norm": 2.349421809353929, "language_loss": 0.7440778, "learning_rate": 3.575507182316473e-06, "loss": 0.76984501, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25817871, "step": 3907, "time_per_iteration": 2.9132325649261475 }, { "auxiliary_loss_clip": 0.01523632, "auxiliary_loss_mlp": 0.01047711, "balance_loss_clip": 1.31576991, "balance_loss_mlp": 1.02125812, "epoch": 0.23496167142642416, "flos": 18925543261440.0, "grad_norm": 2.4983415284705215, "language_loss": 0.73553085, "learning_rate": 3.575267247755601e-06, "loss": 0.7612443, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.26464844, "step": 3908, "time_per_iteration": 2.8181638717651367 }, { "auxiliary_loss_clip": 0.0130743, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.18858635, "balance_loss_mlp": 1.0073632, "epoch": 0.23502179467909215, "flos": 55894199587200.0, "grad_norm": 1.0346300906367807, "language_loss": 0.73429942, "learning_rate": 3.5750272534604367e-06, "loss": 0.75764668, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.19921875, "step": 3909, "time_per_iteration": 3.1355514526367188 }, { "auxiliary_loss_clip": 0.01514674, "auxiliary_loss_mlp": 0.01042105, "balance_loss_clip": 1.30687785, "balance_loss_mlp": 1.01655817, "epoch": 0.23508191793176011, "flos": 23411570302080.0, "grad_norm": 1.5439285983534448, "language_loss": 0.88671935, "learning_rate": 3.5747871994400822e-06, "loss": 0.91228718, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.25585938, "step": 3910, "time_per_iteration": 2.904994487762451 }, { "auxiliary_loss_clip": 0.01524678, "auxiliary_loss_mlp": 0.01046066, "balance_loss_clip": 1.31789446, "balance_loss_mlp": 1.02149653, "epoch": 0.23514204118442808, "flos": 20057282161920.0, "grad_norm": 1.827705736771759, "language_loss": 0.77116811, "learning_rate": 3.5745470857036386e-06, "loss": 0.7968756, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24572754, "step": 3911, "time_per_iteration": 2.906074047088623 }, { "auxiliary_loss_clip": 0.01502391, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.30240893, "balance_loss_mlp": 1.02051914, "epoch": 0.23520216443709605, "flos": 21590732810880.0, "grad_norm": 1.5507428808047905, "language_loss": 0.82086313, "learning_rate": 3.5743069122602122e-06, "loss": 0.84633315, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.24084473, "step": 3912, "time_per_iteration": 3.0106732845306396 }, { "auxiliary_loss_clip": 0.01501162, "auxiliary_loss_mlp": 0.01045044, "balance_loss_clip": 1.30060363, "balance_loss_mlp": 1.01897311, "epoch": 0.235262287689764, "flos": 23196312950400.0, "grad_norm": 39.04365376997832, "language_loss": 0.72738969, "learning_rate": 3.574066679118909e-06, "loss": 0.75285184, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.26062012, "step": 3913, "time_per_iteration": 2.892909526824951 }, { "auxiliary_loss_clip": 0.01531442, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.32210803, "balance_loss_mlp": 1.02221727, "epoch": 0.23532241094243198, "flos": 23195589033600.0, "grad_norm": 2.645132650676352, "language_loss": 0.76532364, "learning_rate": 3.57382638628884e-06, "loss": 0.79112321, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.26293945, "step": 3914, "time_per_iteration": 2.9661648273468018 }, { "auxiliary_loss_clip": 0.01521835, "auxiliary_loss_mlp": 0.01046195, "balance_loss_clip": 1.31599402, "balance_loss_mlp": 1.02050579, "epoch": 0.23538253419509997, "flos": 17028142289280.0, "grad_norm": 3.6052963570970307, "language_loss": 0.9037503, "learning_rate": 3.5735860337791174e-06, "loss": 0.9294306, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.25695801, "step": 3915, "time_per_iteration": 2.8724114894866943 }, { "auxiliary_loss_clip": 0.01306756, "auxiliary_loss_mlp": 0.01030756, "balance_loss_clip": 1.18833947, "balance_loss_mlp": 1.00872612, "epoch": 0.23544265744776793, "flos": 63474483047040.0, "grad_norm": 0.8161060406184484, "language_loss": 0.59429026, "learning_rate": 3.573345621598854e-06, "loss": 0.61766535, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.22070312, "step": 3916, "time_per_iteration": 3.3638880252838135 }, { "auxiliary_loss_clip": 0.0130461, "auxiliary_loss_mlp": 0.01047193, "balance_loss_clip": 1.18631697, "balance_loss_mlp": 1.02707005, "epoch": 0.2355027807004359, "flos": 70549561342080.0, "grad_norm": 0.8017934055820477, "language_loss": 0.49514854, "learning_rate": 3.5731051497571675e-06, "loss": 0.51866657, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.20117188, "step": 3917, "time_per_iteration": 3.39274263381958 }, { "auxiliary_loss_clip": 0.01533794, "auxiliary_loss_mlp": 0.01053236, "balance_loss_clip": 1.32387567, "balance_loss_mlp": 1.02585363, "epoch": 0.23556290395310386, "flos": 21444302079360.0, "grad_norm": 2.2310418226901993, "language_loss": 0.7706787, "learning_rate": 3.5728646182631756e-06, "loss": 0.79654908, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.27380371, "step": 3918, "time_per_iteration": 2.9718573093414307 }, { "auxiliary_loss_clip": 0.01525352, "auxiliary_loss_mlp": 0.01047833, "balance_loss_clip": 1.31588542, "balance_loss_mlp": 1.0208801, "epoch": 0.23562302720577183, "flos": 18195425619840.0, "grad_norm": 1.7888915722780365, "language_loss": 0.70243579, "learning_rate": 3.5726240271259995e-06, "loss": 0.72816765, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.26940918, "step": 3919, "time_per_iteration": 2.9170117378234863 }, { "auxiliary_loss_clip": 0.01505012, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.30373693, "balance_loss_mlp": 1.02315772, "epoch": 0.2356831504584398, "flos": 33743832117120.0, "grad_norm": 1.6909528770438067, "language_loss": 0.70612615, "learning_rate": 3.5723833763547634e-06, "loss": 0.73166454, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.25708008, "step": 3920, "time_per_iteration": 3.037109851837158 }, { "auxiliary_loss_clip": 0.01524089, "auxiliary_loss_mlp": 0.01051416, "balance_loss_clip": 1.3197844, "balance_loss_mlp": 1.02575004, "epoch": 0.23574327371110776, "flos": 24942622976640.0, "grad_norm": 1.6166843749479267, "language_loss": 0.77853066, "learning_rate": 3.5721426659585916e-06, "loss": 0.80428571, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.25695801, "step": 3921, "time_per_iteration": 2.9396636486053467 }, { "auxiliary_loss_clip": 0.01517882, "auxiliary_loss_mlp": 0.01044224, "balance_loss_clip": 1.31237209, "balance_loss_mlp": 1.01762807, "epoch": 0.23580339696377575, "flos": 17831158583040.0, "grad_norm": 2.1594104470764117, "language_loss": 0.76845366, "learning_rate": 3.571901895946612e-06, "loss": 0.79407471, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.26599121, "step": 3922, "time_per_iteration": 4.266889333724976 }, { "auxiliary_loss_clip": 0.01510646, "auxiliary_loss_mlp": 0.01045753, "balance_loss_clip": 1.30670297, "balance_loss_mlp": 1.02161312, "epoch": 0.23586352021644372, "flos": 26297808537600.0, "grad_norm": 1.9644969624818611, "language_loss": 0.81235725, "learning_rate": 3.571661066327956e-06, "loss": 0.83792126, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.24157715, "step": 3923, "time_per_iteration": 2.944692611694336 }, { "auxiliary_loss_clip": 0.01522672, "auxiliary_loss_mlp": 0.01048711, "balance_loss_clip": 1.31808555, "balance_loss_mlp": 1.02203214, "epoch": 0.23592364346911168, "flos": 14254871616000.0, "grad_norm": 1.7653891236082997, "language_loss": 0.7501303, "learning_rate": 3.571420177111754e-06, "loss": 0.7758441, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26672363, "step": 3924, "time_per_iteration": 3.051602602005005 }, { "auxiliary_loss_clip": 0.01523703, "auxiliary_loss_mlp": 0.01051804, "balance_loss_clip": 1.31932521, "balance_loss_mlp": 1.02590001, "epoch": 0.23598376672177965, "flos": 18597092123520.0, "grad_norm": 2.173866935397387, "language_loss": 0.83098733, "learning_rate": 3.5711792283071416e-06, "loss": 0.85674244, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25915527, "step": 3925, "time_per_iteration": 2.975119113922119 }, { "auxiliary_loss_clip": 0.01525763, "auxiliary_loss_mlp": 0.01053814, "balance_loss_clip": 1.31665826, "balance_loss_mlp": 1.02699196, "epoch": 0.2360438899744476, "flos": 22686067630080.0, "grad_norm": 2.076292465817274, "language_loss": 0.60473275, "learning_rate": 3.5709382199232564e-06, "loss": 0.63052857, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.26831055, "step": 3926, "time_per_iteration": 2.9609193801879883 }, { "auxiliary_loss_clip": 0.01496723, "auxiliary_loss_mlp": 0.01046206, "balance_loss_clip": 1.29562259, "balance_loss_mlp": 1.02126789, "epoch": 0.23610401322711558, "flos": 29582138937600.0, "grad_norm": 2.012968638031437, "language_loss": 0.72763968, "learning_rate": 3.570697151969235e-06, "loss": 0.75306892, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24938965, "step": 3927, "time_per_iteration": 3.014434576034546 }, { "auxiliary_loss_clip": 0.01504857, "auxiliary_loss_mlp": 0.01046519, "balance_loss_clip": 1.30192327, "balance_loss_mlp": 1.02172327, "epoch": 0.23616413647978354, "flos": 17867562664320.0, "grad_norm": 1.69707312890078, "language_loss": 0.75539434, "learning_rate": 3.570456024454221e-06, "loss": 0.78090811, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.24816895, "step": 3928, "time_per_iteration": 2.8798787593841553 }, { "auxiliary_loss_clip": 0.01530552, "auxiliary_loss_mlp": 0.01054589, "balance_loss_clip": 1.32254958, "balance_loss_mlp": 1.02740908, "epoch": 0.23622425973245154, "flos": 11041901544960.0, "grad_norm": 2.806922488238597, "language_loss": 0.83738416, "learning_rate": 3.5702148373873576e-06, "loss": 0.86323559, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.27160645, "step": 3929, "time_per_iteration": 2.860170364379883 }, { "auxiliary_loss_clip": 0.01547299, "auxiliary_loss_mlp": 0.01053624, "balance_loss_clip": 1.3337816, "balance_loss_mlp": 1.02490604, "epoch": 0.2362843829851195, "flos": 23414465969280.0, "grad_norm": 2.3950859696548443, "language_loss": 0.72985625, "learning_rate": 3.569973590777789e-06, "loss": 0.75586545, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.28723145, "step": 3930, "time_per_iteration": 4.292843818664551 }, { "auxiliary_loss_clip": 0.0151579, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.31145334, "balance_loss_mlp": 1.01708496, "epoch": 0.23634450623778747, "flos": 39544206647040.0, "grad_norm": 1.804919467961483, "language_loss": 0.7524094, "learning_rate": 3.569732284634665e-06, "loss": 0.77800471, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.26672363, "step": 3931, "time_per_iteration": 3.0175082683563232 }, { "auxiliary_loss_clip": 0.0152675, "auxiliary_loss_mlp": 0.01044326, "balance_loss_clip": 1.32015824, "balance_loss_mlp": 1.01758766, "epoch": 0.23640462949045543, "flos": 24217889466240.0, "grad_norm": 2.0665505112122884, "language_loss": 0.80694991, "learning_rate": 3.569490918967136e-06, "loss": 0.83266068, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.26733398, "step": 3932, "time_per_iteration": 4.3228371143341064 }, { "auxiliary_loss_clip": 0.01506237, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.30519938, "balance_loss_mlp": 1.01548982, "epoch": 0.2364647527431234, "flos": 26188913007360.0, "grad_norm": 1.5131036586349695, "language_loss": 0.87020159, "learning_rate": 3.5692494937843537e-06, "loss": 0.89566374, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.24462891, "step": 3933, "time_per_iteration": 4.3813183307647705 }, { "auxiliary_loss_clip": 0.01529288, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.32021904, "balance_loss_mlp": 1.0196979, "epoch": 0.23652487599579136, "flos": 22646994105600.0, "grad_norm": 5.146381076287382, "language_loss": 0.84030402, "learning_rate": 3.5690080090954727e-06, "loss": 0.8660599, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.26647949, "step": 3934, "time_per_iteration": 2.867572784423828 }, { "auxiliary_loss_clip": 0.01524097, "auxiliary_loss_mlp": 0.01050698, "balance_loss_clip": 1.31749368, "balance_loss_mlp": 1.02368557, "epoch": 0.23658499924845935, "flos": 21772255524480.0, "grad_norm": 1.548105574142038, "language_loss": 0.79836065, "learning_rate": 3.5687664649096515e-06, "loss": 0.8241086, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.27050781, "step": 3935, "time_per_iteration": 2.88531494140625 }, { "auxiliary_loss_clip": 0.01496948, "auxiliary_loss_mlp": 0.01044236, "balance_loss_clip": 1.29628265, "balance_loss_mlp": 1.01858211, "epoch": 0.23664512250112732, "flos": 21809066808960.0, "grad_norm": 2.0576660674508007, "language_loss": 0.80937278, "learning_rate": 3.5685248612360487e-06, "loss": 0.83478463, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.25683594, "step": 3936, "time_per_iteration": 2.902045965194702 }, { "auxiliary_loss_clip": 0.01509063, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.3055898, "balance_loss_mlp": 1.02024841, "epoch": 0.23670524575379528, "flos": 22648079980800.0, "grad_norm": 1.4812020059566067, "language_loss": 0.79734623, "learning_rate": 3.568283198083826e-06, "loss": 0.82290339, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.26428223, "step": 3937, "time_per_iteration": 2.90594482421875 }, { "auxiliary_loss_clip": 0.01515817, "auxiliary_loss_mlp": 0.01045846, "balance_loss_clip": 1.31608665, "balance_loss_mlp": 1.02120554, "epoch": 0.23676536900646325, "flos": 16733109075840.0, "grad_norm": 1.900343828252305, "language_loss": 0.86501133, "learning_rate": 3.568041475462147e-06, "loss": 0.89062798, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.24658203, "step": 3938, "time_per_iteration": 2.838836908340454 }, { "auxiliary_loss_clip": 0.01512248, "auxiliary_loss_mlp": 0.01046738, "balance_loss_clip": 1.31018317, "balance_loss_mlp": 1.02090526, "epoch": 0.23682549225913122, "flos": 11140436016000.0, "grad_norm": 2.420949367332862, "language_loss": 0.95157403, "learning_rate": 3.5677996933801785e-06, "loss": 0.97716391, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.25830078, "step": 3939, "time_per_iteration": 2.8480775356292725 }, { "auxiliary_loss_clip": 0.01526239, "auxiliary_loss_mlp": 0.01042682, "balance_loss_clip": 1.31889963, "balance_loss_mlp": 1.01680136, "epoch": 0.23688561551179918, "flos": 22567942160640.0, "grad_norm": 2.01045704807033, "language_loss": 0.8294906, "learning_rate": 3.567557851847088e-06, "loss": 0.85517979, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.25891113, "step": 3940, "time_per_iteration": 2.8664603233337402 }, { "auxiliary_loss_clip": 0.01535542, "auxiliary_loss_mlp": 0.01051121, "balance_loss_clip": 1.32186103, "balance_loss_mlp": 1.02370286, "epoch": 0.23694573876446715, "flos": 18524012492160.0, "grad_norm": 2.115666496689497, "language_loss": 0.90610313, "learning_rate": 3.5673159508720464e-06, "loss": 0.93196982, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.27404785, "step": 3941, "time_per_iteration": 2.824141502380371 }, { "auxiliary_loss_clip": 0.01534211, "auxiliary_loss_mlp": 0.01054373, "balance_loss_clip": 1.32313204, "balance_loss_mlp": 1.02812362, "epoch": 0.23700586201713514, "flos": 15342741043200.0, "grad_norm": 2.31504447821371, "language_loss": 0.85485953, "learning_rate": 3.5670739904642274e-06, "loss": 0.88074541, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.26257324, "step": 3942, "time_per_iteration": 2.820502519607544 }, { "auxiliary_loss_clip": 0.0153715, "auxiliary_loss_mlp": 0.01051893, "balance_loss_clip": 1.32605267, "balance_loss_mlp": 1.02452302, "epoch": 0.2370659852698031, "flos": 23957179073280.0, "grad_norm": 1.7431627216912864, "language_loss": 0.81624353, "learning_rate": 3.5668319706328065e-06, "loss": 0.842134, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.27331543, "step": 3943, "time_per_iteration": 2.899634599685669 }, { "auxiliary_loss_clip": 0.01545779, "auxiliary_loss_mlp": 0.01042762, "balance_loss_clip": 1.33030927, "balance_loss_mlp": 1.01713252, "epoch": 0.23712610852247107, "flos": 15338714256000.0, "grad_norm": 2.2959552895159714, "language_loss": 0.69467711, "learning_rate": 3.566589891386959e-06, "loss": 0.72056252, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25634766, "step": 3944, "time_per_iteration": 2.823150157928467 }, { "auxiliary_loss_clip": 0.01537529, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.32729316, "balance_loss_mlp": 1.02342129, "epoch": 0.23718623177513903, "flos": 19692019739520.0, "grad_norm": 1.8703510367380072, "language_loss": 0.76936924, "learning_rate": 3.566347752735866e-06, "loss": 0.79524487, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.26611328, "step": 3945, "time_per_iteration": 2.8503623008728027 }, { "auxiliary_loss_clip": 0.01546571, "auxiliary_loss_mlp": 0.01042914, "balance_loss_clip": 1.33678102, "balance_loss_mlp": 1.01833344, "epoch": 0.237246355027807, "flos": 24984230209920.0, "grad_norm": 1.4537830073880582, "language_loss": 0.65083402, "learning_rate": 3.5661055546887094e-06, "loss": 0.67672884, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.24597168, "step": 3946, "time_per_iteration": 2.9498884677886963 }, { "auxiliary_loss_clip": 0.01538533, "auxiliary_loss_mlp": 0.01043687, "balance_loss_clip": 1.3293196, "balance_loss_mlp": 1.01738906, "epoch": 0.23730647828047496, "flos": 15385796110080.0, "grad_norm": 2.204825824296122, "language_loss": 0.77855986, "learning_rate": 3.5658632972546734e-06, "loss": 0.80438209, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.26330566, "step": 3947, "time_per_iteration": 2.8044888973236084 }, { "auxiliary_loss_clip": 0.0154898, "auxiliary_loss_mlp": 0.01044658, "balance_loss_clip": 1.33813, "balance_loss_mlp": 1.01973116, "epoch": 0.23736660153314296, "flos": 28163194174080.0, "grad_norm": 1.5797423680134133, "language_loss": 0.81444597, "learning_rate": 3.565620980442944e-06, "loss": 0.8403824, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.24926758, "step": 3948, "time_per_iteration": 2.944415330886841 }, { "auxiliary_loss_clip": 0.01550657, "auxiliary_loss_mlp": 0.01047706, "balance_loss_clip": 1.33860159, "balance_loss_mlp": 1.02133703, "epoch": 0.23742672478581092, "flos": 22095594000000.0, "grad_norm": 1.7664978038736934, "language_loss": 0.81595725, "learning_rate": 3.5653786042627107e-06, "loss": 0.84194082, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.26367188, "step": 3949, "time_per_iteration": 2.8414676189422607 }, { "auxiliary_loss_clip": 0.01551433, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.33843923, "balance_loss_mlp": 1.02072167, "epoch": 0.2374868480384789, "flos": 19546629638400.0, "grad_norm": 2.0594572906467827, "language_loss": 0.74260879, "learning_rate": 3.565136168723163e-06, "loss": 0.76858819, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25793457, "step": 3950, "time_per_iteration": 2.869485378265381 }, { "auxiliary_loss_clip": 0.01534121, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.32555771, "balance_loss_mlp": 1.01917243, "epoch": 0.23754697129114685, "flos": 19430675919360.0, "grad_norm": 1.9937069474101101, "language_loss": 0.7342304, "learning_rate": 3.564893673833495e-06, "loss": 0.76000249, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.23950195, "step": 3951, "time_per_iteration": 2.8497581481933594 }, { "auxiliary_loss_clip": 0.01560433, "auxiliary_loss_mlp": 0.01047814, "balance_loss_clip": 1.34930682, "balance_loss_mlp": 1.02102768, "epoch": 0.23760709454381482, "flos": 19510632760320.0, "grad_norm": 1.9246186225149102, "language_loss": 0.74813879, "learning_rate": 3.564651119602903e-06, "loss": 0.77422118, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.26794434, "step": 3952, "time_per_iteration": 2.83380126953125 }, { "auxiliary_loss_clip": 0.01561434, "auxiliary_loss_mlp": 0.01048419, "balance_loss_clip": 1.34831405, "balance_loss_mlp": 1.02274096, "epoch": 0.23766721779648278, "flos": 27648152904960.0, "grad_norm": 1.594603718909292, "language_loss": 0.71831346, "learning_rate": 3.564408506040583e-06, "loss": 0.74441195, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25683594, "step": 3953, "time_per_iteration": 2.9337158203125 }, { "auxiliary_loss_clip": 0.01555479, "auxiliary_loss_mlp": 0.01049008, "balance_loss_clip": 1.34030294, "balance_loss_mlp": 1.02212584, "epoch": 0.23772734104915075, "flos": 23414827927680.0, "grad_norm": 1.7963930911697839, "language_loss": 0.82804978, "learning_rate": 3.5641658331557356e-06, "loss": 0.85409462, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.26904297, "step": 3954, "time_per_iteration": 2.9956421852111816 }, { "auxiliary_loss_clip": 0.01551672, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.34014988, "balance_loss_mlp": 1.02257252, "epoch": 0.23778746430181874, "flos": 15713885289600.0, "grad_norm": 2.092371956123049, "language_loss": 0.67076349, "learning_rate": 3.5639231009575634e-06, "loss": 0.69676989, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.26416016, "step": 3955, "time_per_iteration": 2.897327423095703 }, { "auxiliary_loss_clip": 0.01544539, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.33564019, "balance_loss_mlp": 1.02147841, "epoch": 0.2378475875544867, "flos": 19435652847360.0, "grad_norm": 1.4244923638869071, "language_loss": 0.84607583, "learning_rate": 3.5636803094552704e-06, "loss": 0.87198693, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.25097656, "step": 3956, "time_per_iteration": 2.9360969066619873 }, { "auxiliary_loss_clip": 0.01537056, "auxiliary_loss_mlp": 0.01043131, "balance_loss_clip": 1.32904935, "balance_loss_mlp": 1.01746535, "epoch": 0.23790771080715467, "flos": 22277523916800.0, "grad_norm": 2.0241728108383006, "language_loss": 0.86191249, "learning_rate": 3.5634374586580635e-06, "loss": 0.88771439, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.25683594, "step": 3957, "time_per_iteration": 4.318829774856567 }, { "auxiliary_loss_clip": 0.01537035, "auxiliary_loss_mlp": 0.01044214, "balance_loss_clip": 1.326092, "balance_loss_mlp": 1.01898956, "epoch": 0.23796783405982264, "flos": 20056829713920.0, "grad_norm": 1.8770273319911206, "language_loss": 0.71343458, "learning_rate": 3.563194548575151e-06, "loss": 0.73924708, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25244141, "step": 3958, "time_per_iteration": 2.8778750896453857 }, { "auxiliary_loss_clip": 0.01545847, "auxiliary_loss_mlp": 0.01043699, "balance_loss_clip": 1.3342644, "balance_loss_mlp": 1.01773477, "epoch": 0.2380279573124906, "flos": 14253921475200.0, "grad_norm": 2.4743879552785883, "language_loss": 0.67419732, "learning_rate": 3.562951579215745e-06, "loss": 0.70009279, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.25964355, "step": 3959, "time_per_iteration": 2.8004953861236572 }, { "auxiliary_loss_clip": 0.01544644, "auxiliary_loss_mlp": 0.01042274, "balance_loss_clip": 1.33655643, "balance_loss_mlp": 1.01788402, "epoch": 0.23808808056515857, "flos": 21189202041600.0, "grad_norm": 1.742023749824331, "language_loss": 0.73315847, "learning_rate": 3.5627085505890586e-06, "loss": 0.75902772, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.24389648, "step": 3960, "time_per_iteration": 2.849541425704956 }, { "auxiliary_loss_clip": 0.01543037, "auxiliary_loss_mlp": 0.01042553, "balance_loss_clip": 1.3329699, "balance_loss_mlp": 1.01619625, "epoch": 0.23814820381782653, "flos": 22538460533760.0, "grad_norm": 1.7519187691688702, "language_loss": 0.75916147, "learning_rate": 3.562465462704307e-06, "loss": 0.78501737, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.26367188, "step": 3961, "time_per_iteration": 2.8578577041625977 }, { "auxiliary_loss_clip": 0.01553307, "auxiliary_loss_mlp": 0.01048509, "balance_loss_clip": 1.33975244, "balance_loss_mlp": 1.02023244, "epoch": 0.23820832707049452, "flos": 22313113591680.0, "grad_norm": 2.182549379608525, "language_loss": 0.66652668, "learning_rate": 3.5622223155707085e-06, "loss": 0.69254482, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.28295898, "step": 3962, "time_per_iteration": 2.8278043270111084 }, { "auxiliary_loss_clip": 0.01539144, "auxiliary_loss_mlp": 0.01047168, "balance_loss_clip": 1.32896233, "balance_loss_mlp": 1.02248013, "epoch": 0.2382684503231625, "flos": 24875198945280.0, "grad_norm": 1.9090306286048286, "language_loss": 0.74962616, "learning_rate": 3.561979109197483e-06, "loss": 0.77548927, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24694824, "step": 3963, "time_per_iteration": 2.8934361934661865 }, { "auxiliary_loss_clip": 0.01558306, "auxiliary_loss_mlp": 0.01036869, "balance_loss_clip": 1.34373379, "balance_loss_mlp": 1.01145375, "epoch": 0.23832857357583045, "flos": 21881603502720.0, "grad_norm": 1.6912593943241845, "language_loss": 0.78495979, "learning_rate": 3.5617358435938538e-06, "loss": 0.8109116, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.25415039, "step": 3964, "time_per_iteration": 2.8404552936553955 }, { "auxiliary_loss_clip": 0.01539181, "auxiliary_loss_mlp": 0.01045653, "balance_loss_clip": 1.33210993, "balance_loss_mlp": 1.02096438, "epoch": 0.23838869682849842, "flos": 21297961837440.0, "grad_norm": 2.2331401482797557, "language_loss": 0.73290926, "learning_rate": 3.561492518769045e-06, "loss": 0.75875753, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24719238, "step": 3965, "time_per_iteration": 4.219902992248535 }, { "auxiliary_loss_clip": 0.01541221, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.33383346, "balance_loss_mlp": 1.02248943, "epoch": 0.23844882008116638, "flos": 16189310096640.0, "grad_norm": 1.7703392915046345, "language_loss": 0.79375303, "learning_rate": 3.561249134732282e-06, "loss": 0.81963682, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.24633789, "step": 3966, "time_per_iteration": 2.8615241050720215 }, { "auxiliary_loss_clip": 0.01556463, "auxiliary_loss_mlp": 0.0104511, "balance_loss_clip": 1.34637749, "balance_loss_mlp": 1.02044594, "epoch": 0.23850894333383435, "flos": 21079718328960.0, "grad_norm": 1.6226753905816218, "language_loss": 0.69612134, "learning_rate": 3.561005691492797e-06, "loss": 0.72213709, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.24682617, "step": 3967, "time_per_iteration": 4.395959377288818 }, { "auxiliary_loss_clip": 0.01555515, "auxiliary_loss_mlp": 0.01054256, "balance_loss_clip": 1.34432912, "balance_loss_mlp": 1.02963972, "epoch": 0.23856906658650234, "flos": 17210434164480.0, "grad_norm": 2.1114591538507073, "language_loss": 0.69540739, "learning_rate": 3.5607621890598185e-06, "loss": 0.72150505, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24645996, "step": 3968, "time_per_iteration": 4.234614372253418 }, { "auxiliary_loss_clip": 0.01543077, "auxiliary_loss_mlp": 0.01052968, "balance_loss_clip": 1.3331995, "balance_loss_mlp": 1.02768397, "epoch": 0.2386291898391703, "flos": 29505439722240.0, "grad_norm": 6.56384161639236, "language_loss": 0.77630305, "learning_rate": 3.5605186274425823e-06, "loss": 0.80226344, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.2532959, "step": 3969, "time_per_iteration": 2.91475248336792 }, { "auxiliary_loss_clip": 0.01557475, "auxiliary_loss_mlp": 0.01043001, "balance_loss_clip": 1.35054588, "balance_loss_mlp": 1.01847959, "epoch": 0.23868931309183827, "flos": 21152390757120.0, "grad_norm": 1.9853770457409146, "language_loss": 0.77703917, "learning_rate": 3.5602750066503225e-06, "loss": 0.80304396, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.2454834, "step": 3970, "time_per_iteration": 2.8673665523529053 }, { "auxiliary_loss_clip": 0.01554926, "auxiliary_loss_mlp": 0.01047218, "balance_loss_clip": 1.34227157, "balance_loss_mlp": 1.02270818, "epoch": 0.23874943634450624, "flos": 25668623341440.0, "grad_norm": 2.5531048161158774, "language_loss": 0.85936046, "learning_rate": 3.5600313266922793e-06, "loss": 0.88538188, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.24511719, "step": 3971, "time_per_iteration": 2.8758463859558105 }, { "auxiliary_loss_clip": 0.01278858, "auxiliary_loss_mlp": 0.0105489, "balance_loss_clip": 1.1657629, "balance_loss_mlp": 1.036484, "epoch": 0.2388095595971742, "flos": 59018046105600.0, "grad_norm": 0.7573690089249212, "language_loss": 0.62818253, "learning_rate": 3.5597875875776915e-06, "loss": 0.65152001, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.18359375, "step": 3972, "time_per_iteration": 3.4521329402923584 }, { "auxiliary_loss_clip": 0.01559966, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.34985948, "balance_loss_mlp": 1.01545143, "epoch": 0.23886968284984217, "flos": 16809084374400.0, "grad_norm": 2.2145282374838744, "language_loss": 0.82892931, "learning_rate": 3.5595437893158013e-06, "loss": 0.85492706, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.24353027, "step": 3973, "time_per_iteration": 2.8343052864074707 }, { "auxiliary_loss_clip": 0.0154603, "auxiliary_loss_mlp": 0.01052576, "balance_loss_clip": 1.33825254, "balance_loss_mlp": 1.02778006, "epoch": 0.23892980610251013, "flos": 22392844208640.0, "grad_norm": 1.532413655494669, "language_loss": 0.80047673, "learning_rate": 3.5592999319158546e-06, "loss": 0.82646275, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.24804688, "step": 3974, "time_per_iteration": 2.84763503074646 }, { "auxiliary_loss_clip": 0.01563468, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.35205662, "balance_loss_mlp": 1.02537942, "epoch": 0.23898992935517813, "flos": 12830859434880.0, "grad_norm": 1.9499612186035467, "language_loss": 0.85732222, "learning_rate": 3.5590560153870984e-06, "loss": 0.88346571, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.25463867, "step": 3975, "time_per_iteration": 2.8622958660125732 }, { "auxiliary_loss_clip": 0.01543374, "auxiliary_loss_mlp": 0.01042008, "balance_loss_clip": 1.33566916, "balance_loss_mlp": 1.0188216, "epoch": 0.2390500526078461, "flos": 22355716210560.0, "grad_norm": 2.282040057258212, "language_loss": 0.84178734, "learning_rate": 3.5588120397387816e-06, "loss": 0.86764109, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.23168945, "step": 3976, "time_per_iteration": 2.8790998458862305 }, { "auxiliary_loss_clip": 0.01534009, "auxiliary_loss_mlp": 0.01042624, "balance_loss_clip": 1.329584, "balance_loss_mlp": 1.02017677, "epoch": 0.23911017586051406, "flos": 22644641376000.0, "grad_norm": 1.8535276895342452, "language_loss": 0.75277513, "learning_rate": 3.5585680049801566e-06, "loss": 0.77854145, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.2244873, "step": 3977, "time_per_iteration": 2.872523546218872 }, { "auxiliary_loss_clip": 0.01536477, "auxiliary_loss_mlp": 0.0104763, "balance_loss_clip": 1.32876635, "balance_loss_mlp": 1.0235734, "epoch": 0.23917029911318202, "flos": 23662145859840.0, "grad_norm": 1.6669788033809092, "language_loss": 0.72807872, "learning_rate": 3.5583239111204764e-06, "loss": 0.75391978, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24060059, "step": 3978, "time_per_iteration": 2.8388442993164062 }, { "auxiliary_loss_clip": 0.01564621, "auxiliary_loss_mlp": 0.01052795, "balance_loss_clip": 1.35112047, "balance_loss_mlp": 1.02844048, "epoch": 0.23923042236585, "flos": 22793786795520.0, "grad_norm": 2.05705396466009, "language_loss": 0.79720128, "learning_rate": 3.558079758168997e-06, "loss": 0.82337546, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.24365234, "step": 3979, "time_per_iteration": 2.824420690536499 }, { "auxiliary_loss_clip": 0.01543135, "auxiliary_loss_mlp": 0.01050839, "balance_loss_clip": 1.3367269, "balance_loss_mlp": 1.02611494, "epoch": 0.23929054561851795, "flos": 28159484100480.0, "grad_norm": 1.8326109414324316, "language_loss": 0.82766134, "learning_rate": 3.557835546134977e-06, "loss": 0.8536011, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.24731445, "step": 3980, "time_per_iteration": 2.884382724761963 }, { "auxiliary_loss_clip": 0.01536719, "auxiliary_loss_mlp": 0.01047602, "balance_loss_clip": 1.33072937, "balance_loss_mlp": 1.02458239, "epoch": 0.23935066887118592, "flos": 21695782533120.0, "grad_norm": 1.6902024244381624, "language_loss": 0.84396827, "learning_rate": 3.5575912750276775e-06, "loss": 0.86981153, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.23034668, "step": 3981, "time_per_iteration": 2.8399159908294678 }, { "auxiliary_loss_clip": 0.01560133, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.34734535, "balance_loss_mlp": 1.02629602, "epoch": 0.2394107921238539, "flos": 32134451414400.0, "grad_norm": 1.9458902225879464, "language_loss": 0.779055, "learning_rate": 3.5573469448563607e-06, "loss": 0.80517197, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 2.12597656, "router_z_loss_mlp": 0.25268555, "step": 3982, "time_per_iteration": 2.907567024230957 }, { "auxiliary_loss_clip": 0.01535111, "auxiliary_loss_mlp": 0.01053673, "balance_loss_clip": 1.32864189, "balance_loss_mlp": 1.03089178, "epoch": 0.23947091537652188, "flos": 17027961310080.0, "grad_norm": 1.8724007398630402, "language_loss": 0.78483593, "learning_rate": 3.5571025556302915e-06, "loss": 0.81072378, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.2277832, "step": 3983, "time_per_iteration": 2.8125290870666504 }, { "auxiliary_loss_clip": 0.01531507, "auxiliary_loss_mlp": 0.01054416, "balance_loss_clip": 1.32427561, "balance_loss_mlp": 1.030586, "epoch": 0.23953103862918984, "flos": 20602981422720.0, "grad_norm": 4.237696213511316, "language_loss": 0.74380469, "learning_rate": 3.556858107358737e-06, "loss": 0.76966393, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.23828125, "step": 3984, "time_per_iteration": 2.903290033340454 }, { "auxiliary_loss_clip": 0.01538796, "auxiliary_loss_mlp": 0.01056329, "balance_loss_clip": 1.3296982, "balance_loss_mlp": 1.03247535, "epoch": 0.2395911618818578, "flos": 20714184437760.0, "grad_norm": 2.0649574702825637, "language_loss": 0.79572308, "learning_rate": 3.5566136000509674e-06, "loss": 0.82167435, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23864746, "step": 3985, "time_per_iteration": 2.8235702514648438 }, { "auxiliary_loss_clip": 0.01540894, "auxiliary_loss_mlp": 0.01057544, "balance_loss_clip": 1.33374977, "balance_loss_mlp": 1.0325458, "epoch": 0.23965128513452577, "flos": 27065099422080.0, "grad_norm": 2.033189932205901, "language_loss": 0.748052, "learning_rate": 3.556369033716254e-06, "loss": 0.77403641, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.25024414, "step": 3986, "time_per_iteration": 2.9143483638763428 }, { "auxiliary_loss_clip": 0.01549494, "auxiliary_loss_mlp": 0.01054651, "balance_loss_clip": 1.335271, "balance_loss_mlp": 1.03067815, "epoch": 0.23971140838719374, "flos": 23153529352320.0, "grad_norm": 2.068527092619971, "language_loss": 0.88230395, "learning_rate": 3.556124408363871e-06, "loss": 0.9083454, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 2.14160156, "router_z_loss_mlp": 0.23950195, "step": 3987, "time_per_iteration": 2.859154462814331 }, { "auxiliary_loss_clip": 0.01518085, "auxiliary_loss_mlp": 0.01046579, "balance_loss_clip": 1.31889343, "balance_loss_mlp": 1.02431095, "epoch": 0.23977153163986173, "flos": 18041981944320.0, "grad_norm": 2.1375307163852737, "language_loss": 0.84268945, "learning_rate": 3.5558797240030945e-06, "loss": 0.86833608, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.22277832, "step": 3988, "time_per_iteration": 2.8060498237609863 }, { "auxiliary_loss_clip": 0.01519512, "auxiliary_loss_mlp": 0.01050452, "balance_loss_clip": 1.31495762, "balance_loss_mlp": 1.02581096, "epoch": 0.2398316548925297, "flos": 18122345988480.0, "grad_norm": 2.7227803388797907, "language_loss": 0.85829449, "learning_rate": 3.5556349806432035e-06, "loss": 0.8839941, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.2467041, "step": 3989, "time_per_iteration": 2.8503901958465576 }, { "auxiliary_loss_clip": 0.01512145, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.30960822, "balance_loss_mlp": 1.02231622, "epoch": 0.23989177814519766, "flos": 12575306949120.0, "grad_norm": 2.023636911443924, "language_loss": 0.86251968, "learning_rate": 3.555390178293477e-06, "loss": 0.88809842, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.23413086, "step": 3990, "time_per_iteration": 2.8243470191955566 }, { "auxiliary_loss_clip": 0.01532489, "auxiliary_loss_mlp": 0.01049036, "balance_loss_clip": 1.32638168, "balance_loss_mlp": 1.0271014, "epoch": 0.23995190139786562, "flos": 25275417615360.0, "grad_norm": 1.712671784107316, "language_loss": 0.7729001, "learning_rate": 3.5551453169631994e-06, "loss": 0.79871535, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.21936035, "step": 3991, "time_per_iteration": 2.930565118789673 }, { "auxiliary_loss_clip": 0.01291292, "auxiliary_loss_mlp": 0.01023174, "balance_loss_clip": 1.17641592, "balance_loss_mlp": 1.00772405, "epoch": 0.2400120246505336, "flos": 61987888028160.0, "grad_norm": 0.8879148971818479, "language_loss": 0.63918686, "learning_rate": 3.554900396661656e-06, "loss": 0.66233152, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 0.15429688, "step": 3992, "time_per_iteration": 4.721688508987427 }, { "auxiliary_loss_clip": 0.01291246, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.17552853, "balance_loss_mlp": 1.00722837, "epoch": 0.24007214790320155, "flos": 66738607004160.0, "grad_norm": 0.7567890470030744, "language_loss": 0.62995815, "learning_rate": 3.5546554173981334e-06, "loss": 0.65313077, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.1875, "step": 3993, "time_per_iteration": 3.388216257095337 }, { "auxiliary_loss_clip": 0.0155336, "auxiliary_loss_mlp": 0.01047156, "balance_loss_clip": 1.34361041, "balance_loss_mlp": 1.02243197, "epoch": 0.24013227115586952, "flos": 25819307084160.0, "grad_norm": 1.5419392633269222, "language_loss": 0.78012401, "learning_rate": 3.5544103791819218e-06, "loss": 0.8061291, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24707031, "step": 3994, "time_per_iteration": 2.9148221015930176 }, { "auxiliary_loss_clip": 0.01550853, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.34106588, "balance_loss_mlp": 1.02515829, "epoch": 0.2401923944085375, "flos": 25568641036800.0, "grad_norm": 2.0870410129169743, "language_loss": 0.79295528, "learning_rate": 3.5541652820223124e-06, "loss": 0.81896639, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 2.09765625, "router_z_loss_mlp": 0.2512207, "step": 3995, "time_per_iteration": 2.9044837951660156 }, { "auxiliary_loss_clip": 0.01288486, "auxiliary_loss_mlp": 0.01019374, "balance_loss_clip": 1.17029428, "balance_loss_mlp": 1.000301, "epoch": 0.24025251766120548, "flos": 54972668603520.0, "grad_norm": 0.9113811358755965, "language_loss": 0.63485277, "learning_rate": 3.5539201259286006e-06, "loss": 0.65793133, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.19042969, "step": 3996, "time_per_iteration": 3.454390525817871 }, { "auxiliary_loss_clip": 0.01555453, "auxiliary_loss_mlp": 0.01046159, "balance_loss_clip": 1.3443346, "balance_loss_mlp": 1.02213788, "epoch": 0.24031264091387344, "flos": 20640742848000.0, "grad_norm": 2.4698875964816254, "language_loss": 0.70258915, "learning_rate": 3.5536749109100808e-06, "loss": 0.72860521, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.2401123, "step": 3997, "time_per_iteration": 2.9914352893829346 }, { "auxiliary_loss_clip": 0.01529168, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.32387996, "balance_loss_mlp": 1.02294135, "epoch": 0.2403727641665414, "flos": 20895933375360.0, "grad_norm": 2.13307114814015, "language_loss": 0.87700909, "learning_rate": 3.5534296369760535e-06, "loss": 0.90276957, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.23962402, "step": 3998, "time_per_iteration": 2.8382608890533447 }, { "auxiliary_loss_clip": 0.01559851, "auxiliary_loss_mlp": 0.01044844, "balance_loss_clip": 1.34476542, "balance_loss_mlp": 1.01928544, "epoch": 0.24043288741920937, "flos": 22830236121600.0, "grad_norm": 1.7185887107814417, "language_loss": 0.76394033, "learning_rate": 3.5531843041358183e-06, "loss": 0.78998733, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25598145, "step": 3999, "time_per_iteration": 2.880476236343384 }, { "auxiliary_loss_clip": 0.01529831, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.32556725, "balance_loss_mlp": 1.01957428, "epoch": 0.24049301067187734, "flos": 27969907812480.0, "grad_norm": 2.652143356394441, "language_loss": 0.73558527, "learning_rate": 3.552938912398679e-06, "loss": 0.76132321, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.24414062, "step": 4000, "time_per_iteration": 4.264817237854004 }, { "auxiliary_loss_clip": 0.01562282, "auxiliary_loss_mlp": 0.01048122, "balance_loss_clip": 1.34797573, "balance_loss_mlp": 1.02346933, "epoch": 0.24055313392454533, "flos": 27462196200960.0, "grad_norm": 2.2128145542419686, "language_loss": 0.67895555, "learning_rate": 3.5526934617739397e-06, "loss": 0.70505959, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.24645996, "step": 4001, "time_per_iteration": 2.8735036849975586 }, { "auxiliary_loss_clip": 0.01541141, "auxiliary_loss_mlp": 0.01047616, "balance_loss_clip": 1.33126009, "balance_loss_mlp": 1.02280807, "epoch": 0.2406132571772133, "flos": 25567555161600.0, "grad_norm": 1.709900383598382, "language_loss": 0.84027964, "learning_rate": 3.5524479522709095e-06, "loss": 0.86616725, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.2479248, "step": 4002, "time_per_iteration": 4.30093789100647 }, { "auxiliary_loss_clip": 0.01536093, "auxiliary_loss_mlp": 0.01038845, "balance_loss_clip": 1.32999289, "balance_loss_mlp": 1.01502693, "epoch": 0.24067338042988126, "flos": 24802390782720.0, "grad_norm": 1.8115652009772572, "language_loss": 0.83823979, "learning_rate": 3.552202383898897e-06, "loss": 0.86398917, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.23815918, "step": 4003, "time_per_iteration": 2.8722972869873047 }, { "auxiliary_loss_clip": 0.01566514, "auxiliary_loss_mlp": 0.01050647, "balance_loss_clip": 1.35509157, "balance_loss_mlp": 1.02666235, "epoch": 0.24073350368254923, "flos": 21187618473600.0, "grad_norm": 4.515549902131664, "language_loss": 0.88857913, "learning_rate": 3.551956756667215e-06, "loss": 0.91475075, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.23999023, "step": 4004, "time_per_iteration": 4.215851068496704 }, { "auxiliary_loss_clip": 0.01547329, "auxiliary_loss_mlp": 0.01050915, "balance_loss_clip": 1.33335495, "balance_loss_mlp": 1.02662063, "epoch": 0.2407936269352172, "flos": 22505087854080.0, "grad_norm": 2.208901344746885, "language_loss": 0.78745961, "learning_rate": 3.551711070585177e-06, "loss": 0.81344205, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24291992, "step": 4005, "time_per_iteration": 2.895538568496704 }, { "auxiliary_loss_clip": 0.01525872, "auxiliary_loss_mlp": 0.01047044, "balance_loss_clip": 1.32198119, "balance_loss_mlp": 1.0229044, "epoch": 0.24085375018788516, "flos": 18560190349440.0, "grad_norm": 1.5902826744612062, "language_loss": 0.79683352, "learning_rate": 3.5514653256620995e-06, "loss": 0.82256269, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.24133301, "step": 4006, "time_per_iteration": 2.847377300262451 }, { "auxiliary_loss_clip": 0.01564399, "auxiliary_loss_mlp": 0.010528, "balance_loss_clip": 1.34714985, "balance_loss_mlp": 1.02697945, "epoch": 0.24091387344055312, "flos": 24181213916160.0, "grad_norm": 1.6692640534790306, "language_loss": 0.72501504, "learning_rate": 3.551219521907302e-06, "loss": 0.75118703, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.25842285, "step": 4007, "time_per_iteration": 2.857990264892578 }, { "auxiliary_loss_clip": 0.01529198, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.32464862, "balance_loss_mlp": 1.02421331, "epoch": 0.24097399669322112, "flos": 11043756581760.0, "grad_norm": 1.6861190504102508, "language_loss": 0.7697804, "learning_rate": 3.5509736593301042e-06, "loss": 0.79555333, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.2388916, "step": 4008, "time_per_iteration": 2.8299713134765625 }, { "auxiliary_loss_clip": 0.0153837, "auxiliary_loss_mlp": 0.01044556, "balance_loss_clip": 1.32974744, "balance_loss_mlp": 1.02111912, "epoch": 0.24103411994588908, "flos": 17173894348800.0, "grad_norm": 2.9686758625456924, "language_loss": 0.76010048, "learning_rate": 3.5507277379398295e-06, "loss": 0.78592968, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.234375, "step": 4009, "time_per_iteration": 2.930962324142456 }, { "auxiliary_loss_clip": 0.01526328, "auxiliary_loss_mlp": 0.01046668, "balance_loss_clip": 1.32139695, "balance_loss_mlp": 1.02347028, "epoch": 0.24109424319855705, "flos": 20677735111680.0, "grad_norm": 13.245457932985847, "language_loss": 0.81277978, "learning_rate": 3.550481757745804e-06, "loss": 0.83850968, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.23193359, "step": 4010, "time_per_iteration": 2.8428843021392822 }, { "auxiliary_loss_clip": 0.01543345, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.33203673, "balance_loss_mlp": 1.02244794, "epoch": 0.241154366451225, "flos": 28192947269760.0, "grad_norm": 1.9971350706065911, "language_loss": 0.71395445, "learning_rate": 3.5502357187573555e-06, "loss": 0.73986483, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.25256348, "step": 4011, "time_per_iteration": 2.9492876529693604 }, { "auxiliary_loss_clip": 0.01542071, "auxiliary_loss_mlp": 0.01045698, "balance_loss_clip": 1.3323257, "balance_loss_mlp": 1.02121234, "epoch": 0.24121448970389298, "flos": 21699809320320.0, "grad_norm": 1.6965998898968548, "language_loss": 0.69654548, "learning_rate": 3.5499896209838118e-06, "loss": 0.72242314, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.24499512, "step": 4012, "time_per_iteration": 2.838156223297119 }, { "auxiliary_loss_clip": 0.01534962, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.32604218, "balance_loss_mlp": 1.01756096, "epoch": 0.24127461295656094, "flos": 39690682623360.0, "grad_norm": 1.611192746141382, "language_loss": 0.75061011, "learning_rate": 3.5497434644345073e-06, "loss": 0.77638924, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.25390625, "step": 4013, "time_per_iteration": 3.0236399173736572 }, { "auxiliary_loss_clip": 0.01553305, "auxiliary_loss_mlp": 0.01051083, "balance_loss_clip": 1.34252453, "balance_loss_mlp": 1.02722967, "epoch": 0.2413347362092289, "flos": 19145325093120.0, "grad_norm": 1.7014246774073183, "language_loss": 0.89092404, "learning_rate": 3.5494972491187753e-06, "loss": 0.91696793, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23852539, "step": 4014, "time_per_iteration": 2.8191099166870117 }, { "auxiliary_loss_clip": 0.01554705, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.33850956, "balance_loss_mlp": 1.02086258, "epoch": 0.2413948594618969, "flos": 26949779130240.0, "grad_norm": 2.351975150329219, "language_loss": 0.95784211, "learning_rate": 3.549250975045952e-06, "loss": 0.98383904, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.24133301, "step": 4015, "time_per_iteration": 2.9241411685943604 }, { "auxiliary_loss_clip": 0.01539523, "auxiliary_loss_mlp": 0.01043596, "balance_loss_clip": 1.32938147, "balance_loss_mlp": 1.01939702, "epoch": 0.24145498271456486, "flos": 25238923044480.0, "grad_norm": 1.5769877397366443, "language_loss": 0.84483731, "learning_rate": 3.5490046422253768e-06, "loss": 0.87066853, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.2421875, "step": 4016, "time_per_iteration": 2.9136672019958496 }, { "auxiliary_loss_clip": 0.01521718, "auxiliary_loss_mlp": 0.01047632, "balance_loss_clip": 1.31805253, "balance_loss_mlp": 1.02431512, "epoch": 0.24151510596723283, "flos": 40676624219520.0, "grad_norm": 2.0821015072609788, "language_loss": 0.69363886, "learning_rate": 3.54875825066639e-06, "loss": 0.71933234, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.2331543, "step": 4017, "time_per_iteration": 3.003466844558716 }, { "auxiliary_loss_clip": 0.01557129, "auxiliary_loss_mlp": 0.01055072, "balance_loss_clip": 1.34181488, "balance_loss_mlp": 1.03000271, "epoch": 0.2415752292199008, "flos": 18154813772160.0, "grad_norm": 1.5877086002957073, "language_loss": 0.85219491, "learning_rate": 3.5485118003783353e-06, "loss": 0.87831688, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.25073242, "step": 4018, "time_per_iteration": 2.8351943492889404 }, { "auxiliary_loss_clip": 0.01279373, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.15836358, "balance_loss_mlp": 1.01760864, "epoch": 0.24163535247256876, "flos": 67317588455040.0, "grad_norm": 0.8247855191503591, "language_loss": 0.60777897, "learning_rate": 3.548265291370558e-06, "loss": 0.63093281, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 1.2109375, "router_z_loss_mlp": 0.18359375, "step": 4019, "time_per_iteration": 3.410660982131958 }, { "auxiliary_loss_clip": 0.01527388, "auxiliary_loss_mlp": 0.01046649, "balance_loss_clip": 1.31916332, "balance_loss_mlp": 1.02391553, "epoch": 0.24169547572523672, "flos": 24939048637440.0, "grad_norm": 1.987518149210506, "language_loss": 0.74177909, "learning_rate": 3.5480187236524055e-06, "loss": 0.76751947, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.22741699, "step": 4020, "time_per_iteration": 2.89510440826416 }, { "auxiliary_loss_clip": 0.01551486, "auxiliary_loss_mlp": 0.01049487, "balance_loss_clip": 1.34226525, "balance_loss_mlp": 1.02600336, "epoch": 0.24175559897790472, "flos": 18736057463040.0, "grad_norm": 1.7970668602907283, "language_loss": 0.82996666, "learning_rate": 3.5477720972332285e-06, "loss": 0.8559764, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.23474121, "step": 4021, "time_per_iteration": 2.8438777923583984 }, { "auxiliary_loss_clip": 0.01547918, "auxiliary_loss_mlp": 0.01053425, "balance_loss_clip": 1.3360424, "balance_loss_mlp": 1.02687752, "epoch": 0.24181572223057268, "flos": 23049610750080.0, "grad_norm": 2.0345470685334637, "language_loss": 0.7733866, "learning_rate": 3.547525412122378e-06, "loss": 0.79939997, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.265625, "step": 4022, "time_per_iteration": 2.845194101333618 }, { "auxiliary_loss_clip": 0.01558214, "auxiliary_loss_mlp": 0.01048734, "balance_loss_clip": 1.34009981, "balance_loss_mlp": 1.02442753, "epoch": 0.24187584548324065, "flos": 20385869034240.0, "grad_norm": 1.704665729797788, "language_loss": 0.76065254, "learning_rate": 3.5472786683292083e-06, "loss": 0.78672206, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 2.17773438, "router_z_loss_mlp": 0.24304199, "step": 4023, "time_per_iteration": 2.870590925216675 }, { "auxiliary_loss_clip": 0.01547401, "auxiliary_loss_mlp": 0.01051221, "balance_loss_clip": 1.33567381, "balance_loss_mlp": 1.02635407, "epoch": 0.2419359687359086, "flos": 21407445550080.0, "grad_norm": 2.0502549368112946, "language_loss": 0.83288229, "learning_rate": 3.5470318658630766e-06, "loss": 0.8588686, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2487793, "step": 4024, "time_per_iteration": 2.845041513442993 }, { "auxiliary_loss_clip": 0.01528912, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.32253933, "balance_loss_mlp": 1.02439594, "epoch": 0.24199609198857658, "flos": 18378577146240.0, "grad_norm": 1.9630240144777524, "language_loss": 0.86579263, "learning_rate": 3.5467850047333424e-06, "loss": 0.89156562, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.23999023, "step": 4025, "time_per_iteration": 2.8311245441436768 }, { "auxiliary_loss_clip": 0.01555609, "auxiliary_loss_mlp": 0.01057011, "balance_loss_clip": 1.34124827, "balance_loss_mlp": 1.03265643, "epoch": 0.24205621524124454, "flos": 19473323783040.0, "grad_norm": 2.6797582123147996, "language_loss": 0.72414231, "learning_rate": 3.546538084949365e-06, "loss": 0.75026846, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.2434082, "step": 4026, "time_per_iteration": 2.8220276832580566 }, { "auxiliary_loss_clip": 0.01528933, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.32348263, "balance_loss_mlp": 1.02566516, "epoch": 0.2421163384939125, "flos": 14984672544000.0, "grad_norm": 2.0145058809473233, "language_loss": 0.65943408, "learning_rate": 3.546291106520509e-06, "loss": 0.68520999, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.22998047, "step": 4027, "time_per_iteration": 4.277646064758301 }, { "auxiliary_loss_clip": 0.01545477, "auxiliary_loss_mlp": 0.01054072, "balance_loss_clip": 1.33059132, "balance_loss_mlp": 1.0265348, "epoch": 0.2421764617465805, "flos": 18671529098880.0, "grad_norm": 1.7947282324871627, "language_loss": 0.7247051, "learning_rate": 3.5460440694561388e-06, "loss": 0.75070059, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.27502441, "step": 4028, "time_per_iteration": 2.871005058288574 }, { "auxiliary_loss_clip": 0.01275817, "auxiliary_loss_mlp": 0.01021617, "balance_loss_clip": 1.15912127, "balance_loss_mlp": 1.00492752, "epoch": 0.24223658499924847, "flos": 64379445154560.0, "grad_norm": 0.8560531053294638, "language_loss": 0.55393541, "learning_rate": 3.545796973765623e-06, "loss": 0.57690978, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.16699219, "step": 4029, "time_per_iteration": 3.3756840229034424 }, { "auxiliary_loss_clip": 0.01540592, "auxiliary_loss_mlp": 0.01050898, "balance_loss_clip": 1.33094108, "balance_loss_mlp": 1.02566087, "epoch": 0.24229670825191643, "flos": 25786160628480.0, "grad_norm": 2.113636184320302, "language_loss": 0.74870384, "learning_rate": 3.54554981945833e-06, "loss": 0.77461869, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.25256348, "step": 4030, "time_per_iteration": 2.873448133468628 }, { "auxiliary_loss_clip": 0.01540326, "auxiliary_loss_mlp": 0.01047563, "balance_loss_clip": 1.33140099, "balance_loss_mlp": 1.02360177, "epoch": 0.2423568315045844, "flos": 20676649236480.0, "grad_norm": 1.7125512941718748, "language_loss": 0.77676946, "learning_rate": 3.5453026065436343e-06, "loss": 0.80264837, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.23974609, "step": 4031, "time_per_iteration": 2.884960174560547 }, { "auxiliary_loss_clip": 0.01556613, "auxiliary_loss_mlp": 0.0105394, "balance_loss_clip": 1.34080589, "balance_loss_mlp": 1.02950168, "epoch": 0.24241695475725236, "flos": 22426397867520.0, "grad_norm": 2.339947637372905, "language_loss": 0.66330278, "learning_rate": 3.5450553350309083e-06, "loss": 0.6894083, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 2.15820312, "router_z_loss_mlp": 0.24450684, "step": 4032, "time_per_iteration": 2.8432087898254395 }, { "auxiliary_loss_clip": 0.01531974, "auxiliary_loss_mlp": 0.01041704, "balance_loss_clip": 1.32540989, "balance_loss_mlp": 1.01869678, "epoch": 0.24247707800992033, "flos": 17137897470720.0, "grad_norm": 1.93527291778241, "language_loss": 0.82039106, "learning_rate": 3.5448080049295286e-06, "loss": 0.84612787, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.23010254, "step": 4033, "time_per_iteration": 2.829012870788574 }, { "auxiliary_loss_clip": 0.01532042, "auxiliary_loss_mlp": 0.01042522, "balance_loss_clip": 1.32627308, "balance_loss_mlp": 1.01820314, "epoch": 0.2425372012625883, "flos": 31626377844480.0, "grad_norm": 1.8499002502884996, "language_loss": 0.70237982, "learning_rate": 3.5445606162488754e-06, "loss": 0.72812539, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.2434082, "step": 4034, "time_per_iteration": 2.9322004318237305 }, { "auxiliary_loss_clip": 0.01539856, "auxiliary_loss_mlp": 0.01041679, "balance_loss_clip": 1.3294003, "balance_loss_mlp": 1.01778924, "epoch": 0.24259732451525629, "flos": 16334654952960.0, "grad_norm": 2.234998207512587, "language_loss": 0.96860647, "learning_rate": 3.5443131689983283e-06, "loss": 0.9944219, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.2388916, "step": 4035, "time_per_iteration": 4.255556344985962 }, { "auxiliary_loss_clip": 0.01524644, "auxiliary_loss_mlp": 0.01046692, "balance_loss_clip": 1.32124877, "balance_loss_mlp": 1.02220654, "epoch": 0.24265744776792425, "flos": 22866866426880.0, "grad_norm": 1.6641152161470683, "language_loss": 0.79322374, "learning_rate": 3.5440656631872715e-06, "loss": 0.81893706, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.24487305, "step": 4036, "time_per_iteration": 2.9280831813812256 }, { "auxiliary_loss_clip": 0.01535853, "auxiliary_loss_mlp": 0.01040767, "balance_loss_clip": 1.3275609, "balance_loss_mlp": 1.01599514, "epoch": 0.24271757102059222, "flos": 21881558257920.0, "grad_norm": 2.028943667364006, "language_loss": 0.7511158, "learning_rate": 3.5438180988250898e-06, "loss": 0.77688193, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.24768066, "step": 4037, "time_per_iteration": 4.310131072998047 }, { "auxiliary_loss_clip": 0.01538517, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.32954121, "balance_loss_mlp": 1.01924443, "epoch": 0.24277769427326018, "flos": 19217997521280.0, "grad_norm": 2.1515862743842376, "language_loss": 0.78209341, "learning_rate": 3.543570475921171e-06, "loss": 0.80792814, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.25756836, "step": 4038, "time_per_iteration": 2.8268887996673584 }, { "auxiliary_loss_clip": 0.01533606, "auxiliary_loss_mlp": 0.01050211, "balance_loss_clip": 1.3254441, "balance_loss_mlp": 1.02418745, "epoch": 0.24283781752592815, "flos": 19509230171520.0, "grad_norm": 2.22871959886407, "language_loss": 0.72558099, "learning_rate": 3.543322794484905e-06, "loss": 0.75141919, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.26037598, "step": 4039, "time_per_iteration": 4.3983776569366455 }, { "auxiliary_loss_clip": 0.01547426, "auxiliary_loss_mlp": 0.01043507, "balance_loss_clip": 1.33706844, "balance_loss_mlp": 1.01749527, "epoch": 0.2428979407785961, "flos": 19911892060800.0, "grad_norm": 1.6588451499159933, "language_loss": 0.79931104, "learning_rate": 3.5430750545256843e-06, "loss": 0.82522035, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.26013184, "step": 4040, "time_per_iteration": 2.860842704772949 }, { "auxiliary_loss_clip": 0.0153238, "auxiliary_loss_mlp": 0.01040452, "balance_loss_clip": 1.32837009, "balance_loss_mlp": 1.01613379, "epoch": 0.2429580640312641, "flos": 24726189260160.0, "grad_norm": 1.8497861512407998, "language_loss": 0.80993712, "learning_rate": 3.5428272560529027e-06, "loss": 0.83566546, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.2434082, "step": 4041, "time_per_iteration": 2.876401662826538 }, { "auxiliary_loss_clip": 0.01533402, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.3273685, "balance_loss_mlp": 1.020576, "epoch": 0.24301818728393207, "flos": 25641494444160.0, "grad_norm": 9.02462556591432, "language_loss": 0.77303213, "learning_rate": 3.542579399075957e-06, "loss": 0.79881442, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.24243164, "step": 4042, "time_per_iteration": 2.88173246383667 }, { "auxiliary_loss_clip": 0.01531543, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.32709336, "balance_loss_mlp": 1.01445007, "epoch": 0.24307831053660003, "flos": 26152735150080.0, "grad_norm": 2.303921665234922, "language_loss": 0.82395589, "learning_rate": 3.542331483604246e-06, "loss": 0.84967738, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.26184082, "step": 4043, "time_per_iteration": 2.9143826961517334 }, { "auxiliary_loss_clip": 0.01557097, "auxiliary_loss_mlp": 0.01046132, "balance_loss_clip": 1.34262538, "balance_loss_mlp": 1.01993024, "epoch": 0.243138433789268, "flos": 14979740860800.0, "grad_norm": 3.7439719246110763, "language_loss": 0.73978198, "learning_rate": 3.5420835096471706e-06, "loss": 0.76581424, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26245117, "step": 4044, "time_per_iteration": 2.810695171356201 }, { "auxiliary_loss_clip": 0.01541471, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.33366275, "balance_loss_mlp": 1.01984787, "epoch": 0.24319855704193596, "flos": 25202518963200.0, "grad_norm": 3.3186675044968323, "language_loss": 0.85431659, "learning_rate": 3.5418354772141337e-06, "loss": 0.88018119, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.25170898, "step": 4045, "time_per_iteration": 2.916626453399658 }, { "auxiliary_loss_clip": 0.01541437, "auxiliary_loss_mlp": 0.01048006, "balance_loss_clip": 1.33408451, "balance_loss_mlp": 1.02280533, "epoch": 0.24325868029460393, "flos": 22137336967680.0, "grad_norm": 1.6749230965042554, "language_loss": 0.87596768, "learning_rate": 3.541587386314541e-06, "loss": 0.90186214, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 2.07324219, "router_z_loss_mlp": 0.2520752, "step": 4046, "time_per_iteration": 2.851457118988037 }, { "auxiliary_loss_clip": 0.01524633, "auxiliary_loss_mlp": 0.01041332, "balance_loss_clip": 1.32160234, "balance_loss_mlp": 1.01527309, "epoch": 0.2433188035472719, "flos": 23591418958080.0, "grad_norm": 2.3070992050247288, "language_loss": 0.73487997, "learning_rate": 3.5413392369578e-06, "loss": 0.76053959, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.26037598, "step": 4047, "time_per_iteration": 2.908132791519165 }, { "auxiliary_loss_clip": 0.01533618, "auxiliary_loss_mlp": 0.01042129, "balance_loss_clip": 1.3254987, "balance_loss_mlp": 1.01584291, "epoch": 0.2433789267999399, "flos": 24473668176000.0, "grad_norm": 3.41546600792429, "language_loss": 0.74041557, "learning_rate": 3.5410910291533213e-06, "loss": 0.76617301, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 2.07910156, "router_z_loss_mlp": 0.26257324, "step": 4048, "time_per_iteration": 2.876168966293335 }, { "auxiliary_loss_clip": 0.01533787, "auxiliary_loss_mlp": 0.01043567, "balance_loss_clip": 1.32754803, "balance_loss_mlp": 1.0189265, "epoch": 0.24343905005260785, "flos": 16736773904640.0, "grad_norm": 2.069100843552389, "language_loss": 0.74153674, "learning_rate": 3.5408427629105155e-06, "loss": 0.76731026, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.2467041, "step": 4049, "time_per_iteration": 2.877272844314575 }, { "auxiliary_loss_clip": 0.01527621, "auxiliary_loss_mlp": 0.01044162, "balance_loss_clip": 1.32245398, "balance_loss_mlp": 1.01915145, "epoch": 0.24349917330527582, "flos": 20052350478720.0, "grad_norm": 1.6155221165559606, "language_loss": 0.75157601, "learning_rate": 3.5405944382387985e-06, "loss": 0.77729386, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25024414, "step": 4050, "time_per_iteration": 2.8639228343963623 }, { "auxiliary_loss_clip": 0.01521953, "auxiliary_loss_mlp": 0.0104516, "balance_loss_clip": 1.31957769, "balance_loss_mlp": 1.01983964, "epoch": 0.24355929655794378, "flos": 17429446834560.0, "grad_norm": 2.4282746700594355, "language_loss": 0.75961912, "learning_rate": 3.5403460551475854e-06, "loss": 0.78529024, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.25341797, "step": 4051, "time_per_iteration": 2.844693899154663 }, { "auxiliary_loss_clip": 0.01537284, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.32925391, "balance_loss_mlp": 1.01779389, "epoch": 0.24361941981061175, "flos": 25421893591680.0, "grad_norm": 1.7619503585485203, "language_loss": 0.71723431, "learning_rate": 3.540097613646296e-06, "loss": 0.74302769, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.24267578, "step": 4052, "time_per_iteration": 3.0478668212890625 }, { "auxiliary_loss_clip": 0.01526787, "auxiliary_loss_mlp": 0.01048049, "balance_loss_clip": 1.32364154, "balance_loss_mlp": 1.02244294, "epoch": 0.2436795430632797, "flos": 22831186262400.0, "grad_norm": 1.5345535711094274, "language_loss": 0.82018745, "learning_rate": 3.539849113744351e-06, "loss": 0.84593582, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.25622559, "step": 4053, "time_per_iteration": 2.8567092418670654 }, { "auxiliary_loss_clip": 0.0152854, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.32173932, "balance_loss_mlp": 1.01642454, "epoch": 0.2437396663159477, "flos": 15165923788800.0, "grad_norm": 2.0303203919320927, "language_loss": 0.78611827, "learning_rate": 3.539600555451172e-06, "loss": 0.81181777, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24963379, "step": 4054, "time_per_iteration": 2.8815085887908936 }, { "auxiliary_loss_clip": 0.01530953, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.32496703, "balance_loss_mlp": 1.02249575, "epoch": 0.24379978956861567, "flos": 22101340089600.0, "grad_norm": 1.6729472135361405, "language_loss": 0.84812385, "learning_rate": 3.5393519387761866e-06, "loss": 0.87390834, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 2.05761719, "router_z_loss_mlp": 0.25024414, "step": 4055, "time_per_iteration": 2.858440399169922 }, { "auxiliary_loss_clip": 0.01555821, "auxiliary_loss_mlp": 0.01043987, "balance_loss_clip": 1.34277296, "balance_loss_mlp": 1.01883364, "epoch": 0.24385991282128364, "flos": 31480354316160.0, "grad_norm": 2.8092468307728025, "language_loss": 0.56427968, "learning_rate": 3.5391032637288217e-06, "loss": 0.59027779, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.25109863, "step": 4056, "time_per_iteration": 2.8953638076782227 }, { "auxiliary_loss_clip": 0.01553509, "auxiliary_loss_mlp": 0.0104653, "balance_loss_clip": 1.34256506, "balance_loss_mlp": 1.02051878, "epoch": 0.2439200360739516, "flos": 23848509767040.0, "grad_norm": 3.568979611954272, "language_loss": 0.81092751, "learning_rate": 3.538854530318506e-06, "loss": 0.83692789, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.26025391, "step": 4057, "time_per_iteration": 2.878134250640869 }, { "auxiliary_loss_clip": 0.0154353, "auxiliary_loss_mlp": 0.0104337, "balance_loss_clip": 1.33804238, "balance_loss_mlp": 1.01855087, "epoch": 0.24398015932661957, "flos": 19178833507200.0, "grad_norm": 1.6804874197186, "language_loss": 0.80812585, "learning_rate": 3.538605738554673e-06, "loss": 0.83399481, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.24829102, "step": 4058, "time_per_iteration": 2.8491127490997314 }, { "auxiliary_loss_clip": 0.0154463, "auxiliary_loss_mlp": 0.01042265, "balance_loss_clip": 1.3356992, "balance_loss_mlp": 1.01705194, "epoch": 0.24404028257928753, "flos": 25272748172160.0, "grad_norm": 1.6027979269298358, "language_loss": 0.86212075, "learning_rate": 3.538356888446756e-06, "loss": 0.8879897, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.2520752, "step": 4059, "time_per_iteration": 2.909658432006836 }, { "auxiliary_loss_clip": 0.01539266, "auxiliary_loss_mlp": 0.01042384, "balance_loss_clip": 1.33642495, "balance_loss_mlp": 1.01882827, "epoch": 0.2441004058319555, "flos": 26478471600000.0, "grad_norm": 1.5790547177953014, "language_loss": 0.74764204, "learning_rate": 3.5381079800041913e-06, "loss": 0.77345848, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.23547363, "step": 4060, "time_per_iteration": 2.8980133533477783 }, { "auxiliary_loss_clip": 0.01553974, "auxiliary_loss_mlp": 0.01046642, "balance_loss_clip": 1.344154, "balance_loss_mlp": 1.02070248, "epoch": 0.2441605290846235, "flos": 26771740266240.0, "grad_norm": 1.617433868957443, "language_loss": 0.74813801, "learning_rate": 3.5378590132364182e-06, "loss": 0.77414417, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.25976562, "step": 4061, "time_per_iteration": 2.9292373657226562 }, { "auxiliary_loss_clip": 0.01529249, "auxiliary_loss_mlp": 0.01053038, "balance_loss_clip": 1.32762694, "balance_loss_mlp": 1.02858853, "epoch": 0.24422065233729146, "flos": 21115760451840.0, "grad_norm": 1.7754115236026111, "language_loss": 0.77037323, "learning_rate": 3.5376099881528768e-06, "loss": 0.7961961, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.24450684, "step": 4062, "time_per_iteration": 4.366536855697632 }, { "auxiliary_loss_clip": 0.01536323, "auxiliary_loss_mlp": 0.01043596, "balance_loss_clip": 1.33613503, "balance_loss_mlp": 1.02064848, "epoch": 0.24428077558995942, "flos": 25273607823360.0, "grad_norm": 1.4955271960761864, "language_loss": 0.85614043, "learning_rate": 3.537360904763011e-06, "loss": 0.88193959, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.22937012, "step": 4063, "time_per_iteration": 2.8736603260040283 }, { "auxiliary_loss_clip": 0.01538275, "auxiliary_loss_mlp": 0.01038196, "balance_loss_clip": 1.32901478, "balance_loss_mlp": 1.01417542, "epoch": 0.24434089884262739, "flos": 20494945543680.0, "grad_norm": 2.0498084043999216, "language_loss": 0.69325483, "learning_rate": 3.5371117630762656e-06, "loss": 0.71901953, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.24035645, "step": 4064, "time_per_iteration": 2.8824684619903564 }, { "auxiliary_loss_clip": 0.01557877, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.34638429, "balance_loss_mlp": 1.01625979, "epoch": 0.24440102209529535, "flos": 23632076050560.0, "grad_norm": 1.4709937078484387, "language_loss": 0.70602274, "learning_rate": 3.536862563102088e-06, "loss": 0.73202342, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.25952148, "step": 4065, "time_per_iteration": 2.9009432792663574 }, { "auxiliary_loss_clip": 0.0154939, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 1.33864784, "balance_loss_mlp": 1.0178194, "epoch": 0.24446114534796332, "flos": 20563500695040.0, "grad_norm": 1.7488322837146757, "language_loss": 0.85410953, "learning_rate": 3.5366133048499282e-06, "loss": 0.88004011, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25854492, "step": 4066, "time_per_iteration": 2.8591020107269287 }, { "auxiliary_loss_clip": 0.0130175, "auxiliary_loss_mlp": 0.01039092, "balance_loss_clip": 1.18631864, "balance_loss_mlp": 1.01429641, "epoch": 0.24452126860063128, "flos": 60416178261120.0, "grad_norm": 0.7570450696352516, "language_loss": 0.5234533, "learning_rate": 3.5363639883292374e-06, "loss": 0.54686171, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.24707031, "step": 4067, "time_per_iteration": 3.323737144470215 }, { "auxiliary_loss_clip": 0.01543515, "auxiliary_loss_mlp": 0.010452, "balance_loss_clip": 1.3353765, "balance_loss_mlp": 1.02001131, "epoch": 0.24458139185329927, "flos": 15129881665920.0, "grad_norm": 2.534726198565204, "language_loss": 0.73941612, "learning_rate": 3.5361146135494706e-06, "loss": 0.76530325, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 2.078125, "router_z_loss_mlp": 0.25170898, "step": 4068, "time_per_iteration": 2.8170695304870605 }, { "auxiliary_loss_clip": 0.01530996, "auxiliary_loss_mlp": 0.01047479, "balance_loss_clip": 1.32688034, "balance_loss_mlp": 1.0221467, "epoch": 0.24464151510596724, "flos": 28009660008960.0, "grad_norm": 1.5152240841300242, "language_loss": 0.78581524, "learning_rate": 3.5358651805200835e-06, "loss": 0.81159997, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.25341797, "step": 4069, "time_per_iteration": 4.318330526351929 }, { "auxiliary_loss_clip": 0.01530332, "auxiliary_loss_mlp": 0.01051838, "balance_loss_clip": 1.32828248, "balance_loss_mlp": 1.02681637, "epoch": 0.2447016383586352, "flos": 19802589327360.0, "grad_norm": 1.7111457335156701, "language_loss": 0.81132317, "learning_rate": 3.5356156892505347e-06, "loss": 0.83714485, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.25036621, "step": 4070, "time_per_iteration": 2.8873612880706787 }, { "auxiliary_loss_clip": 0.01541261, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.3348068, "balance_loss_mlp": 1.02038991, "epoch": 0.24476176161130317, "flos": 26078388664320.0, "grad_norm": 4.535724780700914, "language_loss": 0.84827626, "learning_rate": 3.5353661397502854e-06, "loss": 0.87414145, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.24841309, "step": 4071, "time_per_iteration": 2.9025285243988037 }, { "auxiliary_loss_clip": 0.01564541, "auxiliary_loss_mlp": 0.0105842, "balance_loss_clip": 1.35068429, "balance_loss_mlp": 1.03259969, "epoch": 0.24482188486397113, "flos": 18852825588480.0, "grad_norm": 1.8858993192776379, "language_loss": 0.81069362, "learning_rate": 3.535116532028798e-06, "loss": 0.83692324, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.25830078, "step": 4072, "time_per_iteration": 4.234854221343994 }, { "auxiliary_loss_clip": 0.0153423, "auxiliary_loss_mlp": 0.01049778, "balance_loss_clip": 1.33388424, "balance_loss_mlp": 1.02621043, "epoch": 0.2448820081166391, "flos": 21261738735360.0, "grad_norm": 1.436359096763441, "language_loss": 0.7108981, "learning_rate": 3.5348668660955382e-06, "loss": 0.7367382, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.23571777, "step": 4073, "time_per_iteration": 2.8814799785614014 }, { "auxiliary_loss_clip": 0.01540846, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.33829176, "balance_loss_mlp": 1.0301851, "epoch": 0.2449421313693071, "flos": 23960934391680.0, "grad_norm": 2.379739349614642, "language_loss": 0.68706942, "learning_rate": 3.5346171419599728e-06, "loss": 0.71301824, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.23852539, "step": 4074, "time_per_iteration": 4.3329758644104 }, { "auxiliary_loss_clip": 0.01306369, "auxiliary_loss_mlp": 0.01031518, "balance_loss_clip": 1.1916734, "balance_loss_mlp": 1.00462472, "epoch": 0.24500225462197506, "flos": 60715962178560.0, "grad_norm": 1.0349972936049683, "language_loss": 0.68711865, "learning_rate": 3.5343673596315718e-06, "loss": 0.7104975, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 0.26953125, "step": 4075, "time_per_iteration": 3.4778990745544434 }, { "auxiliary_loss_clip": 0.01531343, "auxiliary_loss_mlp": 0.01051537, "balance_loss_clip": 1.32964897, "balance_loss_mlp": 1.02589512, "epoch": 0.24506237787464302, "flos": 26294777136000.0, "grad_norm": 1.794202287332855, "language_loss": 0.80614114, "learning_rate": 3.5341175191198063e-06, "loss": 0.83196998, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.25671387, "step": 4076, "time_per_iteration": 2.9264001846313477 }, { "auxiliary_loss_clip": 0.01549646, "auxiliary_loss_mlp": 0.01052303, "balance_loss_clip": 1.33847535, "balance_loss_mlp": 1.02655423, "epoch": 0.245122501127311, "flos": 20560424048640.0, "grad_norm": 2.3873004631587875, "language_loss": 0.8373028, "learning_rate": 3.533867620434151e-06, "loss": 0.86332226, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25744629, "step": 4077, "time_per_iteration": 2.841952085494995 }, { "auxiliary_loss_clip": 0.01547825, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.33900547, "balance_loss_mlp": 1.02707577, "epoch": 0.24518262437997895, "flos": 29144566045440.0, "grad_norm": 1.814100034227778, "language_loss": 0.63204509, "learning_rate": 3.533617663584082e-06, "loss": 0.65805137, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.25695801, "step": 4078, "time_per_iteration": 2.9186201095581055 }, { "auxiliary_loss_clip": 0.01540514, "auxiliary_loss_mlp": 0.01053533, "balance_loss_clip": 1.33660793, "balance_loss_mlp": 1.0297749, "epoch": 0.24524274763264692, "flos": 23487319376640.0, "grad_norm": 1.5111688765312712, "language_loss": 0.76275754, "learning_rate": 3.5333676485790765e-06, "loss": 0.78869802, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23742676, "step": 4079, "time_per_iteration": 2.8980133533477783 }, { "auxiliary_loss_clip": 0.01538826, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.33406591, "balance_loss_mlp": 1.02011061, "epoch": 0.24530287088531488, "flos": 17209891226880.0, "grad_norm": 2.5840070852938735, "language_loss": 0.75403917, "learning_rate": 3.5331175754286173e-06, "loss": 0.77988124, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25268555, "step": 4080, "time_per_iteration": 2.8104021549224854 }, { "auxiliary_loss_clip": 0.01540562, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.33856595, "balance_loss_mlp": 1.02156484, "epoch": 0.24536299413798288, "flos": 14875912748160.0, "grad_norm": 1.7918077701428412, "language_loss": 0.84733588, "learning_rate": 3.532867444142186e-06, "loss": 0.87319362, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.23657227, "step": 4081, "time_per_iteration": 2.8599462509155273 }, { "auxiliary_loss_clip": 0.01535605, "auxiliary_loss_mlp": 0.01042514, "balance_loss_clip": 1.33312023, "balance_loss_mlp": 1.01863599, "epoch": 0.24542311739065084, "flos": 35275563463680.0, "grad_norm": 2.0886886451046296, "language_loss": 0.74776137, "learning_rate": 3.532617254729267e-06, "loss": 0.77354252, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.23876953, "step": 4082, "time_per_iteration": 3.0146734714508057 }, { "auxiliary_loss_clip": 0.01543753, "auxiliary_loss_mlp": 0.01042305, "balance_loss_clip": 1.34329128, "balance_loss_mlp": 1.01942873, "epoch": 0.2454832406433188, "flos": 21512088069120.0, "grad_norm": 1.7551364899048196, "language_loss": 0.72628725, "learning_rate": 3.5323670071993485e-06, "loss": 0.75214779, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.2286377, "step": 4083, "time_per_iteration": 2.8884942531585693 }, { "auxiliary_loss_clip": 0.01556115, "auxiliary_loss_mlp": 0.01045287, "balance_loss_clip": 1.3479985, "balance_loss_mlp": 1.01912045, "epoch": 0.24554336389598677, "flos": 14764754977920.0, "grad_norm": 2.0945899474975396, "language_loss": 0.758448, "learning_rate": 3.532116701561919e-06, "loss": 0.78446198, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.26159668, "step": 4084, "time_per_iteration": 2.827548027038574 }, { "auxiliary_loss_clip": 0.01542778, "auxiliary_loss_mlp": 0.01040296, "balance_loss_clip": 1.34062386, "balance_loss_mlp": 1.01687145, "epoch": 0.24560348714865474, "flos": 14984763033600.0, "grad_norm": 1.7272352791747034, "language_loss": 0.86402535, "learning_rate": 3.531866337826471e-06, "loss": 0.8898561, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.23425293, "step": 4085, "time_per_iteration": 2.810129404067993 }, { "auxiliary_loss_clip": 0.01554419, "auxiliary_loss_mlp": 0.01051528, "balance_loss_clip": 1.34897351, "balance_loss_mlp": 1.02675581, "epoch": 0.2456636104013227, "flos": 22685796161280.0, "grad_norm": 1.8950149038432358, "language_loss": 0.80176699, "learning_rate": 3.5316159160024982e-06, "loss": 0.82782644, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.24768066, "step": 4086, "time_per_iteration": 2.954594373703003 }, { "auxiliary_loss_clip": 0.01538319, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.33620095, "balance_loss_mlp": 1.0148623, "epoch": 0.2457237336539907, "flos": 27429411703680.0, "grad_norm": 1.5850204229208797, "language_loss": 0.75713003, "learning_rate": 3.531365436099496e-06, "loss": 0.78288907, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.22717285, "step": 4087, "time_per_iteration": 2.9384562969207764 }, { "auxiliary_loss_clip": 0.01561505, "auxiliary_loss_mlp": 0.01047593, "balance_loss_clip": 1.35675228, "balance_loss_mlp": 1.02255881, "epoch": 0.24578385690665866, "flos": 20422046891520.0, "grad_norm": 2.366702696993089, "language_loss": 0.80769265, "learning_rate": 3.5311148981269635e-06, "loss": 0.83378363, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25024414, "step": 4088, "time_per_iteration": 2.8505074977874756 }, { "auxiliary_loss_clip": 0.01530171, "auxiliary_loss_mlp": 0.01040265, "balance_loss_clip": 1.33140492, "balance_loss_mlp": 1.01684082, "epoch": 0.24584398015932662, "flos": 23926068633600.0, "grad_norm": 1.4072810400846387, "language_loss": 0.77616215, "learning_rate": 3.5308643020944e-06, "loss": 0.80186653, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.234375, "step": 4089, "time_per_iteration": 2.901097536087036 }, { "auxiliary_loss_clip": 0.01541691, "auxiliary_loss_mlp": 0.0104356, "balance_loss_clip": 1.33663487, "balance_loss_mlp": 1.0205642, "epoch": 0.2459041034119946, "flos": 41510253260160.0, "grad_norm": 1.813346053141106, "language_loss": 0.82103157, "learning_rate": 3.530613648011309e-06, "loss": 0.84688413, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.2298584, "step": 4090, "time_per_iteration": 3.008744239807129 }, { "auxiliary_loss_clip": 0.01548201, "auxiliary_loss_mlp": 0.01040514, "balance_loss_clip": 1.34338784, "balance_loss_mlp": 1.01534855, "epoch": 0.24596422666466256, "flos": 19946033902080.0, "grad_norm": 1.7057450910068406, "language_loss": 0.74652076, "learning_rate": 3.5303629358871946e-06, "loss": 0.77240789, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.25183105, "step": 4091, "time_per_iteration": 2.863926887512207 }, { "auxiliary_loss_clip": 0.01548547, "auxiliary_loss_mlp": 0.01042967, "balance_loss_clip": 1.34617341, "balance_loss_mlp": 1.01911378, "epoch": 0.24602434991733052, "flos": 21554600198400.0, "grad_norm": 2.047574358846292, "language_loss": 0.7796098, "learning_rate": 3.5301121657315653e-06, "loss": 0.80552495, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.23852539, "step": 4092, "time_per_iteration": 2.872969388961792 }, { "auxiliary_loss_clip": 0.01565151, "auxiliary_loss_mlp": 0.01051109, "balance_loss_clip": 1.35434425, "balance_loss_mlp": 1.02639651, "epoch": 0.24608447316999849, "flos": 23195408054400.0, "grad_norm": 2.5746071629125518, "language_loss": 0.82919145, "learning_rate": 3.5298613375539287e-06, "loss": 0.85535407, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24755859, "step": 4093, "time_per_iteration": 2.8568148612976074 }, { "auxiliary_loss_clip": 0.01568061, "auxiliary_loss_mlp": 0.01044095, "balance_loss_clip": 1.35908258, "balance_loss_mlp": 1.01850116, "epoch": 0.24614459642266648, "flos": 19650322016640.0, "grad_norm": 1.7985818326356795, "language_loss": 0.88609481, "learning_rate": 3.529610451363797e-06, "loss": 0.91221637, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.25622559, "step": 4094, "time_per_iteration": 2.906229019165039 }, { "auxiliary_loss_clip": 0.01293463, "auxiliary_loss_mlp": 0.01023829, "balance_loss_clip": 1.18143249, "balance_loss_mlp": 1.0058049, "epoch": 0.24620471967533444, "flos": 61766839342080.0, "grad_norm": 0.8192335639071074, "language_loss": 0.57632244, "learning_rate": 3.5293595071706833e-06, "loss": 0.59949529, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.18066406, "step": 4095, "time_per_iteration": 3.483731508255005 }, { "auxiliary_loss_clip": 0.01297974, "auxiliary_loss_mlp": 0.01020958, "balance_loss_clip": 1.18779445, "balance_loss_mlp": 1.00159883, "epoch": 0.2462648429280024, "flos": 69185462555520.0, "grad_norm": 0.6514127323740173, "language_loss": 0.5627017, "learning_rate": 3.5291085049841042e-06, "loss": 0.58589101, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 1.1015625, "router_z_loss_mlp": 0.19335938, "step": 4096, "time_per_iteration": 3.4202136993408203 }, { "auxiliary_loss_clip": 0.01560568, "auxiliary_loss_mlp": 0.01046935, "balance_loss_clip": 1.35347104, "balance_loss_mlp": 1.02297413, "epoch": 0.24632496618067037, "flos": 29471297880960.0, "grad_norm": 1.9120760817027531, "language_loss": 0.78284454, "learning_rate": 3.5288574448135773e-06, "loss": 0.80891961, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.23974609, "step": 4097, "time_per_iteration": 4.400754928588867 }, { "auxiliary_loss_clip": 0.01573687, "auxiliary_loss_mlp": 0.01056039, "balance_loss_clip": 1.35971725, "balance_loss_mlp": 1.02907336, "epoch": 0.24638508943333834, "flos": 24327146954880.0, "grad_norm": 1.9044455490195293, "language_loss": 0.77431715, "learning_rate": 3.5286063266686235e-06, "loss": 0.80061436, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.27001953, "step": 4098, "time_per_iteration": 2.902709484100342 }, { "auxiliary_loss_clip": 0.0155832, "auxiliary_loss_mlp": 0.01051489, "balance_loss_clip": 1.35237598, "balance_loss_mlp": 1.02836227, "epoch": 0.2464452126860063, "flos": 26624087925120.0, "grad_norm": 2.8683576200770746, "language_loss": 0.69863862, "learning_rate": 3.528355150558764e-06, "loss": 0.72473669, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.23120117, "step": 4099, "time_per_iteration": 2.941516876220703 }, { "auxiliary_loss_clip": 0.01549948, "auxiliary_loss_mlp": 0.01051489, "balance_loss_clip": 1.34902382, "balance_loss_mlp": 1.02802813, "epoch": 0.24650533593867427, "flos": 31224937564800.0, "grad_norm": 2.7745483876845882, "language_loss": 0.66782731, "learning_rate": 3.5281039164935237e-06, "loss": 0.6938417, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.23474121, "step": 4100, "time_per_iteration": 2.9128377437591553 }, { "auxiliary_loss_clip": 0.01294438, "auxiliary_loss_mlp": 0.01015944, "balance_loss_clip": 1.18318462, "balance_loss_mlp": 0.99849206, "epoch": 0.24656545919134226, "flos": 68523900065280.0, "grad_norm": 0.7042903294625029, "language_loss": 0.61568069, "learning_rate": 3.5278526244824304e-06, "loss": 0.63878453, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.17480469, "step": 4101, "time_per_iteration": 3.4796142578125 }, { "auxiliary_loss_clip": 0.01548991, "auxiliary_loss_mlp": 0.01049898, "balance_loss_clip": 1.34568357, "balance_loss_mlp": 1.02526927, "epoch": 0.24662558244401023, "flos": 20093821977600.0, "grad_norm": 1.5384883944137095, "language_loss": 0.74234504, "learning_rate": 3.527601274535012e-06, "loss": 0.76833391, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.24609375, "step": 4102, "time_per_iteration": 2.8833229541778564 }, { "auxiliary_loss_clip": 0.01560013, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.35325813, "balance_loss_mlp": 1.0246371, "epoch": 0.2466857056966782, "flos": 30713696858880.0, "grad_norm": 1.9763270026652755, "language_loss": 0.75891072, "learning_rate": 3.5273498666608004e-06, "loss": 0.78499722, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.2401123, "step": 4103, "time_per_iteration": 2.9058477878570557 }, { "auxiliary_loss_clip": 0.01569296, "auxiliary_loss_mlp": 0.01050061, "balance_loss_clip": 1.35890436, "balance_loss_mlp": 1.02631474, "epoch": 0.24674582894934616, "flos": 22538732002560.0, "grad_norm": 1.8657194927541842, "language_loss": 0.79586881, "learning_rate": 3.5270984008693288e-06, "loss": 0.82206237, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.23754883, "step": 4104, "time_per_iteration": 4.2830493450164795 }, { "auxiliary_loss_clip": 0.015482, "auxiliary_loss_mlp": 0.0104942, "balance_loss_clip": 1.34369218, "balance_loss_mlp": 1.02394545, "epoch": 0.24680595220201412, "flos": 20714048703360.0, "grad_norm": 2.048217336683823, "language_loss": 0.84657305, "learning_rate": 3.526846877170133e-06, "loss": 0.8725493, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.2545166, "step": 4105, "time_per_iteration": 2.9072978496551514 }, { "auxiliary_loss_clip": 0.01549938, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.34490216, "balance_loss_mlp": 1.02280295, "epoch": 0.2468660754546821, "flos": 21840132003840.0, "grad_norm": 1.7919197440714565, "language_loss": 0.76977468, "learning_rate": 3.52659529557275e-06, "loss": 0.79573393, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.23193359, "step": 4106, "time_per_iteration": 2.865534782409668 }, { "auxiliary_loss_clip": 0.01558593, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.35232854, "balance_loss_mlp": 1.01964724, "epoch": 0.24692619870735008, "flos": 15275995683840.0, "grad_norm": 2.2030554627586034, "language_loss": 0.73829514, "learning_rate": 3.5263436560867205e-06, "loss": 0.76431787, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.24035645, "step": 4107, "time_per_iteration": 4.289412260055542 }, { "auxiliary_loss_clip": 0.01572962, "auxiliary_loss_mlp": 0.01041706, "balance_loss_clip": 1.36491132, "balance_loss_mlp": 1.01862741, "epoch": 0.24698632196001805, "flos": 29692256077440.0, "grad_norm": 4.194945048330935, "language_loss": 0.66653407, "learning_rate": 3.526091958721587e-06, "loss": 0.69268084, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.23083496, "step": 4108, "time_per_iteration": 2.9575953483581543 }, { "auxiliary_loss_clip": 0.01569527, "auxiliary_loss_mlp": 0.01049614, "balance_loss_clip": 1.36006796, "balance_loss_mlp": 1.02320862, "epoch": 0.247046445212686, "flos": 39179849120640.0, "grad_norm": 1.7209873532949953, "language_loss": 0.73621082, "learning_rate": 3.5258402034868936e-06, "loss": 0.76240224, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.26416016, "step": 4109, "time_per_iteration": 4.443131685256958 }, { "auxiliary_loss_clip": 0.01565444, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.35530627, "balance_loss_mlp": 1.02021253, "epoch": 0.24710656846535398, "flos": 23008139251200.0, "grad_norm": 5.244666295388618, "language_loss": 0.79407692, "learning_rate": 3.5255883903921866e-06, "loss": 0.82019579, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 2.10058594, "router_z_loss_mlp": 0.2623291, "step": 4110, "time_per_iteration": 2.8426079750061035 }, { "auxiliary_loss_clip": 0.01577607, "auxiliary_loss_mlp": 0.01046237, "balance_loss_clip": 1.36764288, "balance_loss_mlp": 1.02146494, "epoch": 0.24716669171802194, "flos": 26444103534720.0, "grad_norm": 2.151932752146124, "language_loss": 0.81960827, "learning_rate": 3.5253365194470144e-06, "loss": 0.84584671, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.2479248, "step": 4111, "time_per_iteration": 2.899045944213867 }, { "auxiliary_loss_clip": 0.01564583, "auxiliary_loss_mlp": 0.01042086, "balance_loss_clip": 1.35476315, "balance_loss_mlp": 1.01882792, "epoch": 0.2472268149706899, "flos": 23339938504320.0, "grad_norm": 1.8933064218841427, "language_loss": 0.75820613, "learning_rate": 3.5250845906609294e-06, "loss": 0.78427279, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.23254395, "step": 4112, "time_per_iteration": 2.87064790725708 }, { "auxiliary_loss_clip": 0.01560512, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.35267353, "balance_loss_mlp": 1.01368368, "epoch": 0.24728693822335787, "flos": 23779185454080.0, "grad_norm": 2.6461200641347324, "language_loss": 0.83264863, "learning_rate": 3.5248326040434835e-06, "loss": 0.85863304, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.24267578, "step": 4113, "time_per_iteration": 2.8673107624053955 }, { "auxiliary_loss_clip": 0.01551523, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.34475076, "balance_loss_mlp": 1.01511717, "epoch": 0.24734706147602586, "flos": 19326802561920.0, "grad_norm": 2.20006467496588, "language_loss": 0.8846696, "learning_rate": 3.5245805596042322e-06, "loss": 0.91058517, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24926758, "step": 4114, "time_per_iteration": 2.8257949352264404 }, { "auxiliary_loss_clip": 0.01549332, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.34278095, "balance_loss_mlp": 1.013592, "epoch": 0.24740718472869383, "flos": 28048326330240.0, "grad_norm": 2.124041177375743, "language_loss": 0.76166427, "learning_rate": 3.524328457352734e-06, "loss": 0.78753293, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.23913574, "step": 4115, "time_per_iteration": 2.922628402709961 }, { "auxiliary_loss_clip": 0.01285802, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.17146206, "balance_loss_mlp": 1.00680697, "epoch": 0.2474673079813618, "flos": 68141236377600.0, "grad_norm": 0.6944937417467176, "language_loss": 0.5841136, "learning_rate": 3.5240762972985475e-06, "loss": 0.60721612, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.17675781, "step": 4116, "time_per_iteration": 3.4839670658111572 }, { "auxiliary_loss_clip": 0.01556195, "auxiliary_loss_mlp": 0.01045268, "balance_loss_clip": 1.34922743, "balance_loss_mlp": 1.02078259, "epoch": 0.24752743123402976, "flos": 29474465016960.0, "grad_norm": 1.6386755091344234, "language_loss": 0.84182668, "learning_rate": 3.523824079451235e-06, "loss": 0.8678413, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24499512, "step": 4117, "time_per_iteration": 2.917466402053833 }, { "auxiliary_loss_clip": 0.01286704, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.17173672, "balance_loss_mlp": 1.00738704, "epoch": 0.24758755448669773, "flos": 58377594954240.0, "grad_norm": 0.8966812671241432, "language_loss": 0.63467383, "learning_rate": 3.5235718038203602e-06, "loss": 0.65778446, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.16992188, "step": 4118, "time_per_iteration": 3.164216995239258 }, { "auxiliary_loss_clip": 0.0155157, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.34608448, "balance_loss_mlp": 1.01683307, "epoch": 0.2476476777393657, "flos": 20493859668480.0, "grad_norm": 2.2635271845579688, "language_loss": 0.80151308, "learning_rate": 3.523319470415491e-06, "loss": 0.82743806, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.2409668, "step": 4119, "time_per_iteration": 2.847882032394409 }, { "auxiliary_loss_clip": 0.01541145, "auxiliary_loss_mlp": 0.01044582, "balance_loss_clip": 1.33763826, "balance_loss_mlp": 1.02027547, "epoch": 0.24770780099203366, "flos": 20495669460480.0, "grad_norm": 2.58032762765821, "language_loss": 0.75201511, "learning_rate": 3.5230670792461943e-06, "loss": 0.77787232, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.24304199, "step": 4120, "time_per_iteration": 2.8408749103546143 }, { "auxiliary_loss_clip": 0.01541548, "auxiliary_loss_mlp": 0.01047365, "balance_loss_clip": 1.33533859, "balance_loss_mlp": 1.02283168, "epoch": 0.24776792424470165, "flos": 15160403923200.0, "grad_norm": 2.0802273664543174, "language_loss": 0.89138079, "learning_rate": 3.522814630322041e-06, "loss": 0.91726995, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.24511719, "step": 4121, "time_per_iteration": 2.8349595069885254 }, { "auxiliary_loss_clip": 0.01558679, "auxiliary_loss_mlp": 0.0104009, "balance_loss_clip": 1.34981775, "balance_loss_mlp": 1.01488876, "epoch": 0.2478280474973696, "flos": 21735353750400.0, "grad_norm": 2.0962796903888536, "language_loss": 0.70904028, "learning_rate": 3.5225621236526045e-06, "loss": 0.73502803, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.25231934, "step": 4122, "time_per_iteration": 2.824787139892578 }, { "auxiliary_loss_clip": 0.01545081, "auxiliary_loss_mlp": 0.01043738, "balance_loss_clip": 1.33681202, "balance_loss_mlp": 1.01775002, "epoch": 0.24788817075003758, "flos": 20421820667520.0, "grad_norm": 3.0800651570564646, "language_loss": 0.80694616, "learning_rate": 3.5223095592474596e-06, "loss": 0.83283436, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.2598877, "step": 4123, "time_per_iteration": 2.8626742362976074 }, { "auxiliary_loss_clip": 0.01536376, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.33249378, "balance_loss_mlp": 1.01740754, "epoch": 0.24794829400270554, "flos": 22602853163520.0, "grad_norm": 2.3377607126552657, "language_loss": 0.75539041, "learning_rate": 3.5220569371161846e-06, "loss": 0.78118479, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.25646973, "step": 4124, "time_per_iteration": 2.9859230518341064 }, { "auxiliary_loss_clip": 0.01540795, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.33808208, "balance_loss_mlp": 1.01621747, "epoch": 0.2480084172553735, "flos": 39690501644160.0, "grad_norm": 1.4101275848800583, "language_loss": 0.74208021, "learning_rate": 3.521804257268357e-06, "loss": 0.76788771, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.23718262, "step": 4125, "time_per_iteration": 3.105130910873413 }, { "auxiliary_loss_clip": 0.01571795, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.35775042, "balance_loss_mlp": 1.02069414, "epoch": 0.24806854050804147, "flos": 22063669153920.0, "grad_norm": 2.0233768407485377, "language_loss": 0.69749087, "learning_rate": 3.5215515197135595e-06, "loss": 0.72365952, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.24377441, "step": 4126, "time_per_iteration": 2.858269453048706 }, { "auxiliary_loss_clip": 0.01530659, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.3251946, "balance_loss_mlp": 1.01874042, "epoch": 0.24812866376070947, "flos": 15495144088320.0, "grad_norm": 2.0573556051273822, "language_loss": 0.82499254, "learning_rate": 3.5212987244613764e-06, "loss": 0.85073686, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.25048828, "step": 4127, "time_per_iteration": 2.82283091545105 }, { "auxiliary_loss_clip": 0.01541804, "auxiliary_loss_mlp": 0.01040619, "balance_loss_clip": 1.33517313, "balance_loss_mlp": 1.01664615, "epoch": 0.24818878701337743, "flos": 14765705118720.0, "grad_norm": 2.180847698415879, "language_loss": 0.85056186, "learning_rate": 3.5210458715213927e-06, "loss": 0.87638605, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.23974609, "step": 4128, "time_per_iteration": 2.8791873455047607 }, { "auxiliary_loss_clip": 0.01557207, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.34750247, "balance_loss_mlp": 1.02077079, "epoch": 0.2482489102660454, "flos": 27100960565760.0, "grad_norm": 3.0121155658487693, "language_loss": 0.66878819, "learning_rate": 3.5207929609031973e-06, "loss": 0.69481623, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24829102, "step": 4129, "time_per_iteration": 2.882044792175293 }, { "auxiliary_loss_clip": 0.01535609, "auxiliary_loss_mlp": 0.01045233, "balance_loss_clip": 1.32907009, "balance_loss_mlp": 1.01912594, "epoch": 0.24830903351871336, "flos": 26478381110400.0, "grad_norm": 1.6167037300283822, "language_loss": 0.75998193, "learning_rate": 3.5205399926163806e-06, "loss": 0.78579032, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.2611084, "step": 4130, "time_per_iteration": 2.895861864089966 }, { "auxiliary_loss_clip": 0.0153372, "auxiliary_loss_mlp": 0.01047536, "balance_loss_clip": 1.328251, "balance_loss_mlp": 1.02276444, "epoch": 0.24836915677138133, "flos": 10234315526400.0, "grad_norm": 4.354161046787067, "language_loss": 0.78188908, "learning_rate": 3.520286966670535e-06, "loss": 0.80770165, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.24780273, "step": 4131, "time_per_iteration": 2.941476583480835 }, { "auxiliary_loss_clip": 0.01530441, "auxiliary_loss_mlp": 0.01039262, "balance_loss_clip": 1.32865667, "balance_loss_mlp": 1.0148716, "epoch": 0.2484292800240493, "flos": 30092429502720.0, "grad_norm": 1.5786114224072478, "language_loss": 0.84938645, "learning_rate": 3.520033883075255e-06, "loss": 0.87508345, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.24414062, "step": 4132, "time_per_iteration": 4.39780855178833 }, { "auxiliary_loss_clip": 0.01542177, "auxiliary_loss_mlp": 0.01042296, "balance_loss_clip": 1.33744574, "balance_loss_mlp": 1.01585567, "epoch": 0.24848940327671726, "flos": 13450633712640.0, "grad_norm": 2.213269750036648, "language_loss": 0.72392821, "learning_rate": 3.5197807418401386e-06, "loss": 0.74977291, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.26428223, "step": 4133, "time_per_iteration": 2.821784496307373 }, { "auxiliary_loss_clip": 0.01580571, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.36352706, "balance_loss_mlp": 1.01943851, "epoch": 0.24854952652938525, "flos": 19978682664960.0, "grad_norm": 2.250191465096188, "language_loss": 0.63015759, "learning_rate": 3.5195275429747834e-06, "loss": 0.65642947, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.27160645, "step": 4134, "time_per_iteration": 2.9214653968811035 }, { "auxiliary_loss_clip": 0.01551589, "auxiliary_loss_mlp": 0.01043089, "balance_loss_clip": 1.34336793, "balance_loss_mlp": 1.01791239, "epoch": 0.24860964978205322, "flos": 18159247762560.0, "grad_norm": 1.9121575264078476, "language_loss": 0.79356509, "learning_rate": 3.5192742864887914e-06, "loss": 0.81951189, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.2520752, "step": 4135, "time_per_iteration": 2.834266424179077 }, { "auxiliary_loss_clip": 0.01552683, "auxiliary_loss_mlp": 0.010434, "balance_loss_clip": 1.34585977, "balance_loss_mlp": 1.0189265, "epoch": 0.24866977303472118, "flos": 11736022308480.0, "grad_norm": 2.373647640151077, "language_loss": 0.84245098, "learning_rate": 3.5190209723917662e-06, "loss": 0.86841178, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24511719, "step": 4136, "time_per_iteration": 2.8805932998657227 }, { "auxiliary_loss_clip": 0.015648, "auxiliary_loss_mlp": 0.01042624, "balance_loss_clip": 1.35274005, "balance_loss_mlp": 1.01818585, "epoch": 0.24872989628738915, "flos": 34836814206720.0, "grad_norm": 1.6043786979033687, "language_loss": 0.72031641, "learning_rate": 3.518767600693314e-06, "loss": 0.74639064, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.24450684, "step": 4137, "time_per_iteration": 2.956493377685547 }, { "auxiliary_loss_clip": 0.01565997, "auxiliary_loss_mlp": 0.01044528, "balance_loss_clip": 1.35433137, "balance_loss_mlp": 1.02036428, "epoch": 0.2487900195400571, "flos": 13707362563200.0, "grad_norm": 2.035887232189882, "language_loss": 0.68102264, "learning_rate": 3.518514171403042e-06, "loss": 0.70712793, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 2.11425781, "router_z_loss_mlp": 0.24157715, "step": 4138, "time_per_iteration": 2.8355743885040283 }, { "auxiliary_loss_clip": 0.01547981, "auxiliary_loss_mlp": 0.01041441, "balance_loss_clip": 1.34353197, "balance_loss_mlp": 1.01808834, "epoch": 0.24885014279272508, "flos": 25348723470720.0, "grad_norm": 1.8602379610798447, "language_loss": 0.85181248, "learning_rate": 3.51826068453056e-06, "loss": 0.87770671, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.23352051, "step": 4139, "time_per_iteration": 2.8749048709869385 }, { "auxiliary_loss_clip": 0.01566811, "auxiliary_loss_mlp": 0.0105019, "balance_loss_clip": 1.35222673, "balance_loss_mlp": 1.02390385, "epoch": 0.24891026604539307, "flos": 20641195296000.0, "grad_norm": 1.51579934623308, "language_loss": 0.79580003, "learning_rate": 3.518007140085481e-06, "loss": 0.82196999, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 2.14453125, "router_z_loss_mlp": 0.26281738, "step": 4140, "time_per_iteration": 4.284176826477051 }, { "auxiliary_loss_clip": 0.01314255, "auxiliary_loss_mlp": 0.01030413, "balance_loss_clip": 1.1912508, "balance_loss_mlp": 1.00599909, "epoch": 0.24897038929806103, "flos": 66989454030720.0, "grad_norm": 0.8386226834367205, "language_loss": 0.61086941, "learning_rate": 3.51775353807742e-06, "loss": 0.63431609, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.24414062, "step": 4141, "time_per_iteration": 4.808851718902588 }, { "auxiliary_loss_clip": 0.01569104, "auxiliary_loss_mlp": 0.01046549, "balance_loss_clip": 1.35810602, "balance_loss_mlp": 1.02176547, "epoch": 0.249030512550729, "flos": 36406216488960.0, "grad_norm": 1.7673376466490704, "language_loss": 0.73644614, "learning_rate": 3.5174998785159913e-06, "loss": 0.76260269, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24804688, "step": 4142, "time_per_iteration": 3.0371172428131104 }, { "auxiliary_loss_clip": 0.01551135, "auxiliary_loss_mlp": 0.01045075, "balance_loss_clip": 1.34466648, "balance_loss_mlp": 1.02112567, "epoch": 0.24909063580339696, "flos": 20163553493760.0, "grad_norm": 2.3074238284393966, "language_loss": 0.82060218, "learning_rate": 3.5172461614108157e-06, "loss": 0.84656429, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.23950195, "step": 4143, "time_per_iteration": 2.8668746948242188 }, { "auxiliary_loss_clip": 0.01551768, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.34482682, "balance_loss_mlp": 1.02095723, "epoch": 0.24915075905606493, "flos": 26407925677440.0, "grad_norm": 2.0285207578531255, "language_loss": 0.59896815, "learning_rate": 3.5169923867715137e-06, "loss": 0.62493575, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24060059, "step": 4144, "time_per_iteration": 4.340224027633667 }, { "auxiliary_loss_clip": 0.01536908, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.33105135, "balance_loss_mlp": 1.01372313, "epoch": 0.2492108823087329, "flos": 27538759681920.0, "grad_norm": 2.3742930967975107, "language_loss": 0.7994616, "learning_rate": 3.516738554607708e-06, "loss": 0.82520795, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.23999023, "step": 4145, "time_per_iteration": 2.956942319869995 }, { "auxiliary_loss_clip": 0.01568417, "auxiliary_loss_mlp": 0.01048247, "balance_loss_clip": 1.35352504, "balance_loss_mlp": 1.01950526, "epoch": 0.24927100556140086, "flos": 16700234088960.0, "grad_norm": 2.150635292223525, "language_loss": 0.66812062, "learning_rate": 3.5164846649290253e-06, "loss": 0.6942873, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.28771973, "step": 4146, "time_per_iteration": 2.843592643737793 }, { "auxiliary_loss_clip": 0.01310694, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.19017816, "balance_loss_mlp": 1.00590181, "epoch": 0.24933112881406885, "flos": 62802622725120.0, "grad_norm": 0.9373230418655215, "language_loss": 0.67289877, "learning_rate": 3.5162307177450915e-06, "loss": 0.69629741, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.23242188, "step": 4147, "time_per_iteration": 3.4775214195251465 }, { "auxiliary_loss_clip": 0.01549029, "auxiliary_loss_mlp": 0.01043582, "balance_loss_clip": 1.34282041, "balance_loss_mlp": 1.0179286, "epoch": 0.24939125206673682, "flos": 26663297184000.0, "grad_norm": 1.8537607040913286, "language_loss": 0.89696056, "learning_rate": 3.5159767130655366e-06, "loss": 0.92288661, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.2565918, "step": 4148, "time_per_iteration": 2.945061683654785 }, { "auxiliary_loss_clip": 0.01566273, "auxiliary_loss_mlp": 0.01047232, "balance_loss_clip": 1.35335863, "balance_loss_mlp": 1.01964664, "epoch": 0.24945137531940478, "flos": 20714093948160.0, "grad_norm": 1.840706232990269, "language_loss": 0.69624335, "learning_rate": 3.5157226508999935e-06, "loss": 0.72237843, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.27600098, "step": 4149, "time_per_iteration": 2.8541877269744873 }, { "auxiliary_loss_clip": 0.01551508, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.34472251, "balance_loss_mlp": 1.01431334, "epoch": 0.24951149857207275, "flos": 23779094964480.0, "grad_norm": 1.8718381932563004, "language_loss": 0.72190928, "learning_rate": 3.515468531258095e-06, "loss": 0.74782461, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.25708008, "step": 4150, "time_per_iteration": 2.966066837310791 }, { "auxiliary_loss_clip": 0.01548874, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.33679175, "balance_loss_mlp": 1.01464319, "epoch": 0.2495716218247407, "flos": 15672187566720.0, "grad_norm": 2.109604906928087, "language_loss": 0.7362417, "learning_rate": 3.515214354149478e-06, "loss": 0.76213157, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2545166, "step": 4151, "time_per_iteration": 2.818089008331299 }, { "auxiliary_loss_clip": 0.01571111, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.35430884, "balance_loss_mlp": 1.0138917, "epoch": 0.24963174507740868, "flos": 24060916696320.0, "grad_norm": 2.974834033877862, "language_loss": 0.65797788, "learning_rate": 3.514960119583781e-06, "loss": 0.68407112, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 2.16601562, "router_z_loss_mlp": 0.24328613, "step": 4152, "time_per_iteration": 2.8644957542419434 }, { "auxiliary_loss_clip": 0.01536699, "auxiliary_loss_mlp": 0.0104137, "balance_loss_clip": 1.3329078, "balance_loss_mlp": 1.01653814, "epoch": 0.24969186833007664, "flos": 21809700236160.0, "grad_norm": 1.9344260272248734, "language_loss": 0.78211004, "learning_rate": 3.514705827570645e-06, "loss": 0.80789077, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.24853516, "step": 4153, "time_per_iteration": 2.8515055179595947 }, { "auxiliary_loss_clip": 0.01544796, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.33732486, "balance_loss_mlp": 1.01790202, "epoch": 0.24975199158274464, "flos": 19947707959680.0, "grad_norm": 1.8393199048058204, "language_loss": 0.77895445, "learning_rate": 3.514451478119711e-06, "loss": 0.80483639, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.25500488, "step": 4154, "time_per_iteration": 2.8365697860717773 }, { "auxiliary_loss_clip": 0.01562775, "auxiliary_loss_mlp": 0.01045358, "balance_loss_clip": 1.34551513, "balance_loss_mlp": 1.01853561, "epoch": 0.2498121148354126, "flos": 25349447387520.0, "grad_norm": 3.1480393824861337, "language_loss": 0.71448779, "learning_rate": 3.5141970712406258e-06, "loss": 0.74056911, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 2.171875, "router_z_loss_mlp": 0.26831055, "step": 4155, "time_per_iteration": 2.954972267150879 }, { "auxiliary_loss_clip": 0.01561155, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.34666371, "balance_loss_mlp": 1.01768589, "epoch": 0.24987223808808057, "flos": 20568613357440.0, "grad_norm": 5.027013849976075, "language_loss": 0.75961411, "learning_rate": 3.513942606943036e-06, "loss": 0.78564978, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.24743652, "step": 4156, "time_per_iteration": 2.844348669052124 }, { "auxiliary_loss_clip": 0.01536613, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.32984984, "balance_loss_mlp": 1.01577163, "epoch": 0.24993236134074853, "flos": 19756819572480.0, "grad_norm": 2.2572891174708833, "language_loss": 0.77839047, "learning_rate": 3.513688085236591e-06, "loss": 0.80415869, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24438477, "step": 4157, "time_per_iteration": 2.8707756996154785 }, { "auxiliary_loss_clip": 0.01551269, "auxiliary_loss_mlp": 0.01049222, "balance_loss_clip": 1.34053791, "balance_loss_mlp": 1.02355611, "epoch": 0.2499924845934165, "flos": 18779474488320.0, "grad_norm": 1.9600691760994138, "language_loss": 0.82339525, "learning_rate": 3.513433506130942e-06, "loss": 0.84940016, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25671387, "step": 4158, "time_per_iteration": 2.8424744606018066 }, { "auxiliary_loss_clip": 0.01544118, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.3344692, "balance_loss_mlp": 1.01742899, "epoch": 0.25005260784608446, "flos": 16880670927360.0, "grad_norm": 3.102606699580561, "language_loss": 0.76927948, "learning_rate": 3.5131788696357427e-06, "loss": 0.79514444, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24963379, "step": 4159, "time_per_iteration": 2.8461995124816895 }, { "auxiliary_loss_clip": 0.01560712, "auxiliary_loss_mlp": 0.01047704, "balance_loss_clip": 1.34614587, "balance_loss_mlp": 1.02259898, "epoch": 0.2501127310987524, "flos": 22134441300480.0, "grad_norm": 1.7265624399435509, "language_loss": 0.71721274, "learning_rate": 3.512924175760649e-06, "loss": 0.74329692, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 2.14257812, "router_z_loss_mlp": 0.25134277, "step": 4160, "time_per_iteration": 2.8609492778778076 }, { "auxiliary_loss_clip": 0.01315628, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.18603945, "balance_loss_mlp": 1.00246847, "epoch": 0.2501728543514204, "flos": 69492214172160.0, "grad_norm": 0.747175042815921, "language_loss": 0.56823742, "learning_rate": 3.5126694245153186e-06, "loss": 0.59167969, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.26171875, "step": 4161, "time_per_iteration": 3.4035096168518066 }, { "auxiliary_loss_clip": 0.01561925, "auxiliary_loss_mlp": 0.01045671, "balance_loss_clip": 1.34488273, "balance_loss_mlp": 1.01963532, "epoch": 0.25023297760408836, "flos": 16298703319680.0, "grad_norm": 1.893776371134238, "language_loss": 0.8211239, "learning_rate": 3.5124146159094125e-06, "loss": 0.84719992, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 2.16992188, "router_z_loss_mlp": 0.26049805, "step": 4162, "time_per_iteration": 2.870893955230713 }, { "auxiliary_loss_clip": 0.01551594, "auxiliary_loss_mlp": 0.01050661, "balance_loss_clip": 1.33702791, "balance_loss_mlp": 1.02484012, "epoch": 0.2502931008567563, "flos": 12245136508800.0, "grad_norm": 2.3838014003026524, "language_loss": 0.88711321, "learning_rate": 3.5121597499525927e-06, "loss": 0.91313577, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 2.14648438, "router_z_loss_mlp": 0.25817871, "step": 4163, "time_per_iteration": 2.8320767879486084 }, { "auxiliary_loss_clip": 0.01558015, "auxiliary_loss_mlp": 0.01046184, "balance_loss_clip": 1.34607971, "balance_loss_mlp": 1.02050686, "epoch": 0.25035322410942434, "flos": 23191381267200.0, "grad_norm": 1.7011125765934234, "language_loss": 0.8414377, "learning_rate": 3.5119048266545232e-06, "loss": 0.86747968, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.2565918, "step": 4164, "time_per_iteration": 2.901123285293579 }, { "auxiliary_loss_clip": 0.01520591, "auxiliary_loss_mlp": 0.01047904, "balance_loss_clip": 1.31722438, "balance_loss_mlp": 1.02223873, "epoch": 0.2504133473620923, "flos": 20925867450240.0, "grad_norm": 1.688983524663447, "language_loss": 0.75229323, "learning_rate": 3.5116498460248716e-06, "loss": 0.77797812, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.2565918, "step": 4165, "time_per_iteration": 2.9231045246124268 }, { "auxiliary_loss_clip": 0.015417, "auxiliary_loss_mlp": 0.01050601, "balance_loss_clip": 1.32904506, "balance_loss_mlp": 1.02505469, "epoch": 0.2504734706147603, "flos": 20786132949120.0, "grad_norm": 1.8006735071649989, "language_loss": 0.7426796, "learning_rate": 3.5113948080733062e-06, "loss": 0.76860261, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 2.12792969, "router_z_loss_mlp": 0.25561523, "step": 4166, "time_per_iteration": 2.8615317344665527 }, { "auxiliary_loss_clip": 0.01528713, "auxiliary_loss_mlp": 0.01050988, "balance_loss_clip": 1.31936371, "balance_loss_mlp": 1.02522695, "epoch": 0.25053359386742824, "flos": 24359162290560.0, "grad_norm": 1.956298370550903, "language_loss": 0.82273245, "learning_rate": 3.5111397128094973e-06, "loss": 0.84852946, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.25720215, "step": 4167, "time_per_iteration": 4.385786056518555 }, { "auxiliary_loss_clip": 0.01527819, "auxiliary_loss_mlp": 0.01041843, "balance_loss_clip": 1.32215858, "balance_loss_mlp": 1.01649928, "epoch": 0.2505937171200962, "flos": 21224022554880.0, "grad_norm": 2.0543361420483786, "language_loss": 0.81217974, "learning_rate": 3.51088456024312e-06, "loss": 0.83787632, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.25390625, "step": 4168, "time_per_iteration": 2.8361828327178955 }, { "auxiliary_loss_clip": 0.01559574, "auxiliary_loss_mlp": 0.01043592, "balance_loss_clip": 1.34217596, "balance_loss_mlp": 1.01659155, "epoch": 0.25065384037276417, "flos": 41442105312000.0, "grad_norm": 2.0885710463698035, "language_loss": 0.70115638, "learning_rate": 3.510629350383849e-06, "loss": 0.72718811, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 2.17382812, "router_z_loss_mlp": 0.27001953, "step": 4169, "time_per_iteration": 3.017787456512451 }, { "auxiliary_loss_clip": 0.01528144, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.32069087, "balance_loss_mlp": 1.01965141, "epoch": 0.25071396362543213, "flos": 26113073443200.0, "grad_norm": 3.6430497085183573, "language_loss": 0.78590167, "learning_rate": 3.510374083241361e-06, "loss": 0.81162822, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.2487793, "step": 4170, "time_per_iteration": 2.928492546081543 }, { "auxiliary_loss_clip": 0.01536518, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.32550764, "balance_loss_mlp": 1.01708496, "epoch": 0.2507740868781001, "flos": 19108332829440.0, "grad_norm": 2.1207014495117362, "language_loss": 0.77624774, "learning_rate": 3.5101187588253368e-06, "loss": 0.8020516, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.2677002, "step": 4171, "time_per_iteration": 2.8056704998016357 }, { "auxiliary_loss_clip": 0.01301313, "auxiliary_loss_mlp": 0.01048854, "balance_loss_clip": 1.17628121, "balance_loss_mlp": 1.01852667, "epoch": 0.25083421013076806, "flos": 64372567944960.0, "grad_norm": 0.9371618396865574, "language_loss": 0.60094047, "learning_rate": 3.509863377145458e-06, "loss": 0.6244421, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.30273438, "step": 4172, "time_per_iteration": 3.351564407348633 }, { "auxiliary_loss_clip": 0.01532683, "auxiliary_loss_mlp": 0.01046202, "balance_loss_clip": 1.32163262, "balance_loss_mlp": 1.02005947, "epoch": 0.25089433338343603, "flos": 24290199936000.0, "grad_norm": 1.5938772910401175, "language_loss": 0.79872042, "learning_rate": 3.509607938211409e-06, "loss": 0.82450926, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 2.10839844, "router_z_loss_mlp": 0.26159668, "step": 4173, "time_per_iteration": 2.8763394355773926 }, { "auxiliary_loss_clip": 0.0154076, "auxiliary_loss_mlp": 0.01052544, "balance_loss_clip": 1.330814, "balance_loss_mlp": 1.02686679, "epoch": 0.250954456636104, "flos": 14729889219840.0, "grad_norm": 3.20039148214578, "language_loss": 0.84098446, "learning_rate": 3.509352442032875e-06, "loss": 0.86691749, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.25695801, "step": 4174, "time_per_iteration": 4.23157525062561 }, { "auxiliary_loss_clip": 0.0153795, "auxiliary_loss_mlp": 0.01044158, "balance_loss_clip": 1.32779765, "balance_loss_mlp": 1.01924372, "epoch": 0.25101457988877196, "flos": 22283858188800.0, "grad_norm": 1.9827364250993678, "language_loss": 0.72213364, "learning_rate": 3.509096888619545e-06, "loss": 0.74795473, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.24914551, "step": 4175, "time_per_iteration": 2.88515305519104 }, { "auxiliary_loss_clip": 0.01541232, "auxiliary_loss_mlp": 0.01041888, "balance_loss_clip": 1.32756782, "balance_loss_mlp": 1.01531625, "epoch": 0.2510747031414399, "flos": 25199035113600.0, "grad_norm": 2.0307854223475563, "language_loss": 0.82128352, "learning_rate": 3.50884127798111e-06, "loss": 0.84711474, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.26574707, "step": 4176, "time_per_iteration": 2.8935353755950928 }, { "auxiliary_loss_clip": 0.01535028, "auxiliary_loss_mlp": 0.01042244, "balance_loss_clip": 1.32412231, "balance_loss_mlp": 1.01512432, "epoch": 0.25113482639410795, "flos": 20713912968960.0, "grad_norm": 2.0573190221891906, "language_loss": 0.83679163, "learning_rate": 3.5085856101272623e-06, "loss": 0.86256433, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.27124023, "step": 4177, "time_per_iteration": 4.202538251876831 }, { "auxiliary_loss_clip": 0.01532499, "auxiliary_loss_mlp": 0.01048559, "balance_loss_clip": 1.32371736, "balance_loss_mlp": 1.02306008, "epoch": 0.2511949496467759, "flos": 21516884017920.0, "grad_norm": 3.9475979705968003, "language_loss": 0.83931816, "learning_rate": 3.508329885067698e-06, "loss": 0.86512876, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.25476074, "step": 4178, "time_per_iteration": 2.971217632293701 }, { "auxiliary_loss_clip": 0.01523931, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.31763315, "balance_loss_mlp": 1.01846159, "epoch": 0.2512550728994439, "flos": 20711153036160.0, "grad_norm": 2.3139927911324976, "language_loss": 0.76380014, "learning_rate": 3.508074102812112e-06, "loss": 0.78946555, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.24145508, "step": 4179, "time_per_iteration": 4.259655714035034 }, { "auxiliary_loss_clip": 0.0153536, "auxiliary_loss_mlp": 0.01053319, "balance_loss_clip": 1.32370067, "balance_loss_mlp": 1.02760601, "epoch": 0.25131519615211184, "flos": 18487789390080.0, "grad_norm": 1.9625158235792273, "language_loss": 0.71811378, "learning_rate": 3.507818263370206e-06, "loss": 0.74400061, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.25720215, "step": 4180, "time_per_iteration": 2.916630744934082 }, { "auxiliary_loss_clip": 0.01529193, "auxiliary_loss_mlp": 0.01054008, "balance_loss_clip": 1.32307267, "balance_loss_mlp": 1.02859282, "epoch": 0.2513753194047798, "flos": 20494538340480.0, "grad_norm": 2.6340340189286313, "language_loss": 0.86067271, "learning_rate": 3.5075623667516796e-06, "loss": 0.88650471, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.25427246, "step": 4181, "time_per_iteration": 2.867891311645508 }, { "auxiliary_loss_clip": 0.01527204, "auxiliary_loss_mlp": 0.01050235, "balance_loss_clip": 1.31958175, "balance_loss_mlp": 1.02495027, "epoch": 0.25143544265744777, "flos": 37684205141760.0, "grad_norm": 2.612496587648746, "language_loss": 0.69849813, "learning_rate": 3.507306412966238e-06, "loss": 0.72427249, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.25292969, "step": 4182, "time_per_iteration": 3.0725393295288086 }, { "auxiliary_loss_clip": 0.01301271, "auxiliary_loss_mlp": 0.01063787, "balance_loss_clip": 1.17256773, "balance_loss_mlp": 1.03899157, "epoch": 0.25149556591011574, "flos": 69397009833600.0, "grad_norm": 0.9848465998454152, "language_loss": 0.70206356, "learning_rate": 3.5070504020235853e-06, "loss": 0.72571415, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.24804688, "step": 4183, "time_per_iteration": 3.4219133853912354 }, { "auxiliary_loss_clip": 0.01531881, "auxiliary_loss_mlp": 0.01047217, "balance_loss_clip": 1.3214525, "balance_loss_mlp": 1.02214742, "epoch": 0.2515556891627837, "flos": 13998278499840.0, "grad_norm": 1.7110555973656565, "language_loss": 0.75452155, "learning_rate": 3.506794333933431e-06, "loss": 0.78031254, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.25061035, "step": 4184, "time_per_iteration": 2.978550910949707 }, { "auxiliary_loss_clip": 0.01529584, "auxiliary_loss_mlp": 0.01049241, "balance_loss_clip": 1.32228577, "balance_loss_mlp": 1.02418351, "epoch": 0.25161581241545167, "flos": 22173605314560.0, "grad_norm": 1.7464299157045184, "language_loss": 0.83694935, "learning_rate": 3.506538208705484e-06, "loss": 0.86273766, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.25036621, "step": 4185, "time_per_iteration": 2.9578592777252197 }, { "auxiliary_loss_clip": 0.01309932, "auxiliary_loss_mlp": 0.0102033, "balance_loss_clip": 1.18226206, "balance_loss_mlp": 1.00039816, "epoch": 0.25167593566811963, "flos": 69385608144000.0, "grad_norm": 0.7901497563145897, "language_loss": 0.61514294, "learning_rate": 3.5062820263494574e-06, "loss": 0.6384455, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.19921875, "step": 4186, "time_per_iteration": 3.287594795227051 }, { "auxiliary_loss_clip": 0.01531929, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.32294226, "balance_loss_mlp": 1.01773643, "epoch": 0.2517360589207876, "flos": 13269337223040.0, "grad_norm": 1.8831487239081453, "language_loss": 0.80176532, "learning_rate": 3.5060257868750656e-06, "loss": 0.82751977, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.25817871, "step": 4187, "time_per_iteration": 2.9569191932678223 }, { "auxiliary_loss_clip": 0.01527167, "auxiliary_loss_mlp": 0.01048781, "balance_loss_clip": 1.32097793, "balance_loss_mlp": 1.02291274, "epoch": 0.25179618217345556, "flos": 20386728685440.0, "grad_norm": 1.4860743309875963, "language_loss": 0.80742657, "learning_rate": 3.5057694902920244e-06, "loss": 0.83318609, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.25878906, "step": 4188, "time_per_iteration": 2.869967222213745 }, { "auxiliary_loss_clip": 0.01541411, "auxiliary_loss_mlp": 0.01046458, "balance_loss_clip": 1.33296895, "balance_loss_mlp": 1.02224672, "epoch": 0.25185630542612353, "flos": 27674105437440.0, "grad_norm": 1.8490157684553794, "language_loss": 0.75509262, "learning_rate": 3.5055131366100534e-06, "loss": 0.78097129, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.24182129, "step": 4189, "time_per_iteration": 2.937288999557495 }, { "auxiliary_loss_clip": 0.01521506, "auxiliary_loss_mlp": 0.0104703, "balance_loss_clip": 1.31734574, "balance_loss_mlp": 1.02421319, "epoch": 0.25191642867879155, "flos": 21006321984000.0, "grad_norm": 1.9743094490745905, "language_loss": 0.8580417, "learning_rate": 3.5052567258388745e-06, "loss": 0.88372707, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.22827148, "step": 4190, "time_per_iteration": 2.832486867904663 }, { "auxiliary_loss_clip": 0.015235, "auxiliary_loss_mlp": 0.01054588, "balance_loss_clip": 1.31599689, "balance_loss_mlp": 1.02814698, "epoch": 0.2519765519314595, "flos": 21115534227840.0, "grad_norm": 2.377170522915985, "language_loss": 0.76019627, "learning_rate": 3.5050002579882082e-06, "loss": 0.78597713, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.26452637, "step": 4191, "time_per_iteration": 2.876530408859253 }, { "auxiliary_loss_clip": 0.01297433, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.17525125, "balance_loss_mlp": 1.01009011, "epoch": 0.2520366751841275, "flos": 62777484599040.0, "grad_norm": 0.7207595514930212, "language_loss": 0.57275057, "learning_rate": 3.5047437330677823e-06, "loss": 0.59602988, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.20410156, "step": 4192, "time_per_iteration": 3.453383684158325 }, { "auxiliary_loss_clip": 0.01531145, "auxiliary_loss_mlp": 0.01052255, "balance_loss_clip": 1.32703948, "balance_loss_mlp": 1.02682805, "epoch": 0.25209679843679544, "flos": 22239310043520.0, "grad_norm": 2.022261416183438, "language_loss": 0.76602805, "learning_rate": 3.504487151087323e-06, "loss": 0.79186201, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.25427246, "step": 4193, "time_per_iteration": 2.835674285888672 }, { "auxiliary_loss_clip": 0.0152968, "auxiliary_loss_mlp": 0.01050561, "balance_loss_clip": 1.32064915, "balance_loss_mlp": 1.02499115, "epoch": 0.2521569216894634, "flos": 12174183383040.0, "grad_norm": 2.0979232296172614, "language_loss": 0.84814847, "learning_rate": 3.5042305120565598e-06, "loss": 0.8739509, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.2557373, "step": 4194, "time_per_iteration": 2.8352138996124268 }, { "auxiliary_loss_clip": 0.01535041, "auxiliary_loss_mlp": 0.01058467, "balance_loss_clip": 1.32416821, "balance_loss_mlp": 1.03419614, "epoch": 0.2522170449421314, "flos": 23711263729920.0, "grad_norm": 1.5637017349119167, "language_loss": 0.88841426, "learning_rate": 3.5039738159852253e-06, "loss": 0.91434938, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.24267578, "step": 4195, "time_per_iteration": 2.912318468093872 }, { "auxiliary_loss_clip": 0.01543547, "auxiliary_loss_mlp": 0.01052836, "balance_loss_clip": 1.33422089, "balance_loss_mlp": 1.02688396, "epoch": 0.25227716819479934, "flos": 20964171813120.0, "grad_norm": 1.7695295400440785, "language_loss": 0.87001407, "learning_rate": 3.503717062883053e-06, "loss": 0.89597785, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.25964355, "step": 4196, "time_per_iteration": 2.8785829544067383 }, { "auxiliary_loss_clip": 0.01539305, "auxiliary_loss_mlp": 0.01051446, "balance_loss_clip": 1.32774568, "balance_loss_mlp": 1.02656746, "epoch": 0.2523372914474673, "flos": 23341793541120.0, "grad_norm": 1.9834047177443144, "language_loss": 0.84497219, "learning_rate": 3.5034602527597786e-06, "loss": 0.87087971, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.2487793, "step": 4197, "time_per_iteration": 2.9115893840789795 }, { "auxiliary_loss_clip": 0.01528974, "auxiliary_loss_mlp": 0.01052734, "balance_loss_clip": 1.31840324, "balance_loss_mlp": 1.0268898, "epoch": 0.25239741470013527, "flos": 36982573741440.0, "grad_norm": 2.245463716551549, "language_loss": 0.73933476, "learning_rate": 3.5032033856251405e-06, "loss": 0.76515186, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.25878906, "step": 4198, "time_per_iteration": 2.972625494003296 }, { "auxiliary_loss_clip": 0.01536599, "auxiliary_loss_mlp": 0.01048744, "balance_loss_clip": 1.32364082, "balance_loss_mlp": 1.02357864, "epoch": 0.25245753795280323, "flos": 18525098367360.0, "grad_norm": 1.8066810630308692, "language_loss": 0.77994812, "learning_rate": 3.50294646148888e-06, "loss": 0.80580157, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25158691, "step": 4199, "time_per_iteration": 2.845914602279663 }, { "auxiliary_loss_clip": 0.01540875, "auxiliary_loss_mlp": 0.01051058, "balance_loss_clip": 1.32765007, "balance_loss_mlp": 1.02515364, "epoch": 0.2525176612054712, "flos": 32358079054080.0, "grad_norm": 1.6322777585658708, "language_loss": 0.74173975, "learning_rate": 3.502689480360739e-06, "loss": 0.76765907, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.2590332, "step": 4200, "time_per_iteration": 2.9290146827697754 }, { "auxiliary_loss_clip": 0.01520343, "auxiliary_loss_mlp": 0.01042965, "balance_loss_clip": 1.31199682, "balance_loss_mlp": 1.0184201, "epoch": 0.25257778445813917, "flos": 45274080499200.0, "grad_norm": 1.5378463036957457, "language_loss": 0.82981312, "learning_rate": 3.5024324422504616e-06, "loss": 0.85544622, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24560547, "step": 4201, "time_per_iteration": 3.0643723011016846 }, { "auxiliary_loss_clip": 0.01534806, "auxiliary_loss_mlp": 0.01047535, "balance_loss_clip": 1.32228959, "balance_loss_mlp": 1.02216744, "epoch": 0.25263790771080713, "flos": 23378242867200.0, "grad_norm": 1.7711968013888884, "language_loss": 0.76044232, "learning_rate": 3.5021753471677965e-06, "loss": 0.78626567, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.25390625, "step": 4202, "time_per_iteration": 2.895042896270752 }, { "auxiliary_loss_clip": 0.01519595, "auxiliary_loss_mlp": 0.01041021, "balance_loss_clip": 1.31436014, "balance_loss_mlp": 1.01649976, "epoch": 0.25269803096347515, "flos": 18524193471360.0, "grad_norm": 1.8901598671031354, "language_loss": 0.7452482, "learning_rate": 3.501918195122491e-06, "loss": 0.77085435, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.2454834, "step": 4203, "time_per_iteration": 4.242213249206543 }, { "auxiliary_loss_clip": 0.01542016, "auxiliary_loss_mlp": 0.01046139, "balance_loss_clip": 1.32970238, "balance_loss_mlp": 1.02042532, "epoch": 0.2527581542161431, "flos": 24621908699520.0, "grad_norm": 1.382876129681569, "language_loss": 0.78170729, "learning_rate": 3.501660986124297e-06, "loss": 0.80758888, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 2.125, "router_z_loss_mlp": 0.25708008, "step": 4204, "time_per_iteration": 2.908308744430542 }, { "auxiliary_loss_clip": 0.01529384, "auxiliary_loss_mlp": 0.0104784, "balance_loss_clip": 1.32053828, "balance_loss_mlp": 1.02377176, "epoch": 0.2528182774688111, "flos": 12649020007680.0, "grad_norm": 2.1470519940759374, "language_loss": 0.73202235, "learning_rate": 3.5014037201829684e-06, "loss": 0.75779462, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.24060059, "step": 4205, "time_per_iteration": 2.8494181632995605 }, { "auxiliary_loss_clip": 0.01505781, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.30538285, "balance_loss_mlp": 1.02139688, "epoch": 0.25287840072147905, "flos": 46953147473280.0, "grad_norm": 1.4204120761923895, "language_loss": 0.76603758, "learning_rate": 3.50114639730826e-06, "loss": 0.79155397, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.24438477, "step": 4206, "time_per_iteration": 3.091952085494995 }, { "auxiliary_loss_clip": 0.01532394, "auxiliary_loss_mlp": 0.01043994, "balance_loss_clip": 1.32248795, "balance_loss_mlp": 1.0191747, "epoch": 0.252938523974147, "flos": 18888912956160.0, "grad_norm": 1.5612702676217622, "language_loss": 0.80061448, "learning_rate": 3.5008890175099296e-06, "loss": 0.82637835, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24816895, "step": 4207, "time_per_iteration": 2.85799241065979 }, { "auxiliary_loss_clip": 0.01515582, "auxiliary_loss_mlp": 0.01044272, "balance_loss_clip": 1.30971956, "balance_loss_mlp": 1.01951265, "epoch": 0.252998647226815, "flos": 21444845016960.0, "grad_norm": 1.550928577560789, "language_loss": 0.77069825, "learning_rate": 3.5006315807977375e-06, "loss": 0.79629683, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.24780273, "step": 4208, "time_per_iteration": 2.8857839107513428 }, { "auxiliary_loss_clip": 0.01511107, "auxiliary_loss_mlp": 0.01043727, "balance_loss_clip": 1.31037617, "balance_loss_mlp": 1.01906276, "epoch": 0.25305877047948294, "flos": 25451782421760.0, "grad_norm": 2.020424720531339, "language_loss": 0.70451498, "learning_rate": 3.5003740871814456e-06, "loss": 0.73006332, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.24682617, "step": 4209, "time_per_iteration": 4.440242528915405 }, { "auxiliary_loss_clip": 0.01295206, "auxiliary_loss_mlp": 0.01047014, "balance_loss_clip": 1.1674614, "balance_loss_mlp": 1.02202821, "epoch": 0.2531188937321509, "flos": 60216032672640.0, "grad_norm": 0.7638864027724452, "language_loss": 0.55144906, "learning_rate": 3.5001165366708175e-06, "loss": 0.5748713, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.25, "step": 4210, "time_per_iteration": 3.418522834777832 }, { "auxiliary_loss_clip": 0.01541881, "auxiliary_loss_mlp": 0.0103726, "balance_loss_clip": 1.33202267, "balance_loss_mlp": 1.0129056, "epoch": 0.25317901698481887, "flos": 19691793515520.0, "grad_norm": 2.2781343232214453, "language_loss": 0.80875993, "learning_rate": 3.4998589292756204e-06, "loss": 0.83455133, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 2.09863281, "router_z_loss_mlp": 0.24377441, "step": 4211, "time_per_iteration": 4.25178599357605 }, { "auxiliary_loss_clip": 0.0151196, "auxiliary_loss_mlp": 0.01045475, "balance_loss_clip": 1.30905259, "balance_loss_mlp": 1.02145386, "epoch": 0.25323914023748684, "flos": 24434685141120.0, "grad_norm": 1.6466273105075449, "language_loss": 0.79180539, "learning_rate": 3.499601265005622e-06, "loss": 0.81737971, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.24023438, "step": 4212, "time_per_iteration": 2.868617057800293 }, { "auxiliary_loss_clip": 0.01522943, "auxiliary_loss_mlp": 0.01041812, "balance_loss_clip": 1.3161633, "balance_loss_mlp": 1.01717162, "epoch": 0.2532992634901548, "flos": 25458433407360.0, "grad_norm": 1.8914698764183817, "language_loss": 0.54502666, "learning_rate": 3.4993435438705938e-06, "loss": 0.57067418, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24633789, "step": 4213, "time_per_iteration": 2.9028208255767822 }, { "auxiliary_loss_clip": 0.01527675, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.31843424, "balance_loss_mlp": 1.01957583, "epoch": 0.25335938674282277, "flos": 18889953586560.0, "grad_norm": 4.755586770229758, "language_loss": 0.66283149, "learning_rate": 3.499085765880308e-06, "loss": 0.68856788, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.26391602, "step": 4214, "time_per_iteration": 4.364147424697876 }, { "auxiliary_loss_clip": 0.01288002, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.16520286, "balance_loss_mlp": 1.01470375, "epoch": 0.25341950999549073, "flos": 53089094557440.0, "grad_norm": 0.8515879820347452, "language_loss": 0.5814395, "learning_rate": 3.4988279310445396e-06, "loss": 0.60473359, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.26757812, "step": 4215, "time_per_iteration": 3.1124112606048584 }, { "auxiliary_loss_clip": 0.01516066, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.3097856, "balance_loss_mlp": 1.01392031, "epoch": 0.2534796332481587, "flos": 39034775733120.0, "grad_norm": 1.575038653877057, "language_loss": 0.8460052, "learning_rate": 3.498570039373066e-06, "loss": 0.87155032, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.2454834, "step": 4216, "time_per_iteration": 2.9882240295410156 }, { "auxiliary_loss_clip": 0.01524417, "auxiliary_loss_mlp": 0.0104387, "balance_loss_clip": 1.3189708, "balance_loss_mlp": 1.01983762, "epoch": 0.2535397565008267, "flos": 23597572250880.0, "grad_norm": 1.9262801534543614, "language_loss": 0.81342852, "learning_rate": 3.498312090875666e-06, "loss": 0.83911133, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24047852, "step": 4217, "time_per_iteration": 2.900644302368164 }, { "auxiliary_loss_clip": 0.01515062, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.30890858, "balance_loss_mlp": 1.01298714, "epoch": 0.2535998797534947, "flos": 19290760439040.0, "grad_norm": 2.9991357016801268, "language_loss": 0.764328, "learning_rate": 3.4980540855621218e-06, "loss": 0.78985488, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.24658203, "step": 4218, "time_per_iteration": 2.8198580741882324 }, { "auxiliary_loss_clip": 0.01537782, "auxiliary_loss_mlp": 0.01048694, "balance_loss_clip": 1.32758451, "balance_loss_mlp": 1.02349305, "epoch": 0.25366000300616265, "flos": 24034964163840.0, "grad_norm": 2.1229275507408296, "language_loss": 0.75820792, "learning_rate": 3.4977960234422167e-06, "loss": 0.7840727, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.25219727, "step": 4219, "time_per_iteration": 2.9028990268707275 }, { "auxiliary_loss_clip": 0.01532636, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.32247007, "balance_loss_mlp": 1.01975596, "epoch": 0.2537201262588306, "flos": 16297934158080.0, "grad_norm": 1.7573000603866413, "language_loss": 0.82250094, "learning_rate": 3.497537904525736e-06, "loss": 0.84827453, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24975586, "step": 4220, "time_per_iteration": 2.8830039501190186 }, { "auxiliary_loss_clip": 0.01533593, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.32510841, "balance_loss_mlp": 1.02368999, "epoch": 0.2537802495114986, "flos": 23305072746240.0, "grad_norm": 1.9146424466104128, "language_loss": 0.72402924, "learning_rate": 3.497279728822468e-06, "loss": 0.74984467, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.24255371, "step": 4221, "time_per_iteration": 2.961087942123413 }, { "auxiliary_loss_clip": 0.0152793, "auxiliary_loss_mlp": 0.01049081, "balance_loss_clip": 1.31650186, "balance_loss_mlp": 1.02424943, "epoch": 0.25384037276416654, "flos": 17648007056640.0, "grad_norm": 2.468452544230068, "language_loss": 0.63044596, "learning_rate": 3.497021496342202e-06, "loss": 0.65621608, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.24841309, "step": 4222, "time_per_iteration": 2.8441786766052246 }, { "auxiliary_loss_clip": 0.01531223, "auxiliary_loss_mlp": 0.01044747, "balance_loss_clip": 1.32211423, "balance_loss_mlp": 1.02039266, "epoch": 0.2539004960168345, "flos": 21517019752320.0, "grad_norm": 1.7366475569368265, "language_loss": 0.75597346, "learning_rate": 3.496763207094731e-06, "loss": 0.78173316, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.24353027, "step": 4223, "time_per_iteration": 2.8664767742156982 }, { "auxiliary_loss_clip": 0.01510607, "auxiliary_loss_mlp": 0.01039449, "balance_loss_clip": 1.30846667, "balance_loss_mlp": 1.01734805, "epoch": 0.2539606192695025, "flos": 23961341594880.0, "grad_norm": 1.6718633837605723, "language_loss": 0.80781442, "learning_rate": 3.49650486108985e-06, "loss": 0.83331501, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.22094727, "step": 4224, "time_per_iteration": 2.954723596572876 }, { "auxiliary_loss_clip": 0.0151452, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.31065249, "balance_loss_mlp": 1.02015388, "epoch": 0.25402074252217044, "flos": 24180263775360.0, "grad_norm": 1.4953989182170675, "language_loss": 0.78091311, "learning_rate": 3.496246458337354e-06, "loss": 0.80650353, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.24353027, "step": 4225, "time_per_iteration": 2.844721555709839 }, { "auxiliary_loss_clip": 0.01518711, "auxiliary_loss_mlp": 0.01049725, "balance_loss_clip": 1.31381238, "balance_loss_mlp": 1.02608609, "epoch": 0.2540808657748384, "flos": 22312389674880.0, "grad_norm": 1.8735665398410137, "language_loss": 0.85736108, "learning_rate": 3.4959879988470426e-06, "loss": 0.88304549, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.23657227, "step": 4226, "time_per_iteration": 2.890226125717163 }, { "auxiliary_loss_clip": 0.01520633, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.31512225, "balance_loss_mlp": 1.02296054, "epoch": 0.25414098902750637, "flos": 27610934417280.0, "grad_norm": 1.417705534420962, "language_loss": 0.72182524, "learning_rate": 3.4957294826287164e-06, "loss": 0.747504, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.24291992, "step": 4227, "time_per_iteration": 2.9160988330841064 }, { "auxiliary_loss_clip": 0.01273838, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.15205359, "balance_loss_mlp": 1.01486254, "epoch": 0.25420111228017434, "flos": 58198651194240.0, "grad_norm": 0.9783498061141501, "language_loss": 0.6185109, "learning_rate": 3.4954709096921785e-06, "loss": 0.64157629, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.17871094, "step": 4228, "time_per_iteration": 3.2070937156677246 }, { "auxiliary_loss_clip": 0.01539772, "auxiliary_loss_mlp": 0.01041955, "balance_loss_clip": 1.32909775, "balance_loss_mlp": 1.01653934, "epoch": 0.2542612355328423, "flos": 11469430091520.0, "grad_norm": 2.1876230903333385, "language_loss": 0.87795663, "learning_rate": 3.4952122800472336e-06, "loss": 0.90377396, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.25427246, "step": 4229, "time_per_iteration": 2.8106462955474854 }, { "auxiliary_loss_clip": 0.01539277, "auxiliary_loss_mlp": 0.01041898, "balance_loss_clip": 1.33235669, "balance_loss_mlp": 1.01762724, "epoch": 0.2543213587855103, "flos": 22976078670720.0, "grad_norm": 2.5018685040894804, "language_loss": 0.77775216, "learning_rate": 3.4949535937036892e-06, "loss": 0.80356395, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.24291992, "step": 4230, "time_per_iteration": 2.8537495136260986 }, { "auxiliary_loss_clip": 0.0152094, "auxiliary_loss_mlp": 0.01048387, "balance_loss_clip": 1.31353629, "balance_loss_mlp": 1.02250719, "epoch": 0.2543814820381783, "flos": 18260587411200.0, "grad_norm": 1.9017187069990786, "language_loss": 0.7595731, "learning_rate": 3.4946948506713544e-06, "loss": 0.7852664, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.25878906, "step": 4231, "time_per_iteration": 2.9102985858917236 }, { "auxiliary_loss_clip": 0.01532093, "auxiliary_loss_mlp": 0.01046854, "balance_loss_clip": 1.32175827, "balance_loss_mlp": 1.02221322, "epoch": 0.25444160529084625, "flos": 15641122371840.0, "grad_norm": 1.5797516428161937, "language_loss": 0.74691164, "learning_rate": 3.4944360509600416e-06, "loss": 0.77270108, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24658203, "step": 4232, "time_per_iteration": 2.8397443294525146 }, { "auxiliary_loss_clip": 0.0152027, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.31279039, "balance_loss_mlp": 1.01566195, "epoch": 0.2545017285435142, "flos": 24610642744320.0, "grad_norm": 1.895604092637888, "language_loss": 0.87973142, "learning_rate": 3.4941771945795637e-06, "loss": 0.90533614, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.2454834, "step": 4233, "time_per_iteration": 2.8908908367156982 }, { "auxiliary_loss_clip": 0.01499127, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.2982645, "balance_loss_mlp": 1.02054453, "epoch": 0.2545618517961822, "flos": 24689287486080.0, "grad_norm": 1.6657064864018902, "language_loss": 0.75700706, "learning_rate": 3.493918281539737e-06, "loss": 0.7824344, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.23071289, "step": 4234, "time_per_iteration": 2.871424674987793 }, { "auxiliary_loss_clip": 0.01525862, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.31765997, "balance_loss_mlp": 1.01702881, "epoch": 0.25462197504885015, "flos": 23925706675200.0, "grad_norm": 1.4346920545652044, "language_loss": 0.75571346, "learning_rate": 3.493659311850379e-06, "loss": 0.7813831, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.24084473, "step": 4235, "time_per_iteration": 2.889403820037842 }, { "auxiliary_loss_clip": 0.01581572, "auxiliary_loss_mlp": 0.01041563, "balance_loss_clip": 1.36144555, "balance_loss_mlp": 1.01613545, "epoch": 0.2546820983015181, "flos": 24799857073920.0, "grad_norm": 2.680127371715491, "language_loss": 0.65860617, "learning_rate": 3.4934002855213106e-06, "loss": 0.68483752, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 2.20019531, "router_z_loss_mlp": 0.25415039, "step": 4236, "time_per_iteration": 2.8805904388427734 }, { "auxiliary_loss_clip": 0.01516845, "auxiliary_loss_mlp": 0.01038646, "balance_loss_clip": 1.31270349, "balance_loss_mlp": 1.01489949, "epoch": 0.2547422215541861, "flos": 18743387120640.0, "grad_norm": 1.5403681037642782, "language_loss": 0.67890245, "learning_rate": 3.493141202562354e-06, "loss": 0.70445734, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23754883, "step": 4237, "time_per_iteration": 2.825218677520752 }, { "auxiliary_loss_clip": 0.01520913, "auxiliary_loss_mlp": 0.01043776, "balance_loss_clip": 1.31129646, "balance_loss_mlp": 1.01839674, "epoch": 0.25480234480685404, "flos": 21042228372480.0, "grad_norm": 1.9989598687617298, "language_loss": 0.76261437, "learning_rate": 3.492882062983333e-06, "loss": 0.78826129, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.25390625, "step": 4238, "time_per_iteration": 4.2961297035217285 }, { "auxiliary_loss_clip": 0.01525831, "auxiliary_loss_mlp": 0.01045981, "balance_loss_clip": 1.3177228, "balance_loss_mlp": 1.02098274, "epoch": 0.254862468059522, "flos": 25092944760960.0, "grad_norm": 4.151359097962964, "language_loss": 0.81479347, "learning_rate": 3.492622866794074e-06, "loss": 0.84051162, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.24987793, "step": 4239, "time_per_iteration": 2.9050424098968506 }, { "auxiliary_loss_clip": 0.01514068, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.31100869, "balance_loss_mlp": 1.01840639, "epoch": 0.25492259131219, "flos": 20568115664640.0, "grad_norm": 1.663637818419357, "language_loss": 0.78510821, "learning_rate": 3.492363614004407e-06, "loss": 0.8106792, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.24633789, "step": 4240, "time_per_iteration": 2.81251859664917 }, { "auxiliary_loss_clip": 0.01533732, "auxiliary_loss_mlp": 0.01042693, "balance_loss_clip": 1.32295632, "balance_loss_mlp": 1.01758778, "epoch": 0.25498271456485794, "flos": 25052423402880.0, "grad_norm": 1.7158471663124326, "language_loss": 0.84004438, "learning_rate": 3.492104304624162e-06, "loss": 0.86580867, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25085449, "step": 4241, "time_per_iteration": 2.891291379928589 }, { "auxiliary_loss_clip": 0.01531187, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.32259905, "balance_loss_mlp": 1.01830804, "epoch": 0.2550428378175259, "flos": 26189908392960.0, "grad_norm": 2.0104611761517224, "language_loss": 0.74143374, "learning_rate": 3.4918449386631725e-06, "loss": 0.76717007, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.24133301, "step": 4242, "time_per_iteration": 2.8750274181365967 }, { "auxiliary_loss_clip": 0.0154318, "auxiliary_loss_mlp": 0.01044245, "balance_loss_clip": 1.33308148, "balance_loss_mlp": 1.01960397, "epoch": 0.2551029610701939, "flos": 15275317011840.0, "grad_norm": 2.6688108268079604, "language_loss": 0.74190927, "learning_rate": 3.491585516131273e-06, "loss": 0.76778352, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.24633789, "step": 4243, "time_per_iteration": 2.8517098426818848 }, { "auxiliary_loss_clip": 0.01519511, "auxiliary_loss_mlp": 0.01048513, "balance_loss_clip": 1.31219828, "balance_loss_mlp": 1.02358675, "epoch": 0.2551630843228619, "flos": 18120852910080.0, "grad_norm": 1.611570058369904, "language_loss": 0.82834005, "learning_rate": 3.491326037038301e-06, "loss": 0.85402024, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24914551, "step": 4244, "time_per_iteration": 4.2871222496032715 }, { "auxiliary_loss_clip": 0.01293774, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 1.16943383, "balance_loss_mlp": 1.0066694, "epoch": 0.25522320757552985, "flos": 70555669390080.0, "grad_norm": 0.682864921752293, "language_loss": 0.57729113, "learning_rate": 3.4910665013940967e-06, "loss": 0.60048056, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 0.18457031, "step": 4245, "time_per_iteration": 3.429171085357666 }, { "auxiliary_loss_clip": 0.01543188, "auxiliary_loss_mlp": 0.01053599, "balance_loss_clip": 1.33100188, "balance_loss_mlp": 1.0282793, "epoch": 0.2552833308281978, "flos": 22903315752960.0, "grad_norm": 2.0550025370386056, "language_loss": 0.66928113, "learning_rate": 3.4908069092085015e-06, "loss": 0.69524896, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.25317383, "step": 4246, "time_per_iteration": 4.288013696670532 }, { "auxiliary_loss_clip": 0.01515883, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.31519651, "balance_loss_mlp": 1.01768243, "epoch": 0.2553434540808658, "flos": 22063533419520.0, "grad_norm": 1.7218867763908496, "language_loss": 0.82214606, "learning_rate": 3.4905472604913585e-06, "loss": 0.84770894, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.22705078, "step": 4247, "time_per_iteration": 2.856114149093628 }, { "auxiliary_loss_clip": 0.01554774, "auxiliary_loss_mlp": 0.01046682, "balance_loss_clip": 1.33868551, "balance_loss_mlp": 1.02123094, "epoch": 0.25540357733353375, "flos": 16552672237440.0, "grad_norm": 2.1450477017325893, "language_loss": 0.84519672, "learning_rate": 3.490287555252514e-06, "loss": 0.87121123, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.25476074, "step": 4248, "time_per_iteration": 2.804044246673584 }, { "auxiliary_loss_clip": 0.01548175, "auxiliary_loss_mlp": 0.01049273, "balance_loss_clip": 1.33798599, "balance_loss_mlp": 1.02457309, "epoch": 0.2554637005862017, "flos": 17573570081280.0, "grad_norm": 2.0952920265659265, "language_loss": 0.85303271, "learning_rate": 3.4900277935018166e-06, "loss": 0.87900716, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.24707031, "step": 4249, "time_per_iteration": 4.22739839553833 }, { "auxiliary_loss_clip": 0.01294335, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.16865146, "balance_loss_mlp": 1.00704753, "epoch": 0.2555238238388697, "flos": 72273746661120.0, "grad_norm": 0.7678677995863831, "language_loss": 0.56356514, "learning_rate": 3.489767975249115e-06, "loss": 0.58678114, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 1.2578125, "router_z_loss_mlp": 0.20214844, "step": 4250, "time_per_iteration": 3.402534008026123 }, { "auxiliary_loss_clip": 0.01535048, "auxiliary_loss_mlp": 0.01048438, "balance_loss_clip": 1.32534611, "balance_loss_mlp": 1.02277207, "epoch": 0.25558394709153764, "flos": 24400136096640.0, "grad_norm": 1.9149506597383026, "language_loss": 0.82218701, "learning_rate": 3.4895081005042632e-06, "loss": 0.84802186, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 2.09960938, "router_z_loss_mlp": 0.25683594, "step": 4251, "time_per_iteration": 2.895263195037842 }, { "auxiliary_loss_clip": 0.01290421, "auxiliary_loss_mlp": 0.01055961, "balance_loss_clip": 1.16685295, "balance_loss_mlp": 1.03307283, "epoch": 0.2556440703442056, "flos": 69263247646080.0, "grad_norm": 0.8041048749186148, "language_loss": 0.66197085, "learning_rate": 3.4892481692771146e-06, "loss": 0.6854347, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.22851562, "step": 4252, "time_per_iteration": 3.3405096530914307 }, { "auxiliary_loss_clip": 0.0151283, "auxiliary_loss_mlp": 0.01041234, "balance_loss_clip": 1.31069922, "balance_loss_mlp": 1.01808345, "epoch": 0.2557041935968736, "flos": 24874791742080.0, "grad_norm": 2.3969089601448266, "language_loss": 0.74501812, "learning_rate": 3.4889881815775267e-06, "loss": 0.77055871, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23144531, "step": 4253, "time_per_iteration": 2.8995838165283203 }, { "auxiliary_loss_clip": 0.01541234, "auxiliary_loss_mlp": 0.01043054, "balance_loss_clip": 1.33403349, "balance_loss_mlp": 1.01867533, "epoch": 0.25576431684954154, "flos": 22502327921280.0, "grad_norm": 1.9192156814862793, "language_loss": 0.74707383, "learning_rate": 3.488728137415357e-06, "loss": 0.77291673, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.24389648, "step": 4254, "time_per_iteration": 2.865361452102661 }, { "auxiliary_loss_clip": 0.01527511, "auxiliary_loss_mlp": 0.01043118, "balance_loss_clip": 1.31876385, "balance_loss_mlp": 1.01723742, "epoch": 0.2558244401022095, "flos": 19835781027840.0, "grad_norm": 2.312488940571898, "language_loss": 0.81931609, "learning_rate": 3.4884680368004675e-06, "loss": 0.84502238, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.2590332, "step": 4255, "time_per_iteration": 2.8436686992645264 }, { "auxiliary_loss_clip": 0.01518855, "auxiliary_loss_mlp": 0.01049802, "balance_loss_clip": 1.3150878, "balance_loss_mlp": 1.02469623, "epoch": 0.2558845633548775, "flos": 23230545281280.0, "grad_norm": 1.5558396809392558, "language_loss": 0.86166632, "learning_rate": 3.488207879742721e-06, "loss": 0.88735282, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.25134277, "step": 4256, "time_per_iteration": 2.859417676925659 }, { "auxiliary_loss_clip": 0.01545739, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.33175755, "balance_loss_mlp": 1.01458156, "epoch": 0.2559446866075455, "flos": 16846891044480.0, "grad_norm": 1.7354723665652005, "language_loss": 0.75842619, "learning_rate": 3.4879476662519826e-06, "loss": 0.78429163, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26208496, "step": 4257, "time_per_iteration": 2.8664653301239014 }, { "auxiliary_loss_clip": 0.01283138, "auxiliary_loss_mlp": 0.01029174, "balance_loss_clip": 1.15620267, "balance_loss_mlp": 1.0049504, "epoch": 0.25600480986021346, "flos": 57623198837760.0, "grad_norm": 0.7945477927640593, "language_loss": 0.65354002, "learning_rate": 3.4876873963381196e-06, "loss": 0.67666316, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.2421875, "step": 4258, "time_per_iteration": 3.30615234375 }, { "auxiliary_loss_clip": 0.01503733, "auxiliary_loss_mlp": 0.01048463, "balance_loss_clip": 1.30224156, "balance_loss_mlp": 1.0222609, "epoch": 0.2560649331128814, "flos": 27831259186560.0, "grad_norm": 1.7647732466044816, "language_loss": 0.77528644, "learning_rate": 3.4874270700110013e-06, "loss": 0.80080843, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.26208496, "step": 4259, "time_per_iteration": 2.8937032222747803 }, { "auxiliary_loss_clip": 0.01280626, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.1565516, "balance_loss_mlp": 1.00742328, "epoch": 0.2561250563655494, "flos": 70984148077440.0, "grad_norm": 0.7964492978757748, "language_loss": 0.58503646, "learning_rate": 3.4871666872804994e-06, "loss": 0.60812485, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.20800781, "step": 4260, "time_per_iteration": 3.443023204803467 }, { "auxiliary_loss_clip": 0.01503401, "auxiliary_loss_mlp": 0.01048093, "balance_loss_clip": 1.29709923, "balance_loss_mlp": 1.02124739, "epoch": 0.25618517961821735, "flos": 27022677782400.0, "grad_norm": 1.8433915950467596, "language_loss": 0.77362537, "learning_rate": 3.4869062481564875e-06, "loss": 0.79914033, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.26879883, "step": 4261, "time_per_iteration": 3.0200788974761963 }, { "auxiliary_loss_clip": 0.01509356, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.30533862, "balance_loss_mlp": 1.02126718, "epoch": 0.2562453028708853, "flos": 23076694402560.0, "grad_norm": 1.659973431836026, "language_loss": 0.84148133, "learning_rate": 3.486645752648842e-06, "loss": 0.86703336, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.24584961, "step": 4262, "time_per_iteration": 2.9245333671569824 }, { "auxiliary_loss_clip": 0.01522719, "auxiliary_loss_mlp": 0.01048608, "balance_loss_clip": 1.30945086, "balance_loss_mlp": 1.02204847, "epoch": 0.2563054261235533, "flos": 15128976769920.0, "grad_norm": 2.4630944052797044, "language_loss": 0.75269675, "learning_rate": 3.4863852007674405e-06, "loss": 0.77841002, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.26586914, "step": 4263, "time_per_iteration": 2.9228169918060303 }, { "auxiliary_loss_clip": 0.01510316, "auxiliary_loss_mlp": 0.0104551, "balance_loss_clip": 1.30652905, "balance_loss_mlp": 1.02060699, "epoch": 0.25636554937622125, "flos": 27866305923840.0, "grad_norm": 1.5491104365930486, "language_loss": 0.83495986, "learning_rate": 3.486124592522163e-06, "loss": 0.8605181, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.24902344, "step": 4264, "time_per_iteration": 2.9376823902130127 }, { "auxiliary_loss_clip": 0.01532702, "auxiliary_loss_mlp": 0.01048565, "balance_loss_clip": 1.32345641, "balance_loss_mlp": 1.02274418, "epoch": 0.2564256726288892, "flos": 28916549660160.0, "grad_norm": 1.6470340666173287, "language_loss": 0.75773448, "learning_rate": 3.4858639279228924e-06, "loss": 0.78354716, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.25817871, "step": 4265, "time_per_iteration": 2.952333688735962 }, { "auxiliary_loss_clip": 0.01518302, "auxiliary_loss_mlp": 0.01038908, "balance_loss_clip": 1.31232166, "balance_loss_mlp": 1.01330149, "epoch": 0.2564857958815572, "flos": 18524057736960.0, "grad_norm": 2.110845222443224, "language_loss": 0.82922381, "learning_rate": 3.485603206979513e-06, "loss": 0.85479593, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.25610352, "step": 4266, "time_per_iteration": 2.8531272411346436 }, { "auxiliary_loss_clip": 0.01494427, "auxiliary_loss_mlp": 0.01044828, "balance_loss_clip": 1.2921015, "balance_loss_mlp": 1.01961589, "epoch": 0.25654591913422514, "flos": 25818311698560.0, "grad_norm": 1.6224734294874987, "language_loss": 0.80235124, "learning_rate": 3.4853424297019103e-06, "loss": 0.82774377, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.25231934, "step": 4267, "time_per_iteration": 2.8710947036743164 }, { "auxiliary_loss_clip": 0.01494479, "auxiliary_loss_mlp": 0.01045165, "balance_loss_clip": 1.29596639, "balance_loss_mlp": 1.02040565, "epoch": 0.2566060423868931, "flos": 19109192480640.0, "grad_norm": 1.5682987505188923, "language_loss": 0.7991811, "learning_rate": 3.4850815960999736e-06, "loss": 0.82457757, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24743652, "step": 4268, "time_per_iteration": 2.8485147953033447 }, { "auxiliary_loss_clip": 0.01504564, "auxiliary_loss_mlp": 0.010491, "balance_loss_clip": 1.30006981, "balance_loss_mlp": 1.02444708, "epoch": 0.25666616563956113, "flos": 23853396205440.0, "grad_norm": 1.8988597897716362, "language_loss": 0.69483197, "learning_rate": 3.484820706183595e-06, "loss": 0.72036856, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.2467041, "step": 4269, "time_per_iteration": 2.850463628768921 }, { "auxiliary_loss_clip": 0.01523322, "auxiliary_loss_mlp": 0.0104968, "balance_loss_clip": 1.31502795, "balance_loss_mlp": 1.02536166, "epoch": 0.2567262888922291, "flos": 14610632630400.0, "grad_norm": 2.808533407213428, "language_loss": 0.81224775, "learning_rate": 3.484559759962666e-06, "loss": 0.83797777, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.24316406, "step": 4270, "time_per_iteration": 2.8116774559020996 }, { "auxiliary_loss_clip": 0.01539784, "auxiliary_loss_mlp": 0.010552, "balance_loss_clip": 1.32476544, "balance_loss_mlp": 1.02760339, "epoch": 0.25678641214489706, "flos": 32935024488960.0, "grad_norm": 1.914207737353283, "language_loss": 0.69032907, "learning_rate": 3.4842987574470816e-06, "loss": 0.71627891, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27600098, "step": 4271, "time_per_iteration": 2.9345316886901855 }, { "auxiliary_loss_clip": 0.01529935, "auxiliary_loss_mlp": 0.01053619, "balance_loss_clip": 1.31832242, "balance_loss_mlp": 1.02883577, "epoch": 0.256846535397565, "flos": 24109944076800.0, "grad_norm": 1.4221674582723618, "language_loss": 0.8788054, "learning_rate": 3.4840376986467403e-06, "loss": 0.90464103, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.24804688, "step": 4272, "time_per_iteration": 4.295428037643433 }, { "auxiliary_loss_clip": 0.01535963, "auxiliary_loss_mlp": 0.01058626, "balance_loss_clip": 1.32505584, "balance_loss_mlp": 1.03286493, "epoch": 0.256906658650233, "flos": 19727745148800.0, "grad_norm": 3.2289228564465384, "language_loss": 0.829391, "learning_rate": 3.483776583571541e-06, "loss": 0.8553369, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25805664, "step": 4273, "time_per_iteration": 2.8580329418182373 }, { "auxiliary_loss_clip": 0.01522021, "auxiliary_loss_mlp": 0.01058066, "balance_loss_clip": 1.31774497, "balance_loss_mlp": 1.03361678, "epoch": 0.25696678190290095, "flos": 22935331088640.0, "grad_norm": 1.7194352439697924, "language_loss": 0.78180712, "learning_rate": 3.4835154122313846e-06, "loss": 0.80760801, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.24462891, "step": 4274, "time_per_iteration": 2.8791096210479736 }, { "auxiliary_loss_clip": 0.0151791, "auxiliary_loss_mlp": 0.01053349, "balance_loss_clip": 1.31382465, "balance_loss_mlp": 1.02811241, "epoch": 0.2570269051555689, "flos": 27319204074240.0, "grad_norm": 1.7523613899439572, "language_loss": 0.84449375, "learning_rate": 3.4832541846361743e-06, "loss": 0.8702063, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.25256348, "step": 4275, "time_per_iteration": 2.9033493995666504 }, { "auxiliary_loss_clip": 0.01541382, "auxiliary_loss_mlp": 0.01051201, "balance_loss_clip": 1.33039904, "balance_loss_mlp": 1.02594066, "epoch": 0.2570870284082369, "flos": 27574892294400.0, "grad_norm": 2.053966000544112, "language_loss": 0.79462945, "learning_rate": 3.4829929007958175e-06, "loss": 0.82055533, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.25280762, "step": 4276, "time_per_iteration": 2.9344139099121094 }, { "auxiliary_loss_clip": 0.01535806, "auxiliary_loss_mlp": 0.01062937, "balance_loss_clip": 1.32854807, "balance_loss_mlp": 1.03808141, "epoch": 0.25714715166090485, "flos": 28742582828160.0, "grad_norm": 1.9552830878413154, "language_loss": 0.80314827, "learning_rate": 3.4827315607202214e-06, "loss": 0.82913566, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.24841309, "step": 4277, "time_per_iteration": 2.9190521240234375 }, { "auxiliary_loss_clip": 0.01545369, "auxiliary_loss_mlp": 0.01057213, "balance_loss_clip": 1.3363626, "balance_loss_mlp": 1.03229845, "epoch": 0.2572072749135728, "flos": 20124615703680.0, "grad_norm": 10.969114845266425, "language_loss": 0.80072403, "learning_rate": 3.482470164419295e-06, "loss": 0.8267498, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 2.08691406, "router_z_loss_mlp": 0.24914551, "step": 4278, "time_per_iteration": 2.885235071182251 }, { "auxiliary_loss_clip": 0.01552526, "auxiliary_loss_mlp": 0.01051946, "balance_loss_clip": 1.34156883, "balance_loss_mlp": 1.02713871, "epoch": 0.2572673981662408, "flos": 26041894093440.0, "grad_norm": 1.7394480729513695, "language_loss": 0.75070459, "learning_rate": 3.482208711902952e-06, "loss": 0.77674937, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 2.11035156, "router_z_loss_mlp": 0.24816895, "step": 4279, "time_per_iteration": 4.330492973327637 }, { "auxiliary_loss_clip": 0.01541743, "auxiliary_loss_mlp": 0.01062, "balance_loss_clip": 1.32998419, "balance_loss_mlp": 1.03522539, "epoch": 0.25732752141890874, "flos": 16115054100480.0, "grad_norm": 2.3956694291503795, "language_loss": 0.86648828, "learning_rate": 3.4819472031811065e-06, "loss": 0.89252573, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.26794434, "step": 4280, "time_per_iteration": 2.836162805557251 }, { "auxiliary_loss_clip": 0.01549014, "auxiliary_loss_mlp": 0.01055447, "balance_loss_clip": 1.33415115, "balance_loss_mlp": 1.0297339, "epoch": 0.2573876446715767, "flos": 22533574095360.0, "grad_norm": 2.2098473110294883, "language_loss": 0.79914051, "learning_rate": 3.4816856382636744e-06, "loss": 0.82518512, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25744629, "step": 4281, "time_per_iteration": 4.271686315536499 }, { "auxiliary_loss_clip": 0.01536491, "auxiliary_loss_mlp": 0.01051189, "balance_loss_clip": 1.32717156, "balance_loss_mlp": 1.02596462, "epoch": 0.2574477679242447, "flos": 23961251105280.0, "grad_norm": 2.022837623378409, "language_loss": 0.87542099, "learning_rate": 3.4814240171605737e-06, "loss": 0.90129781, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 2.09082031, "router_z_loss_mlp": 0.25219727, "step": 4282, "time_per_iteration": 2.9051856994628906 }, { "auxiliary_loss_clip": 0.01549059, "auxiliary_loss_mlp": 0.01049844, "balance_loss_clip": 1.33772635, "balance_loss_mlp": 1.02593052, "epoch": 0.2575078911769127, "flos": 21991901621760.0, "grad_norm": 1.5672992739997922, "language_loss": 0.71403074, "learning_rate": 3.4811623398817267e-06, "loss": 0.74001974, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 2.11328125, "router_z_loss_mlp": 0.23925781, "step": 4283, "time_per_iteration": 2.90817928314209 }, { "auxiliary_loss_clip": 0.01518651, "auxiliary_loss_mlp": 0.01052179, "balance_loss_clip": 1.31660581, "balance_loss_mlp": 1.02752614, "epoch": 0.25756801442958066, "flos": 21955633274880.0, "grad_norm": 1.7312683618808944, "language_loss": 0.81499964, "learning_rate": 3.4809006064370553e-06, "loss": 0.84070796, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.24658203, "step": 4284, "time_per_iteration": 4.3241307735443115 }, { "auxiliary_loss_clip": 0.01549474, "auxiliary_loss_mlp": 0.01048931, "balance_loss_clip": 1.33843017, "balance_loss_mlp": 1.02443385, "epoch": 0.2576281376822486, "flos": 35275472974080.0, "grad_norm": 1.7973952594267986, "language_loss": 0.71157438, "learning_rate": 3.4806388168364835e-06, "loss": 0.73755848, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 2.11132812, "router_z_loss_mlp": 0.24511719, "step": 4285, "time_per_iteration": 2.9777517318725586 }, { "auxiliary_loss_clip": 0.01543469, "auxiliary_loss_mlp": 0.01045784, "balance_loss_clip": 1.33498716, "balance_loss_mlp": 1.02090526, "epoch": 0.2576882609349166, "flos": 14139144120960.0, "grad_norm": 1.786972001403901, "language_loss": 0.59457946, "learning_rate": 3.4803769710899402e-06, "loss": 0.62047195, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.24902344, "step": 4286, "time_per_iteration": 2.8152787685394287 }, { "auxiliary_loss_clip": 0.01570535, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.35712552, "balance_loss_mlp": 1.02344882, "epoch": 0.25774838418758456, "flos": 23268759154560.0, "grad_norm": 8.051339501603794, "language_loss": 0.65385038, "learning_rate": 3.480115069207354e-06, "loss": 0.68002814, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.23803711, "step": 4287, "time_per_iteration": 2.8763434886932373 }, { "auxiliary_loss_clip": 0.01554794, "auxiliary_loss_mlp": 0.01046921, "balance_loss_clip": 1.33969152, "balance_loss_mlp": 1.01968145, "epoch": 0.2578085074402525, "flos": 22612037857920.0, "grad_norm": 1.983639907734974, "language_loss": 0.72985113, "learning_rate": 3.4798531111986557e-06, "loss": 0.75586832, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.27209473, "step": 4288, "time_per_iteration": 2.8657469749450684 }, { "auxiliary_loss_clip": 0.01537275, "auxiliary_loss_mlp": 0.01046399, "balance_loss_clip": 1.33098376, "balance_loss_mlp": 1.02177024, "epoch": 0.2578686306929205, "flos": 24582608951040.0, "grad_norm": 1.5897684120963937, "language_loss": 0.77462614, "learning_rate": 3.4795910970737786e-06, "loss": 0.8004629, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.24645996, "step": 4289, "time_per_iteration": 2.9498164653778076 }, { "auxiliary_loss_clip": 0.01545845, "auxiliary_loss_mlp": 0.01046383, "balance_loss_clip": 1.33643329, "balance_loss_mlp": 1.02033615, "epoch": 0.25792875394558845, "flos": 18123069905280.0, "grad_norm": 2.8048928586557555, "language_loss": 0.84936357, "learning_rate": 3.4793290268426592e-06, "loss": 0.87528586, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.26062012, "step": 4290, "time_per_iteration": 2.8241350650787354 }, { "auxiliary_loss_clip": 0.01550638, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.33734274, "balance_loss_mlp": 1.01977932, "epoch": 0.2579888771982564, "flos": 17721991584000.0, "grad_norm": 3.1491716086476935, "language_loss": 0.73665947, "learning_rate": 3.4790669005152354e-06, "loss": 0.76262695, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 2.13378906, "router_z_loss_mlp": 0.26379395, "step": 4291, "time_per_iteration": 2.841290235519409 }, { "auxiliary_loss_clip": 0.01551736, "auxiliary_loss_mlp": 0.01045318, "balance_loss_clip": 1.33960295, "balance_loss_mlp": 1.01792419, "epoch": 0.2580490004509244, "flos": 16443369504000.0, "grad_norm": 2.651628341899296, "language_loss": 0.81892836, "learning_rate": 3.4788047181014458e-06, "loss": 0.84489894, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.27404785, "step": 4292, "time_per_iteration": 2.851317882537842 }, { "auxiliary_loss_clip": 0.01546222, "auxiliary_loss_mlp": 0.01044875, "balance_loss_clip": 1.33707404, "balance_loss_mlp": 1.01773119, "epoch": 0.25810912370359235, "flos": 33847253026560.0, "grad_norm": 2.1187369557933753, "language_loss": 0.69604284, "learning_rate": 3.4785424796112337e-06, "loss": 0.72195381, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.27148438, "step": 4293, "time_per_iteration": 2.953986406326294 }, { "auxiliary_loss_clip": 0.01545088, "auxiliary_loss_mlp": 0.01047629, "balance_loss_clip": 1.3369565, "balance_loss_mlp": 1.02272594, "epoch": 0.2581692469562603, "flos": 25203152390400.0, "grad_norm": 2.914496276230053, "language_loss": 0.76670265, "learning_rate": 3.478280185054542e-06, "loss": 0.79262978, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.24914551, "step": 4294, "time_per_iteration": 2.9885363578796387 }, { "auxiliary_loss_clip": 0.01536971, "auxiliary_loss_mlp": 0.01047338, "balance_loss_clip": 1.32877147, "balance_loss_mlp": 1.02069449, "epoch": 0.2582293702089283, "flos": 34945257288960.0, "grad_norm": 2.2207129698642025, "language_loss": 0.81603003, "learning_rate": 3.478017834441318e-06, "loss": 0.84187317, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 2.08398438, "router_z_loss_mlp": 0.26672363, "step": 4295, "time_per_iteration": 2.962592124938965 }, { "auxiliary_loss_clip": 0.01553953, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.33813834, "balance_loss_mlp": 1.0165354, "epoch": 0.2582894934615963, "flos": 26845046121600.0, "grad_norm": 2.4275269131903743, "language_loss": 0.73627061, "learning_rate": 3.4777554277815096e-06, "loss": 0.76224011, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 2.16015625, "router_z_loss_mlp": 0.26489258, "step": 4296, "time_per_iteration": 2.891291856765747 }, { "auxiliary_loss_clip": 0.01561237, "auxiliary_loss_mlp": 0.01041827, "balance_loss_clip": 1.34872866, "balance_loss_mlp": 1.01569629, "epoch": 0.25834961671426426, "flos": 23526121432320.0, "grad_norm": 2.2023706091803117, "language_loss": 0.87716484, "learning_rate": 3.477492965085067e-06, "loss": 0.9031955, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.26123047, "step": 4297, "time_per_iteration": 2.904721260070801 }, { "auxiliary_loss_clip": 0.01556792, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.34492564, "balance_loss_mlp": 1.0223825, "epoch": 0.25840973996693223, "flos": 22460042016000.0, "grad_norm": 1.697232557508499, "language_loss": 0.85359848, "learning_rate": 3.477230446361943e-06, "loss": 0.87963617, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24621582, "step": 4298, "time_per_iteration": 2.838688850402832 }, { "auxiliary_loss_clip": 0.01558589, "auxiliary_loss_mlp": 0.01047022, "balance_loss_clip": 1.34696364, "balance_loss_mlp": 1.02050972, "epoch": 0.2584698632196002, "flos": 11298132702720.0, "grad_norm": 1.9688893338296947, "language_loss": 0.84200859, "learning_rate": 3.4769678716220927e-06, "loss": 0.86806464, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.26538086, "step": 4299, "time_per_iteration": 2.839909553527832 }, { "auxiliary_loss_clip": 0.01531507, "auxiliary_loss_mlp": 0.01043547, "balance_loss_clip": 1.32603359, "balance_loss_mlp": 1.01945484, "epoch": 0.25852998647226816, "flos": 17938651524480.0, "grad_norm": 2.4053938938950674, "language_loss": 0.83931786, "learning_rate": 3.4767052408754726e-06, "loss": 0.86506838, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.2409668, "step": 4300, "time_per_iteration": 2.8212552070617676 }, { "auxiliary_loss_clip": 0.01548493, "auxiliary_loss_mlp": 0.01043154, "balance_loss_clip": 1.33656168, "balance_loss_mlp": 1.01906133, "epoch": 0.2585901097249361, "flos": 33269312206080.0, "grad_norm": 2.0393505920931334, "language_loss": 0.68277049, "learning_rate": 3.4764425541320417e-06, "loss": 0.70868695, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.24084473, "step": 4301, "time_per_iteration": 2.9867055416107178 }, { "auxiliary_loss_clip": 0.01557165, "auxiliary_loss_mlp": 0.0105263, "balance_loss_clip": 1.34103405, "balance_loss_mlp": 1.0252955, "epoch": 0.2586502329776041, "flos": 18450208944000.0, "grad_norm": 2.2601280610731416, "language_loss": 0.82826036, "learning_rate": 3.4761798114017617e-06, "loss": 0.85435832, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 2.16210938, "router_z_loss_mlp": 0.27331543, "step": 4302, "time_per_iteration": 2.836373805999756 }, { "auxiliary_loss_clip": 0.01544165, "auxiliary_loss_mlp": 0.0105071, "balance_loss_clip": 1.33412194, "balance_loss_mlp": 1.02462673, "epoch": 0.25871035623027205, "flos": 17977227356160.0, "grad_norm": 2.1166353188074494, "language_loss": 0.92955744, "learning_rate": 3.475917012694595e-06, "loss": 0.95550621, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.26074219, "step": 4303, "time_per_iteration": 2.8549914360046387 }, { "auxiliary_loss_clip": 0.0153996, "auxiliary_loss_mlp": 0.01054705, "balance_loss_clip": 1.32778442, "balance_loss_mlp": 1.02429497, "epoch": 0.25877047948294, "flos": 27788068385280.0, "grad_norm": 1.8917274396719457, "language_loss": 0.68113804, "learning_rate": 3.475654158020507e-06, "loss": 0.70708466, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 2.12109375, "router_z_loss_mlp": 0.30432129, "step": 4304, "time_per_iteration": 2.9784390926361084 }, { "auxiliary_loss_clip": 0.01547092, "auxiliary_loss_mlp": 0.01042773, "balance_loss_clip": 1.33320189, "balance_loss_mlp": 1.01744056, "epoch": 0.258830602735608, "flos": 27137274157440.0, "grad_norm": 2.3599063192984477, "language_loss": 0.73215675, "learning_rate": 3.4753912473894657e-06, "loss": 0.75805545, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 2.13867188, "router_z_loss_mlp": 0.25317383, "step": 4305, "time_per_iteration": 2.887453079223633 }, { "auxiliary_loss_clip": 0.01551682, "auxiliary_loss_mlp": 0.01046345, "balance_loss_clip": 1.33708, "balance_loss_mlp": 1.01957059, "epoch": 0.25889072598827595, "flos": 17899623244800.0, "grad_norm": 1.9113160780371612, "language_loss": 0.7685281, "learning_rate": 3.4751282808114403e-06, "loss": 0.79450834, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.26782227, "step": 4306, "time_per_iteration": 2.8557512760162354 }, { "auxiliary_loss_clip": 0.01274753, "auxiliary_loss_mlp": 0.0105202, "balance_loss_clip": 1.1498239, "balance_loss_mlp": 1.03294647, "epoch": 0.2589508492409439, "flos": 53960756492160.0, "grad_norm": 0.8659033066560984, "language_loss": 0.57197279, "learning_rate": 3.474865258296403e-06, "loss": 0.59524053, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 1.25, "router_z_loss_mlp": 0.19042969, "step": 4307, "time_per_iteration": 4.747069597244263 }, { "auxiliary_loss_clip": 0.01534782, "auxiliary_loss_mlp": 0.01048489, "balance_loss_clip": 1.32948637, "balance_loss_mlp": 1.02272749, "epoch": 0.2590109724936119, "flos": 22135527175680.0, "grad_norm": 1.6019545817153567, "language_loss": 0.72435629, "learning_rate": 3.474602179854327e-06, "loss": 0.75018907, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.25769043, "step": 4308, "time_per_iteration": 2.8435721397399902 }, { "auxiliary_loss_clip": 0.01555887, "auxiliary_loss_mlp": 0.01046554, "balance_loss_clip": 1.34114563, "balance_loss_mlp": 1.02150774, "epoch": 0.2590710957462799, "flos": 13480658277120.0, "grad_norm": 1.7034627613247233, "language_loss": 0.84752935, "learning_rate": 3.4743390454951886e-06, "loss": 0.87355375, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 2.1484375, "router_z_loss_mlp": 0.25061035, "step": 4309, "time_per_iteration": 2.8347713947296143 }, { "auxiliary_loss_clip": 0.01529549, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.3236376, "balance_loss_mlp": 1.02059698, "epoch": 0.25913121899894787, "flos": 22316597441280.0, "grad_norm": 1.5331207278404484, "language_loss": 0.84971333, "learning_rate": 3.474075855228966e-06, "loss": 0.87545776, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.24304199, "step": 4310, "time_per_iteration": 2.852576494216919 }, { "auxiliary_loss_clip": 0.01559177, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 1.34663117, "balance_loss_mlp": 1.02460384, "epoch": 0.25919134225161583, "flos": 25822655199360.0, "grad_norm": 4.255204712540294, "language_loss": 0.78808552, "learning_rate": 3.473812609065639e-06, "loss": 0.81420696, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.28356934, "step": 4311, "time_per_iteration": 2.8843564987182617 }, { "auxiliary_loss_clip": 0.01551971, "auxiliary_loss_mlp": 0.01049885, "balance_loss_clip": 1.33880591, "balance_loss_mlp": 1.02408791, "epoch": 0.2592514655042838, "flos": 31224756585600.0, "grad_norm": 4.676813196961946, "language_loss": 0.73626405, "learning_rate": 3.4735493070151904e-06, "loss": 0.76228261, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.25805664, "step": 4312, "time_per_iteration": 2.9447107315063477 }, { "auxiliary_loss_clip": 0.01536258, "auxiliary_loss_mlp": 0.01044688, "balance_loss_clip": 1.32628775, "balance_loss_mlp": 1.02035737, "epoch": 0.25931158875695176, "flos": 18483219665280.0, "grad_norm": 1.7922669088839218, "language_loss": 0.7127347, "learning_rate": 3.4732859490876044e-06, "loss": 0.73854411, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.24353027, "step": 4313, "time_per_iteration": 2.8080410957336426 }, { "auxiliary_loss_clip": 0.01536808, "auxiliary_loss_mlp": 0.0105155, "balance_loss_clip": 1.32850313, "balance_loss_mlp": 1.02647996, "epoch": 0.2593717120096197, "flos": 19217183114880.0, "grad_norm": 1.8536092888964166, "language_loss": 0.81387448, "learning_rate": 3.473022535292867e-06, "loss": 0.83975804, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.25048828, "step": 4314, "time_per_iteration": 4.291544437408447 }, { "auxiliary_loss_clip": 0.01548774, "auxiliary_loss_mlp": 0.01062026, "balance_loss_clip": 1.33460152, "balance_loss_mlp": 1.03621757, "epoch": 0.2594318352622877, "flos": 31260436750080.0, "grad_norm": 1.990484932738092, "language_loss": 0.6831556, "learning_rate": 3.472759065640968e-06, "loss": 0.70926368, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.25805664, "step": 4315, "time_per_iteration": 2.9445948600769043 }, { "auxiliary_loss_clip": 0.01530844, "auxiliary_loss_mlp": 0.01054272, "balance_loss_clip": 1.32362318, "balance_loss_mlp": 1.0304656, "epoch": 0.25949195851495566, "flos": 22247408862720.0, "grad_norm": 1.5073653180334778, "language_loss": 0.80207789, "learning_rate": 3.4724955401418976e-06, "loss": 0.82792902, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.23828125, "step": 4316, "time_per_iteration": 4.272040367126465 }, { "auxiliary_loss_clip": 0.01548486, "auxiliary_loss_mlp": 0.01049725, "balance_loss_clip": 1.33228493, "balance_loss_mlp": 1.02321303, "epoch": 0.2595520817676236, "flos": 28087716568320.0, "grad_norm": 1.7240259445029567, "language_loss": 0.78552389, "learning_rate": 3.4722319588056487e-06, "loss": 0.81150603, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.26513672, "step": 4317, "time_per_iteration": 2.9294583797454834 }, { "auxiliary_loss_clip": 0.01536915, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.32860267, "balance_loss_mlp": 1.02990901, "epoch": 0.2596122050202916, "flos": 20199912330240.0, "grad_norm": 1.9960473553112512, "language_loss": 0.78942406, "learning_rate": 3.4719683216422163e-06, "loss": 0.81535608, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.26379395, "step": 4318, "time_per_iteration": 2.8939156532287598 }, { "auxiliary_loss_clip": 0.01526185, "auxiliary_loss_mlp": 0.01054258, "balance_loss_clip": 1.31833911, "balance_loss_mlp": 1.02687573, "epoch": 0.25967232827295955, "flos": 22538008085760.0, "grad_norm": 1.7701020642633152, "language_loss": 0.77048516, "learning_rate": 3.471704628661598e-06, "loss": 0.79628962, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.27380371, "step": 4319, "time_per_iteration": 4.279749155044556 }, { "auxiliary_loss_clip": 0.01527982, "auxiliary_loss_mlp": 0.01047231, "balance_loss_clip": 1.32004642, "balance_loss_mlp": 1.02175605, "epoch": 0.2597324515256275, "flos": 21077863292160.0, "grad_norm": 2.327396823357656, "language_loss": 0.77148795, "learning_rate": 3.4714408798737925e-06, "loss": 0.79724014, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 2.08203125, "router_z_loss_mlp": 0.25476074, "step": 4320, "time_per_iteration": 2.8872604370117188 }, { "auxiliary_loss_clip": 0.01535316, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.32543516, "balance_loss_mlp": 1.01541209, "epoch": 0.2597925747782955, "flos": 22059008939520.0, "grad_norm": 1.9484503461200693, "language_loss": 0.72303355, "learning_rate": 3.471177075288801e-06, "loss": 0.74879158, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 2.09667969, "router_z_loss_mlp": 0.25061035, "step": 4321, "time_per_iteration": 2.879722833633423 }, { "auxiliary_loss_clip": 0.01554151, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.33849108, "balance_loss_mlp": 1.01672304, "epoch": 0.2598526980309635, "flos": 19546448659200.0, "grad_norm": 2.0339592354195823, "language_loss": 0.75761747, "learning_rate": 3.4709132149166277e-06, "loss": 0.78359151, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 2.15625, "router_z_loss_mlp": 0.265625, "step": 4322, "time_per_iteration": 2.8381519317626953 }, { "auxiliary_loss_clip": 0.01538885, "auxiliary_loss_mlp": 0.01049649, "balance_loss_clip": 1.32688355, "balance_loss_mlp": 1.02252853, "epoch": 0.25991282128363147, "flos": 24505004839680.0, "grad_norm": 2.4579813246143662, "language_loss": 0.74213266, "learning_rate": 3.470649298767278e-06, "loss": 0.76801801, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.27111816, "step": 4323, "time_per_iteration": 2.93650484085083 }, { "auxiliary_loss_clip": 0.01562297, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.34208572, "balance_loss_mlp": 1.0176084, "epoch": 0.25997294453629943, "flos": 24210695543040.0, "grad_norm": 2.582305123894663, "language_loss": 0.67023969, "learning_rate": 3.4703853268507597e-06, "loss": 0.69630772, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 2.19921875, "router_z_loss_mlp": 0.26904297, "step": 4324, "time_per_iteration": 2.8541853427886963 }, { "auxiliary_loss_clip": 0.01551258, "auxiliary_loss_mlp": 0.01050293, "balance_loss_clip": 1.33917475, "balance_loss_mlp": 1.02548599, "epoch": 0.2600330677889674, "flos": 31444085969280.0, "grad_norm": 1.8090248420102464, "language_loss": 0.71789181, "learning_rate": 3.470121299177082e-06, "loss": 0.74390727, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.24829102, "step": 4325, "time_per_iteration": 2.914515972137451 }, { "auxiliary_loss_clip": 0.01541025, "auxiliary_loss_mlp": 0.01044872, "balance_loss_clip": 1.33029532, "balance_loss_mlp": 1.01884842, "epoch": 0.26009319104163536, "flos": 32278303192320.0, "grad_norm": 2.430913848089865, "language_loss": 0.73754013, "learning_rate": 3.469857215756257e-06, "loss": 0.76339906, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.26025391, "step": 4326, "time_per_iteration": 2.894897222518921 }, { "auxiliary_loss_clip": 0.01530043, "auxiliary_loss_mlp": 0.01045814, "balance_loss_clip": 1.32221341, "balance_loss_mlp": 1.02172148, "epoch": 0.26015331429430333, "flos": 26297989516800.0, "grad_norm": 1.8475483185590078, "language_loss": 0.88114959, "learning_rate": 3.4695930765982997e-06, "loss": 0.90690815, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 2.07714844, "router_z_loss_mlp": 0.24084473, "step": 4327, "time_per_iteration": 2.854595422744751 }, { "auxiliary_loss_clip": 0.01552816, "auxiliary_loss_mlp": 0.01063959, "balance_loss_clip": 1.33745885, "balance_loss_mlp": 1.03543246, "epoch": 0.2602134375469713, "flos": 21152255022720.0, "grad_norm": 1.5252139165794856, "language_loss": 0.80930191, "learning_rate": 3.4693288817132255e-06, "loss": 0.83546966, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 2.15234375, "router_z_loss_mlp": 0.28515625, "step": 4328, "time_per_iteration": 2.8479790687561035 }, { "auxiliary_loss_clip": 0.01532305, "auxiliary_loss_mlp": 0.01056845, "balance_loss_clip": 1.32056522, "balance_loss_mlp": 1.0328362, "epoch": 0.26027356079963926, "flos": 25932319891200.0, "grad_norm": 2.2093658083315346, "language_loss": 0.8832792, "learning_rate": 3.4690646311110525e-06, "loss": 0.90917075, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 2.11914062, "router_z_loss_mlp": 0.23986816, "step": 4329, "time_per_iteration": 2.8868157863616943 }, { "auxiliary_loss_clip": 0.01530486, "auxiliary_loss_mlp": 0.01054773, "balance_loss_clip": 1.32276654, "balance_loss_mlp": 1.03053808, "epoch": 0.2603336840523072, "flos": 26370345231360.0, "grad_norm": 2.9443156757842823, "language_loss": 0.79011512, "learning_rate": 3.468800324801802e-06, "loss": 0.81596768, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 2.07519531, "router_z_loss_mlp": 0.24255371, "step": 4330, "time_per_iteration": 2.8730273246765137 }, { "auxiliary_loss_clip": 0.01542487, "auxiliary_loss_mlp": 0.0105679, "balance_loss_clip": 1.32775664, "balance_loss_mlp": 1.03129053, "epoch": 0.2603938073049752, "flos": 23524130661120.0, "grad_norm": 1.4777406324452096, "language_loss": 0.76466483, "learning_rate": 3.4685359627954958e-06, "loss": 0.79065764, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 2.15039062, "router_z_loss_mlp": 0.25512695, "step": 4331, "time_per_iteration": 2.8902511596679688 }, { "auxiliary_loss_clip": 0.0153582, "auxiliary_loss_mlp": 0.01058447, "balance_loss_clip": 1.32652569, "balance_loss_mlp": 1.03368771, "epoch": 0.26045393055764315, "flos": 25385218041600.0, "grad_norm": 1.5539816484942166, "language_loss": 0.69757259, "learning_rate": 3.4682715451021584e-06, "loss": 0.72351521, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24731445, "step": 4332, "time_per_iteration": 2.8708066940307617 }, { "auxiliary_loss_clip": 0.01543269, "auxiliary_loss_mlp": 0.01055345, "balance_loss_clip": 1.32920933, "balance_loss_mlp": 1.0304184, "epoch": 0.2605140538103111, "flos": 27646478847360.0, "grad_norm": 3.18984671962052, "language_loss": 0.80787873, "learning_rate": 3.4680070717318174e-06, "loss": 0.83386493, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.24890137, "step": 4333, "time_per_iteration": 2.8994672298431396 }, { "auxiliary_loss_clip": 0.01520277, "auxiliary_loss_mlp": 0.0104979, "balance_loss_clip": 1.31424212, "balance_loss_mlp": 1.02361166, "epoch": 0.2605741770629791, "flos": 13777229813760.0, "grad_norm": 1.9272631156026911, "language_loss": 0.81590474, "learning_rate": 3.467742542694501e-06, "loss": 0.84160542, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.26208496, "step": 4334, "time_per_iteration": 2.7789146900177 }, { "auxiliary_loss_clip": 0.0153119, "auxiliary_loss_mlp": 0.01048621, "balance_loss_clip": 1.32064021, "balance_loss_mlp": 1.02222776, "epoch": 0.26063430031564705, "flos": 26042934723840.0, "grad_norm": 1.8163989708306456, "language_loss": 0.80056745, "learning_rate": 3.46747795800024e-06, "loss": 0.82636559, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.26391602, "step": 4335, "time_per_iteration": 2.922973871231079 }, { "auxiliary_loss_clip": 0.01296884, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.1663928, "balance_loss_mlp": 1.00930405, "epoch": 0.26069442356831507, "flos": 62473809628800.0, "grad_norm": 0.840472374470239, "language_loss": 0.60826683, "learning_rate": 3.467213317659068e-06, "loss": 0.63156521, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 1.3046875, "router_z_loss_mlp": 0.23632812, "step": 4336, "time_per_iteration": 3.3345608711242676 }, { "auxiliary_loss_clip": 0.01532544, "auxiliary_loss_mlp": 0.01055107, "balance_loss_clip": 1.31960869, "balance_loss_mlp": 1.02826154, "epoch": 0.26075454682098304, "flos": 13634373421440.0, "grad_norm": 2.1729115752032064, "language_loss": 0.78052145, "learning_rate": 3.46694862168102e-06, "loss": 0.80639791, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 2.12695312, "router_z_loss_mlp": 0.26879883, "step": 4337, "time_per_iteration": 2.9384918212890625 }, { "auxiliary_loss_clip": 0.01522997, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.31211329, "balance_loss_mlp": 1.02144933, "epoch": 0.260814670073651, "flos": 12133119087360.0, "grad_norm": 2.1358247085364654, "language_loss": 0.75974, "learning_rate": 3.4666838700761334e-06, "loss": 0.78545535, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.27099609, "step": 4338, "time_per_iteration": 2.8210909366607666 }, { "auxiliary_loss_clip": 0.0154091, "auxiliary_loss_mlp": 0.01055746, "balance_loss_clip": 1.32427144, "balance_loss_mlp": 1.02754128, "epoch": 0.26087479332631897, "flos": 15130967541120.0, "grad_norm": 2.5868472384030703, "language_loss": 0.82050383, "learning_rate": 3.466419062854447e-06, "loss": 0.84647036, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 2.16503906, "router_z_loss_mlp": 0.28198242, "step": 4339, "time_per_iteration": 2.828016519546509 }, { "auxiliary_loss_clip": 0.01512222, "auxiliary_loss_mlp": 0.01053674, "balance_loss_clip": 1.30565619, "balance_loss_mlp": 1.02742422, "epoch": 0.26093491657898693, "flos": 24691640215680.0, "grad_norm": 1.5850070740845916, "language_loss": 0.77593911, "learning_rate": 3.4661542000260033e-06, "loss": 0.80159807, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.26293945, "step": 4340, "time_per_iteration": 2.887030601501465 }, { "auxiliary_loss_clip": 0.0153428, "auxiliary_loss_mlp": 0.01052668, "balance_loss_clip": 1.32343149, "balance_loss_mlp": 1.02530932, "epoch": 0.2609950398316549, "flos": 25126272195840.0, "grad_norm": 1.491726682860765, "language_loss": 0.82819825, "learning_rate": 3.465889281600845e-06, "loss": 0.85406768, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 2.10644531, "router_z_loss_mlp": 0.27355957, "step": 4341, "time_per_iteration": 2.856893539428711 }, { "auxiliary_loss_clip": 0.01513396, "auxiliary_loss_mlp": 0.01052665, "balance_loss_clip": 1.30361438, "balance_loss_mlp": 1.02547324, "epoch": 0.26105516308432286, "flos": 28560336197760.0, "grad_norm": 2.7702825753716587, "language_loss": 0.77654958, "learning_rate": 3.4656243075890183e-06, "loss": 0.80221021, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.27209473, "step": 4342, "time_per_iteration": 4.305824279785156 }, { "auxiliary_loss_clip": 0.01526267, "auxiliary_loss_mlp": 0.01052165, "balance_loss_clip": 1.31479979, "balance_loss_mlp": 1.0243181, "epoch": 0.2611152863369908, "flos": 39545156787840.0, "grad_norm": 2.620990631377114, "language_loss": 0.66750324, "learning_rate": 3.4653592780005707e-06, "loss": 0.69328761, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.27856445, "step": 4343, "time_per_iteration": 3.038104772567749 }, { "auxiliary_loss_clip": 0.0153126, "auxiliary_loss_mlp": 0.01050868, "balance_loss_clip": 1.31805301, "balance_loss_mlp": 1.02544069, "epoch": 0.2611754095896588, "flos": 13743223706880.0, "grad_norm": 2.569619971373267, "language_loss": 0.74668151, "learning_rate": 3.465094192845553e-06, "loss": 0.77250278, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.25439453, "step": 4344, "time_per_iteration": 2.848231077194214 }, { "auxiliary_loss_clip": 0.01526441, "auxiliary_loss_mlp": 0.01051836, "balance_loss_clip": 1.31629348, "balance_loss_mlp": 1.02497768, "epoch": 0.26123553284232676, "flos": 21516522059520.0, "grad_norm": 3.0636397672614946, "language_loss": 0.87634742, "learning_rate": 3.4648290521340165e-06, "loss": 0.90213019, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.2689209, "step": 4345, "time_per_iteration": 2.8392138481140137 }, { "auxiliary_loss_clip": 0.01520914, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.31482935, "balance_loss_mlp": 1.02526152, "epoch": 0.2612956560949947, "flos": 21149042641920.0, "grad_norm": 2.409237343793178, "language_loss": 0.77848101, "learning_rate": 3.464563855876015e-06, "loss": 0.80419904, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.25622559, "step": 4346, "time_per_iteration": 2.8563320636749268 }, { "auxiliary_loss_clip": 0.01530399, "auxiliary_loss_mlp": 0.01058969, "balance_loss_clip": 1.31881511, "balance_loss_mlp": 1.03105056, "epoch": 0.2613557793476627, "flos": 25129891779840.0, "grad_norm": 1.481790677209732, "language_loss": 0.76229972, "learning_rate": 3.464298604081606e-06, "loss": 0.7881934, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 2.1171875, "router_z_loss_mlp": 0.27929688, "step": 4347, "time_per_iteration": 2.8837060928344727 }, { "auxiliary_loss_clip": 0.01518243, "auxiliary_loss_mlp": 0.01048316, "balance_loss_clip": 1.31027293, "balance_loss_mlp": 1.02180386, "epoch": 0.26141590260033065, "flos": 26078841112320.0, "grad_norm": 1.303315032586668, "language_loss": 0.74417424, "learning_rate": 3.4640332967608476e-06, "loss": 0.76983976, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.26513672, "step": 4348, "time_per_iteration": 2.912468433380127 }, { "auxiliary_loss_clip": 0.01532938, "auxiliary_loss_mlp": 0.0104577, "balance_loss_clip": 1.32359529, "balance_loss_mlp": 1.02046156, "epoch": 0.2614760258529987, "flos": 25712221345920.0, "grad_norm": 1.7301801179413712, "language_loss": 0.92008489, "learning_rate": 3.463767933923799e-06, "loss": 0.94587195, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 2.09472656, "router_z_loss_mlp": 0.2532959, "step": 4349, "time_per_iteration": 4.304780960083008 }, { "auxiliary_loss_clip": 0.01515136, "auxiliary_loss_mlp": 0.01044367, "balance_loss_clip": 1.31048977, "balance_loss_mlp": 1.01779556, "epoch": 0.26153614910566664, "flos": 17466077139840.0, "grad_norm": 2.1628789302838585, "language_loss": 0.81038678, "learning_rate": 3.463502515580524e-06, "loss": 0.83598179, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26550293, "step": 4350, "time_per_iteration": 2.860969066619873 }, { "auxiliary_loss_clip": 0.01506664, "auxiliary_loss_mlp": 0.01044677, "balance_loss_clip": 1.30266881, "balance_loss_mlp": 1.01785505, "epoch": 0.2615962723583346, "flos": 17721901094400.0, "grad_norm": 3.9761005585149674, "language_loss": 0.63055629, "learning_rate": 3.4632370417410866e-06, "loss": 0.6560697, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.26794434, "step": 4351, "time_per_iteration": 4.1543285846710205 }, { "auxiliary_loss_clip": 0.01525312, "auxiliary_loss_mlp": 0.01044105, "balance_loss_clip": 1.31344199, "balance_loss_mlp": 1.01719964, "epoch": 0.26165639561100257, "flos": 23267989992960.0, "grad_norm": 2.1742600309903035, "language_loss": 0.8498807, "learning_rate": 3.462971512415555e-06, "loss": 0.87557483, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 2.11621094, "router_z_loss_mlp": 0.2689209, "step": 4352, "time_per_iteration": 2.8238565921783447 }, { "auxiliary_loss_clip": 0.0128882, "auxiliary_loss_mlp": 0.01039168, "balance_loss_clip": 1.15777326, "balance_loss_mlp": 1.0118928, "epoch": 0.26171651886367053, "flos": 66766188349440.0, "grad_norm": 0.8151844094540279, "language_loss": 0.70609063, "learning_rate": 3.462705927613996e-06, "loss": 0.72937047, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 0.2734375, "step": 4353, "time_per_iteration": 3.2244582176208496 }, { "auxiliary_loss_clip": 0.01515455, "auxiliary_loss_mlp": 0.01048229, "balance_loss_clip": 1.30958331, "balance_loss_mlp": 1.02113307, "epoch": 0.2617766421163385, "flos": 22360647893760.0, "grad_norm": 1.7085689841927818, "language_loss": 0.78777766, "learning_rate": 3.4624402873464816e-06, "loss": 0.81341445, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.27099609, "step": 4354, "time_per_iteration": 4.359747886657715 }, { "auxiliary_loss_clip": 0.01528528, "auxiliary_loss_mlp": 0.01049155, "balance_loss_clip": 1.31437993, "balance_loss_mlp": 1.02297688, "epoch": 0.26183676536900646, "flos": 26078117195520.0, "grad_norm": 2.4481691411642577, "language_loss": 0.69179416, "learning_rate": 3.462174591623085e-06, "loss": 0.71757102, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 2.140625, "router_z_loss_mlp": 0.26171875, "step": 4355, "time_per_iteration": 2.8772928714752197 }, { "auxiliary_loss_clip": 0.01518498, "auxiliary_loss_mlp": 0.01043478, "balance_loss_clip": 1.30931735, "balance_loss_mlp": 1.01675153, "epoch": 0.26189688862167443, "flos": 21006367228800.0, "grad_norm": 1.915767444586632, "language_loss": 0.6803987, "learning_rate": 3.4619088404538815e-06, "loss": 0.70601845, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 2.09277344, "router_z_loss_mlp": 0.2677002, "step": 4356, "time_per_iteration": 2.8707733154296875 }, { "auxiliary_loss_clip": 0.01295549, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 1.16651011, "balance_loss_mlp": 1.00264001, "epoch": 0.2619570118743424, "flos": 65828550216960.0, "grad_norm": 0.6844197312751421, "language_loss": 0.53090227, "learning_rate": 3.4616430338489487e-06, "loss": 0.55409396, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 0.20996094, "step": 4357, "time_per_iteration": 3.224717617034912 }, { "auxiliary_loss_clip": 0.01535685, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.32270217, "balance_loss_mlp": 1.01940215, "epoch": 0.26201713512701036, "flos": 28778443971840.0, "grad_norm": 2.0394699941086, "language_loss": 0.85458797, "learning_rate": 3.4613771718183654e-06, "loss": 0.88039017, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25085449, "step": 4358, "time_per_iteration": 2.8925161361694336 }, { "auxiliary_loss_clip": 0.01557919, "auxiliary_loss_mlp": 0.01053799, "balance_loss_clip": 1.33845329, "balance_loss_mlp": 1.02773964, "epoch": 0.2620772583796783, "flos": 26443741576320.0, "grad_norm": 2.363841264826529, "language_loss": 0.68988246, "learning_rate": 3.4611112543722127e-06, "loss": 0.7159996, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 2.19140625, "router_z_loss_mlp": 0.26074219, "step": 4359, "time_per_iteration": 2.8992254734039307 }, { "auxiliary_loss_clip": 0.01540138, "auxiliary_loss_mlp": 0.01053519, "balance_loss_clip": 1.32933724, "balance_loss_mlp": 1.0279721, "epoch": 0.2621373816323463, "flos": 20166041957760.0, "grad_norm": 2.167844238716568, "language_loss": 0.79266322, "learning_rate": 3.4608452815205757e-06, "loss": 0.81859976, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 2.10742188, "router_z_loss_mlp": 0.25561523, "step": 4360, "time_per_iteration": 2.8378803730010986 }, { "auxiliary_loss_clip": 0.01505799, "auxiliary_loss_mlp": 0.01049082, "balance_loss_clip": 1.3024497, "balance_loss_mlp": 1.0239054, "epoch": 0.26219750488501425, "flos": 28632918136320.0, "grad_norm": 1.790973432712191, "language_loss": 0.68711156, "learning_rate": 3.4605792532735387e-06, "loss": 0.71266037, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.25170898, "step": 4361, "time_per_iteration": 2.932987928390503 }, { "auxiliary_loss_clip": 0.01545726, "auxiliary_loss_mlp": 0.01059664, "balance_loss_clip": 1.33280969, "balance_loss_mlp": 1.03416538, "epoch": 0.2622576281376823, "flos": 15049155663360.0, "grad_norm": 1.7496310228883967, "language_loss": 0.8450706, "learning_rate": 3.46031316964119e-06, "loss": 0.87112451, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 2.12890625, "router_z_loss_mlp": 0.25537109, "step": 4362, "time_per_iteration": 2.8265230655670166 }, { "auxiliary_loss_clip": 0.01524059, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.3179934, "balance_loss_mlp": 1.02939963, "epoch": 0.26231775139035024, "flos": 26407427984640.0, "grad_norm": 1.730811506941307, "language_loss": 0.66053694, "learning_rate": 3.4600470306336197e-06, "loss": 0.68635672, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 2.06152344, "router_z_loss_mlp": 0.28491211, "step": 4363, "time_per_iteration": 2.8585410118103027 }, { "auxiliary_loss_clip": 0.01285645, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.15961838, "balance_loss_mlp": 1.00759375, "epoch": 0.2623778746430182, "flos": 65442221700480.0, "grad_norm": 0.8908791774180849, "language_loss": 0.61177057, "learning_rate": 3.4597808362609194e-06, "loss": 0.63494134, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.23828125, "step": 4364, "time_per_iteration": 3.4337141513824463 }, { "auxiliary_loss_clip": 0.01540113, "auxiliary_loss_mlp": 0.01056836, "balance_loss_clip": 1.32813239, "balance_loss_mlp": 1.02982354, "epoch": 0.26243799789568617, "flos": 12611801520000.0, "grad_norm": 2.465566876196162, "language_loss": 0.72888923, "learning_rate": 3.459514586533184e-06, "loss": 0.75485873, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 2.11816406, "router_z_loss_mlp": 0.27001953, "step": 4365, "time_per_iteration": 2.7960169315338135 }, { "auxiliary_loss_clip": 0.01518978, "auxiliary_loss_mlp": 0.01051868, "balance_loss_clip": 1.31402755, "balance_loss_mlp": 1.02574909, "epoch": 0.26249812114835414, "flos": 28636175761920.0, "grad_norm": 2.728274289119461, "language_loss": 0.77695417, "learning_rate": 3.459248281460509e-06, "loss": 0.80266261, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.2611084, "step": 4366, "time_per_iteration": 2.903496503829956 }, { "auxiliary_loss_clip": 0.01539199, "auxiliary_loss_mlp": 0.01047038, "balance_loss_clip": 1.32962346, "balance_loss_mlp": 1.02257609, "epoch": 0.2625582444010221, "flos": 14473567572480.0, "grad_norm": 1.5587902467557506, "language_loss": 0.76693547, "learning_rate": 3.4589819210529927e-06, "loss": 0.7927978, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24462891, "step": 4367, "time_per_iteration": 2.823155641555786 }, { "auxiliary_loss_clip": 0.01530391, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.32593381, "balance_loss_mlp": 1.02519059, "epoch": 0.26261836765369007, "flos": 16620639206400.0, "grad_norm": 1.5042125857216515, "language_loss": 0.70399392, "learning_rate": 3.458715505320736e-06, "loss": 0.72980106, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.25109863, "step": 4368, "time_per_iteration": 2.7993810176849365 }, { "auxiliary_loss_clip": 0.01518554, "auxiliary_loss_mlp": 0.01051681, "balance_loss_clip": 1.31201148, "balance_loss_mlp": 1.02469254, "epoch": 0.26267849090635803, "flos": 20529132629760.0, "grad_norm": 1.875777821928032, "language_loss": 0.78976554, "learning_rate": 3.458449034273841e-06, "loss": 0.81546789, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.26977539, "step": 4369, "time_per_iteration": 2.833770513534546 }, { "auxiliary_loss_clip": 0.01516529, "auxiliary_loss_mlp": 0.01048082, "balance_loss_clip": 1.31193447, "balance_loss_mlp": 1.02239192, "epoch": 0.262738614159026, "flos": 21333687246720.0, "grad_norm": 1.8479244115291924, "language_loss": 0.84221637, "learning_rate": 3.4581825079224133e-06, "loss": 0.86786246, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.25671387, "step": 4370, "time_per_iteration": 2.849787712097168 }, { "auxiliary_loss_clip": 0.01540453, "auxiliary_loss_mlp": 0.01049839, "balance_loss_clip": 1.32729673, "balance_loss_mlp": 1.02341032, "epoch": 0.26279873741169396, "flos": 17612688850560.0, "grad_norm": 1.603238933188055, "language_loss": 0.72193408, "learning_rate": 3.4579159262765575e-06, "loss": 0.74783701, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 2.13476562, "router_z_loss_mlp": 0.2644043, "step": 4371, "time_per_iteration": 2.8413732051849365 }, { "auxiliary_loss_clip": 0.01284781, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.15733993, "balance_loss_mlp": 1.01346934, "epoch": 0.2628588606643619, "flos": 60979839707520.0, "grad_norm": 0.6857680744458399, "language_loss": 0.56481111, "learning_rate": 3.457649289346384e-06, "loss": 0.58801675, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 1.2734375, "router_z_loss_mlp": 0.22363281, "step": 4372, "time_per_iteration": 3.4885976314544678 }, { "auxiliary_loss_clip": 0.01514544, "auxiliary_loss_mlp": 0.010427, "balance_loss_clip": 1.31047761, "balance_loss_mlp": 1.01747537, "epoch": 0.2629189839170299, "flos": 27027699955200.0, "grad_norm": 1.5376102209070737, "language_loss": 0.78757548, "learning_rate": 3.4573825971420042e-06, "loss": 0.8131479, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.25244141, "step": 4373, "time_per_iteration": 2.884505033493042 }, { "auxiliary_loss_clip": 0.015151, "auxiliary_loss_mlp": 0.01041468, "balance_loss_clip": 1.31045091, "balance_loss_mlp": 1.01592135, "epoch": 0.26297910716969786, "flos": 17028820961280.0, "grad_norm": 3.5563579814290436, "language_loss": 0.72049582, "learning_rate": 3.4571158496735294e-06, "loss": 0.74606144, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.2557373, "step": 4374, "time_per_iteration": 2.926116943359375 }, { "auxiliary_loss_clip": 0.01528109, "auxiliary_loss_mlp": 0.01044733, "balance_loss_clip": 1.32208014, "balance_loss_mlp": 1.01854324, "epoch": 0.2630392304223659, "flos": 24907485749760.0, "grad_norm": 2.5622222864093898, "language_loss": 0.80978173, "learning_rate": 3.4568490469510756e-06, "loss": 0.83551013, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.26184082, "step": 4375, "time_per_iteration": 2.8686394691467285 }, { "auxiliary_loss_clip": 0.01509123, "auxiliary_loss_mlp": 0.01041323, "balance_loss_clip": 1.3056531, "balance_loss_mlp": 1.01695633, "epoch": 0.26309935367503384, "flos": 32866921785600.0, "grad_norm": 1.825715332964434, "language_loss": 0.67257237, "learning_rate": 3.4565821889847603e-06, "loss": 0.69807684, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 2.03613281, "router_z_loss_mlp": 0.24353027, "step": 4376, "time_per_iteration": 2.9296579360961914 }, { "auxiliary_loss_clip": 0.01538352, "auxiliary_loss_mlp": 0.01047721, "balance_loss_clip": 1.33103895, "balance_loss_mlp": 1.0231638, "epoch": 0.2631594769277018, "flos": 15896403388800.0, "grad_norm": 1.9027616750234337, "language_loss": 0.70038062, "learning_rate": 3.4563152757847026e-06, "loss": 0.72624135, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24560547, "step": 4377, "time_per_iteration": 4.25696325302124 }, { "auxiliary_loss_clip": 0.01519416, "auxiliary_loss_mlp": 0.01046387, "balance_loss_clip": 1.31376493, "balance_loss_mlp": 1.02128172, "epoch": 0.2632196001803698, "flos": 50822476882560.0, "grad_norm": 2.168020926484134, "language_loss": 0.80373544, "learning_rate": 3.4560483073610233e-06, "loss": 0.82939339, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.25109863, "step": 4378, "time_per_iteration": 3.078158140182495 }, { "auxiliary_loss_clip": 0.01529264, "auxiliary_loss_mlp": 0.01054869, "balance_loss_clip": 1.32640028, "balance_loss_mlp": 1.03170705, "epoch": 0.26327972343303774, "flos": 13740328039680.0, "grad_norm": 2.3896879385624468, "language_loss": 0.77744091, "learning_rate": 3.455781283723846e-06, "loss": 0.8032822, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.23156738, "step": 4379, "time_per_iteration": 2.9241106510162354 }, { "auxiliary_loss_clip": 0.01544294, "auxiliary_loss_mlp": 0.01046294, "balance_loss_clip": 1.33323979, "balance_loss_mlp": 1.02013946, "epoch": 0.2633398466857057, "flos": 23779592657280.0, "grad_norm": 1.9693124960458068, "language_loss": 0.78604573, "learning_rate": 3.4555142048832975e-06, "loss": 0.81195164, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 2.109375, "router_z_loss_mlp": 0.26159668, "step": 4380, "time_per_iteration": 2.857485055923462 }, { "auxiliary_loss_clip": 0.01526657, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.31760788, "balance_loss_mlp": 1.01630318, "epoch": 0.26339996993837367, "flos": 27611794068480.0, "grad_norm": 2.813304737557254, "language_loss": 0.65038168, "learning_rate": 3.4552470708495036e-06, "loss": 0.6760602, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.24926758, "step": 4381, "time_per_iteration": 2.879714250564575 }, { "auxiliary_loss_clip": 0.01528043, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.32215667, "balance_loss_mlp": 1.01861846, "epoch": 0.26346009319104163, "flos": 16955107902720.0, "grad_norm": 2.2188558899779776, "language_loss": 0.83001232, "learning_rate": 3.454979881632595e-06, "loss": 0.85571623, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.23730469, "step": 4382, "time_per_iteration": 2.8500680923461914 }, { "auxiliary_loss_clip": 0.01550427, "auxiliary_loss_mlp": 0.01043652, "balance_loss_clip": 1.33728158, "balance_loss_mlp": 1.01865387, "epoch": 0.2635202164437096, "flos": 37246225046400.0, "grad_norm": 1.8896720576590502, "language_loss": 0.71050298, "learning_rate": 3.4547126372427035e-06, "loss": 0.73644376, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 2.13183594, "router_z_loss_mlp": 0.24987793, "step": 4383, "time_per_iteration": 2.9775073528289795 }, { "auxiliary_loss_clip": 0.01529375, "auxiliary_loss_mlp": 0.01046995, "balance_loss_clip": 1.32548118, "balance_loss_mlp": 1.02364182, "epoch": 0.26358033969637756, "flos": 21006231494400.0, "grad_norm": 1.90476581095725, "language_loss": 0.70507365, "learning_rate": 3.4544453376899638e-06, "loss": 0.73083735, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23352051, "step": 4384, "time_per_iteration": 4.246594429016113 }, { "auxiliary_loss_clip": 0.01521961, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.31776762, "balance_loss_mlp": 1.01784182, "epoch": 0.26364046294904553, "flos": 27757681862400.0, "grad_norm": 2.084104467433158, "language_loss": 0.70594382, "learning_rate": 3.45417798298451e-06, "loss": 0.73158121, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.23937988, "step": 4385, "time_per_iteration": 2.9262397289276123 }, { "auxiliary_loss_clip": 0.0152244, "auxiliary_loss_mlp": 0.01044619, "balance_loss_clip": 1.31925964, "balance_loss_mlp": 1.0192039, "epoch": 0.2637005862017135, "flos": 22903315752960.0, "grad_norm": 1.9095303948859703, "language_loss": 0.85500282, "learning_rate": 3.453910573136482e-06, "loss": 0.88067341, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.25427246, "step": 4386, "time_per_iteration": 2.8517913818359375 }, { "auxiliary_loss_clip": 0.01522979, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.31972802, "balance_loss_mlp": 1.01819086, "epoch": 0.26376070945438146, "flos": 15057073503360.0, "grad_norm": 2.090660821915703, "language_loss": 0.78369153, "learning_rate": 3.4536431081560196e-06, "loss": 0.80934668, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.24353027, "step": 4387, "time_per_iteration": 4.163708686828613 }, { "auxiliary_loss_clip": 0.01522182, "auxiliary_loss_mlp": 0.01038966, "balance_loss_clip": 1.31920481, "balance_loss_mlp": 1.01543427, "epoch": 0.2638208327070494, "flos": 21151621595520.0, "grad_norm": 2.0069154852581423, "language_loss": 0.76522291, "learning_rate": 3.453375588053264e-06, "loss": 0.79083443, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.23522949, "step": 4388, "time_per_iteration": 2.827009677886963 }, { "auxiliary_loss_clip": 0.01518004, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.31413853, "balance_loss_mlp": 1.01471102, "epoch": 0.26388095595971744, "flos": 21735534729600.0, "grad_norm": 2.067805992440318, "language_loss": 0.87815952, "learning_rate": 3.4531080128383617e-06, "loss": 0.90372491, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23828125, "step": 4389, "time_per_iteration": 4.225706338882446 }, { "auxiliary_loss_clip": 0.01300892, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.17686319, "balance_loss_mlp": 1.01808095, "epoch": 0.2639410792123854, "flos": 65548131073920.0, "grad_norm": 0.8427475568619489, "language_loss": 0.60358632, "learning_rate": 3.452840382521457e-06, "loss": 0.62698102, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 0.20507812, "step": 4390, "time_per_iteration": 3.392824172973633 }, { "auxiliary_loss_clip": 0.01531193, "auxiliary_loss_mlp": 0.0103818, "balance_loss_clip": 1.32262111, "balance_loss_mlp": 1.01483929, "epoch": 0.2640012024650534, "flos": 23958400682880.0, "grad_norm": 1.6314567637237802, "language_loss": 0.78425699, "learning_rate": 3.4525726971127e-06, "loss": 0.80995077, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.23327637, "step": 4391, "time_per_iteration": 2.9354658126831055 }, { "auxiliary_loss_clip": 0.01298687, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.17517698, "balance_loss_mlp": 1.00955474, "epoch": 0.26406132571772134, "flos": 56474149161600.0, "grad_norm": 0.9106257356736966, "language_loss": 0.58872283, "learning_rate": 3.45230495662224e-06, "loss": 0.61201978, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.21484375, "step": 4392, "time_per_iteration": 3.2705657482147217 }, { "auxiliary_loss_clip": 0.01525385, "auxiliary_loss_mlp": 0.01046056, "balance_loss_clip": 1.31773686, "balance_loss_mlp": 1.0214988, "epoch": 0.2641214489703893, "flos": 22100616172800.0, "grad_norm": 1.8795410932087275, "language_loss": 0.70098758, "learning_rate": 3.4520371610602306e-06, "loss": 0.72670197, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24572754, "step": 4393, "time_per_iteration": 2.95015811920166 }, { "auxiliary_loss_clip": 0.01545515, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.33317471, "balance_loss_mlp": 1.01797175, "epoch": 0.26418157222305727, "flos": 16553396154240.0, "grad_norm": 2.683938649985518, "language_loss": 0.85332561, "learning_rate": 3.4517693104368267e-06, "loss": 0.8792206, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 2.12304688, "router_z_loss_mlp": 0.26013184, "step": 4394, "time_per_iteration": 2.9153456687927246 }, { "auxiliary_loss_clip": 0.01538724, "auxiliary_loss_mlp": 0.01044347, "balance_loss_clip": 1.32497358, "balance_loss_mlp": 1.01937222, "epoch": 0.26424169547572524, "flos": 18011052483840.0, "grad_norm": 4.043430004442886, "language_loss": 0.71635568, "learning_rate": 3.4515014047621856e-06, "loss": 0.74218643, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 2.13671875, "router_z_loss_mlp": 0.24938965, "step": 4395, "time_per_iteration": 2.8526618480682373 }, { "auxiliary_loss_clip": 0.01519815, "auxiliary_loss_mlp": 0.0104424, "balance_loss_clip": 1.31646478, "balance_loss_mlp": 1.02019572, "epoch": 0.2643018187283932, "flos": 16992054921600.0, "grad_norm": 1.7516845339848786, "language_loss": 0.87364721, "learning_rate": 3.4512334440464655e-06, "loss": 0.89928776, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.24035645, "step": 4396, "time_per_iteration": 2.8595776557922363 }, { "auxiliary_loss_clip": 0.01304253, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.18003905, "balance_loss_mlp": 1.01250994, "epoch": 0.26436194198106117, "flos": 59691942443520.0, "grad_norm": 0.7962026812127484, "language_loss": 0.55065805, "learning_rate": 3.4509654282998277e-06, "loss": 0.57403266, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 0.20703125, "step": 4397, "time_per_iteration": 3.13437819480896 }, { "auxiliary_loss_clip": 0.01521974, "auxiliary_loss_mlp": 0.01044664, "balance_loss_clip": 1.31868124, "balance_loss_mlp": 1.02142978, "epoch": 0.26442206523372913, "flos": 32932762248960.0, "grad_norm": 2.2016703717221047, "language_loss": 0.78573084, "learning_rate": 3.450697357532435e-06, "loss": 0.81139719, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.23217773, "step": 4398, "time_per_iteration": 2.9811439514160156 }, { "auxiliary_loss_clip": 0.01524789, "auxiliary_loss_mlp": 0.01042792, "balance_loss_clip": 1.32125902, "balance_loss_mlp": 1.01940298, "epoch": 0.2644821884863971, "flos": 21040780538880.0, "grad_norm": 1.967708778928165, "language_loss": 0.67741418, "learning_rate": 3.4504292317544534e-06, "loss": 0.70308995, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23400879, "step": 4399, "time_per_iteration": 2.8819286823272705 }, { "auxiliary_loss_clip": 0.01504437, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.30779207, "balance_loss_mlp": 1.01505387, "epoch": 0.26454231173906506, "flos": 20786449662720.0, "grad_norm": 1.554974440848081, "language_loss": 0.87214875, "learning_rate": 3.4501610509760504e-06, "loss": 0.8975662, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.22253418, "step": 4400, "time_per_iteration": 2.8537957668304443 }, { "auxiliary_loss_clip": 0.01534232, "auxiliary_loss_mlp": 0.01045038, "balance_loss_clip": 1.32601404, "balance_loss_mlp": 1.0198487, "epoch": 0.264602434991733, "flos": 16627561660800.0, "grad_norm": 1.8715207275812595, "language_loss": 0.7786777, "learning_rate": 3.4498928152073944e-06, "loss": 0.80447036, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 2.08496094, "router_z_loss_mlp": 0.25183105, "step": 4401, "time_per_iteration": 2.894500732421875 }, { "auxiliary_loss_clip": 0.01544041, "auxiliary_loss_mlp": 0.01045512, "balance_loss_clip": 1.33508027, "balance_loss_mlp": 1.02084744, "epoch": 0.26466255824440105, "flos": 19072064482560.0, "grad_norm": 1.6518533074602137, "language_loss": 0.88966846, "learning_rate": 3.4496245244586577e-06, "loss": 0.91556406, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 2.08789062, "router_z_loss_mlp": 0.2467041, "step": 4402, "time_per_iteration": 2.8054749965667725 }, { "auxiliary_loss_clip": 0.01530281, "auxiliary_loss_mlp": 0.01041311, "balance_loss_clip": 1.32660794, "balance_loss_mlp": 1.01807702, "epoch": 0.264722681497069, "flos": 22648939632000.0, "grad_norm": 1.669792516038524, "language_loss": 0.78914678, "learning_rate": 3.4493561787400137e-06, "loss": 0.81486267, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.2322998, "step": 4403, "time_per_iteration": 2.908496856689453 }, { "auxiliary_loss_clip": 0.01525939, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.32115781, "balance_loss_mlp": 1.01543581, "epoch": 0.264782804749737, "flos": 22502508900480.0, "grad_norm": 1.7591338941963925, "language_loss": 0.89979988, "learning_rate": 3.4490877780616387e-06, "loss": 0.92545432, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.24060059, "step": 4404, "time_per_iteration": 2.854583740234375 }, { "auxiliary_loss_clip": 0.01526712, "auxiliary_loss_mlp": 0.01042219, "balance_loss_clip": 1.32048154, "balance_loss_mlp": 1.01956952, "epoch": 0.26484292800240494, "flos": 16808993884800.0, "grad_norm": 1.8309025703095054, "language_loss": 0.76851112, "learning_rate": 3.448819322433709e-06, "loss": 0.79420042, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.2265625, "step": 4405, "time_per_iteration": 2.839958906173706 }, { "auxiliary_loss_clip": 0.01543763, "auxiliary_loss_mlp": 0.01041139, "balance_loss_clip": 1.33658028, "balance_loss_mlp": 1.0163908, "epoch": 0.2649030512550729, "flos": 20459446358400.0, "grad_norm": 1.704466538836302, "language_loss": 0.71071798, "learning_rate": 3.4485508118664066e-06, "loss": 0.73656702, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 2.07128906, "router_z_loss_mlp": 0.24755859, "step": 4406, "time_per_iteration": 2.8724164962768555 }, { "auxiliary_loss_clip": 0.01514227, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.31192744, "balance_loss_mlp": 1.0172317, "epoch": 0.2649631745077409, "flos": 22425538216320.0, "grad_norm": 1.8125199871560334, "language_loss": 0.84814143, "learning_rate": 3.448282246369912e-06, "loss": 0.87369299, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.23718262, "step": 4407, "time_per_iteration": 2.8510217666625977 }, { "auxiliary_loss_clip": 0.01533089, "auxiliary_loss_mlp": 0.01036134, "balance_loss_clip": 1.32822227, "balance_loss_mlp": 1.01290059, "epoch": 0.26502329776040884, "flos": 35129585180160.0, "grad_norm": 2.0657121164559538, "language_loss": 0.76798666, "learning_rate": 3.4480136259544084e-06, "loss": 0.79367888, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.2322998, "step": 4408, "time_per_iteration": 2.9734225273132324 }, { "auxiliary_loss_clip": 0.0152472, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.32181621, "balance_loss_mlp": 1.0179522, "epoch": 0.2650834210130768, "flos": 38700804729600.0, "grad_norm": 2.0387639700262685, "language_loss": 0.71677941, "learning_rate": 3.447744950630084e-06, "loss": 0.74243939, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.23352051, "step": 4409, "time_per_iteration": 2.984628915786743 }, { "auxiliary_loss_clip": 0.01535019, "auxiliary_loss_mlp": 0.01043131, "balance_loss_clip": 1.32780707, "balance_loss_mlp": 1.01794195, "epoch": 0.26514354426574477, "flos": 24727410869760.0, "grad_norm": 1.8166396241438636, "language_loss": 0.74267077, "learning_rate": 3.4474762204071253e-06, "loss": 0.76845229, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.25183105, "step": 4410, "time_per_iteration": 2.8831350803375244 }, { "auxiliary_loss_clip": 0.01526927, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.3209244, "balance_loss_mlp": 1.01397562, "epoch": 0.26520366751841273, "flos": 20349872156160.0, "grad_norm": 1.8098784404045032, "language_loss": 0.74623829, "learning_rate": 3.4472074352957244e-06, "loss": 0.77187508, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.22766113, "step": 4411, "time_per_iteration": 2.8563497066497803 }, { "auxiliary_loss_clip": 0.01511096, "auxiliary_loss_mlp": 0.01042132, "balance_loss_clip": 1.30818176, "balance_loss_mlp": 1.01715767, "epoch": 0.2652637907710807, "flos": 22353453970560.0, "grad_norm": 1.871453537657166, "language_loss": 0.83076775, "learning_rate": 3.446938595306071e-06, "loss": 0.8563, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.24975586, "step": 4412, "time_per_iteration": 4.283655643463135 }, { "auxiliary_loss_clip": 0.01530871, "auxiliary_loss_mlp": 0.010483, "balance_loss_clip": 1.32708406, "balance_loss_mlp": 1.02452922, "epoch": 0.26532391402374866, "flos": 19363613846400.0, "grad_norm": 2.531761544680092, "language_loss": 0.75505376, "learning_rate": 3.4466697004483622e-06, "loss": 0.78084546, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.23779297, "step": 4413, "time_per_iteration": 2.8651609420776367 }, { "auxiliary_loss_clip": 0.01291085, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.17188764, "balance_loss_mlp": 1.03068256, "epoch": 0.26538403727641663, "flos": 44813785438080.0, "grad_norm": 0.8808210720389538, "language_loss": 0.56982142, "learning_rate": 3.446400750732793e-06, "loss": 0.59323078, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.19140625, "step": 4414, "time_per_iteration": 3.30176043510437 }, { "auxiliary_loss_clip": 0.01508089, "auxiliary_loss_mlp": 0.01043895, "balance_loss_clip": 1.31036043, "balance_loss_mlp": 1.02055347, "epoch": 0.26544416052908465, "flos": 28193354472960.0, "grad_norm": 1.5811340610465416, "language_loss": 0.75117469, "learning_rate": 3.4461317461695625e-06, "loss": 0.77669454, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.23352051, "step": 4415, "time_per_iteration": 2.9457054138183594 }, { "auxiliary_loss_clip": 0.01532268, "auxiliary_loss_mlp": 0.0104027, "balance_loss_clip": 1.32484031, "balance_loss_mlp": 1.01406813, "epoch": 0.2655042837817526, "flos": 17573886794880.0, "grad_norm": 2.564044396578109, "language_loss": 0.87453306, "learning_rate": 3.4458626867688707e-06, "loss": 0.90025842, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.26208496, "step": 4416, "time_per_iteration": 2.881178617477417 }, { "auxiliary_loss_clip": 0.01535627, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.33059335, "balance_loss_mlp": 1.01306224, "epoch": 0.2655644070344206, "flos": 23415189886080.0, "grad_norm": 1.5074861383172853, "language_loss": 0.77671099, "learning_rate": 3.4455935725409217e-06, "loss": 0.80244827, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25036621, "step": 4417, "time_per_iteration": 2.8869640827178955 }, { "auxiliary_loss_clip": 0.01521311, "auxiliary_loss_mlp": 0.01044887, "balance_loss_clip": 1.321733, "balance_loss_mlp": 1.02139127, "epoch": 0.26562453028708854, "flos": 26479557475200.0, "grad_norm": 1.4721480089396264, "language_loss": 0.81099552, "learning_rate": 3.4453244034959196e-06, "loss": 0.83665752, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.23498535, "step": 4418, "time_per_iteration": 2.9317519664764404 }, { "auxiliary_loss_clip": 0.01525477, "auxiliary_loss_mlp": 0.01045573, "balance_loss_clip": 1.32039833, "balance_loss_mlp": 1.02165949, "epoch": 0.2656846535397565, "flos": 19216730666880.0, "grad_norm": 1.9952149919770632, "language_loss": 0.68677104, "learning_rate": 3.445055179644071e-06, "loss": 0.71248162, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.23913574, "step": 4419, "time_per_iteration": 4.282297134399414 }, { "auxiliary_loss_clip": 0.01531585, "auxiliary_loss_mlp": 0.01048353, "balance_loss_clip": 1.32636416, "balance_loss_mlp": 1.02291346, "epoch": 0.2657447767924245, "flos": 30562560668160.0, "grad_norm": 2.128527944077912, "language_loss": 0.79896045, "learning_rate": 3.444785900995585e-06, "loss": 0.82475984, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.25427246, "step": 4420, "time_per_iteration": 2.9104080200195312 }, { "auxiliary_loss_clip": 0.01537072, "auxiliary_loss_mlp": 0.01047284, "balance_loss_clip": 1.33008146, "balance_loss_mlp": 1.02099788, "epoch": 0.26580490004509244, "flos": 20932427946240.0, "grad_norm": 1.861202400116776, "language_loss": 0.82594121, "learning_rate": 3.444516567560673e-06, "loss": 0.85178483, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.26281738, "step": 4421, "time_per_iteration": 2.867424726486206 }, { "auxiliary_loss_clip": 0.01524715, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.32453895, "balance_loss_mlp": 1.02009666, "epoch": 0.2658650232977604, "flos": 43961452312320.0, "grad_norm": 1.6331566580627146, "language_loss": 0.67027295, "learning_rate": 3.444247179349548e-06, "loss": 0.6959601, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.23925781, "step": 4422, "time_per_iteration": 4.534050703048706 }, { "auxiliary_loss_clip": 0.01529654, "auxiliary_loss_mlp": 0.01049677, "balance_loss_clip": 1.32670617, "balance_loss_mlp": 1.02622819, "epoch": 0.26592514655042837, "flos": 29728569669120.0, "grad_norm": 2.068611112653832, "language_loss": 0.75676966, "learning_rate": 3.4439777363724252e-06, "loss": 0.78256297, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.23461914, "step": 4423, "time_per_iteration": 2.9582459926605225 }, { "auxiliary_loss_clip": 0.01533692, "auxiliary_loss_mlp": 0.01055753, "balance_loss_clip": 1.32853258, "balance_loss_mlp": 1.03248358, "epoch": 0.26598526980309634, "flos": 46693070507520.0, "grad_norm": 1.584457524237858, "language_loss": 0.78762293, "learning_rate": 3.443708238639522e-06, "loss": 0.81351733, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.23278809, "step": 4424, "time_per_iteration": 4.526980876922607 }, { "auxiliary_loss_clip": 0.01535373, "auxiliary_loss_mlp": 0.01050341, "balance_loss_clip": 1.33124638, "balance_loss_mlp": 1.02621305, "epoch": 0.2660453930557643, "flos": 11516692924800.0, "grad_norm": 1.9968483402637718, "language_loss": 0.80449462, "learning_rate": 3.4434386861610573e-06, "loss": 0.83035177, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.24133301, "step": 4425, "time_per_iteration": 2.85209059715271 }, { "auxiliary_loss_clip": 0.01517965, "auxiliary_loss_mlp": 0.01052054, "balance_loss_clip": 1.31816006, "balance_loss_mlp": 1.02821183, "epoch": 0.26610551630843227, "flos": 24802617006720.0, "grad_norm": 1.771907854303047, "language_loss": 0.81602776, "learning_rate": 3.4431690789472532e-06, "loss": 0.84172797, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.23815918, "step": 4426, "time_per_iteration": 2.8540475368499756 }, { "auxiliary_loss_clip": 0.01529937, "auxiliary_loss_mlp": 0.01056883, "balance_loss_clip": 1.32693303, "balance_loss_mlp": 1.03090692, "epoch": 0.26616563956110023, "flos": 27647881436160.0, "grad_norm": 1.6511327224614956, "language_loss": 0.77599728, "learning_rate": 3.442899417008333e-06, "loss": 0.80186546, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.25964355, "step": 4427, "time_per_iteration": 2.921757936477661 }, { "auxiliary_loss_clip": 0.01516432, "auxiliary_loss_mlp": 0.01042441, "balance_loss_clip": 1.3184967, "balance_loss_mlp": 1.02031541, "epoch": 0.26622576281376825, "flos": 28373746066560.0, "grad_norm": 2.7507957441061546, "language_loss": 0.77384019, "learning_rate": 3.4426297003545227e-06, "loss": 0.79942888, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.22119141, "step": 4428, "time_per_iteration": 2.9779465198516846 }, { "auxiliary_loss_clip": 0.01530985, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.32678163, "balance_loss_mlp": 1.02047276, "epoch": 0.2662858860664362, "flos": 18050533211520.0, "grad_norm": 2.2413583795885628, "language_loss": 0.83901978, "learning_rate": 3.4423599289960495e-06, "loss": 0.86476713, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.23278809, "step": 4429, "time_per_iteration": 2.8244171142578125 }, { "auxiliary_loss_clip": 0.0153072, "auxiliary_loss_mlp": 0.01048262, "balance_loss_clip": 1.32850742, "balance_loss_mlp": 1.02359784, "epoch": 0.2663460093191042, "flos": 22755663411840.0, "grad_norm": 1.8066965703403772, "language_loss": 0.73460293, "learning_rate": 3.442090102943143e-06, "loss": 0.76039279, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.24645996, "step": 4430, "time_per_iteration": 2.874713659286499 }, { "auxiliary_loss_clip": 0.01514847, "auxiliary_loss_mlp": 0.0104741, "balance_loss_clip": 1.3131429, "balance_loss_mlp": 1.0241046, "epoch": 0.26640613257177215, "flos": 16517535010560.0, "grad_norm": 1.8357819495139982, "language_loss": 0.82728243, "learning_rate": 3.441820222206035e-06, "loss": 0.85290504, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.2331543, "step": 4431, "time_per_iteration": 2.8452835083007812 }, { "auxiliary_loss_clip": 0.01539485, "auxiliary_loss_mlp": 0.01054399, "balance_loss_clip": 1.33033347, "balance_loss_mlp": 1.02912664, "epoch": 0.2664662558244401, "flos": 23086331544960.0, "grad_norm": 3.087713956260392, "language_loss": 0.76909894, "learning_rate": 3.44155028679496e-06, "loss": 0.79503775, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.25268555, "step": 4432, "time_per_iteration": 2.861431360244751 }, { "auxiliary_loss_clip": 0.01540196, "auxiliary_loss_mlp": 0.01042878, "balance_loss_clip": 1.33436728, "balance_loss_mlp": 1.01907158, "epoch": 0.2665263790771081, "flos": 23779864126080.0, "grad_norm": 1.9173016058230161, "language_loss": 0.83984083, "learning_rate": 3.441280296720154e-06, "loss": 0.86567152, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.23815918, "step": 4433, "time_per_iteration": 3.0163822174072266 }, { "auxiliary_loss_clip": 0.01519967, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.32062495, "balance_loss_mlp": 1.02554059, "epoch": 0.26658650232977604, "flos": 28012872389760.0, "grad_norm": 1.9643143083143801, "language_loss": 0.77007312, "learning_rate": 3.441010251991854e-06, "loss": 0.79577959, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.25134277, "step": 4434, "time_per_iteration": 2.9614663124084473 }, { "auxiliary_loss_clip": 0.0151703, "auxiliary_loss_mlp": 0.01047951, "balance_loss_clip": 1.31697559, "balance_loss_mlp": 1.02364469, "epoch": 0.266646625582444, "flos": 22173741048960.0, "grad_norm": 1.938497369516178, "language_loss": 0.83739114, "learning_rate": 3.440740152620301e-06, "loss": 0.86304098, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.24316406, "step": 4435, "time_per_iteration": 2.869894504547119 }, { "auxiliary_loss_clip": 0.01523381, "auxiliary_loss_mlp": 0.01056922, "balance_loss_clip": 1.31536007, "balance_loss_mlp": 1.03058839, "epoch": 0.266706748835112, "flos": 27864631866240.0, "grad_norm": 2.2367574520744564, "language_loss": 0.88681746, "learning_rate": 3.4404699986157376e-06, "loss": 0.91262054, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.26306152, "step": 4436, "time_per_iteration": 2.9244894981384277 }, { "auxiliary_loss_clip": 0.01523095, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.31738806, "balance_loss_mlp": 1.01700342, "epoch": 0.26676687208777994, "flos": 25823152892160.0, "grad_norm": 2.0180491898179187, "language_loss": 0.78995836, "learning_rate": 3.440199789988407e-06, "loss": 0.8156147, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.25561523, "step": 4437, "time_per_iteration": 2.916325569152832 }, { "auxiliary_loss_clip": 0.01513785, "auxiliary_loss_mlp": 0.01045737, "balance_loss_clip": 1.3126992, "balance_loss_mlp": 1.02016699, "epoch": 0.2668269953404479, "flos": 36078263043840.0, "grad_norm": 2.0832828580137197, "language_loss": 0.65537935, "learning_rate": 3.439929526748556e-06, "loss": 0.6809746, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.25585938, "step": 4438, "time_per_iteration": 2.944976568222046 }, { "auxiliary_loss_clip": 0.015196, "auxiliary_loss_mlp": 0.01044131, "balance_loss_clip": 1.31672001, "balance_loss_mlp": 1.01845288, "epoch": 0.26688711859311587, "flos": 26580308941440.0, "grad_norm": 1.9608864294263801, "language_loss": 0.76580644, "learning_rate": 3.4396592089064334e-06, "loss": 0.79144371, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.2565918, "step": 4439, "time_per_iteration": 2.8695287704467773 }, { "auxiliary_loss_clip": 0.01523703, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.31921792, "balance_loss_mlp": 1.01286459, "epoch": 0.26694724184578383, "flos": 26773052365440.0, "grad_norm": 1.7719274028850802, "language_loss": 0.72027147, "learning_rate": 3.4393888364722897e-06, "loss": 0.74590904, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.27185059, "step": 4440, "time_per_iteration": 2.9155635833740234 }, { "auxiliary_loss_clip": 0.01517759, "auxiliary_loss_mlp": 0.01043453, "balance_loss_clip": 1.31158912, "balance_loss_mlp": 1.01622593, "epoch": 0.2670073650984518, "flos": 20969374965120.0, "grad_norm": 1.8410703149429473, "language_loss": 0.67723465, "learning_rate": 3.439118409456376e-06, "loss": 0.70284677, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.27209473, "step": 4441, "time_per_iteration": 2.823272466659546 }, { "auxiliary_loss_clip": 0.01519301, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.31573582, "balance_loss_mlp": 1.01955354, "epoch": 0.2670674883511198, "flos": 28377727608960.0, "grad_norm": 1.8274250470550508, "language_loss": 0.77580225, "learning_rate": 3.4388479278689486e-06, "loss": 0.80146343, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.27258301, "step": 4442, "time_per_iteration": 2.912431001663208 }, { "auxiliary_loss_clip": 0.01300238, "auxiliary_loss_mlp": 0.01069606, "balance_loss_clip": 1.1741128, "balance_loss_mlp": 1.02955174, "epoch": 0.2671276116037878, "flos": 58998771820800.0, "grad_norm": 0.9371487785988872, "language_loss": 0.61317694, "learning_rate": 3.4385773917202637e-06, "loss": 0.63687539, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.40039062, "step": 4443, "time_per_iteration": 3.228618860244751 }, { "auxiliary_loss_clip": 0.01521967, "auxiliary_loss_mlp": 0.01043821, "balance_loss_clip": 1.31866336, "balance_loss_mlp": 1.01697528, "epoch": 0.26718773485645575, "flos": 43960683150720.0, "grad_norm": 7.355362075147745, "language_loss": 0.77027893, "learning_rate": 3.4383068010205793e-06, "loss": 0.79593676, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.26806641, "step": 4444, "time_per_iteration": 3.070876359939575 }, { "auxiliary_loss_clip": 0.0150962, "auxiliary_loss_mlp": 0.01049648, "balance_loss_clip": 1.30809498, "balance_loss_mlp": 1.02298057, "epoch": 0.2672478581091237, "flos": 25239194513280.0, "grad_norm": 1.6325759822643362, "language_loss": 0.81355369, "learning_rate": 3.438036155780158e-06, "loss": 0.83914632, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 2.01269531, "router_z_loss_mlp": 0.2668457, "step": 4445, "time_per_iteration": 2.87665057182312 }, { "auxiliary_loss_clip": 0.01520185, "auxiliary_loss_mlp": 0.01043872, "balance_loss_clip": 1.31591606, "balance_loss_mlp": 1.01758623, "epoch": 0.2673079813617917, "flos": 15276583866240.0, "grad_norm": 1.7372285896816426, "language_loss": 0.90442276, "learning_rate": 3.43776545600926e-06, "loss": 0.93006325, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 2.04394531, "router_z_loss_mlp": 0.26318359, "step": 4446, "time_per_iteration": 2.8198487758636475 }, { "auxiliary_loss_clip": 0.0152027, "auxiliary_loss_mlp": 0.0104747, "balance_loss_clip": 1.31642783, "balance_loss_mlp": 1.02223372, "epoch": 0.26736810461445965, "flos": 25823922053760.0, "grad_norm": 4.444928622575966, "language_loss": 0.68711305, "learning_rate": 3.437494701718153e-06, "loss": 0.71279049, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.25256348, "step": 4447, "time_per_iteration": 4.251283168792725 }, { "auxiliary_loss_clip": 0.01519034, "auxiliary_loss_mlp": 0.01041955, "balance_loss_clip": 1.31543767, "balance_loss_mlp": 1.01562202, "epoch": 0.2674282278671276, "flos": 24322441495680.0, "grad_norm": 2.450978689143095, "language_loss": 0.84392059, "learning_rate": 3.4372238929171026e-06, "loss": 0.8695305, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.2635498, "step": 4448, "time_per_iteration": 2.8767435550689697 }, { "auxiliary_loss_clip": 0.01506109, "auxiliary_loss_mlp": 0.01056177, "balance_loss_clip": 1.30550075, "balance_loss_mlp": 1.02886617, "epoch": 0.2674883511197956, "flos": 22824535276800.0, "grad_norm": 1.4621392229313053, "language_loss": 0.85171175, "learning_rate": 3.436953029616378e-06, "loss": 0.87733459, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.27307129, "step": 4449, "time_per_iteration": 2.8537991046905518 }, { "auxiliary_loss_clip": 0.01537159, "auxiliary_loss_mlp": 0.0105105, "balance_loss_clip": 1.32391691, "balance_loss_mlp": 1.02342963, "epoch": 0.26754847437246354, "flos": 25380014889600.0, "grad_norm": 1.6417443680541726, "language_loss": 0.84410179, "learning_rate": 3.4366821118262506e-06, "loss": 0.86998379, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 2.13085938, "router_z_loss_mlp": 0.27648926, "step": 4450, "time_per_iteration": 2.8755767345428467 }, { "auxiliary_loss_clip": 0.01493202, "auxiliary_loss_mlp": 0.01049759, "balance_loss_clip": 1.29561567, "balance_loss_mlp": 1.02381921, "epoch": 0.2676085976251315, "flos": 20240162219520.0, "grad_norm": 4.114279523344304, "language_loss": 0.81907201, "learning_rate": 3.4364111395569937e-06, "loss": 0.84450155, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.25939941, "step": 4451, "time_per_iteration": 2.8153154850006104 }, { "auxiliary_loss_clip": 0.01511406, "auxiliary_loss_mlp": 0.0104701, "balance_loss_clip": 1.31207836, "balance_loss_mlp": 1.02195239, "epoch": 0.26766872087779947, "flos": 28049955143040.0, "grad_norm": 1.6147705887826143, "language_loss": 0.86888748, "learning_rate": 3.436140112818882e-06, "loss": 0.89447165, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.25048828, "step": 4452, "time_per_iteration": 2.923161268234253 }, { "auxiliary_loss_clip": 0.01520066, "auxiliary_loss_mlp": 0.01051527, "balance_loss_clip": 1.31836283, "balance_loss_mlp": 1.02487135, "epoch": 0.26772884413046744, "flos": 18332354943360.0, "grad_norm": 2.3917675978850377, "language_loss": 0.84740114, "learning_rate": 3.435869031622194e-06, "loss": 0.87311703, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.26660156, "step": 4453, "time_per_iteration": 2.8075778484344482 }, { "auxiliary_loss_clip": 0.01516238, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.31352663, "balance_loss_mlp": 1.03206015, "epoch": 0.2677889673831354, "flos": 22137698926080.0, "grad_norm": 1.6747056314030577, "language_loss": 0.80427641, "learning_rate": 3.435597895977208e-06, "loss": 0.83003217, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.27319336, "step": 4454, "time_per_iteration": 4.3820905685424805 }, { "auxiliary_loss_clip": 0.01524215, "auxiliary_loss_mlp": 0.01046112, "balance_loss_clip": 1.31974483, "balance_loss_mlp": 1.02106643, "epoch": 0.2678490906358034, "flos": 23739478502400.0, "grad_norm": 1.6681156457499626, "language_loss": 0.73431039, "learning_rate": 3.435326705894206e-06, "loss": 0.76001364, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25061035, "step": 4455, "time_per_iteration": 2.910524606704712 }, { "auxiliary_loss_clip": 0.0151458, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.31688273, "balance_loss_mlp": 1.0205946, "epoch": 0.2679092138884714, "flos": 21773024686080.0, "grad_norm": 1.4910234440050816, "language_loss": 0.74514723, "learning_rate": 3.435055461383471e-06, "loss": 0.77073967, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.24072266, "step": 4456, "time_per_iteration": 2.896707534790039 }, { "auxiliary_loss_clip": 0.01535003, "auxiliary_loss_mlp": 0.01046038, "balance_loss_clip": 1.32860148, "balance_loss_mlp": 1.02118301, "epoch": 0.26796933714113935, "flos": 19869696645120.0, "grad_norm": 2.927129881848539, "language_loss": 0.72150171, "learning_rate": 3.4347841624552896e-06, "loss": 0.74731213, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24865723, "step": 4457, "time_per_iteration": 4.166894197463989 }, { "auxiliary_loss_clip": 0.01540205, "auxiliary_loss_mlp": 0.01041767, "balance_loss_clip": 1.33508384, "balance_loss_mlp": 1.01716197, "epoch": 0.2680294603938073, "flos": 20057553630720.0, "grad_norm": 1.6220325537808822, "language_loss": 0.80135399, "learning_rate": 3.4345128091199493e-06, "loss": 0.82717371, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24609375, "step": 4458, "time_per_iteration": 2.846040725708008 }, { "auxiliary_loss_clip": 0.01296127, "auxiliary_loss_mlp": 0.01022009, "balance_loss_clip": 1.1768508, "balance_loss_mlp": 1.00160015, "epoch": 0.2680895836464753, "flos": 72146590214400.0, "grad_norm": 0.8901027373155399, "language_loss": 0.58835876, "learning_rate": 3.434241401387739e-06, "loss": 0.61154014, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.20410156, "step": 4459, "time_per_iteration": 4.736678123474121 }, { "auxiliary_loss_clip": 0.01521079, "auxiliary_loss_mlp": 0.01044809, "balance_loss_clip": 1.31899548, "balance_loss_mlp": 1.02111018, "epoch": 0.26814970689914325, "flos": 20458767686400.0, "grad_norm": 1.8678507906264046, "language_loss": 0.85950977, "learning_rate": 3.4339699392689507e-06, "loss": 0.88516861, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23718262, "step": 4460, "time_per_iteration": 2.8161098957061768 }, { "auxiliary_loss_clip": 0.01524257, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.32424283, "balance_loss_mlp": 1.02066898, "epoch": 0.2682098301518112, "flos": 17575651342080.0, "grad_norm": 1.7172688425154983, "language_loss": 0.68880856, "learning_rate": 3.4336984227738796e-06, "loss": 0.71449018, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.23217773, "step": 4461, "time_per_iteration": 2.8570456504821777 }, { "auxiliary_loss_clip": 0.0152097, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.32015753, "balance_loss_mlp": 1.02363312, "epoch": 0.2682699534044792, "flos": 18342535023360.0, "grad_norm": 1.4548136871606214, "language_loss": 0.68235868, "learning_rate": 3.43342685191282e-06, "loss": 0.70804799, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.24353027, "step": 4462, "time_per_iteration": 2.8807334899902344 }, { "auxiliary_loss_clip": 0.0152623, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.32519603, "balance_loss_mlp": 1.02082896, "epoch": 0.26833007665714714, "flos": 25312183655040.0, "grad_norm": 1.8316420456471962, "language_loss": 0.7070933, "learning_rate": 3.4331552266960705e-06, "loss": 0.73280537, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.24145508, "step": 4463, "time_per_iteration": 2.9418694972991943 }, { "auxiliary_loss_clip": 0.01537395, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.33017135, "balance_loss_mlp": 1.02134264, "epoch": 0.2683901999098151, "flos": 16106412343680.0, "grad_norm": 4.130139473240399, "language_loss": 0.78491861, "learning_rate": 3.432883547133931e-06, "loss": 0.81074202, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.23632812, "step": 4464, "time_per_iteration": 2.913673162460327 }, { "auxiliary_loss_clip": 0.01528349, "auxiliary_loss_mlp": 0.01044006, "balance_loss_clip": 1.32602525, "balance_loss_mlp": 1.02142799, "epoch": 0.2684503231624831, "flos": 27319475543040.0, "grad_norm": 1.824387969057913, "language_loss": 0.71600354, "learning_rate": 3.432611813236704e-06, "loss": 0.74172717, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.22595215, "step": 4465, "time_per_iteration": 2.8759219646453857 }, { "auxiliary_loss_clip": 0.0129338, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.17613339, "balance_loss_mlp": 1.01646614, "epoch": 0.26851044641515104, "flos": 71890675770240.0, "grad_norm": 0.7178615314687674, "language_loss": 0.53256559, "learning_rate": 3.4323400250146943e-06, "loss": 0.5558691, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.20507812, "step": 4466, "time_per_iteration": 3.506331205368042 }, { "auxiliary_loss_clip": 0.01523212, "auxiliary_loss_mlp": 0.01047766, "balance_loss_clip": 1.32113171, "balance_loss_mlp": 1.02406716, "epoch": 0.268570569667819, "flos": 18742934672640.0, "grad_norm": 2.4058971636445454, "language_loss": 0.74208295, "learning_rate": 3.4320681824782057e-06, "loss": 0.7677927, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23706055, "step": 4467, "time_per_iteration": 2.8276045322418213 }, { "auxiliary_loss_clip": 0.01541312, "auxiliary_loss_mlp": 0.01044913, "balance_loss_clip": 1.33442187, "balance_loss_mlp": 1.02142882, "epoch": 0.268630692920487, "flos": 18185290784640.0, "grad_norm": 3.807262057356128, "language_loss": 0.82034016, "learning_rate": 3.4317962856375493e-06, "loss": 0.84620237, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.23486328, "step": 4468, "time_per_iteration": 2.8067502975463867 }, { "auxiliary_loss_clip": 0.01295298, "auxiliary_loss_mlp": 0.01046181, "balance_loss_clip": 1.17366219, "balance_loss_mlp": 1.02920556, "epoch": 0.268690816173155, "flos": 68766621010560.0, "grad_norm": 0.8568809874710711, "language_loss": 0.59745157, "learning_rate": 3.4315243345030334e-06, "loss": 0.62086642, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.16992188, "step": 4469, "time_per_iteration": 3.368896484375 }, { "auxiliary_loss_clip": 0.01532753, "auxiliary_loss_mlp": 0.01044411, "balance_loss_clip": 1.32833123, "balance_loss_mlp": 1.01919794, "epoch": 0.26875093942582295, "flos": 23303760647040.0, "grad_norm": 1.9444616399261607, "language_loss": 0.82342112, "learning_rate": 3.431252329084972e-06, "loss": 0.84919274, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 2.04296875, "router_z_loss_mlp": 0.25219727, "step": 4470, "time_per_iteration": 2.8601126670837402 }, { "auxiliary_loss_clip": 0.01512829, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.315346, "balance_loss_mlp": 1.0181303, "epoch": 0.2688110626784909, "flos": 21553423833600.0, "grad_norm": 1.7073796038118079, "language_loss": 0.837511, "learning_rate": 3.4309802693936786e-06, "loss": 0.86305916, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.23852539, "step": 4471, "time_per_iteration": 2.8547284603118896 }, { "auxiliary_loss_clip": 0.01511735, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.31636357, "balance_loss_mlp": 1.01322162, "epoch": 0.2688711859311589, "flos": 28411778960640.0, "grad_norm": 2.6637533667195177, "language_loss": 0.7037167, "learning_rate": 3.43070815543947e-06, "loss": 0.72919405, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.22753906, "step": 4472, "time_per_iteration": 2.91843581199646 }, { "auxiliary_loss_clip": 0.01514896, "auxiliary_loss_mlp": 0.01046724, "balance_loss_clip": 1.31592548, "balance_loss_mlp": 1.02328753, "epoch": 0.26893130918382685, "flos": 26006123439360.0, "grad_norm": 1.6439356154777058, "language_loss": 0.69349527, "learning_rate": 3.4304359872326656e-06, "loss": 0.7191115, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.234375, "step": 4473, "time_per_iteration": 2.9201087951660156 }, { "auxiliary_loss_clip": 0.01515815, "auxiliary_loss_mlp": 0.01041825, "balance_loss_clip": 1.31863236, "balance_loss_mlp": 1.01835322, "epoch": 0.2689914324364948, "flos": 20349329218560.0, "grad_norm": 1.6951365580709838, "language_loss": 0.84205401, "learning_rate": 3.4301637647835843e-06, "loss": 0.86763048, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.23498535, "step": 4474, "time_per_iteration": 2.894864320755005 }, { "auxiliary_loss_clip": 0.01523863, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.32642031, "balance_loss_mlp": 1.01909184, "epoch": 0.2690515556891628, "flos": 19474183434240.0, "grad_norm": 2.2175235405452702, "language_loss": 0.71801925, "learning_rate": 3.4298914881025494e-06, "loss": 0.7436806, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.23193359, "step": 4475, "time_per_iteration": 2.8265228271484375 }, { "auxiliary_loss_clip": 0.01531614, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.32857573, "balance_loss_mlp": 1.01620603, "epoch": 0.26911167894183075, "flos": 18154768527360.0, "grad_norm": 1.7490499895849227, "language_loss": 0.74200571, "learning_rate": 3.4296191571998863e-06, "loss": 0.76773083, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.24682617, "step": 4476, "time_per_iteration": 2.899834156036377 }, { "auxiliary_loss_clip": 0.01518713, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.32059312, "balance_loss_mlp": 1.01234245, "epoch": 0.2691718021944987, "flos": 19984835957760.0, "grad_norm": 2.081641303547754, "language_loss": 0.81800151, "learning_rate": 3.429346772085922e-06, "loss": 0.84354305, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.23120117, "step": 4477, "time_per_iteration": 2.844987154006958 }, { "auxiliary_loss_clip": 0.01534855, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.33069563, "balance_loss_mlp": 1.01776171, "epoch": 0.2692319254471667, "flos": 37460260788480.0, "grad_norm": 1.6903498597265838, "language_loss": 0.665362, "learning_rate": 3.429074332770984e-06, "loss": 0.69112527, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.23681641, "step": 4478, "time_per_iteration": 2.975780487060547 }, { "auxiliary_loss_clip": 0.01514371, "auxiliary_loss_mlp": 0.01037136, "balance_loss_clip": 1.31386089, "balance_loss_mlp": 1.01411712, "epoch": 0.26929204869983464, "flos": 22138060884480.0, "grad_norm": 2.07843198557456, "language_loss": 0.81425226, "learning_rate": 3.4288018392654047e-06, "loss": 0.83976734, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.23046875, "step": 4479, "time_per_iteration": 2.8427698612213135 }, { "auxiliary_loss_clip": 0.01531587, "auxiliary_loss_mlp": 0.01047432, "balance_loss_clip": 1.32863688, "balance_loss_mlp": 1.02345896, "epoch": 0.2693521719525026, "flos": 19802498837760.0, "grad_norm": 2.0892483971033022, "language_loss": 0.81991559, "learning_rate": 3.4285292915795166e-06, "loss": 0.84570581, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.2401123, "step": 4480, "time_per_iteration": 2.8133246898651123 }, { "auxiliary_loss_clip": 0.01515571, "auxiliary_loss_mlp": 0.01042121, "balance_loss_clip": 1.3169663, "balance_loss_mlp": 1.0184226, "epoch": 0.2694122952051706, "flos": 21003743030400.0, "grad_norm": 1.5643643635270468, "language_loss": 0.78294063, "learning_rate": 3.4282566897236543e-06, "loss": 0.80851746, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.23693848, "step": 4481, "time_per_iteration": 2.8998193740844727 }, { "auxiliary_loss_clip": 0.01518088, "auxiliary_loss_mlp": 0.01045882, "balance_loss_clip": 1.31754112, "balance_loss_mlp": 1.02175379, "epoch": 0.2694724184578386, "flos": 25860507114240.0, "grad_norm": 1.7556804992802264, "language_loss": 0.74469578, "learning_rate": 3.4279840337081547e-06, "loss": 0.77033544, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.24133301, "step": 4482, "time_per_iteration": 4.287272214889526 }, { "auxiliary_loss_clip": 0.01520579, "auxiliary_loss_mlp": 0.01039507, "balance_loss_clip": 1.32060075, "balance_loss_mlp": 1.01536703, "epoch": 0.26953254171050656, "flos": 21737254032000.0, "grad_norm": 1.7941885585558535, "language_loss": 0.73235637, "learning_rate": 3.4277113235433584e-06, "loss": 0.75795722, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.24133301, "step": 4483, "time_per_iteration": 2.8306100368499756 }, { "auxiliary_loss_clip": 0.015187, "auxiliary_loss_mlp": 0.01045371, "balance_loss_clip": 1.31381774, "balance_loss_mlp": 1.02095699, "epoch": 0.2695926649631745, "flos": 19692472187520.0, "grad_norm": 1.9586012438449065, "language_loss": 0.87803388, "learning_rate": 3.427438559239605e-06, "loss": 0.9036746, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.24438477, "step": 4484, "time_per_iteration": 2.9276130199432373 }, { "auxiliary_loss_clip": 0.0152582, "auxiliary_loss_mlp": 0.01043915, "balance_loss_clip": 1.32417393, "balance_loss_mlp": 1.01988268, "epoch": 0.2696527882158425, "flos": 32898484673280.0, "grad_norm": 1.6773623536912314, "language_loss": 0.67409897, "learning_rate": 3.427165740807239e-06, "loss": 0.69979632, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.24035645, "step": 4485, "time_per_iteration": 2.917393207550049 }, { "auxiliary_loss_clip": 0.01521215, "auxiliary_loss_mlp": 0.01042859, "balance_loss_clip": 1.32023418, "balance_loss_mlp": 1.01860046, "epoch": 0.26971291146851045, "flos": 12130992581760.0, "grad_norm": 2.5273858962031874, "language_loss": 0.74549788, "learning_rate": 3.426892868256604e-06, "loss": 0.77113867, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24279785, "step": 4486, "time_per_iteration": 2.8294968605041504 }, { "auxiliary_loss_clip": 0.01532343, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 1.32925034, "balance_loss_mlp": 1.01673067, "epoch": 0.2697730347211784, "flos": 22643781724800.0, "grad_norm": 2.075857005406408, "language_loss": 0.85435903, "learning_rate": 3.4266199415980495e-06, "loss": 0.88009489, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.24511719, "step": 4487, "time_per_iteration": 2.8783228397369385 }, { "auxiliary_loss_clip": 0.01549794, "auxiliary_loss_mlp": 0.01047499, "balance_loss_clip": 1.34115684, "balance_loss_mlp": 1.02223873, "epoch": 0.2698331579738464, "flos": 23523361499520.0, "grad_norm": 2.587359203731111, "language_loss": 0.73266125, "learning_rate": 3.4263469608419234e-06, "loss": 0.75863415, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.25292969, "step": 4488, "time_per_iteration": 2.876934051513672 }, { "auxiliary_loss_clip": 0.01531321, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.32812726, "balance_loss_mlp": 1.02676845, "epoch": 0.26989328122651435, "flos": 24650847388800.0, "grad_norm": 1.6458859640149677, "language_loss": 0.84273887, "learning_rate": 3.426073925998578e-06, "loss": 0.86857498, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.25549316, "step": 4489, "time_per_iteration": 4.276723861694336 }, { "auxiliary_loss_clip": 0.01544081, "auxiliary_loss_mlp": 0.01053778, "balance_loss_clip": 1.3387084, "balance_loss_mlp": 1.02841067, "epoch": 0.2699534044791823, "flos": 10777752547200.0, "grad_norm": 2.365556009860475, "language_loss": 0.91654402, "learning_rate": 3.4258008370783656e-06, "loss": 0.94252259, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.25390625, "step": 4490, "time_per_iteration": 2.8266777992248535 }, { "auxiliary_loss_clip": 0.01522404, "auxiliary_loss_mlp": 0.01048883, "balance_loss_clip": 1.32380557, "balance_loss_mlp": 1.02541089, "epoch": 0.2700135277318503, "flos": 36184670110080.0, "grad_norm": 2.422056637120002, "language_loss": 0.73936009, "learning_rate": 3.4255276940916434e-06, "loss": 0.765073, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.23498535, "step": 4491, "time_per_iteration": 3.000488519668579 }, { "auxiliary_loss_clip": 0.0153894, "auxiliary_loss_mlp": 0.0104753, "balance_loss_clip": 1.33541024, "balance_loss_mlp": 1.02322268, "epoch": 0.27007365098451824, "flos": 17427003615360.0, "grad_norm": 2.6415423951019754, "language_loss": 0.74846494, "learning_rate": 3.4252544970487676e-06, "loss": 0.7743296, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 2.03613281, "router_z_loss_mlp": 0.24328613, "step": 4492, "time_per_iteration": 4.165241718292236 }, { "auxiliary_loss_clip": 0.01529333, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.32946682, "balance_loss_mlp": 1.02342701, "epoch": 0.2701337742371862, "flos": 23196177216000.0, "grad_norm": 2.2112982659418963, "language_loss": 0.90074587, "learning_rate": 3.4249812459600986e-06, "loss": 0.92652023, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.24707031, "step": 4493, "time_per_iteration": 2.8973934650421143 }, { "auxiliary_loss_clip": 0.01544985, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.34319484, "balance_loss_mlp": 1.02015185, "epoch": 0.2701938974898542, "flos": 24400181341440.0, "grad_norm": 1.476103825633319, "language_loss": 0.72259426, "learning_rate": 3.424707940835998e-06, "loss": 0.74848908, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.24353027, "step": 4494, "time_per_iteration": 4.419919967651367 }, { "auxiliary_loss_clip": 0.01534548, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.33494902, "balance_loss_mlp": 1.01889145, "epoch": 0.2702540207425222, "flos": 26225679047040.0, "grad_norm": 1.9245671842252807, "language_loss": 0.87764406, "learning_rate": 3.42443458168683e-06, "loss": 0.90340853, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.23010254, "step": 4495, "time_per_iteration": 2.9214870929718018 }, { "auxiliary_loss_clip": 0.01537274, "auxiliary_loss_mlp": 0.01040778, "balance_loss_clip": 1.335096, "balance_loss_mlp": 1.01769948, "epoch": 0.27031414399519016, "flos": 22935783536640.0, "grad_norm": 1.6715632523666182, "language_loss": 0.77769727, "learning_rate": 3.424161168522959e-06, "loss": 0.80347776, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23071289, "step": 4496, "time_per_iteration": 2.839212417602539 }, { "auxiliary_loss_clip": 0.01292383, "auxiliary_loss_mlp": 0.01028138, "balance_loss_clip": 1.17786098, "balance_loss_mlp": 1.00439167, "epoch": 0.2703742672478581, "flos": 63047904641280.0, "grad_norm": 0.6929025080574113, "language_loss": 0.50212032, "learning_rate": 3.423887701354754e-06, "loss": 0.52532554, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.23730469, "step": 4497, "time_per_iteration": 3.3999369144439697 }, { "auxiliary_loss_clip": 0.01549161, "auxiliary_loss_mlp": 0.01043094, "balance_loss_clip": 1.34698558, "balance_loss_mlp": 1.02036071, "epoch": 0.2704343905005261, "flos": 18849386983680.0, "grad_norm": 1.7815894381854147, "language_loss": 0.7360279, "learning_rate": 3.4236141801925847e-06, "loss": 0.76195049, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.22741699, "step": 4498, "time_per_iteration": 2.859025478363037 }, { "auxiliary_loss_clip": 0.01293681, "auxiliary_loss_mlp": 0.01026668, "balance_loss_clip": 1.17508101, "balance_loss_mlp": 1.00435245, "epoch": 0.27049451375319405, "flos": 71264793444480.0, "grad_norm": 0.7573861723297939, "language_loss": 0.59180617, "learning_rate": 3.4233406050468237e-06, "loss": 0.61500967, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.22363281, "step": 4499, "time_per_iteration": 3.299910306930542 }, { "auxiliary_loss_clip": 0.01544718, "auxiliary_loss_mlp": 0.01043233, "balance_loss_clip": 1.3423903, "balance_loss_mlp": 1.01945126, "epoch": 0.270554637005862, "flos": 24288933081600.0, "grad_norm": 2.395059282927292, "language_loss": 0.74327737, "learning_rate": 3.4230669759278438e-06, "loss": 0.76915693, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.23803711, "step": 4500, "time_per_iteration": 2.84537935256958 }, { "auxiliary_loss_clip": 0.01531424, "auxiliary_loss_mlp": 0.01043595, "balance_loss_clip": 1.33205068, "balance_loss_mlp": 1.01982522, "epoch": 0.27061476025853, "flos": 17639410544640.0, "grad_norm": 3.464175558924343, "language_loss": 0.82455879, "learning_rate": 3.4227932928460215e-06, "loss": 0.85030895, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.2376709, "step": 4501, "time_per_iteration": 2.7714593410491943 }, { "auxiliary_loss_clip": 0.01551137, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.34441113, "balance_loss_mlp": 1.0220468, "epoch": 0.27067488351119795, "flos": 22720164226560.0, "grad_norm": 1.8646656525468983, "language_loss": 0.73067367, "learning_rate": 3.422519555811735e-06, "loss": 0.75665188, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24621582, "step": 4502, "time_per_iteration": 2.8765456676483154 }, { "auxiliary_loss_clip": 0.01548797, "auxiliary_loss_mlp": 0.01039329, "balance_loss_clip": 1.34215581, "balance_loss_mlp": 1.0148195, "epoch": 0.2707350067638659, "flos": 41734695306240.0, "grad_norm": 1.9024346202620086, "language_loss": 0.69334483, "learning_rate": 3.4222457648353642e-06, "loss": 0.71922612, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 2.06738281, "router_z_loss_mlp": 0.24511719, "step": 4503, "time_per_iteration": 3.0678322315216064 }, { "auxiliary_loss_clip": 0.01532001, "auxiliary_loss_mlp": 0.0104219, "balance_loss_clip": 1.32947779, "balance_loss_mlp": 1.01812184, "epoch": 0.2707951300165339, "flos": 20202536528640.0, "grad_norm": 2.091670978712513, "language_loss": 0.6880694, "learning_rate": 3.4219719199272918e-06, "loss": 0.71381128, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.24060059, "step": 4504, "time_per_iteration": 2.8131747245788574 }, { "auxiliary_loss_clip": 0.01539801, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.33672678, "balance_loss_mlp": 1.01935232, "epoch": 0.27085525326920185, "flos": 21443894876160.0, "grad_norm": 1.639890872603608, "language_loss": 0.7628355, "learning_rate": 3.421698021097902e-06, "loss": 0.78867179, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.24499512, "step": 4505, "time_per_iteration": 2.8475844860076904 }, { "auxiliary_loss_clip": 0.01549067, "auxiliary_loss_mlp": 0.01045028, "balance_loss_clip": 1.34139395, "balance_loss_mlp": 1.01957631, "epoch": 0.2709153765218698, "flos": 17684003934720.0, "grad_norm": 3.1035579874120405, "language_loss": 0.7515623, "learning_rate": 3.42142406835758e-06, "loss": 0.77750325, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.2545166, "step": 4506, "time_per_iteration": 2.937532663345337 }, { "auxiliary_loss_clip": 0.01540674, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.33571398, "balance_loss_mlp": 1.01951694, "epoch": 0.2709754997745378, "flos": 24465388377600.0, "grad_norm": 1.9194779144569616, "language_loss": 0.81609458, "learning_rate": 3.421150061716715e-06, "loss": 0.84194148, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.24511719, "step": 4507, "time_per_iteration": 2.8937251567840576 }, { "auxiliary_loss_clip": 0.01277042, "auxiliary_loss_mlp": 0.01025878, "balance_loss_clip": 1.16090786, "balance_loss_mlp": 1.00585079, "epoch": 0.2710356230272058, "flos": 65239976868480.0, "grad_norm": 0.7388666150128858, "language_loss": 0.50854933, "learning_rate": 3.420876001185698e-06, "loss": 0.53157854, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.20019531, "step": 4508, "time_per_iteration": 3.261735200881958 }, { "auxiliary_loss_clip": 0.0152474, "auxiliary_loss_mlp": 0.01046376, "balance_loss_clip": 1.32556009, "balance_loss_mlp": 1.02285612, "epoch": 0.27109574627987376, "flos": 25495561405440.0, "grad_norm": 1.991897825364481, "language_loss": 0.75892687, "learning_rate": 3.4206018867749197e-06, "loss": 0.78463805, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.23535156, "step": 4509, "time_per_iteration": 2.8819637298583984 }, { "auxiliary_loss_clip": 0.01505652, "auxiliary_loss_mlp": 0.01051832, "balance_loss_clip": 1.30797935, "balance_loss_mlp": 1.02886069, "epoch": 0.2711558695325417, "flos": 19692291208320.0, "grad_norm": 1.7903139396306431, "language_loss": 0.72443724, "learning_rate": 3.4203277184947757e-06, "loss": 0.75001204, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.22961426, "step": 4510, "time_per_iteration": 2.864017963409424 }, { "auxiliary_loss_clip": 0.01525966, "auxiliary_loss_mlp": 0.01045097, "balance_loss_clip": 1.32391834, "balance_loss_mlp": 1.02164865, "epoch": 0.2712159927852097, "flos": 18596865899520.0, "grad_norm": 2.206869137674953, "language_loss": 0.71527231, "learning_rate": 3.4200534963556627e-06, "loss": 0.74098301, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.23449707, "step": 4511, "time_per_iteration": 2.7969539165496826 }, { "auxiliary_loss_clip": 0.01538405, "auxiliary_loss_mlp": 0.01051071, "balance_loss_clip": 1.33237386, "balance_loss_mlp": 1.0265379, "epoch": 0.27127611603787766, "flos": 25641358709760.0, "grad_norm": 1.9821475581319983, "language_loss": 0.8225131, "learning_rate": 3.419779220367979e-06, "loss": 0.84840786, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.2454834, "step": 4512, "time_per_iteration": 2.8763821125030518 }, { "auxiliary_loss_clip": 0.01523746, "auxiliary_loss_mlp": 0.01049393, "balance_loss_clip": 1.32336104, "balance_loss_mlp": 1.02512217, "epoch": 0.2713362392905456, "flos": 23159365931520.0, "grad_norm": 3.338849466002978, "language_loss": 0.80903387, "learning_rate": 3.419504890542124e-06, "loss": 0.83476526, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.24279785, "step": 4513, "time_per_iteration": 2.8452086448669434 }, { "auxiliary_loss_clip": 0.01533667, "auxiliary_loss_mlp": 0.01051156, "balance_loss_clip": 1.32888484, "balance_loss_mlp": 1.0280174, "epoch": 0.2713963625432136, "flos": 18374369379840.0, "grad_norm": 2.684706151923164, "language_loss": 0.88874876, "learning_rate": 3.4192305068885026e-06, "loss": 0.91459703, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.23144531, "step": 4514, "time_per_iteration": 2.8170228004455566 }, { "auxiliary_loss_clip": 0.01538351, "auxiliary_loss_mlp": 0.01049571, "balance_loss_clip": 1.33589149, "balance_loss_mlp": 1.02620554, "epoch": 0.27145648579588155, "flos": 22501875473280.0, "grad_norm": 1.5879991795552901, "language_loss": 0.92382729, "learning_rate": 3.418956069417517e-06, "loss": 0.94970649, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.23339844, "step": 4515, "time_per_iteration": 2.854952573776245 }, { "auxiliary_loss_clip": 0.01541849, "auxiliary_loss_mlp": 0.01059893, "balance_loss_clip": 1.33435988, "balance_loss_mlp": 1.03247488, "epoch": 0.2715166090485495, "flos": 19247071944960.0, "grad_norm": 5.701665402824538, "language_loss": 0.75141078, "learning_rate": 3.4186815781395756e-06, "loss": 0.77742821, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.27416992, "step": 4516, "time_per_iteration": 2.7883284091949463 }, { "auxiliary_loss_clip": 0.01523304, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.32336271, "balance_loss_mlp": 1.0271492, "epoch": 0.2715767323012175, "flos": 17717829062400.0, "grad_norm": 1.8205656916879882, "language_loss": 0.77065909, "learning_rate": 3.4184070330650866e-06, "loss": 0.79640758, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.24401855, "step": 4517, "time_per_iteration": 4.288268804550171 }, { "auxiliary_loss_clip": 0.01524075, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.3225044, "balance_loss_mlp": 1.0257628, "epoch": 0.27163685555388545, "flos": 22393070432640.0, "grad_norm": 3.151959911710973, "language_loss": 0.78918213, "learning_rate": 3.4181324342044607e-06, "loss": 0.81491667, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.23608398, "step": 4518, "time_per_iteration": 2.8431448936462402 }, { "auxiliary_loss_clip": 0.01531604, "auxiliary_loss_mlp": 0.01051724, "balance_loss_clip": 1.3290987, "balance_loss_mlp": 1.02900314, "epoch": 0.2716969788065534, "flos": 22357028309760.0, "grad_norm": 1.698227330384358, "language_loss": 0.69249296, "learning_rate": 3.41785778156811e-06, "loss": 0.71832621, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.22717285, "step": 4519, "time_per_iteration": 2.847877025604248 }, { "auxiliary_loss_clip": 0.01522266, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.32212591, "balance_loss_mlp": 1.02544928, "epoch": 0.2717571020592214, "flos": 25239375492480.0, "grad_norm": 2.023738187590326, "language_loss": 0.76329529, "learning_rate": 3.417583075166451e-06, "loss": 0.78900027, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.2277832, "step": 4520, "time_per_iteration": 2.869149923324585 }, { "auxiliary_loss_clip": 0.01541822, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.3353548, "balance_loss_mlp": 1.02395868, "epoch": 0.2718172253118894, "flos": 20198419251840.0, "grad_norm": 2.8833189281257345, "language_loss": 0.76940417, "learning_rate": 3.4173083150099e-06, "loss": 0.79532599, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.26403809, "step": 4521, "time_per_iteration": 2.819533586502075 }, { "auxiliary_loss_clip": 0.01540183, "auxiliary_loss_mlp": 0.01056946, "balance_loss_clip": 1.33379292, "balance_loss_mlp": 1.03232932, "epoch": 0.27187734856455736, "flos": 14327182085760.0, "grad_norm": 2.8330427663272504, "language_loss": 0.76224911, "learning_rate": 3.417033501108875e-06, "loss": 0.78822041, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 2.06347656, "router_z_loss_mlp": 0.24645996, "step": 4522, "time_per_iteration": 2.824429988861084 }, { "auxiliary_loss_clip": 0.01539029, "auxiliary_loss_mlp": 0.01050308, "balance_loss_clip": 1.33424997, "balance_loss_mlp": 1.02479768, "epoch": 0.27193747181722533, "flos": 21118067936640.0, "grad_norm": 1.6635452219029099, "language_loss": 0.7335614, "learning_rate": 3.416758633473798e-06, "loss": 0.75945473, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.25500488, "step": 4523, "time_per_iteration": 2.839393377304077 }, { "auxiliary_loss_clip": 0.0152041, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.32195401, "balance_loss_mlp": 1.02175057, "epoch": 0.2719975950698933, "flos": 19692291208320.0, "grad_norm": 1.5592061137812998, "language_loss": 0.74660599, "learning_rate": 3.4164837121150915e-06, "loss": 0.77228677, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.25915527, "step": 4524, "time_per_iteration": 4.234760284423828 }, { "auxiliary_loss_clip": 0.01540466, "auxiliary_loss_mlp": 0.0105208, "balance_loss_clip": 1.33870673, "balance_loss_mlp": 1.02654564, "epoch": 0.27205771832256126, "flos": 24765353274240.0, "grad_norm": 1.592078818239205, "language_loss": 0.77258193, "learning_rate": 3.4162087370431803e-06, "loss": 0.79850733, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.25512695, "step": 4525, "time_per_iteration": 2.926093578338623 }, { "auxiliary_loss_clip": 0.01525974, "auxiliary_loss_mlp": 0.01050906, "balance_loss_clip": 1.32428527, "balance_loss_mlp": 1.02730227, "epoch": 0.2721178415752292, "flos": 21763839991680.0, "grad_norm": 2.339047790230912, "language_loss": 0.82520175, "learning_rate": 3.4159337082684926e-06, "loss": 0.85097063, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.23596191, "step": 4526, "time_per_iteration": 2.8065624237060547 }, { "auxiliary_loss_clip": 0.01554951, "auxiliary_loss_mlp": 0.01046532, "balance_loss_clip": 1.34348369, "balance_loss_mlp": 1.02166486, "epoch": 0.2721779648278972, "flos": 12684338213760.0, "grad_norm": 2.2119183252801795, "language_loss": 0.77919114, "learning_rate": 3.4156586258014566e-06, "loss": 0.80520606, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 2.11523438, "router_z_loss_mlp": 0.2487793, "step": 4527, "time_per_iteration": 4.177669525146484 }, { "auxiliary_loss_clip": 0.01537586, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.3349539, "balance_loss_mlp": 1.02279973, "epoch": 0.27223808808056515, "flos": 16261846790400.0, "grad_norm": 2.19648132928749, "language_loss": 0.82656884, "learning_rate": 3.415383489652503e-06, "loss": 0.85240829, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.23571777, "step": 4528, "time_per_iteration": 2.8477084636688232 }, { "auxiliary_loss_clip": 0.01515965, "auxiliary_loss_mlp": 0.01041892, "balance_loss_clip": 1.31728482, "balance_loss_mlp": 1.01753747, "epoch": 0.2722982113332331, "flos": 27757319904000.0, "grad_norm": 5.803982971644948, "language_loss": 0.78252745, "learning_rate": 3.4151082998320666e-06, "loss": 0.80810595, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.24365234, "step": 4529, "time_per_iteration": 4.350273609161377 }, { "auxiliary_loss_clip": 0.01538389, "auxiliary_loss_mlp": 0.01052958, "balance_loss_clip": 1.33270931, "balance_loss_mlp": 1.02730429, "epoch": 0.2723583345859011, "flos": 21736349136000.0, "grad_norm": 7.69959924023562, "language_loss": 0.83446801, "learning_rate": 3.4148330563505805e-06, "loss": 0.86038154, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.25671387, "step": 4530, "time_per_iteration": 2.8245480060577393 }, { "auxiliary_loss_clip": 0.01522358, "auxiliary_loss_mlp": 0.01046513, "balance_loss_clip": 1.32181787, "balance_loss_mlp": 1.02042961, "epoch": 0.27241845783856905, "flos": 17356050489600.0, "grad_norm": 2.0049636029477926, "language_loss": 0.92915356, "learning_rate": 3.4145577592184838e-06, "loss": 0.95484227, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.26098633, "step": 4531, "time_per_iteration": 2.8348162174224854 }, { "auxiliary_loss_clip": 0.01533091, "auxiliary_loss_mlp": 0.0104841, "balance_loss_clip": 1.32849991, "balance_loss_mlp": 1.02196932, "epoch": 0.272478581091237, "flos": 24765127050240.0, "grad_norm": 2.5009026653901936, "language_loss": 0.7770735, "learning_rate": 3.4142824084462155e-06, "loss": 0.80288851, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.26416016, "step": 4532, "time_per_iteration": 2.848690986633301 }, { "auxiliary_loss_clip": 0.0151142, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.31404686, "balance_loss_mlp": 1.01613009, "epoch": 0.272538704343905, "flos": 17898537369600.0, "grad_norm": 3.462935185690067, "language_loss": 0.90475899, "learning_rate": 3.4140070040442162e-06, "loss": 0.93029046, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.25585938, "step": 4533, "time_per_iteration": 2.844538688659668 }, { "auxiliary_loss_clip": 0.01509171, "auxiliary_loss_mlp": 0.01041826, "balance_loss_clip": 1.31142092, "balance_loss_mlp": 1.01618385, "epoch": 0.272598827596573, "flos": 22942886970240.0, "grad_norm": 2.348844818246144, "language_loss": 0.72197914, "learning_rate": 3.413731546022929e-06, "loss": 0.74748909, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.25646973, "step": 4534, "time_per_iteration": 2.8592658042907715 }, { "auxiliary_loss_clip": 0.01531149, "auxiliary_loss_mlp": 0.01046175, "balance_loss_clip": 1.32564723, "balance_loss_mlp": 1.01854229, "epoch": 0.27265895084924097, "flos": 24247778296320.0, "grad_norm": 1.6410667037528965, "language_loss": 0.91781628, "learning_rate": 3.4134560343928005e-06, "loss": 0.94358957, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 2.05859375, "router_z_loss_mlp": 0.27685547, "step": 4535, "time_per_iteration": 2.8365399837493896 }, { "auxiliary_loss_clip": 0.01529001, "auxiliary_loss_mlp": 0.01047645, "balance_loss_clip": 1.32493389, "balance_loss_mlp": 1.02088201, "epoch": 0.27271907410190893, "flos": 27023718412800.0, "grad_norm": 1.717533848895498, "language_loss": 0.73555899, "learning_rate": 3.4131804691642778e-06, "loss": 0.76132536, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.2677002, "step": 4536, "time_per_iteration": 2.9224085807800293 }, { "auxiliary_loss_clip": 0.01535549, "auxiliary_loss_mlp": 0.01040055, "balance_loss_clip": 1.33303261, "balance_loss_mlp": 1.01437747, "epoch": 0.2727791973545769, "flos": 34464131637120.0, "grad_norm": 2.082901301202596, "language_loss": 0.72126818, "learning_rate": 3.41290485034781e-06, "loss": 0.7470243, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.25708008, "step": 4537, "time_per_iteration": 2.926367998123169 }, { "auxiliary_loss_clip": 0.01510762, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.30902731, "balance_loss_mlp": 1.01404905, "epoch": 0.27283932060724486, "flos": 15048115032960.0, "grad_norm": 2.1746670319933625, "language_loss": 0.79310179, "learning_rate": 3.4126291779538485e-06, "loss": 0.8186205, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.27124023, "step": 4538, "time_per_iteration": 2.81303071975708 }, { "auxiliary_loss_clip": 0.01507939, "auxiliary_loss_mlp": 0.01046757, "balance_loss_clip": 1.30788612, "balance_loss_mlp": 1.02055526, "epoch": 0.2728994438599128, "flos": 21662274119040.0, "grad_norm": 1.5855004726806494, "language_loss": 0.90852118, "learning_rate": 3.412353451992847e-06, "loss": 0.93406808, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.26196289, "step": 4539, "time_per_iteration": 2.815244674682617 }, { "auxiliary_loss_clip": 0.01503333, "auxiliary_loss_mlp": 0.01039876, "balance_loss_clip": 1.3052336, "balance_loss_mlp": 1.01347065, "epoch": 0.2729595671125808, "flos": 17496011214720.0, "grad_norm": 2.0022747815866566, "language_loss": 0.89022708, "learning_rate": 3.4120776724752607e-06, "loss": 0.91565919, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.26428223, "step": 4540, "time_per_iteration": 2.8378312587738037 }, { "auxiliary_loss_clip": 0.01513514, "auxiliary_loss_mlp": 0.01042352, "balance_loss_clip": 1.31273699, "balance_loss_mlp": 1.01637626, "epoch": 0.27301969036524876, "flos": 19327616968320.0, "grad_norm": 1.978725106874439, "language_loss": 0.82348096, "learning_rate": 3.4118018394115476e-06, "loss": 0.84903967, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.25939941, "step": 4541, "time_per_iteration": 2.8452773094177246 }, { "auxiliary_loss_clip": 0.01495738, "auxiliary_loss_mlp": 0.0104445, "balance_loss_clip": 1.2974658, "balance_loss_mlp": 1.0174017, "epoch": 0.2730798136179167, "flos": 21074605666560.0, "grad_norm": 1.854041562097251, "language_loss": 0.80730712, "learning_rate": 3.4115259528121678e-06, "loss": 0.83270895, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.27062988, "step": 4542, "time_per_iteration": 2.8231611251831055 }, { "auxiliary_loss_clip": 0.01505304, "auxiliary_loss_mlp": 0.01044562, "balance_loss_clip": 1.30703092, "balance_loss_mlp": 1.01803827, "epoch": 0.2731399368705847, "flos": 19181050502400.0, "grad_norm": 2.614988424904129, "language_loss": 0.91043079, "learning_rate": 3.411250012687582e-06, "loss": 0.93592948, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.265625, "step": 4543, "time_per_iteration": 2.856334924697876 }, { "auxiliary_loss_clip": 0.01510002, "auxiliary_loss_mlp": 0.01048766, "balance_loss_clip": 1.30515695, "balance_loss_mlp": 1.02127647, "epoch": 0.27320006012325265, "flos": 18297670164480.0, "grad_norm": 2.8272418826105183, "language_loss": 0.64570796, "learning_rate": 3.410974019048255e-06, "loss": 0.67129564, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.27514648, "step": 4544, "time_per_iteration": 2.840031147003174 }, { "auxiliary_loss_clip": 0.01500918, "auxiliary_loss_mlp": 0.01048297, "balance_loss_clip": 1.30235147, "balance_loss_mlp": 1.0203191, "epoch": 0.2732601833759206, "flos": 34874394652800.0, "grad_norm": 2.564007933045178, "language_loss": 0.70604551, "learning_rate": 3.410697971904651e-06, "loss": 0.7315377, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.27966309, "step": 4545, "time_per_iteration": 3.0034029483795166 }, { "auxiliary_loss_clip": 0.01345538, "auxiliary_loss_mlp": 0.01076333, "balance_loss_clip": 1.22234082, "balance_loss_mlp": 1.02998471, "epoch": 0.2733203066285886, "flos": 53939826132480.0, "grad_norm": 0.7394199027663796, "language_loss": 0.61664516, "learning_rate": 3.4104218712672383e-06, "loss": 0.6408639, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 0.46289062, "step": 4546, "time_per_iteration": 3.3792715072631836 }, { "auxiliary_loss_clip": 0.01496103, "auxiliary_loss_mlp": 0.01050521, "balance_loss_clip": 1.29841542, "balance_loss_mlp": 1.02199435, "epoch": 0.2733804298812566, "flos": 20668188458880.0, "grad_norm": 1.8335473419018207, "language_loss": 0.6622355, "learning_rate": 3.410145717146488e-06, "loss": 0.6877017, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.28503418, "step": 4547, "time_per_iteration": 2.8350131511688232 }, { "auxiliary_loss_clip": 0.01486254, "auxiliary_loss_mlp": 0.01045668, "balance_loss_clip": 1.29160058, "balance_loss_mlp": 1.01729572, "epoch": 0.27344055313392457, "flos": 25895191893120.0, "grad_norm": 1.8470132447492067, "language_loss": 0.79330468, "learning_rate": 3.4098695095528694e-06, "loss": 0.8186239, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.28393555, "step": 4548, "time_per_iteration": 2.908257484436035 }, { "auxiliary_loss_clip": 0.01498162, "auxiliary_loss_mlp": 0.01045321, "balance_loss_clip": 1.29990685, "balance_loss_mlp": 1.02000046, "epoch": 0.27350067638659253, "flos": 22940036547840.0, "grad_norm": 1.9287180684371499, "language_loss": 0.83753467, "learning_rate": 3.4095932484968585e-06, "loss": 0.8629694, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.25354004, "step": 4549, "time_per_iteration": 2.90997314453125 }, { "auxiliary_loss_clip": 0.01507216, "auxiliary_loss_mlp": 0.01043601, "balance_loss_clip": 1.30711985, "balance_loss_mlp": 1.01623094, "epoch": 0.2735607996392605, "flos": 16581791905920.0, "grad_norm": 2.148910374784724, "language_loss": 0.71807754, "learning_rate": 3.4093169339889305e-06, "loss": 0.74358571, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.27404785, "step": 4550, "time_per_iteration": 2.8507440090179443 }, { "auxiliary_loss_clip": 0.01491796, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.29434848, "balance_loss_mlp": 1.01340508, "epoch": 0.27362092289192846, "flos": 19654529783040.0, "grad_norm": 2.404857785320402, "language_loss": 0.79648721, "learning_rate": 3.409040566039563e-06, "loss": 0.82179481, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.25537109, "step": 4551, "time_per_iteration": 2.7933847904205322 }, { "auxiliary_loss_clip": 0.01493162, "auxiliary_loss_mlp": 0.01046451, "balance_loss_clip": 1.29456675, "balance_loss_mlp": 1.01968884, "epoch": 0.27368104614459643, "flos": 17648007056640.0, "grad_norm": 2.8935141184636937, "language_loss": 0.72593892, "learning_rate": 3.4087641446592362e-06, "loss": 0.75133502, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.26757812, "step": 4552, "time_per_iteration": 4.182410478591919 }, { "auxiliary_loss_clip": 0.01506672, "auxiliary_loss_mlp": 0.01043508, "balance_loss_clip": 1.30659437, "balance_loss_mlp": 1.01707971, "epoch": 0.2737411693972644, "flos": 21590008894080.0, "grad_norm": 2.0802376294271587, "language_loss": 0.72684461, "learning_rate": 3.408487669858431e-06, "loss": 0.7523464, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.26428223, "step": 4553, "time_per_iteration": 2.8192899227142334 }, { "auxiliary_loss_clip": 0.01506273, "auxiliary_loss_mlp": 0.01051026, "balance_loss_clip": 1.30738676, "balance_loss_mlp": 1.02376294, "epoch": 0.27380129264993236, "flos": 25495063712640.0, "grad_norm": 1.7894983915907772, "language_loss": 0.60348129, "learning_rate": 3.4082111416476337e-06, "loss": 0.62905425, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.27270508, "step": 4554, "time_per_iteration": 2.847777843475342 }, { "auxiliary_loss_clip": 0.01518703, "auxiliary_loss_mlp": 0.01050494, "balance_loss_clip": 1.31324422, "balance_loss_mlp": 1.02274203, "epoch": 0.2738614159026003, "flos": 18670443223680.0, "grad_norm": 2.04036368862014, "language_loss": 0.75009131, "learning_rate": 3.4079345600373275e-06, "loss": 0.7757833, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.27734375, "step": 4555, "time_per_iteration": 2.81400728225708 }, { "auxiliary_loss_clip": 0.01514885, "auxiliary_loss_mlp": 0.01058563, "balance_loss_clip": 1.31254518, "balance_loss_mlp": 1.0315026, "epoch": 0.2739215391552683, "flos": 23487590845440.0, "grad_norm": 2.0856211976569825, "language_loss": 0.78470099, "learning_rate": 3.407657925038002e-06, "loss": 0.81043541, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.27087402, "step": 4556, "time_per_iteration": 2.869431495666504 }, { "auxiliary_loss_clip": 0.01545293, "auxiliary_loss_mlp": 0.01055973, "balance_loss_clip": 1.33194709, "balance_loss_mlp": 1.02748203, "epoch": 0.27398166240793626, "flos": 17137445022720.0, "grad_norm": 1.9235899448487848, "language_loss": 0.83634549, "learning_rate": 3.4073812366601473e-06, "loss": 0.86235809, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 2.1328125, "router_z_loss_mlp": 0.28503418, "step": 4557, "time_per_iteration": 2.825578212738037 }, { "auxiliary_loss_clip": 0.01499108, "auxiliary_loss_mlp": 0.01050142, "balance_loss_clip": 1.29869473, "balance_loss_mlp": 1.02269983, "epoch": 0.2740417856606042, "flos": 23415325620480.0, "grad_norm": 2.1407080566497534, "language_loss": 0.74800706, "learning_rate": 3.4071044949142547e-06, "loss": 0.77349961, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.27429199, "step": 4558, "time_per_iteration": 2.8777639865875244 }, { "auxiliary_loss_clip": 0.01505302, "auxiliary_loss_mlp": 0.01049647, "balance_loss_clip": 1.30390239, "balance_loss_mlp": 1.02275372, "epoch": 0.2741019089132722, "flos": 12786447024000.0, "grad_norm": 2.525979296606505, "language_loss": 0.69562888, "learning_rate": 3.406827699810819e-06, "loss": 0.72117841, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.26904297, "step": 4559, "time_per_iteration": 4.185668468475342 }, { "auxiliary_loss_clip": 0.01497495, "auxiliary_loss_mlp": 0.01047717, "balance_loss_clip": 1.30041337, "balance_loss_mlp": 1.02211034, "epoch": 0.27416203216594015, "flos": 20641331030400.0, "grad_norm": 1.7261170266664965, "language_loss": 0.72878945, "learning_rate": 3.4065508513603353e-06, "loss": 0.75424159, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.25622559, "step": 4560, "time_per_iteration": 2.8363261222839355 }, { "auxiliary_loss_clip": 0.01514845, "auxiliary_loss_mlp": 0.01049028, "balance_loss_clip": 1.31215739, "balance_loss_mlp": 1.02301669, "epoch": 0.27422215541860817, "flos": 26552727596160.0, "grad_norm": 1.7354299555030641, "language_loss": 0.82541311, "learning_rate": 3.406273949573303e-06, "loss": 0.85105187, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.26037598, "step": 4561, "time_per_iteration": 2.943784475326538 }, { "auxiliary_loss_clip": 0.01516818, "auxiliary_loss_mlp": 0.01052767, "balance_loss_clip": 1.31353021, "balance_loss_mlp": 1.02533722, "epoch": 0.27428227867127614, "flos": 23341567317120.0, "grad_norm": 1.7510891669136592, "language_loss": 0.75606668, "learning_rate": 3.4059969944602214e-06, "loss": 0.78176248, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.27441406, "step": 4562, "time_per_iteration": 4.257068157196045 }, { "auxiliary_loss_clip": 0.01524084, "auxiliary_loss_mlp": 0.01040341, "balance_loss_clip": 1.32107627, "balance_loss_mlp": 1.01394773, "epoch": 0.2743424019239441, "flos": 23044362353280.0, "grad_norm": 2.851078352068534, "language_loss": 0.75238842, "learning_rate": 3.4057199860315928e-06, "loss": 0.77803266, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.2644043, "step": 4563, "time_per_iteration": 2.8459272384643555 }, { "auxiliary_loss_clip": 0.01540977, "auxiliary_loss_mlp": 0.01048863, "balance_loss_clip": 1.33139777, "balance_loss_mlp": 1.02280343, "epoch": 0.27440252517661207, "flos": 21991132460160.0, "grad_norm": 2.1455139158019625, "language_loss": 0.64605242, "learning_rate": 3.4054429242979213e-06, "loss": 0.67195082, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 2.09570312, "router_z_loss_mlp": 0.26098633, "step": 4564, "time_per_iteration": 4.349806070327759 }, { "auxiliary_loss_clip": 0.01528239, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.32500899, "balance_loss_mlp": 1.02489638, "epoch": 0.27446264842928003, "flos": 40202194798080.0, "grad_norm": 2.5323808437312754, "language_loss": 0.79746449, "learning_rate": 3.4051658092697135e-06, "loss": 0.82325757, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 2.03027344, "router_z_loss_mlp": 0.26184082, "step": 4565, "time_per_iteration": 3.005516767501831 }, { "auxiliary_loss_clip": 0.01526095, "auxiliary_loss_mlp": 0.01049668, "balance_loss_clip": 1.32414377, "balance_loss_mlp": 1.0232867, "epoch": 0.274522771681948, "flos": 13487444997120.0, "grad_norm": 1.8894271268054148, "language_loss": 0.69786227, "learning_rate": 3.404888640957477e-06, "loss": 0.72361994, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.2635498, "step": 4566, "time_per_iteration": 2.788851022720337 }, { "auxiliary_loss_clip": 0.01515334, "auxiliary_loss_mlp": 0.01043505, "balance_loss_clip": 1.31712091, "balance_loss_mlp": 1.01952028, "epoch": 0.27458289493461596, "flos": 28634049256320.0, "grad_norm": 1.825092094995753, "language_loss": 0.62178409, "learning_rate": 3.404611419371723e-06, "loss": 0.64737248, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.23999023, "step": 4567, "time_per_iteration": 2.9288392066955566 }, { "auxiliary_loss_clip": 0.01528007, "auxiliary_loss_mlp": 0.01048102, "balance_loss_clip": 1.32770872, "balance_loss_mlp": 1.02284181, "epoch": 0.2746430181872839, "flos": 20129230673280.0, "grad_norm": 1.5586648814580764, "language_loss": 0.8322438, "learning_rate": 3.4043341445229627e-06, "loss": 0.85800487, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.25268555, "step": 4568, "time_per_iteration": 2.8157613277435303 }, { "auxiliary_loss_clip": 0.01542483, "auxiliary_loss_mlp": 0.01044801, "balance_loss_clip": 1.33550274, "balance_loss_mlp": 1.01899183, "epoch": 0.2747031414399519, "flos": 20203079466240.0, "grad_norm": 2.022382924671659, "language_loss": 0.68958116, "learning_rate": 3.4040568164217117e-06, "loss": 0.71545398, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.25817871, "step": 4569, "time_per_iteration": 2.854052782058716 }, { "auxiliary_loss_clip": 0.01539644, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.33467495, "balance_loss_mlp": 1.01883483, "epoch": 0.27476326469261986, "flos": 13524165792000.0, "grad_norm": 1.9863987952133477, "language_loss": 0.72014451, "learning_rate": 3.4037794350784848e-06, "loss": 0.74597788, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.24853516, "step": 4570, "time_per_iteration": 2.798959255218506 }, { "auxiliary_loss_clip": 0.01285946, "auxiliary_loss_mlp": 0.01052087, "balance_loss_clip": 1.16532719, "balance_loss_mlp": 1.0272913, "epoch": 0.2748233879452878, "flos": 65965615274880.0, "grad_norm": 0.7283752934738706, "language_loss": 0.55874455, "learning_rate": 3.4035020005038014e-06, "loss": 0.58212483, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.24707031, "step": 4571, "time_per_iteration": 3.4542109966278076 }, { "auxiliary_loss_clip": 0.01540045, "auxiliary_loss_mlp": 0.01046713, "balance_loss_clip": 1.33485126, "balance_loss_mlp": 1.02114236, "epoch": 0.2748835111979558, "flos": 17393133242880.0, "grad_norm": 2.4801956600766855, "language_loss": 0.78695858, "learning_rate": 3.4032245127081812e-06, "loss": 0.81282616, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.25598145, "step": 4572, "time_per_iteration": 2.8242697715759277 }, { "auxiliary_loss_clip": 0.01518509, "auxiliary_loss_mlp": 0.0104233, "balance_loss_clip": 1.32123899, "balance_loss_mlp": 1.01888108, "epoch": 0.27494363445062375, "flos": 23598296167680.0, "grad_norm": 1.5960862862018832, "language_loss": 0.81963313, "learning_rate": 3.402946971702147e-06, "loss": 0.84524155, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.23413086, "step": 4573, "time_per_iteration": 2.8348841667175293 }, { "auxiliary_loss_clip": 0.01522583, "auxiliary_loss_mlp": 0.01042949, "balance_loss_clip": 1.32340789, "balance_loss_mlp": 1.01873767, "epoch": 0.2750037577032918, "flos": 17173260921600.0, "grad_norm": 1.5907405401441665, "language_loss": 0.80240077, "learning_rate": 3.402669377496223e-06, "loss": 0.8280561, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.2421875, "step": 4574, "time_per_iteration": 2.876026153564453 }, { "auxiliary_loss_clip": 0.01539661, "auxiliary_loss_mlp": 0.01048825, "balance_loss_clip": 1.33535147, "balance_loss_mlp": 1.02342129, "epoch": 0.27506388095595974, "flos": 24500842318080.0, "grad_norm": 2.078946460073405, "language_loss": 0.75125706, "learning_rate": 3.402391730100936e-06, "loss": 0.77714193, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.25415039, "step": 4575, "time_per_iteration": 2.8231585025787354 }, { "auxiliary_loss_clip": 0.01522383, "auxiliary_loss_mlp": 0.01043763, "balance_loss_clip": 1.32447982, "balance_loss_mlp": 1.01942098, "epoch": 0.2751240042086277, "flos": 38779132757760.0, "grad_norm": 1.684254368231547, "language_loss": 0.7247293, "learning_rate": 3.402114029526814e-06, "loss": 0.75039077, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.24365234, "step": 4576, "time_per_iteration": 2.973783493041992 }, { "auxiliary_loss_clip": 0.01534371, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.33186245, "balance_loss_mlp": 1.01895261, "epoch": 0.27518412746129567, "flos": 26918442466560.0, "grad_norm": 1.7342720332718886, "language_loss": 0.73989058, "learning_rate": 3.4018362757843866e-06, "loss": 0.76567882, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.25524902, "step": 4577, "time_per_iteration": 2.880286931991577 }, { "auxiliary_loss_clip": 0.01547554, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.34291625, "balance_loss_mlp": 1.01631284, "epoch": 0.27524425071396363, "flos": 24911467292160.0, "grad_norm": 1.8732453667425142, "language_loss": 0.76524925, "learning_rate": 3.401558468884188e-06, "loss": 0.79113489, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.24682617, "step": 4578, "time_per_iteration": 2.832770586013794 }, { "auxiliary_loss_clip": 0.01533945, "auxiliary_loss_mlp": 0.01052344, "balance_loss_clip": 1.33109498, "balance_loss_mlp": 1.02491355, "epoch": 0.2753043739666316, "flos": 26299618329600.0, "grad_norm": 1.5240506280315338, "language_loss": 0.6728282, "learning_rate": 3.4012806088367516e-06, "loss": 0.69869113, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.27392578, "step": 4579, "time_per_iteration": 2.9281535148620605 }, { "auxiliary_loss_clip": 0.01543474, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.33831728, "balance_loss_mlp": 1.02163315, "epoch": 0.27536449721929956, "flos": 24217391773440.0, "grad_norm": 2.5914645399575007, "language_loss": 0.8049382, "learning_rate": 3.4010026956526137e-06, "loss": 0.83083487, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.24560547, "step": 4580, "time_per_iteration": 2.8311805725097656 }, { "auxiliary_loss_clip": 0.01529526, "auxiliary_loss_mlp": 0.01046478, "balance_loss_clip": 1.32831049, "balance_loss_mlp": 1.02018034, "epoch": 0.27542462047196753, "flos": 19546855862400.0, "grad_norm": 6.709255179387057, "language_loss": 0.68198246, "learning_rate": 3.4007247293423137e-06, "loss": 0.70774251, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.26318359, "step": 4581, "time_per_iteration": 2.8250272274017334 }, { "auxiliary_loss_clip": 0.01545279, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.33879662, "balance_loss_mlp": 1.01873112, "epoch": 0.2754847437246355, "flos": 14327227330560.0, "grad_norm": 1.9370423777982413, "language_loss": 0.79335976, "learning_rate": 3.400446709916392e-06, "loss": 0.81924355, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.24353027, "step": 4582, "time_per_iteration": 2.801544427871704 }, { "auxiliary_loss_clip": 0.01521041, "auxiliary_loss_mlp": 0.01045505, "balance_loss_clip": 1.32334006, "balance_loss_mlp": 1.02106738, "epoch": 0.27554486697730346, "flos": 18846808030080.0, "grad_norm": 2.2099114612687023, "language_loss": 0.85225248, "learning_rate": 3.4001686373853895e-06, "loss": 0.87791789, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.24462891, "step": 4583, "time_per_iteration": 2.837780714035034 }, { "auxiliary_loss_clip": 0.0156041, "auxiliary_loss_mlp": 0.01045574, "balance_loss_clip": 1.35399652, "balance_loss_mlp": 1.02191091, "epoch": 0.2756049902299714, "flos": 22392391760640.0, "grad_norm": 1.7174657876341697, "language_loss": 0.68196785, "learning_rate": 3.3998905117598528e-06, "loss": 0.70802772, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 2.06054688, "router_z_loss_mlp": 0.23669434, "step": 4584, "time_per_iteration": 2.856309175491333 }, { "auxiliary_loss_clip": 0.01533395, "auxiliary_loss_mlp": 0.010418, "balance_loss_clip": 1.33317351, "balance_loss_mlp": 1.01823211, "epoch": 0.2756651134826394, "flos": 19583531412480.0, "grad_norm": 1.8677388301225768, "language_loss": 0.77715826, "learning_rate": 3.399612333050327e-06, "loss": 0.80291021, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.23571777, "step": 4585, "time_per_iteration": 2.8364601135253906 }, { "auxiliary_loss_clip": 0.01561923, "auxiliary_loss_mlp": 0.01046082, "balance_loss_clip": 1.35306692, "balance_loss_mlp": 1.0217396, "epoch": 0.27572523673530736, "flos": 23597029313280.0, "grad_norm": 2.2378575714728743, "language_loss": 0.72696877, "learning_rate": 3.399334101267362e-06, "loss": 0.75304878, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.24353027, "step": 4586, "time_per_iteration": 2.8900082111358643 }, { "auxiliary_loss_clip": 0.01555409, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.35243404, "balance_loss_mlp": 1.01439381, "epoch": 0.2757853599879754, "flos": 22830326611200.0, "grad_norm": 1.548072796292089, "language_loss": 0.81298763, "learning_rate": 3.3990558164215073e-06, "loss": 0.83893716, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.25170898, "step": 4587, "time_per_iteration": 4.255146741867065 }, { "auxiliary_loss_clip": 0.01544128, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.34242654, "balance_loss_mlp": 1.01870573, "epoch": 0.27584548324064334, "flos": 18560914266240.0, "grad_norm": 2.2646017902912092, "language_loss": 0.83866489, "learning_rate": 3.398777478523316e-06, "loss": 0.86453402, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.24084473, "step": 4588, "time_per_iteration": 2.8282077312469482 }, { "auxiliary_loss_clip": 0.01534408, "auxiliary_loss_mlp": 0.01040199, "balance_loss_clip": 1.33484697, "balance_loss_mlp": 1.01564217, "epoch": 0.2759056064933113, "flos": 23780633287680.0, "grad_norm": 1.2902929099492473, "language_loss": 0.76221764, "learning_rate": 3.398499087583342e-06, "loss": 0.78796363, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.24572754, "step": 4589, "time_per_iteration": 2.879354953765869 }, { "auxiliary_loss_clip": 0.01534495, "auxiliary_loss_mlp": 0.01043122, "balance_loss_clip": 1.33439922, "balance_loss_mlp": 1.01728892, "epoch": 0.27596572974597927, "flos": 24293095603200.0, "grad_norm": 2.048050114701066, "language_loss": 0.89341879, "learning_rate": 3.398220643612143e-06, "loss": 0.91919494, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.25866699, "step": 4590, "time_per_iteration": 2.845214366912842 }, { "auxiliary_loss_clip": 0.01545667, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.34106159, "balance_loss_mlp": 1.01422286, "epoch": 0.27602585299864724, "flos": 35053881350400.0, "grad_norm": 1.6500436613029796, "language_loss": 0.72211373, "learning_rate": 3.397942146620277e-06, "loss": 0.74796832, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.25537109, "step": 4591, "time_per_iteration": 2.9673690795898438 }, { "auxiliary_loss_clip": 0.01535457, "auxiliary_loss_mlp": 0.01043206, "balance_loss_clip": 1.33442307, "balance_loss_mlp": 1.01968551, "epoch": 0.2760859762513152, "flos": 24318278974080.0, "grad_norm": 2.183959393867276, "language_loss": 0.80786479, "learning_rate": 3.3976635966183046e-06, "loss": 0.83365142, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 2.01074219, "router_z_loss_mlp": 0.23535156, "step": 4592, "time_per_iteration": 2.818568468093872 }, { "auxiliary_loss_clip": 0.01305967, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.18179369, "balance_loss_mlp": 1.00119495, "epoch": 0.27614609950398317, "flos": 71289750591360.0, "grad_norm": 0.7209575223253534, "language_loss": 0.61669213, "learning_rate": 3.3973849936167886e-06, "loss": 0.64003652, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 0.2734375, "step": 4593, "time_per_iteration": 3.3252077102661133 }, { "auxiliary_loss_clip": 0.01539064, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.33638752, "balance_loss_mlp": 1.01628363, "epoch": 0.27620622275665113, "flos": 29685650336640.0, "grad_norm": 1.9582342426761867, "language_loss": 0.78484404, "learning_rate": 3.3971063376262937e-06, "loss": 0.81065059, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.2532959, "step": 4594, "time_per_iteration": 4.421769618988037 }, { "auxiliary_loss_clip": 0.01537715, "auxiliary_loss_mlp": 0.01043863, "balance_loss_clip": 1.3358717, "balance_loss_mlp": 1.01875794, "epoch": 0.2762663460093191, "flos": 15386610516480.0, "grad_norm": 1.4470753332296886, "language_loss": 0.92162383, "learning_rate": 3.3968276286573866e-06, "loss": 0.94743967, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.25097656, "step": 4595, "time_per_iteration": 2.8010053634643555 }, { "auxiliary_loss_clip": 0.01568263, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.36113751, "balance_loss_mlp": 1.02536392, "epoch": 0.27632646926198706, "flos": 20713777234560.0, "grad_norm": 1.7271266298650194, "language_loss": 0.69684005, "learning_rate": 3.3965488667206353e-06, "loss": 0.7230292, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 2.06835938, "router_z_loss_mlp": 0.25317383, "step": 4596, "time_per_iteration": 2.852928400039673 }, { "auxiliary_loss_clip": 0.01567218, "auxiliary_loss_mlp": 0.01045332, "balance_loss_clip": 1.35689497, "balance_loss_mlp": 1.02113271, "epoch": 0.276386592514655, "flos": 32825178817920.0, "grad_norm": 2.6592327410852548, "language_loss": 0.64976323, "learning_rate": 3.3962700518266113e-06, "loss": 0.67588878, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 2.1015625, "router_z_loss_mlp": 0.24206543, "step": 4597, "time_per_iteration": 4.394801139831543 }, { "auxiliary_loss_clip": 0.01522115, "auxiliary_loss_mlp": 0.01049137, "balance_loss_clip": 1.32492149, "balance_loss_mlp": 1.02291059, "epoch": 0.276446715767323, "flos": 18560914266240.0, "grad_norm": 2.032945242425822, "language_loss": 0.87233692, "learning_rate": 3.395991183985887e-06, "loss": 0.89804947, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.26245117, "step": 4598, "time_per_iteration": 2.804931402206421 }, { "auxiliary_loss_clip": 0.01548839, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.34395301, "balance_loss_mlp": 1.02035117, "epoch": 0.27650683901999096, "flos": 22829693184000.0, "grad_norm": 2.968458842715942, "language_loss": 0.80413496, "learning_rate": 3.395712263209037e-06, "loss": 0.83007574, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.24902344, "step": 4599, "time_per_iteration": 4.319958448410034 }, { "auxiliary_loss_clip": 0.01557726, "auxiliary_loss_mlp": 0.01048253, "balance_loss_clip": 1.34716046, "balance_loss_mlp": 1.02400565, "epoch": 0.276566962272659, "flos": 21371584406400.0, "grad_norm": 1.7698198595303798, "language_loss": 0.79889119, "learning_rate": 3.395433289506639e-06, "loss": 0.82495093, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 2.10546875, "router_z_loss_mlp": 0.24255371, "step": 4600, "time_per_iteration": 2.8466014862060547 }, { "auxiliary_loss_clip": 0.01543212, "auxiliary_loss_mlp": 0.01044938, "balance_loss_clip": 1.33616173, "balance_loss_mlp": 1.01978493, "epoch": 0.27662708552532694, "flos": 17719095916800.0, "grad_norm": 5.9980200279512985, "language_loss": 0.74319249, "learning_rate": 3.3951542628892694e-06, "loss": 0.76907396, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.25146484, "step": 4601, "time_per_iteration": 2.8012125492095947 }, { "auxiliary_loss_clip": 0.01539215, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.3354398, "balance_loss_mlp": 1.0234201, "epoch": 0.2766872087779949, "flos": 21262915100160.0, "grad_norm": 1.6175638417207572, "language_loss": 0.80865705, "learning_rate": 3.3948751833675113e-06, "loss": 0.83452588, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.24255371, "step": 4602, "time_per_iteration": 2.8983089923858643 }, { "auxiliary_loss_clip": 0.01557491, "auxiliary_loss_mlp": 0.0105394, "balance_loss_clip": 1.34823811, "balance_loss_mlp": 1.02895415, "epoch": 0.2767473320306629, "flos": 12939393006720.0, "grad_norm": 2.0707761106692626, "language_loss": 0.77752101, "learning_rate": 3.3945960509519455e-06, "loss": 0.80363536, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.25, "step": 4603, "time_per_iteration": 2.8350472450256348 }, { "auxiliary_loss_clip": 0.01524923, "auxiliary_loss_mlp": 0.01042199, "balance_loss_clip": 1.32448781, "balance_loss_mlp": 1.01809502, "epoch": 0.27680745528333084, "flos": 15021212359680.0, "grad_norm": 1.662229647945683, "language_loss": 0.82384354, "learning_rate": 3.3943168656531585e-06, "loss": 0.84951472, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.24084473, "step": 4604, "time_per_iteration": 2.8759212493896484 }, { "auxiliary_loss_clip": 0.01536921, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.3306849, "balance_loss_mlp": 1.01513731, "epoch": 0.2768675785359988, "flos": 22648034736000.0, "grad_norm": 1.7252303119729349, "language_loss": 0.70884854, "learning_rate": 3.3940376274817363e-06, "loss": 0.73461795, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 2.0625, "router_z_loss_mlp": 0.24890137, "step": 4605, "time_per_iteration": 2.8840291500091553 }, { "auxiliary_loss_clip": 0.01314143, "auxiliary_loss_mlp": 0.01050199, "balance_loss_clip": 1.18564272, "balance_loss_mlp": 1.0261668, "epoch": 0.27692770178866677, "flos": 66161933038080.0, "grad_norm": 0.7074546394580616, "language_loss": 0.57260883, "learning_rate": 3.3937583364482673e-06, "loss": 0.59625226, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 1.28125, "router_z_loss_mlp": 0.24023438, "step": 4606, "time_per_iteration": 3.4279680252075195 }, { "auxiliary_loss_clip": 0.01551929, "auxiliary_loss_mlp": 0.01044134, "balance_loss_clip": 1.34276605, "balance_loss_mlp": 1.0177176, "epoch": 0.27698782504133473, "flos": 26475168729600.0, "grad_norm": 1.8599510979991014, "language_loss": 0.69864291, "learning_rate": 3.3934789925633424e-06, "loss": 0.72460353, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 2.09179688, "router_z_loss_mlp": 0.26416016, "step": 4607, "time_per_iteration": 2.9115915298461914 }, { "auxiliary_loss_clip": 0.01514675, "auxiliary_loss_mlp": 0.01042328, "balance_loss_clip": 1.31645417, "balance_loss_mlp": 1.01802111, "epoch": 0.2770479482940027, "flos": 25895780075520.0, "grad_norm": 1.672784807577903, "language_loss": 0.70592916, "learning_rate": 3.393199595837555e-06, "loss": 0.7314992, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.24304199, "step": 4608, "time_per_iteration": 2.885216236114502 }, { "auxiliary_loss_clip": 0.01540832, "auxiliary_loss_mlp": 0.01048923, "balance_loss_clip": 1.33609223, "balance_loss_mlp": 1.02413917, "epoch": 0.27710807154667066, "flos": 22867635588480.0, "grad_norm": 2.987021878169039, "language_loss": 0.73727334, "learning_rate": 3.392920146281499e-06, "loss": 0.76317096, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.2479248, "step": 4609, "time_per_iteration": 2.8323919773101807 }, { "auxiliary_loss_clip": 0.01538839, "auxiliary_loss_mlp": 0.01057802, "balance_loss_clip": 1.33321285, "balance_loss_mlp": 1.03179085, "epoch": 0.27716819479933863, "flos": 17719684099200.0, "grad_norm": 2.640414336835422, "language_loss": 0.85046774, "learning_rate": 3.3926406439057714e-06, "loss": 0.87643421, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.26025391, "step": 4610, "time_per_iteration": 2.806337833404541 }, { "auxiliary_loss_clip": 0.01548114, "auxiliary_loss_mlp": 0.01060704, "balance_loss_clip": 1.34016192, "balance_loss_mlp": 1.03491926, "epoch": 0.2772283180520066, "flos": 19655479923840.0, "grad_norm": 2.1286993464098027, "language_loss": 0.70774925, "learning_rate": 3.3923610887209705e-06, "loss": 0.73383743, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.25817871, "step": 4611, "time_per_iteration": 2.7924163341522217 }, { "auxiliary_loss_clip": 0.01520174, "auxiliary_loss_mlp": 0.0105426, "balance_loss_clip": 1.32112646, "balance_loss_mlp": 1.02959585, "epoch": 0.27728844130467456, "flos": 21042454596480.0, "grad_norm": 2.4278464311670085, "language_loss": 0.75246328, "learning_rate": 3.392081480737698e-06, "loss": 0.77820766, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.24658203, "step": 4612, "time_per_iteration": 2.869947671890259 }, { "auxiliary_loss_clip": 0.01538048, "auxiliary_loss_mlp": 0.01061051, "balance_loss_clip": 1.33162093, "balance_loss_mlp": 1.03561211, "epoch": 0.2773485645573425, "flos": 18998441913600.0, "grad_norm": 2.0319390717385617, "language_loss": 0.67448604, "learning_rate": 3.3918018199665563e-06, "loss": 0.700477, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.25427246, "step": 4613, "time_per_iteration": 2.8175766468048096 }, { "auxiliary_loss_clip": 0.01523435, "auxiliary_loss_mlp": 0.01060593, "balance_loss_clip": 1.3225956, "balance_loss_mlp": 1.03552318, "epoch": 0.27740868781001055, "flos": 21477855738240.0, "grad_norm": 1.7541154415001636, "language_loss": 0.80227017, "learning_rate": 3.39152210641815e-06, "loss": 0.8281104, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.25085449, "step": 4614, "time_per_iteration": 2.8796653747558594 }, { "auxiliary_loss_clip": 0.01533911, "auxiliary_loss_mlp": 0.01072058, "balance_loss_clip": 1.32894158, "balance_loss_mlp": 1.04729831, "epoch": 0.2774688110626785, "flos": 19837138371840.0, "grad_norm": 2.743448438737496, "language_loss": 0.82102746, "learning_rate": 3.3912423401030865e-06, "loss": 0.84708714, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.24755859, "step": 4615, "time_per_iteration": 2.8705389499664307 }, { "auxiliary_loss_clip": 0.01545491, "auxiliary_loss_mlp": 0.01067086, "balance_loss_clip": 1.33725977, "balance_loss_mlp": 1.04133677, "epoch": 0.2775289343153465, "flos": 18223504657920.0, "grad_norm": 2.579180398463482, "language_loss": 0.65234613, "learning_rate": 3.3909625210319735e-06, "loss": 0.67847192, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 2.0859375, "router_z_loss_mlp": 0.25744629, "step": 4616, "time_per_iteration": 2.7636282444000244 }, { "auxiliary_loss_clip": 0.01541605, "auxiliary_loss_mlp": 0.01054772, "balance_loss_clip": 1.33600295, "balance_loss_mlp": 1.03027439, "epoch": 0.27758905756801444, "flos": 16481266663680.0, "grad_norm": 1.9163970095237182, "language_loss": 0.83205897, "learning_rate": 3.3906826492154226e-06, "loss": 0.85802281, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 2.05273438, "router_z_loss_mlp": 0.24487305, "step": 4617, "time_per_iteration": 2.8285861015319824 }, { "auxiliary_loss_clip": 0.01537357, "auxiliary_loss_mlp": 0.01062537, "balance_loss_clip": 1.33036554, "balance_loss_mlp": 1.03670382, "epoch": 0.2776491808206824, "flos": 18735966973440.0, "grad_norm": 2.506602551667203, "language_loss": 0.78070891, "learning_rate": 3.3904027246640458e-06, "loss": 0.80670786, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 2.07226562, "router_z_loss_mlp": 0.25842285, "step": 4618, "time_per_iteration": 2.790485382080078 }, { "auxiliary_loss_clip": 0.01540877, "auxiliary_loss_mlp": 0.01050244, "balance_loss_clip": 1.3354857, "balance_loss_mlp": 1.02503157, "epoch": 0.27770930407335037, "flos": 28049909898240.0, "grad_norm": 2.0981497819521, "language_loss": 0.85987276, "learning_rate": 3.390122747388459e-06, "loss": 0.88578397, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.25231934, "step": 4619, "time_per_iteration": 2.9171717166900635 }, { "auxiliary_loss_clip": 0.01518828, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.32036698, "balance_loss_mlp": 1.02440488, "epoch": 0.27776942732601834, "flos": 23560218028800.0, "grad_norm": 1.8295519990077045, "language_loss": 0.77560079, "learning_rate": 3.3898427173992778e-06, "loss": 0.80127305, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.2401123, "step": 4620, "time_per_iteration": 2.8868675231933594 }, { "auxiliary_loss_clip": 0.01522896, "auxiliary_loss_mlp": 0.01042262, "balance_loss_clip": 1.32157016, "balance_loss_mlp": 1.01677561, "epoch": 0.2778295505786863, "flos": 23917653100800.0, "grad_norm": 2.114915792714337, "language_loss": 0.79546845, "learning_rate": 3.389562634707122e-06, "loss": 0.82112002, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.25488281, "step": 4621, "time_per_iteration": 2.855079412460327 }, { "auxiliary_loss_clip": 0.01540873, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.33473682, "balance_loss_mlp": 1.02120018, "epoch": 0.27788967383135427, "flos": 25564976208000.0, "grad_norm": 2.6595836273293165, "language_loss": 0.88628858, "learning_rate": 3.389282499322611e-06, "loss": 0.91216493, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.25598145, "step": 4622, "time_per_iteration": 4.297405242919922 }, { "auxiliary_loss_clip": 0.01535429, "auxiliary_loss_mlp": 0.01043658, "balance_loss_clip": 1.33021593, "balance_loss_mlp": 1.01881456, "epoch": 0.27794979708402223, "flos": 16261122873600.0, "grad_norm": 2.1568256824721326, "language_loss": 0.82272172, "learning_rate": 3.389002311256369e-06, "loss": 0.84851265, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.24865723, "step": 4623, "time_per_iteration": 2.794316053390503 }, { "auxiliary_loss_clip": 0.01543727, "auxiliary_loss_mlp": 0.0105039, "balance_loss_clip": 1.33947849, "balance_loss_mlp": 1.02506983, "epoch": 0.2780099203366902, "flos": 20677327908480.0, "grad_norm": 2.0751413988873972, "language_loss": 0.82842803, "learning_rate": 3.3887220705190204e-06, "loss": 0.85436916, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.2532959, "step": 4624, "time_per_iteration": 2.864180326461792 }, { "auxiliary_loss_clip": 0.01543499, "auxiliary_loss_mlp": 0.01038562, "balance_loss_clip": 1.3402791, "balance_loss_mlp": 1.013659, "epoch": 0.27807004358935816, "flos": 17746134324480.0, "grad_norm": 2.43931145898085, "language_loss": 0.77130038, "learning_rate": 3.388441777121191e-06, "loss": 0.79712105, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.24914551, "step": 4625, "time_per_iteration": 2.9032764434814453 }, { "auxiliary_loss_clip": 0.01528323, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.32621503, "balance_loss_mlp": 1.01677394, "epoch": 0.2781301668420261, "flos": 16735778519040.0, "grad_norm": 1.8912341641458437, "language_loss": 0.70539951, "learning_rate": 3.388161431073511e-06, "loss": 0.73110646, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.25598145, "step": 4626, "time_per_iteration": 2.8291382789611816 }, { "auxiliary_loss_clip": 0.0155299, "auxiliary_loss_mlp": 0.01043955, "balance_loss_clip": 1.34429312, "balance_loss_mlp": 1.01813436, "epoch": 0.27819029009469415, "flos": 13852662174720.0, "grad_norm": 2.3986917234663974, "language_loss": 0.94300973, "learning_rate": 3.38788103238661e-06, "loss": 0.96897924, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 2.08886719, "router_z_loss_mlp": 0.25842285, "step": 4627, "time_per_iteration": 2.877378463745117 }, { "auxiliary_loss_clip": 0.01546987, "auxiliary_loss_mlp": 0.01038903, "balance_loss_clip": 1.33910179, "balance_loss_mlp": 1.0145365, "epoch": 0.2782504133473621, "flos": 27100689096960.0, "grad_norm": 1.9092315929673354, "language_loss": 0.86144537, "learning_rate": 3.387600581071121e-06, "loss": 0.88730431, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 2.08007812, "router_z_loss_mlp": 0.24377441, "step": 4628, "time_per_iteration": 2.883159637451172 }, { "auxiliary_loss_clip": 0.01532752, "auxiliary_loss_mlp": 0.01048003, "balance_loss_clip": 1.32973945, "balance_loss_mlp": 1.02318335, "epoch": 0.2783105366000301, "flos": 21078587208960.0, "grad_norm": 1.4904598099886661, "language_loss": 0.79874563, "learning_rate": 3.387320077137679e-06, "loss": 0.82455313, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.24841309, "step": 4629, "time_per_iteration": 4.251392602920532 }, { "auxiliary_loss_clip": 0.01509567, "auxiliary_loss_mlp": 0.01048994, "balance_loss_clip": 1.31459594, "balance_loss_mlp": 1.02422214, "epoch": 0.27837065985269804, "flos": 26512115748480.0, "grad_norm": 1.4693772870595243, "language_loss": 0.85303301, "learning_rate": 3.3870395205969208e-06, "loss": 0.8786186, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.24768066, "step": 4630, "time_per_iteration": 2.9031822681427 }, { "auxiliary_loss_clip": 0.01545443, "auxiliary_loss_mlp": 0.01048075, "balance_loss_clip": 1.33971214, "balance_loss_mlp": 1.02231359, "epoch": 0.278430783105366, "flos": 20231068014720.0, "grad_norm": 2.124326738510666, "language_loss": 0.82340109, "learning_rate": 3.386758911459485e-06, "loss": 0.84933627, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.25793457, "step": 4631, "time_per_iteration": 2.7962851524353027 }, { "auxiliary_loss_clip": 0.0154781, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.34119439, "balance_loss_mlp": 1.02134132, "epoch": 0.278490906358034, "flos": 25603778263680.0, "grad_norm": 1.6455193491755362, "language_loss": 0.7216841, "learning_rate": 3.3864782497360126e-06, "loss": 0.74762332, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.24768066, "step": 4632, "time_per_iteration": 4.245196580886841 }, { "auxiliary_loss_clip": 0.01517516, "auxiliary_loss_mlp": 0.01043686, "balance_loss_clip": 1.32038212, "balance_loss_mlp": 1.01804388, "epoch": 0.27855102961070194, "flos": 16177908407040.0, "grad_norm": 1.7271435369225177, "language_loss": 0.82668477, "learning_rate": 3.386197535437145e-06, "loss": 0.85229683, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.25622559, "step": 4633, "time_per_iteration": 2.819655418395996 }, { "auxiliary_loss_clip": 0.01531074, "auxiliary_loss_mlp": 0.01044535, "balance_loss_clip": 1.32778645, "balance_loss_mlp": 1.01888108, "epoch": 0.2786111528633699, "flos": 22937321859840.0, "grad_norm": 1.946099015580353, "language_loss": 0.88504589, "learning_rate": 3.385916768573529e-06, "loss": 0.91080201, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.25683594, "step": 4634, "time_per_iteration": 4.304944038391113 }, { "auxiliary_loss_clip": 0.01552665, "auxiliary_loss_mlp": 0.01045972, "balance_loss_clip": 1.34614277, "balance_loss_mlp": 1.01973379, "epoch": 0.27867127611603787, "flos": 23414646948480.0, "grad_norm": 3.280566655690463, "language_loss": 0.77230716, "learning_rate": 3.38563594915581e-06, "loss": 0.79829353, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 2.06640625, "router_z_loss_mlp": 0.2623291, "step": 4635, "time_per_iteration": 2.887576103210449 }, { "auxiliary_loss_clip": 0.01544783, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.33936, "balance_loss_mlp": 1.02243316, "epoch": 0.27873139936870583, "flos": 19838631450240.0, "grad_norm": 1.5570154569175416, "language_loss": 0.66323161, "learning_rate": 3.385355077194637e-06, "loss": 0.68915701, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.25317383, "step": 4636, "time_per_iteration": 2.8910937309265137 }, { "auxiliary_loss_clip": 0.0155913, "auxiliary_loss_mlp": 0.01043839, "balance_loss_clip": 1.34904361, "balance_loss_mlp": 1.01770782, "epoch": 0.2787915226213738, "flos": 17715385843200.0, "grad_norm": 3.3214328754646094, "language_loss": 0.85630822, "learning_rate": 3.3850741527006604e-06, "loss": 0.88233793, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 2.10351562, "router_z_loss_mlp": 0.26171875, "step": 4637, "time_per_iteration": 2.7995731830596924 }, { "auxiliary_loss_clip": 0.0153495, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.33287501, "balance_loss_mlp": 1.01574683, "epoch": 0.27885164587404176, "flos": 22100616172800.0, "grad_norm": 1.5400894595827943, "language_loss": 0.76772499, "learning_rate": 3.384793175684533e-06, "loss": 0.79347426, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.24243164, "step": 4638, "time_per_iteration": 2.901843309402466 }, { "auxiliary_loss_clip": 0.01536147, "auxiliary_loss_mlp": 0.01046521, "balance_loss_clip": 1.33113837, "balance_loss_mlp": 1.02191663, "epoch": 0.27891176912670973, "flos": 19216866401280.0, "grad_norm": 5.793029568572898, "language_loss": 0.72729164, "learning_rate": 3.38451214615691e-06, "loss": 0.7531184, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 2.04882812, "router_z_loss_mlp": 0.24597168, "step": 4639, "time_per_iteration": 2.833029270172119 }, { "auxiliary_loss_clip": 0.01549509, "auxiliary_loss_mlp": 0.01042967, "balance_loss_clip": 1.34194875, "balance_loss_mlp": 1.01767087, "epoch": 0.27897189237937775, "flos": 27611522599680.0, "grad_norm": 5.232036838104929, "language_loss": 0.67072976, "learning_rate": 3.384231064128447e-06, "loss": 0.69665456, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 2.07617188, "router_z_loss_mlp": 0.25305176, "step": 4640, "time_per_iteration": 2.885321617126465 }, { "auxiliary_loss_clip": 0.01542868, "auxiliary_loss_mlp": 0.01040934, "balance_loss_clip": 1.33745182, "balance_loss_mlp": 1.01734233, "epoch": 0.2790320156320457, "flos": 21187980432000.0, "grad_norm": 1.8938355614234992, "language_loss": 0.7321893, "learning_rate": 3.383949929609804e-06, "loss": 0.75802732, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 2.05371094, "router_z_loss_mlp": 0.23596191, "step": 4641, "time_per_iteration": 2.919515609741211 }, { "auxiliary_loss_clip": 0.01553053, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.34426117, "balance_loss_mlp": 1.02252674, "epoch": 0.2790921388847137, "flos": 22794193998720.0, "grad_norm": 1.700644699238442, "language_loss": 0.75840861, "learning_rate": 3.383668742611641e-06, "loss": 0.78441274, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 2.08984375, "router_z_loss_mlp": 0.24829102, "step": 4642, "time_per_iteration": 2.8536155223846436 }, { "auxiliary_loss_clip": 0.01548953, "auxiliary_loss_mlp": 0.01043175, "balance_loss_clip": 1.34388447, "balance_loss_mlp": 1.01858199, "epoch": 0.27915226213738165, "flos": 23410167713280.0, "grad_norm": 1.7938251560830922, "language_loss": 0.8660773, "learning_rate": 3.3833875031446205e-06, "loss": 0.89199859, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.24584961, "step": 4643, "time_per_iteration": 2.8598272800445557 }, { "auxiliary_loss_clip": 0.01544538, "auxiliary_loss_mlp": 0.01050137, "balance_loss_clip": 1.33976865, "balance_loss_mlp": 1.02671278, "epoch": 0.2792123853900496, "flos": 22758151875840.0, "grad_norm": 3.5731725664139002, "language_loss": 0.83688301, "learning_rate": 3.383106211219407e-06, "loss": 0.8628298, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 2.04785156, "router_z_loss_mlp": 0.23400879, "step": 4644, "time_per_iteration": 2.8783676624298096 }, { "auxiliary_loss_clip": 0.01545827, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.3401103, "balance_loss_mlp": 1.02456164, "epoch": 0.2792725086427176, "flos": 15057752175360.0, "grad_norm": 1.9364088351056048, "language_loss": 0.79577458, "learning_rate": 3.3828248668466673e-06, "loss": 0.82172191, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.24353027, "step": 4645, "time_per_iteration": 2.861405849456787 }, { "auxiliary_loss_clip": 0.01322871, "auxiliary_loss_mlp": 0.011084, "balance_loss_clip": 1.19682479, "balance_loss_mlp": 1.08226895, "epoch": 0.27933263189538554, "flos": 62572706058240.0, "grad_norm": 0.7973396276459345, "language_loss": 0.62302721, "learning_rate": 3.3825434700370705e-06, "loss": 0.64733994, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.26171875, "step": 4646, "time_per_iteration": 3.3534772396087646 }, { "auxiliary_loss_clip": 0.01528596, "auxiliary_loss_mlp": 0.01047204, "balance_loss_clip": 1.32900679, "balance_loss_mlp": 1.02430367, "epoch": 0.2793927551480535, "flos": 25129077373440.0, "grad_norm": 1.5622114300692187, "language_loss": 0.9004631, "learning_rate": 3.3822620208012865e-06, "loss": 0.92622113, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.22924805, "step": 4647, "time_per_iteration": 2.863480567932129 }, { "auxiliary_loss_clip": 0.01544224, "auxiliary_loss_mlp": 0.0104505, "balance_loss_clip": 1.33771777, "balance_loss_mlp": 1.01939619, "epoch": 0.27945287840072147, "flos": 21334501653120.0, "grad_norm": 1.840238865979896, "language_loss": 0.87372589, "learning_rate": 3.381980519149988e-06, "loss": 0.89961863, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.2565918, "step": 4648, "time_per_iteration": 2.9075167179107666 }, { "auxiliary_loss_clip": 0.01536671, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.3294642, "balance_loss_mlp": 1.01847339, "epoch": 0.27951300165338944, "flos": 27461110325760.0, "grad_norm": 2.023937686825664, "language_loss": 0.73908085, "learning_rate": 3.38169896509385e-06, "loss": 0.76487643, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 2.06933594, "router_z_loss_mlp": 0.24401855, "step": 4649, "time_per_iteration": 2.8961830139160156 }, { "auxiliary_loss_clip": 0.01518811, "auxiliary_loss_mlp": 0.01047343, "balance_loss_clip": 1.31805754, "balance_loss_mlp": 1.02240491, "epoch": 0.2795731249060574, "flos": 15167462112000.0, "grad_norm": 2.3892794451258843, "language_loss": 0.82700133, "learning_rate": 3.381417358643549e-06, "loss": 0.85266292, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24926758, "step": 4650, "time_per_iteration": 2.867680311203003 }, { "auxiliary_loss_clip": 0.01330618, "auxiliary_loss_mlp": 0.01052936, "balance_loss_clip": 1.20462823, "balance_loss_mlp": 1.02661467, "epoch": 0.27963324815872537, "flos": 60152454449280.0, "grad_norm": 0.8271797006925427, "language_loss": 0.58898622, "learning_rate": 3.3811356998097624e-06, "loss": 0.61282176, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 1.265625, "router_z_loss_mlp": 0.26367188, "step": 4651, "time_per_iteration": 3.4177563190460205 }, { "auxiliary_loss_clip": 0.01540292, "auxiliary_loss_mlp": 0.01042089, "balance_loss_clip": 1.3327651, "balance_loss_mlp": 1.0173533, "epoch": 0.27969337141139333, "flos": 21776915738880.0, "grad_norm": 1.68135448337319, "language_loss": 0.74842942, "learning_rate": 3.3808539886031726e-06, "loss": 0.77425325, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 2.07421875, "router_z_loss_mlp": 0.24755859, "step": 4652, "time_per_iteration": 2.8721179962158203 }, { "auxiliary_loss_clip": 0.01544613, "auxiliary_loss_mlp": 0.01050895, "balance_loss_clip": 1.33875394, "balance_loss_mlp": 1.02515721, "epoch": 0.27975349466406135, "flos": 39864061272960.0, "grad_norm": 2.1875686962143894, "language_loss": 0.8071149, "learning_rate": 3.380572225034461e-06, "loss": 0.83306998, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.25744629, "step": 4653, "time_per_iteration": 2.997776508331299 }, { "auxiliary_loss_clip": 0.0153355, "auxiliary_loss_mlp": 0.01049638, "balance_loss_clip": 1.33197021, "balance_loss_mlp": 1.02413869, "epoch": 0.2798136179167293, "flos": 21589737425280.0, "grad_norm": 4.3015701274963725, "language_loss": 0.79453504, "learning_rate": 3.380290409114312e-06, "loss": 0.82036692, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25488281, "step": 4654, "time_per_iteration": 2.854058027267456 }, { "auxiliary_loss_clip": 0.0155977, "auxiliary_loss_mlp": 0.01038234, "balance_loss_clip": 1.34991658, "balance_loss_mlp": 1.01386809, "epoch": 0.2798737411693973, "flos": 21546139420800.0, "grad_norm": 3.5504370601875594, "language_loss": 0.81556749, "learning_rate": 3.3800085408534127e-06, "loss": 0.84154761, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 2.10253906, "router_z_loss_mlp": 0.24377441, "step": 4655, "time_per_iteration": 2.8716318607330322 }, { "auxiliary_loss_clip": 0.01533665, "auxiliary_loss_mlp": 0.01043507, "balance_loss_clip": 1.33143592, "balance_loss_mlp": 1.01842535, "epoch": 0.27993386442206525, "flos": 26992743707520.0, "grad_norm": 2.008666765525074, "language_loss": 0.82256258, "learning_rate": 3.3797266202624506e-06, "loss": 0.84833431, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.25073242, "step": 4656, "time_per_iteration": 2.903451442718506 }, { "auxiliary_loss_clip": 0.01524464, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.32274771, "balance_loss_mlp": 1.02201319, "epoch": 0.2799939876747332, "flos": 24359795717760.0, "grad_norm": 1.7331837113931718, "language_loss": 0.83489799, "learning_rate": 3.3794446473521176e-06, "loss": 0.86061013, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.24731445, "step": 4657, "time_per_iteration": 4.387537479400635 }, { "auxiliary_loss_clip": 0.01532414, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.33038568, "balance_loss_mlp": 1.01868725, "epoch": 0.2800541109274012, "flos": 33670164303360.0, "grad_norm": 2.3015731500631307, "language_loss": 0.65082824, "learning_rate": 3.379162622133105e-06, "loss": 0.67657936, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.23999023, "step": 4658, "time_per_iteration": 2.9561588764190674 }, { "auxiliary_loss_clip": 0.01537235, "auxiliary_loss_mlp": 0.01042702, "balance_loss_clip": 1.33401549, "balance_loss_mlp": 1.01764441, "epoch": 0.28011423418006914, "flos": 21623879266560.0, "grad_norm": 1.7010338918134356, "language_loss": 0.78942442, "learning_rate": 3.3788805446161073e-06, "loss": 0.81522381, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.25048828, "step": 4659, "time_per_iteration": 2.84621262550354 }, { "auxiliary_loss_clip": 0.01532893, "auxiliary_loss_mlp": 0.01043276, "balance_loss_clip": 1.33214092, "balance_loss_mlp": 1.01952934, "epoch": 0.2801743574327371, "flos": 23122826115840.0, "grad_norm": 1.7197859378833344, "language_loss": 0.79932857, "learning_rate": 3.3785984148118215e-06, "loss": 0.82509029, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.23742676, "step": 4660, "time_per_iteration": 2.8436808586120605 }, { "auxiliary_loss_clip": 0.01511795, "auxiliary_loss_mlp": 0.0104297, "balance_loss_clip": 1.31515467, "balance_loss_mlp": 1.01950979, "epoch": 0.2802344806854051, "flos": 12649200986880.0, "grad_norm": 2.5080872965318215, "language_loss": 0.81914425, "learning_rate": 3.3783162327309453e-06, "loss": 0.84469187, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.23474121, "step": 4661, "time_per_iteration": 2.9237582683563232 }, { "auxiliary_loss_clip": 0.01564107, "auxiliary_loss_mlp": 0.01045328, "balance_loss_clip": 1.36132216, "balance_loss_mlp": 1.0206871, "epoch": 0.28029460393807304, "flos": 37281678986880.0, "grad_norm": 1.585671035263152, "language_loss": 0.79747635, "learning_rate": 3.3780339983841794e-06, "loss": 0.82357067, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.24658203, "step": 4662, "time_per_iteration": 2.99454927444458 }, { "auxiliary_loss_clip": 0.01541011, "auxiliary_loss_mlp": 0.01046194, "balance_loss_clip": 1.3343209, "balance_loss_mlp": 1.02051616, "epoch": 0.280354727190741, "flos": 20750950477440.0, "grad_norm": 2.159628797626251, "language_loss": 0.70753348, "learning_rate": 3.377751711782227e-06, "loss": 0.73340547, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 2.06542969, "router_z_loss_mlp": 0.25695801, "step": 4663, "time_per_iteration": 2.9657039642333984 }, { "auxiliary_loss_clip": 0.01551369, "auxiliary_loss_mlp": 0.01054287, "balance_loss_clip": 1.34585166, "balance_loss_mlp": 1.0276556, "epoch": 0.28041485044340897, "flos": 21481113363840.0, "grad_norm": 1.7764748517804245, "language_loss": 0.78708601, "learning_rate": 3.377469372935791e-06, "loss": 0.81314254, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.26647949, "step": 4664, "time_per_iteration": 4.294102430343628 }, { "auxiliary_loss_clip": 0.01518802, "auxiliary_loss_mlp": 0.01047013, "balance_loss_clip": 1.32325792, "balance_loss_mlp": 1.02234888, "epoch": 0.28047497369607693, "flos": 14802652137600.0, "grad_norm": 1.8010202999073999, "language_loss": 0.80932271, "learning_rate": 3.377186981855578e-06, "loss": 0.83498085, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.24682617, "step": 4665, "time_per_iteration": 2.8403472900390625 }, { "auxiliary_loss_clip": 0.01524698, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.326859, "balance_loss_mlp": 1.02104282, "epoch": 0.2805350969487449, "flos": 23079771048960.0, "grad_norm": 2.1218793222469725, "language_loss": 0.8130908, "learning_rate": 3.3769045385522968e-06, "loss": 0.83877754, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.22937012, "step": 4666, "time_per_iteration": 2.900665044784546 }, { "auxiliary_loss_clip": 0.01541748, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.3407129, "balance_loss_mlp": 1.0228591, "epoch": 0.2805952202014129, "flos": 20488113578880.0, "grad_norm": 2.6795683638836194, "language_loss": 0.85958481, "learning_rate": 3.376622043036658e-06, "loss": 0.88547361, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24255371, "step": 4667, "time_per_iteration": 4.266583681106567 }, { "auxiliary_loss_clip": 0.01554971, "auxiliary_loss_mlp": 0.01046516, "balance_loss_clip": 1.35023057, "balance_loss_mlp": 1.0210892, "epoch": 0.2806553434540809, "flos": 27428733031680.0, "grad_norm": 1.9148913153239218, "language_loss": 0.8063733, "learning_rate": 3.376339495319373e-06, "loss": 0.83238816, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.25439453, "step": 4668, "time_per_iteration": 2.870173454284668 }, { "auxiliary_loss_clip": 0.0155874, "auxiliary_loss_mlp": 0.0105143, "balance_loss_clip": 1.35190082, "balance_loss_mlp": 1.02498972, "epoch": 0.28071546670674885, "flos": 26516142535680.0, "grad_norm": 1.535041282629128, "language_loss": 0.77328372, "learning_rate": 3.3760568954111563e-06, "loss": 0.79938543, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.26464844, "step": 4669, "time_per_iteration": 4.3043739795684814 }, { "auxiliary_loss_clip": 0.01548904, "auxiliary_loss_mlp": 0.01054744, "balance_loss_clip": 1.34664297, "balance_loss_mlp": 1.03060436, "epoch": 0.2807755899594168, "flos": 20568477623040.0, "grad_norm": 2.228219269067643, "language_loss": 0.8120259, "learning_rate": 3.375774243322725e-06, "loss": 0.83806235, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.24169922, "step": 4670, "time_per_iteration": 2.868858575820923 }, { "auxiliary_loss_clip": 0.01558915, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.35318613, "balance_loss_mlp": 1.01750636, "epoch": 0.2808357132120848, "flos": 24323210657280.0, "grad_norm": 3.897746809945319, "language_loss": 0.80677187, "learning_rate": 3.3754915390647955e-06, "loss": 0.83278978, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 2.05566406, "router_z_loss_mlp": 0.25341797, "step": 4671, "time_per_iteration": 2.8513708114624023 }, { "auxiliary_loss_clip": 0.01536425, "auxiliary_loss_mlp": 0.01045079, "balance_loss_clip": 1.34043121, "balance_loss_mlp": 1.0217973, "epoch": 0.28089583646475275, "flos": 26443605841920.0, "grad_norm": 1.9383736840625814, "language_loss": 0.76011848, "learning_rate": 3.37520878264809e-06, "loss": 0.78593349, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.23291016, "step": 4672, "time_per_iteration": 2.9534871578216553 }, { "auxiliary_loss_clip": 0.01558954, "auxiliary_loss_mlp": 0.01048909, "balance_loss_clip": 1.35500503, "balance_loss_mlp": 1.02315927, "epoch": 0.2809559597174207, "flos": 23122056954240.0, "grad_norm": 3.4757278882684592, "language_loss": 0.77245682, "learning_rate": 3.3749259740833286e-06, "loss": 0.79853547, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.25744629, "step": 4673, "time_per_iteration": 2.8241679668426514 }, { "auxiliary_loss_clip": 0.01543215, "auxiliary_loss_mlp": 0.01047148, "balance_loss_clip": 1.34209037, "balance_loss_mlp": 1.02263856, "epoch": 0.2810160829700887, "flos": 20933378087040.0, "grad_norm": 2.567518587508363, "language_loss": 0.72647887, "learning_rate": 3.374643113381237e-06, "loss": 0.75238252, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24499512, "step": 4674, "time_per_iteration": 2.904388666152954 }, { "auxiliary_loss_clip": 0.01556005, "auxiliary_loss_mlp": 0.01048872, "balance_loss_clip": 1.3535006, "balance_loss_mlp": 1.02288413, "epoch": 0.28107620622275664, "flos": 14364038615040.0, "grad_norm": 1.8763683492875924, "language_loss": 0.78239489, "learning_rate": 3.374360200552541e-06, "loss": 0.80844367, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.2598877, "step": 4675, "time_per_iteration": 2.8612029552459717 }, { "auxiliary_loss_clip": 0.01549115, "auxiliary_loss_mlp": 0.01044771, "balance_loss_clip": 1.34530449, "balance_loss_mlp": 1.01875949, "epoch": 0.2811363294754246, "flos": 20927812976640.0, "grad_norm": 2.5460174394612083, "language_loss": 0.71106488, "learning_rate": 3.374077235607968e-06, "loss": 0.73700368, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.26025391, "step": 4676, "time_per_iteration": 2.838313579559326 }, { "auxiliary_loss_clip": 0.0152618, "auxiliary_loss_mlp": 0.01047178, "balance_loss_clip": 1.33215666, "balance_loss_mlp": 1.02326441, "epoch": 0.28119645272809257, "flos": 20604564990720.0, "grad_norm": 1.5642248791697388, "language_loss": 0.7156117, "learning_rate": 3.3737942185582487e-06, "loss": 0.74134529, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.23925781, "step": 4677, "time_per_iteration": 2.885960340499878 }, { "auxiliary_loss_clip": 0.01529996, "auxiliary_loss_mlp": 0.01044483, "balance_loss_clip": 1.33248019, "balance_loss_mlp": 1.01885295, "epoch": 0.28125657598076054, "flos": 25348225777920.0, "grad_norm": 1.4571964528613524, "language_loss": 0.64190924, "learning_rate": 3.3735111494141153e-06, "loss": 0.66765398, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.25646973, "step": 4678, "time_per_iteration": 3.018908977508545 }, { "auxiliary_loss_clip": 0.01527918, "auxiliary_loss_mlp": 0.0105479, "balance_loss_clip": 1.33142817, "balance_loss_mlp": 1.02964854, "epoch": 0.2813166992334285, "flos": 24837889968000.0, "grad_norm": 1.7639118578293043, "language_loss": 0.71348417, "learning_rate": 3.3732280281863013e-06, "loss": 0.73931122, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.25158691, "step": 4679, "time_per_iteration": 2.925842761993408 }, { "auxiliary_loss_clip": 0.01544511, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.34294033, "balance_loss_mlp": 1.01594234, "epoch": 0.2813768224860965, "flos": 21770355242880.0, "grad_norm": 1.8225102854136799, "language_loss": 0.75622869, "learning_rate": 3.3729448548855422e-06, "loss": 0.78209168, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25842285, "step": 4680, "time_per_iteration": 2.8713812828063965 }, { "auxiliary_loss_clip": 0.01526636, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.32852817, "balance_loss_mlp": 1.01761985, "epoch": 0.2814369457387645, "flos": 24327599402880.0, "grad_norm": 1.7485702854390646, "language_loss": 0.78409052, "learning_rate": 3.3726616295225774e-06, "loss": 0.80977333, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.24023438, "step": 4681, "time_per_iteration": 2.8598570823669434 }, { "auxiliary_loss_clip": 0.01562775, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.35878611, "balance_loss_mlp": 1.02067327, "epoch": 0.28149706899143245, "flos": 18524555429760.0, "grad_norm": 1.8959955349102147, "language_loss": 0.75099069, "learning_rate": 3.372378352108146e-06, "loss": 0.77707303, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.24768066, "step": 4682, "time_per_iteration": 2.8738749027252197 }, { "auxiliary_loss_clip": 0.01533092, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.33717644, "balance_loss_mlp": 1.01305437, "epoch": 0.2815571922441004, "flos": 24873977335680.0, "grad_norm": 1.8054203667947146, "language_loss": 0.81513488, "learning_rate": 3.3720950226529894e-06, "loss": 0.84083629, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.23986816, "step": 4683, "time_per_iteration": 2.918318510055542 }, { "auxiliary_loss_clip": 0.01543401, "auxiliary_loss_mlp": 0.01040066, "balance_loss_clip": 1.34330952, "balance_loss_mlp": 1.01544929, "epoch": 0.2816173154967684, "flos": 19911168144000.0, "grad_norm": 1.6315426221960077, "language_loss": 0.76771981, "learning_rate": 3.371811641167852e-06, "loss": 0.79355443, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.24633789, "step": 4684, "time_per_iteration": 2.915719985961914 }, { "auxiliary_loss_clip": 0.01534375, "auxiliary_loss_mlp": 0.0104215, "balance_loss_clip": 1.33789122, "balance_loss_mlp": 1.01793838, "epoch": 0.28167743874943635, "flos": 17499676043520.0, "grad_norm": 2.4688233561699016, "language_loss": 0.77871305, "learning_rate": 3.3715282076634807e-06, "loss": 0.80447829, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.24230957, "step": 4685, "time_per_iteration": 2.8332231044769287 }, { "auxiliary_loss_clip": 0.01532676, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.33743799, "balance_loss_mlp": 1.01809239, "epoch": 0.2817375620021043, "flos": 25313224285440.0, "grad_norm": 1.6706722733422557, "language_loss": 0.76556635, "learning_rate": 3.3712447221506218e-06, "loss": 0.79131091, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.23657227, "step": 4686, "time_per_iteration": 2.883486270904541 }, { "auxiliary_loss_clip": 0.01555091, "auxiliary_loss_mlp": 0.01047634, "balance_loss_clip": 1.35301411, "balance_loss_mlp": 1.02113402, "epoch": 0.2817976852547723, "flos": 18701327439360.0, "grad_norm": 2.7916709092651657, "language_loss": 0.65518856, "learning_rate": 3.370961184640025e-06, "loss": 0.68121582, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.26501465, "step": 4687, "time_per_iteration": 2.833021879196167 }, { "auxiliary_loss_clip": 0.01535521, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.33768117, "balance_loss_mlp": 1.01981771, "epoch": 0.28185780850744024, "flos": 22751184176640.0, "grad_norm": 2.074734402016617, "language_loss": 0.76993173, "learning_rate": 3.3706775951424433e-06, "loss": 0.79573214, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.24707031, "step": 4688, "time_per_iteration": 2.8543012142181396 }, { "auxiliary_loss_clip": 0.01529211, "auxiliary_loss_mlp": 0.01039537, "balance_loss_clip": 1.33251071, "balance_loss_mlp": 1.01551628, "epoch": 0.2819179317601082, "flos": 14940712581120.0, "grad_norm": 1.8669340414847686, "language_loss": 0.79972821, "learning_rate": 3.37039395366863e-06, "loss": 0.82541567, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24023438, "step": 4689, "time_per_iteration": 2.807666301727295 }, { "auxiliary_loss_clip": 0.01525473, "auxiliary_loss_mlp": 0.01041461, "balance_loss_clip": 1.32807481, "balance_loss_mlp": 1.01505566, "epoch": 0.2819780550127762, "flos": 23154796206720.0, "grad_norm": 1.6115667497613988, "language_loss": 0.7875281, "learning_rate": 3.37011026022934e-06, "loss": 0.81319743, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.26403809, "step": 4690, "time_per_iteration": 2.827425241470337 }, { "auxiliary_loss_clip": 0.01530284, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.33232069, "balance_loss_mlp": 1.0195291, "epoch": 0.28203817826544414, "flos": 21626322485760.0, "grad_norm": 1.8937802824501626, "language_loss": 0.88482881, "learning_rate": 3.369826514835332e-06, "loss": 0.91057777, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.25073242, "step": 4691, "time_per_iteration": 2.9522907733917236 }, { "auxiliary_loss_clip": 0.01554121, "auxiliary_loss_mlp": 0.01047928, "balance_loss_clip": 1.3513484, "balance_loss_mlp": 1.02226186, "epoch": 0.2820983015181121, "flos": 24037995565440.0, "grad_norm": 1.6923797400431964, "language_loss": 0.82992399, "learning_rate": 3.3695427174973654e-06, "loss": 0.8559444, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.25683594, "step": 4692, "time_per_iteration": 4.4081690311431885 }, { "auxiliary_loss_clip": 0.01524897, "auxiliary_loss_mlp": 0.01039562, "balance_loss_clip": 1.32786357, "balance_loss_mlp": 1.01452804, "epoch": 0.2821584247707801, "flos": 30020028543360.0, "grad_norm": 1.549928035595499, "language_loss": 0.74931788, "learning_rate": 3.3692588682262022e-06, "loss": 0.77496248, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.25048828, "step": 4693, "time_per_iteration": 2.983670473098755 }, { "auxiliary_loss_clip": 0.01526991, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.32918215, "balance_loss_mlp": 1.02075982, "epoch": 0.2822185480234481, "flos": 21406495409280.0, "grad_norm": 1.7301246618014963, "language_loss": 0.78709459, "learning_rate": 3.3689749670326046e-06, "loss": 0.8128159, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.24401855, "step": 4694, "time_per_iteration": 2.9282925128936768 }, { "auxiliary_loss_clip": 0.01525878, "auxiliary_loss_mlp": 0.01041046, "balance_loss_clip": 1.33226407, "balance_loss_mlp": 1.01617861, "epoch": 0.28227867127611606, "flos": 27463960748160.0, "grad_norm": 1.8442849760530755, "language_loss": 0.67464554, "learning_rate": 3.3686910139273392e-06, "loss": 0.70031476, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.2487793, "step": 4695, "time_per_iteration": 2.939635992050171 }, { "auxiliary_loss_clip": 0.01540186, "auxiliary_loss_mlp": 0.01046176, "balance_loss_clip": 1.3410027, "balance_loss_mlp": 1.01894903, "epoch": 0.282338794528784, "flos": 22601948267520.0, "grad_norm": 2.1147615314791506, "language_loss": 0.77148104, "learning_rate": 3.3684070089211736e-06, "loss": 0.79734474, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.27209473, "step": 4696, "time_per_iteration": 2.8597843647003174 }, { "auxiliary_loss_clip": 0.01522409, "auxiliary_loss_mlp": 0.01048266, "balance_loss_clip": 1.32815826, "balance_loss_mlp": 1.02269578, "epoch": 0.282398917781452, "flos": 42025927956480.0, "grad_norm": 1.4849399987450038, "language_loss": 0.63274503, "learning_rate": 3.368122952024877e-06, "loss": 0.6584518, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.25561523, "step": 4697, "time_per_iteration": 3.0392119884490967 }, { "auxiliary_loss_clip": 0.01509446, "auxiliary_loss_mlp": 0.01041572, "balance_loss_clip": 1.31685567, "balance_loss_mlp": 1.01788557, "epoch": 0.28245904103411995, "flos": 23235884167680.0, "grad_norm": 1.4321939738574294, "language_loss": 0.73982912, "learning_rate": 3.3678388432492214e-06, "loss": 0.76533931, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.23681641, "step": 4698, "time_per_iteration": 2.8972625732421875 }, { "auxiliary_loss_clip": 0.01512732, "auxiliary_loss_mlp": 0.0104463, "balance_loss_clip": 1.32054329, "balance_loss_mlp": 1.02122927, "epoch": 0.2825191642867879, "flos": 25385625244800.0, "grad_norm": 1.6860931578177958, "language_loss": 0.76593262, "learning_rate": 3.3675546826049788e-06, "loss": 0.79150629, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.23413086, "step": 4699, "time_per_iteration": 4.247616767883301 }, { "auxiliary_loss_clip": 0.01531013, "auxiliary_loss_mlp": 0.01043027, "balance_loss_clip": 1.33160102, "balance_loss_mlp": 1.01923299, "epoch": 0.2825792875394559, "flos": 17245299922560.0, "grad_norm": 2.865809525090896, "language_loss": 0.81809878, "learning_rate": 3.3672704701029265e-06, "loss": 0.84383917, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.23815918, "step": 4700, "time_per_iteration": 2.8890793323516846 }, { "auxiliary_loss_clip": 0.0151299, "auxiliary_loss_mlp": 0.01049351, "balance_loss_clip": 1.32437623, "balance_loss_mlp": 1.02593839, "epoch": 0.28263941079212385, "flos": 26735155205760.0, "grad_norm": 4.38560175114185, "language_loss": 0.83032811, "learning_rate": 3.3669862057538402e-06, "loss": 0.85595149, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.23413086, "step": 4701, "time_per_iteration": 2.8632400035858154 }, { "auxiliary_loss_clip": 0.01526289, "auxiliary_loss_mlp": 0.0104902, "balance_loss_clip": 1.33054101, "balance_loss_mlp": 1.02454603, "epoch": 0.2826995340447918, "flos": 25932319891200.0, "grad_norm": 2.807110824905541, "language_loss": 0.74116647, "learning_rate": 3.3667018895685004e-06, "loss": 0.76691961, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.24462891, "step": 4702, "time_per_iteration": 4.309850454330444 }, { "auxiliary_loss_clip": 0.01517308, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.32575905, "balance_loss_mlp": 1.02683866, "epoch": 0.2827596572974598, "flos": 22389134135040.0, "grad_norm": 1.6720260986019393, "language_loss": 0.78999037, "learning_rate": 3.3664175215576886e-06, "loss": 0.81567419, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.2421875, "step": 4703, "time_per_iteration": 2.8646583557128906 }, { "auxiliary_loss_clip": 0.01532508, "auxiliary_loss_mlp": 0.01060054, "balance_loss_clip": 1.33537972, "balance_loss_mlp": 1.03580689, "epoch": 0.28281978055012774, "flos": 33560182897920.0, "grad_norm": 2.1112866262858137, "language_loss": 0.70653123, "learning_rate": 3.3661331017321867e-06, "loss": 0.73245692, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.24230957, "step": 4704, "time_per_iteration": 4.379939794540405 }, { "auxiliary_loss_clip": 0.01518395, "auxiliary_loss_mlp": 0.01059889, "balance_loss_clip": 1.3230114, "balance_loss_mlp": 1.03456903, "epoch": 0.2828799038027957, "flos": 23451367743360.0, "grad_norm": 2.705748032796369, "language_loss": 0.71627772, "learning_rate": 3.3658486301027807e-06, "loss": 0.74206054, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.2532959, "step": 4705, "time_per_iteration": 2.8943123817443848 }, { "auxiliary_loss_clip": 0.01331311, "auxiliary_loss_mlp": 0.01040551, "balance_loss_clip": 1.21509302, "balance_loss_mlp": 1.01098752, "epoch": 0.2829400270554637, "flos": 69902187736320.0, "grad_norm": 0.7276219294092496, "language_loss": 0.59334534, "learning_rate": 3.3655641066802577e-06, "loss": 0.617064, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.29492188, "step": 4706, "time_per_iteration": 3.4356019496917725 }, { "auxiliary_loss_clip": 0.01503686, "auxiliary_loss_mlp": 0.010532, "balance_loss_clip": 1.3137207, "balance_loss_mlp": 1.03050303, "epoch": 0.2830001503081317, "flos": 24799404625920.0, "grad_norm": 1.3124473688305813, "language_loss": 0.82231933, "learning_rate": 3.365279531475407e-06, "loss": 0.84788823, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.22705078, "step": 4707, "time_per_iteration": 2.8827314376831055 }, { "auxiliary_loss_clip": 0.01528942, "auxiliary_loss_mlp": 0.010588, "balance_loss_clip": 1.3306632, "balance_loss_mlp": 1.03348029, "epoch": 0.28306027356079966, "flos": 27679987261440.0, "grad_norm": 1.5573623328948076, "language_loss": 0.81530559, "learning_rate": 3.36499490449902e-06, "loss": 0.84118301, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.25354004, "step": 4708, "time_per_iteration": 2.8968722820281982 }, { "auxiliary_loss_clip": 0.01328079, "auxiliary_loss_mlp": 0.010404, "balance_loss_clip": 1.21391368, "balance_loss_mlp": 1.00988245, "epoch": 0.2831203968134676, "flos": 60552609891840.0, "grad_norm": 0.8861583858936074, "language_loss": 0.62870002, "learning_rate": 3.3647102257618895e-06, "loss": 0.65238482, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.3046875, "step": 4709, "time_per_iteration": 3.196225643157959 }, { "auxiliary_loss_clip": 0.01522512, "auxiliary_loss_mlp": 0.01051835, "balance_loss_clip": 1.32835698, "balance_loss_mlp": 1.02832675, "epoch": 0.2831805200661356, "flos": 22065026497920.0, "grad_norm": 1.5064957870738145, "language_loss": 0.74800897, "learning_rate": 3.3644254952748103e-06, "loss": 0.77375245, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.23498535, "step": 4710, "time_per_iteration": 2.8652305603027344 }, { "auxiliary_loss_clip": 0.01512123, "auxiliary_loss_mlp": 0.01054973, "balance_loss_clip": 1.31593132, "balance_loss_mlp": 1.03027284, "epoch": 0.28324064331880355, "flos": 22610680513920.0, "grad_norm": 1.9389356358468988, "language_loss": 0.80494118, "learning_rate": 3.364140713048579e-06, "loss": 0.83061212, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.24719238, "step": 4711, "time_per_iteration": 2.861617088317871 }, { "auxiliary_loss_clip": 0.01514789, "auxiliary_loss_mlp": 0.01050309, "balance_loss_clip": 1.31900597, "balance_loss_mlp": 1.02558517, "epoch": 0.2833007665714715, "flos": 30414998816640.0, "grad_norm": 1.880188577798531, "language_loss": 0.71939212, "learning_rate": 3.363855879093996e-06, "loss": 0.7450431, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.24743652, "step": 4712, "time_per_iteration": 2.925346851348877 }, { "auxiliary_loss_clip": 0.01515472, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.32021332, "balance_loss_mlp": 1.02531791, "epoch": 0.2833608898241395, "flos": 23559810825600.0, "grad_norm": 1.878078310937145, "language_loss": 0.83264327, "learning_rate": 3.3635709934218605e-06, "loss": 0.85829997, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.24841309, "step": 4713, "time_per_iteration": 2.872859001159668 }, { "auxiliary_loss_clip": 0.01523436, "auxiliary_loss_mlp": 0.01048645, "balance_loss_clip": 1.32564282, "balance_loss_mlp": 1.02502942, "epoch": 0.28342101307680745, "flos": 20276430566400.0, "grad_norm": 1.7921276678469167, "language_loss": 0.76168084, "learning_rate": 3.3632860560429766e-06, "loss": 0.78740162, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.23608398, "step": 4714, "time_per_iteration": 2.920032501220703 }, { "auxiliary_loss_clip": 0.01514739, "auxiliary_loss_mlp": 0.01047364, "balance_loss_clip": 1.31836128, "balance_loss_mlp": 1.0225563, "epoch": 0.2834811363294754, "flos": 30859358428800.0, "grad_norm": 1.4611404743647336, "language_loss": 0.78774172, "learning_rate": 3.3630010669681494e-06, "loss": 0.81336278, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.24829102, "step": 4715, "time_per_iteration": 2.910038709640503 }, { "auxiliary_loss_clip": 0.01499883, "auxiliary_loss_mlp": 0.01042473, "balance_loss_clip": 1.30618119, "balance_loss_mlp": 1.01807058, "epoch": 0.2835412595821434, "flos": 22721295346560.0, "grad_norm": 1.6829840685349544, "language_loss": 0.74018884, "learning_rate": 3.3627160262081845e-06, "loss": 0.76561236, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24401855, "step": 4716, "time_per_iteration": 2.850165605545044 }, { "auxiliary_loss_clip": 0.01524101, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.32137024, "balance_loss_mlp": 1.02457869, "epoch": 0.28360138283481134, "flos": 18086892048000.0, "grad_norm": 2.6400990233305475, "language_loss": 0.75703943, "learning_rate": 3.3624309337738917e-06, "loss": 0.78277409, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.2479248, "step": 4717, "time_per_iteration": 2.8362069129943848 }, { "auxiliary_loss_clip": 0.01517023, "auxiliary_loss_mlp": 0.01059474, "balance_loss_clip": 1.31715405, "balance_loss_mlp": 1.03294945, "epoch": 0.2836615060874793, "flos": 17862857205120.0, "grad_norm": 1.6207697111450592, "language_loss": 0.68132955, "learning_rate": 3.3621457896760813e-06, "loss": 0.70709455, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.26538086, "step": 4718, "time_per_iteration": 2.907357931137085 }, { "auxiliary_loss_clip": 0.01509635, "auxiliary_loss_mlp": 0.01044795, "balance_loss_clip": 1.3126843, "balance_loss_mlp": 1.01824665, "epoch": 0.2837216293401473, "flos": 25751611584000.0, "grad_norm": 1.7721477108839503, "language_loss": 0.7357738, "learning_rate": 3.361860593925566e-06, "loss": 0.76131815, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.26538086, "step": 4719, "time_per_iteration": 2.911893606185913 }, { "auxiliary_loss_clip": 0.01494084, "auxiliary_loss_mlp": 0.01043923, "balance_loss_clip": 1.30193686, "balance_loss_mlp": 1.0187819, "epoch": 0.2837817525928153, "flos": 20933423331840.0, "grad_norm": 1.653835896307464, "language_loss": 0.81230581, "learning_rate": 3.3615753465331605e-06, "loss": 0.83768588, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.2512207, "step": 4720, "time_per_iteration": 2.8864338397979736 }, { "auxiliary_loss_clip": 0.01514641, "auxiliary_loss_mlp": 0.01048426, "balance_loss_clip": 1.31674194, "balance_loss_mlp": 1.0216043, "epoch": 0.28384187584548326, "flos": 18926040954240.0, "grad_norm": 1.7317916658375525, "language_loss": 0.80190432, "learning_rate": 3.3612900475096817e-06, "loss": 0.82753503, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.26831055, "step": 4721, "time_per_iteration": 3.0064148902893066 }, { "auxiliary_loss_clip": 0.0150957, "auxiliary_loss_mlp": 0.01044823, "balance_loss_clip": 1.31390977, "balance_loss_mlp": 1.01789379, "epoch": 0.2839019990981512, "flos": 27355246197120.0, "grad_norm": 2.7940043409086788, "language_loss": 0.83557612, "learning_rate": 3.3610046968659474e-06, "loss": 0.8611201, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.26904297, "step": 4722, "time_per_iteration": 2.9106860160827637 }, { "auxiliary_loss_clip": 0.01510346, "auxiliary_loss_mlp": 0.01042775, "balance_loss_clip": 1.31457543, "balance_loss_mlp": 1.01671636, "epoch": 0.2839621223508192, "flos": 18123522353280.0, "grad_norm": 1.7032323835639431, "language_loss": 0.71342647, "learning_rate": 3.3607192946127785e-06, "loss": 0.73895764, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.26049805, "step": 4723, "time_per_iteration": 2.839162588119507 }, { "auxiliary_loss_clip": 0.01496337, "auxiliary_loss_mlp": 0.01047946, "balance_loss_clip": 1.30293131, "balance_loss_mlp": 1.02231598, "epoch": 0.28402224560348716, "flos": 26369259356160.0, "grad_norm": 1.4764526209025686, "language_loss": 0.79172635, "learning_rate": 3.360433840760998e-06, "loss": 0.81716919, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.25634766, "step": 4724, "time_per_iteration": 2.869922161102295 }, { "auxiliary_loss_clip": 0.01492281, "auxiliary_loss_mlp": 0.01045425, "balance_loss_clip": 1.29643726, "balance_loss_mlp": 1.02035522, "epoch": 0.2840823688561551, "flos": 24071368245120.0, "grad_norm": 1.7126071159096026, "language_loss": 0.93138361, "learning_rate": 3.36014833532143e-06, "loss": 0.95676064, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.25073242, "step": 4725, "time_per_iteration": 2.9126152992248535 }, { "auxiliary_loss_clip": 0.01515567, "auxiliary_loss_mlp": 0.01051498, "balance_loss_clip": 1.31765449, "balance_loss_mlp": 1.02543926, "epoch": 0.2841424921088231, "flos": 29471569349760.0, "grad_norm": 1.5808292075232815, "language_loss": 0.89436287, "learning_rate": 3.3598627783049e-06, "loss": 0.92003351, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.26074219, "step": 4726, "time_per_iteration": 2.8982410430908203 }, { "auxiliary_loss_clip": 0.01522212, "auxiliary_loss_mlp": 0.01049104, "balance_loss_clip": 1.32205629, "balance_loss_mlp": 1.02195978, "epoch": 0.28420261536149105, "flos": 48115996813440.0, "grad_norm": 1.6968954269463958, "language_loss": 0.79549456, "learning_rate": 3.359577169722238e-06, "loss": 0.8212077, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.2713623, "step": 4727, "time_per_iteration": 3.0966713428497314 }, { "auxiliary_loss_clip": 0.01492299, "auxiliary_loss_mlp": 0.01048146, "balance_loss_clip": 1.30095112, "balance_loss_mlp": 1.02447152, "epoch": 0.284262738614159, "flos": 25677129363840.0, "grad_norm": 3.69142491422953, "language_loss": 0.67755425, "learning_rate": 3.3592915095842733e-06, "loss": 0.7029587, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.23693848, "step": 4728, "time_per_iteration": 4.314008951187134 }, { "auxiliary_loss_clip": 0.01495845, "auxiliary_loss_mlp": 0.01049562, "balance_loss_clip": 1.30194592, "balance_loss_mlp": 1.02372968, "epoch": 0.284322861866827, "flos": 19728423820800.0, "grad_norm": 1.7796369760240949, "language_loss": 0.77424693, "learning_rate": 3.3590057979018386e-06, "loss": 0.79970098, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.25842285, "step": 4729, "time_per_iteration": 2.8259856700897217 }, { "auxiliary_loss_clip": 0.01514179, "auxiliary_loss_mlp": 0.0104866, "balance_loss_clip": 1.31708479, "balance_loss_mlp": 1.02280378, "epoch": 0.28438298511949495, "flos": 23925887654400.0, "grad_norm": 1.8052719678131146, "language_loss": 0.67644238, "learning_rate": 3.3587200346857674e-06, "loss": 0.70207071, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.25842285, "step": 4730, "time_per_iteration": 2.836742639541626 }, { "auxiliary_loss_clip": 0.01520698, "auxiliary_loss_mlp": 0.01042365, "balance_loss_clip": 1.3227427, "balance_loss_mlp": 1.01687777, "epoch": 0.2844431083721629, "flos": 26078388664320.0, "grad_norm": 2.005640200641468, "language_loss": 0.75179785, "learning_rate": 3.3584342199468965e-06, "loss": 0.77742851, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.25512695, "step": 4731, "time_per_iteration": 2.9251632690429688 }, { "auxiliary_loss_clip": 0.01510795, "auxiliary_loss_mlp": 0.01046858, "balance_loss_clip": 1.31472063, "balance_loss_mlp": 1.0210855, "epoch": 0.2845032316248309, "flos": 25821026386560.0, "grad_norm": 3.099827876844954, "language_loss": 0.84337628, "learning_rate": 3.3581483536960638e-06, "loss": 0.86895287, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.2578125, "step": 4732, "time_per_iteration": 2.893732786178589 }, { "auxiliary_loss_clip": 0.01493991, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.2989049, "balance_loss_mlp": 1.0187819, "epoch": 0.2845633548774989, "flos": 19831301792640.0, "grad_norm": 3.1508705382181827, "language_loss": 0.79753107, "learning_rate": 3.357862435944109e-06, "loss": 0.82291651, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.25769043, "step": 4733, "time_per_iteration": 2.8954620361328125 }, { "auxiliary_loss_clip": 0.01506488, "auxiliary_loss_mlp": 0.0104361, "balance_loss_clip": 1.30770159, "balance_loss_mlp": 1.01782513, "epoch": 0.28462347813016686, "flos": 23192557632000.0, "grad_norm": 2.647748607671742, "language_loss": 0.72055209, "learning_rate": 3.357576466701875e-06, "loss": 0.74605304, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.25793457, "step": 4734, "time_per_iteration": 4.317319631576538 }, { "auxiliary_loss_clip": 0.0149655, "auxiliary_loss_mlp": 0.01040157, "balance_loss_clip": 1.30264592, "balance_loss_mlp": 1.01477695, "epoch": 0.2846836013828348, "flos": 18669538327680.0, "grad_norm": 1.885149835893014, "language_loss": 0.74659753, "learning_rate": 3.3572904459802056e-06, "loss": 0.77196461, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.25366211, "step": 4735, "time_per_iteration": 2.8169057369232178 }, { "auxiliary_loss_clip": 0.01497332, "auxiliary_loss_mlp": 0.01043791, "balance_loss_clip": 1.3028059, "balance_loss_mlp": 1.01903117, "epoch": 0.2847437246355028, "flos": 14181611005440.0, "grad_norm": 1.7125840821238163, "language_loss": 0.80901444, "learning_rate": 3.357004373789946e-06, "loss": 0.83442569, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.24768066, "step": 4736, "time_per_iteration": 2.8812968730926514 }, { "auxiliary_loss_clip": 0.01508352, "auxiliary_loss_mlp": 0.01050697, "balance_loss_clip": 1.31057191, "balance_loss_mlp": 1.02401829, "epoch": 0.28480384788817076, "flos": 29290318104960.0, "grad_norm": 2.1819452959012633, "language_loss": 0.61009496, "learning_rate": 3.3567182501419453e-06, "loss": 0.63568544, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.26708984, "step": 4737, "time_per_iteration": 4.342505693435669 }, { "auxiliary_loss_clip": 0.01498671, "auxiliary_loss_mlp": 0.01039749, "balance_loss_clip": 1.30586421, "balance_loss_mlp": 1.01485825, "epoch": 0.2848639711408387, "flos": 22611494920320.0, "grad_norm": 1.703269928583071, "language_loss": 0.86801445, "learning_rate": 3.356432075047052e-06, "loss": 0.89339864, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.24914551, "step": 4738, "time_per_iteration": 4.272680759429932 }, { "auxiliary_loss_clip": 0.01508452, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.31147277, "balance_loss_mlp": 1.01926756, "epoch": 0.2849240943935067, "flos": 17607485698560.0, "grad_norm": 2.171795904894641, "language_loss": 0.90952903, "learning_rate": 3.356145848516118e-06, "loss": 0.93507147, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.26538086, "step": 4739, "time_per_iteration": 2.801121711730957 }, { "auxiliary_loss_clip": 0.01484506, "auxiliary_loss_mlp": 0.01047758, "balance_loss_clip": 1.29239464, "balance_loss_mlp": 1.02310562, "epoch": 0.28498421764617465, "flos": 24873117684480.0, "grad_norm": 1.4363656083523157, "language_loss": 0.73054177, "learning_rate": 3.355859570559998e-06, "loss": 0.75586438, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.2467041, "step": 4740, "time_per_iteration": 2.877530336380005 }, { "auxiliary_loss_clip": 0.01489315, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.29800463, "balance_loss_mlp": 1.01528072, "epoch": 0.2850443408988426, "flos": 22792474696320.0, "grad_norm": 1.51428086975398, "language_loss": 0.78658742, "learning_rate": 3.3555732411895477e-06, "loss": 0.81189108, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.2578125, "step": 4741, "time_per_iteration": 2.8267436027526855 }, { "auxiliary_loss_clip": 0.01515183, "auxiliary_loss_mlp": 0.0104106, "balance_loss_clip": 1.31326234, "balance_loss_mlp": 1.0147984, "epoch": 0.2851044641515106, "flos": 18853459015680.0, "grad_norm": 2.5112808609059134, "language_loss": 0.77833521, "learning_rate": 3.3552868604156235e-06, "loss": 0.80389768, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.26281738, "step": 4742, "time_per_iteration": 2.8497121334075928 }, { "auxiliary_loss_clip": 0.01508945, "auxiliary_loss_mlp": 0.01053264, "balance_loss_clip": 1.31153107, "balance_loss_mlp": 1.02663243, "epoch": 0.28516458740417855, "flos": 18889908341760.0, "grad_norm": 2.0181971032845136, "language_loss": 0.58533013, "learning_rate": 3.355000428249086e-06, "loss": 0.61095226, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.26635742, "step": 4743, "time_per_iteration": 2.823291540145874 }, { "auxiliary_loss_clip": 0.01511024, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.31701422, "balance_loss_mlp": 1.02792525, "epoch": 0.2852247106568465, "flos": 25310328618240.0, "grad_norm": 1.6221476803885846, "language_loss": 0.74659848, "learning_rate": 3.354713944700797e-06, "loss": 0.77224576, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.25793457, "step": 4744, "time_per_iteration": 2.9066648483276367 }, { "auxiliary_loss_clip": 0.01489087, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.29827225, "balance_loss_mlp": 1.02283454, "epoch": 0.2852848339095145, "flos": 11662671208320.0, "grad_norm": 2.2828742923004772, "language_loss": 0.78554976, "learning_rate": 3.3544274097816185e-06, "loss": 0.8109175, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.24829102, "step": 4745, "time_per_iteration": 2.817720413208008 }, { "auxiliary_loss_clip": 0.01478162, "auxiliary_loss_mlp": 0.01051318, "balance_loss_clip": 1.28979075, "balance_loss_mlp": 1.02561665, "epoch": 0.2853449571621825, "flos": 12941067064320.0, "grad_norm": 1.7490014232934052, "language_loss": 0.83547139, "learning_rate": 3.3541408235024173e-06, "loss": 0.86076629, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.25720215, "step": 4746, "time_per_iteration": 2.858774185180664 }, { "auxiliary_loss_clip": 0.01522136, "auxiliary_loss_mlp": 0.01049907, "balance_loss_clip": 1.31957936, "balance_loss_mlp": 1.0238359, "epoch": 0.28540508041485046, "flos": 20020018429440.0, "grad_norm": 1.670729684306003, "language_loss": 0.80069458, "learning_rate": 3.3538541858740604e-06, "loss": 0.82641506, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.26086426, "step": 4747, "time_per_iteration": 2.8487648963928223 }, { "auxiliary_loss_clip": 0.01309691, "auxiliary_loss_mlp": 0.01061469, "balance_loss_clip": 1.190166, "balance_loss_mlp": 1.02351284, "epoch": 0.28546520366751843, "flos": 68171803879680.0, "grad_norm": 0.7846862600552561, "language_loss": 0.60574257, "learning_rate": 3.3535674969074173e-06, "loss": 0.6294542, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 0.37890625, "step": 4748, "time_per_iteration": 3.3766870498657227 }, { "auxiliary_loss_clip": 0.01491269, "auxiliary_loss_mlp": 0.01052005, "balance_loss_clip": 1.29910278, "balance_loss_mlp": 1.02565956, "epoch": 0.2855253269201864, "flos": 13256849658240.0, "grad_norm": 2.2001388756965117, "language_loss": 0.81103456, "learning_rate": 3.3532807566133592e-06, "loss": 0.83646727, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.26342773, "step": 4749, "time_per_iteration": 2.7959842681884766 }, { "auxiliary_loss_clip": 0.01493549, "auxiliary_loss_mlp": 0.0106253, "balance_loss_clip": 1.30045962, "balance_loss_mlp": 1.03716183, "epoch": 0.28558545017285436, "flos": 28632465688320.0, "grad_norm": 2.0568467800864214, "language_loss": 0.72014463, "learning_rate": 3.3529939650027587e-06, "loss": 0.74570537, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.25378418, "step": 4750, "time_per_iteration": 2.9076783657073975 }, { "auxiliary_loss_clip": 0.01486217, "auxiliary_loss_mlp": 0.01064693, "balance_loss_clip": 1.29637504, "balance_loss_mlp": 1.03861022, "epoch": 0.2856455734255223, "flos": 34143553094400.0, "grad_norm": 1.5290321335115002, "language_loss": 0.82949907, "learning_rate": 3.3527071220864917e-06, "loss": 0.85500818, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.26062012, "step": 4751, "time_per_iteration": 2.9837379455566406 }, { "auxiliary_loss_clip": 0.0149516, "auxiliary_loss_mlp": 0.01054832, "balance_loss_clip": 1.30271709, "balance_loss_mlp": 1.02882075, "epoch": 0.2857056966781903, "flos": 39800799763200.0, "grad_norm": 1.8631218522601813, "language_loss": 0.80828583, "learning_rate": 3.3524202278754353e-06, "loss": 0.83378577, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.26000977, "step": 4752, "time_per_iteration": 3.028844118118286 }, { "auxiliary_loss_clip": 0.01484005, "auxiliary_loss_mlp": 0.01058803, "balance_loss_clip": 1.29258692, "balance_loss_mlp": 1.03368568, "epoch": 0.28576581993085826, "flos": 21882463153920.0, "grad_norm": 1.7938414793930575, "language_loss": 0.79762179, "learning_rate": 3.3521332823804676e-06, "loss": 0.82304984, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.2512207, "step": 4753, "time_per_iteration": 2.8503472805023193 }, { "auxiliary_loss_clip": 0.01513889, "auxiliary_loss_mlp": 0.01065793, "balance_loss_clip": 1.31567454, "balance_loss_mlp": 1.03917372, "epoch": 0.2858259431835262, "flos": 19099102890240.0, "grad_norm": 2.096881783836177, "language_loss": 0.9081322, "learning_rate": 3.3518462856124704e-06, "loss": 0.93392909, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.26599121, "step": 4754, "time_per_iteration": 2.8935513496398926 }, { "auxiliary_loss_clip": 0.01475193, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.28873599, "balance_loss_mlp": 1.03214598, "epoch": 0.2858860664361942, "flos": 20342632988160.0, "grad_norm": 2.0219378729797017, "language_loss": 0.83061904, "learning_rate": 3.3515592375823267e-06, "loss": 0.85594034, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.2479248, "step": 4755, "time_per_iteration": 2.8317129611968994 }, { "auxiliary_loss_clip": 0.01487455, "auxiliary_loss_mlp": 0.01059804, "balance_loss_clip": 1.2954855, "balance_loss_mlp": 1.03503215, "epoch": 0.28594618968886215, "flos": 24472310832000.0, "grad_norm": 1.7516452945033922, "language_loss": 0.84439814, "learning_rate": 3.351272138300922e-06, "loss": 0.86987072, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.24780273, "step": 4756, "time_per_iteration": 3.0033655166625977 }, { "auxiliary_loss_clip": 0.01297475, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.17993987, "balance_loss_mlp": 1.01304078, "epoch": 0.2860063129415301, "flos": 71689037103360.0, "grad_norm": 0.8703360164449953, "language_loss": 0.6110726, "learning_rate": 3.350984987779142e-06, "loss": 0.63441235, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.234375, "step": 4757, "time_per_iteration": 3.541386365890503 }, { "auxiliary_loss_clip": 0.01487582, "auxiliary_loss_mlp": 0.01056089, "balance_loss_clip": 1.29641056, "balance_loss_mlp": 1.031353, "epoch": 0.2860664361941981, "flos": 20568477623040.0, "grad_norm": 1.8884014095786694, "language_loss": 0.66967642, "learning_rate": 3.3506977860278756e-06, "loss": 0.69511312, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.24743652, "step": 4758, "time_per_iteration": 2.9287109375 }, { "auxiliary_loss_clip": 0.0150636, "auxiliary_loss_mlp": 0.01062374, "balance_loss_clip": 1.31024134, "balance_loss_mlp": 1.03620791, "epoch": 0.2861265594468661, "flos": 36011291460480.0, "grad_norm": 1.8973321578026119, "language_loss": 0.64079773, "learning_rate": 3.3504105330580143e-06, "loss": 0.66648513, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.26184082, "step": 4759, "time_per_iteration": 3.0510735511779785 }, { "auxiliary_loss_clip": 0.01482836, "auxiliary_loss_mlp": 0.01049349, "balance_loss_clip": 1.29136312, "balance_loss_mlp": 1.02392149, "epoch": 0.28618668269953407, "flos": 20056965448320.0, "grad_norm": 1.7461428311036613, "language_loss": 0.75438738, "learning_rate": 3.3501232288804496e-06, "loss": 0.77970922, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.25415039, "step": 4760, "time_per_iteration": 2.865426540374756 }, { "auxiliary_loss_clip": 0.01480516, "auxiliary_loss_mlp": 0.01042527, "balance_loss_clip": 1.29454505, "balance_loss_mlp": 1.01798201, "epoch": 0.28624680595220203, "flos": 24982375173120.0, "grad_norm": 1.821856558059459, "language_loss": 0.73229641, "learning_rate": 3.3498358735060773e-06, "loss": 0.75752687, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.2454834, "step": 4761, "time_per_iteration": 2.8548829555511475 }, { "auxiliary_loss_clip": 0.01514169, "auxiliary_loss_mlp": 0.01044856, "balance_loss_clip": 1.31755412, "balance_loss_mlp": 1.02095437, "epoch": 0.28630692920487, "flos": 22502508900480.0, "grad_norm": 2.327202285670019, "language_loss": 0.75605893, "learning_rate": 3.349548466945793e-06, "loss": 0.78164923, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.23901367, "step": 4762, "time_per_iteration": 2.8607499599456787 }, { "auxiliary_loss_clip": 0.01486534, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.29682314, "balance_loss_mlp": 1.01640368, "epoch": 0.28636705245753796, "flos": 21259340760960.0, "grad_norm": 1.5307964470388251, "language_loss": 0.76240659, "learning_rate": 3.349261009210496e-06, "loss": 0.7877022, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.26635742, "step": 4763, "time_per_iteration": 4.243765830993652 }, { "auxiliary_loss_clip": 0.01496102, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.30178452, "balance_loss_mlp": 1.01721025, "epoch": 0.28642717571020593, "flos": 24105962534400.0, "grad_norm": 1.6324543065511585, "language_loss": 0.77755934, "learning_rate": 3.348973500311086e-06, "loss": 0.80295014, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.25769043, "step": 4764, "time_per_iteration": 3.0266950130462646 }, { "auxiliary_loss_clip": 0.01499544, "auxiliary_loss_mlp": 0.01045974, "balance_loss_clip": 1.30615306, "balance_loss_mlp": 1.0192709, "epoch": 0.2864872989628739, "flos": 22611585409920.0, "grad_norm": 2.532564079671479, "language_loss": 0.72652328, "learning_rate": 3.348685940258466e-06, "loss": 0.75197846, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.26696777, "step": 4765, "time_per_iteration": 2.8975718021392822 }, { "auxiliary_loss_clip": 0.0149401, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.30144429, "balance_loss_mlp": 1.01482069, "epoch": 0.28654742221554186, "flos": 32758071500160.0, "grad_norm": 1.4807160349689954, "language_loss": 0.76799506, "learning_rate": 3.3483983290635395e-06, "loss": 0.79334897, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.26586914, "step": 4766, "time_per_iteration": 3.0123000144958496 }, { "auxiliary_loss_clip": 0.01479225, "auxiliary_loss_mlp": 0.01044378, "balance_loss_clip": 1.28818309, "balance_loss_mlp": 1.01792574, "epoch": 0.2866075454682098, "flos": 26993467624320.0, "grad_norm": 1.5662624183931255, "language_loss": 0.78970724, "learning_rate": 3.348110666737214e-06, "loss": 0.81494331, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.26477051, "step": 4767, "time_per_iteration": 2.912851572036743 }, { "auxiliary_loss_clip": 0.01494732, "auxiliary_loss_mlp": 0.01048304, "balance_loss_clip": 1.30335712, "balance_loss_mlp": 1.0218159, "epoch": 0.2866676687208778, "flos": 23263103554560.0, "grad_norm": 2.8863508008693413, "language_loss": 0.65998948, "learning_rate": 3.3478229532903956e-06, "loss": 0.68541992, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.26513672, "step": 4768, "time_per_iteration": 2.8207504749298096 }, { "auxiliary_loss_clip": 0.015151, "auxiliary_loss_mlp": 0.01048696, "balance_loss_clip": 1.31645727, "balance_loss_mlp": 1.02151644, "epoch": 0.28672779197354575, "flos": 21589646935680.0, "grad_norm": 1.612546264536734, "language_loss": 0.7117641, "learning_rate": 3.3475351887339967e-06, "loss": 0.73740208, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.27197266, "step": 4769, "time_per_iteration": 4.20481014251709 }, { "auxiliary_loss_clip": 0.01493512, "auxiliary_loss_mlp": 0.01044725, "balance_loss_clip": 1.29856801, "balance_loss_mlp": 1.01926184, "epoch": 0.2867879152262137, "flos": 19875216510720.0, "grad_norm": 2.5031391437997077, "language_loss": 0.7555992, "learning_rate": 3.3472473730789288e-06, "loss": 0.78098154, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.25476074, "step": 4770, "time_per_iteration": 2.8624627590179443 }, { "auxiliary_loss_clip": 0.01489837, "auxiliary_loss_mlp": 0.01050208, "balance_loss_clip": 1.29682672, "balance_loss_mlp": 1.02344596, "epoch": 0.2868480384788817, "flos": 28223198058240.0, "grad_norm": 3.043930823755729, "language_loss": 0.69244659, "learning_rate": 3.3469595063361045e-06, "loss": 0.71784699, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.26794434, "step": 4771, "time_per_iteration": 2.9029109477996826 }, { "auxiliary_loss_clip": 0.01270399, "auxiliary_loss_mlp": 0.01094924, "balance_loss_clip": 1.15784776, "balance_loss_mlp": 1.06707692, "epoch": 0.2869081617315497, "flos": 65452112328960.0, "grad_norm": 0.7881122780514342, "language_loss": 0.56926215, "learning_rate": 3.3466715885164414e-06, "loss": 0.59291542, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.27929688, "step": 4772, "time_per_iteration": 4.717581033706665 }, { "auxiliary_loss_clip": 0.01492246, "auxiliary_loss_mlp": 0.01047742, "balance_loss_clip": 1.29880393, "balance_loss_mlp": 1.02064538, "epoch": 0.28696828498421767, "flos": 18669855041280.0, "grad_norm": 2.451224572889315, "language_loss": 0.84018207, "learning_rate": 3.346383619630856e-06, "loss": 0.86558187, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.27111816, "step": 4773, "time_per_iteration": 4.286787748336792 }, { "auxiliary_loss_clip": 0.0150153, "auxiliary_loss_mlp": 0.01044276, "balance_loss_clip": 1.30586982, "balance_loss_mlp": 1.01804948, "epoch": 0.28702840823688563, "flos": 23670289923840.0, "grad_norm": 2.347925815446587, "language_loss": 0.78770173, "learning_rate": 3.34609559969027e-06, "loss": 0.81315982, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.26245117, "step": 4774, "time_per_iteration": 2.871081590652466 }, { "auxiliary_loss_clip": 0.01502739, "auxiliary_loss_mlp": 0.01046156, "balance_loss_clip": 1.30799973, "balance_loss_mlp": 1.01995385, "epoch": 0.2870885314895536, "flos": 13811733613440.0, "grad_norm": 2.3785858820081094, "language_loss": 0.74121994, "learning_rate": 3.3458075287056034e-06, "loss": 0.76670885, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.26220703, "step": 4775, "time_per_iteration": 2.891812801361084 }, { "auxiliary_loss_clip": 0.01494072, "auxiliary_loss_mlp": 0.01051011, "balance_loss_clip": 1.30010927, "balance_loss_mlp": 1.02449906, "epoch": 0.28714865474222157, "flos": 17796111845760.0, "grad_norm": 1.7950991067530138, "language_loss": 0.88770175, "learning_rate": 3.34551940668778e-06, "loss": 0.91315258, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.26538086, "step": 4776, "time_per_iteration": 2.81238055229187 }, { "auxiliary_loss_clip": 0.01484342, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.29307866, "balance_loss_mlp": 1.01614034, "epoch": 0.28720877799488953, "flos": 16005977591040.0, "grad_norm": 1.8845034002333572, "language_loss": 0.75637901, "learning_rate": 3.345231233647726e-06, "loss": 0.78163671, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.25292969, "step": 4777, "time_per_iteration": 2.8237879276275635 }, { "auxiliary_loss_clip": 0.01523599, "auxiliary_loss_mlp": 0.01046869, "balance_loss_clip": 1.32319117, "balance_loss_mlp": 1.02041602, "epoch": 0.2872689012475575, "flos": 20932925639040.0, "grad_norm": 2.1763213431904735, "language_loss": 0.81052136, "learning_rate": 3.3449430095963696e-06, "loss": 0.83622599, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.26489258, "step": 4778, "time_per_iteration": 2.897320508956909 }, { "auxiliary_loss_clip": 0.01486802, "auxiliary_loss_mlp": 0.0104558, "balance_loss_clip": 1.29614019, "balance_loss_mlp": 1.01999784, "epoch": 0.28732902450022546, "flos": 21335180325120.0, "grad_norm": 1.7009157907313364, "language_loss": 0.74653947, "learning_rate": 3.3446547345446386e-06, "loss": 0.77186334, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.25622559, "step": 4779, "time_per_iteration": 2.883408784866333 }, { "auxiliary_loss_clip": 0.01505819, "auxiliary_loss_mlp": 0.01041429, "balance_loss_clip": 1.30982184, "balance_loss_mlp": 1.01548862, "epoch": 0.2873891477528934, "flos": 20860207966080.0, "grad_norm": 1.6258429334797018, "language_loss": 0.77075899, "learning_rate": 3.3443664085034656e-06, "loss": 0.79623139, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.2590332, "step": 4780, "time_per_iteration": 2.801316022872925 }, { "auxiliary_loss_clip": 0.0149114, "auxiliary_loss_mlp": 0.01045749, "balance_loss_clip": 1.30094898, "balance_loss_mlp": 1.02079868, "epoch": 0.2874492710055614, "flos": 17428813407360.0, "grad_norm": 1.7052157467693, "language_loss": 0.82078338, "learning_rate": 3.344078031483784e-06, "loss": 0.84615231, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.24951172, "step": 4781, "time_per_iteration": 2.8098368644714355 }, { "auxiliary_loss_clip": 0.01511118, "auxiliary_loss_mlp": 0.01045342, "balance_loss_clip": 1.31322229, "balance_loss_mlp": 1.01908016, "epoch": 0.28750939425822936, "flos": 13414591589760.0, "grad_norm": 1.8907059606061274, "language_loss": 0.87813222, "learning_rate": 3.3437896034965283e-06, "loss": 0.90369689, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.26269531, "step": 4782, "time_per_iteration": 2.8150341510772705 }, { "auxiliary_loss_clip": 0.01531959, "auxiliary_loss_mlp": 0.01051909, "balance_loss_clip": 1.33242977, "balance_loss_mlp": 1.02719665, "epoch": 0.2875695175108973, "flos": 21879612731520.0, "grad_norm": 1.7778164148486508, "language_loss": 0.71918118, "learning_rate": 3.3435011245526357e-06, "loss": 0.74501979, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.24682617, "step": 4783, "time_per_iteration": 2.8736379146575928 }, { "auxiliary_loss_clip": 0.01508598, "auxiliary_loss_mlp": 0.01055518, "balance_loss_clip": 1.31456077, "balance_loss_mlp": 1.02960145, "epoch": 0.2876296407635653, "flos": 26255432142720.0, "grad_norm": 1.7084215358520662, "language_loss": 0.77802449, "learning_rate": 3.343212594663047e-06, "loss": 0.80366564, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.25952148, "step": 4784, "time_per_iteration": 2.9347474575042725 }, { "auxiliary_loss_clip": 0.01482698, "auxiliary_loss_mlp": 0.01056091, "balance_loss_clip": 1.29432261, "balance_loss_mlp": 1.03145015, "epoch": 0.28768976401623325, "flos": 25384403635200.0, "grad_norm": 1.7530953381454806, "language_loss": 0.76446629, "learning_rate": 3.3429240138387015e-06, "loss": 0.78985417, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.24633789, "step": 4785, "time_per_iteration": 2.868757486343384 }, { "auxiliary_loss_clip": 0.01500534, "auxiliary_loss_mlp": 0.01057192, "balance_loss_clip": 1.30593991, "balance_loss_mlp": 1.03303993, "epoch": 0.28774988726890127, "flos": 30676568860800.0, "grad_norm": 2.0965302498574356, "language_loss": 0.83496881, "learning_rate": 3.3426353820905425e-06, "loss": 0.86054599, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.24169922, "step": 4786, "time_per_iteration": 2.9376611709594727 }, { "auxiliary_loss_clip": 0.01510732, "auxiliary_loss_mlp": 0.01062198, "balance_loss_clip": 1.31845522, "balance_loss_mlp": 1.03903556, "epoch": 0.28781001052156924, "flos": 20605334152320.0, "grad_norm": 1.7296748370200077, "language_loss": 0.80825078, "learning_rate": 3.342346699429516e-06, "loss": 0.83398008, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.23168945, "step": 4787, "time_per_iteration": 2.81431245803833 }, { "auxiliary_loss_clip": 0.01518984, "auxiliary_loss_mlp": 0.01064791, "balance_loss_clip": 1.31989765, "balance_loss_mlp": 1.03956676, "epoch": 0.2878701337742372, "flos": 26553677736960.0, "grad_norm": 1.9175270793084964, "language_loss": 0.84747088, "learning_rate": 3.3420579658665677e-06, "loss": 0.87330866, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.25231934, "step": 4788, "time_per_iteration": 2.8786122798919678 }, { "auxiliary_loss_clip": 0.0153837, "auxiliary_loss_mlp": 0.01066295, "balance_loss_clip": 1.3374548, "balance_loss_mlp": 1.04091525, "epoch": 0.28793025702690517, "flos": 28158398225280.0, "grad_norm": 1.7256961203830699, "language_loss": 0.7505005, "learning_rate": 3.3417691814126468e-06, "loss": 0.77654707, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 2.0078125, "router_z_loss_mlp": 0.25390625, "step": 4789, "time_per_iteration": 2.9617104530334473 }, { "auxiliary_loss_clip": 0.0149418, "auxiliary_loss_mlp": 0.01069081, "balance_loss_clip": 1.30517805, "balance_loss_mlp": 1.04421449, "epoch": 0.28799038027957313, "flos": 23815996738560.0, "grad_norm": 1.9872898590608237, "language_loss": 0.85524035, "learning_rate": 3.341480346078704e-06, "loss": 0.88087291, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.24865723, "step": 4790, "time_per_iteration": 2.900662660598755 }, { "auxiliary_loss_clip": 0.01531064, "auxiliary_loss_mlp": 0.0105567, "balance_loss_clip": 1.33426785, "balance_loss_mlp": 1.03038597, "epoch": 0.2880505035322411, "flos": 22353770684160.0, "grad_norm": 1.6612089252654387, "language_loss": 0.78785765, "learning_rate": 3.3411914598756922e-06, "loss": 0.81372499, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.25292969, "step": 4791, "time_per_iteration": 2.8733279705047607 }, { "auxiliary_loss_clip": 0.0153752, "auxiliary_loss_mlp": 0.01074639, "balance_loss_clip": 1.33621359, "balance_loss_mlp": 1.0500226, "epoch": 0.28811062678490906, "flos": 18013269479040.0, "grad_norm": 1.6452938409640467, "language_loss": 0.71372646, "learning_rate": 3.3409025228145654e-06, "loss": 0.73984808, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 2.01171875, "router_z_loss_mlp": 0.24597168, "step": 4792, "time_per_iteration": 2.8041181564331055 }, { "auxiliary_loss_clip": 0.01540191, "auxiliary_loss_mlp": 0.01063425, "balance_loss_clip": 1.34112024, "balance_loss_mlp": 1.0403347, "epoch": 0.28817075003757703, "flos": 22101023376000.0, "grad_norm": 1.7104852192781774, "language_loss": 0.81046593, "learning_rate": 3.3406135349062812e-06, "loss": 0.83650208, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.23095703, "step": 4793, "time_per_iteration": 2.857980251312256 }, { "auxiliary_loss_clip": 0.01519572, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.32537687, "balance_loss_mlp": 1.03698671, "epoch": 0.288230873290245, "flos": 41698200735360.0, "grad_norm": 1.6368201998318224, "language_loss": 0.78794962, "learning_rate": 3.340324496161797e-06, "loss": 0.81377721, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.26220703, "step": 4794, "time_per_iteration": 2.997415065765381 }, { "auxiliary_loss_clip": 0.01536792, "auxiliary_loss_mlp": 0.01069593, "balance_loss_clip": 1.3385793, "balance_loss_mlp": 1.04466617, "epoch": 0.28829099654291296, "flos": 18633586694400.0, "grad_norm": 2.152731538696966, "language_loss": 0.84172487, "learning_rate": 3.340035406592074e-06, "loss": 0.86778873, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24926758, "step": 4795, "time_per_iteration": 2.8568642139434814 }, { "auxiliary_loss_clip": 0.0151231, "auxiliary_loss_mlp": 0.0105711, "balance_loss_clip": 1.32251263, "balance_loss_mlp": 1.03289819, "epoch": 0.2883511197955809, "flos": 24683948599680.0, "grad_norm": 6.7672658570282165, "language_loss": 0.75343412, "learning_rate": 3.339746266208074e-06, "loss": 0.77912831, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.2421875, "step": 4796, "time_per_iteration": 2.8403103351593018 }, { "auxiliary_loss_clip": 0.01542355, "auxiliary_loss_mlp": 0.01054884, "balance_loss_clip": 1.34120059, "balance_loss_mlp": 1.02912331, "epoch": 0.2884112430482489, "flos": 23122464157440.0, "grad_norm": 1.9491594816427178, "language_loss": 0.73695242, "learning_rate": 3.3394570750207614e-06, "loss": 0.76292485, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 2.01074219, "router_z_loss_mlp": 0.2578125, "step": 4797, "time_per_iteration": 2.8224287033081055 }, { "auxiliary_loss_clip": 0.01538706, "auxiliary_loss_mlp": 0.01059786, "balance_loss_clip": 1.33951068, "balance_loss_mlp": 1.03473985, "epoch": 0.28847136630091685, "flos": 16882118760960.0, "grad_norm": 1.848974945977431, "language_loss": 0.7551378, "learning_rate": 3.3391678330411017e-06, "loss": 0.78112268, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.25036621, "step": 4798, "time_per_iteration": 4.252542495727539 }, { "auxiliary_loss_clip": 0.01546263, "auxiliary_loss_mlp": 0.01069613, "balance_loss_clip": 1.34596443, "balance_loss_mlp": 1.04338694, "epoch": 0.2885314895535849, "flos": 25666270611840.0, "grad_norm": 2.6080454971333866, "language_loss": 0.66446519, "learning_rate": 3.3388785402800642e-06, "loss": 0.69062394, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.26220703, "step": 4799, "time_per_iteration": 2.9233834743499756 }, { "auxiliary_loss_clip": 0.01557067, "auxiliary_loss_mlp": 0.01060981, "balance_loss_clip": 1.35420537, "balance_loss_mlp": 1.03607845, "epoch": 0.28859161280625284, "flos": 21117524999040.0, "grad_norm": 1.791865574619288, "language_loss": 0.83269471, "learning_rate": 3.3385891967486178e-06, "loss": 0.85887516, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 2.02539062, "router_z_loss_mlp": 0.24926758, "step": 4800, "time_per_iteration": 2.926154136657715 }, { "auxiliary_loss_clip": 0.01529771, "auxiliary_loss_mlp": 0.01053814, "balance_loss_clip": 1.33586669, "balance_loss_mlp": 1.02901816, "epoch": 0.2886517360589208, "flos": 26480914819200.0, "grad_norm": 1.6723654386955853, "language_loss": 0.91697717, "learning_rate": 3.3382998024577347e-06, "loss": 0.94281298, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.2479248, "step": 4801, "time_per_iteration": 2.9288527965545654 }, { "auxiliary_loss_clip": 0.01531605, "auxiliary_loss_mlp": 0.01044753, "balance_loss_clip": 1.33637595, "balance_loss_mlp": 1.0201838, "epoch": 0.28871185931158877, "flos": 25276277266560.0, "grad_norm": 1.8169857862655183, "language_loss": 0.74297076, "learning_rate": 3.33801035741839e-06, "loss": 0.76873434, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.24584961, "step": 4802, "time_per_iteration": 2.8692831993103027 }, { "auxiliary_loss_clip": 0.01275066, "auxiliary_loss_mlp": 0.01029569, "balance_loss_clip": 1.16424322, "balance_loss_mlp": 1.00858808, "epoch": 0.28877198256425674, "flos": 66693742145280.0, "grad_norm": 0.7847566583064334, "language_loss": 0.6301139, "learning_rate": 3.337720861641558e-06, "loss": 0.65316027, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.20996094, "step": 4803, "time_per_iteration": 3.3381829261779785 }, { "auxiliary_loss_clip": 0.01544043, "auxiliary_loss_mlp": 0.01049506, "balance_loss_clip": 1.34608674, "balance_loss_mlp": 1.02401888, "epoch": 0.2888321058169247, "flos": 20312698913280.0, "grad_norm": 2.9726662510172672, "language_loss": 0.71983719, "learning_rate": 3.3374313151382165e-06, "loss": 0.74577272, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.25512695, "step": 4804, "time_per_iteration": 4.236265182495117 }, { "auxiliary_loss_clip": 0.0155845, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.35582495, "balance_loss_mlp": 1.0223738, "epoch": 0.28889222906959267, "flos": 25526762334720.0, "grad_norm": 1.9195262912256217, "language_loss": 0.69079304, "learning_rate": 3.337141717919346e-06, "loss": 0.7168619, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.26074219, "step": 4805, "time_per_iteration": 2.8494715690612793 }, { "auxiliary_loss_clip": 0.01562349, "auxiliary_loss_mlp": 0.010503, "balance_loss_clip": 1.36036122, "balance_loss_mlp": 1.0248127, "epoch": 0.28895235232226063, "flos": 32684086972800.0, "grad_norm": 1.490165497953458, "language_loss": 0.70443308, "learning_rate": 3.3368520699959272e-06, "loss": 0.73055953, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 2.01855469, "router_z_loss_mlp": 0.25524902, "step": 4806, "time_per_iteration": 2.9652867317199707 }, { "auxiliary_loss_clip": 0.01533811, "auxiliary_loss_mlp": 0.01048598, "balance_loss_clip": 1.33905625, "balance_loss_mlp": 1.02324247, "epoch": 0.2890124755749286, "flos": 29726352673920.0, "grad_norm": 1.502460000694783, "language_loss": 0.71882743, "learning_rate": 3.3365623713789443e-06, "loss": 0.7446515, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.25354004, "step": 4807, "time_per_iteration": 2.917811870574951 }, { "auxiliary_loss_clip": 0.01536283, "auxiliary_loss_mlp": 0.01045777, "balance_loss_clip": 1.33804059, "balance_loss_mlp": 1.02054, "epoch": 0.28907259882759656, "flos": 22684710286080.0, "grad_norm": 2.1252845371801437, "language_loss": 0.82176751, "learning_rate": 3.336272622079382e-06, "loss": 0.84758806, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.25256348, "step": 4808, "time_per_iteration": 4.260455369949341 }, { "auxiliary_loss_clip": 0.01544137, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.35023117, "balance_loss_mlp": 1.02392638, "epoch": 0.2891327220802645, "flos": 22576538672640.0, "grad_norm": 1.5421227318371271, "language_loss": 0.791233, "learning_rate": 3.3359828221082276e-06, "loss": 0.81717741, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.26403809, "step": 4809, "time_per_iteration": 2.8474299907684326 }, { "auxiliary_loss_clip": 0.0155848, "auxiliary_loss_mlp": 0.01045971, "balance_loss_clip": 1.35414088, "balance_loss_mlp": 1.01819551, "epoch": 0.2891928453329325, "flos": 21662771811840.0, "grad_norm": 5.248377939640719, "language_loss": 0.79396164, "learning_rate": 3.3356929714764714e-06, "loss": 0.82000613, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 2.04199219, "router_z_loss_mlp": 0.27807617, "step": 4810, "time_per_iteration": 2.8342230319976807 }, { "auxiliary_loss_clip": 0.01539964, "auxiliary_loss_mlp": 0.01046677, "balance_loss_clip": 1.34407663, "balance_loss_mlp": 1.02141643, "epoch": 0.28925296858560046, "flos": 23232581297280.0, "grad_norm": 1.5946035602143989, "language_loss": 0.77711838, "learning_rate": 3.3354030701951032e-06, "loss": 0.80298483, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.25292969, "step": 4811, "time_per_iteration": 2.8543879985809326 }, { "auxiliary_loss_clip": 0.01549954, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.35037315, "balance_loss_mlp": 1.01909506, "epoch": 0.2893130918382685, "flos": 28633461073920.0, "grad_norm": 1.4777567360714885, "language_loss": 0.78272724, "learning_rate": 3.335113118275117e-06, "loss": 0.80868322, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.26538086, "step": 4812, "time_per_iteration": 2.967966079711914 }, { "auxiliary_loss_clip": 0.0127695, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.16372037, "balance_loss_mlp": 1.02189112, "epoch": 0.28937321509093644, "flos": 72335487830400.0, "grad_norm": 0.8816011424443164, "language_loss": 0.60383868, "learning_rate": 3.3348231157275085e-06, "loss": 0.62699497, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.16796875, "step": 4813, "time_per_iteration": 3.504157066345215 }, { "auxiliary_loss_clip": 0.01530657, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.33560109, "balance_loss_mlp": 1.01830912, "epoch": 0.2894333383436044, "flos": 16224945016320.0, "grad_norm": 1.9976018439025227, "language_loss": 0.83201283, "learning_rate": 3.3345330625632725e-06, "loss": 0.85775977, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.25769043, "step": 4814, "time_per_iteration": 2.9470860958099365 }, { "auxiliary_loss_clip": 0.01547586, "auxiliary_loss_mlp": 0.01051618, "balance_loss_clip": 1.3461144, "balance_loss_mlp": 1.02623844, "epoch": 0.2894934615962724, "flos": 24839383046400.0, "grad_norm": 1.887844347334312, "language_loss": 0.73618543, "learning_rate": 3.3342429587934094e-06, "loss": 0.76217747, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.25378418, "step": 4815, "time_per_iteration": 3.013873338699341 }, { "auxiliary_loss_clip": 0.01527475, "auxiliary_loss_mlp": 0.010517, "balance_loss_clip": 1.33689773, "balance_loss_mlp": 1.02671373, "epoch": 0.28955358484894034, "flos": 20459944051200.0, "grad_norm": 1.6668992190737464, "language_loss": 0.71457058, "learning_rate": 3.3339528044289198e-06, "loss": 0.74036229, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.24975586, "step": 4816, "time_per_iteration": 2.935013771057129 }, { "auxiliary_loss_clip": 0.01551796, "auxiliary_loss_mlp": 0.01049026, "balance_loss_clip": 1.34699893, "balance_loss_mlp": 1.02223969, "epoch": 0.2896137081016083, "flos": 22575452797440.0, "grad_norm": 3.0683797747652894, "language_loss": 0.77180052, "learning_rate": 3.3336625994808055e-06, "loss": 0.79780871, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.26806641, "step": 4817, "time_per_iteration": 2.957340955734253 }, { "auxiliary_loss_clip": 0.01545201, "auxiliary_loss_mlp": 0.01052486, "balance_loss_clip": 1.34511602, "balance_loss_mlp": 1.02565205, "epoch": 0.28967383135427627, "flos": 26699248817280.0, "grad_norm": 2.304306623731047, "language_loss": 0.77743804, "learning_rate": 3.3333723439600723e-06, "loss": 0.80341494, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.26818848, "step": 4818, "time_per_iteration": 2.926295280456543 }, { "auxiliary_loss_clip": 0.01544049, "auxiliary_loss_mlp": 0.01051175, "balance_loss_clip": 1.34426093, "balance_loss_mlp": 1.02524662, "epoch": 0.28973395460694423, "flos": 15566278193280.0, "grad_norm": 2.0244186916553777, "language_loss": 0.81146032, "learning_rate": 3.3330820378777263e-06, "loss": 0.83741254, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 1.99609375, "router_z_loss_mlp": 0.25952148, "step": 4819, "time_per_iteration": 2.852612018585205 }, { "auxiliary_loss_clip": 0.01547162, "auxiliary_loss_mlp": 0.01045187, "balance_loss_clip": 1.34558666, "balance_loss_mlp": 1.01917601, "epoch": 0.2897940778596122, "flos": 18706666325760.0, "grad_norm": 2.1067661997091007, "language_loss": 0.7977221, "learning_rate": 3.332791681244776e-06, "loss": 0.82364559, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.26049805, "step": 4820, "time_per_iteration": 2.8610074520111084 }, { "auxiliary_loss_clip": 0.01555825, "auxiliary_loss_mlp": 0.01042882, "balance_loss_clip": 1.35320866, "balance_loss_mlp": 1.01772928, "epoch": 0.28985420111228016, "flos": 18779474488320.0, "grad_norm": 2.317086672323062, "language_loss": 0.7420398, "learning_rate": 3.332501274072231e-06, "loss": 0.76802689, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.25134277, "step": 4821, "time_per_iteration": 2.988969564437866 }, { "auxiliary_loss_clip": 0.01536339, "auxiliary_loss_mlp": 0.01043736, "balance_loss_clip": 1.33676958, "balance_loss_mlp": 1.01774859, "epoch": 0.28991432436494813, "flos": 23078639928960.0, "grad_norm": 2.043097632165815, "language_loss": 0.72881335, "learning_rate": 3.332210816371104e-06, "loss": 0.75461411, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.26013184, "step": 4822, "time_per_iteration": 2.9859092235565186 }, { "auxiliary_loss_clip": 0.01536833, "auxiliary_loss_mlp": 0.01046257, "balance_loss_clip": 1.33908534, "balance_loss_mlp": 1.02066255, "epoch": 0.2899744476176161, "flos": 17612191157760.0, "grad_norm": 1.8348030889975135, "language_loss": 0.67054808, "learning_rate": 3.3319203081524102e-06, "loss": 0.69637895, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.25561523, "step": 4823, "time_per_iteration": 2.836082935333252 }, { "auxiliary_loss_clip": 0.01535753, "auxiliary_loss_mlp": 0.01049515, "balance_loss_clip": 1.33913839, "balance_loss_mlp": 1.02302623, "epoch": 0.29003457087028406, "flos": 22319447863680.0, "grad_norm": 2.3728541739496927, "language_loss": 0.82517797, "learning_rate": 3.331629749427164e-06, "loss": 0.85103065, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.26489258, "step": 4824, "time_per_iteration": 2.951359272003174 }, { "auxiliary_loss_clip": 0.01562456, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.35772693, "balance_loss_mlp": 1.02192235, "epoch": 0.2900946941229521, "flos": 21955090337280.0, "grad_norm": 1.8306094758322198, "language_loss": 0.73933882, "learning_rate": 3.331339140206385e-06, "loss": 0.76546037, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 2.046875, "router_z_loss_mlp": 0.27783203, "step": 4825, "time_per_iteration": 2.8490102291107178 }, { "auxiliary_loss_clip": 0.01549437, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.34649682, "balance_loss_mlp": 1.01685405, "epoch": 0.29015481737562004, "flos": 17941411457280.0, "grad_norm": 2.423924847443782, "language_loss": 0.7482748, "learning_rate": 3.331048480501092e-06, "loss": 0.77418911, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.25146484, "step": 4826, "time_per_iteration": 2.8724327087402344 }, { "auxiliary_loss_clip": 0.01552093, "auxiliary_loss_mlp": 0.01042231, "balance_loss_clip": 1.35058808, "balance_loss_mlp": 1.01796031, "epoch": 0.290214940628288, "flos": 22793696305920.0, "grad_norm": 2.3918772551749843, "language_loss": 0.69299293, "learning_rate": 3.3307577703223073e-06, "loss": 0.7189362, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.24291992, "step": 4827, "time_per_iteration": 2.8559670448303223 }, { "auxiliary_loss_clip": 0.01546761, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.34841633, "balance_loss_mlp": 1.01445222, "epoch": 0.290275063880956, "flos": 20014860522240.0, "grad_norm": 2.4213517209957724, "language_loss": 0.80933261, "learning_rate": 3.3304670096810545e-06, "loss": 0.83520359, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.25915527, "step": 4828, "time_per_iteration": 2.845991611480713 }, { "auxiliary_loss_clip": 0.01533895, "auxiliary_loss_mlp": 0.01047723, "balance_loss_clip": 1.33837795, "balance_loss_mlp": 1.02125812, "epoch": 0.29033518713362394, "flos": 22063533419520.0, "grad_norm": 1.674046460545599, "language_loss": 0.80694538, "learning_rate": 3.33017619858836e-06, "loss": 0.83276153, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.2644043, "step": 4829, "time_per_iteration": 2.8725380897521973 }, { "auxiliary_loss_clip": 0.01524387, "auxiliary_loss_mlp": 0.01038224, "balance_loss_clip": 1.33061326, "balance_loss_mlp": 1.01329732, "epoch": 0.2903953103862919, "flos": 25641132485760.0, "grad_norm": 1.5314307835901912, "language_loss": 0.83443868, "learning_rate": 3.329885337055249e-06, "loss": 0.86006474, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.24926758, "step": 4830, "time_per_iteration": 2.8604884147644043 }, { "auxiliary_loss_clip": 0.01556761, "auxiliary_loss_mlp": 0.01048489, "balance_loss_clip": 1.35440576, "balance_loss_mlp": 1.0230732, "epoch": 0.29045543363895987, "flos": 16954519720320.0, "grad_norm": 2.2594201700147782, "language_loss": 0.80796683, "learning_rate": 3.3295944250927546e-06, "loss": 0.83401936, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.25415039, "step": 4831, "time_per_iteration": 2.827427864074707 }, { "auxiliary_loss_clip": 0.01523599, "auxiliary_loss_mlp": 0.01039442, "balance_loss_clip": 1.33143163, "balance_loss_mlp": 1.0156002, "epoch": 0.29051555689162784, "flos": 26406568333440.0, "grad_norm": 1.569339014101516, "language_loss": 0.74944234, "learning_rate": 3.3293034627119055e-06, "loss": 0.77507269, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.23840332, "step": 4832, "time_per_iteration": 2.9645872116088867 }, { "auxiliary_loss_clip": 0.01530057, "auxiliary_loss_mlp": 0.01041454, "balance_loss_clip": 1.33500779, "balance_loss_mlp": 1.01823163, "epoch": 0.2905756801442958, "flos": 21113271987840.0, "grad_norm": 1.5555517991946426, "language_loss": 0.76983559, "learning_rate": 3.329012449923736e-06, "loss": 0.7955507, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.2322998, "step": 4833, "time_per_iteration": 4.333189010620117 }, { "auxiliary_loss_clip": 0.01528442, "auxiliary_loss_mlp": 0.01040264, "balance_loss_clip": 1.33234608, "balance_loss_mlp": 1.01581395, "epoch": 0.29063580339696377, "flos": 15714609206400.0, "grad_norm": 2.1293522336846005, "language_loss": 0.66124332, "learning_rate": 3.3287213867392813e-06, "loss": 0.68693042, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.24450684, "step": 4834, "time_per_iteration": 2.855473279953003 }, { "auxiliary_loss_clip": 0.01520831, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.32796001, "balance_loss_mlp": 1.01581526, "epoch": 0.29069592664963173, "flos": 24655824316800.0, "grad_norm": 1.4999922117605387, "language_loss": 0.72862506, "learning_rate": 3.3284302731695783e-06, "loss": 0.75421989, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.22839355, "step": 4835, "time_per_iteration": 2.851914167404175 }, { "auxiliary_loss_clip": 0.01512091, "auxiliary_loss_mlp": 0.0103916, "balance_loss_clip": 1.32027423, "balance_loss_mlp": 1.01587892, "epoch": 0.2907560499022997, "flos": 24984773147520.0, "grad_norm": 1.9449550242850762, "language_loss": 0.80424219, "learning_rate": 3.3281391092256668e-06, "loss": 0.82975471, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.23266602, "step": 4836, "time_per_iteration": 2.896366596221924 }, { "auxiliary_loss_clip": 0.0152986, "auxiliary_loss_mlp": 0.01045449, "balance_loss_clip": 1.338099, "balance_loss_mlp": 1.02051079, "epoch": 0.29081617315496766, "flos": 18665783009280.0, "grad_norm": 1.802684032490924, "language_loss": 0.81442881, "learning_rate": 3.3278478949185865e-06, "loss": 0.84018195, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.24963379, "step": 4837, "time_per_iteration": 2.839606761932373 }, { "auxiliary_loss_clip": 0.01526446, "auxiliary_loss_mlp": 0.01042371, "balance_loss_clip": 1.33165526, "balance_loss_mlp": 1.01805234, "epoch": 0.2908762964076356, "flos": 35343530432640.0, "grad_norm": 1.8571268875625093, "language_loss": 0.68422246, "learning_rate": 3.327556630259381e-06, "loss": 0.70991063, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.24328613, "step": 4838, "time_per_iteration": 2.9503445625305176 }, { "auxiliary_loss_clip": 0.01538116, "auxiliary_loss_mlp": 0.01044978, "balance_loss_clip": 1.34083331, "balance_loss_mlp": 1.01976538, "epoch": 0.29093641966030365, "flos": 23086783992960.0, "grad_norm": 1.8878325090666845, "language_loss": 0.72596723, "learning_rate": 3.327265315259095e-06, "loss": 0.75179815, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.25219727, "step": 4839, "time_per_iteration": 4.245973587036133 }, { "auxiliary_loss_clip": 0.01539673, "auxiliary_loss_mlp": 0.01043853, "balance_loss_clip": 1.34275234, "balance_loss_mlp": 1.01933169, "epoch": 0.2909965429129716, "flos": 35969141289600.0, "grad_norm": 1.7732923098622853, "language_loss": 0.77105856, "learning_rate": 3.326973949928776e-06, "loss": 0.79689384, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.24523926, "step": 4840, "time_per_iteration": 2.9662835597991943 }, { "auxiliary_loss_clip": 0.01523114, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.33011484, "balance_loss_mlp": 1.01712835, "epoch": 0.2910566661656396, "flos": 30891690478080.0, "grad_norm": 2.2190223097104806, "language_loss": 0.61041629, "learning_rate": 3.326682534279471e-06, "loss": 0.6360476, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.22875977, "step": 4841, "time_per_iteration": 2.891633987426758 }, { "auxiliary_loss_clip": 0.01532875, "auxiliary_loss_mlp": 0.01045133, "balance_loss_clip": 1.33904815, "balance_loss_mlp": 1.02064705, "epoch": 0.29111678941830754, "flos": 30022878965760.0, "grad_norm": 1.3610948618208418, "language_loss": 0.72327399, "learning_rate": 3.326391068322232e-06, "loss": 0.74905396, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.24487305, "step": 4842, "time_per_iteration": 2.936018466949463 }, { "auxiliary_loss_clip": 0.01535815, "auxiliary_loss_mlp": 0.01041489, "balance_loss_clip": 1.34281409, "balance_loss_mlp": 1.01892316, "epoch": 0.2911769126709755, "flos": 22867816567680.0, "grad_norm": 1.5526561295763968, "language_loss": 0.74521071, "learning_rate": 3.3260995520681098e-06, "loss": 0.77098382, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.22558594, "step": 4843, "time_per_iteration": 4.274471998214722 }, { "auxiliary_loss_clip": 0.01526612, "auxiliary_loss_mlp": 0.01040065, "balance_loss_clip": 1.33098733, "balance_loss_mlp": 1.01724863, "epoch": 0.2912370359236435, "flos": 21659876144640.0, "grad_norm": 2.730233097054129, "language_loss": 0.59739441, "learning_rate": 3.3258079855281602e-06, "loss": 0.62306118, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.22839355, "step": 4844, "time_per_iteration": 2.886984348297119 }, { "auxiliary_loss_clip": 0.01553472, "auxiliary_loss_mlp": 0.01043616, "balance_loss_clip": 1.35481524, "balance_loss_mlp": 1.0184387, "epoch": 0.29129715917631144, "flos": 22903632466560.0, "grad_norm": 2.176980253439563, "language_loss": 0.87435961, "learning_rate": 3.3255163687134396e-06, "loss": 0.90033048, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.25195312, "step": 4845, "time_per_iteration": 2.8728482723236084 }, { "auxiliary_loss_clip": 0.01541299, "auxiliary_loss_mlp": 0.01048768, "balance_loss_clip": 1.3456285, "balance_loss_mlp": 1.02428246, "epoch": 0.2913572824289794, "flos": 22684710286080.0, "grad_norm": 1.6837862498302114, "language_loss": 0.67999321, "learning_rate": 3.3252247016350046e-06, "loss": 0.70589387, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.24462891, "step": 4846, "time_per_iteration": 2.8466150760650635 }, { "auxiliary_loss_clip": 0.01514748, "auxiliary_loss_mlp": 0.01045379, "balance_loss_clip": 1.32501113, "balance_loss_mlp": 1.02197862, "epoch": 0.29141740568164737, "flos": 23116491843840.0, "grad_norm": 1.9915218051066874, "language_loss": 0.71215254, "learning_rate": 3.3249329843039166e-06, "loss": 0.73775381, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.23413086, "step": 4847, "time_per_iteration": 2.895432949066162 }, { "auxiliary_loss_clip": 0.01538456, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.34541702, "balance_loss_mlp": 1.01860094, "epoch": 0.29147752893431533, "flos": 23597617495680.0, "grad_norm": 1.5592810047931016, "language_loss": 0.74414724, "learning_rate": 3.324641216731237e-06, "loss": 0.76995122, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.23352051, "step": 4848, "time_per_iteration": 2.884173631668091 }, { "auxiliary_loss_clip": 0.01525543, "auxiliary_loss_mlp": 0.01044568, "balance_loss_clip": 1.33031058, "balance_loss_mlp": 1.02152491, "epoch": 0.2915376521869833, "flos": 20600945406720.0, "grad_norm": 2.6944950377436427, "language_loss": 0.77402794, "learning_rate": 3.3243493989280295e-06, "loss": 0.79972905, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.23022461, "step": 4849, "time_per_iteration": 2.8488566875457764 }, { "auxiliary_loss_clip": 0.0155703, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.35770774, "balance_loss_mlp": 1.01876485, "epoch": 0.29159777543965126, "flos": 20820681993600.0, "grad_norm": 1.8448419366586088, "language_loss": 0.79740727, "learning_rate": 3.3240575309053596e-06, "loss": 0.82340664, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.24121094, "step": 4850, "time_per_iteration": 2.9935688972473145 }, { "auxiliary_loss_clip": 0.01520104, "auxiliary_loss_mlp": 0.01044738, "balance_loss_clip": 1.33030248, "balance_loss_mlp": 1.02002621, "epoch": 0.29165789869231923, "flos": 24255017464320.0, "grad_norm": 2.397375492551384, "language_loss": 0.76534855, "learning_rate": 3.323765612674296e-06, "loss": 0.79099691, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.24694824, "step": 4851, "time_per_iteration": 2.8717474937438965 }, { "auxiliary_loss_clip": 0.01531667, "auxiliary_loss_mlp": 0.01042962, "balance_loss_clip": 1.34110999, "balance_loss_mlp": 1.0207417, "epoch": 0.29171802194498725, "flos": 28961776477440.0, "grad_norm": 1.3208301180941222, "language_loss": 0.78186846, "learning_rate": 3.3234736442459078e-06, "loss": 0.80761474, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22216797, "step": 4852, "time_per_iteration": 2.8919177055358887 }, { "auxiliary_loss_clip": 0.01531064, "auxiliary_loss_mlp": 0.0105082, "balance_loss_clip": 1.33513737, "balance_loss_mlp": 1.02628684, "epoch": 0.2917781451976552, "flos": 22607603867520.0, "grad_norm": 1.8763100868142792, "language_loss": 0.78700179, "learning_rate": 3.3231816256312665e-06, "loss": 0.81282067, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.24536133, "step": 4853, "time_per_iteration": 2.8952176570892334 }, { "auxiliary_loss_clip": 0.01531968, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.33793628, "balance_loss_mlp": 1.02419281, "epoch": 0.2918382684503232, "flos": 21582995950080.0, "grad_norm": 3.878396843782532, "language_loss": 0.88163388, "learning_rate": 3.322889556841445e-06, "loss": 0.90742582, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.23022461, "step": 4854, "time_per_iteration": 2.844064712524414 }, { "auxiliary_loss_clip": 0.01527517, "auxiliary_loss_mlp": 0.01050233, "balance_loss_clip": 1.33498228, "balance_loss_mlp": 1.02517498, "epoch": 0.29189839170299114, "flos": 24364365442560.0, "grad_norm": 1.744825566185345, "language_loss": 0.8709482, "learning_rate": 3.322597437887519e-06, "loss": 0.89672565, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.25085449, "step": 4855, "time_per_iteration": 2.915282726287842 }, { "auxiliary_loss_clip": 0.01282184, "auxiliary_loss_mlp": 0.01044068, "balance_loss_clip": 1.17160237, "balance_loss_mlp": 1.02089334, "epoch": 0.2919585149556591, "flos": 71350813088640.0, "grad_norm": 0.8062873874133749, "language_loss": 0.6018914, "learning_rate": 3.322305268780566e-06, "loss": 0.6251539, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.23144531, "step": 4856, "time_per_iteration": 3.4645884037017822 }, { "auxiliary_loss_clip": 0.01526537, "auxiliary_loss_mlp": 0.01043622, "balance_loss_clip": 1.33510208, "balance_loss_mlp": 1.02127004, "epoch": 0.2920186382083271, "flos": 15641755799040.0, "grad_norm": 2.0002598352783294, "language_loss": 0.69057965, "learning_rate": 3.322013049531664e-06, "loss": 0.71628118, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.22363281, "step": 4857, "time_per_iteration": 2.8321139812469482 }, { "auxiliary_loss_clip": 0.0152362, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.33255827, "balance_loss_mlp": 1.01741052, "epoch": 0.29207876146099504, "flos": 28377682364160.0, "grad_norm": 2.3010382853208142, "language_loss": 0.84899032, "learning_rate": 3.321720780151895e-06, "loss": 0.87463415, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.23364258, "step": 4858, "time_per_iteration": 2.8988840579986572 }, { "auxiliary_loss_clip": 0.01528814, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.34009886, "balance_loss_mlp": 1.01493883, "epoch": 0.292138884713663, "flos": 21880789096320.0, "grad_norm": 2.0630649448216087, "language_loss": 0.78247088, "learning_rate": 3.321428460652342e-06, "loss": 0.80812633, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.21789551, "step": 4859, "time_per_iteration": 2.86971116065979 }, { "auxiliary_loss_clip": 0.01538906, "auxiliary_loss_mlp": 0.01040792, "balance_loss_clip": 1.34105766, "balance_loss_mlp": 1.016891, "epoch": 0.29219900796633097, "flos": 21001435545600.0, "grad_norm": 2.734799747402687, "language_loss": 0.69530678, "learning_rate": 3.3211360910440885e-06, "loss": 0.72110379, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.23913574, "step": 4860, "time_per_iteration": 2.835451364517212 }, { "auxiliary_loss_clip": 0.01528173, "auxiliary_loss_mlp": 0.01039757, "balance_loss_clip": 1.339149, "balance_loss_mlp": 1.01773906, "epoch": 0.29225913121899894, "flos": 35017522513920.0, "grad_norm": 2.3296138396236263, "language_loss": 0.76085657, "learning_rate": 3.320843671338222e-06, "loss": 0.78653586, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.22021484, "step": 4861, "time_per_iteration": 2.954454183578491 }, { "auxiliary_loss_clip": 0.01535793, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.3438921, "balance_loss_mlp": 1.01832724, "epoch": 0.2923192544716669, "flos": 13523306140800.0, "grad_norm": 1.8386470673134288, "language_loss": 0.92467928, "learning_rate": 3.320551201545832e-06, "loss": 0.95046365, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.24316406, "step": 4862, "time_per_iteration": 2.8857734203338623 }, { "auxiliary_loss_clip": 0.01513345, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.32350779, "balance_loss_mlp": 1.01690936, "epoch": 0.29237937772433487, "flos": 19472871335040.0, "grad_norm": 2.1054205792569367, "language_loss": 0.74526262, "learning_rate": 3.320258681678008e-06, "loss": 0.7707988, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.23364258, "step": 4863, "time_per_iteration": 2.8300278186798096 }, { "auxiliary_loss_clip": 0.01520349, "auxiliary_loss_mlp": 0.01042846, "balance_loss_clip": 1.33192468, "balance_loss_mlp": 1.02062535, "epoch": 0.29243950097700283, "flos": 20860479434880.0, "grad_norm": 1.756460713055701, "language_loss": 0.79002368, "learning_rate": 3.319966111745842e-06, "loss": 0.81565565, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.22229004, "step": 4864, "time_per_iteration": 2.8762505054473877 }, { "auxiliary_loss_clip": 0.01538523, "auxiliary_loss_mlp": 0.01043596, "balance_loss_clip": 1.34326458, "balance_loss_mlp": 1.0195868, "epoch": 0.29249962422967085, "flos": 23594088401280.0, "grad_norm": 1.7791752497708395, "language_loss": 0.82881081, "learning_rate": 3.319673491760429e-06, "loss": 0.85463202, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.2401123, "step": 4865, "time_per_iteration": 2.9375648498535156 }, { "auxiliary_loss_clip": 0.01542075, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.34733713, "balance_loss_mlp": 1.01510537, "epoch": 0.2925597474823388, "flos": 22283541475200.0, "grad_norm": 1.9118610850609186, "language_loss": 0.86229885, "learning_rate": 3.3193808217328645e-06, "loss": 0.88810742, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.23669434, "step": 4866, "time_per_iteration": 2.8902969360351562 }, { "auxiliary_loss_clip": 0.01518276, "auxiliary_loss_mlp": 0.01042259, "balance_loss_clip": 1.32930064, "balance_loss_mlp": 1.01841736, "epoch": 0.2926198707350068, "flos": 34468565627520.0, "grad_norm": 2.0160114293431532, "language_loss": 0.76587665, "learning_rate": 3.3190881016742476e-06, "loss": 0.79148197, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.23840332, "step": 4867, "time_per_iteration": 3.008183002471924 }, { "auxiliary_loss_clip": 0.01533303, "auxiliary_loss_mlp": 0.01041746, "balance_loss_clip": 1.33735347, "balance_loss_mlp": 1.01759386, "epoch": 0.29267999398767475, "flos": 20713867724160.0, "grad_norm": 1.960050782029983, "language_loss": 0.74922061, "learning_rate": 3.3187953315956776e-06, "loss": 0.77497113, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.24157715, "step": 4868, "time_per_iteration": 4.286706209182739 }, { "auxiliary_loss_clip": 0.01519556, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.33051467, "balance_loss_mlp": 1.01803708, "epoch": 0.2927401172403427, "flos": 18377672250240.0, "grad_norm": 1.4191968971136724, "language_loss": 0.75609601, "learning_rate": 3.3185025115082566e-06, "loss": 0.78170449, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.2322998, "step": 4869, "time_per_iteration": 2.8878872394561768 }, { "auxiliary_loss_clip": 0.01521719, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.33056462, "balance_loss_mlp": 1.01861477, "epoch": 0.2928002404930107, "flos": 26115154704000.0, "grad_norm": 1.6219859382381214, "language_loss": 0.7743417, "learning_rate": 3.318209641423088e-06, "loss": 0.79998237, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.23730469, "step": 4870, "time_per_iteration": 2.9074721336364746 }, { "auxiliary_loss_clip": 0.01547089, "auxiliary_loss_mlp": 0.01042147, "balance_loss_clip": 1.34937668, "balance_loss_mlp": 1.01816177, "epoch": 0.29286036374567864, "flos": 21334682632320.0, "grad_norm": 2.2010404276808573, "language_loss": 0.69276273, "learning_rate": 3.3179167213512777e-06, "loss": 0.71865511, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.23986816, "step": 4871, "time_per_iteration": 2.8736696243286133 }, { "auxiliary_loss_clip": 0.01524867, "auxiliary_loss_mlp": 0.01039018, "balance_loss_clip": 1.33290839, "balance_loss_mlp": 1.01634431, "epoch": 0.2929204869983466, "flos": 29581324531200.0, "grad_norm": 3.9092162221117723, "language_loss": 0.78789973, "learning_rate": 3.317623751303933e-06, "loss": 0.81353855, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.22668457, "step": 4872, "time_per_iteration": 2.9338669776916504 }, { "auxiliary_loss_clip": 0.01535388, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.33821607, "balance_loss_mlp": 1.01742244, "epoch": 0.2929806102510146, "flos": 19066499372160.0, "grad_norm": 1.984474728434417, "language_loss": 0.73049825, "learning_rate": 3.317330731292164e-06, "loss": 0.75627279, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.2467041, "step": 4873, "time_per_iteration": 2.8545923233032227 }, { "auxiliary_loss_clip": 0.01545603, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 1.34888148, "balance_loss_mlp": 1.01971364, "epoch": 0.29304073350368254, "flos": 21954140196480.0, "grad_norm": 4.680627879807124, "language_loss": 0.79091197, "learning_rate": 3.3170376613270812e-06, "loss": 0.81680161, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.23657227, "step": 4874, "time_per_iteration": 4.320667028427124 }, { "auxiliary_loss_clip": 0.01552841, "auxiliary_loss_mlp": 0.01045757, "balance_loss_clip": 1.3506248, "balance_loss_mlp": 1.02117586, "epoch": 0.2931008567563505, "flos": 15459328189440.0, "grad_norm": 2.058862323118028, "language_loss": 0.78623843, "learning_rate": 3.3167445414197985e-06, "loss": 0.81222439, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 2.02441406, "router_z_loss_mlp": 0.24584961, "step": 4875, "time_per_iteration": 2.9040255546569824 }, { "auxiliary_loss_clip": 0.01530395, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.33684039, "balance_loss_mlp": 1.01832759, "epoch": 0.29316098000901847, "flos": 16992054921600.0, "grad_norm": 2.092750627964291, "language_loss": 0.7026211, "learning_rate": 3.316451371581431e-06, "loss": 0.72835618, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.2479248, "step": 4876, "time_per_iteration": 2.875425338745117 }, { "auxiliary_loss_clip": 0.01514971, "auxiliary_loss_mlp": 0.01043358, "balance_loss_clip": 1.32243049, "balance_loss_mlp": 1.02006412, "epoch": 0.29322110326168643, "flos": 16365086720640.0, "grad_norm": 1.9180342404878234, "language_loss": 0.83360881, "learning_rate": 3.316158151823096e-06, "loss": 0.85919201, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.23291016, "step": 4877, "time_per_iteration": 2.8471710681915283 }, { "auxiliary_loss_clip": 0.01541036, "auxiliary_loss_mlp": 0.01045883, "balance_loss_clip": 1.34278631, "balance_loss_mlp": 1.02213669, "epoch": 0.29328122651435445, "flos": 13998866682240.0, "grad_norm": 1.9828485282389599, "language_loss": 0.68991065, "learning_rate": 3.315864882155911e-06, "loss": 0.71577978, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.23754883, "step": 4878, "time_per_iteration": 4.361980676651001 }, { "auxiliary_loss_clip": 0.01521252, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.3283515, "balance_loss_mlp": 1.01915598, "epoch": 0.2933413497670224, "flos": 25275417615360.0, "grad_norm": 2.589905879694502, "language_loss": 0.74170744, "learning_rate": 3.3155715625909982e-06, "loss": 0.76734924, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.2376709, "step": 4879, "time_per_iteration": 2.876278877258301 }, { "auxiliary_loss_clip": 0.01543646, "auxiliary_loss_mlp": 0.01049638, "balance_loss_clip": 1.34660339, "balance_loss_mlp": 1.02425814, "epoch": 0.2934014730196904, "flos": 32136306451200.0, "grad_norm": 2.1242386139852814, "language_loss": 0.67465305, "learning_rate": 3.3152781931394803e-06, "loss": 0.70058584, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.25378418, "step": 4880, "time_per_iteration": 2.945148229598999 }, { "auxiliary_loss_clip": 0.01527222, "auxiliary_loss_mlp": 0.01051452, "balance_loss_clip": 1.3306942, "balance_loss_mlp": 1.02724075, "epoch": 0.29346159627235835, "flos": 24363098588160.0, "grad_norm": 2.7265221894160603, "language_loss": 0.71906066, "learning_rate": 3.314984773812481e-06, "loss": 0.74484748, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24206543, "step": 4881, "time_per_iteration": 2.8709263801574707 }, { "auxiliary_loss_clip": 0.01529342, "auxiliary_loss_mlp": 0.01039117, "balance_loss_clip": 1.33305657, "balance_loss_mlp": 1.01606202, "epoch": 0.2935217195250263, "flos": 22756839776640.0, "grad_norm": 1.6083957014103831, "language_loss": 0.84018385, "learning_rate": 3.314691304621127e-06, "loss": 0.86586845, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.23071289, "step": 4882, "time_per_iteration": 2.867452621459961 }, { "auxiliary_loss_clip": 0.01557338, "auxiliary_loss_mlp": 0.0104781, "balance_loss_clip": 1.35457015, "balance_loss_mlp": 1.02223921, "epoch": 0.2935818427776943, "flos": 21735489484800.0, "grad_norm": 2.1191585621505906, "language_loss": 0.73101318, "learning_rate": 3.314397785576548e-06, "loss": 0.75706458, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.25598145, "step": 4883, "time_per_iteration": 2.8862459659576416 }, { "auxiliary_loss_clip": 0.01539015, "auxiliary_loss_mlp": 0.01044109, "balance_loss_clip": 1.34251916, "balance_loss_mlp": 1.02051747, "epoch": 0.29364196603036224, "flos": 23815363311360.0, "grad_norm": 4.398786132718032, "language_loss": 0.93146265, "learning_rate": 3.3141042166898726e-06, "loss": 0.95729387, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.23571777, "step": 4884, "time_per_iteration": 2.861205577850342 }, { "auxiliary_loss_clip": 0.01548594, "auxiliary_loss_mlp": 0.01044523, "balance_loss_clip": 1.35051203, "balance_loss_mlp": 1.01976347, "epoch": 0.2937020892830302, "flos": 23478677619840.0, "grad_norm": 3.1338603900124498, "language_loss": 0.7448709, "learning_rate": 3.313810597972234e-06, "loss": 0.77080214, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.24768066, "step": 4885, "time_per_iteration": 2.8936352729797363 }, { "auxiliary_loss_clip": 0.0153208, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.33726823, "balance_loss_mlp": 1.02096057, "epoch": 0.2937622125356982, "flos": 24281558179200.0, "grad_norm": 1.8083308356850656, "language_loss": 0.86111861, "learning_rate": 3.3135169294347655e-06, "loss": 0.88687956, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.23046875, "step": 4886, "time_per_iteration": 2.8527188301086426 }, { "auxiliary_loss_clip": 0.01531342, "auxiliary_loss_mlp": 0.01042517, "balance_loss_clip": 1.3345468, "balance_loss_mlp": 1.01980758, "epoch": 0.29382233578836614, "flos": 20670812657280.0, "grad_norm": 3.93025235179736, "language_loss": 0.77943081, "learning_rate": 3.313223211088603e-06, "loss": 0.8051694, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.22692871, "step": 4887, "time_per_iteration": 2.857243776321411 }, { "auxiliary_loss_clip": 0.01533465, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.33469582, "balance_loss_mlp": 1.01964056, "epoch": 0.2938824590410341, "flos": 16553758112640.0, "grad_norm": 4.884754183185183, "language_loss": 0.80419457, "learning_rate": 3.3129294429448855e-06, "loss": 0.82995045, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.22460938, "step": 4888, "time_per_iteration": 2.89253306388855 }, { "auxiliary_loss_clip": 0.01526394, "auxiliary_loss_mlp": 0.0104772, "balance_loss_clip": 1.33136714, "balance_loss_mlp": 1.02329421, "epoch": 0.29394258229370207, "flos": 37939667137920.0, "grad_norm": 1.4564926487818959, "language_loss": 0.55962968, "learning_rate": 3.3126356250147517e-06, "loss": 0.58537078, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.24438477, "step": 4889, "time_per_iteration": 3.0274593830108643 }, { "auxiliary_loss_clip": 0.01530776, "auxiliary_loss_mlp": 0.01050102, "balance_loss_clip": 1.33372796, "balance_loss_mlp": 1.02592587, "epoch": 0.29400270554637004, "flos": 20053526843520.0, "grad_norm": 2.7510419294350785, "language_loss": 0.85621178, "learning_rate": 3.3123417573093434e-06, "loss": 0.88202059, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.24194336, "step": 4890, "time_per_iteration": 2.86726713180542 }, { "auxiliary_loss_clip": 0.01534436, "auxiliary_loss_mlp": 0.01050776, "balance_loss_clip": 1.33387566, "balance_loss_mlp": 1.02546811, "epoch": 0.294062828799038, "flos": 15274321626240.0, "grad_norm": 2.0845394886231383, "language_loss": 0.73848218, "learning_rate": 3.3120478398398046e-06, "loss": 0.76433432, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.25305176, "step": 4891, "time_per_iteration": 2.8367621898651123 }, { "auxiliary_loss_clip": 0.01535862, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.33701372, "balance_loss_mlp": 1.0226903, "epoch": 0.294122952051706, "flos": 22757337469440.0, "grad_norm": 2.3204329272565474, "language_loss": 0.78384876, "learning_rate": 3.3117538726172797e-06, "loss": 0.80967498, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.24084473, "step": 4892, "time_per_iteration": 2.8875036239624023 }, { "auxiliary_loss_clip": 0.01522769, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.32821631, "balance_loss_mlp": 1.01359642, "epoch": 0.294183075304374, "flos": 24983687272320.0, "grad_norm": 1.6372796940592154, "language_loss": 0.78318542, "learning_rate": 3.3114598556529164e-06, "loss": 0.80878073, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.23181152, "step": 4893, "time_per_iteration": 3.059711456298828 }, { "auxiliary_loss_clip": 0.01537031, "auxiliary_loss_mlp": 0.01048844, "balance_loss_clip": 1.34012079, "balance_loss_mlp": 1.02534842, "epoch": 0.29424319855704195, "flos": 30964408151040.0, "grad_norm": 1.66569711605378, "language_loss": 0.85364348, "learning_rate": 3.311165788957864e-06, "loss": 0.87950224, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.23510742, "step": 4894, "time_per_iteration": 2.9443840980529785 }, { "auxiliary_loss_clip": 0.01540318, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.34008622, "balance_loss_mlp": 1.02300119, "epoch": 0.2943033218097099, "flos": 15239953560960.0, "grad_norm": 2.6815384010521788, "language_loss": 0.91394985, "learning_rate": 3.310871672543274e-06, "loss": 0.93982077, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.23742676, "step": 4895, "time_per_iteration": 2.908296585083008 }, { "auxiliary_loss_clip": 0.01535644, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.33466816, "balance_loss_mlp": 1.0193156, "epoch": 0.2943634450623779, "flos": 21735896688000.0, "grad_norm": 2.661632450135291, "language_loss": 0.87944651, "learning_rate": 3.3105775064202982e-06, "loss": 0.90524942, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.25354004, "step": 4896, "time_per_iteration": 2.8837602138519287 }, { "auxiliary_loss_clip": 0.01539619, "auxiliary_loss_mlp": 0.01048052, "balance_loss_clip": 1.34047794, "balance_loss_mlp": 1.02415061, "epoch": 0.29442356831504585, "flos": 22612490305920.0, "grad_norm": 1.7921180394575458, "language_loss": 0.74468344, "learning_rate": 3.3102832906000924e-06, "loss": 0.77056015, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.23901367, "step": 4897, "time_per_iteration": 2.860987424850464 }, { "auxiliary_loss_clip": 0.01544886, "auxiliary_loss_mlp": 0.01047452, "balance_loss_clip": 1.34012115, "balance_loss_mlp": 1.02223957, "epoch": 0.2944836915677138, "flos": 20021059059840.0, "grad_norm": 1.8816052502821425, "language_loss": 0.75124109, "learning_rate": 3.309989025093813e-06, "loss": 0.77716446, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.25219727, "step": 4898, "time_per_iteration": 2.83255672454834 }, { "auxiliary_loss_clip": 0.01544546, "auxiliary_loss_mlp": 0.01051989, "balance_loss_clip": 1.34175086, "balance_loss_mlp": 1.0263474, "epoch": 0.2945438148203818, "flos": 20055019921920.0, "grad_norm": 6.539752892255044, "language_loss": 0.71773231, "learning_rate": 3.309694709912618e-06, "loss": 0.74369764, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.2565918, "step": 4899, "time_per_iteration": 2.8461575508117676 }, { "auxiliary_loss_clip": 0.01519946, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.32440925, "balance_loss_mlp": 1.02929926, "epoch": 0.29460393807304974, "flos": 23744319696000.0, "grad_norm": 1.9878150960211893, "language_loss": 0.80258918, "learning_rate": 3.3094003450676685e-06, "loss": 0.82833111, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.24975586, "step": 4900, "time_per_iteration": 2.876877784729004 }, { "auxiliary_loss_clip": 0.01522909, "auxiliary_loss_mlp": 0.01045345, "balance_loss_clip": 1.32488346, "balance_loss_mlp": 1.02116942, "epoch": 0.2946640613257177, "flos": 14984355830400.0, "grad_norm": 1.8196305564122879, "language_loss": 0.81531596, "learning_rate": 3.3091059305701268e-06, "loss": 0.84099853, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24182129, "step": 4901, "time_per_iteration": 2.814955949783325 }, { "auxiliary_loss_clip": 0.0151258, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.32187533, "balance_loss_mlp": 1.0174644, "epoch": 0.2947241845783857, "flos": 24254700750720.0, "grad_norm": 3.5167816290111147, "language_loss": 0.58869886, "learning_rate": 3.308811466431157e-06, "loss": 0.61423433, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.23498535, "step": 4902, "time_per_iteration": 2.891887903213501 }, { "auxiliary_loss_clip": 0.01513142, "auxiliary_loss_mlp": 0.0104231, "balance_loss_clip": 1.31874824, "balance_loss_mlp": 1.01802659, "epoch": 0.29478430783105364, "flos": 19948024673280.0, "grad_norm": 1.566105891320166, "language_loss": 0.7690345, "learning_rate": 3.308516952661925e-06, "loss": 0.79458904, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.24291992, "step": 4903, "time_per_iteration": 4.317591190338135 }, { "auxiliary_loss_clip": 0.01533441, "auxiliary_loss_mlp": 0.0104496, "balance_loss_clip": 1.33746862, "balance_loss_mlp": 1.01925826, "epoch": 0.2948444310837216, "flos": 27392871888000.0, "grad_norm": 1.8117101815746506, "language_loss": 0.62750602, "learning_rate": 3.3082223892736e-06, "loss": 0.65329003, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.25720215, "step": 4904, "time_per_iteration": 2.8760673999786377 }, { "auxiliary_loss_clip": 0.01533691, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.33501124, "balance_loss_mlp": 1.01924741, "epoch": 0.2949045543363896, "flos": 23416230516480.0, "grad_norm": 4.5737289393405165, "language_loss": 0.73943007, "learning_rate": 3.3079277762773496e-06, "loss": 0.76520479, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.24536133, "step": 4905, "time_per_iteration": 2.903163433074951 }, { "auxiliary_loss_clip": 0.01529577, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.33389771, "balance_loss_mlp": 1.01609468, "epoch": 0.2949646775890576, "flos": 23962065511680.0, "grad_norm": 1.5996083176144953, "language_loss": 0.81970352, "learning_rate": 3.3076331136843476e-06, "loss": 0.84541214, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.25183105, "step": 4906, "time_per_iteration": 2.906320571899414 }, { "auxiliary_loss_clip": 0.01520631, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.32962823, "balance_loss_mlp": 1.01530206, "epoch": 0.29502480084172555, "flos": 22794691691520.0, "grad_norm": 2.9017536736950533, "language_loss": 0.88206303, "learning_rate": 3.3073384015057667e-06, "loss": 0.90765977, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.23742676, "step": 4907, "time_per_iteration": 2.9583616256713867 }, { "auxiliary_loss_clip": 0.01556811, "auxiliary_loss_mlp": 0.01045705, "balance_loss_clip": 1.3575058, "balance_loss_mlp": 1.01946676, "epoch": 0.2950849240943935, "flos": 19656249085440.0, "grad_norm": 2.2702432711454947, "language_loss": 0.83048761, "learning_rate": 3.307043639752782e-06, "loss": 0.85651273, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.2623291, "step": 4908, "time_per_iteration": 2.87441349029541 }, { "auxiliary_loss_clip": 0.01274833, "auxiliary_loss_mlp": 0.01023378, "balance_loss_clip": 1.16208935, "balance_loss_mlp": 0.99839133, "epoch": 0.2951450473470615, "flos": 71031121459200.0, "grad_norm": 0.7829443288956217, "language_loss": 0.57302344, "learning_rate": 3.3067488284365728e-06, "loss": 0.59600562, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.25, "step": 4909, "time_per_iteration": 4.642868518829346 }, { "auxiliary_loss_clip": 0.01512257, "auxiliary_loss_mlp": 0.01040394, "balance_loss_clip": 1.31901681, "balance_loss_mlp": 1.01706493, "epoch": 0.29520517059972945, "flos": 22976621608320.0, "grad_norm": 1.4920629757163228, "language_loss": 0.87257779, "learning_rate": 3.3064539675683163e-06, "loss": 0.89810431, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.23352051, "step": 4910, "time_per_iteration": 2.8913161754608154 }, { "auxiliary_loss_clip": 0.01511527, "auxiliary_loss_mlp": 0.01045439, "balance_loss_clip": 1.31962204, "balance_loss_mlp": 1.02062011, "epoch": 0.2952652938523974, "flos": 20495443236480.0, "grad_norm": 1.735403264659366, "language_loss": 0.73970205, "learning_rate": 3.3061590571591946e-06, "loss": 0.76527172, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.24841309, "step": 4911, "time_per_iteration": 2.8653385639190674 }, { "auxiliary_loss_clip": 0.01515409, "auxiliary_loss_mlp": 0.01040059, "balance_loss_clip": 1.32649553, "balance_loss_mlp": 1.01533473, "epoch": 0.2953254171050654, "flos": 19656158595840.0, "grad_norm": 2.611400387935349, "language_loss": 0.90530622, "learning_rate": 3.3058640972203904e-06, "loss": 0.93086082, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.24731445, "step": 4912, "time_per_iteration": 5.723008394241333 }, { "auxiliary_loss_clip": 0.01521487, "auxiliary_loss_mlp": 0.01046157, "balance_loss_clip": 1.32773948, "balance_loss_mlp": 1.02163577, "epoch": 0.29538554035773334, "flos": 22758378099840.0, "grad_norm": 1.955514526276525, "language_loss": 0.84367937, "learning_rate": 3.3055690877630894e-06, "loss": 0.8693558, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24536133, "step": 4913, "time_per_iteration": 2.8549680709838867 }, { "auxiliary_loss_clip": 0.01510345, "auxiliary_loss_mlp": 0.01043727, "balance_loss_clip": 1.31843436, "balance_loss_mlp": 1.01955128, "epoch": 0.2954456636104013, "flos": 21881920216320.0, "grad_norm": 1.9858125470664223, "language_loss": 0.77735823, "learning_rate": 3.3052740287984765e-06, "loss": 0.80289888, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.24169922, "step": 4914, "time_per_iteration": 2.876253366470337 }, { "auxiliary_loss_clip": 0.01514595, "auxiliary_loss_mlp": 0.01039453, "balance_loss_clip": 1.32313275, "balance_loss_mlp": 1.01471758, "epoch": 0.2955057868630693, "flos": 40457294835840.0, "grad_norm": 1.7967215235318004, "language_loss": 0.82578081, "learning_rate": 3.3049789203377424e-06, "loss": 0.85132128, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.24755859, "step": 4915, "time_per_iteration": 3.078629970550537 }, { "auxiliary_loss_clip": 0.01533861, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.33960664, "balance_loss_mlp": 1.01968884, "epoch": 0.29556591011573724, "flos": 22574593146240.0, "grad_norm": 1.8510158137637434, "language_loss": 0.85425466, "learning_rate": 3.3046837623920772e-06, "loss": 0.88003671, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.24694824, "step": 4916, "time_per_iteration": 2.8459367752075195 }, { "auxiliary_loss_clip": 0.01531898, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.33787382, "balance_loss_mlp": 1.01380754, "epoch": 0.2956260333684052, "flos": 22099032604800.0, "grad_norm": 1.7742324559194904, "language_loss": 0.70701897, "learning_rate": 3.3043885549726723e-06, "loss": 0.73271966, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.24389648, "step": 4917, "time_per_iteration": 2.854931354522705 }, { "auxiliary_loss_clip": 0.01529164, "auxiliary_loss_mlp": 0.01039537, "balance_loss_clip": 1.33460867, "balance_loss_mlp": 1.01562333, "epoch": 0.2956861566210732, "flos": 16444138665600.0, "grad_norm": 2.1228934180457837, "language_loss": 0.91581422, "learning_rate": 3.3040932980907226e-06, "loss": 0.9415012, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.23913574, "step": 4918, "time_per_iteration": 2.838308334350586 }, { "auxiliary_loss_clip": 0.01529754, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.33318543, "balance_loss_mlp": 1.01370931, "epoch": 0.2957462798737412, "flos": 25823107647360.0, "grad_norm": 2.001310918680199, "language_loss": 0.73416328, "learning_rate": 3.303797991757425e-06, "loss": 0.75984728, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24926758, "step": 4919, "time_per_iteration": 2.8938424587249756 }, { "auxiliary_loss_clip": 0.01517927, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.32574129, "balance_loss_mlp": 1.0194329, "epoch": 0.29580640312640916, "flos": 16699555416960.0, "grad_norm": 1.7684296112037556, "language_loss": 0.77451813, "learning_rate": 3.3035026359839763e-06, "loss": 0.80013573, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.2442627, "step": 4920, "time_per_iteration": 2.852980375289917 }, { "auxiliary_loss_clip": 0.0153525, "auxiliary_loss_mlp": 0.01045647, "balance_loss_clip": 1.33689332, "balance_loss_mlp": 1.02019632, "epoch": 0.2958665263790771, "flos": 23954645364480.0, "grad_norm": 2.881678797187391, "language_loss": 0.69476038, "learning_rate": 3.3032072307815774e-06, "loss": 0.72056937, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.25439453, "step": 4921, "time_per_iteration": 2.878619432449341 }, { "auxiliary_loss_clip": 0.01550206, "auxiliary_loss_mlp": 0.01047892, "balance_loss_clip": 1.3486526, "balance_loss_mlp": 1.02239323, "epoch": 0.2959266496317451, "flos": 18487517921280.0, "grad_norm": 1.889584123608708, "language_loss": 0.7557826, "learning_rate": 3.3029117761614298e-06, "loss": 0.78176355, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.25512695, "step": 4922, "time_per_iteration": 2.823582887649536 }, { "auxiliary_loss_clip": 0.01549174, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.34748793, "balance_loss_mlp": 1.01907468, "epoch": 0.29598677288441305, "flos": 25968407258880.0, "grad_norm": 1.9237383426609478, "language_loss": 0.77484345, "learning_rate": 3.302616272134737e-06, "loss": 0.8007586, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.23242188, "step": 4923, "time_per_iteration": 2.8569960594177246 }, { "auxiliary_loss_clip": 0.01537164, "auxiliary_loss_mlp": 0.01043409, "balance_loss_clip": 1.34068477, "balance_loss_mlp": 1.01937628, "epoch": 0.296046896137081, "flos": 25167019777920.0, "grad_norm": 2.406129168654899, "language_loss": 0.87425554, "learning_rate": 3.3023207187127042e-06, "loss": 0.90006125, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.24047852, "step": 4924, "time_per_iteration": 2.894080877304077 }, { "auxiliary_loss_clip": 0.01524385, "auxiliary_loss_mlp": 0.01038871, "balance_loss_clip": 1.3308717, "balance_loss_mlp": 1.0135746, "epoch": 0.296107019389749, "flos": 21770852935680.0, "grad_norm": 1.4736664396761814, "language_loss": 0.82107008, "learning_rate": 3.3020251159065396e-06, "loss": 0.84670264, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.25305176, "step": 4925, "time_per_iteration": 2.867567300796509 }, { "auxiliary_loss_clip": 0.01530033, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.33626437, "balance_loss_mlp": 1.01648271, "epoch": 0.29616714264241695, "flos": 17967047276160.0, "grad_norm": 3.0619678910917556, "language_loss": 0.87820709, "learning_rate": 3.301729463727452e-06, "loss": 0.90390933, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.23706055, "step": 4926, "time_per_iteration": 2.9052720069885254 }, { "auxiliary_loss_clip": 0.01548832, "auxiliary_loss_mlp": 0.01044928, "balance_loss_clip": 1.35090256, "balance_loss_mlp": 1.021873, "epoch": 0.2962272658950849, "flos": 15021121870080.0, "grad_norm": 2.0302793594268245, "language_loss": 0.87627745, "learning_rate": 3.3014337621866527e-06, "loss": 0.902215, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.23071289, "step": 4927, "time_per_iteration": 2.831144332885742 }, { "auxiliary_loss_clip": 0.01527697, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.33528423, "balance_loss_mlp": 1.01862741, "epoch": 0.2962873891477529, "flos": 14729120058240.0, "grad_norm": 1.7805875483029454, "language_loss": 0.81284195, "learning_rate": 3.3011380112953553e-06, "loss": 0.83852869, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.22351074, "step": 4928, "time_per_iteration": 2.859617233276367 }, { "auxiliary_loss_clip": 0.01541598, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.33818555, "balance_loss_mlp": 1.02264023, "epoch": 0.29634751240042084, "flos": 26734928981760.0, "grad_norm": 2.3288392180537456, "language_loss": 0.73596847, "learning_rate": 3.300842211064773e-06, "loss": 0.7618655, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.25500488, "step": 4929, "time_per_iteration": 2.881685972213745 }, { "auxiliary_loss_clip": 0.01547543, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.34762061, "balance_loss_mlp": 1.02074289, "epoch": 0.2964076356530888, "flos": 14578572049920.0, "grad_norm": 2.2212626972885823, "language_loss": 0.73724723, "learning_rate": 3.3005463615061246e-06, "loss": 0.76318115, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.25109863, "step": 4930, "time_per_iteration": 2.8367295265197754 }, { "auxiliary_loss_clip": 0.01278756, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.16273761, "balance_loss_mlp": 1.00567782, "epoch": 0.29646775890575683, "flos": 63135462608640.0, "grad_norm": 0.8263645429582543, "language_loss": 0.60752988, "learning_rate": 3.3002504626306275e-06, "loss": 0.63058788, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.21386719, "step": 4931, "time_per_iteration": 3.326747179031372 }, { "auxiliary_loss_clip": 0.01278786, "auxiliary_loss_mlp": 0.01025153, "balance_loss_clip": 1.16257524, "balance_loss_mlp": 1.00627029, "epoch": 0.2965278821584248, "flos": 63098787058560.0, "grad_norm": 0.7453569473888378, "language_loss": 0.52513027, "learning_rate": 3.2999545144495023e-06, "loss": 0.54816967, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.18847656, "step": 4932, "time_per_iteration": 3.1977884769439697 }, { "auxiliary_loss_clip": 0.01527654, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.3333993, "balance_loss_mlp": 1.01442003, "epoch": 0.29658800541109276, "flos": 23779185454080.0, "grad_norm": 1.622578645638351, "language_loss": 0.82749587, "learning_rate": 3.299658516973972e-06, "loss": 0.85313922, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.22265625, "step": 4933, "time_per_iteration": 2.9025139808654785 }, { "auxiliary_loss_clip": 0.01520699, "auxiliary_loss_mlp": 0.01042828, "balance_loss_clip": 1.32953, "balance_loss_mlp": 1.01828289, "epoch": 0.2966481286637607, "flos": 23999283999360.0, "grad_norm": 2.2530013668319535, "language_loss": 0.76106381, "learning_rate": 3.299362470215261e-06, "loss": 0.78669918, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.2454834, "step": 4934, "time_per_iteration": 2.9833431243896484 }, { "auxiliary_loss_clip": 0.01540117, "auxiliary_loss_mlp": 0.01049731, "balance_loss_clip": 1.34180045, "balance_loss_mlp": 1.02397013, "epoch": 0.2967082519164287, "flos": 17173984838400.0, "grad_norm": 1.6968028482853748, "language_loss": 0.63725448, "learning_rate": 3.299066374184594e-06, "loss": 0.66315293, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.25744629, "step": 4935, "time_per_iteration": 2.817028760910034 }, { "auxiliary_loss_clip": 0.01531671, "auxiliary_loss_mlp": 0.0104657, "balance_loss_clip": 1.33714342, "balance_loss_mlp": 1.02291894, "epoch": 0.29676837516909665, "flos": 29399666083200.0, "grad_norm": 1.5728541186778446, "language_loss": 0.80157572, "learning_rate": 3.2987702288932e-06, "loss": 0.82735807, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.2364502, "step": 4936, "time_per_iteration": 3.0583746433258057 }, { "auxiliary_loss_clip": 0.01548331, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.34896338, "balance_loss_mlp": 1.02092528, "epoch": 0.2968284984217646, "flos": 34764594226560.0, "grad_norm": 1.5952664404043264, "language_loss": 0.75325072, "learning_rate": 3.298474034352309e-06, "loss": 0.77918255, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.23925781, "step": 4937, "time_per_iteration": 2.949100971221924 }, { "auxiliary_loss_clip": 0.015389, "auxiliary_loss_mlp": 0.01044432, "balance_loss_clip": 1.34227467, "balance_loss_mlp": 1.01986337, "epoch": 0.2968886216744326, "flos": 21554238240000.0, "grad_norm": 1.8662305753533548, "language_loss": 0.78422713, "learning_rate": 3.2981777905731526e-06, "loss": 0.8100605, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.24597168, "step": 4938, "time_per_iteration": 2.8709776401519775 }, { "auxiliary_loss_clip": 0.0154116, "auxiliary_loss_mlp": 0.01045951, "balance_loss_clip": 1.33958232, "balance_loss_mlp": 1.02096462, "epoch": 0.29694874492710055, "flos": 12795134025600.0, "grad_norm": 1.9466539686794282, "language_loss": 0.7774719, "learning_rate": 3.297881497566964e-06, "loss": 0.803343, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25, "step": 4939, "time_per_iteration": 4.238704442977905 }, { "auxiliary_loss_clip": 0.01543144, "auxiliary_loss_mlp": 0.0104763, "balance_loss_clip": 1.34260213, "balance_loss_mlp": 1.02397943, "epoch": 0.2970088681797685, "flos": 24580256221440.0, "grad_norm": 1.6596308280847512, "language_loss": 0.79094684, "learning_rate": 3.297585155344979e-06, "loss": 0.8168546, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.23657227, "step": 4940, "time_per_iteration": 2.878542184829712 }, { "auxiliary_loss_clip": 0.01544533, "auxiliary_loss_mlp": 0.01048367, "balance_loss_clip": 1.34374642, "balance_loss_mlp": 1.0223552, "epoch": 0.2970689914324365, "flos": 23669566007040.0, "grad_norm": 1.6658249036613395, "language_loss": 0.75946504, "learning_rate": 3.297288763918435e-06, "loss": 0.78539407, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.26013184, "step": 4941, "time_per_iteration": 2.9013755321502686 }, { "auxiliary_loss_clip": 0.01551939, "auxiliary_loss_mlp": 0.01044954, "balance_loss_clip": 1.34769964, "balance_loss_mlp": 1.01862025, "epoch": 0.29712911468510445, "flos": 39683262476160.0, "grad_norm": 2.268665035435123, "language_loss": 0.75386113, "learning_rate": 3.2969923232985712e-06, "loss": 0.77982998, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 2.03710938, "router_z_loss_mlp": 0.26391602, "step": 4942, "time_per_iteration": 3.0222926139831543 }, { "auxiliary_loss_clip": 0.01541787, "auxiliary_loss_mlp": 0.0104559, "balance_loss_clip": 1.34043276, "balance_loss_mlp": 1.01929283, "epoch": 0.2971892379377724, "flos": 26406342109440.0, "grad_norm": 1.7119260650401216, "language_loss": 0.71180439, "learning_rate": 3.2966958334966287e-06, "loss": 0.73767817, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.26306152, "step": 4943, "time_per_iteration": 2.888421058654785 }, { "auxiliary_loss_clip": 0.01542261, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.34138083, "balance_loss_mlp": 1.01935363, "epoch": 0.2972493611904404, "flos": 17612191157760.0, "grad_norm": 2.2296683555622123, "language_loss": 0.80351478, "learning_rate": 3.2963992945238497e-06, "loss": 0.82938111, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.25012207, "step": 4944, "time_per_iteration": 4.278645753860474 }, { "auxiliary_loss_clip": 0.01521804, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.32758474, "balance_loss_mlp": 1.01940656, "epoch": 0.2973094844431084, "flos": 20422544584320.0, "grad_norm": 2.178044085322441, "language_loss": 0.84634256, "learning_rate": 3.2961027063914795e-06, "loss": 0.87199253, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.23791504, "step": 4945, "time_per_iteration": 2.871676206588745 }, { "auxiliary_loss_clip": 0.01520657, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.32732844, "balance_loss_mlp": 1.01599324, "epoch": 0.29736960769577636, "flos": 17502390731520.0, "grad_norm": 1.7564182488903646, "language_loss": 0.68326652, "learning_rate": 3.2958060691107654e-06, "loss": 0.70889342, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.26037598, "step": 4946, "time_per_iteration": 2.8632309436798096 }, { "auxiliary_loss_clip": 0.01529374, "auxiliary_loss_mlp": 0.01041595, "balance_loss_clip": 1.33287787, "balance_loss_mlp": 1.01640582, "epoch": 0.2974297309484443, "flos": 26115064214400.0, "grad_norm": 1.819632709315618, "language_loss": 0.75210977, "learning_rate": 3.2955093826929547e-06, "loss": 0.77781951, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.25195312, "step": 4947, "time_per_iteration": 5.790498495101929 }, { "auxiliary_loss_clip": 0.01539364, "auxiliary_loss_mlp": 0.01042387, "balance_loss_clip": 1.3405807, "balance_loss_mlp": 1.01754403, "epoch": 0.2974898542011123, "flos": 25677491322240.0, "grad_norm": 3.1122686081790785, "language_loss": 0.73835534, "learning_rate": 3.2952126471492985e-06, "loss": 0.76417285, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.24865723, "step": 4948, "time_per_iteration": 2.9019315242767334 }, { "auxiliary_loss_clip": 0.01513586, "auxiliary_loss_mlp": 0.01038954, "balance_loss_clip": 1.31934214, "balance_loss_mlp": 1.01469517, "epoch": 0.29754997745378026, "flos": 18670624202880.0, "grad_norm": 1.8736951914597135, "language_loss": 0.84322369, "learning_rate": 3.2949158624910497e-06, "loss": 0.86874908, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.24255371, "step": 4949, "time_per_iteration": 2.8188557624816895 }, { "auxiliary_loss_clip": 0.01527276, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.33073854, "balance_loss_mlp": 1.01608562, "epoch": 0.2976101007064482, "flos": 22285125043200.0, "grad_norm": 2.039329958056863, "language_loss": 0.71320117, "learning_rate": 3.2946190287294603e-06, "loss": 0.73889256, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.25793457, "step": 4950, "time_per_iteration": 2.886660099029541 }, { "auxiliary_loss_clip": 0.01503092, "auxiliary_loss_mlp": 0.01041337, "balance_loss_clip": 1.31415164, "balance_loss_mlp": 1.01619589, "epoch": 0.2976702239591162, "flos": 21955949988480.0, "grad_norm": 1.7473776854985792, "language_loss": 0.84170836, "learning_rate": 3.294322145875789e-06, "loss": 0.86715263, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.25158691, "step": 4951, "time_per_iteration": 2.8846027851104736 }, { "auxiliary_loss_clip": 0.01520187, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.32296979, "balance_loss_mlp": 1.01480877, "epoch": 0.29773034721178415, "flos": 24646006195200.0, "grad_norm": 3.5215008992561443, "language_loss": 0.76546621, "learning_rate": 3.2940252139412912e-06, "loss": 0.7910614, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.24511719, "step": 4952, "time_per_iteration": 2.896153211593628 }, { "auxiliary_loss_clip": 0.0151318, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.31937957, "balance_loss_mlp": 1.02108657, "epoch": 0.2977904704644521, "flos": 20567075034240.0, "grad_norm": 1.6881428555717954, "language_loss": 0.85002607, "learning_rate": 3.293728232937228e-06, "loss": 0.87561822, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.24975586, "step": 4953, "time_per_iteration": 2.8993093967437744 }, { "auxiliary_loss_clip": 0.01524572, "auxiliary_loss_mlp": 0.01040424, "balance_loss_clip": 1.32702374, "balance_loss_mlp": 1.01554453, "epoch": 0.2978505937171201, "flos": 18925724240640.0, "grad_norm": 2.0966530801809706, "language_loss": 0.75144899, "learning_rate": 3.2934312028748597e-06, "loss": 0.77709889, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.24902344, "step": 4954, "time_per_iteration": 2.8388619422912598 }, { "auxiliary_loss_clip": 0.0152548, "auxiliary_loss_mlp": 0.01041571, "balance_loss_clip": 1.32953119, "balance_loss_mlp": 1.01683497, "epoch": 0.29791071696978805, "flos": 19327119275520.0, "grad_norm": 1.826172120530772, "language_loss": 0.76162696, "learning_rate": 3.293134123765452e-06, "loss": 0.78729737, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.24743652, "step": 4955, "time_per_iteration": 2.85305118560791 }, { "auxiliary_loss_clip": 0.01530099, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.33097315, "balance_loss_mlp": 1.01870179, "epoch": 0.297970840222456, "flos": 18816014304000.0, "grad_norm": 1.8902690553195527, "language_loss": 0.73187441, "learning_rate": 3.2928369956202684e-06, "loss": 0.75761515, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 1.98730469, "router_z_loss_mlp": 0.25305176, "step": 4956, "time_per_iteration": 2.853522539138794 }, { "auxiliary_loss_clip": 0.0154677, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.34551096, "balance_loss_mlp": 1.01912713, "epoch": 0.298030963475124, "flos": 22861979988480.0, "grad_norm": 1.702060332079338, "language_loss": 0.79969275, "learning_rate": 3.2925398184505754e-06, "loss": 0.8256042, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25268555, "step": 4957, "time_per_iteration": 2.86283278465271 }, { "auxiliary_loss_clip": 0.01524023, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.32707787, "balance_loss_mlp": 1.01562452, "epoch": 0.298091086727792, "flos": 21877667205120.0, "grad_norm": 1.5202312820928867, "language_loss": 0.7104528, "learning_rate": 3.2922425922676437e-06, "loss": 0.73611629, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.26708984, "step": 4958, "time_per_iteration": 2.877079963684082 }, { "auxiliary_loss_clip": 0.01514125, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.32210779, "balance_loss_mlp": 1.01780295, "epoch": 0.29815120998045996, "flos": 21183953644800.0, "grad_norm": 1.796746825018875, "language_loss": 0.79515374, "learning_rate": 3.291945317082743e-06, "loss": 0.82073104, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.25793457, "step": 4959, "time_per_iteration": 2.853182554244995 }, { "auxiliary_loss_clip": 0.01511748, "auxiliary_loss_mlp": 0.01040226, "balance_loss_clip": 1.31702185, "balance_loss_mlp": 1.01495337, "epoch": 0.29821133323312793, "flos": 19904426668800.0, "grad_norm": 1.692841854667108, "language_loss": 0.80107194, "learning_rate": 3.291647992907147e-06, "loss": 0.82659167, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.25280762, "step": 4960, "time_per_iteration": 2.8593592643737793 }, { "auxiliary_loss_clip": 0.01532626, "auxiliary_loss_mlp": 0.01051997, "balance_loss_clip": 1.33092117, "balance_loss_mlp": 1.02398324, "epoch": 0.2982714564857959, "flos": 12758548965120.0, "grad_norm": 4.830231658893543, "language_loss": 0.75470066, "learning_rate": 3.291350619752129e-06, "loss": 0.7805469, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.2800293, "step": 4961, "time_per_iteration": 2.8379149436950684 }, { "auxiliary_loss_clip": 0.01541198, "auxiliary_loss_mlp": 0.01049305, "balance_loss_clip": 1.34103012, "balance_loss_mlp": 1.02316189, "epoch": 0.29833157973846386, "flos": 22281731683200.0, "grad_norm": 2.042139832968929, "language_loss": 0.62831706, "learning_rate": 3.291053197628967e-06, "loss": 0.65422201, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.26147461, "step": 4962, "time_per_iteration": 2.873258113861084 }, { "auxiliary_loss_clip": 0.01528471, "auxiliary_loss_mlp": 0.01056165, "balance_loss_clip": 1.33278012, "balance_loss_mlp": 1.03020167, "epoch": 0.2983917029911318, "flos": 15380230999680.0, "grad_norm": 1.900275798432957, "language_loss": 0.83791649, "learning_rate": 3.2907557265489375e-06, "loss": 0.86376286, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.25976562, "step": 4963, "time_per_iteration": 2.832347869873047 }, { "auxiliary_loss_clip": 0.01513683, "auxiliary_loss_mlp": 0.01049572, "balance_loss_clip": 1.31993067, "balance_loss_mlp": 1.02356017, "epoch": 0.2984518262437998, "flos": 15386067578880.0, "grad_norm": 2.006067391860304, "language_loss": 0.6746937, "learning_rate": 3.290458206523322e-06, "loss": 0.7003262, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.26000977, "step": 4964, "time_per_iteration": 2.8442320823669434 }, { "auxiliary_loss_clip": 0.01513246, "auxiliary_loss_mlp": 0.01052445, "balance_loss_clip": 1.31909394, "balance_loss_mlp": 1.02837658, "epoch": 0.29851194949646775, "flos": 18116147450880.0, "grad_norm": 1.7477787239090943, "language_loss": 0.71926272, "learning_rate": 3.2901606375634015e-06, "loss": 0.74491966, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.24060059, "step": 4965, "time_per_iteration": 2.8743093013763428 }, { "auxiliary_loss_clip": 0.01527714, "auxiliary_loss_mlp": 0.01060035, "balance_loss_clip": 1.33255482, "balance_loss_mlp": 1.03416622, "epoch": 0.2985720727491357, "flos": 22028441437440.0, "grad_norm": 1.8045038025303382, "language_loss": 0.67669845, "learning_rate": 3.289863019680461e-06, "loss": 0.70257586, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.25854492, "step": 4966, "time_per_iteration": 2.8956854343414307 }, { "auxiliary_loss_clip": 0.01527474, "auxiliary_loss_mlp": 0.01054026, "balance_loss_clip": 1.33111978, "balance_loss_mlp": 1.0288136, "epoch": 0.2986321960018037, "flos": 13048876719360.0, "grad_norm": 2.3789950621654232, "language_loss": 0.7452122, "learning_rate": 3.289565352885785e-06, "loss": 0.77102721, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.25244141, "step": 4967, "time_per_iteration": 2.8474502563476562 }, { "auxiliary_loss_clip": 0.01519001, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.32364357, "balance_loss_mlp": 1.026088, "epoch": 0.29869231925447165, "flos": 14473115124480.0, "grad_norm": 2.3891407498057173, "language_loss": 0.72118884, "learning_rate": 3.2892676371906614e-06, "loss": 0.74686944, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.22961426, "step": 4968, "time_per_iteration": 2.8301682472229004 }, { "auxiliary_loss_clip": 0.01522728, "auxiliary_loss_mlp": 0.01046147, "balance_loss_clip": 1.32503438, "balance_loss_mlp": 1.02182817, "epoch": 0.2987524425071396, "flos": 31662872415360.0, "grad_norm": 1.7421132792310432, "language_loss": 0.77528214, "learning_rate": 3.2889698726063805e-06, "loss": 0.80097091, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.2434082, "step": 4969, "time_per_iteration": 2.9342589378356934 }, { "auxiliary_loss_clip": 0.01512507, "auxiliary_loss_mlp": 0.0105319, "balance_loss_clip": 1.3201375, "balance_loss_mlp": 1.02961063, "epoch": 0.2988125657598076, "flos": 21443216204160.0, "grad_norm": 1.7156062496099085, "language_loss": 0.70525753, "learning_rate": 3.2886720591442327e-06, "loss": 0.73091447, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.23583984, "step": 4970, "time_per_iteration": 2.856318950653076 }, { "auxiliary_loss_clip": 0.01539169, "auxiliary_loss_mlp": 0.01049566, "balance_loss_clip": 1.33881044, "balance_loss_mlp": 1.02471042, "epoch": 0.2988726890124756, "flos": 18085941907200.0, "grad_norm": 2.167375756619445, "language_loss": 0.85900652, "learning_rate": 3.2883741968155103e-06, "loss": 0.88489383, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.24865723, "step": 4971, "time_per_iteration": 2.818199634552002 }, { "auxiliary_loss_clip": 0.01496284, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.30743814, "balance_loss_mlp": 1.01729321, "epoch": 0.29893281226514357, "flos": 21763930481280.0, "grad_norm": 1.7889536580720473, "language_loss": 0.8001219, "learning_rate": 3.2880762856315107e-06, "loss": 0.82550544, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.24804688, "step": 4972, "time_per_iteration": 2.9020330905914307 }, { "auxiliary_loss_clip": 0.01497496, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.30496895, "balance_loss_mlp": 1.01915097, "epoch": 0.29899293551781153, "flos": 16845443210880.0, "grad_norm": 1.9351163121622845, "language_loss": 0.86447096, "learning_rate": 3.2877783256035285e-06, "loss": 0.88988054, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.24304199, "step": 4973, "time_per_iteration": 2.8049018383026123 }, { "auxiliary_loss_clip": 0.01492302, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.30572879, "balance_loss_mlp": 1.01589894, "epoch": 0.2990530587704795, "flos": 11736746225280.0, "grad_norm": 2.6538637345197076, "language_loss": 0.7871415, "learning_rate": 3.287480316742863e-06, "loss": 0.81246001, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.2364502, "step": 4974, "time_per_iteration": 4.238566875457764 }, { "auxiliary_loss_clip": 0.01515449, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.32252228, "balance_loss_mlp": 1.01882625, "epoch": 0.29911318202314746, "flos": 28052036403840.0, "grad_norm": 1.8149858797459129, "language_loss": 0.73349226, "learning_rate": 3.287182259060815e-06, "loss": 0.7590794, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.24438477, "step": 4975, "time_per_iteration": 2.9395973682403564 }, { "auxiliary_loss_clip": 0.01507922, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.3148973, "balance_loss_mlp": 1.01843536, "epoch": 0.2991733052758154, "flos": 18742663203840.0, "grad_norm": 2.4746998979590606, "language_loss": 0.76267493, "learning_rate": 3.286884152568687e-06, "loss": 0.78818977, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.25134277, "step": 4976, "time_per_iteration": 2.8173210620880127 }, { "auxiliary_loss_clip": 0.01499193, "auxiliary_loss_mlp": 0.01041528, "balance_loss_clip": 1.30946422, "balance_loss_mlp": 1.01816344, "epoch": 0.2992334285284834, "flos": 15567409313280.0, "grad_norm": 2.4600671299763532, "language_loss": 0.87565279, "learning_rate": 3.2865859972777827e-06, "loss": 0.90105987, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.23376465, "step": 4977, "time_per_iteration": 2.873326539993286 }, { "auxiliary_loss_clip": 0.01505436, "auxiliary_loss_mlp": 0.01044668, "balance_loss_clip": 1.31373334, "balance_loss_mlp": 1.02118349, "epoch": 0.29929355178115136, "flos": 21807257016960.0, "grad_norm": 1.750267219674661, "language_loss": 0.69379723, "learning_rate": 3.2862877931994088e-06, "loss": 0.71929824, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.23461914, "step": 4978, "time_per_iteration": 2.860891819000244 }, { "auxiliary_loss_clip": 0.01512573, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.31996012, "balance_loss_mlp": 1.02169538, "epoch": 0.2993536750338193, "flos": 21188387635200.0, "grad_norm": 2.2195261885006365, "language_loss": 0.77389264, "learning_rate": 3.2859895403448726e-06, "loss": 0.79946947, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.23425293, "step": 4979, "time_per_iteration": 4.3676581382751465 }, { "auxiliary_loss_clip": 0.0151199, "auxiliary_loss_mlp": 0.01048577, "balance_loss_clip": 1.31707621, "balance_loss_mlp": 1.02634406, "epoch": 0.2994137982864873, "flos": 32134270435200.0, "grad_norm": 1.7356472964633352, "language_loss": 0.69432449, "learning_rate": 3.285691238725484e-06, "loss": 0.71993023, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.22241211, "step": 4980, "time_per_iteration": 2.945446014404297 }, { "auxiliary_loss_clip": 0.01498829, "auxiliary_loss_mlp": 0.01045639, "balance_loss_clip": 1.3095777, "balance_loss_mlp": 1.02319193, "epoch": 0.29947392153915525, "flos": 21115217514240.0, "grad_norm": 1.7895894692323624, "language_loss": 0.74294406, "learning_rate": 3.285392888352555e-06, "loss": 0.76838875, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.22436523, "step": 4981, "time_per_iteration": 2.882982015609741 }, { "auxiliary_loss_clip": 0.01522537, "auxiliary_loss_mlp": 0.01051158, "balance_loss_clip": 1.32408321, "balance_loss_mlp": 1.02700627, "epoch": 0.2995340447918232, "flos": 21552383203200.0, "grad_norm": 1.584956523796422, "language_loss": 0.87124664, "learning_rate": 3.2850944892373987e-06, "loss": 0.89698356, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.24157715, "step": 4982, "time_per_iteration": 5.755779266357422 }, { "auxiliary_loss_clip": 0.01530647, "auxiliary_loss_mlp": 0.01047773, "balance_loss_clip": 1.33214724, "balance_loss_mlp": 1.02333474, "epoch": 0.2995941680444912, "flos": 16733290055040.0, "grad_norm": 2.1879555637998416, "language_loss": 0.87098384, "learning_rate": 3.2847960413913307e-06, "loss": 0.89676803, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.2442627, "step": 4983, "time_per_iteration": 2.8243215084075928 }, { "auxiliary_loss_clip": 0.01501857, "auxiliary_loss_mlp": 0.0104763, "balance_loss_clip": 1.30848968, "balance_loss_mlp": 1.02285826, "epoch": 0.2996542912971592, "flos": 20933378087040.0, "grad_norm": 2.0700609002736767, "language_loss": 0.79478157, "learning_rate": 3.284497544825668e-06, "loss": 0.8202765, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.2479248, "step": 4984, "time_per_iteration": 2.8983683586120605 }, { "auxiliary_loss_clip": 0.01521545, "auxiliary_loss_mlp": 0.01053525, "balance_loss_clip": 1.32681537, "balance_loss_mlp": 1.0279429, "epoch": 0.29971441454982717, "flos": 25090049093760.0, "grad_norm": 1.6872794277233043, "language_loss": 0.7945683, "learning_rate": 3.2841989995517303e-06, "loss": 0.82031906, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.25561523, "step": 4985, "time_per_iteration": 2.9446492195129395 }, { "auxiliary_loss_clip": 0.01525984, "auxiliary_loss_mlp": 0.0104985, "balance_loss_clip": 1.32597566, "balance_loss_mlp": 1.02484012, "epoch": 0.29977453780249513, "flos": 52573809081600.0, "grad_norm": 1.985353855925173, "language_loss": 0.72177476, "learning_rate": 3.283900405580837e-06, "loss": 0.74753308, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.25012207, "step": 4986, "time_per_iteration": 3.163069009780884 }, { "auxiliary_loss_clip": 0.01519211, "auxiliary_loss_mlp": 0.01046026, "balance_loss_clip": 1.32081616, "balance_loss_mlp": 1.02132583, "epoch": 0.2998346610551631, "flos": 22247408862720.0, "grad_norm": 1.6742762955298107, "language_loss": 0.74654102, "learning_rate": 3.283601762924312e-06, "loss": 0.77219337, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.24707031, "step": 4987, "time_per_iteration": 2.9177708625793457 }, { "auxiliary_loss_clip": 0.01501834, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.30989242, "balance_loss_mlp": 1.01668811, "epoch": 0.29989478430783106, "flos": 16881937781760.0, "grad_norm": 1.624740352452569, "language_loss": 0.81307065, "learning_rate": 3.2833030715934793e-06, "loss": 0.83850056, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.24487305, "step": 4988, "time_per_iteration": 2.865680694580078 }, { "auxiliary_loss_clip": 0.01487355, "auxiliary_loss_mlp": 0.01041334, "balance_loss_clip": 1.29613817, "balance_loss_mlp": 1.01756334, "epoch": 0.29995490756049903, "flos": 23779366433280.0, "grad_norm": 1.6917469884855654, "language_loss": 0.71577239, "learning_rate": 3.2830043315996658e-06, "loss": 0.74105924, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.23742676, "step": 4989, "time_per_iteration": 2.8918540477752686 }, { "auxiliary_loss_clip": 0.01514302, "auxiliary_loss_mlp": 0.01045633, "balance_loss_clip": 1.31595039, "balance_loss_mlp": 1.0192759, "epoch": 0.300015030813167, "flos": 14473477082880.0, "grad_norm": 1.8890737684807088, "language_loss": 0.86367601, "learning_rate": 3.282705542954199e-06, "loss": 0.88927537, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.26379395, "step": 4990, "time_per_iteration": 2.898923873901367 }, { "auxiliary_loss_clip": 0.01520327, "auxiliary_loss_mlp": 0.01041915, "balance_loss_clip": 1.3186214, "balance_loss_mlp": 1.01647544, "epoch": 0.30007515406583496, "flos": 25202880921600.0, "grad_norm": 1.953462398326638, "language_loss": 0.67288941, "learning_rate": 3.28240670566841e-06, "loss": 0.69851184, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.25463867, "step": 4991, "time_per_iteration": 2.952831983566284 }, { "auxiliary_loss_clip": 0.01527473, "auxiliary_loss_mlp": 0.01041484, "balance_loss_clip": 1.32706022, "balance_loss_mlp": 1.01553214, "epoch": 0.3001352773185029, "flos": 19400696599680.0, "grad_norm": 1.9867963497681904, "language_loss": 0.79910946, "learning_rate": 3.28210781975363e-06, "loss": 0.82479906, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.25964355, "step": 4992, "time_per_iteration": 2.8430426120758057 }, { "auxiliary_loss_clip": 0.01505301, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.31215453, "balance_loss_mlp": 1.01374388, "epoch": 0.3001954005711709, "flos": 21553921526400.0, "grad_norm": 2.037121126740254, "language_loss": 0.83733428, "learning_rate": 3.281808885221193e-06, "loss": 0.86277878, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.25402832, "step": 4993, "time_per_iteration": 2.878878116607666 }, { "auxiliary_loss_clip": 0.01524339, "auxiliary_loss_mlp": 0.01045769, "balance_loss_clip": 1.32477164, "balance_loss_mlp": 1.01843429, "epoch": 0.30025552382383885, "flos": 17393268977280.0, "grad_norm": 2.1251955251298313, "language_loss": 0.88319087, "learning_rate": 3.2815099020824345e-06, "loss": 0.90889204, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.2734375, "step": 4994, "time_per_iteration": 2.831803321838379 }, { "auxiliary_loss_clip": 0.01509835, "auxiliary_loss_mlp": 0.01043702, "balance_loss_clip": 1.31396866, "balance_loss_mlp": 1.01783395, "epoch": 0.3003156470765068, "flos": 29545237163520.0, "grad_norm": 1.5379373617904444, "language_loss": 0.81474525, "learning_rate": 3.2812108703486924e-06, "loss": 0.84028059, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.25891113, "step": 4995, "time_per_iteration": 2.914771318435669 }, { "auxiliary_loss_clip": 0.01503983, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.31180906, "balance_loss_mlp": 1.01769376, "epoch": 0.3003757703291748, "flos": 43660175316480.0, "grad_norm": 2.202796520299419, "language_loss": 0.67534155, "learning_rate": 3.2809117900313055e-06, "loss": 0.70082033, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.26208496, "step": 4996, "time_per_iteration": 3.064422130584717 }, { "auxiliary_loss_clip": 0.01504605, "auxiliary_loss_mlp": 0.01041749, "balance_loss_clip": 1.3113482, "balance_loss_mlp": 1.01582158, "epoch": 0.30043589358184275, "flos": 22538551023360.0, "grad_norm": 2.052674566961902, "language_loss": 0.76576674, "learning_rate": 3.280612661141615e-06, "loss": 0.7912302, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.25952148, "step": 4997, "time_per_iteration": 2.86826229095459 }, { "auxiliary_loss_clip": 0.01491518, "auxiliary_loss_mlp": 0.01044942, "balance_loss_clip": 1.30116022, "balance_loss_mlp": 1.01865673, "epoch": 0.30049601683451077, "flos": 21005643312000.0, "grad_norm": 2.0654344859003926, "language_loss": 0.79205829, "learning_rate": 3.2803134836909646e-06, "loss": 0.81742293, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.26281738, "step": 4998, "time_per_iteration": 2.884098768234253 }, { "auxiliary_loss_clip": 0.01505187, "auxiliary_loss_mlp": 0.01037141, "balance_loss_clip": 1.31438029, "balance_loss_mlp": 1.01282263, "epoch": 0.30055614008717874, "flos": 23926837795200.0, "grad_norm": 1.9220901947730018, "language_loss": 0.74580634, "learning_rate": 3.2800142576906985e-06, "loss": 0.77122962, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.2434082, "step": 4999, "time_per_iteration": 2.9165408611297607 }, { "auxiliary_loss_clip": 0.01510262, "auxiliary_loss_mlp": 0.01041397, "balance_loss_clip": 1.31425762, "balance_loss_mlp": 1.01698267, "epoch": 0.3006162633398467, "flos": 19178607283200.0, "grad_norm": 1.6043719300483879, "language_loss": 0.76435542, "learning_rate": 3.2797149831521626e-06, "loss": 0.78987199, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.24414062, "step": 5000, "time_per_iteration": 2.9508867263793945 }, { "auxiliary_loss_clip": 0.01495431, "auxiliary_loss_mlp": 0.01049563, "balance_loss_clip": 1.30537653, "balance_loss_mlp": 1.02364659, "epoch": 0.30067638659251467, "flos": 14686155480960.0, "grad_norm": 1.7700919753641575, "language_loss": 0.82287818, "learning_rate": 3.2794156600867073e-06, "loss": 0.84832811, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.25927734, "step": 5001, "time_per_iteration": 2.8442962169647217 }, { "auxiliary_loss_clip": 0.01517126, "auxiliary_loss_mlp": 0.01051373, "balance_loss_clip": 1.32322884, "balance_loss_mlp": 1.02499235, "epoch": 0.30073650984518263, "flos": 23378514336000.0, "grad_norm": 1.6814079250139846, "language_loss": 0.81555045, "learning_rate": 3.2791162885056815e-06, "loss": 0.8412354, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.26391602, "step": 5002, "time_per_iteration": 2.8489949703216553 }, { "auxiliary_loss_clip": 0.01523003, "auxiliary_loss_mlp": 0.01042172, "balance_loss_clip": 1.32469165, "balance_loss_mlp": 1.0173409, "epoch": 0.3007966330978506, "flos": 22977028811520.0, "grad_norm": 1.8240161787655698, "language_loss": 0.72302115, "learning_rate": 3.2788168684204376e-06, "loss": 0.7486729, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24804688, "step": 5003, "time_per_iteration": 2.8865537643432617 }, { "auxiliary_loss_clip": 0.01518396, "auxiliary_loss_mlp": 0.01044855, "balance_loss_clip": 1.32051873, "balance_loss_mlp": 1.01965404, "epoch": 0.30085675635051856, "flos": 27829901842560.0, "grad_norm": 5.293364398473275, "language_loss": 0.72181964, "learning_rate": 3.27851739984233e-06, "loss": 0.74745208, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.25231934, "step": 5004, "time_per_iteration": 2.8986434936523438 }, { "auxiliary_loss_clip": 0.01519704, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.32253504, "balance_loss_mlp": 1.01733708, "epoch": 0.3009168796031865, "flos": 10888141155840.0, "grad_norm": 2.964168243136023, "language_loss": 0.82620788, "learning_rate": 3.278217882782715e-06, "loss": 0.85184908, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.27099609, "step": 5005, "time_per_iteration": 2.818019151687622 }, { "auxiliary_loss_clip": 0.01511439, "auxiliary_loss_mlp": 0.01043201, "balance_loss_clip": 1.31888843, "balance_loss_mlp": 1.01686788, "epoch": 0.3009770028558545, "flos": 23815906248960.0, "grad_norm": 3.1324928037926085, "language_loss": 0.76322967, "learning_rate": 3.2779183172529497e-06, "loss": 0.7887761, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.26306152, "step": 5006, "time_per_iteration": 2.874333143234253 }, { "auxiliary_loss_clip": 0.01501803, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.31086969, "balance_loss_mlp": 1.01659369, "epoch": 0.30103712610852246, "flos": 26479240761600.0, "grad_norm": 1.8770196595040238, "language_loss": 0.72344851, "learning_rate": 3.2776187032643932e-06, "loss": 0.74888861, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.25634766, "step": 5007, "time_per_iteration": 2.8995800018310547 }, { "auxiliary_loss_clip": 0.0151426, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.31930208, "balance_loss_mlp": 1.01459455, "epoch": 0.3010972493611904, "flos": 22866640202880.0, "grad_norm": 2.777594654260953, "language_loss": 0.77397937, "learning_rate": 3.2773190408284075e-06, "loss": 0.79953206, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.2644043, "step": 5008, "time_per_iteration": 2.852952003479004 }, { "auxiliary_loss_clip": 0.01513172, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.31935024, "balance_loss_mlp": 1.01448226, "epoch": 0.3011573726138584, "flos": 24062409774720.0, "grad_norm": 3.336653498929683, "language_loss": 0.85147703, "learning_rate": 3.2770193299563564e-06, "loss": 0.87699819, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24450684, "step": 5009, "time_per_iteration": 4.3645994663238525 }, { "auxiliary_loss_clip": 0.0153901, "auxiliary_loss_mlp": 0.01048939, "balance_loss_clip": 1.33754647, "balance_loss_mlp": 1.01930332, "epoch": 0.30121749586652635, "flos": 20267562585600.0, "grad_norm": 1.8742723982462457, "language_loss": 0.84066796, "learning_rate": 3.276719570659604e-06, "loss": 0.86654752, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.29663086, "step": 5010, "time_per_iteration": 2.933987617492676 }, { "auxiliary_loss_clip": 0.01503079, "auxiliary_loss_mlp": 0.01040413, "balance_loss_clip": 1.31005549, "balance_loss_mlp": 1.01602256, "epoch": 0.3012776191191944, "flos": 26954348855040.0, "grad_norm": 2.108992249847228, "language_loss": 0.85467303, "learning_rate": 3.2764197629495176e-06, "loss": 0.880108, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.24389648, "step": 5011, "time_per_iteration": 2.984506368637085 }, { "auxiliary_loss_clip": 0.01529189, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.33096576, "balance_loss_mlp": 1.01901102, "epoch": 0.30133774237186234, "flos": 20421956401920.0, "grad_norm": 2.6742678105509827, "language_loss": 0.73522162, "learning_rate": 3.2761199068374656e-06, "loss": 0.76097023, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.26660156, "step": 5012, "time_per_iteration": 2.825655937194824 }, { "auxiliary_loss_clip": 0.01520602, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 1.32367456, "balance_loss_mlp": 1.01700819, "epoch": 0.3013978656245303, "flos": 19802046389760.0, "grad_norm": 2.1257960519242314, "language_loss": 0.88943869, "learning_rate": 3.275820002334819e-06, "loss": 0.91506124, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24633789, "step": 5013, "time_per_iteration": 4.228014230728149 }, { "auxiliary_loss_clip": 0.01530696, "auxiliary_loss_mlp": 0.01044302, "balance_loss_clip": 1.33182645, "balance_loss_mlp": 1.0185529, "epoch": 0.30145798887719827, "flos": 16257367555200.0, "grad_norm": 1.9666007181115197, "language_loss": 0.84294629, "learning_rate": 3.2755200494529496e-06, "loss": 0.86869621, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.25720215, "step": 5014, "time_per_iteration": 2.8550171852111816 }, { "auxiliary_loss_clip": 0.01502742, "auxiliary_loss_mlp": 0.01045994, "balance_loss_clip": 1.31220603, "balance_loss_mlp": 1.02081704, "epoch": 0.30151811212986623, "flos": 24582473216640.0, "grad_norm": 1.6453986563622518, "language_loss": 0.68970013, "learning_rate": 3.2752200482032323e-06, "loss": 0.71518743, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.25170898, "step": 5015, "time_per_iteration": 2.873831272125244 }, { "auxiliary_loss_clip": 0.01521954, "auxiliary_loss_mlp": 0.01047142, "balance_loss_clip": 1.32829642, "balance_loss_mlp": 1.01962852, "epoch": 0.3015782353825342, "flos": 21882191685120.0, "grad_norm": 2.614020173913498, "language_loss": 0.76302409, "learning_rate": 3.2749199985970436e-06, "loss": 0.78871512, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.27514648, "step": 5016, "time_per_iteration": 4.275460243225098 }, { "auxiliary_loss_clip": 0.01527883, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.33185673, "balance_loss_mlp": 1.02091455, "epoch": 0.30163835863520216, "flos": 28781611107840.0, "grad_norm": 1.7305255892810578, "language_loss": 0.6661582, "learning_rate": 3.2746199006457603e-06, "loss": 0.69189584, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.24963379, "step": 5017, "time_per_iteration": 4.414970874786377 }, { "auxiliary_loss_clip": 0.01533732, "auxiliary_loss_mlp": 0.01045715, "balance_loss_clip": 1.33659124, "balance_loss_mlp": 1.02171803, "epoch": 0.30169848188787013, "flos": 22976485873920.0, "grad_norm": 3.5568876471714916, "language_loss": 0.69854516, "learning_rate": 3.2743197543607628e-06, "loss": 0.72433966, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.23986816, "step": 5018, "time_per_iteration": 2.8537416458129883 }, { "auxiliary_loss_clip": 0.01508884, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.31870437, "balance_loss_mlp": 1.02358842, "epoch": 0.3017586051405381, "flos": 21845289911040.0, "grad_norm": 1.9821531468337283, "language_loss": 0.80332911, "learning_rate": 3.2740195597534327e-06, "loss": 0.82889265, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.23901367, "step": 5019, "time_per_iteration": 2.8651931285858154 }, { "auxiliary_loss_clip": 0.01526969, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.33181953, "balance_loss_mlp": 1.01968598, "epoch": 0.30181872839320606, "flos": 22169669016960.0, "grad_norm": 2.1648294018670553, "language_loss": 0.7167539, "learning_rate": 3.2737193168351527e-06, "loss": 0.74247211, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.25183105, "step": 5020, "time_per_iteration": 2.8809921741485596 }, { "auxiliary_loss_clip": 0.01548744, "auxiliary_loss_mlp": 0.01048579, "balance_loss_clip": 1.34834516, "balance_loss_mlp": 1.02429581, "epoch": 0.301878851645874, "flos": 18123341374080.0, "grad_norm": 1.9470972066517187, "language_loss": 0.79806578, "learning_rate": 3.2734190256173085e-06, "loss": 0.82403898, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.24291992, "step": 5021, "time_per_iteration": 2.9474642276763916 }, { "auxiliary_loss_clip": 0.01532888, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.33738256, "balance_loss_mlp": 1.01987302, "epoch": 0.301938974898542, "flos": 17610969548160.0, "grad_norm": 2.1840732050459954, "language_loss": 0.76618522, "learning_rate": 3.2731186861112877e-06, "loss": 0.79194129, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.22827148, "step": 5022, "time_per_iteration": 2.8643288612365723 }, { "auxiliary_loss_clip": 0.01528107, "auxiliary_loss_mlp": 0.01047842, "balance_loss_clip": 1.3315953, "balance_loss_mlp": 1.02273703, "epoch": 0.30199909815120995, "flos": 11188422766080.0, "grad_norm": 1.9577679090714486, "language_loss": 0.70689785, "learning_rate": 3.2728182983284793e-06, "loss": 0.73265731, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.25146484, "step": 5023, "time_per_iteration": 2.870351552963257 }, { "auxiliary_loss_clip": 0.01545075, "auxiliary_loss_mlp": 0.01043421, "balance_loss_clip": 1.34480941, "balance_loss_mlp": 1.02036619, "epoch": 0.302059221403878, "flos": 21917781360000.0, "grad_norm": 1.786327559070514, "language_loss": 0.71987766, "learning_rate": 3.2725178622802724e-06, "loss": 0.74576259, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.23059082, "step": 5024, "time_per_iteration": 2.8936710357666016 }, { "auxiliary_loss_clip": 0.01525752, "auxiliary_loss_mlp": 0.01047726, "balance_loss_clip": 1.33353996, "balance_loss_mlp": 1.02307367, "epoch": 0.30211934465654594, "flos": 26407699453440.0, "grad_norm": 1.6085231078886566, "language_loss": 0.75271457, "learning_rate": 3.272217377978061e-06, "loss": 0.7784493, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.24658203, "step": 5025, "time_per_iteration": 2.9208872318267822 }, { "auxiliary_loss_clip": 0.01511687, "auxiliary_loss_mlp": 0.01042794, "balance_loss_clip": 1.32134056, "balance_loss_mlp": 1.01941741, "epoch": 0.3021794679092139, "flos": 23409941489280.0, "grad_norm": 1.9192242828231918, "language_loss": 0.6794281, "learning_rate": 3.2719168454332387e-06, "loss": 0.70497298, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.23364258, "step": 5026, "time_per_iteration": 2.948808193206787 }, { "auxiliary_loss_clip": 0.01535386, "auxiliary_loss_mlp": 0.01045342, "balance_loss_clip": 1.34076202, "balance_loss_mlp": 1.01972401, "epoch": 0.30223959116188187, "flos": 20269191398400.0, "grad_norm": 1.726245893193024, "language_loss": 0.85899138, "learning_rate": 3.2716162646572034e-06, "loss": 0.88479865, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.25622559, "step": 5027, "time_per_iteration": 2.9346561431884766 }, { "auxiliary_loss_clip": 0.01531742, "auxiliary_loss_mlp": 0.01046276, "balance_loss_clip": 1.33865035, "balance_loss_mlp": 1.02302992, "epoch": 0.30229971441454984, "flos": 26699113082880.0, "grad_norm": 1.683548450924166, "language_loss": 0.79013371, "learning_rate": 3.271315635661351e-06, "loss": 0.81591386, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.23242188, "step": 5028, "time_per_iteration": 2.9031426906585693 }, { "auxiliary_loss_clip": 0.01530489, "auxiliary_loss_mlp": 0.01048872, "balance_loss_clip": 1.33724952, "balance_loss_mlp": 1.02392197, "epoch": 0.3023598376672178, "flos": 34357407857280.0, "grad_norm": 1.9780386088868531, "language_loss": 0.77898651, "learning_rate": 3.2710149584570826e-06, "loss": 0.80478013, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.24938965, "step": 5029, "time_per_iteration": 2.990168809890747 }, { "auxiliary_loss_clip": 0.01554291, "auxiliary_loss_mlp": 0.01050591, "balance_loss_clip": 1.35575914, "balance_loss_mlp": 1.02527118, "epoch": 0.30241996091988577, "flos": 23122464157440.0, "grad_norm": 2.0364940490691876, "language_loss": 0.82661474, "learning_rate": 3.2707142330557993e-06, "loss": 0.85266352, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.25354004, "step": 5030, "time_per_iteration": 2.8578219413757324 }, { "auxiliary_loss_clip": 0.01551023, "auxiliary_loss_mlp": 0.01048784, "balance_loss_clip": 1.35233426, "balance_loss_mlp": 1.02386951, "epoch": 0.30248008417255373, "flos": 19399203521280.0, "grad_norm": 2.8824655644837236, "language_loss": 0.70839572, "learning_rate": 3.270413459468905e-06, "loss": 0.73439384, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.24902344, "step": 5031, "time_per_iteration": 2.871394395828247 }, { "auxiliary_loss_clip": 0.01544148, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.34823251, "balance_loss_mlp": 1.02052236, "epoch": 0.3025402074252217, "flos": 23780407063680.0, "grad_norm": 2.2880249802517776, "language_loss": 0.83021796, "learning_rate": 3.2701126377078047e-06, "loss": 0.85610205, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.23730469, "step": 5032, "time_per_iteration": 2.866802453994751 }, { "auxiliary_loss_clip": 0.01560751, "auxiliary_loss_mlp": 0.01052908, "balance_loss_clip": 1.36173749, "balance_loss_mlp": 1.02698016, "epoch": 0.30260033067788966, "flos": 26005082808960.0, "grad_norm": 2.2514221181944176, "language_loss": 0.74727851, "learning_rate": 3.269811767783906e-06, "loss": 0.77341509, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 1.98925781, "router_z_loss_mlp": 0.25952148, "step": 5033, "time_per_iteration": 2.912060499191284 }, { "auxiliary_loss_clip": 0.01532561, "auxiliary_loss_mlp": 0.01052146, "balance_loss_clip": 1.33839476, "balance_loss_mlp": 1.02786326, "epoch": 0.3026604539305576, "flos": 25385534755200.0, "grad_norm": 1.9790331712501803, "language_loss": 0.75103998, "learning_rate": 3.2695108497086185e-06, "loss": 0.77688706, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.24291992, "step": 5034, "time_per_iteration": 2.920081377029419 }, { "auxiliary_loss_clip": 0.01545418, "auxiliary_loss_mlp": 0.01046882, "balance_loss_clip": 1.34878302, "balance_loss_mlp": 1.02244413, "epoch": 0.3027205771832256, "flos": 25823922053760.0, "grad_norm": 1.848475547830947, "language_loss": 0.72620887, "learning_rate": 3.269209883493352e-06, "loss": 0.75213182, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24450684, "step": 5035, "time_per_iteration": 2.8822824954986572 }, { "auxiliary_loss_clip": 0.01530623, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.33824158, "balance_loss_mlp": 1.01765919, "epoch": 0.30278070043589356, "flos": 27355970113920.0, "grad_norm": 1.8932384513328882, "language_loss": 0.8793326, "learning_rate": 3.2689088691495196e-06, "loss": 0.90504014, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.22473145, "step": 5036, "time_per_iteration": 2.9166908264160156 }, { "auxiliary_loss_clip": 0.01528016, "auxiliary_loss_mlp": 0.01053396, "balance_loss_clip": 1.33587706, "balance_loss_mlp": 1.02775407, "epoch": 0.3028408236885616, "flos": 24795875531520.0, "grad_norm": 1.4067770012799214, "language_loss": 0.77937269, "learning_rate": 3.268607806688536e-06, "loss": 0.80518681, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.25646973, "step": 5037, "time_per_iteration": 2.8773856163024902 }, { "auxiliary_loss_clip": 0.01544625, "auxiliary_loss_mlp": 0.01040871, "balance_loss_clip": 1.34488988, "balance_loss_mlp": 1.01658821, "epoch": 0.30290094694122954, "flos": 12940116923520.0, "grad_norm": 3.0673784240923143, "language_loss": 0.79295754, "learning_rate": 3.268306696121816e-06, "loss": 0.81881249, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.24279785, "step": 5038, "time_per_iteration": 2.829651117324829 }, { "auxiliary_loss_clip": 0.01530298, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.3380363, "balance_loss_mlp": 1.01778412, "epoch": 0.3029610701938975, "flos": 25926166598400.0, "grad_norm": 1.8550284127077146, "language_loss": 0.74931133, "learning_rate": 3.2680055374607804e-06, "loss": 0.77501911, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.22705078, "step": 5039, "time_per_iteration": 2.8959546089172363 }, { "auxiliary_loss_clip": 0.0152711, "auxiliary_loss_mlp": 0.01044962, "balance_loss_clip": 1.3348819, "balance_loss_mlp": 1.0224793, "epoch": 0.3030211934465655, "flos": 21990996725760.0, "grad_norm": 1.9936484733854658, "language_loss": 0.80769992, "learning_rate": 3.267704330716847e-06, "loss": 0.83342063, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.22485352, "step": 5040, "time_per_iteration": 2.8488874435424805 }, { "auxiliary_loss_clip": 0.01540334, "auxiliary_loss_mlp": 0.01048471, "balance_loss_clip": 1.34620011, "balance_loss_mlp": 1.02601147, "epoch": 0.30308131669923344, "flos": 21000711628800.0, "grad_norm": 1.5696883490311577, "language_loss": 0.82630336, "learning_rate": 3.267403075901438e-06, "loss": 0.85219133, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.22460938, "step": 5041, "time_per_iteration": 2.8700506687164307 }, { "auxiliary_loss_clip": 0.01294815, "auxiliary_loss_mlp": 0.01053805, "balance_loss_clip": 1.1798358, "balance_loss_mlp": 1.02042615, "epoch": 0.3031414399519014, "flos": 60578987610240.0, "grad_norm": 0.7792938093820913, "language_loss": 0.59546387, "learning_rate": 3.267101773025978e-06, "loss": 0.61895007, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 0.33398438, "step": 5042, "time_per_iteration": 3.480782985687256 }, { "auxiliary_loss_clip": 0.01550453, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.35181355, "balance_loss_mlp": 1.01714325, "epoch": 0.30320156320456937, "flos": 21917555136000.0, "grad_norm": 1.6755703922313625, "language_loss": 0.72203809, "learning_rate": 3.266800422101892e-06, "loss": 0.7479521, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.23815918, "step": 5043, "time_per_iteration": 2.887463331222534 }, { "auxiliary_loss_clip": 0.01529934, "auxiliary_loss_mlp": 0.01038236, "balance_loss_clip": 1.33521557, "balance_loss_mlp": 1.01468027, "epoch": 0.30326168645723733, "flos": 21662726567040.0, "grad_norm": 2.0790536364280015, "language_loss": 0.70544195, "learning_rate": 3.266499023140606e-06, "loss": 0.73112369, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.23535156, "step": 5044, "time_per_iteration": 4.278761625289917 }, { "auxiliary_loss_clip": 0.01528866, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.33623266, "balance_loss_mlp": 1.01407802, "epoch": 0.3033218097099053, "flos": 21881196299520.0, "grad_norm": 1.4170235070145143, "language_loss": 0.7828747, "learning_rate": 3.2661975761535513e-06, "loss": 0.80854213, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.23803711, "step": 5045, "time_per_iteration": 2.8708434104919434 }, { "auxiliary_loss_clip": 0.01544721, "auxiliary_loss_mlp": 0.0104072, "balance_loss_clip": 1.3487041, "balance_loss_mlp": 1.01669979, "epoch": 0.30338193296257326, "flos": 27101232034560.0, "grad_norm": 1.6713053571332037, "language_loss": 0.72626328, "learning_rate": 3.2658960811521564e-06, "loss": 0.75211775, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.24023438, "step": 5046, "time_per_iteration": 2.945239782333374 }, { "auxiliary_loss_clip": 0.0153785, "auxiliary_loss_mlp": 0.01039976, "balance_loss_clip": 1.33790374, "balance_loss_mlp": 1.01569283, "epoch": 0.30344205621524123, "flos": 19543191033600.0, "grad_norm": 1.5792495399695765, "language_loss": 0.8179971, "learning_rate": 3.2655945381478564e-06, "loss": 0.84377539, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.24267578, "step": 5047, "time_per_iteration": 2.9066264629364014 }, { "auxiliary_loss_clip": 0.01526834, "auxiliary_loss_mlp": 0.01041613, "balance_loss_clip": 1.33183885, "balance_loss_mlp": 1.01810467, "epoch": 0.3035021794679092, "flos": 23920503523200.0, "grad_norm": 1.7776022743715796, "language_loss": 0.72315115, "learning_rate": 3.265292947152084e-06, "loss": 0.74883562, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.23510742, "step": 5048, "time_per_iteration": 4.317375898361206 }, { "auxiliary_loss_clip": 0.01524983, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.33164883, "balance_loss_mlp": 1.01710415, "epoch": 0.30356230272057716, "flos": 16152182098560.0, "grad_norm": 1.7163742270982554, "language_loss": 0.7567147, "learning_rate": 3.2649913081762763e-06, "loss": 0.78236991, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.23461914, "step": 5049, "time_per_iteration": 2.8491060733795166 }, { "auxiliary_loss_clip": 0.01531297, "auxiliary_loss_mlp": 0.01045116, "balance_loss_clip": 1.33352149, "balance_loss_mlp": 1.02165556, "epoch": 0.3036224259732452, "flos": 28926865474560.0, "grad_norm": 1.6254399524197258, "language_loss": 0.83053517, "learning_rate": 3.2646896212318717e-06, "loss": 0.8562994, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.23461914, "step": 5050, "time_per_iteration": 2.8952255249023438 }, { "auxiliary_loss_clip": 0.01530708, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.33375192, "balance_loss_mlp": 1.01743901, "epoch": 0.30368254922591315, "flos": 21115443738240.0, "grad_norm": 2.5510511492734302, "language_loss": 0.75318909, "learning_rate": 3.2643878863303106e-06, "loss": 0.778916, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.2454834, "step": 5051, "time_per_iteration": 4.236498832702637 }, { "auxiliary_loss_clip": 0.01522256, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.32808042, "balance_loss_mlp": 1.01955795, "epoch": 0.3037426724785811, "flos": 23012482752000.0, "grad_norm": 1.6931053116641501, "language_loss": 0.76962841, "learning_rate": 3.264086103483033e-06, "loss": 0.79528308, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.2364502, "step": 5052, "time_per_iteration": 4.219244480133057 }, { "auxiliary_loss_clip": 0.01528806, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.33126092, "balance_loss_mlp": 1.02166224, "epoch": 0.3038027957312491, "flos": 15641122371840.0, "grad_norm": 2.3457032938073574, "language_loss": 0.84189773, "learning_rate": 3.2637842727014836e-06, "loss": 0.86764157, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.23901367, "step": 5053, "time_per_iteration": 2.839395761489868 }, { "auxiliary_loss_clip": 0.0152284, "auxiliary_loss_mlp": 0.01044373, "balance_loss_clip": 1.32882047, "balance_loss_mlp": 1.01913631, "epoch": 0.30386291898391704, "flos": 12721692435840.0, "grad_norm": 1.7059683250793194, "language_loss": 0.71990973, "learning_rate": 3.2634823939971083e-06, "loss": 0.74558187, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.25231934, "step": 5054, "time_per_iteration": 2.8406155109405518 }, { "auxiliary_loss_clip": 0.01517322, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.32103777, "balance_loss_mlp": 1.0177263, "epoch": 0.303923042236585, "flos": 26370616700160.0, "grad_norm": 2.847380791250688, "language_loss": 0.69949341, "learning_rate": 3.2631804673813545e-06, "loss": 0.72507787, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.23400879, "step": 5055, "time_per_iteration": 2.9113967418670654 }, { "auxiliary_loss_clip": 0.01525565, "auxiliary_loss_mlp": 0.01044498, "balance_loss_clip": 1.33059883, "balance_loss_mlp": 1.02045393, "epoch": 0.30398316548925297, "flos": 19728740534400.0, "grad_norm": 1.7701287922232614, "language_loss": 0.68249208, "learning_rate": 3.2628784928656707e-06, "loss": 0.70819265, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.24047852, "step": 5056, "time_per_iteration": 2.9149222373962402 }, { "auxiliary_loss_clip": 0.01513529, "auxiliary_loss_mlp": 0.01038774, "balance_loss_clip": 1.32272315, "balance_loss_mlp": 1.0146699, "epoch": 0.30404328874192094, "flos": 24249588088320.0, "grad_norm": 1.510805006353096, "language_loss": 0.8302837, "learning_rate": 3.262576470461507e-06, "loss": 0.85580677, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.2409668, "step": 5057, "time_per_iteration": 2.86606502532959 }, { "auxiliary_loss_clip": 0.01507882, "auxiliary_loss_mlp": 0.01046126, "balance_loss_clip": 1.31616426, "balance_loss_mlp": 1.02086568, "epoch": 0.3041034119945889, "flos": 24509710298880.0, "grad_norm": 1.6326004060494128, "language_loss": 0.8926931, "learning_rate": 3.2622744001803176e-06, "loss": 0.91823316, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.25280762, "step": 5058, "time_per_iteration": 2.8984851837158203 }, { "auxiliary_loss_clip": 0.01526471, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.33104205, "balance_loss_mlp": 1.02221847, "epoch": 0.30416353524725687, "flos": 28299444825600.0, "grad_norm": 2.022590824876327, "language_loss": 0.71743989, "learning_rate": 3.2619722820335564e-06, "loss": 0.74315929, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.23242188, "step": 5059, "time_per_iteration": 2.926253318786621 }, { "auxiliary_loss_clip": 0.01513501, "auxiliary_loss_mlp": 0.01043321, "balance_loss_clip": 1.3203094, "balance_loss_mlp": 1.01742887, "epoch": 0.30422365849992483, "flos": 23671104330240.0, "grad_norm": 1.98683840918947, "language_loss": 0.73647231, "learning_rate": 3.26167011603268e-06, "loss": 0.7620405, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.25878906, "step": 5060, "time_per_iteration": 2.941573143005371 }, { "auxiliary_loss_clip": 0.01517197, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.32344222, "balance_loss_mlp": 1.02043962, "epoch": 0.3042837817525928, "flos": 23008048761600.0, "grad_norm": 1.7835855942515848, "language_loss": 0.78123784, "learning_rate": 3.2613679021891463e-06, "loss": 0.80685937, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.24511719, "step": 5061, "time_per_iteration": 2.8847463130950928 }, { "auxiliary_loss_clip": 0.01529473, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.33217955, "balance_loss_mlp": 1.01700735, "epoch": 0.30434390500526076, "flos": 22090255113600.0, "grad_norm": 1.9219140385195363, "language_loss": 0.83078253, "learning_rate": 3.261065640514415e-06, "loss": 0.85650384, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.25671387, "step": 5062, "time_per_iteration": 2.8271775245666504 }, { "auxiliary_loss_clip": 0.01509587, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.31712627, "balance_loss_mlp": 1.01470757, "epoch": 0.3044040282579287, "flos": 25494385040640.0, "grad_norm": 1.7782673870244914, "language_loss": 0.75091672, "learning_rate": 3.2607633310199483e-06, "loss": 0.77640855, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.24902344, "step": 5063, "time_per_iteration": 2.921821355819702 }, { "auxiliary_loss_clip": 0.01506384, "auxiliary_loss_mlp": 0.01046438, "balance_loss_clip": 1.31407154, "balance_loss_mlp": 1.02079606, "epoch": 0.30446415151059675, "flos": 21955723764480.0, "grad_norm": 1.968107689238428, "language_loss": 0.85178673, "learning_rate": 3.26046097371721e-06, "loss": 0.87731493, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.25634766, "step": 5064, "time_per_iteration": 3.0064053535461426 }, { "auxiliary_loss_clip": 0.01506219, "auxiliary_loss_mlp": 0.01042985, "balance_loss_clip": 1.31352019, "balance_loss_mlp": 1.01658022, "epoch": 0.3045242747632647, "flos": 16444048176000.0, "grad_norm": 2.0874427370300355, "language_loss": 0.76685095, "learning_rate": 3.2601585686176655e-06, "loss": 0.79234302, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.26391602, "step": 5065, "time_per_iteration": 2.904982805252075 }, { "auxiliary_loss_clip": 0.01526095, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.32946324, "balance_loss_mlp": 1.01920724, "epoch": 0.3045843980159327, "flos": 31552845765120.0, "grad_norm": 1.822831898909084, "language_loss": 0.63512993, "learning_rate": 3.2598561157327814e-06, "loss": 0.66084325, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.26013184, "step": 5066, "time_per_iteration": 2.964115619659424 }, { "auxiliary_loss_clip": 0.01529582, "auxiliary_loss_mlp": 0.01043675, "balance_loss_clip": 1.32797551, "balance_loss_mlp": 1.01815248, "epoch": 0.30464452126860064, "flos": 17861409371520.0, "grad_norm": 1.8612124104707453, "language_loss": 0.83407629, "learning_rate": 3.2595536150740265e-06, "loss": 0.8598088, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.25549316, "step": 5067, "time_per_iteration": 2.8486366271972656 }, { "auxiliary_loss_clip": 0.01491593, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.30229402, "balance_loss_mlp": 1.02029657, "epoch": 0.3047046445212686, "flos": 20641150051200.0, "grad_norm": 1.8499736918320313, "language_loss": 0.63340563, "learning_rate": 3.259251066652873e-06, "loss": 0.65877926, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.25488281, "step": 5068, "time_per_iteration": 2.9019763469696045 }, { "auxiliary_loss_clip": 0.015033, "auxiliary_loss_mlp": 0.01043553, "balance_loss_clip": 1.31077504, "balance_loss_mlp": 1.01816106, "epoch": 0.3047647677739366, "flos": 21297373655040.0, "grad_norm": 1.9286109475593438, "language_loss": 0.76377654, "learning_rate": 3.258948470480793e-06, "loss": 0.78924513, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.25427246, "step": 5069, "time_per_iteration": 2.9278621673583984 }, { "auxiliary_loss_clip": 0.01497946, "auxiliary_loss_mlp": 0.01048912, "balance_loss_clip": 1.30699158, "balance_loss_mlp": 1.02344871, "epoch": 0.30482489102660454, "flos": 21005688556800.0, "grad_norm": 2.3432113254310494, "language_loss": 0.76637828, "learning_rate": 3.258645826569261e-06, "loss": 0.79184681, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.2545166, "step": 5070, "time_per_iteration": 2.8674910068511963 }, { "auxiliary_loss_clip": 0.01523433, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.32395709, "balance_loss_mlp": 1.01891112, "epoch": 0.3048850142792725, "flos": 26303102179200.0, "grad_norm": 1.9326963157754316, "language_loss": 0.82834792, "learning_rate": 3.2583431349297527e-06, "loss": 0.85401285, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.24157715, "step": 5071, "time_per_iteration": 2.9050464630126953 }, { "auxiliary_loss_clip": 0.01520452, "auxiliary_loss_mlp": 0.01045717, "balance_loss_clip": 1.32227969, "balance_loss_mlp": 1.01983666, "epoch": 0.30494513753194047, "flos": 22356440127360.0, "grad_norm": 1.8908467468030958, "language_loss": 0.76739907, "learning_rate": 3.2580403955737467e-06, "loss": 0.79306078, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.25915527, "step": 5072, "time_per_iteration": 2.8716886043548584 }, { "auxiliary_loss_clip": 0.0151209, "auxiliary_loss_mlp": 0.01049811, "balance_loss_clip": 1.31663465, "balance_loss_mlp": 1.02276278, "epoch": 0.30500526078460843, "flos": 19547353555200.0, "grad_norm": 1.8151215722367053, "language_loss": 0.72694188, "learning_rate": 3.257737608512723e-06, "loss": 0.75256085, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.27050781, "step": 5073, "time_per_iteration": 2.8332679271698 }, { "auxiliary_loss_clip": 0.01515486, "auxiliary_loss_mlp": 0.01048169, "balance_loss_clip": 1.31790209, "balance_loss_mlp": 1.02219296, "epoch": 0.3050653840372764, "flos": 14473477082880.0, "grad_norm": 5.684932935425009, "language_loss": 0.77550489, "learning_rate": 3.257434773758163e-06, "loss": 0.80114138, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.25964355, "step": 5074, "time_per_iteration": 2.804622173309326 }, { "auxiliary_loss_clip": 0.01520965, "auxiliary_loss_mlp": 0.01046083, "balance_loss_clip": 1.3247782, "balance_loss_mlp": 1.02157378, "epoch": 0.30512550728994436, "flos": 24254565016320.0, "grad_norm": 2.530775999402961, "language_loss": 0.75439668, "learning_rate": 3.25713189132155e-06, "loss": 0.78006715, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.24523926, "step": 5075, "time_per_iteration": 2.904207229614258 }, { "auxiliary_loss_clip": 0.01535971, "auxiliary_loss_mlp": 0.01052027, "balance_loss_clip": 1.33299494, "balance_loss_mlp": 1.02444184, "epoch": 0.30518563054261233, "flos": 16368661059840.0, "grad_norm": 2.2184586790815857, "language_loss": 0.76755893, "learning_rate": 3.2568289612143703e-06, "loss": 0.79343891, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 0.27587891, "step": 5076, "time_per_iteration": 2.8144164085388184 }, { "auxiliary_loss_clip": 0.01528188, "auxiliary_loss_mlp": 0.01048766, "balance_loss_clip": 1.33208823, "balance_loss_mlp": 1.02295685, "epoch": 0.30524575379528035, "flos": 21589465956480.0, "grad_norm": 3.0217734486579735, "language_loss": 0.79230279, "learning_rate": 3.25652598344811e-06, "loss": 0.81807232, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.25830078, "step": 5077, "time_per_iteration": 3.0125181674957275 }, { "auxiliary_loss_clip": 0.01501531, "auxiliary_loss_mlp": 0.01046322, "balance_loss_clip": 1.31175447, "balance_loss_mlp": 1.02112126, "epoch": 0.3053058770479483, "flos": 16553984336640.0, "grad_norm": 1.6121717380260605, "language_loss": 0.76435298, "learning_rate": 3.256222958034259e-06, "loss": 0.78983152, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.25219727, "step": 5078, "time_per_iteration": 4.274973392486572 }, { "auxiliary_loss_clip": 0.01516922, "auxiliary_loss_mlp": 0.01043013, "balance_loss_clip": 1.32236218, "balance_loss_mlp": 1.01876545, "epoch": 0.3053660003006163, "flos": 12320568869760.0, "grad_norm": 1.9498475854428516, "language_loss": 0.67800635, "learning_rate": 3.255919884984307e-06, "loss": 0.70360571, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.24267578, "step": 5079, "time_per_iteration": 2.8443379402160645 }, { "auxiliary_loss_clip": 0.0151861, "auxiliary_loss_mlp": 0.01052243, "balance_loss_clip": 1.32167327, "balance_loss_mlp": 1.02769756, "epoch": 0.30542612355328425, "flos": 23122645136640.0, "grad_norm": 1.8980831751325304, "language_loss": 0.81326473, "learning_rate": 3.2556167643097477e-06, "loss": 0.83897328, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.24523926, "step": 5080, "time_per_iteration": 2.875612258911133 }, { "auxiliary_loss_clip": 0.01517294, "auxiliary_loss_mlp": 0.01046958, "balance_loss_clip": 1.32179284, "balance_loss_mlp": 1.02041054, "epoch": 0.3054862468059522, "flos": 24400181341440.0, "grad_norm": 2.658461699923453, "language_loss": 0.8173455, "learning_rate": 3.255313596022074e-06, "loss": 0.84298801, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.26525879, "step": 5081, "time_per_iteration": 3.027339458465576 }, { "auxiliary_loss_clip": 0.01508254, "auxiliary_loss_mlp": 0.01049496, "balance_loss_clip": 1.31363082, "balance_loss_mlp": 1.02430725, "epoch": 0.3055463700586202, "flos": 29397630067200.0, "grad_norm": 1.6471427397399279, "language_loss": 0.72477889, "learning_rate": 3.255010380132783e-06, "loss": 0.75035638, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.25183105, "step": 5082, "time_per_iteration": 2.9955246448516846 }, { "auxiliary_loss_clip": 0.01511905, "auxiliary_loss_mlp": 0.01046269, "balance_loss_clip": 1.31139112, "balance_loss_mlp": 1.0206387, "epoch": 0.30560649331128814, "flos": 25602375674880.0, "grad_norm": 2.3682605818411777, "language_loss": 0.73644161, "learning_rate": 3.2547071166533736e-06, "loss": 0.76202333, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.25671387, "step": 5083, "time_per_iteration": 2.9172441959381104 }, { "auxiliary_loss_clip": 0.01513567, "auxiliary_loss_mlp": 0.01042558, "balance_loss_clip": 1.31510139, "balance_loss_mlp": 1.01822746, "epoch": 0.3056666165639561, "flos": 19135823685120.0, "grad_norm": 2.3110297538242235, "language_loss": 0.71795344, "learning_rate": 3.254403805595344e-06, "loss": 0.74351478, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.2434082, "step": 5084, "time_per_iteration": 4.2829132080078125 }, { "auxiliary_loss_clip": 0.01533134, "auxiliary_loss_mlp": 0.0104426, "balance_loss_clip": 1.33264279, "balance_loss_mlp": 1.01953638, "epoch": 0.30572673981662407, "flos": 15532407820800.0, "grad_norm": 2.0020444169105116, "language_loss": 0.79992092, "learning_rate": 3.2541004469701962e-06, "loss": 0.82569492, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.24719238, "step": 5085, "time_per_iteration": 2.878168821334839 }, { "auxiliary_loss_clip": 0.01508102, "auxiliary_loss_mlp": 0.01045313, "balance_loss_clip": 1.31500268, "balance_loss_mlp": 1.02013588, "epoch": 0.30578686306929204, "flos": 21516386325120.0, "grad_norm": 1.551367390673303, "language_loss": 0.78911781, "learning_rate": 3.2537970407894342e-06, "loss": 0.81465203, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.2520752, "step": 5086, "time_per_iteration": 4.265935659408569 }, { "auxiliary_loss_clip": 0.01508471, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.31734633, "balance_loss_mlp": 1.0211997, "epoch": 0.30584698632196, "flos": 20962678734720.0, "grad_norm": 2.5549164270938887, "language_loss": 0.78040326, "learning_rate": 3.253493587064563e-06, "loss": 0.8059625, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.26245117, "step": 5087, "time_per_iteration": 4.311020135879517 }, { "auxiliary_loss_clip": 0.01520196, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.31969726, "balance_loss_mlp": 1.01647854, "epoch": 0.30590710957462797, "flos": 24691866439680.0, "grad_norm": 3.115184633124134, "language_loss": 0.73422849, "learning_rate": 3.2531900858070885e-06, "loss": 0.75985289, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.25793457, "step": 5088, "time_per_iteration": 2.9140238761901855 }, { "auxiliary_loss_clip": 0.01537738, "auxiliary_loss_mlp": 0.01047319, "balance_loss_clip": 1.33300662, "balance_loss_mlp": 1.02187932, "epoch": 0.30596723282729593, "flos": 17094344711040.0, "grad_norm": 3.0642660844719773, "language_loss": 0.8068068, "learning_rate": 3.252886537028521e-06, "loss": 0.8326574, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 2.04980469, "router_z_loss_mlp": 0.25439453, "step": 5089, "time_per_iteration": 2.8644042015075684 }, { "auxiliary_loss_clip": 0.01519459, "auxiliary_loss_mlp": 0.01047748, "balance_loss_clip": 1.32310915, "balance_loss_mlp": 1.02284551, "epoch": 0.30602735607996395, "flos": 22867454609280.0, "grad_norm": 1.9535396371306832, "language_loss": 0.77552128, "learning_rate": 3.2525829407403703e-06, "loss": 0.80119324, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.24902344, "step": 5090, "time_per_iteration": 2.8599305152893066 }, { "auxiliary_loss_clip": 0.01535799, "auxiliary_loss_mlp": 0.01049042, "balance_loss_clip": 1.33339548, "balance_loss_mlp": 1.02391243, "epoch": 0.3060874793326319, "flos": 29873416832640.0, "grad_norm": 2.0957961040183055, "language_loss": 0.77294654, "learning_rate": 3.2522792969541488e-06, "loss": 0.79879498, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.25134277, "step": 5091, "time_per_iteration": 2.9089274406433105 }, { "auxiliary_loss_clip": 0.01529658, "auxiliary_loss_mlp": 0.01053244, "balance_loss_clip": 1.33017075, "balance_loss_mlp": 1.02853143, "epoch": 0.3061476025852999, "flos": 20458179504000.0, "grad_norm": 1.6035818326431206, "language_loss": 0.72225702, "learning_rate": 3.2519756056813705e-06, "loss": 0.74808598, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.24694824, "step": 5092, "time_per_iteration": 2.865889072418213 }, { "auxiliary_loss_clip": 0.01533631, "auxiliary_loss_mlp": 0.01055001, "balance_loss_clip": 1.33486342, "balance_loss_mlp": 1.03047991, "epoch": 0.30620772583796785, "flos": 19401058558080.0, "grad_norm": 1.897035939274073, "language_loss": 0.83435655, "learning_rate": 3.2516718669335522e-06, "loss": 0.8602429, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 1.98730469, "router_z_loss_mlp": 0.24536133, "step": 5093, "time_per_iteration": 2.850254774093628 }, { "auxiliary_loss_clip": 0.01522415, "auxiliary_loss_mlp": 0.01050518, "balance_loss_clip": 1.32685065, "balance_loss_mlp": 1.02683139, "epoch": 0.3062678490906358, "flos": 24035190387840.0, "grad_norm": 2.5222979367463623, "language_loss": 0.76076043, "learning_rate": 3.2513680807222114e-06, "loss": 0.78648973, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.23706055, "step": 5094, "time_per_iteration": 2.870781660079956 }, { "auxiliary_loss_clip": 0.01512669, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.31848955, "balance_loss_mlp": 1.01944566, "epoch": 0.3063279723433038, "flos": 19763877761280.0, "grad_norm": 1.892468372967705, "language_loss": 0.76502579, "learning_rate": 3.251064247058868e-06, "loss": 0.79058325, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.2364502, "step": 5095, "time_per_iteration": 2.8397059440612793 }, { "auxiliary_loss_clip": 0.0152033, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.32562113, "balance_loss_mlp": 1.02399755, "epoch": 0.30638809559597174, "flos": 22458775161600.0, "grad_norm": 1.8875145556916268, "language_loss": 0.81427395, "learning_rate": 3.250760365955042e-06, "loss": 0.83997786, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.26074219, "step": 5096, "time_per_iteration": 2.903921127319336 }, { "auxiliary_loss_clip": 0.01534285, "auxiliary_loss_mlp": 0.01046095, "balance_loss_clip": 1.33440244, "balance_loss_mlp": 1.02114415, "epoch": 0.3064482188486397, "flos": 17173984838400.0, "grad_norm": 4.969834589557623, "language_loss": 0.82575011, "learning_rate": 3.250456437422258e-06, "loss": 0.85155392, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.24975586, "step": 5097, "time_per_iteration": 2.8769636154174805 }, { "auxiliary_loss_clip": 0.01522134, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.32470536, "balance_loss_mlp": 1.01974499, "epoch": 0.3065083421013077, "flos": 23778868740480.0, "grad_norm": 1.789587327546449, "language_loss": 0.78016669, "learning_rate": 3.250152461472041e-06, "loss": 0.80582619, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.24072266, "step": 5098, "time_per_iteration": 2.8389575481414795 }, { "auxiliary_loss_clip": 0.01518873, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.32315099, "balance_loss_mlp": 1.02240944, "epoch": 0.30656846535397564, "flos": 26442881925120.0, "grad_norm": 1.8258649155228994, "language_loss": 0.84975684, "learning_rate": 3.249848438115917e-06, "loss": 0.87541521, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.24536133, "step": 5099, "time_per_iteration": 2.9014062881469727 }, { "auxiliary_loss_clip": 0.01528207, "auxiliary_loss_mlp": 0.01043988, "balance_loss_clip": 1.33033872, "balance_loss_mlp": 1.01846528, "epoch": 0.3066285886066436, "flos": 26663161449600.0, "grad_norm": 1.6260609993425659, "language_loss": 0.8620078, "learning_rate": 3.2495443673654148e-06, "loss": 0.8877297, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.25488281, "step": 5100, "time_per_iteration": 2.905745506286621 }, { "auxiliary_loss_clip": 0.01529324, "auxiliary_loss_mlp": 0.01041761, "balance_loss_clip": 1.33169007, "balance_loss_mlp": 1.01694202, "epoch": 0.30668871185931157, "flos": 15058476092160.0, "grad_norm": 2.191230103265372, "language_loss": 0.7979722, "learning_rate": 3.249240249232065e-06, "loss": 0.82368302, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.24841309, "step": 5101, "time_per_iteration": 2.8432817459106445 }, { "auxiliary_loss_clip": 0.01544967, "auxiliary_loss_mlp": 0.01047844, "balance_loss_clip": 1.34394419, "balance_loss_mlp": 1.02067614, "epoch": 0.30674883511197953, "flos": 20091197779200.0, "grad_norm": 1.914733799970639, "language_loss": 0.808514, "learning_rate": 3.2489360837273998e-06, "loss": 0.83444214, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.27148438, "step": 5102, "time_per_iteration": 2.8522660732269287 }, { "auxiliary_loss_clip": 0.01523484, "auxiliary_loss_mlp": 0.01043732, "balance_loss_clip": 1.32636428, "balance_loss_mlp": 1.01770818, "epoch": 0.30680895836464755, "flos": 22904220648960.0, "grad_norm": 2.49579542049509, "language_loss": 0.8938908, "learning_rate": 3.2486318708629532e-06, "loss": 0.91956294, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.26037598, "step": 5103, "time_per_iteration": 2.8759870529174805 }, { "auxiliary_loss_clip": 0.01534162, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.33456349, "balance_loss_mlp": 1.01747656, "epoch": 0.3068690816173155, "flos": 23706694005120.0, "grad_norm": 1.6517259168487446, "language_loss": 0.74499571, "learning_rate": 3.2483276106502607e-06, "loss": 0.77075255, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 1.99707031, "router_z_loss_mlp": 0.24035645, "step": 5104, "time_per_iteration": 2.9119296073913574 }, { "auxiliary_loss_clip": 0.01546868, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.34403133, "balance_loss_mlp": 1.0232352, "epoch": 0.3069292048699835, "flos": 23561756352000.0, "grad_norm": 1.9020941932175348, "language_loss": 0.73579848, "learning_rate": 3.2480233031008605e-06, "loss": 0.7617417, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.24230957, "step": 5105, "time_per_iteration": 2.8734936714172363 }, { "auxiliary_loss_clip": 0.01525881, "auxiliary_loss_mlp": 0.01046086, "balance_loss_clip": 1.32831848, "balance_loss_mlp": 1.02136171, "epoch": 0.30698932812265145, "flos": 24541454165760.0, "grad_norm": 2.002436644820995, "language_loss": 0.87981296, "learning_rate": 3.2477189482262916e-06, "loss": 0.9055326, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.24731445, "step": 5106, "time_per_iteration": 2.8930745124816895 }, { "auxiliary_loss_clip": 0.01552618, "auxiliary_loss_mlp": 0.01049641, "balance_loss_clip": 1.34711361, "balance_loss_mlp": 1.02509534, "epoch": 0.3070494513753194, "flos": 21006321984000.0, "grad_norm": 2.2326936823105448, "language_loss": 0.71825612, "learning_rate": 3.2474145460380945e-06, "loss": 0.74427873, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.2454834, "step": 5107, "time_per_iteration": 2.9113588333129883 }, { "auxiliary_loss_clip": 0.01510151, "auxiliary_loss_mlp": 0.01043753, "balance_loss_clip": 1.31499195, "balance_loss_mlp": 1.01895738, "epoch": 0.3071095746279874, "flos": 19035253198080.0, "grad_norm": 2.304917200351878, "language_loss": 0.7290076, "learning_rate": 3.247110096547814e-06, "loss": 0.75454658, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.24816895, "step": 5108, "time_per_iteration": 2.927574872970581 }, { "auxiliary_loss_clip": 0.01528865, "auxiliary_loss_mlp": 0.01045698, "balance_loss_clip": 1.33025932, "balance_loss_mlp": 1.02053344, "epoch": 0.30716969788065535, "flos": 21225515633280.0, "grad_norm": 1.6231989282709887, "language_loss": 0.86450469, "learning_rate": 3.2468055997669926e-06, "loss": 0.89025038, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.25146484, "step": 5109, "time_per_iteration": 2.8859140872955322 }, { "auxiliary_loss_clip": 0.01516865, "auxiliary_loss_mlp": 0.01040363, "balance_loss_clip": 1.31999874, "balance_loss_mlp": 1.01681876, "epoch": 0.3072298211333233, "flos": 25783038737280.0, "grad_norm": 1.948210976240241, "language_loss": 0.68380982, "learning_rate": 3.2465010557071788e-06, "loss": 0.70938212, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.2355957, "step": 5110, "time_per_iteration": 2.8982293605804443 }, { "auxiliary_loss_clip": 0.01517458, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.32332313, "balance_loss_mlp": 1.01465225, "epoch": 0.3072899443859913, "flos": 25860054666240.0, "grad_norm": 1.6656944395037456, "language_loss": 0.77718282, "learning_rate": 3.246196464379919e-06, "loss": 0.80272686, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.22302246, "step": 5111, "time_per_iteration": 2.9260807037353516 }, { "auxiliary_loss_clip": 0.01529647, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.32957387, "balance_loss_mlp": 1.0167836, "epoch": 0.30735006763865924, "flos": 25934174928000.0, "grad_norm": 1.9415890381017564, "language_loss": 0.68009096, "learning_rate": 3.245891825796765e-06, "loss": 0.70580727, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.25219727, "step": 5112, "time_per_iteration": 2.8874776363372803 }, { "auxiliary_loss_clip": 0.01542861, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.3413198, "balance_loss_mlp": 1.01275277, "epoch": 0.3074101908913272, "flos": 30928547007360.0, "grad_norm": 2.239094134335687, "language_loss": 0.80723393, "learning_rate": 3.2455871399692678e-06, "loss": 0.8330403, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 2.01367188, "router_z_loss_mlp": 0.25024414, "step": 5113, "time_per_iteration": 4.402537107467651 }, { "auxiliary_loss_clip": 0.01538218, "auxiliary_loss_mlp": 0.01039481, "balance_loss_clip": 1.33610368, "balance_loss_mlp": 1.01600885, "epoch": 0.30747031414399517, "flos": 18409008913920.0, "grad_norm": 2.5969231933618446, "language_loss": 0.7809484, "learning_rate": 3.2452824069089815e-06, "loss": 0.80672538, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.23486328, "step": 5114, "time_per_iteration": 2.8215558528900146 }, { "auxiliary_loss_clip": 0.01518241, "auxiliary_loss_mlp": 0.01041387, "balance_loss_clip": 1.32215071, "balance_loss_mlp": 1.01659191, "epoch": 0.30753043739666314, "flos": 22642152912000.0, "grad_norm": 2.0155495486802324, "language_loss": 0.62689435, "learning_rate": 3.2449776266274623e-06, "loss": 0.65249068, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.2479248, "step": 5115, "time_per_iteration": 2.852583646774292 }, { "auxiliary_loss_clip": 0.01528682, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.33094668, "balance_loss_mlp": 1.01256657, "epoch": 0.3075905606493311, "flos": 27355381931520.0, "grad_norm": 2.6441278351990456, "language_loss": 0.83239281, "learning_rate": 3.2446727991362657e-06, "loss": 0.85803831, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.23291016, "step": 5116, "time_per_iteration": 2.9011270999908447 }, { "auxiliary_loss_clip": 0.01524375, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.32757926, "balance_loss_mlp": 1.01599324, "epoch": 0.3076506839019991, "flos": 22100932886400.0, "grad_norm": 1.6984312372638461, "language_loss": 0.76331145, "learning_rate": 3.244367924446952e-06, "loss": 0.7889421, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 1.96582031, "router_z_loss_mlp": 0.22680664, "step": 5117, "time_per_iteration": 2.8562419414520264 }, { "auxiliary_loss_clip": 0.01552827, "auxiliary_loss_mlp": 0.01045559, "balance_loss_clip": 1.35241258, "balance_loss_mlp": 1.02186036, "epoch": 0.3077108071546671, "flos": 21299816874240.0, "grad_norm": 2.731567314424615, "language_loss": 0.72724116, "learning_rate": 3.2440630025710826e-06, "loss": 0.75322503, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.23706055, "step": 5118, "time_per_iteration": 2.824920415878296 }, { "auxiliary_loss_clip": 0.01527984, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.3284142, "balance_loss_mlp": 1.01634634, "epoch": 0.30777093040733505, "flos": 21440139557760.0, "grad_norm": 1.8109162567834796, "language_loss": 0.7507714, "learning_rate": 3.243758033520219e-06, "loss": 0.77644837, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.23364258, "step": 5119, "time_per_iteration": 4.324901342391968 }, { "auxiliary_loss_clip": 0.01548319, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.34546447, "balance_loss_mlp": 1.02105653, "epoch": 0.307831053660003, "flos": 23159773134720.0, "grad_norm": 2.1368306676196496, "language_loss": 0.80891538, "learning_rate": 3.243453017305926e-06, "loss": 0.83484501, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 2.02929688, "router_z_loss_mlp": 0.23583984, "step": 5120, "time_per_iteration": 2.945108652114868 }, { "auxiliary_loss_clip": 0.01530168, "auxiliary_loss_mlp": 0.01043595, "balance_loss_clip": 1.3344996, "balance_loss_mlp": 1.01934814, "epoch": 0.307891176912671, "flos": 17028685226880.0, "grad_norm": 1.7640058377486132, "language_loss": 0.80951458, "learning_rate": 3.24314795393977e-06, "loss": 0.83525223, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.24243164, "step": 5121, "time_per_iteration": 2.8610305786132812 }, { "auxiliary_loss_clip": 0.01541617, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.345433, "balance_loss_mlp": 1.01543283, "epoch": 0.30795130016533895, "flos": 27715441201920.0, "grad_norm": 1.669702808430349, "language_loss": 0.8335973, "learning_rate": 3.242842843433319e-06, "loss": 0.85939896, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.23120117, "step": 5122, "time_per_iteration": 5.926538467407227 }, { "auxiliary_loss_clip": 0.01349946, "auxiliary_loss_mlp": 0.01097227, "balance_loss_clip": 1.23812473, "balance_loss_mlp": 1.06938004, "epoch": 0.3080114234180069, "flos": 69093262356480.0, "grad_norm": 0.7689857478606799, "language_loss": 0.58620989, "learning_rate": 3.242537685798143e-06, "loss": 0.61068165, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.27929688, "step": 5123, "time_per_iteration": 3.566836357116699 }, { "auxiliary_loss_clip": 0.01567432, "auxiliary_loss_mlp": 0.01044474, "balance_loss_clip": 1.35920274, "balance_loss_mlp": 1.02051306, "epoch": 0.3080715466706749, "flos": 24070644328320.0, "grad_norm": 1.6339584945848296, "language_loss": 0.83718997, "learning_rate": 3.242232481045813e-06, "loss": 0.86330903, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 2.08300781, "router_z_loss_mlp": 0.23974609, "step": 5124, "time_per_iteration": 2.906496286392212 }, { "auxiliary_loss_clip": 0.0154116, "auxiliary_loss_mlp": 0.01039151, "balance_loss_clip": 1.33967507, "balance_loss_mlp": 1.01595235, "epoch": 0.30813166992334284, "flos": 25859737952640.0, "grad_norm": 2.3551759008249102, "language_loss": 0.80508423, "learning_rate": 3.2419272291879035e-06, "loss": 0.83088732, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.23193359, "step": 5125, "time_per_iteration": 2.8837056159973145 }, { "auxiliary_loss_clip": 0.01558683, "auxiliary_loss_mlp": 0.01047149, "balance_loss_clip": 1.35168862, "balance_loss_mlp": 1.0222578, "epoch": 0.3081917931760108, "flos": 20459491603200.0, "grad_norm": 1.838447393341427, "language_loss": 0.65303266, "learning_rate": 3.241621930235989e-06, "loss": 0.67909098, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 2.0703125, "router_z_loss_mlp": 0.24902344, "step": 5126, "time_per_iteration": 2.945152521133423 }, { "auxiliary_loss_clip": 0.01521672, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.32662487, "balance_loss_mlp": 1.01981211, "epoch": 0.3082519164286788, "flos": 22177043919360.0, "grad_norm": 1.644532973600044, "language_loss": 0.87322229, "learning_rate": 3.241316584201646e-06, "loss": 0.89887393, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.23681641, "step": 5127, "time_per_iteration": 2.885012626647949 }, { "auxiliary_loss_clip": 0.01538359, "auxiliary_loss_mlp": 0.01043188, "balance_loss_clip": 1.34079218, "balance_loss_mlp": 1.02113461, "epoch": 0.30831203968134674, "flos": 28925236661760.0, "grad_norm": 2.6522698950649555, "language_loss": 0.69152355, "learning_rate": 3.2410111910964538e-06, "loss": 0.71733904, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.22045898, "step": 5128, "time_per_iteration": 2.9108567237854004 }, { "auxiliary_loss_clip": 0.01557042, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.3520174, "balance_loss_mlp": 1.02231193, "epoch": 0.3083721629340147, "flos": 25679120135040.0, "grad_norm": 2.0036776184323593, "language_loss": 0.72545362, "learning_rate": 3.240705750931993e-06, "loss": 0.75148451, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 2.05078125, "router_z_loss_mlp": 0.23742676, "step": 5129, "time_per_iteration": 2.8721532821655273 }, { "auxiliary_loss_clip": 0.01342799, "auxiliary_loss_mlp": 0.01061602, "balance_loss_clip": 1.22659898, "balance_loss_mlp": 1.02917755, "epoch": 0.3084322861866827, "flos": 68245245469440.0, "grad_norm": 0.8419197658500721, "language_loss": 0.59387404, "learning_rate": 3.240400263719846e-06, "loss": 0.61791801, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 0.32421875, "step": 5130, "time_per_iteration": 3.349917411804199 }, { "auxiliary_loss_clip": 0.01571579, "auxiliary_loss_mlp": 0.01044835, "balance_loss_clip": 1.36578298, "balance_loss_mlp": 1.02157688, "epoch": 0.3084924094393507, "flos": 20304464359680.0, "grad_norm": 2.2511400152667886, "language_loss": 0.74416912, "learning_rate": 3.2400947294715957e-06, "loss": 0.77033329, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 2.05957031, "router_z_loss_mlp": 0.23266602, "step": 5131, "time_per_iteration": 2.9103968143463135 }, { "auxiliary_loss_clip": 0.01552318, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.35395312, "balance_loss_mlp": 1.02313387, "epoch": 0.30855253269201866, "flos": 23959984250880.0, "grad_norm": 1.5641806292141003, "language_loss": 0.71842885, "learning_rate": 3.2397891481988303e-06, "loss": 0.74440163, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.21838379, "step": 5132, "time_per_iteration": 2.8766536712646484 }, { "auxiliary_loss_clip": 0.01536818, "auxiliary_loss_mlp": 0.01045376, "balance_loss_clip": 1.34228539, "balance_loss_mlp": 1.02348971, "epoch": 0.3086126559446866, "flos": 19291393866240.0, "grad_norm": 1.7900727868980226, "language_loss": 0.90473872, "learning_rate": 3.239483519913136e-06, "loss": 0.93056065, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.21899414, "step": 5133, "time_per_iteration": 2.84804105758667 }, { "auxiliary_loss_clip": 0.01553802, "auxiliary_loss_mlp": 0.01045267, "balance_loss_clip": 1.34980536, "balance_loss_mlp": 1.0218066, "epoch": 0.3086727791973546, "flos": 33772499337600.0, "grad_norm": 3.184449077739982, "language_loss": 0.68374538, "learning_rate": 3.239177844626102e-06, "loss": 0.70973611, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.23461914, "step": 5134, "time_per_iteration": 2.9653737545013428 }, { "auxiliary_loss_clip": 0.0155578, "auxiliary_loss_mlp": 0.01048282, "balance_loss_clip": 1.35136724, "balance_loss_mlp": 1.02551317, "epoch": 0.30873290245002255, "flos": 16042698385920.0, "grad_norm": 2.181458572614256, "language_loss": 0.83564591, "learning_rate": 3.2388721223493197e-06, "loss": 0.86168659, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 2.04492188, "router_z_loss_mlp": 0.22766113, "step": 5135, "time_per_iteration": 2.8393285274505615 }, { "auxiliary_loss_clip": 0.01324988, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.20963204, "balance_loss_mlp": 0.99910849, "epoch": 0.3087930257026905, "flos": 65082614878080.0, "grad_norm": 0.7015897598997128, "language_loss": 0.55332732, "learning_rate": 3.2385663530943824e-06, "loss": 0.57688683, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.31835938, "step": 5136, "time_per_iteration": 3.4245336055755615 }, { "auxiliary_loss_clip": 0.01546245, "auxiliary_loss_mlp": 0.01045048, "balance_loss_clip": 1.34735787, "balance_loss_mlp": 1.02133775, "epoch": 0.3088531489553585, "flos": 74762372442240.0, "grad_norm": 2.022984386543483, "language_loss": 0.77013546, "learning_rate": 3.2382605368728852e-06, "loss": 0.7960484, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.23718262, "step": 5137, "time_per_iteration": 3.24355411529541 }, { "auxiliary_loss_clip": 0.0155015, "auxiliary_loss_mlp": 0.01045918, "balance_loss_clip": 1.34925818, "balance_loss_mlp": 1.02431679, "epoch": 0.30891327220802645, "flos": 21152209777920.0, "grad_norm": 2.4242515525288013, "language_loss": 0.80838501, "learning_rate": 3.237954673696424e-06, "loss": 0.8343457, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.21606445, "step": 5138, "time_per_iteration": 2.948270082473755 }, { "auxiliary_loss_clip": 0.01554887, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.35224485, "balance_loss_mlp": 1.02399433, "epoch": 0.3089733954606944, "flos": 25675048103040.0, "grad_norm": 1.563443530945742, "language_loss": 0.82123864, "learning_rate": 3.2376487635765983e-06, "loss": 0.84725553, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.22827148, "step": 5139, "time_per_iteration": 2.903390407562256 }, { "auxiliary_loss_clip": 0.01567846, "auxiliary_loss_mlp": 0.0105127, "balance_loss_clip": 1.35850859, "balance_loss_mlp": 1.02670145, "epoch": 0.3090335187133624, "flos": 19436783967360.0, "grad_norm": 1.895995878801638, "language_loss": 0.77577549, "learning_rate": 3.2373428065250067e-06, "loss": 0.80196667, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.24560547, "step": 5140, "time_per_iteration": 2.831421136856079 }, { "auxiliary_loss_clip": 0.0151548, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.32320213, "balance_loss_mlp": 1.02851617, "epoch": 0.30909364196603034, "flos": 20020968570240.0, "grad_norm": 1.8001831515779823, "language_loss": 0.788001, "learning_rate": 3.237036802553252e-06, "loss": 0.81365794, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.21704102, "step": 5141, "time_per_iteration": 2.863175868988037 }, { "auxiliary_loss_clip": 0.01549421, "auxiliary_loss_mlp": 0.01047338, "balance_loss_clip": 1.34595513, "balance_loss_mlp": 1.02274561, "epoch": 0.3091537652186983, "flos": 19685730712320.0, "grad_norm": 2.4835727325304315, "language_loss": 0.87957466, "learning_rate": 3.2367307516729377e-06, "loss": 0.90554225, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.24597168, "step": 5142, "time_per_iteration": 2.822154998779297 }, { "auxiliary_loss_clip": 0.01553146, "auxiliary_loss_mlp": 0.0104766, "balance_loss_clip": 1.34963107, "balance_loss_mlp": 1.02472472, "epoch": 0.3092138884713663, "flos": 17029047185280.0, "grad_norm": 2.705306223159886, "language_loss": 0.7999056, "learning_rate": 3.23642465389567e-06, "loss": 0.82591361, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.22924805, "step": 5143, "time_per_iteration": 2.8427388668060303 }, { "auxiliary_loss_clip": 0.01532706, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.33411789, "balance_loss_mlp": 1.02056265, "epoch": 0.3092740117240343, "flos": 25020951004800.0, "grad_norm": 3.8964071788514563, "language_loss": 0.73031723, "learning_rate": 3.236118509233055e-06, "loss": 0.75608248, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.2322998, "step": 5144, "time_per_iteration": 2.87309193611145 }, { "auxiliary_loss_clip": 0.01547209, "auxiliary_loss_mlp": 0.01050724, "balance_loss_clip": 1.34349203, "balance_loss_mlp": 1.02808666, "epoch": 0.30933413497670226, "flos": 25600656372480.0, "grad_norm": 3.02704440295389, "language_loss": 0.74580085, "learning_rate": 3.235812317696702e-06, "loss": 0.77178013, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 2.03808594, "router_z_loss_mlp": 0.22631836, "step": 5145, "time_per_iteration": 2.9030110836029053 }, { "auxiliary_loss_clip": 0.01525864, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.32646704, "balance_loss_mlp": 1.02198625, "epoch": 0.3093942582293702, "flos": 24400452810240.0, "grad_norm": 1.712915366471327, "language_loss": 0.76811993, "learning_rate": 3.2355060792982224e-06, "loss": 0.79383641, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.23815918, "step": 5146, "time_per_iteration": 2.935161828994751 }, { "auxiliary_loss_clip": 0.01533833, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.33596623, "balance_loss_mlp": 1.01325989, "epoch": 0.3094543814820382, "flos": 19655977616640.0, "grad_norm": 2.1561556727538225, "language_loss": 0.67911488, "learning_rate": 3.2351997940492286e-06, "loss": 0.7048049, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.21911621, "step": 5147, "time_per_iteration": 2.8600285053253174 }, { "auxiliary_loss_clip": 0.01554172, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.35192049, "balance_loss_mlp": 1.02290606, "epoch": 0.30951450473470615, "flos": 25674188451840.0, "grad_norm": 2.724317064246959, "language_loss": 0.76115531, "learning_rate": 3.2348934619613346e-06, "loss": 0.78714895, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.22290039, "step": 5148, "time_per_iteration": 4.3097755908966064 }, { "auxiliary_loss_clip": 0.0156764, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.35811794, "balance_loss_mlp": 1.02368116, "epoch": 0.3095746279873741, "flos": 12027843141120.0, "grad_norm": 2.0187723623022893, "language_loss": 0.73697513, "learning_rate": 3.2345870830461567e-06, "loss": 0.76312745, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 0.23913574, "step": 5149, "time_per_iteration": 2.8093461990356445 }, { "auxiliary_loss_clip": 0.01550079, "auxiliary_loss_mlp": 0.01045221, "balance_loss_clip": 1.34458899, "balance_loss_mlp": 1.02143896, "epoch": 0.3096347512400421, "flos": 23633523884160.0, "grad_norm": 1.943822926546144, "language_loss": 0.85785097, "learning_rate": 3.2342806573153132e-06, "loss": 0.88380396, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.23791504, "step": 5150, "time_per_iteration": 2.9348647594451904 }, { "auxiliary_loss_clip": 0.01537678, "auxiliary_loss_mlp": 0.01039943, "balance_loss_clip": 1.33914888, "balance_loss_mlp": 1.01687646, "epoch": 0.30969487449271005, "flos": 22539727388160.0, "grad_norm": 2.3868174092431294, "language_loss": 0.79478151, "learning_rate": 3.233974184780424e-06, "loss": 0.82055771, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.23059082, "step": 5151, "time_per_iteration": 2.9043610095977783 }, { "auxiliary_loss_clip": 0.01546491, "auxiliary_loss_mlp": 0.01041218, "balance_loss_clip": 1.34413159, "balance_loss_mlp": 1.01810312, "epoch": 0.309754997745378, "flos": 15276267152640.0, "grad_norm": 2.320458237701512, "language_loss": 0.67965043, "learning_rate": 3.2336676654531084e-06, "loss": 0.70552754, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.23144531, "step": 5152, "time_per_iteration": 2.8461403846740723 }, { "auxiliary_loss_clip": 0.01542695, "auxiliary_loss_mlp": 0.01044079, "balance_loss_clip": 1.3417486, "balance_loss_mlp": 1.01956964, "epoch": 0.309815120998046, "flos": 26990526712320.0, "grad_norm": 2.160614435426142, "language_loss": 0.83397353, "learning_rate": 3.2333610993449926e-06, "loss": 0.85984129, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24523926, "step": 5153, "time_per_iteration": 2.9329991340637207 }, { "auxiliary_loss_clip": 0.01540112, "auxiliary_loss_mlp": 0.01045228, "balance_loss_clip": 1.34035313, "balance_loss_mlp": 1.0209806, "epoch": 0.30987524425071394, "flos": 21153295653120.0, "grad_norm": 1.859146620208576, "language_loss": 0.74863404, "learning_rate": 3.2330544864676997e-06, "loss": 0.77448738, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 1.99414062, "router_z_loss_mlp": 0.24255371, "step": 5154, "time_per_iteration": 4.276312351226807 }, { "auxiliary_loss_clip": 0.01541806, "auxiliary_loss_mlp": 0.01045913, "balance_loss_clip": 1.34461427, "balance_loss_mlp": 1.02215505, "epoch": 0.3099353675033819, "flos": 15276719600640.0, "grad_norm": 2.020028819122007, "language_loss": 0.7723763, "learning_rate": 3.232747826832858e-06, "loss": 0.79825354, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.2376709, "step": 5155, "time_per_iteration": 2.8350586891174316 }, { "auxiliary_loss_clip": 0.01543558, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.34094453, "balance_loss_mlp": 1.01736045, "epoch": 0.30999549075604993, "flos": 15422154946560.0, "grad_norm": 1.8858272665506959, "language_loss": 0.79840457, "learning_rate": 3.232441120452094e-06, "loss": 0.82425535, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 2.02832031, "router_z_loss_mlp": 0.24169922, "step": 5156, "time_per_iteration": 2.904628276824951 }, { "auxiliary_loss_clip": 0.0154662, "auxiliary_loss_mlp": 0.01045932, "balance_loss_clip": 1.3425107, "balance_loss_mlp": 1.02170849, "epoch": 0.3100556140087179, "flos": 23194729382400.0, "grad_norm": 2.092220494959802, "language_loss": 0.75463319, "learning_rate": 3.23213436733704e-06, "loss": 0.78055871, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 2.04101562, "router_z_loss_mlp": 0.2421875, "step": 5157, "time_per_iteration": 5.64022970199585 }, { "auxiliary_loss_clip": 0.01521826, "auxiliary_loss_mlp": 0.01041533, "balance_loss_clip": 1.32483804, "balance_loss_mlp": 1.01739323, "epoch": 0.31011573726138586, "flos": 25753964313600.0, "grad_norm": 1.6284773653250768, "language_loss": 0.70344549, "learning_rate": 3.231827567499327e-06, "loss": 0.72907901, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.24133301, "step": 5158, "time_per_iteration": 2.91727876663208 }, { "auxiliary_loss_clip": 0.01509119, "auxiliary_loss_mlp": 0.01040497, "balance_loss_clip": 1.31519365, "balance_loss_mlp": 1.0178709, "epoch": 0.3101758605140538, "flos": 20020968570240.0, "grad_norm": 2.203945764206072, "language_loss": 0.85349101, "learning_rate": 3.2315207209505896e-06, "loss": 0.87898719, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.22607422, "step": 5159, "time_per_iteration": 2.873427391052246 }, { "auxiliary_loss_clip": 0.01538873, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.33817887, "balance_loss_mlp": 1.01569486, "epoch": 0.3102359837667218, "flos": 19145098869120.0, "grad_norm": 1.8295323207005212, "language_loss": 0.85800552, "learning_rate": 3.231213827702462e-06, "loss": 0.88379353, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.24206543, "step": 5160, "time_per_iteration": 2.893089771270752 }, { "auxiliary_loss_clip": 0.0152133, "auxiliary_loss_mlp": 0.01039228, "balance_loss_clip": 1.32574999, "balance_loss_mlp": 1.01550555, "epoch": 0.31029610701938976, "flos": 22273542374400.0, "grad_norm": 8.985555580697863, "language_loss": 0.7698763, "learning_rate": 3.230906887766584e-06, "loss": 0.79548186, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.23718262, "step": 5161, "time_per_iteration": 2.8945724964141846 }, { "auxiliary_loss_clip": 0.0154451, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.34044147, "balance_loss_mlp": 1.02202868, "epoch": 0.3103562302720577, "flos": 20812945132800.0, "grad_norm": 2.018752707764171, "language_loss": 0.82452875, "learning_rate": 3.2305999011545924e-06, "loss": 0.85044336, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 2.04003906, "router_z_loss_mlp": 0.24914551, "step": 5162, "time_per_iteration": 3.060969591140747 }, { "auxiliary_loss_clip": 0.01523506, "auxiliary_loss_mlp": 0.01040424, "balance_loss_clip": 1.32657146, "balance_loss_mlp": 1.01678503, "epoch": 0.3104163535247257, "flos": 22353815928960.0, "grad_norm": 1.5365247864117302, "language_loss": 0.83326781, "learning_rate": 3.2302928678781295e-06, "loss": 0.8589071, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.23657227, "step": 5163, "time_per_iteration": 2.8725178241729736 }, { "auxiliary_loss_clip": 0.01533234, "auxiliary_loss_mlp": 0.01039677, "balance_loss_clip": 1.33479047, "balance_loss_mlp": 1.01664615, "epoch": 0.31047647677739365, "flos": 21699356872320.0, "grad_norm": 2.1596707371073185, "language_loss": 0.76513338, "learning_rate": 3.2299857879488376e-06, "loss": 0.79086256, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.23010254, "step": 5164, "time_per_iteration": 2.928584337234497 }, { "auxiliary_loss_clip": 0.01530462, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.33321035, "balance_loss_mlp": 1.0156796, "epoch": 0.3105366000300616, "flos": 18926764871040.0, "grad_norm": 2.694951405418487, "language_loss": 0.75826049, "learning_rate": 3.2296786613783626e-06, "loss": 0.78395832, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.23632812, "step": 5165, "time_per_iteration": 2.8837180137634277 }, { "auxiliary_loss_clip": 0.01532458, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.33581686, "balance_loss_mlp": 1.01558375, "epoch": 0.3105967232827296, "flos": 18269681616000.0, "grad_norm": 1.4771711477224014, "language_loss": 0.76693726, "learning_rate": 3.229371488178348e-06, "loss": 0.79265988, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.24230957, "step": 5166, "time_per_iteration": 2.8363800048828125 }, { "auxiliary_loss_clip": 0.01533831, "auxiliary_loss_mlp": 0.01039818, "balance_loss_clip": 1.33545041, "balance_loss_mlp": 1.01557124, "epoch": 0.31065684653539755, "flos": 17680112881920.0, "grad_norm": 2.423809866077527, "language_loss": 0.74884462, "learning_rate": 3.229064268360444e-06, "loss": 0.77458107, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.24230957, "step": 5167, "time_per_iteration": 2.82952880859375 }, { "auxiliary_loss_clip": 0.01300186, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.18061972, "balance_loss_mlp": 1.00111639, "epoch": 0.3107169697880655, "flos": 68562294917760.0, "grad_norm": 0.7167578015775972, "language_loss": 0.53039575, "learning_rate": 3.2287570019362997e-06, "loss": 0.55365479, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 1.1953125, "router_z_loss_mlp": 0.24511719, "step": 5168, "time_per_iteration": 3.4684948921203613 }, { "auxiliary_loss_clip": 0.0153787, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.33435464, "balance_loss_mlp": 1.01194191, "epoch": 0.3107770930407335, "flos": 13196664794880.0, "grad_norm": 1.9002428924721744, "language_loss": 0.80362558, "learning_rate": 3.2284496889175668e-06, "loss": 0.82937622, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 2.03417969, "router_z_loss_mlp": 0.25244141, "step": 5169, "time_per_iteration": 2.9534802436828613 }, { "auxiliary_loss_clip": 0.01539642, "auxiliary_loss_mlp": 0.0104674, "balance_loss_clip": 1.33621621, "balance_loss_mlp": 1.02240908, "epoch": 0.3108372162934015, "flos": 31594407753600.0, "grad_norm": 1.5891733678315627, "language_loss": 0.65302378, "learning_rate": 3.2281423293158986e-06, "loss": 0.67888761, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 2.03515625, "router_z_loss_mlp": 0.2434082, "step": 5170, "time_per_iteration": 2.984879493713379 }, { "auxiliary_loss_clip": 0.01535156, "auxiliary_loss_mlp": 0.01041679, "balance_loss_clip": 1.33716869, "balance_loss_mlp": 1.01666927, "epoch": 0.31089733954606946, "flos": 28741315973760.0, "grad_norm": 2.090131672071812, "language_loss": 0.78241563, "learning_rate": 3.22783492314295e-06, "loss": 0.80818403, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.25, "step": 5171, "time_per_iteration": 2.9152045249938965 }, { "auxiliary_loss_clip": 0.01535092, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.33353329, "balance_loss_mlp": 1.01979542, "epoch": 0.3109574627987374, "flos": 19692879390720.0, "grad_norm": 2.240237915990043, "language_loss": 0.84203506, "learning_rate": 3.2275274704103785e-06, "loss": 0.867836, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 2.01464844, "router_z_loss_mlp": 0.25219727, "step": 5172, "time_per_iteration": 2.868312120437622 }, { "auxiliary_loss_clip": 0.01532197, "auxiliary_loss_mlp": 0.01042832, "balance_loss_clip": 1.33007956, "balance_loss_mlp": 1.01810789, "epoch": 0.3110175860514054, "flos": 14692127794560.0, "grad_norm": 2.1136543169132187, "language_loss": 0.85858458, "learning_rate": 3.227219971129842e-06, "loss": 0.8843348, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.24743652, "step": 5173, "time_per_iteration": 2.8316757678985596 }, { "auxiliary_loss_clip": 0.01513991, "auxiliary_loss_mlp": 0.01042858, "balance_loss_clip": 1.31954801, "balance_loss_mlp": 1.01909935, "epoch": 0.31107770930407336, "flos": 25750797177600.0, "grad_norm": 1.604442620155975, "language_loss": 0.83980739, "learning_rate": 3.226912425313001e-06, "loss": 0.86537588, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.23754883, "step": 5174, "time_per_iteration": 2.933851718902588 }, { "auxiliary_loss_clip": 0.01535713, "auxiliary_loss_mlp": 0.01046278, "balance_loss_clip": 1.33537483, "balance_loss_mlp": 1.02222192, "epoch": 0.3111378325567413, "flos": 19217590318080.0, "grad_norm": 1.8192178061925992, "language_loss": 0.85455048, "learning_rate": 3.2266048329715183e-06, "loss": 0.88037044, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.24060059, "step": 5175, "time_per_iteration": 2.8638949394226074 }, { "auxiliary_loss_clip": 0.01527358, "auxiliary_loss_mlp": 0.0104271, "balance_loss_clip": 1.33233225, "balance_loss_mlp": 1.01789093, "epoch": 0.3111979558094093, "flos": 23706965473920.0, "grad_norm": 3.0842465003929003, "language_loss": 0.85106754, "learning_rate": 3.2262971941170575e-06, "loss": 0.87676823, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.24853516, "step": 5176, "time_per_iteration": 2.95747971534729 }, { "auxiliary_loss_clip": 0.01518882, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.31981564, "balance_loss_mlp": 1.01435876, "epoch": 0.31125807906207725, "flos": 21042952289280.0, "grad_norm": 1.946237020718645, "language_loss": 0.82179785, "learning_rate": 3.2259895087612837e-06, "loss": 0.84737825, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.24816895, "step": 5177, "time_per_iteration": 2.9012701511383057 }, { "auxiliary_loss_clip": 0.01537647, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.33790851, "balance_loss_mlp": 1.01655912, "epoch": 0.3113182023147452, "flos": 23087191196160.0, "grad_norm": 1.858604024031705, "language_loss": 0.81410795, "learning_rate": 3.2256817769158657e-06, "loss": 0.83989775, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.24768066, "step": 5178, "time_per_iteration": 2.870715618133545 }, { "auxiliary_loss_clip": 0.01545534, "auxiliary_loss_mlp": 0.01045719, "balance_loss_clip": 1.3431747, "balance_loss_mlp": 1.02159095, "epoch": 0.3113783255674132, "flos": 11846637141120.0, "grad_norm": 2.4951595646321505, "language_loss": 0.81802797, "learning_rate": 3.225373998592471e-06, "loss": 0.8439405, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.24133301, "step": 5179, "time_per_iteration": 2.836204767227173 }, { "auxiliary_loss_clip": 0.0153006, "auxiliary_loss_mlp": 0.01045085, "balance_loss_clip": 1.33257258, "balance_loss_mlp": 1.02123117, "epoch": 0.31143844882008115, "flos": 16298160382080.0, "grad_norm": 3.7024828049705896, "language_loss": 0.79567409, "learning_rate": 3.2250661738027715e-06, "loss": 0.82142556, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.23840332, "step": 5180, "time_per_iteration": 2.8346080780029297 }, { "auxiliary_loss_clip": 0.01543318, "auxiliary_loss_mlp": 0.01039379, "balance_loss_clip": 1.34290564, "balance_loss_mlp": 1.01485753, "epoch": 0.3114985720727491, "flos": 23227151921280.0, "grad_norm": 1.5754857422044288, "language_loss": 0.84675384, "learning_rate": 3.22475830255844e-06, "loss": 0.87258077, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 2.00585938, "router_z_loss_mlp": 0.24536133, "step": 5181, "time_per_iteration": 2.8967292308807373 }, { "auxiliary_loss_clip": 0.01527664, "auxiliary_loss_mlp": 0.01044951, "balance_loss_clip": 1.33137488, "balance_loss_mlp": 1.02208662, "epoch": 0.3115586953254171, "flos": 30056794583040.0, "grad_norm": 1.7967906947190966, "language_loss": 0.75051314, "learning_rate": 3.2244503848711516e-06, "loss": 0.77623928, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.2286377, "step": 5182, "time_per_iteration": 2.9654948711395264 }, { "auxiliary_loss_clip": 0.01560698, "auxiliary_loss_mlp": 0.01045188, "balance_loss_clip": 1.3558594, "balance_loss_mlp": 1.02067876, "epoch": 0.3116188185780851, "flos": 25677355587840.0, "grad_norm": 2.321260486877012, "language_loss": 0.71566343, "learning_rate": 3.2241424207525815e-06, "loss": 0.74172229, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.24511719, "step": 5183, "time_per_iteration": 4.323462009429932 }, { "auxiliary_loss_clip": 0.01302694, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.18465185, "balance_loss_mlp": 1.00948, "epoch": 0.31167894183075306, "flos": 69538780350720.0, "grad_norm": 0.9620718447469093, "language_loss": 0.59799278, "learning_rate": 3.223834410214408e-06, "loss": 0.62136823, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 1.1796875, "router_z_loss_mlp": 0.25390625, "step": 5184, "time_per_iteration": 3.3900020122528076 }, { "auxiliary_loss_clip": 0.01539732, "auxiliary_loss_mlp": 0.01041314, "balance_loss_clip": 1.34001231, "balance_loss_mlp": 1.01780581, "epoch": 0.31173906508342103, "flos": 14948177973120.0, "grad_norm": 2.879271923158347, "language_loss": 0.71037436, "learning_rate": 3.223526353268311e-06, "loss": 0.73618484, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 1.99902344, "router_z_loss_mlp": 0.23535156, "step": 5185, "time_per_iteration": 2.8665223121643066 }, { "auxiliary_loss_clip": 0.0155297, "auxiliary_loss_mlp": 0.01045544, "balance_loss_clip": 1.34966826, "balance_loss_mlp": 1.02207112, "epoch": 0.311799188336089, "flos": 16183564007040.0, "grad_norm": 2.415996251650437, "language_loss": 0.65247631, "learning_rate": 3.2232182499259725e-06, "loss": 0.67846143, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.23474121, "step": 5186, "time_per_iteration": 2.881960868835449 }, { "auxiliary_loss_clip": 0.0156456, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.35657191, "balance_loss_mlp": 1.01805365, "epoch": 0.31185931158875696, "flos": 25020498556800.0, "grad_norm": 2.7693634291331053, "language_loss": 0.87538707, "learning_rate": 3.2229101001990747e-06, "loss": 0.90146208, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 2.08105469, "router_z_loss_mlp": 0.24902344, "step": 5187, "time_per_iteration": 2.8873414993286133 }, { "auxiliary_loss_clip": 0.01554112, "auxiliary_loss_mlp": 0.01049938, "balance_loss_clip": 1.35333478, "balance_loss_mlp": 1.02611971, "epoch": 0.3119194348414249, "flos": 37247084697600.0, "grad_norm": 1.4504454595466472, "language_loss": 0.63560283, "learning_rate": 3.2226019040993036e-06, "loss": 0.66164333, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.23815918, "step": 5188, "time_per_iteration": 4.508891344070435 }, { "auxiliary_loss_clip": 0.01553914, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.35317969, "balance_loss_mlp": 1.02216172, "epoch": 0.3119795580940929, "flos": 15021121870080.0, "grad_norm": 2.425076530004471, "language_loss": 0.839746, "learning_rate": 3.222293661638346e-06, "loss": 0.86574411, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.23742676, "step": 5189, "time_per_iteration": 2.8208768367767334 }, { "auxiliary_loss_clip": 0.01534394, "auxiliary_loss_mlp": 0.0103779, "balance_loss_clip": 1.33784556, "balance_loss_mlp": 1.01429439, "epoch": 0.31203968134676086, "flos": 16006837242240.0, "grad_norm": 2.172733196522863, "language_loss": 0.80042249, "learning_rate": 3.22198537282789e-06, "loss": 0.82614434, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.23498535, "step": 5190, "time_per_iteration": 2.8360445499420166 }, { "auxiliary_loss_clip": 0.01552596, "auxiliary_loss_mlp": 0.01047017, "balance_loss_clip": 1.3516891, "balance_loss_mlp": 1.02352071, "epoch": 0.3120998045994288, "flos": 23846699975040.0, "grad_norm": 1.643102514499009, "language_loss": 0.75625211, "learning_rate": 3.2216770376796262e-06, "loss": 0.78224826, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.23486328, "step": 5191, "time_per_iteration": 4.412895202636719 }, { "auxiliary_loss_clip": 0.01295506, "auxiliary_loss_mlp": 0.01060607, "balance_loss_clip": 1.17864168, "balance_loss_mlp": 1.03752828, "epoch": 0.3121599278520968, "flos": 69213496348800.0, "grad_norm": 0.8511461519675058, "language_loss": 0.63940513, "learning_rate": 3.221368656205247e-06, "loss": 0.66296625, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.23046875, "step": 5192, "time_per_iteration": 4.823137044906616 }, { "auxiliary_loss_clip": 0.01556034, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.3507638, "balance_loss_mlp": 1.01755166, "epoch": 0.31222005110476475, "flos": 23817127858560.0, "grad_norm": 1.8583944864877093, "language_loss": 0.80945331, "learning_rate": 3.221060228416446e-06, "loss": 0.83543372, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 2.0546875, "router_z_loss_mlp": 0.24462891, "step": 5193, "time_per_iteration": 2.8800132274627686 }, { "auxiliary_loss_clip": 0.0154647, "auxiliary_loss_mlp": 0.01042051, "balance_loss_clip": 1.3427031, "balance_loss_mlp": 1.01975918, "epoch": 0.3122801743574327, "flos": 25236434580480.0, "grad_norm": 2.633137300290936, "language_loss": 0.72753894, "learning_rate": 3.2207517543249183e-06, "loss": 0.75342417, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 2.0390625, "router_z_loss_mlp": 0.22290039, "step": 5194, "time_per_iteration": 2.910816192626953 }, { "auxiliary_loss_clip": 0.01548853, "auxiliary_loss_mlp": 0.01042773, "balance_loss_clip": 1.35042906, "balance_loss_mlp": 1.01959836, "epoch": 0.3123402976101007, "flos": 22976847832320.0, "grad_norm": 1.8223793779620374, "language_loss": 0.77111173, "learning_rate": 3.2204432339423616e-06, "loss": 0.79702801, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.23168945, "step": 5195, "time_per_iteration": 3.0503242015838623 }, { "auxiliary_loss_clip": 0.01551176, "auxiliary_loss_mlp": 0.01051757, "balance_loss_clip": 1.34809196, "balance_loss_mlp": 1.02816534, "epoch": 0.3124004208627687, "flos": 25203016656000.0, "grad_norm": 3.9614249544780793, "language_loss": 0.79404581, "learning_rate": 3.220134667280476e-06, "loss": 0.82007515, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.23620605, "step": 5196, "time_per_iteration": 3.0194263458251953 }, { "auxiliary_loss_clip": 0.01292123, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.17694044, "balance_loss_mlp": 1.00521314, "epoch": 0.31246054411543667, "flos": 67518113984640.0, "grad_norm": 0.7820391511203064, "language_loss": 0.54867625, "learning_rate": 3.2198260543509613e-06, "loss": 0.57185173, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.20214844, "step": 5197, "time_per_iteration": 3.472892999649048 }, { "auxiliary_loss_clip": 0.01520927, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 1.32513666, "balance_loss_mlp": 1.01759028, "epoch": 0.31252066736810463, "flos": 17867562664320.0, "grad_norm": 2.0505173880490664, "language_loss": 0.6785484, "learning_rate": 3.21951739516552e-06, "loss": 0.70416296, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.22949219, "step": 5198, "time_per_iteration": 2.8578035831451416 }, { "auxiliary_loss_clip": 0.01570537, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.36493611, "balance_loss_mlp": 1.01969886, "epoch": 0.3125807906207726, "flos": 18483264910080.0, "grad_norm": 13.875804938609175, "language_loss": 0.70782948, "learning_rate": 3.219208689735857e-06, "loss": 0.73397005, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 2.05664062, "router_z_loss_mlp": 0.23840332, "step": 5199, "time_per_iteration": 2.891808032989502 }, { "auxiliary_loss_clip": 0.01541757, "auxiliary_loss_mlp": 0.01046823, "balance_loss_clip": 1.34142911, "balance_loss_mlp": 1.02251601, "epoch": 0.31264091387344056, "flos": 18954165237120.0, "grad_norm": 1.7099846599399227, "language_loss": 0.79736131, "learning_rate": 3.2188999380736785e-06, "loss": 0.82324713, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 2.00195312, "router_z_loss_mlp": 0.24279785, "step": 5200, "time_per_iteration": 2.8861944675445557 }, { "auxiliary_loss_clip": 0.01525081, "auxiliary_loss_mlp": 0.01045025, "balance_loss_clip": 1.32905841, "balance_loss_mlp": 1.02155304, "epoch": 0.3127010371261085, "flos": 21477946227840.0, "grad_norm": 1.9848708334642413, "language_loss": 0.8404057, "learning_rate": 3.2185911401906917e-06, "loss": 0.86610675, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.23486328, "step": 5201, "time_per_iteration": 2.872189521789551 }, { "auxiliary_loss_clip": 0.01532403, "auxiliary_loss_mlp": 0.01047324, "balance_loss_clip": 1.33149314, "balance_loss_mlp": 1.02304113, "epoch": 0.3127611603787765, "flos": 15343148246400.0, "grad_norm": 2.6345413972506204, "language_loss": 0.70283234, "learning_rate": 3.2182822960986072e-06, "loss": 0.72862959, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 2.00878906, "router_z_loss_mlp": 0.24279785, "step": 5202, "time_per_iteration": 2.877743721008301 }, { "auxiliary_loss_clip": 0.01531369, "auxiliary_loss_mlp": 0.01047609, "balance_loss_clip": 1.32905948, "balance_loss_mlp": 1.02543628, "epoch": 0.31282128363144446, "flos": 17611919688960.0, "grad_norm": 1.78200571762854, "language_loss": 0.85548514, "learning_rate": 3.2179734058091358e-06, "loss": 0.88127494, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 2.0234375, "router_z_loss_mlp": 0.22167969, "step": 5203, "time_per_iteration": 2.8283791542053223 }, { "auxiliary_loss_clip": 0.01545477, "auxiliary_loss_mlp": 0.01043185, "balance_loss_clip": 1.3437655, "balance_loss_mlp": 1.02007031, "epoch": 0.3128814068841124, "flos": 26766989562240.0, "grad_norm": 2.2487768792208946, "language_loss": 0.61977184, "learning_rate": 3.2176644693339913e-06, "loss": 0.64565843, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.2310791, "step": 5204, "time_per_iteration": 2.943950891494751 }, { "auxiliary_loss_clip": 0.01522349, "auxiliary_loss_mlp": 0.01043736, "balance_loss_clip": 1.3267808, "balance_loss_mlp": 1.02126503, "epoch": 0.3129415301367804, "flos": 22282319865600.0, "grad_norm": 2.1912709112254634, "language_loss": 0.66353989, "learning_rate": 3.217355486684887e-06, "loss": 0.68920076, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.22460938, "step": 5205, "time_per_iteration": 2.9052481651306152 }, { "auxiliary_loss_clip": 0.01530698, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.32987893, "balance_loss_mlp": 1.02039385, "epoch": 0.31300165338944835, "flos": 26475078240000.0, "grad_norm": 1.5315952957260235, "language_loss": 0.77046096, "learning_rate": 3.2170464578735414e-06, "loss": 0.79621375, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 2.00683594, "router_z_loss_mlp": 0.24194336, "step": 5206, "time_per_iteration": 2.9199774265289307 }, { "auxiliary_loss_clip": 0.01525754, "auxiliary_loss_mlp": 0.01039929, "balance_loss_clip": 1.32850194, "balance_loss_mlp": 1.01732683, "epoch": 0.3130617766421163, "flos": 21954683134080.0, "grad_norm": 1.9403421751906427, "language_loss": 0.84218705, "learning_rate": 3.216737382911672e-06, "loss": 0.86784387, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.22607422, "step": 5207, "time_per_iteration": 2.8477680683135986 }, { "auxiliary_loss_clip": 0.01516852, "auxiliary_loss_mlp": 0.01044319, "balance_loss_clip": 1.32095516, "balance_loss_mlp": 1.02143085, "epoch": 0.3131218998947843, "flos": 23302674771840.0, "grad_norm": 1.6862624622443259, "language_loss": 0.72107327, "learning_rate": 3.216428261810999e-06, "loss": 0.74668503, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.22912598, "step": 5208, "time_per_iteration": 2.883291006088257 }, { "auxiliary_loss_clip": 0.01525261, "auxiliary_loss_mlp": 0.01042408, "balance_loss_clip": 1.32695103, "balance_loss_mlp": 1.01991332, "epoch": 0.3131820231474523, "flos": 21148861662720.0, "grad_norm": 1.7806045432004773, "language_loss": 0.75351351, "learning_rate": 3.2161190945832445e-06, "loss": 0.77919024, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.22485352, "step": 5209, "time_per_iteration": 2.9013309478759766 }, { "auxiliary_loss_clip": 0.01516601, "auxiliary_loss_mlp": 0.01042033, "balance_loss_clip": 1.3187722, "balance_loss_mlp": 1.02036047, "epoch": 0.31324214640012027, "flos": 23919100934400.0, "grad_norm": 2.0464994826118423, "language_loss": 0.77774125, "learning_rate": 3.2158098812401325e-06, "loss": 0.80332756, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.21679688, "step": 5210, "time_per_iteration": 2.8622682094573975 }, { "auxiliary_loss_clip": 0.01503822, "auxiliary_loss_mlp": 0.01045073, "balance_loss_clip": 1.31256413, "balance_loss_mlp": 1.0216608, "epoch": 0.31330226965278823, "flos": 22247137393920.0, "grad_norm": 1.826369790574698, "language_loss": 0.79877794, "learning_rate": 3.2155006217933874e-06, "loss": 0.82426691, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.23425293, "step": 5211, "time_per_iteration": 2.8524739742279053 }, { "auxiliary_loss_clip": 0.0151192, "auxiliary_loss_mlp": 0.01045783, "balance_loss_clip": 1.31535625, "balance_loss_mlp": 1.02355027, "epoch": 0.3133623929054562, "flos": 19763063354880.0, "grad_norm": 1.6899115389517014, "language_loss": 0.79842603, "learning_rate": 3.2151913162547367e-06, "loss": 0.82400304, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.22229004, "step": 5212, "time_per_iteration": 2.846198081970215 }, { "auxiliary_loss_clip": 0.01537863, "auxiliary_loss_mlp": 0.0104548, "balance_loss_clip": 1.33725739, "balance_loss_mlp": 1.02333117, "epoch": 0.31342251615812416, "flos": 27173678238720.0, "grad_norm": 2.4997708121336704, "language_loss": 0.71576476, "learning_rate": 3.2148819646359097e-06, "loss": 0.74159825, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 2.00292969, "router_z_loss_mlp": 0.22167969, "step": 5213, "time_per_iteration": 2.905486822128296 }, { "auxiliary_loss_clip": 0.01534276, "auxiliary_loss_mlp": 0.01046846, "balance_loss_clip": 1.33282804, "balance_loss_mlp": 1.02411246, "epoch": 0.31348263941079213, "flos": 20239393057920.0, "grad_norm": 2.0502132787192813, "language_loss": 0.78763318, "learning_rate": 3.2145725669486374e-06, "loss": 0.8134445, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 2.01757812, "router_z_loss_mlp": 0.22729492, "step": 5214, "time_per_iteration": 2.8909809589385986 }, { "auxiliary_loss_clip": 0.01513788, "auxiliary_loss_mlp": 0.01050246, "balance_loss_clip": 1.32107759, "balance_loss_mlp": 1.02810907, "epoch": 0.3135427626634601, "flos": 24618243870720.0, "grad_norm": 1.5611158660720477, "language_loss": 0.83127725, "learning_rate": 3.2142631232046517e-06, "loss": 0.85691762, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.22131348, "step": 5215, "time_per_iteration": 2.897444725036621 }, { "auxiliary_loss_clip": 0.01519698, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.32207906, "balance_loss_mlp": 1.02253783, "epoch": 0.31360288591612806, "flos": 20969736923520.0, "grad_norm": 1.9357098955004386, "language_loss": 0.80697727, "learning_rate": 3.213953633415686e-06, "loss": 0.83262825, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.2286377, "step": 5216, "time_per_iteration": 2.9133191108703613 }, { "auxiliary_loss_clip": 0.01539794, "auxiliary_loss_mlp": 0.01054179, "balance_loss_clip": 1.33668387, "balance_loss_mlp": 1.03068256, "epoch": 0.313663009168796, "flos": 26992065035520.0, "grad_norm": 1.925736636615209, "language_loss": 0.69326091, "learning_rate": 3.213644097593477e-06, "loss": 0.71920061, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 2.03320312, "router_z_loss_mlp": 0.23510742, "step": 5217, "time_per_iteration": 2.9203834533691406 }, { "auxiliary_loss_clip": 0.01520588, "auxiliary_loss_mlp": 0.01047751, "balance_loss_clip": 1.32344019, "balance_loss_mlp": 1.02498174, "epoch": 0.313723132421464, "flos": 18049990273920.0, "grad_norm": 1.6563119852808028, "language_loss": 0.81557512, "learning_rate": 3.2133345157497624e-06, "loss": 0.84125859, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.2277832, "step": 5218, "time_per_iteration": 2.884458541870117 }, { "auxiliary_loss_clip": 0.01538551, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.33795702, "balance_loss_mlp": 1.02077174, "epoch": 0.31378325567413196, "flos": 22498301134080.0, "grad_norm": 2.499166988739463, "language_loss": 0.6952045, "learning_rate": 3.2130248878962813e-06, "loss": 0.72103363, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.23583984, "step": 5219, "time_per_iteration": 4.353666543960571 }, { "auxiliary_loss_clip": 0.01527516, "auxiliary_loss_mlp": 0.01039845, "balance_loss_clip": 1.32917547, "balance_loss_mlp": 1.01806533, "epoch": 0.3138433789267999, "flos": 22429474513920.0, "grad_norm": 2.336972247684757, "language_loss": 0.80640525, "learning_rate": 3.2127152140447747e-06, "loss": 0.83207887, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.21777344, "step": 5220, "time_per_iteration": 2.9171934127807617 }, { "auxiliary_loss_clip": 0.01529801, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.33190656, "balance_loss_mlp": 1.02509296, "epoch": 0.3139035021794679, "flos": 13013694247680.0, "grad_norm": 1.899963810663342, "language_loss": 0.73692524, "learning_rate": 3.212405494206986e-06, "loss": 0.76268542, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 1.98046875, "router_z_loss_mlp": 0.21130371, "step": 5221, "time_per_iteration": 2.8489012718200684 }, { "auxiliary_loss_clip": 0.01516594, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.32162786, "balance_loss_mlp": 1.0196228, "epoch": 0.31396362543213585, "flos": 16954700699520.0, "grad_norm": 2.183669911061567, "language_loss": 0.82791519, "learning_rate": 3.2120957283946588e-06, "loss": 0.85349941, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.2220459, "step": 5222, "time_per_iteration": 2.866429090499878 }, { "auxiliary_loss_clip": 0.01528765, "auxiliary_loss_mlp": 0.01043463, "balance_loss_clip": 1.32723784, "balance_loss_mlp": 1.0194428, "epoch": 0.31402374868480387, "flos": 20166268181760.0, "grad_norm": 1.8265473903464173, "language_loss": 0.71368992, "learning_rate": 3.2117859166195407e-06, "loss": 0.73941219, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.24023438, "step": 5223, "time_per_iteration": 4.240664720535278 }, { "auxiliary_loss_clip": 0.01507189, "auxiliary_loss_mlp": 0.01040179, "balance_loss_clip": 1.31287456, "balance_loss_mlp": 1.01954424, "epoch": 0.31408387193747184, "flos": 21260833839360.0, "grad_norm": 1.6077611834206444, "language_loss": 0.81113374, "learning_rate": 3.211476058893379e-06, "loss": 0.83660734, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.2064209, "step": 5224, "time_per_iteration": 2.8568296432495117 }, { "auxiliary_loss_clip": 0.01536881, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.3335681, "balance_loss_mlp": 1.02570009, "epoch": 0.3141439951901398, "flos": 27494121047040.0, "grad_norm": 2.2164222924673322, "language_loss": 0.58971721, "learning_rate": 3.2111661552279243e-06, "loss": 0.61557943, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 2.03222656, "router_z_loss_mlp": 0.2364502, "step": 5225, "time_per_iteration": 2.8776493072509766 }, { "auxiliary_loss_clip": 0.01512103, "auxiliary_loss_mlp": 0.01037878, "balance_loss_clip": 1.31894815, "balance_loss_mlp": 1.01657569, "epoch": 0.31420411844280777, "flos": 17859644824320.0, "grad_norm": 2.8409601650015546, "language_loss": 0.82417637, "learning_rate": 3.2108562056349273e-06, "loss": 0.84967625, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.2130127, "step": 5226, "time_per_iteration": 4.236299514770508 }, { "auxiliary_loss_clip": 0.01526903, "auxiliary_loss_mlp": 0.01042291, "balance_loss_clip": 1.32679033, "balance_loss_mlp": 1.0193435, "epoch": 0.31426424169547573, "flos": 21627091647360.0, "grad_norm": 1.9581441963060615, "language_loss": 0.75195527, "learning_rate": 3.210546210126141e-06, "loss": 0.77764726, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.22937012, "step": 5227, "time_per_iteration": 4.316488742828369 }, { "auxiliary_loss_clip": 0.01534796, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.33753991, "balance_loss_mlp": 1.02002501, "epoch": 0.3143243649481437, "flos": 30933116732160.0, "grad_norm": 2.085507737641461, "language_loss": 0.69417882, "learning_rate": 3.2102361687133213e-06, "loss": 0.71995127, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.22412109, "step": 5228, "time_per_iteration": 2.947946786880493 }, { "auxiliary_loss_clip": 0.01526439, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.32943559, "balance_loss_mlp": 1.02018023, "epoch": 0.31438448820081166, "flos": 22831955424000.0, "grad_norm": 1.6756484783602597, "language_loss": 0.80487251, "learning_rate": 3.2099260814082254e-06, "loss": 0.83056325, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.22460938, "step": 5229, "time_per_iteration": 2.8820884227752686 }, { "auxiliary_loss_clip": 0.01524347, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.33006942, "balance_loss_mlp": 1.0145992, "epoch": 0.3144446114534796, "flos": 23301860365440.0, "grad_norm": 1.721774643312017, "language_loss": 0.70787567, "learning_rate": 3.209615948222611e-06, "loss": 0.7334919, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.22668457, "step": 5230, "time_per_iteration": 2.879142999649048 }, { "auxiliary_loss_clip": 0.01515786, "auxiliary_loss_mlp": 0.01040236, "balance_loss_clip": 1.32031965, "balance_loss_mlp": 1.01777673, "epoch": 0.3145047347061476, "flos": 31367929691520.0, "grad_norm": 1.5884728011171225, "language_loss": 0.7987082, "learning_rate": 3.209305769168239e-06, "loss": 0.82426846, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.2244873, "step": 5231, "time_per_iteration": 2.9461941719055176 }, { "auxiliary_loss_clip": 0.0152296, "auxiliary_loss_mlp": 0.01044344, "balance_loss_clip": 1.32691932, "balance_loss_mlp": 1.02146721, "epoch": 0.31456485795881556, "flos": 10896737667840.0, "grad_norm": 1.9667279456939306, "language_loss": 0.86124188, "learning_rate": 3.2089955442568704e-06, "loss": 0.88691491, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.22875977, "step": 5232, "time_per_iteration": 2.8304853439331055 }, { "auxiliary_loss_clip": 0.01502462, "auxiliary_loss_mlp": 0.01042192, "balance_loss_clip": 1.31073952, "balance_loss_mlp": 1.01926839, "epoch": 0.3146249812114835, "flos": 17101312410240.0, "grad_norm": 1.7666017913665626, "language_loss": 0.81087852, "learning_rate": 3.2086852735002692e-06, "loss": 0.83632505, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.22937012, "step": 5233, "time_per_iteration": 2.864776134490967 }, { "auxiliary_loss_clip": 0.015243, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.32465732, "balance_loss_mlp": 1.01601732, "epoch": 0.3146851044641515, "flos": 55309816022400.0, "grad_norm": 1.7269902783318816, "language_loss": 0.71928227, "learning_rate": 3.2083749569102024e-06, "loss": 0.74491644, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.23083496, "step": 5234, "time_per_iteration": 3.2603299617767334 }, { "auxiliary_loss_clip": 0.01531827, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 1.33189464, "balance_loss_mlp": 1.01586771, "epoch": 0.31474522771681945, "flos": 27027157017600.0, "grad_norm": 1.8672246072668386, "language_loss": 0.73177409, "learning_rate": 3.2080645944984356e-06, "loss": 0.7574712, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.22009277, "step": 5235, "time_per_iteration": 2.9153060913085938 }, { "auxiliary_loss_clip": 0.01511877, "auxiliary_loss_mlp": 0.01041486, "balance_loss_clip": 1.31696773, "balance_loss_mlp": 1.01971841, "epoch": 0.3148053509694875, "flos": 21261919714560.0, "grad_norm": 2.0085273557243264, "language_loss": 0.79497993, "learning_rate": 3.2077541862767384e-06, "loss": 0.82051361, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.21765137, "step": 5236, "time_per_iteration": 2.8782529830932617 }, { "auxiliary_loss_clip": 0.01524457, "auxiliary_loss_mlp": 0.01043275, "balance_loss_clip": 1.32378328, "balance_loss_mlp": 1.02093482, "epoch": 0.31486547422215544, "flos": 31261160666880.0, "grad_norm": 1.9275716802098413, "language_loss": 0.7710222, "learning_rate": 3.207443732256881e-06, "loss": 0.79669946, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.2232666, "step": 5237, "time_per_iteration": 2.949321985244751 }, { "auxiliary_loss_clip": 0.01509525, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.31825674, "balance_loss_mlp": 1.01887023, "epoch": 0.3149255974748234, "flos": 19838133757440.0, "grad_norm": 4.3634303758261055, "language_loss": 0.79835898, "learning_rate": 3.2071332324506372e-06, "loss": 0.82385933, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.21630859, "step": 5238, "time_per_iteration": 2.8863747119903564 }, { "auxiliary_loss_clip": 0.01276868, "auxiliary_loss_mlp": 0.01045551, "balance_loss_clip": 1.15720272, "balance_loss_mlp": 1.01751268, "epoch": 0.31498572072749137, "flos": 67711744321920.0, "grad_norm": 0.8461327112452774, "language_loss": 0.67941278, "learning_rate": 3.2068226868697795e-06, "loss": 0.70263696, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.28125, "step": 5239, "time_per_iteration": 3.3732187747955322 }, { "auxiliary_loss_clip": 0.01534579, "auxiliary_loss_mlp": 0.01052522, "balance_loss_clip": 1.33174729, "balance_loss_mlp": 1.0282867, "epoch": 0.31504584398015933, "flos": 19802906040960.0, "grad_norm": 2.1692149578045865, "language_loss": 0.83550441, "learning_rate": 3.2065120955260846e-06, "loss": 0.86137539, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 2.02636719, "router_z_loss_mlp": 0.2421875, "step": 5240, "time_per_iteration": 2.8989758491516113 }, { "auxiliary_loss_clip": 0.01516456, "auxiliary_loss_mlp": 0.01054737, "balance_loss_clip": 1.32227397, "balance_loss_mlp": 1.03140783, "epoch": 0.3151059672328273, "flos": 26626802613120.0, "grad_norm": 1.6520242784610895, "language_loss": 0.81553161, "learning_rate": 3.2062014584313302e-06, "loss": 0.84124351, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.2331543, "step": 5241, "time_per_iteration": 2.9146387577056885 }, { "auxiliary_loss_clip": 0.01504136, "auxiliary_loss_mlp": 0.01047963, "balance_loss_clip": 1.31421912, "balance_loss_mlp": 1.02528954, "epoch": 0.31516609048549526, "flos": 24214450861440.0, "grad_norm": 1.7965630009417497, "language_loss": 0.74901938, "learning_rate": 3.2058907755972956e-06, "loss": 0.77454031, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.22692871, "step": 5242, "time_per_iteration": 2.8918778896331787 }, { "auxiliary_loss_clip": 0.01516417, "auxiliary_loss_mlp": 0.01049539, "balance_loss_clip": 1.32390368, "balance_loss_mlp": 1.02641261, "epoch": 0.31522621373816323, "flos": 25969945582080.0, "grad_norm": 1.7035995248804716, "language_loss": 0.74588096, "learning_rate": 3.2055800470357626e-06, "loss": 0.77154052, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.23132324, "step": 5243, "time_per_iteration": 2.924443244934082 }, { "auxiliary_loss_clip": 0.01519849, "auxiliary_loss_mlp": 0.01047114, "balance_loss_clip": 1.32132244, "balance_loss_mlp": 1.02351022, "epoch": 0.3152863369908312, "flos": 21919002969600.0, "grad_norm": 1.8575271327684466, "language_loss": 0.65283012, "learning_rate": 3.205269272758513e-06, "loss": 0.67849976, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.23608398, "step": 5244, "time_per_iteration": 2.858238935470581 }, { "auxiliary_loss_clip": 0.01522177, "auxiliary_loss_mlp": 0.01048859, "balance_loss_clip": 1.32280707, "balance_loss_mlp": 1.02588785, "epoch": 0.31534646024349916, "flos": 16287482609280.0, "grad_norm": 2.6439472579696157, "language_loss": 0.92164338, "learning_rate": 3.2049584527773313e-06, "loss": 0.94735372, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.22961426, "step": 5245, "time_per_iteration": 2.82991361618042 }, { "auxiliary_loss_clip": 0.01528163, "auxiliary_loss_mlp": 0.01051156, "balance_loss_clip": 1.33055472, "balance_loss_mlp": 1.02724218, "epoch": 0.3154065834961671, "flos": 24727908562560.0, "grad_norm": 2.055457313310126, "language_loss": 0.75751138, "learning_rate": 3.2046475871040048e-06, "loss": 0.78330463, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 1.9765625, "router_z_loss_mlp": 0.23901367, "step": 5246, "time_per_iteration": 2.9090206623077393 }, { "auxiliary_loss_clip": 0.01514728, "auxiliary_loss_mlp": 0.01051202, "balance_loss_clip": 1.31849492, "balance_loss_mlp": 1.0271101, "epoch": 0.3154667067488351, "flos": 35384594728320.0, "grad_norm": 1.7688300449422854, "language_loss": 0.62302488, "learning_rate": 3.204336675750321e-06, "loss": 0.64868414, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.24084473, "step": 5247, "time_per_iteration": 2.994504690170288 }, { "auxiliary_loss_clip": 0.01526312, "auxiliary_loss_mlp": 0.0104899, "balance_loss_clip": 1.32643867, "balance_loss_mlp": 1.0252676, "epoch": 0.31552683000150306, "flos": 17465217488640.0, "grad_norm": 2.221969753357131, "language_loss": 0.8328104, "learning_rate": 3.2040257187280693e-06, "loss": 0.85856342, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.23730469, "step": 5248, "time_per_iteration": 2.8469808101654053 }, { "auxiliary_loss_clip": 0.01508548, "auxiliary_loss_mlp": 0.01048736, "balance_loss_clip": 1.31118441, "balance_loss_mlp": 1.02464414, "epoch": 0.3155869532541711, "flos": 18414935982720.0, "grad_norm": 1.7537379690476955, "language_loss": 0.86409593, "learning_rate": 3.2037147160490423e-06, "loss": 0.88966876, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.24108887, "step": 5249, "time_per_iteration": 2.822160243988037 }, { "auxiliary_loss_clip": 0.01521658, "auxiliary_loss_mlp": 0.01046359, "balance_loss_clip": 1.32430625, "balance_loss_mlp": 1.02217102, "epoch": 0.31564707650683904, "flos": 21589692180480.0, "grad_norm": 1.764742168449224, "language_loss": 0.87202895, "learning_rate": 3.2034036677250322e-06, "loss": 0.89770913, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.24206543, "step": 5250, "time_per_iteration": 2.885922908782959 }, { "auxiliary_loss_clip": 0.01510158, "auxiliary_loss_mlp": 0.01050408, "balance_loss_clip": 1.31560349, "balance_loss_mlp": 1.02730489, "epoch": 0.315707199759507, "flos": 21040373335680.0, "grad_norm": 2.4247903918457934, "language_loss": 0.69794762, "learning_rate": 3.203092573767835e-06, "loss": 0.72355324, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2310791, "step": 5251, "time_per_iteration": 2.8327720165252686 }, { "auxiliary_loss_clip": 0.01511291, "auxiliary_loss_mlp": 0.01048936, "balance_loss_clip": 1.31653547, "balance_loss_mlp": 1.02565479, "epoch": 0.31576732301217497, "flos": 26838892828800.0, "grad_norm": 2.8575251066363547, "language_loss": 0.79605997, "learning_rate": 3.202781434189246e-06, "loss": 0.82166219, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.23266602, "step": 5252, "time_per_iteration": 2.9043102264404297 }, { "auxiliary_loss_clip": 0.01516913, "auxiliary_loss_mlp": 0.01050886, "balance_loss_clip": 1.32251132, "balance_loss_mlp": 1.02833188, "epoch": 0.31582744626484294, "flos": 22721476325760.0, "grad_norm": 1.7315355888827477, "language_loss": 0.74690974, "learning_rate": 3.202470249001066e-06, "loss": 0.77258772, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.22546387, "step": 5253, "time_per_iteration": 2.8986923694610596 }, { "auxiliary_loss_clip": 0.01529953, "auxiliary_loss_mlp": 0.01044478, "balance_loss_clip": 1.33179021, "balance_loss_mlp": 1.02017117, "epoch": 0.3158875695175109, "flos": 23962608449280.0, "grad_norm": 1.6872903560455994, "language_loss": 0.74405801, "learning_rate": 3.2021590182150924e-06, "loss": 0.76980233, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.24291992, "step": 5254, "time_per_iteration": 4.355388879776001 }, { "auxiliary_loss_clip": 0.01527111, "auxiliary_loss_mlp": 0.01054375, "balance_loss_clip": 1.32715249, "balance_loss_mlp": 1.03059292, "epoch": 0.31594769277017887, "flos": 13269880160640.0, "grad_norm": 1.7944936812422003, "language_loss": 0.79472506, "learning_rate": 3.201847741843128e-06, "loss": 0.82053995, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.23791504, "step": 5255, "time_per_iteration": 2.889650821685791 }, { "auxiliary_loss_clip": 0.01511317, "auxiliary_loss_mlp": 0.01053735, "balance_loss_clip": 1.31614554, "balance_loss_mlp": 1.02864194, "epoch": 0.31600781602284683, "flos": 23378831049600.0, "grad_norm": 1.9118087741714693, "language_loss": 0.79162896, "learning_rate": 3.2015364198969772e-06, "loss": 0.81727946, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.25109863, "step": 5256, "time_per_iteration": 2.9084343910217285 }, { "auxiliary_loss_clip": 0.01486768, "auxiliary_loss_mlp": 0.01050812, "balance_loss_clip": 1.29965663, "balance_loss_mlp": 1.02789974, "epoch": 0.3160679392755148, "flos": 19838088512640.0, "grad_norm": 1.5054919922482648, "language_loss": 0.72011101, "learning_rate": 3.2012250523884453e-06, "loss": 0.74548686, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.22912598, "step": 5257, "time_per_iteration": 4.307504177093506 }, { "auxiliary_loss_clip": 0.0151132, "auxiliary_loss_mlp": 0.01051858, "balance_loss_clip": 1.31656551, "balance_loss_mlp": 1.02892244, "epoch": 0.31612806252818276, "flos": 20202762752640.0, "grad_norm": 1.9553211225643465, "language_loss": 0.77730095, "learning_rate": 3.2009136393293393e-06, "loss": 0.80293274, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.22924805, "step": 5258, "time_per_iteration": 2.8762292861938477 }, { "auxiliary_loss_clip": 0.01518873, "auxiliary_loss_mlp": 0.0104603, "balance_loss_clip": 1.32410932, "balance_loss_mlp": 1.0233326, "epoch": 0.31618818578085073, "flos": 24244701649920.0, "grad_norm": 2.1917291377371693, "language_loss": 0.74058425, "learning_rate": 3.200602180731467e-06, "loss": 0.76623327, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.22680664, "step": 5259, "time_per_iteration": 2.916705846786499 }, { "auxiliary_loss_clip": 0.015083, "auxiliary_loss_mlp": 0.01050237, "balance_loss_clip": 1.31198323, "balance_loss_mlp": 1.02838635, "epoch": 0.3162483090335187, "flos": 25092537557760.0, "grad_norm": 2.007742390705015, "language_loss": 0.67014736, "learning_rate": 3.20029067660664e-06, "loss": 0.69573271, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.21862793, "step": 5260, "time_per_iteration": 2.881371259689331 }, { "auxiliary_loss_clip": 0.01509499, "auxiliary_loss_mlp": 0.01044134, "balance_loss_clip": 1.31462896, "balance_loss_mlp": 1.02038753, "epoch": 0.31630843228618666, "flos": 26334665066880.0, "grad_norm": 1.7904469769722526, "language_loss": 0.73279345, "learning_rate": 3.1999791269666706e-06, "loss": 0.75832981, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.23742676, "step": 5261, "time_per_iteration": 4.309060096740723 }, { "auxiliary_loss_clip": 0.01287893, "auxiliary_loss_mlp": 0.01063661, "balance_loss_clip": 1.16592026, "balance_loss_mlp": 1.0230341, "epoch": 0.3163685555388547, "flos": 66791977885440.0, "grad_norm": 0.7499843547675183, "language_loss": 0.50696504, "learning_rate": 3.1996675318233716e-06, "loss": 0.53048062, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 1.21875, "router_z_loss_mlp": 0.40625, "step": 5262, "time_per_iteration": 4.732795476913452 }, { "auxiliary_loss_clip": 0.01510517, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.31520224, "balance_loss_mlp": 1.02059567, "epoch": 0.31642867879152264, "flos": 26007028335360.0, "grad_norm": 1.5625877479126398, "language_loss": 0.8592869, "learning_rate": 3.19935589118856e-06, "loss": 0.88482738, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.22924805, "step": 5263, "time_per_iteration": 2.872580051422119 }, { "auxiliary_loss_clip": 0.0149249, "auxiliary_loss_mlp": 0.01041287, "balance_loss_clip": 1.30498588, "balance_loss_mlp": 1.01515687, "epoch": 0.3164888020441906, "flos": 25785798670080.0, "grad_norm": 1.7156841462889296, "language_loss": 0.82287186, "learning_rate": 3.1990442050740535e-06, "loss": 0.84820962, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.26159668, "step": 5264, "time_per_iteration": 2.914870500564575 }, { "auxiliary_loss_clip": 0.01523112, "auxiliary_loss_mlp": 0.01043777, "balance_loss_clip": 1.32450235, "balance_loss_mlp": 1.02009058, "epoch": 0.3165489252968586, "flos": 19765868532480.0, "grad_norm": 2.22237906194405, "language_loss": 0.80272633, "learning_rate": 3.19873247349167e-06, "loss": 0.82839525, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.23706055, "step": 5265, "time_per_iteration": 2.9567506313323975 }, { "auxiliary_loss_clip": 0.01525378, "auxiliary_loss_mlp": 0.0104685, "balance_loss_clip": 1.32833397, "balance_loss_mlp": 1.02299643, "epoch": 0.31660904854952654, "flos": 23193960220800.0, "grad_norm": 1.5944691495450953, "language_loss": 0.7548219, "learning_rate": 3.1984206964532307e-06, "loss": 0.78054416, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 1.96972656, "router_z_loss_mlp": 0.23864746, "step": 5266, "time_per_iteration": 2.849513530731201 }, { "auxiliary_loss_clip": 0.01532687, "auxiliary_loss_mlp": 0.01045267, "balance_loss_clip": 1.33441365, "balance_loss_mlp": 1.02240241, "epoch": 0.3166691718021945, "flos": 20417160453120.0, "grad_norm": 2.957081227760116, "language_loss": 0.81195205, "learning_rate": 3.1981088739705585e-06, "loss": 0.83773154, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.22851562, "step": 5267, "time_per_iteration": 2.877411127090454 }, { "auxiliary_loss_clip": 0.01286289, "auxiliary_loss_mlp": 0.01065405, "balance_loss_clip": 1.16875172, "balance_loss_mlp": 1.03469718, "epoch": 0.31672929505486247, "flos": 70177421710080.0, "grad_norm": 0.7414074209008746, "language_loss": 0.57926869, "learning_rate": 3.197797006055478e-06, "loss": 0.60278559, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.30664062, "step": 5268, "time_per_iteration": 3.3855483531951904 }, { "auxiliary_loss_clip": 0.0151293, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.31633234, "balance_loss_mlp": 1.02098417, "epoch": 0.31678941830753043, "flos": 14363857635840.0, "grad_norm": 5.754239153416772, "language_loss": 0.75286281, "learning_rate": 3.197485092719815e-06, "loss": 0.77843225, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.23034668, "step": 5269, "time_per_iteration": 2.853336811065674 }, { "auxiliary_loss_clip": 0.01498405, "auxiliary_loss_mlp": 0.01049458, "balance_loss_clip": 1.30625451, "balance_loss_mlp": 1.02653432, "epoch": 0.3168495415601984, "flos": 22758061386240.0, "grad_norm": 2.1070560755882664, "language_loss": 0.80516315, "learning_rate": 3.1971731339753973e-06, "loss": 0.83064175, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.22924805, "step": 5270, "time_per_iteration": 2.905733346939087 }, { "auxiliary_loss_clip": 0.01529818, "auxiliary_loss_mlp": 0.01044668, "balance_loss_clip": 1.33165526, "balance_loss_mlp": 1.02158856, "epoch": 0.31690966481286637, "flos": 20123484583680.0, "grad_norm": 14.986266530674218, "language_loss": 0.80439854, "learning_rate": 3.1968611298340545e-06, "loss": 0.83014345, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.23059082, "step": 5271, "time_per_iteration": 2.822153091430664 }, { "auxiliary_loss_clip": 0.01522636, "auxiliary_loss_mlp": 0.01046836, "balance_loss_clip": 1.32689619, "balance_loss_mlp": 1.02233815, "epoch": 0.31696978806553433, "flos": 21188885328000.0, "grad_norm": 1.8573727606244586, "language_loss": 0.74254405, "learning_rate": 3.1965490803076173e-06, "loss": 0.76823872, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.24499512, "step": 5272, "time_per_iteration": 2.868091583251953 }, { "auxiliary_loss_clip": 0.01523113, "auxiliary_loss_mlp": 0.01041477, "balance_loss_clip": 1.3245306, "balance_loss_mlp": 1.01610899, "epoch": 0.3170299113182023, "flos": 43012548224640.0, "grad_norm": 2.1259278891969013, "language_loss": 0.70837665, "learning_rate": 3.1962369854079194e-06, "loss": 0.73402256, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 1.98144531, "router_z_loss_mlp": 0.25341797, "step": 5273, "time_per_iteration": 3.037597894668579 }, { "auxiliary_loss_clip": 0.01519779, "auxiliary_loss_mlp": 0.010447, "balance_loss_clip": 1.32350123, "balance_loss_mlp": 1.02092981, "epoch": 0.31709003457087026, "flos": 24470320060800.0, "grad_norm": 1.8123731010184865, "language_loss": 0.69346052, "learning_rate": 3.195924845146795e-06, "loss": 0.71910536, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.2376709, "step": 5274, "time_per_iteration": 2.8730521202087402 }, { "auxiliary_loss_clip": 0.01500674, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.31046975, "balance_loss_mlp": 1.02488208, "epoch": 0.3171501578235382, "flos": 24146302913280.0, "grad_norm": 1.4431414783523342, "language_loss": 0.81149828, "learning_rate": 3.195612659536081e-06, "loss": 0.83698249, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.22875977, "step": 5275, "time_per_iteration": 2.931072950363159 }, { "auxiliary_loss_clip": 0.01520139, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.32330358, "balance_loss_mlp": 1.02018261, "epoch": 0.31721028107620625, "flos": 18888912956160.0, "grad_norm": 1.7160727049883893, "language_loss": 0.73538011, "learning_rate": 3.1953004285876147e-06, "loss": 0.76100194, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.21862793, "step": 5276, "time_per_iteration": 2.853792428970337 }, { "auxiliary_loss_clip": 0.01493787, "auxiliary_loss_mlp": 0.01037137, "balance_loss_clip": 1.30352449, "balance_loss_mlp": 1.01510763, "epoch": 0.3172704043288742, "flos": 23158008587520.0, "grad_norm": 1.4007174992429483, "language_loss": 0.78867292, "learning_rate": 3.194988152313236e-06, "loss": 0.81398213, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.22033691, "step": 5277, "time_per_iteration": 3.05853533744812 }, { "auxiliary_loss_clip": 0.01524903, "auxiliary_loss_mlp": 0.01040798, "balance_loss_clip": 1.32909417, "balance_loss_mlp": 1.01849437, "epoch": 0.3173305275815422, "flos": 17867562664320.0, "grad_norm": 3.0554214098193877, "language_loss": 0.80961466, "learning_rate": 3.1946758307247878e-06, "loss": 0.83527172, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.22302246, "step": 5278, "time_per_iteration": 2.8470168113708496 }, { "auxiliary_loss_clip": 0.01274735, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.16251886, "balance_loss_mlp": 1.01250923, "epoch": 0.31739065083421014, "flos": 59996142368640.0, "grad_norm": 0.8875947285068659, "language_loss": 0.62866551, "learning_rate": 3.1943634638341114e-06, "loss": 0.6517697, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.23144531, "step": 5279, "time_per_iteration": 3.122891664505005 }, { "auxiliary_loss_clip": 0.01521288, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.32407463, "balance_loss_mlp": 1.0230031, "epoch": 0.3174507740868781, "flos": 23810341138560.0, "grad_norm": 1.6260835789316455, "language_loss": 0.82702506, "learning_rate": 3.194051051653053e-06, "loss": 0.85270727, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.23925781, "step": 5280, "time_per_iteration": 2.8883888721466064 }, { "auxiliary_loss_clip": 0.01503583, "auxiliary_loss_mlp": 0.01053099, "balance_loss_clip": 1.31151772, "balance_loss_mlp": 1.02969873, "epoch": 0.31751089733954607, "flos": 27651048572160.0, "grad_norm": 1.889970678277756, "language_loss": 0.79182565, "learning_rate": 3.19373859419346e-06, "loss": 0.81739253, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.23425293, "step": 5281, "time_per_iteration": 2.9371800422668457 }, { "auxiliary_loss_clip": 0.01501626, "auxiliary_loss_mlp": 0.01044286, "balance_loss_clip": 1.30857897, "balance_loss_mlp": 1.02063513, "epoch": 0.31757102059221404, "flos": 23779864126080.0, "grad_norm": 1.9306600025042602, "language_loss": 0.78867471, "learning_rate": 3.193426091467179e-06, "loss": 0.81413382, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.23657227, "step": 5282, "time_per_iteration": 2.9043967723846436 }, { "auxiliary_loss_clip": 0.01530759, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.3319757, "balance_loss_mlp": 1.0243187, "epoch": 0.317631143844882, "flos": 25275327125760.0, "grad_norm": 1.878553608123145, "language_loss": 0.68324184, "learning_rate": 3.193113543486061e-06, "loss": 0.70903909, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 1.98535156, "router_z_loss_mlp": 0.2467041, "step": 5283, "time_per_iteration": 2.8771603107452393 }, { "auxiliary_loss_clip": 0.01284059, "auxiliary_loss_mlp": 0.01035552, "balance_loss_clip": 1.16536498, "balance_loss_mlp": 1.00408041, "epoch": 0.31769126709754997, "flos": 55850601582720.0, "grad_norm": 0.7404405987643154, "language_loss": 0.52967417, "learning_rate": 3.192800950261958e-06, "loss": 0.55287027, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.31445312, "step": 5284, "time_per_iteration": 3.346916437149048 }, { "auxiliary_loss_clip": 0.01535345, "auxiliary_loss_mlp": 0.0105261, "balance_loss_clip": 1.33354211, "balance_loss_mlp": 1.03006721, "epoch": 0.31775139035021793, "flos": 16699193458560.0, "grad_norm": 1.7349913712188814, "language_loss": 0.71384311, "learning_rate": 3.1924883118067235e-06, "loss": 0.73972267, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.2253418, "step": 5285, "time_per_iteration": 2.851264238357544 }, { "auxiliary_loss_clip": 0.01282693, "auxiliary_loss_mlp": 0.01035553, "balance_loss_clip": 1.16454005, "balance_loss_mlp": 1.00579834, "epoch": 0.3178115136028859, "flos": 64256568981120.0, "grad_norm": 0.8183356448292595, "language_loss": 0.60528338, "learning_rate": 3.1921756281322123e-06, "loss": 0.62846583, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 1.1796875, "router_z_loss_mlp": 0.296875, "step": 5286, "time_per_iteration": 3.3379528522491455 }, { "auxiliary_loss_clip": 0.01519325, "auxiliary_loss_mlp": 0.01049498, "balance_loss_clip": 1.32145345, "balance_loss_mlp": 1.02535772, "epoch": 0.31787163685555386, "flos": 18706666325760.0, "grad_norm": 3.5314672190279617, "language_loss": 0.73881066, "learning_rate": 3.1918628992502826e-06, "loss": 0.76449889, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.24133301, "step": 5287, "time_per_iteration": 2.8710999488830566 }, { "auxiliary_loss_clip": 0.01528145, "auxiliary_loss_mlp": 0.01048708, "balance_loss_clip": 1.32783937, "balance_loss_mlp": 1.02441311, "epoch": 0.31793176010822183, "flos": 21334727877120.0, "grad_norm": 2.2439237754214325, "language_loss": 0.76906526, "learning_rate": 3.191550125172792e-06, "loss": 0.79483384, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.24291992, "step": 5288, "time_per_iteration": 2.9577395915985107 }, { "auxiliary_loss_clip": 0.01494953, "auxiliary_loss_mlp": 0.01045771, "balance_loss_clip": 1.30304337, "balance_loss_mlp": 1.02276373, "epoch": 0.31799188336088985, "flos": 20968243845120.0, "grad_norm": 2.936952720531677, "language_loss": 0.88601214, "learning_rate": 3.1912373059116007e-06, "loss": 0.91141939, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.23022461, "step": 5289, "time_per_iteration": 4.465365171432495 }, { "auxiliary_loss_clip": 0.01493681, "auxiliary_loss_mlp": 0.01043231, "balance_loss_clip": 1.30157328, "balance_loss_mlp": 1.01959229, "epoch": 0.3180520066135578, "flos": 22502101697280.0, "grad_norm": 2.3649445543953256, "language_loss": 0.68690026, "learning_rate": 3.190924441478572e-06, "loss": 0.71226937, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.23632812, "step": 5290, "time_per_iteration": 2.905673027038574 }, { "auxiliary_loss_clip": 0.01527473, "auxiliary_loss_mlp": 0.01047425, "balance_loss_clip": 1.32548666, "balance_loss_mlp": 1.02328515, "epoch": 0.3181121298662258, "flos": 27246803114880.0, "grad_norm": 1.8617056150926263, "language_loss": 0.80182552, "learning_rate": 3.1906115318855687e-06, "loss": 0.82757449, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.24121094, "step": 5291, "time_per_iteration": 2.8867008686065674 }, { "auxiliary_loss_clip": 0.01513625, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.31694341, "balance_loss_mlp": 1.01748037, "epoch": 0.31817225311889374, "flos": 23189209516800.0, "grad_norm": 1.906523467256779, "language_loss": 0.80571425, "learning_rate": 3.1902985771444577e-06, "loss": 0.83126903, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.24389648, "step": 5292, "time_per_iteration": 4.377549648284912 }, { "auxiliary_loss_clip": 0.0148209, "auxiliary_loss_mlp": 0.01041149, "balance_loss_clip": 1.29296076, "balance_loss_mlp": 1.01731908, "epoch": 0.3182323763715617, "flos": 23269347336960.0, "grad_norm": 1.650341908358965, "language_loss": 0.75991225, "learning_rate": 3.1899855772671043e-06, "loss": 0.78514469, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.23828125, "step": 5293, "time_per_iteration": 2.887141227722168 }, { "auxiliary_loss_clip": 0.01509302, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.31486726, "balance_loss_mlp": 1.02204669, "epoch": 0.3182924996242297, "flos": 29026938268800.0, "grad_norm": 1.8377585286485858, "language_loss": 0.75709283, "learning_rate": 3.189672532265379e-06, "loss": 0.78263247, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.22607422, "step": 5294, "time_per_iteration": 2.934217691421509 }, { "auxiliary_loss_clip": 0.01501334, "auxiliary_loss_mlp": 0.01040901, "balance_loss_clip": 1.30652428, "balance_loss_mlp": 1.01448417, "epoch": 0.31835262287689764, "flos": 20458948665600.0, "grad_norm": 2.126492740590171, "language_loss": 0.77536148, "learning_rate": 3.189359442151152e-06, "loss": 0.80078375, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2644043, "step": 5295, "time_per_iteration": 3.0163979530334473 }, { "auxiliary_loss_clip": 0.01530248, "auxiliary_loss_mlp": 0.01050999, "balance_loss_clip": 1.32961321, "balance_loss_mlp": 1.02732432, "epoch": 0.3184127461295656, "flos": 25130479962240.0, "grad_norm": 1.5525324505108404, "language_loss": 0.7049666, "learning_rate": 3.189046306936296e-06, "loss": 0.73077905, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 2.00390625, "router_z_loss_mlp": 0.23681641, "step": 5296, "time_per_iteration": 4.373674154281616 }, { "auxiliary_loss_clip": 0.01499281, "auxiliary_loss_mlp": 0.01044929, "balance_loss_clip": 1.30503559, "balance_loss_mlp": 1.02065754, "epoch": 0.31847286938223357, "flos": 25560768441600.0, "grad_norm": 2.137614688164464, "language_loss": 0.78315407, "learning_rate": 3.1887331266326846e-06, "loss": 0.80859613, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.24243164, "step": 5297, "time_per_iteration": 4.324610948562622 }, { "auxiliary_loss_clip": 0.01498221, "auxiliary_loss_mlp": 0.01040799, "balance_loss_clip": 1.30472946, "balance_loss_mlp": 1.01576471, "epoch": 0.31853299263490154, "flos": 27794040698880.0, "grad_norm": 1.8989612074180957, "language_loss": 0.80110586, "learning_rate": 3.1884199012521942e-06, "loss": 0.82649601, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.25012207, "step": 5298, "time_per_iteration": 2.925905466079712 }, { "auxiliary_loss_clip": 0.01526982, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.326823, "balance_loss_mlp": 1.01684129, "epoch": 0.3185931158875695, "flos": 22715911215360.0, "grad_norm": 1.7419058099837923, "language_loss": 0.75340676, "learning_rate": 3.1881066308067016e-06, "loss": 0.77908874, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 2.00097656, "router_z_loss_mlp": 0.24389648, "step": 5299, "time_per_iteration": 2.9240753650665283 }, { "auxiliary_loss_clip": 0.01507897, "auxiliary_loss_mlp": 0.01044789, "balance_loss_clip": 1.30929267, "balance_loss_mlp": 1.0193733, "epoch": 0.31865323914023747, "flos": 24582201747840.0, "grad_norm": 4.567643443459848, "language_loss": 0.79424596, "learning_rate": 3.1877933153080873e-06, "loss": 0.81977284, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.25427246, "step": 5300, "time_per_iteration": 2.895050287246704 }, { "auxiliary_loss_clip": 0.01505301, "auxiliary_loss_mlp": 0.01042003, "balance_loss_clip": 1.30979776, "balance_loss_mlp": 1.01622987, "epoch": 0.31871336239290543, "flos": 18195606599040.0, "grad_norm": 1.815257939279845, "language_loss": 0.84755194, "learning_rate": 3.1874799547682304e-06, "loss": 0.87302488, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.25793457, "step": 5301, "time_per_iteration": 2.849749803543091 }, { "auxiliary_loss_clip": 0.01499102, "auxiliary_loss_mlp": 0.01047193, "balance_loss_clip": 1.30616534, "balance_loss_mlp": 1.02172947, "epoch": 0.31877348564557345, "flos": 21834566893440.0, "grad_norm": 3.0594244529928143, "language_loss": 0.77962416, "learning_rate": 3.187166549199015e-06, "loss": 0.80508709, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.25439453, "step": 5302, "time_per_iteration": 2.855435371398926 }, { "auxiliary_loss_clip": 0.01480554, "auxiliary_loss_mlp": 0.01039628, "balance_loss_clip": 1.28964376, "balance_loss_mlp": 1.01378393, "epoch": 0.3188336088982414, "flos": 22024686119040.0, "grad_norm": 1.6729728799638008, "language_loss": 0.80438745, "learning_rate": 3.1868530986123255e-06, "loss": 0.82958925, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.25866699, "step": 5303, "time_per_iteration": 2.85546612739563 }, { "auxiliary_loss_clip": 0.01529562, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.32491672, "balance_loss_mlp": 1.01993251, "epoch": 0.3188937321509094, "flos": 20057553630720.0, "grad_norm": 2.7398417132049055, "language_loss": 0.74048752, "learning_rate": 3.186539603020047e-06, "loss": 0.76624668, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 2.04589844, "router_z_loss_mlp": 0.26416016, "step": 5304, "time_per_iteration": 2.8784852027893066 }, { "auxiliary_loss_clip": 0.01499255, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.3081212, "balance_loss_mlp": 1.01571929, "epoch": 0.31895385540357735, "flos": 25859014035840.0, "grad_norm": 3.223280749264502, "language_loss": 0.72905147, "learning_rate": 3.186226062434068e-06, "loss": 0.75444299, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.24182129, "step": 5305, "time_per_iteration": 2.881859540939331 }, { "auxiliary_loss_clip": 0.01507751, "auxiliary_loss_mlp": 0.01040996, "balance_loss_clip": 1.31210828, "balance_loss_mlp": 1.01612926, "epoch": 0.3190139786562453, "flos": 23488314762240.0, "grad_norm": 1.9060531785785426, "language_loss": 0.64715576, "learning_rate": 3.1859124768662778e-06, "loss": 0.6726433, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.2487793, "step": 5306, "time_per_iteration": 2.8981449604034424 }, { "auxiliary_loss_clip": 0.01502421, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.30624664, "balance_loss_mlp": 1.02013421, "epoch": 0.3190741019089133, "flos": 29107347557760.0, "grad_norm": 7.353705614910065, "language_loss": 0.80890465, "learning_rate": 3.1855988463285678e-06, "loss": 0.83437699, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.24682617, "step": 5307, "time_per_iteration": 2.9293360710144043 }, { "auxiliary_loss_clip": 0.01490708, "auxiliary_loss_mlp": 0.01036762, "balance_loss_clip": 1.29910719, "balance_loss_mlp": 1.01167989, "epoch": 0.31913422516158124, "flos": 17138576142720.0, "grad_norm": 1.943762525310381, "language_loss": 0.79073042, "learning_rate": 3.1852851708328308e-06, "loss": 0.81600517, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.25085449, "step": 5308, "time_per_iteration": 2.82450008392334 }, { "auxiliary_loss_clip": 0.01537431, "auxiliary_loss_mlp": 0.01043755, "balance_loss_clip": 1.33141041, "balance_loss_mlp": 1.01705205, "epoch": 0.3191943484142492, "flos": 16078197571200.0, "grad_norm": 2.4926081454639437, "language_loss": 0.75529838, "learning_rate": 3.184971450390961e-06, "loss": 0.78111023, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 2.06445312, "router_z_loss_mlp": 0.26733398, "step": 5309, "time_per_iteration": 2.9043562412261963 }, { "auxiliary_loss_clip": 0.01509188, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.31470251, "balance_loss_mlp": 1.0172987, "epoch": 0.3192544716669172, "flos": 22976440629120.0, "grad_norm": 2.2235721323288473, "language_loss": 0.83737171, "learning_rate": 3.184657685014856e-06, "loss": 0.86287451, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.23803711, "step": 5310, "time_per_iteration": 2.87599778175354 }, { "auxiliary_loss_clip": 0.01494874, "auxiliary_loss_mlp": 0.01043144, "balance_loss_clip": 1.30101752, "balance_loss_mlp": 1.01832438, "epoch": 0.31931459491958514, "flos": 26881676426880.0, "grad_norm": 1.3560862230278141, "language_loss": 0.7887913, "learning_rate": 3.184343874716412e-06, "loss": 0.81417143, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24804688, "step": 5311, "time_per_iteration": 2.9371345043182373 }, { "auxiliary_loss_clip": 0.0149422, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.30130279, "balance_loss_mlp": 1.01704729, "epoch": 0.3193747181722531, "flos": 21846873479040.0, "grad_norm": 1.6585465383861815, "language_loss": 0.8481583, "learning_rate": 3.1840300195075295e-06, "loss": 0.87351346, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.24267578, "step": 5312, "time_per_iteration": 2.851501226425171 }, { "auxiliary_loss_clip": 0.01519629, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.31977987, "balance_loss_mlp": 1.01713061, "epoch": 0.31943484142492107, "flos": 18332128719360.0, "grad_norm": 3.8154055635796147, "language_loss": 0.80432582, "learning_rate": 3.1837161194001102e-06, "loss": 0.82994223, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 0.24902344, "step": 5313, "time_per_iteration": 2.912537097930908 }, { "auxiliary_loss_clip": 0.01509123, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.31395411, "balance_loss_mlp": 1.02055681, "epoch": 0.31949496467758903, "flos": 21625643813760.0, "grad_norm": 2.2916589673116188, "language_loss": 0.877514, "learning_rate": 3.183402174406057e-06, "loss": 0.90305889, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.24780273, "step": 5314, "time_per_iteration": 2.8988044261932373 }, { "auxiliary_loss_clip": 0.01503583, "auxiliary_loss_mlp": 0.01049566, "balance_loss_clip": 1.30901885, "balance_loss_mlp": 1.02305388, "epoch": 0.31955508793025705, "flos": 21769676570880.0, "grad_norm": 1.8344217502432987, "language_loss": 0.80581528, "learning_rate": 3.1830881845372747e-06, "loss": 0.83134675, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.26538086, "step": 5315, "time_per_iteration": 2.8809375762939453 }, { "auxiliary_loss_clip": 0.01510424, "auxiliary_loss_mlp": 0.01046538, "balance_loss_clip": 1.31520784, "balance_loss_mlp": 1.02199328, "epoch": 0.319615211182925, "flos": 17172944208000.0, "grad_norm": 2.009734922160297, "language_loss": 0.68688428, "learning_rate": 3.18277414980567e-06, "loss": 0.71245384, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.24536133, "step": 5316, "time_per_iteration": 2.9032540321350098 }, { "auxiliary_loss_clip": 0.01502258, "auxiliary_loss_mlp": 0.01043002, "balance_loss_clip": 1.31049979, "balance_loss_mlp": 1.01900518, "epoch": 0.319675334435593, "flos": 28124980300800.0, "grad_norm": 1.4697451563879496, "language_loss": 0.70284879, "learning_rate": 3.1824600702231515e-06, "loss": 0.72830141, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.2401123, "step": 5317, "time_per_iteration": 2.927469491958618 }, { "auxiliary_loss_clip": 0.01279234, "auxiliary_loss_mlp": 0.01053905, "balance_loss_clip": 1.15940142, "balance_loss_mlp": 1.02453196, "epoch": 0.31973545768826095, "flos": 69536988541440.0, "grad_norm": 0.7358679920987604, "language_loss": 0.53204483, "learning_rate": 3.182145945801628e-06, "loss": 0.55537623, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.29296875, "step": 5318, "time_per_iteration": 3.518836259841919 }, { "auxiliary_loss_clip": 0.01495398, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.30451035, "balance_loss_mlp": 1.02344251, "epoch": 0.3197955809409289, "flos": 13707181584000.0, "grad_norm": 1.8950739840740365, "language_loss": 0.85014379, "learning_rate": 3.181831776553012e-06, "loss": 0.87557364, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.24145508, "step": 5319, "time_per_iteration": 2.9108405113220215 }, { "auxiliary_loss_clip": 0.01487587, "auxiliary_loss_mlp": 0.01046387, "balance_loss_clip": 1.29560065, "balance_loss_mlp": 1.02167487, "epoch": 0.3198557041935969, "flos": 33230917353600.0, "grad_norm": 1.6935531941561635, "language_loss": 0.64388299, "learning_rate": 3.1815175624892165e-06, "loss": 0.66922277, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.24731445, "step": 5320, "time_per_iteration": 2.966646909713745 }, { "auxiliary_loss_clip": 0.01504953, "auxiliary_loss_mlp": 0.0104249, "balance_loss_clip": 1.30914879, "balance_loss_mlp": 1.01811206, "epoch": 0.31991582744626484, "flos": 23742419414400.0, "grad_norm": 1.981286110337951, "language_loss": 0.71396059, "learning_rate": 3.1812033036221567e-06, "loss": 0.73943508, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.24389648, "step": 5321, "time_per_iteration": 2.9343106746673584 }, { "auxiliary_loss_clip": 0.01530181, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.32853031, "balance_loss_mlp": 1.02604198, "epoch": 0.3199759506989328, "flos": 18559828391040.0, "grad_norm": 2.6224458117916005, "language_loss": 0.87034047, "learning_rate": 3.180888999963749e-06, "loss": 0.89615887, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 2.01660156, "router_z_loss_mlp": 0.25622559, "step": 5322, "time_per_iteration": 2.8440749645233154 }, { "auxiliary_loss_clip": 0.01500771, "auxiliary_loss_mlp": 0.01041034, "balance_loss_clip": 1.30723047, "balance_loss_mlp": 1.0174191, "epoch": 0.3200360739516008, "flos": 22429022065920.0, "grad_norm": 1.7146123566441598, "language_loss": 0.83791494, "learning_rate": 3.1805746515259123e-06, "loss": 0.86333299, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.23596191, "step": 5323, "time_per_iteration": 2.9148483276367188 }, { "auxiliary_loss_clip": 0.01488063, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.29705143, "balance_loss_mlp": 1.01342797, "epoch": 0.32009619720426874, "flos": 20605062683520.0, "grad_norm": 1.729706145607028, "language_loss": 0.79316449, "learning_rate": 3.1802602583205663e-06, "loss": 0.81843609, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.25671387, "step": 5324, "time_per_iteration": 2.8997793197631836 }, { "auxiliary_loss_clip": 0.01507107, "auxiliary_loss_mlp": 0.01045589, "balance_loss_clip": 1.31325972, "balance_loss_mlp": 1.02042389, "epoch": 0.3201563204569367, "flos": 18156035381760.0, "grad_norm": 1.925002493463545, "language_loss": 0.80968869, "learning_rate": 3.1799458203596333e-06, "loss": 0.83521569, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.25146484, "step": 5325, "time_per_iteration": 4.305352449417114 }, { "auxiliary_loss_clip": 0.0151269, "auxiliary_loss_mlp": 0.01040029, "balance_loss_clip": 1.31706464, "balance_loss_mlp": 1.01628256, "epoch": 0.32021644370960467, "flos": 31696245095040.0, "grad_norm": 1.5774943141256061, "language_loss": 0.75823343, "learning_rate": 3.179631337655037e-06, "loss": 0.78376067, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.23754883, "step": 5326, "time_per_iteration": 2.9322025775909424 }, { "auxiliary_loss_clip": 0.01493366, "auxiliary_loss_mlp": 0.01044239, "balance_loss_clip": 1.30160618, "balance_loss_mlp": 1.01915693, "epoch": 0.32027656696227264, "flos": 26876111316480.0, "grad_norm": 1.696775430110584, "language_loss": 0.8155514, "learning_rate": 3.179316810218701e-06, "loss": 0.84092748, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.25097656, "step": 5327, "time_per_iteration": 4.351373672485352 }, { "auxiliary_loss_clip": 0.01511899, "auxiliary_loss_mlp": 0.01044179, "balance_loss_clip": 1.31279826, "balance_loss_mlp": 1.01993132, "epoch": 0.32033669021494066, "flos": 24180444754560.0, "grad_norm": 2.0529185265331766, "language_loss": 0.78308213, "learning_rate": 3.179002238062554e-06, "loss": 0.80864286, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 1.99121094, "router_z_loss_mlp": 0.24267578, "step": 5328, "time_per_iteration": 2.9056758880615234 }, { "auxiliary_loss_clip": 0.01514573, "auxiliary_loss_mlp": 0.01045751, "balance_loss_clip": 1.31734669, "balance_loss_mlp": 1.02051449, "epoch": 0.3203968134676086, "flos": 24471089222400.0, "grad_norm": 1.5627652265682492, "language_loss": 0.74572384, "learning_rate": 3.178687621198524e-06, "loss": 0.77132702, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.25256348, "step": 5329, "time_per_iteration": 2.897634267807007 }, { "auxiliary_loss_clip": 0.01478875, "auxiliary_loss_mlp": 0.01039848, "balance_loss_clip": 1.29226398, "balance_loss_mlp": 1.01617289, "epoch": 0.3204569367202766, "flos": 18013857661440.0, "grad_norm": 1.6730705785783615, "language_loss": 0.72461259, "learning_rate": 3.1783729596385415e-06, "loss": 0.74979985, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.23669434, "step": 5330, "time_per_iteration": 2.849073886871338 }, { "auxiliary_loss_clip": 0.01515442, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.3160038, "balance_loss_mlp": 1.01872468, "epoch": 0.32051705997294455, "flos": 30601407968640.0, "grad_norm": 1.7551329345286848, "language_loss": 0.80693853, "learning_rate": 3.1780582533945376e-06, "loss": 0.83253872, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 1.9921875, "router_z_loss_mlp": 0.25866699, "step": 5331, "time_per_iteration": 4.464034795761108 }, { "auxiliary_loss_clip": 0.01270828, "auxiliary_loss_mlp": 0.01065461, "balance_loss_clip": 1.15670526, "balance_loss_mlp": 1.04066586, "epoch": 0.3205771832256125, "flos": 68447065115520.0, "grad_norm": 0.8561338478182748, "language_loss": 0.578354, "learning_rate": 3.177743502478447e-06, "loss": 0.60171694, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.24804688, "step": 5332, "time_per_iteration": 4.694497585296631 }, { "auxiliary_loss_clip": 0.01514508, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.31447136, "balance_loss_mlp": 1.01855707, "epoch": 0.3206373064782805, "flos": 30455701153920.0, "grad_norm": 1.5448587946852224, "language_loss": 0.73884761, "learning_rate": 3.177428706902205e-06, "loss": 0.76441741, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 1.99804688, "router_z_loss_mlp": 0.23937988, "step": 5333, "time_per_iteration": 2.963329553604126 }, { "auxiliary_loss_clip": 0.01507894, "auxiliary_loss_mlp": 0.01042357, "balance_loss_clip": 1.31236005, "balance_loss_mlp": 1.01732326, "epoch": 0.32069742973094845, "flos": 22064664539520.0, "grad_norm": 2.0280929843213804, "language_loss": 0.71791255, "learning_rate": 3.1771138666777485e-06, "loss": 0.74341512, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 1.95507812, "router_z_loss_mlp": 0.25048828, "step": 5334, "time_per_iteration": 2.9486796855926514 }, { "auxiliary_loss_clip": 0.01496207, "auxiliary_loss_mlp": 0.01043404, "balance_loss_clip": 1.30273485, "balance_loss_mlp": 1.01896608, "epoch": 0.3207575529836164, "flos": 22064031112320.0, "grad_norm": 2.0313439405830236, "language_loss": 0.78361452, "learning_rate": 3.1767989818170156e-06, "loss": 0.80901057, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.24450684, "step": 5335, "time_per_iteration": 2.906370162963867 }, { "auxiliary_loss_clip": 0.01504746, "auxiliary_loss_mlp": 0.01044838, "balance_loss_clip": 1.31017196, "balance_loss_mlp": 1.02079344, "epoch": 0.3208176762362844, "flos": 34070609197440.0, "grad_norm": 1.5786535116271179, "language_loss": 0.69107306, "learning_rate": 3.1764840523319477e-06, "loss": 0.71656895, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.24047852, "step": 5336, "time_per_iteration": 2.991572856903076 }, { "auxiliary_loss_clip": 0.01511328, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.31619787, "balance_loss_mlp": 1.02249932, "epoch": 0.32087779948895234, "flos": 21808704850560.0, "grad_norm": 1.988523406493965, "language_loss": 0.79552644, "learning_rate": 3.176169078234487e-06, "loss": 0.82111096, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.24609375, "step": 5337, "time_per_iteration": 2.8606202602386475 }, { "auxiliary_loss_clip": 0.01482128, "auxiliary_loss_mlp": 0.0104389, "balance_loss_clip": 1.29272532, "balance_loss_mlp": 1.02077556, "epoch": 0.3209379227416203, "flos": 21444075855360.0, "grad_norm": 1.8553450019254087, "language_loss": 0.75340986, "learning_rate": 3.1758540595365766e-06, "loss": 0.77867007, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.23120117, "step": 5338, "time_per_iteration": 2.8953824043273926 }, { "auxiliary_loss_clip": 0.0150145, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.30416012, "balance_loss_mlp": 1.02173352, "epoch": 0.3209980459942883, "flos": 25860235645440.0, "grad_norm": 2.7753681460445536, "language_loss": 0.63180691, "learning_rate": 3.1755389962501626e-06, "loss": 0.65728736, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.24865723, "step": 5339, "time_per_iteration": 2.869962215423584 }, { "auxiliary_loss_clip": 0.01513265, "auxiliary_loss_mlp": 0.01049265, "balance_loss_clip": 1.31550992, "balance_loss_mlp": 1.02319348, "epoch": 0.32105816924695624, "flos": 19108559053440.0, "grad_norm": 2.4968396419650647, "language_loss": 0.82650727, "learning_rate": 3.175223888387192e-06, "loss": 0.85213256, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.26049805, "step": 5340, "time_per_iteration": 2.867220401763916 }, { "auxiliary_loss_clip": 0.01499916, "auxiliary_loss_mlp": 0.01041393, "balance_loss_clip": 1.30556405, "balance_loss_mlp": 1.01763427, "epoch": 0.3211182924996242, "flos": 16590659886720.0, "grad_norm": 3.1508307507214273, "language_loss": 0.7701298, "learning_rate": 3.1749087359596137e-06, "loss": 0.7955429, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.23779297, "step": 5341, "time_per_iteration": 2.847330093383789 }, { "auxiliary_loss_clip": 0.01491606, "auxiliary_loss_mlp": 0.01047078, "balance_loss_clip": 1.30107355, "balance_loss_mlp": 1.02279544, "epoch": 0.3211784157522922, "flos": 22681859863680.0, "grad_norm": 1.6465805022781819, "language_loss": 0.79932237, "learning_rate": 3.1745935389793786e-06, "loss": 0.82470924, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.24279785, "step": 5342, "time_per_iteration": 2.8411126136779785 }, { "auxiliary_loss_clip": 0.01509201, "auxiliary_loss_mlp": 0.01043377, "balance_loss_clip": 1.31171179, "balance_loss_mlp": 1.01904678, "epoch": 0.3212385390049602, "flos": 20568613357440.0, "grad_norm": 4.038830366834268, "language_loss": 0.76084775, "learning_rate": 3.174278297458438e-06, "loss": 0.7863735, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 1.97558594, "router_z_loss_mlp": 0.24365234, "step": 5343, "time_per_iteration": 2.8592886924743652 }, { "auxiliary_loss_clip": 0.01504663, "auxiliary_loss_mlp": 0.01043492, "balance_loss_clip": 1.30916917, "balance_loss_mlp": 1.01911414, "epoch": 0.32129866225762815, "flos": 24802074069120.0, "grad_norm": 1.5599903316441746, "language_loss": 0.83774114, "learning_rate": 3.173963011408748e-06, "loss": 0.86322272, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.24389648, "step": 5344, "time_per_iteration": 2.98291277885437 }, { "auxiliary_loss_clip": 0.01510296, "auxiliary_loss_mlp": 0.01048345, "balance_loss_clip": 1.31282878, "balance_loss_mlp": 1.02403784, "epoch": 0.3213587855102961, "flos": 18375410010240.0, "grad_norm": 2.3192202015701975, "language_loss": 0.81287825, "learning_rate": 3.173647680842262e-06, "loss": 0.83846462, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.24316406, "step": 5345, "time_per_iteration": 2.874897003173828 }, { "auxiliary_loss_clip": 0.01510307, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.3132931, "balance_loss_mlp": 1.02079976, "epoch": 0.3214189087629641, "flos": 27027292752000.0, "grad_norm": 1.970850704696115, "language_loss": 0.83986872, "learning_rate": 3.1733323057709384e-06, "loss": 0.86541367, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.23413086, "step": 5346, "time_per_iteration": 2.9444947242736816 }, { "auxiliary_loss_clip": 0.01526626, "auxiliary_loss_mlp": 0.01047769, "balance_loss_clip": 1.32687318, "balance_loss_mlp": 1.02386737, "epoch": 0.32147903201563205, "flos": 23158506280320.0, "grad_norm": 1.6772972679721327, "language_loss": 0.82014918, "learning_rate": 3.1730168862067366e-06, "loss": 0.84589314, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.23913574, "step": 5347, "time_per_iteration": 2.8962972164154053 }, { "auxiliary_loss_clip": 0.01504362, "auxiliary_loss_mlp": 0.01049539, "balance_loss_clip": 1.30958116, "balance_loss_mlp": 1.02439737, "epoch": 0.3215391552683, "flos": 16589257297920.0, "grad_norm": 2.1282630294847062, "language_loss": 0.80272722, "learning_rate": 3.1727014221616164e-06, "loss": 0.8282662, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.25134277, "step": 5348, "time_per_iteration": 2.931619167327881 }, { "auxiliary_loss_clip": 0.01507107, "auxiliary_loss_mlp": 0.01042901, "balance_loss_clip": 1.31042099, "balance_loss_mlp": 1.01863027, "epoch": 0.321599278520968, "flos": 17830796624640.0, "grad_norm": 2.2070303587777174, "language_loss": 0.85969353, "learning_rate": 3.172385913647542e-06, "loss": 0.88519359, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.24243164, "step": 5349, "time_per_iteration": 2.86352276802063 }, { "auxiliary_loss_clip": 0.01513769, "auxiliary_loss_mlp": 0.01044062, "balance_loss_clip": 1.31608808, "balance_loss_mlp": 1.02062523, "epoch": 0.32165940177363594, "flos": 16260036998400.0, "grad_norm": 8.085624213604277, "language_loss": 0.8072415, "learning_rate": 3.172070360676475e-06, "loss": 0.83281982, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 1.97363281, "router_z_loss_mlp": 0.23425293, "step": 5350, "time_per_iteration": 2.876023292541504 }, { "auxiliary_loss_clip": 0.0150585, "auxiliary_loss_mlp": 0.0105078, "balance_loss_clip": 1.31195569, "balance_loss_mlp": 1.02807033, "epoch": 0.3217195250263039, "flos": 27611386865280.0, "grad_norm": 1.827492185840312, "language_loss": 0.80484056, "learning_rate": 3.1717547632603828e-06, "loss": 0.83040684, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.22717285, "step": 5351, "time_per_iteration": 2.907456874847412 }, { "auxiliary_loss_clip": 0.01502213, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.30654693, "balance_loss_mlp": 1.02358174, "epoch": 0.3217796482789719, "flos": 21480570426240.0, "grad_norm": 5.144025406040443, "language_loss": 0.76512444, "learning_rate": 3.1714391214112326e-06, "loss": 0.790627, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.24475098, "step": 5352, "time_per_iteration": 2.88177752494812 }, { "auxiliary_loss_clip": 0.01496438, "auxiliary_loss_mlp": 0.01041626, "balance_loss_clip": 1.30289221, "balance_loss_mlp": 1.01749778, "epoch": 0.32183977153163984, "flos": 21225606122880.0, "grad_norm": 1.8047148154779895, "language_loss": 0.83215374, "learning_rate": 3.1711234351409933e-06, "loss": 0.85753435, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.24108887, "step": 5353, "time_per_iteration": 2.8809335231781006 }, { "auxiliary_loss_clip": 0.01499392, "auxiliary_loss_mlp": 0.01046627, "balance_loss_clip": 1.30732584, "balance_loss_mlp": 1.02241588, "epoch": 0.3218998947843078, "flos": 24618922542720.0, "grad_norm": 1.6116840527155312, "language_loss": 0.73966312, "learning_rate": 3.1708077044616365e-06, "loss": 0.76512325, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.24230957, "step": 5354, "time_per_iteration": 2.9160239696502686 }, { "auxiliary_loss_clip": 0.01503047, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.30916739, "balance_loss_mlp": 1.02225113, "epoch": 0.3219600180369758, "flos": 22280057625600.0, "grad_norm": 1.722602069476886, "language_loss": 0.84201694, "learning_rate": 3.1704919293851334e-06, "loss": 0.86749697, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.22717285, "step": 5355, "time_per_iteration": 2.89809250831604 }, { "auxiliary_loss_clip": 0.01534049, "auxiliary_loss_mlp": 0.0104605, "balance_loss_clip": 1.33693838, "balance_loss_mlp": 1.02276862, "epoch": 0.3220201412896438, "flos": 14947273077120.0, "grad_norm": 1.8353633040347797, "language_loss": 0.7184478, "learning_rate": 3.1701761099234597e-06, "loss": 0.74424875, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.23291016, "step": 5356, "time_per_iteration": 2.82808518409729 }, { "auxiliary_loss_clip": 0.0154925, "auxiliary_loss_mlp": 0.01050565, "balance_loss_clip": 1.34406435, "balance_loss_mlp": 1.02655578, "epoch": 0.32208026454231176, "flos": 22675797060480.0, "grad_norm": 4.232243204612482, "language_loss": 0.69794881, "learning_rate": 3.1698602460885903e-06, "loss": 0.72394699, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 2.05175781, "router_z_loss_mlp": 0.23999023, "step": 5357, "time_per_iteration": 2.892834424972534 }, { "auxiliary_loss_clip": 0.01277731, "auxiliary_loss_mlp": 0.01038206, "balance_loss_clip": 1.16160512, "balance_loss_mlp": 1.01665306, "epoch": 0.3221403877949797, "flos": 64637938552320.0, "grad_norm": 0.7138098629525257, "language_loss": 0.58317238, "learning_rate": 3.1695443378925035e-06, "loss": 0.60633177, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 0.21582031, "step": 5358, "time_per_iteration": 3.468287944793701 }, { "auxiliary_loss_clip": 0.01520934, "auxiliary_loss_mlp": 0.01045565, "balance_loss_clip": 1.32382381, "balance_loss_mlp": 1.02078092, "epoch": 0.3222005110476477, "flos": 20166630140160.0, "grad_norm": 1.685544407835655, "language_loss": 0.84127069, "learning_rate": 3.1692283853471777e-06, "loss": 0.86693573, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.24804688, "step": 5359, "time_per_iteration": 2.8964905738830566 }, { "auxiliary_loss_clip": 0.01508286, "auxiliary_loss_mlp": 0.0104186, "balance_loss_clip": 1.31334949, "balance_loss_mlp": 1.01966286, "epoch": 0.32226063430031565, "flos": 22684348327680.0, "grad_norm": 1.7210720576681828, "language_loss": 0.80186486, "learning_rate": 3.168912388464595e-06, "loss": 0.82736635, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2220459, "step": 5360, "time_per_iteration": 4.317646503448486 }, { "auxiliary_loss_clip": 0.01272725, "auxiliary_loss_mlp": 0.01020865, "balance_loss_clip": 1.15493631, "balance_loss_mlp": 1.00045609, "epoch": 0.3223207575529836, "flos": 63858431571840.0, "grad_norm": 0.661459346900478, "language_loss": 0.57128668, "learning_rate": 3.168596347256737e-06, "loss": 0.59422266, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.20410156, "step": 5361, "time_per_iteration": 3.202414035797119 }, { "auxiliary_loss_clip": 0.01516588, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.32122874, "balance_loss_mlp": 1.02143443, "epoch": 0.3223808808056516, "flos": 26881404958080.0, "grad_norm": 2.68784670844084, "language_loss": 0.71984565, "learning_rate": 3.168280261735588e-06, "loss": 0.74546462, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.23876953, "step": 5362, "time_per_iteration": 4.461483478546143 }, { "auxiliary_loss_clip": 0.01513895, "auxiliary_loss_mlp": 0.01050532, "balance_loss_clip": 1.31731308, "balance_loss_mlp": 1.02715516, "epoch": 0.32244100405831955, "flos": 26772780896640.0, "grad_norm": 2.1685097266721978, "language_loss": 0.74403387, "learning_rate": 3.167964131913135e-06, "loss": 0.76967812, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.23388672, "step": 5363, "time_per_iteration": 2.8971471786499023 }, { "auxiliary_loss_clip": 0.01528394, "auxiliary_loss_mlp": 0.01040313, "balance_loss_clip": 1.3255055, "balance_loss_mlp": 1.01637578, "epoch": 0.3225011273109875, "flos": 23812920092160.0, "grad_norm": 6.571431689959631, "language_loss": 0.76873839, "learning_rate": 3.167647957801365e-06, "loss": 0.79442549, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 2.02734375, "router_z_loss_mlp": 0.23925781, "step": 5364, "time_per_iteration": 2.9084885120391846 }, { "auxiliary_loss_clip": 0.01514111, "auxiliary_loss_mlp": 0.01044065, "balance_loss_clip": 1.31708765, "balance_loss_mlp": 1.02012801, "epoch": 0.3225612505636555, "flos": 17283332816640.0, "grad_norm": 2.401875069986875, "language_loss": 0.77783775, "learning_rate": 3.1673317394122672e-06, "loss": 0.80341953, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.23950195, "step": 5365, "time_per_iteration": 2.8234853744506836 }, { "auxiliary_loss_clip": 0.01527863, "auxiliary_loss_mlp": 0.01044444, "balance_loss_clip": 1.32998908, "balance_loss_mlp": 1.02075744, "epoch": 0.32262137381632344, "flos": 23376297340800.0, "grad_norm": 2.1033965000500476, "language_loss": 0.7708075, "learning_rate": 3.1670154767578333e-06, "loss": 0.79653066, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.23706055, "step": 5366, "time_per_iteration": 4.297854423522949 }, { "auxiliary_loss_clip": 0.01517798, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.32155716, "balance_loss_mlp": 1.02186882, "epoch": 0.3226814970689914, "flos": 23269347336960.0, "grad_norm": 1.883545976863215, "language_loss": 0.72873425, "learning_rate": 3.166699169850055e-06, "loss": 0.7543546, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.22375488, "step": 5367, "time_per_iteration": 4.2808732986450195 }, { "auxiliary_loss_clip": 0.01510892, "auxiliary_loss_mlp": 0.01040907, "balance_loss_clip": 1.31582832, "balance_loss_mlp": 1.01713657, "epoch": 0.32274162032165943, "flos": 16402848145920.0, "grad_norm": 2.5983664018930663, "language_loss": 0.75887817, "learning_rate": 3.1663828187009274e-06, "loss": 0.78439617, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.23779297, "step": 5368, "time_per_iteration": 2.8614120483398438 }, { "auxiliary_loss_clip": 0.01506221, "auxiliary_loss_mlp": 0.01035968, "balance_loss_clip": 1.3101908, "balance_loss_mlp": 1.01337838, "epoch": 0.3228017435743274, "flos": 27866577392640.0, "grad_norm": 2.2466874673525914, "language_loss": 0.79037303, "learning_rate": 3.1660664233224467e-06, "loss": 0.81579494, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.22595215, "step": 5369, "time_per_iteration": 2.943206548690796 }, { "auxiliary_loss_clip": 0.01508363, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.31682611, "balance_loss_mlp": 1.01374602, "epoch": 0.32286186682699536, "flos": 19617989967360.0, "grad_norm": 2.167055807126553, "language_loss": 0.84190011, "learning_rate": 3.16574998372661e-06, "loss": 0.86734569, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.2244873, "step": 5370, "time_per_iteration": 2.8974969387054443 }, { "auxiliary_loss_clip": 0.01515835, "auxiliary_loss_mlp": 0.01040302, "balance_loss_clip": 1.31860387, "balance_loss_mlp": 1.01603103, "epoch": 0.3229219900796633, "flos": 24144674100480.0, "grad_norm": 1.9376631385325869, "language_loss": 0.83726782, "learning_rate": 3.1654334999254177e-06, "loss": 0.86282915, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.24291992, "step": 5371, "time_per_iteration": 2.8881256580352783 }, { "auxiliary_loss_clip": 0.01518775, "auxiliary_loss_mlp": 0.01046093, "balance_loss_clip": 1.31916738, "balance_loss_mlp": 1.02244174, "epoch": 0.3229821133323313, "flos": 17757626503680.0, "grad_norm": 2.0339266326739467, "language_loss": 0.89071882, "learning_rate": 3.1651169719308695e-06, "loss": 0.91636747, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.2364502, "step": 5372, "time_per_iteration": 2.891850233078003 }, { "auxiliary_loss_clip": 0.01531321, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.33305907, "balance_loss_mlp": 1.01627243, "epoch": 0.32304223658499925, "flos": 22356168658560.0, "grad_norm": 9.74871941757989, "language_loss": 0.73097318, "learning_rate": 3.1648003997549694e-06, "loss": 0.75669277, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.24389648, "step": 5373, "time_per_iteration": 3.0952396392822266 }, { "auxiliary_loss_clip": 0.01498072, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.30681157, "balance_loss_mlp": 1.01441729, "epoch": 0.3231023598376672, "flos": 18487382186880.0, "grad_norm": 2.803537962686067, "language_loss": 0.82782316, "learning_rate": 3.1644837834097214e-06, "loss": 0.85318357, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.23571777, "step": 5374, "time_per_iteration": 2.9387032985687256 }, { "auxiliary_loss_clip": 0.01496773, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.30612195, "balance_loss_mlp": 1.01606727, "epoch": 0.3231624830903352, "flos": 27647790946560.0, "grad_norm": 2.032993976782501, "language_loss": 0.88115168, "learning_rate": 3.1641671229071317e-06, "loss": 0.90650594, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.22595215, "step": 5375, "time_per_iteration": 2.9855504035949707 }, { "auxiliary_loss_clip": 0.01536559, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.33447361, "balance_loss_mlp": 1.01643634, "epoch": 0.32322260634300315, "flos": 21736575360000.0, "grad_norm": 1.9769869346126496, "language_loss": 0.76922601, "learning_rate": 3.1638504182592076e-06, "loss": 0.79500377, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 2.02050781, "router_z_loss_mlp": 0.24768066, "step": 5376, "time_per_iteration": 2.868070602416992 }, { "auxiliary_loss_clip": 0.01518113, "auxiliary_loss_mlp": 0.01037288, "balance_loss_clip": 1.32150745, "balance_loss_mlp": 1.01523411, "epoch": 0.3232827295956711, "flos": 22647808512000.0, "grad_norm": 3.1691060160511553, "language_loss": 0.67543817, "learning_rate": 3.1635336694779594e-06, "loss": 0.70099217, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.22033691, "step": 5377, "time_per_iteration": 2.894752264022827 }, { "auxiliary_loss_clip": 0.01514563, "auxiliary_loss_mlp": 0.01047885, "balance_loss_clip": 1.31879807, "balance_loss_mlp": 1.02395964, "epoch": 0.3233428528483391, "flos": 26333669681280.0, "grad_norm": 1.4524672168108606, "language_loss": 0.73031807, "learning_rate": 3.1632168765753982e-06, "loss": 0.75594252, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.23962402, "step": 5378, "time_per_iteration": 2.9289238452911377 }, { "auxiliary_loss_clip": 0.01514051, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.31655872, "balance_loss_mlp": 1.01422715, "epoch": 0.32340297610100704, "flos": 28597011747840.0, "grad_norm": 2.2146907655562114, "language_loss": 0.8289699, "learning_rate": 3.1629000395635357e-06, "loss": 0.85448909, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.2364502, "step": 5379, "time_per_iteration": 2.9225308895111084 }, { "auxiliary_loss_clip": 0.01526866, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.32577193, "balance_loss_mlp": 1.01846027, "epoch": 0.323463099353675, "flos": 30786007328640.0, "grad_norm": 1.6727185990844444, "language_loss": 0.79776061, "learning_rate": 3.162583158454388e-06, "loss": 0.82344919, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.23535156, "step": 5380, "time_per_iteration": 2.9653091430664062 }, { "auxiliary_loss_clip": 0.01526042, "auxiliary_loss_mlp": 0.01041481, "balance_loss_clip": 1.32700777, "balance_loss_mlp": 1.01934433, "epoch": 0.32352322260634303, "flos": 25239556471680.0, "grad_norm": 1.6762360821354092, "language_loss": 0.77739131, "learning_rate": 3.1622662332599697e-06, "loss": 0.80306655, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.22143555, "step": 5381, "time_per_iteration": 2.9014899730682373 }, { "auxiliary_loss_clip": 0.0151049, "auxiliary_loss_mlp": 0.01035175, "balance_loss_clip": 1.31698537, "balance_loss_mlp": 1.01359808, "epoch": 0.323583345859011, "flos": 23340707665920.0, "grad_norm": 3.096196118615213, "language_loss": 0.7271111, "learning_rate": 3.1619492639922998e-06, "loss": 0.75256777, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.21569824, "step": 5382, "time_per_iteration": 2.891120433807373 }, { "auxiliary_loss_clip": 0.01529083, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.32845783, "balance_loss_mlp": 1.01558042, "epoch": 0.32364346911167896, "flos": 26217082535040.0, "grad_norm": 3.9847260796021025, "language_loss": 0.71884263, "learning_rate": 3.1616322506633964e-06, "loss": 0.74451947, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 2.00488281, "router_z_loss_mlp": 0.22998047, "step": 5383, "time_per_iteration": 2.9093077182769775 }, { "auxiliary_loss_clip": 0.01502356, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.31077039, "balance_loss_mlp": 1.01555109, "epoch": 0.3237035923643469, "flos": 23706196312320.0, "grad_norm": 1.5790684465092832, "language_loss": 0.78901964, "learning_rate": 3.161315193285283e-06, "loss": 0.81442553, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.22680664, "step": 5384, "time_per_iteration": 2.9115638732910156 }, { "auxiliary_loss_clip": 0.015338, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.3319726, "balance_loss_mlp": 1.01905274, "epoch": 0.3237637156170149, "flos": 14436846777600.0, "grad_norm": 2.1490304291947733, "language_loss": 0.76289022, "learning_rate": 3.16099809186998e-06, "loss": 0.78866088, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 2.015625, "router_z_loss_mlp": 0.2421875, "step": 5385, "time_per_iteration": 2.8328845500946045 }, { "auxiliary_loss_clip": 0.01513529, "auxiliary_loss_mlp": 0.01045579, "balance_loss_clip": 1.31686437, "balance_loss_mlp": 1.02224958, "epoch": 0.32382383886968286, "flos": 31074570535680.0, "grad_norm": 2.9700903077917586, "language_loss": 0.72294825, "learning_rate": 3.1606809464295145e-06, "loss": 0.74853933, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 1.96875, "router_z_loss_mlp": 0.23339844, "step": 5386, "time_per_iteration": 2.93491530418396 }, { "auxiliary_loss_clip": 0.0153301, "auxiliary_loss_mlp": 0.01041185, "balance_loss_clip": 1.33070588, "balance_loss_mlp": 1.01722383, "epoch": 0.3238839621223508, "flos": 23266994607360.0, "grad_norm": 2.973072016498827, "language_loss": 0.9506619, "learning_rate": 3.1603637569759095e-06, "loss": 0.97640389, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.23962402, "step": 5387, "time_per_iteration": 2.8973398208618164 }, { "auxiliary_loss_clip": 0.01548789, "auxiliary_loss_mlp": 0.0104381, "balance_loss_clip": 1.34677732, "balance_loss_mlp": 1.02026629, "epoch": 0.3239440853750188, "flos": 22974540347520.0, "grad_norm": 2.0904258095686896, "language_loss": 0.78317177, "learning_rate": 3.1600465235211956e-06, "loss": 0.80909777, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 2.02148438, "router_z_loss_mlp": 0.23547363, "step": 5388, "time_per_iteration": 2.8879318237304688 }, { "auxiliary_loss_clip": 0.0152445, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.32763791, "balance_loss_mlp": 1.01937234, "epoch": 0.32400420862768675, "flos": 36260238205440.0, "grad_norm": 2.3318983243417777, "language_loss": 0.72587085, "learning_rate": 3.1597292460774006e-06, "loss": 0.75154877, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 1.96679688, "router_z_loss_mlp": 0.23986816, "step": 5389, "time_per_iteration": 3.063370943069458 }, { "auxiliary_loss_clip": 0.01529827, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.33353567, "balance_loss_mlp": 1.02029955, "epoch": 0.3240643318803547, "flos": 21626503464960.0, "grad_norm": 1.9848507858877082, "language_loss": 0.82194501, "learning_rate": 3.159411924656557e-06, "loss": 0.84768409, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 1.96289062, "router_z_loss_mlp": 0.23791504, "step": 5390, "time_per_iteration": 2.946657657623291 }, { "auxiliary_loss_clip": 0.01543245, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.34481645, "balance_loss_mlp": 1.01376402, "epoch": 0.3241244551330227, "flos": 23306249111040.0, "grad_norm": 1.9009228989205331, "language_loss": 0.74177372, "learning_rate": 3.1590945592706967e-06, "loss": 0.76758504, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.24108887, "step": 5391, "time_per_iteration": 2.9430923461914062 }, { "auxiliary_loss_clip": 0.01513652, "auxiliary_loss_mlp": 0.01039032, "balance_loss_clip": 1.31759286, "balance_loss_mlp": 1.01551175, "epoch": 0.32418457838569065, "flos": 14104459342080.0, "grad_norm": 1.5974815487084333, "language_loss": 0.78075534, "learning_rate": 3.158777149931855e-06, "loss": 0.80628216, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.23522949, "step": 5392, "time_per_iteration": 2.8958423137664795 }, { "auxiliary_loss_clip": 0.01532149, "auxiliary_loss_mlp": 0.01037727, "balance_loss_clip": 1.33083475, "balance_loss_mlp": 1.01439738, "epoch": 0.3242447016383586, "flos": 29764747526400.0, "grad_norm": 4.1697884738172535, "language_loss": 0.64123982, "learning_rate": 3.158459696652067e-06, "loss": 0.6669386, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 2.00976562, "router_z_loss_mlp": 0.23327637, "step": 5393, "time_per_iteration": 2.932168483734131 }, { "auxiliary_loss_clip": 0.01530201, "auxiliary_loss_mlp": 0.01038961, "balance_loss_clip": 1.33325398, "balance_loss_mlp": 1.01575041, "epoch": 0.3243048248910266, "flos": 24361469775360.0, "grad_norm": 2.0299057717223796, "language_loss": 0.83642662, "learning_rate": 3.158142199443371e-06, "loss": 0.86211824, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 1.97070312, "router_z_loss_mlp": 0.23205566, "step": 5394, "time_per_iteration": 4.324660062789917 }, { "auxiliary_loss_clip": 0.01502723, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.31305408, "balance_loss_mlp": 1.02425885, "epoch": 0.3243649481436946, "flos": 24363777260160.0, "grad_norm": 1.9099050328630327, "language_loss": 0.82583505, "learning_rate": 3.1578246583178076e-06, "loss": 0.85132551, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.2208252, "step": 5395, "time_per_iteration": 2.9064745903015137 }, { "auxiliary_loss_clip": 0.01511629, "auxiliary_loss_mlp": 0.01044172, "balance_loss_clip": 1.31931937, "balance_loss_mlp": 1.02117634, "epoch": 0.32442507139636256, "flos": 22934335703040.0, "grad_norm": 2.764506831768972, "language_loss": 0.84121019, "learning_rate": 3.157507073287417e-06, "loss": 0.86676812, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.23022461, "step": 5396, "time_per_iteration": 2.88105845451355 }, { "auxiliary_loss_clip": 0.01539563, "auxiliary_loss_mlp": 0.01046167, "balance_loss_clip": 1.33820248, "balance_loss_mlp": 1.02312422, "epoch": 0.32448519464903053, "flos": 22210326109440.0, "grad_norm": 3.2639527019100276, "language_loss": 0.77893758, "learning_rate": 3.1571894443642414e-06, "loss": 0.80479491, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 2.01074219, "router_z_loss_mlp": 0.23034668, "step": 5397, "time_per_iteration": 4.277055501937866 }, { "auxiliary_loss_clip": 0.01505481, "auxiliary_loss_mlp": 0.01042464, "balance_loss_clip": 1.31426561, "balance_loss_mlp": 1.01921844, "epoch": 0.3245453179016985, "flos": 18846400826880.0, "grad_norm": 4.194179064047444, "language_loss": 0.68793637, "learning_rate": 3.1568717715603263e-06, "loss": 0.7134158, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.23217773, "step": 5398, "time_per_iteration": 2.8372721672058105 }, { "auxiliary_loss_clip": 0.0151075, "auxiliary_loss_mlp": 0.01043472, "balance_loss_clip": 1.31416726, "balance_loss_mlp": 1.02022588, "epoch": 0.32460544115436646, "flos": 21188206656000.0, "grad_norm": 1.370848530291454, "language_loss": 0.73907667, "learning_rate": 3.156554054887718e-06, "loss": 0.76461887, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 1.96386719, "router_z_loss_mlp": 0.23254395, "step": 5399, "time_per_iteration": 2.8462655544281006 }, { "auxiliary_loss_clip": 0.01508547, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.31321478, "balance_loss_mlp": 1.01824808, "epoch": 0.3246655644070344, "flos": 21991177704960.0, "grad_norm": 2.1924504187901697, "language_loss": 0.72043198, "learning_rate": 3.1562362943584645e-06, "loss": 0.74593568, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 1.95214844, "router_z_loss_mlp": 0.23596191, "step": 5400, "time_per_iteration": 2.877751588821411 }, { "auxiliary_loss_clip": 0.01528098, "auxiliary_loss_mlp": 0.01041086, "balance_loss_clip": 1.32948875, "balance_loss_mlp": 1.01788807, "epoch": 0.3247256876597024, "flos": 32172167594880.0, "grad_norm": 1.9622744951842566, "language_loss": 0.80750036, "learning_rate": 3.155918489984614e-06, "loss": 0.83319217, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.23205566, "step": 5401, "time_per_iteration": 4.396793842315674 }, { "auxiliary_loss_clip": 0.0151938, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 1.32154834, "balance_loss_mlp": 1.01386082, "epoch": 0.32478581091237035, "flos": 21007724572800.0, "grad_norm": 1.6356878734758173, "language_loss": 0.87776065, "learning_rate": 3.1556006417782196e-06, "loss": 0.90332919, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.23620605, "step": 5402, "time_per_iteration": 4.31572151184082 }, { "auxiliary_loss_clip": 0.01506876, "auxiliary_loss_mlp": 0.0104031, "balance_loss_clip": 1.31603885, "balance_loss_mlp": 1.01848292, "epoch": 0.3248459341650383, "flos": 17932362497280.0, "grad_norm": 2.241465406127365, "language_loss": 0.85958004, "learning_rate": 3.155282749751332e-06, "loss": 0.88505185, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.21838379, "step": 5403, "time_per_iteration": 3.007122278213501 }, { "auxiliary_loss_clip": 0.01490861, "auxiliary_loss_mlp": 0.01042183, "balance_loss_clip": 1.3036325, "balance_loss_mlp": 1.01971245, "epoch": 0.3249060574177063, "flos": 24546431093760.0, "grad_norm": 2.070887558256594, "language_loss": 0.88442528, "learning_rate": 3.154964813916007e-06, "loss": 0.90975571, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.22460938, "step": 5404, "time_per_iteration": 2.9390408992767334 }, { "auxiliary_loss_clip": 0.01504359, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.31299937, "balance_loss_mlp": 1.01386499, "epoch": 0.32496618067037425, "flos": 26005897215360.0, "grad_norm": 1.9286427856326507, "language_loss": 0.73587507, "learning_rate": 3.1546468342843008e-06, "loss": 0.76128221, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.22497559, "step": 5405, "time_per_iteration": 2.898709535598755 }, { "auxiliary_loss_clip": 0.01497266, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.30402541, "balance_loss_mlp": 1.01505089, "epoch": 0.3250263039230422, "flos": 19583350433280.0, "grad_norm": 1.7084522478444493, "language_loss": 0.83571732, "learning_rate": 3.1543288108682707e-06, "loss": 0.86107183, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.23120117, "step": 5406, "time_per_iteration": 2.881152391433716 }, { "auxiliary_loss_clip": 0.01507709, "auxiliary_loss_mlp": 0.01038088, "balance_loss_clip": 1.31499577, "balance_loss_mlp": 1.01521158, "epoch": 0.3250864271757102, "flos": 16772318334720.0, "grad_norm": 1.7457883557665272, "language_loss": 0.8866089, "learning_rate": 3.1540107436799764e-06, "loss": 0.91206688, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.22875977, "step": 5407, "time_per_iteration": 2.8570916652679443 }, { "auxiliary_loss_clip": 0.01504686, "auxiliary_loss_mlp": 0.01039427, "balance_loss_clip": 1.31067848, "balance_loss_mlp": 1.0162406, "epoch": 0.3251465504283782, "flos": 27831168696960.0, "grad_norm": 1.6567588678314464, "language_loss": 0.69680762, "learning_rate": 3.153692632731479e-06, "loss": 0.72224873, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.23193359, "step": 5408, "time_per_iteration": 2.935502052307129 }, { "auxiliary_loss_clip": 0.01518973, "auxiliary_loss_mlp": 0.01040057, "balance_loss_clip": 1.31751931, "balance_loss_mlp": 1.0176934, "epoch": 0.32520667368104617, "flos": 19072652664960.0, "grad_norm": 2.1627715521182376, "language_loss": 0.78579319, "learning_rate": 3.153374478034841e-06, "loss": 0.81138343, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 2.01953125, "router_z_loss_mlp": 0.22351074, "step": 5409, "time_per_iteration": 2.8858842849731445 }, { "auxiliary_loss_clip": 0.01508636, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.31231618, "balance_loss_mlp": 1.02069294, "epoch": 0.32526679693371413, "flos": 29392653139200.0, "grad_norm": 2.1051456088247944, "language_loss": 0.83724809, "learning_rate": 3.1530562796021285e-06, "loss": 0.86276448, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.22314453, "step": 5410, "time_per_iteration": 2.906587600708008 }, { "auxiliary_loss_clip": 0.0148019, "auxiliary_loss_mlp": 0.01043225, "balance_loss_clip": 1.29233122, "balance_loss_mlp": 1.01934719, "epoch": 0.3253269201863821, "flos": 20714320172160.0, "grad_norm": 1.5206381441725918, "language_loss": 0.71883655, "learning_rate": 3.152738037445405e-06, "loss": 0.74407071, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.2388916, "step": 5411, "time_per_iteration": 2.872910737991333 }, { "auxiliary_loss_clip": 0.01502282, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.30956817, "balance_loss_mlp": 1.01963019, "epoch": 0.32538704343905006, "flos": 29105583010560.0, "grad_norm": 1.4892604854537639, "language_loss": 0.83559322, "learning_rate": 3.1524197515767403e-06, "loss": 0.86104321, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.23059082, "step": 5412, "time_per_iteration": 2.9041829109191895 }, { "auxiliary_loss_clip": 0.01516203, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.31906939, "balance_loss_mlp": 1.016505, "epoch": 0.325447166691718, "flos": 24685577412480.0, "grad_norm": 1.6937710269166464, "language_loss": 0.81554866, "learning_rate": 3.152101422008203e-06, "loss": 0.84111607, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.24047852, "step": 5413, "time_per_iteration": 2.8988823890686035 }, { "auxiliary_loss_clip": 0.01514556, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.32030785, "balance_loss_mlp": 1.01821899, "epoch": 0.325507289944386, "flos": 21553016630400.0, "grad_norm": 1.5735404177522316, "language_loss": 0.77054203, "learning_rate": 3.151783048751864e-06, "loss": 0.79610705, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.23742676, "step": 5414, "time_per_iteration": 2.9227371215820312 }, { "auxiliary_loss_clip": 0.01286477, "auxiliary_loss_mlp": 0.01056321, "balance_loss_clip": 1.17123318, "balance_loss_mlp": 1.03448153, "epoch": 0.32556741319705396, "flos": 71548470213120.0, "grad_norm": 0.9256993340148384, "language_loss": 0.64001912, "learning_rate": 3.1514646318197965e-06, "loss": 0.66344708, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.21875, "step": 5415, "time_per_iteration": 3.2861709594726562 }, { "auxiliary_loss_clip": 0.01512757, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.32005048, "balance_loss_mlp": 1.01975608, "epoch": 0.3256275364497219, "flos": 23742917107200.0, "grad_norm": 1.5035062971118203, "language_loss": 0.75307345, "learning_rate": 3.151146171224075e-06, "loss": 0.77862906, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.23022461, "step": 5416, "time_per_iteration": 3.0393288135528564 }, { "auxiliary_loss_clip": 0.01288095, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.17594934, "balance_loss_mlp": 1.01280427, "epoch": 0.3256876597023899, "flos": 67318583840640.0, "grad_norm": 0.791150301897351, "language_loss": 0.57998109, "learning_rate": 3.1508276669767757e-06, "loss": 0.6031884, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.19824219, "step": 5417, "time_per_iteration": 3.3712329864501953 }, { "auxiliary_loss_clip": 0.01287764, "auxiliary_loss_mlp": 0.01021408, "balance_loss_clip": 1.17636013, "balance_loss_mlp": 1.00405121, "epoch": 0.32574778295505785, "flos": 71316942716160.0, "grad_norm": 0.8369639823568511, "language_loss": 0.63527369, "learning_rate": 3.150509119089975e-06, "loss": 0.65836537, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.17382812, "step": 5418, "time_per_iteration": 3.399148941040039 }, { "auxiliary_loss_clip": 0.01498001, "auxiliary_loss_mlp": 0.01046619, "balance_loss_clip": 1.30534244, "balance_loss_mlp": 1.0242548, "epoch": 0.3258079062077258, "flos": 20785906725120.0, "grad_norm": 1.927568903761663, "language_loss": 0.70666611, "learning_rate": 3.1501905275757537e-06, "loss": 0.73211235, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.22375488, "step": 5419, "time_per_iteration": 2.905410051345825 }, { "auxiliary_loss_clip": 0.01518039, "auxiliary_loss_mlp": 0.01047439, "balance_loss_clip": 1.32302856, "balance_loss_mlp": 1.02484894, "epoch": 0.3258680294603938, "flos": 22245282357120.0, "grad_norm": 1.780609507826102, "language_loss": 0.78221262, "learning_rate": 3.1498718924461926e-06, "loss": 0.80786741, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 1.95019531, "router_z_loss_mlp": 0.22583008, "step": 5420, "time_per_iteration": 2.8784520626068115 }, { "auxiliary_loss_clip": 0.01520966, "auxiliary_loss_mlp": 0.01052593, "balance_loss_clip": 1.32495403, "balance_loss_mlp": 1.0296576, "epoch": 0.3259281527130618, "flos": 26991386363520.0, "grad_norm": 3.0329790543617534, "language_loss": 0.8072837, "learning_rate": 3.1495532137133736e-06, "loss": 0.83301932, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 1.95898438, "router_z_loss_mlp": 0.22949219, "step": 5421, "time_per_iteration": 2.945810079574585 }, { "auxiliary_loss_clip": 0.01512607, "auxiliary_loss_mlp": 0.01060058, "balance_loss_clip": 1.32173634, "balance_loss_mlp": 1.0393157, "epoch": 0.32598827596572977, "flos": 26225588557440.0, "grad_norm": 1.602110414703409, "language_loss": 0.7582258, "learning_rate": 3.149234491389381e-06, "loss": 0.78395247, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.20751953, "step": 5422, "time_per_iteration": 3.045612335205078 }, { "auxiliary_loss_clip": 0.01527001, "auxiliary_loss_mlp": 0.01056184, "balance_loss_clip": 1.32902181, "balance_loss_mlp": 1.03385639, "epoch": 0.32604839921839773, "flos": 17648776218240.0, "grad_norm": 1.9634518307389681, "language_loss": 0.63733274, "learning_rate": 3.1489157254863026e-06, "loss": 0.66316462, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.22338867, "step": 5423, "time_per_iteration": 2.8406739234924316 }, { "auxiliary_loss_clip": 0.01492574, "auxiliary_loss_mlp": 0.01050395, "balance_loss_clip": 1.30435622, "balance_loss_mlp": 1.02797222, "epoch": 0.3261085224710657, "flos": 23633116680960.0, "grad_norm": 1.6239986719920534, "language_loss": 0.75408572, "learning_rate": 3.148596916016224e-06, "loss": 0.77951539, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.22399902, "step": 5424, "time_per_iteration": 2.950788974761963 }, { "auxiliary_loss_clip": 0.01501307, "auxiliary_loss_mlp": 0.01051248, "balance_loss_clip": 1.31097031, "balance_loss_mlp": 1.02902794, "epoch": 0.32616864572373366, "flos": 23271428597760.0, "grad_norm": 1.684322419331074, "language_loss": 0.77151203, "learning_rate": 3.1482780629912355e-06, "loss": 0.7970376, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.22229004, "step": 5425, "time_per_iteration": 2.879028558731079 }, { "auxiliary_loss_clip": 0.01519794, "auxiliary_loss_mlp": 0.01058266, "balance_loss_clip": 1.32137609, "balance_loss_mlp": 1.03349447, "epoch": 0.32622876897640163, "flos": 25604456935680.0, "grad_norm": 2.2629186364749097, "language_loss": 0.79320002, "learning_rate": 3.147959166423428e-06, "loss": 0.81898069, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.24780273, "step": 5426, "time_per_iteration": 2.9384098052978516 }, { "auxiliary_loss_clip": 0.01526748, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.33150578, "balance_loss_mlp": 1.0299319, "epoch": 0.3262888922290696, "flos": 22429157800320.0, "grad_norm": 1.8913530600317383, "language_loss": 0.75063497, "learning_rate": 3.147640226324893e-06, "loss": 0.77643222, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.23046875, "step": 5427, "time_per_iteration": 2.885321617126465 }, { "auxiliary_loss_clip": 0.01527048, "auxiliary_loss_mlp": 0.01053364, "balance_loss_clip": 1.32871997, "balance_loss_mlp": 1.03096437, "epoch": 0.32634901548173756, "flos": 19728333331200.0, "grad_norm": 1.608216911416011, "language_loss": 0.7967267, "learning_rate": 3.1473212427077266e-06, "loss": 0.82253087, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 1.98242188, "router_z_loss_mlp": 0.22412109, "step": 5428, "time_per_iteration": 2.9001705646514893 }, { "auxiliary_loss_clip": 0.01502532, "auxiliary_loss_mlp": 0.01045788, "balance_loss_clip": 1.30979586, "balance_loss_mlp": 1.02298355, "epoch": 0.3264091387344055, "flos": 16151322447360.0, "grad_norm": 1.9879254913233098, "language_loss": 0.71795195, "learning_rate": 3.147002215584023e-06, "loss": 0.74343514, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.22802734, "step": 5429, "time_per_iteration": 4.295518159866333 }, { "auxiliary_loss_clip": 0.01510332, "auxiliary_loss_mlp": 0.01050781, "balance_loss_clip": 1.31622851, "balance_loss_mlp": 1.02842879, "epoch": 0.3264692619870735, "flos": 16407734584320.0, "grad_norm": 1.63556534076697, "language_loss": 0.78877807, "learning_rate": 3.146683144965881e-06, "loss": 0.81438923, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.22351074, "step": 5430, "time_per_iteration": 2.854804754257202 }, { "auxiliary_loss_clip": 0.01514989, "auxiliary_loss_mlp": 0.01047581, "balance_loss_clip": 1.31917715, "balance_loss_mlp": 1.02313137, "epoch": 0.32652938523974145, "flos": 22392437005440.0, "grad_norm": 2.1146935937881977, "language_loss": 0.85597014, "learning_rate": 3.146364030865399e-06, "loss": 0.88159585, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.24462891, "step": 5431, "time_per_iteration": 2.8513412475585938 }, { "auxiliary_loss_clip": 0.01503864, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.31322336, "balance_loss_mlp": 1.02235615, "epoch": 0.3265895084924094, "flos": 21918052828800.0, "grad_norm": 1.6319410798541392, "language_loss": 0.7102111, "learning_rate": 3.146044873294678e-06, "loss": 0.73569351, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22021484, "step": 5432, "time_per_iteration": 4.33033561706543 }, { "auxiliary_loss_clip": 0.0151839, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.32270658, "balance_loss_mlp": 1.01546228, "epoch": 0.3266496317450774, "flos": 16074804211200.0, "grad_norm": 1.7629038126828265, "language_loss": 0.84985924, "learning_rate": 3.1457256722658203e-06, "loss": 0.87541002, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.21228027, "step": 5433, "time_per_iteration": 2.8461503982543945 }, { "auxiliary_loss_clip": 0.01504068, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.31366193, "balance_loss_mlp": 1.01807451, "epoch": 0.3267097549977454, "flos": 22538370044160.0, "grad_norm": 1.5100930553673961, "language_loss": 0.86545599, "learning_rate": 3.145406427790931e-06, "loss": 0.89090419, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22668457, "step": 5434, "time_per_iteration": 2.934783458709717 }, { "auxiliary_loss_clip": 0.01521272, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.32528985, "balance_loss_mlp": 1.01653695, "epoch": 0.32676987825041337, "flos": 27281533138560.0, "grad_norm": 1.7933847140754358, "language_loss": 0.88738775, "learning_rate": 3.1450871398821147e-06, "loss": 0.91300011, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 1.95800781, "router_z_loss_mlp": 0.234375, "step": 5435, "time_per_iteration": 2.9163033962249756 }, { "auxiliary_loss_clip": 0.01520373, "auxiliary_loss_mlp": 0.0104042, "balance_loss_clip": 1.32493687, "balance_loss_mlp": 1.01804447, "epoch": 0.32683000150308134, "flos": 11515199846400.0, "grad_norm": 3.6365273095162003, "language_loss": 0.77744055, "learning_rate": 3.144767808551479e-06, "loss": 0.80304843, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.22375488, "step": 5436, "time_per_iteration": 4.245404481887817 }, { "auxiliary_loss_clip": 0.01510473, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.31742275, "balance_loss_mlp": 1.01426721, "epoch": 0.3268901247557493, "flos": 25641313464960.0, "grad_norm": 1.5135461994555524, "language_loss": 0.72692013, "learning_rate": 3.144448433811134e-06, "loss": 0.75238824, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.2208252, "step": 5437, "time_per_iteration": 4.313629627227783 }, { "auxiliary_loss_clip": 0.01528894, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.33011603, "balance_loss_mlp": 1.02113116, "epoch": 0.32695024800841727, "flos": 24870945934080.0, "grad_norm": 1.5618424567682183, "language_loss": 0.64318722, "learning_rate": 3.144129015673189e-06, "loss": 0.66892278, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 1.98632812, "router_z_loss_mlp": 0.23522949, "step": 5438, "time_per_iteration": 2.9271907806396484 }, { "auxiliary_loss_clip": 0.01519702, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.3265264, "balance_loss_mlp": 1.0216186, "epoch": 0.32701037126108523, "flos": 28850754441600.0, "grad_norm": 1.8535277844475424, "language_loss": 0.74656636, "learning_rate": 3.1438095541497576e-06, "loss": 0.7722044, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 1.93554688, "router_z_loss_mlp": 0.22509766, "step": 5439, "time_per_iteration": 2.9144742488861084 }, { "auxiliary_loss_clip": 0.0152007, "auxiliary_loss_mlp": 0.01047928, "balance_loss_clip": 1.32566869, "balance_loss_mlp": 1.02487302, "epoch": 0.3270704945137532, "flos": 27976739777280.0, "grad_norm": 2.5940572640578567, "language_loss": 0.75219041, "learning_rate": 3.1434900492529527e-06, "loss": 0.77787036, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.23046875, "step": 5440, "time_per_iteration": 2.94856858253479 }, { "auxiliary_loss_clip": 0.01505652, "auxiliary_loss_mlp": 0.01047725, "balance_loss_clip": 1.31516647, "balance_loss_mlp": 1.02492011, "epoch": 0.32713061776642116, "flos": 23699816795520.0, "grad_norm": 1.9647789462822023, "language_loss": 0.85137582, "learning_rate": 3.1431705009948914e-06, "loss": 0.87690961, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.22814941, "step": 5441, "time_per_iteration": 2.9243557453155518 }, { "auxiliary_loss_clip": 0.01520946, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.32485998, "balance_loss_mlp": 1.0229969, "epoch": 0.3271907410190891, "flos": 22465833350400.0, "grad_norm": 1.915330956881272, "language_loss": 0.87321818, "learning_rate": 3.1428509093876897e-06, "loss": 0.89888448, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.22705078, "step": 5442, "time_per_iteration": 2.8568413257598877 }, { "auxiliary_loss_clip": 0.01531824, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.3354342, "balance_loss_mlp": 1.01528323, "epoch": 0.3272508642717571, "flos": 22830326611200.0, "grad_norm": 1.567801282241699, "language_loss": 0.78079498, "learning_rate": 3.1425312744434668e-06, "loss": 0.80650556, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 1.96484375, "router_z_loss_mlp": 0.23950195, "step": 5443, "time_per_iteration": 2.935483932495117 }, { "auxiliary_loss_clip": 0.01529049, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.33062911, "balance_loss_mlp": 1.02525353, "epoch": 0.32731098752442506, "flos": 11808694736640.0, "grad_norm": 1.981174734856352, "language_loss": 0.82005936, "learning_rate": 3.142211596174343e-06, "loss": 0.84583795, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 1.98339844, "router_z_loss_mlp": 0.2355957, "step": 5444, "time_per_iteration": 2.8623428344726562 }, { "auxiliary_loss_clip": 0.015122, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.31907797, "balance_loss_mlp": 1.02118278, "epoch": 0.327371110777093, "flos": 21036346548480.0, "grad_norm": 2.404673561397496, "language_loss": 0.5991118, "learning_rate": 3.1418918745924423e-06, "loss": 0.62467307, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.22741699, "step": 5445, "time_per_iteration": 2.920849084854126 }, { "auxiliary_loss_clip": 0.01521031, "auxiliary_loss_mlp": 0.0104425, "balance_loss_clip": 1.32568717, "balance_loss_mlp": 1.02105212, "epoch": 0.327431234029761, "flos": 19071340565760.0, "grad_norm": 1.9971961738870043, "language_loss": 0.8896848, "learning_rate": 3.1415721097098865e-06, "loss": 0.91533762, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.23205566, "step": 5446, "time_per_iteration": 2.980774402618408 }, { "auxiliary_loss_clip": 0.01551909, "auxiliary_loss_mlp": 0.01048443, "balance_loss_clip": 1.34970701, "balance_loss_mlp": 1.01953423, "epoch": 0.32749135728242895, "flos": 25860416624640.0, "grad_norm": 1.5782173473752859, "language_loss": 0.80014902, "learning_rate": 3.141252301538802e-06, "loss": 0.8261525, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 2.02246094, "router_z_loss_mlp": 0.28894043, "step": 5447, "time_per_iteration": 2.9395992755889893 }, { "auxiliary_loss_clip": 0.0150705, "auxiliary_loss_mlp": 0.01042308, "balance_loss_clip": 1.3133297, "balance_loss_mlp": 1.02019501, "epoch": 0.327551480535097, "flos": 20130135569280.0, "grad_norm": 2.9768169093366357, "language_loss": 0.74274057, "learning_rate": 3.1409324500913157e-06, "loss": 0.76823413, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.22119141, "step": 5448, "time_per_iteration": 2.873718738555908 }, { "auxiliary_loss_clip": 0.01503358, "auxiliary_loss_mlp": 0.01040833, "balance_loss_clip": 1.31264472, "balance_loss_mlp": 1.01753902, "epoch": 0.32761160378776494, "flos": 28815255256320.0, "grad_norm": 1.4624768167977689, "language_loss": 0.67424893, "learning_rate": 3.1406125553795567e-06, "loss": 0.69969082, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.23291016, "step": 5449, "time_per_iteration": 2.9469923973083496 }, { "auxiliary_loss_clip": 0.01512642, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.31728983, "balance_loss_mlp": 1.01515841, "epoch": 0.3276717270404329, "flos": 26947697869440.0, "grad_norm": 3.1036360438189727, "language_loss": 0.65988332, "learning_rate": 3.1402926174156556e-06, "loss": 0.6853953, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.23400879, "step": 5450, "time_per_iteration": 2.907425880432129 }, { "auxiliary_loss_clip": 0.01506489, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.3136102, "balance_loss_mlp": 1.01937294, "epoch": 0.32773185029310087, "flos": 25349492632320.0, "grad_norm": 1.9546182005288313, "language_loss": 0.78698468, "learning_rate": 3.1399726362117437e-06, "loss": 0.81247014, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.22680664, "step": 5451, "time_per_iteration": 2.9047420024871826 }, { "auxiliary_loss_clip": 0.01527214, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.3297863, "balance_loss_mlp": 1.02008128, "epoch": 0.32779197354576883, "flos": 26401817629440.0, "grad_norm": 2.3127574747600375, "language_loss": 0.71180063, "learning_rate": 3.1396526117799555e-06, "loss": 0.73752338, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 1.97460938, "router_z_loss_mlp": 0.24951172, "step": 5452, "time_per_iteration": 2.908872127532959 }, { "auxiliary_loss_clip": 0.01495395, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.3053329, "balance_loss_mlp": 1.01730847, "epoch": 0.3278520967984368, "flos": 24910019458560.0, "grad_norm": 1.9028361869925718, "language_loss": 0.79242647, "learning_rate": 3.1393325441324256e-06, "loss": 0.81779087, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.23718262, "step": 5453, "time_per_iteration": 2.9032766819000244 }, { "auxiliary_loss_clip": 0.01516183, "auxiliary_loss_mlp": 0.01044688, "balance_loss_clip": 1.32224107, "balance_loss_mlp": 1.02235961, "epoch": 0.32791222005110476, "flos": 29765154729600.0, "grad_norm": 1.9077135469317346, "language_loss": 0.75739741, "learning_rate": 3.1390124332812916e-06, "loss": 0.78300613, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.22314453, "step": 5454, "time_per_iteration": 2.9371445178985596 }, { "auxiliary_loss_clip": 0.01495864, "auxiliary_loss_mlp": 0.01044783, "balance_loss_clip": 1.3081249, "balance_loss_mlp": 1.02231252, "epoch": 0.32797234330377273, "flos": 16516630114560.0, "grad_norm": 1.9186240681328222, "language_loss": 0.7803148, "learning_rate": 3.1386922792386924e-06, "loss": 0.80572128, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.22473145, "step": 5455, "time_per_iteration": 2.8630244731903076 }, { "auxiliary_loss_clip": 0.0152219, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.32427609, "balance_loss_mlp": 1.02196014, "epoch": 0.3280324665564407, "flos": 26589041187840.0, "grad_norm": 7.485143526876971, "language_loss": 0.74745423, "learning_rate": 3.138372082016768e-06, "loss": 0.7731638, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 1.97949219, "router_z_loss_mlp": 0.26831055, "step": 5456, "time_per_iteration": 2.9126675128936768 }, { "auxiliary_loss_clip": 0.01514285, "auxiliary_loss_mlp": 0.01042175, "balance_loss_clip": 1.31959224, "balance_loss_mlp": 1.01814246, "epoch": 0.32809258980910866, "flos": 22940308016640.0, "grad_norm": 1.4799863293730444, "language_loss": 0.79074842, "learning_rate": 3.1380518416276596e-06, "loss": 0.81631303, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2401123, "step": 5457, "time_per_iteration": 2.8855936527252197 }, { "auxiliary_loss_clip": 0.01520585, "auxiliary_loss_mlp": 0.01041335, "balance_loss_clip": 1.32125914, "balance_loss_mlp": 1.01886427, "epoch": 0.3281527130617766, "flos": 22794103509120.0, "grad_norm": 2.9214752654093457, "language_loss": 0.79676545, "learning_rate": 3.1377315580835115e-06, "loss": 0.8223846, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 1.99316406, "router_z_loss_mlp": 0.22485352, "step": 5458, "time_per_iteration": 2.9148786067962646 }, { "auxiliary_loss_clip": 0.01496173, "auxiliary_loss_mlp": 0.01040065, "balance_loss_clip": 1.30323219, "balance_loss_mlp": 1.01672435, "epoch": 0.3282128363144446, "flos": 21260336146560.0, "grad_norm": 6.005699325526837, "language_loss": 0.73631936, "learning_rate": 3.1374112313964686e-06, "loss": 0.76168168, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.23339844, "step": 5459, "time_per_iteration": 2.9444568157196045 }, { "auxiliary_loss_clip": 0.01520764, "auxiliary_loss_mlp": 0.01042311, "balance_loss_clip": 1.32339096, "balance_loss_mlp": 1.01918387, "epoch": 0.32827295956711255, "flos": 30854336256000.0, "grad_norm": 2.421076521104659, "language_loss": 0.84864265, "learning_rate": 3.1370908615786783e-06, "loss": 0.87427342, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 1.97265625, "router_z_loss_mlp": 0.23132324, "step": 5460, "time_per_iteration": 2.9539787769317627 }, { "auxiliary_loss_clip": 0.01499815, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.30665016, "balance_loss_mlp": 1.01603615, "epoch": 0.3283330828197806, "flos": 25924356806400.0, "grad_norm": 3.2813644720771937, "language_loss": 0.77711344, "learning_rate": 3.136770448642288e-06, "loss": 0.80249381, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.22192383, "step": 5461, "time_per_iteration": 2.8989076614379883 }, { "auxiliary_loss_clip": 0.01500811, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.30826735, "balance_loss_mlp": 1.01870608, "epoch": 0.32839320607244854, "flos": 38596388434560.0, "grad_norm": 1.864001637360099, "language_loss": 0.63040745, "learning_rate": 3.1364499925994484e-06, "loss": 0.65585637, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.25366211, "step": 5462, "time_per_iteration": 3.0316312313079834 }, { "auxiliary_loss_clip": 0.01479031, "auxiliary_loss_mlp": 0.01038376, "balance_loss_clip": 1.29065418, "balance_loss_mlp": 1.01534486, "epoch": 0.3284533293251165, "flos": 26662030329600.0, "grad_norm": 1.4646882953829832, "language_loss": 0.79121625, "learning_rate": 3.1361294934623115e-06, "loss": 0.81639034, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.23034668, "step": 5463, "time_per_iteration": 2.9362776279449463 }, { "auxiliary_loss_clip": 0.01501626, "auxiliary_loss_mlp": 0.01043327, "balance_loss_clip": 1.30850768, "balance_loss_mlp": 1.01978326, "epoch": 0.32851345257778447, "flos": 15312445009920.0, "grad_norm": 2.008715148886521, "language_loss": 0.70356953, "learning_rate": 3.1358089512430303e-06, "loss": 0.72901905, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.2355957, "step": 5464, "time_per_iteration": 4.276487112045288 }, { "auxiliary_loss_clip": 0.01486362, "auxiliary_loss_mlp": 0.01044021, "balance_loss_clip": 1.29821944, "balance_loss_mlp": 1.02011919, "epoch": 0.32857357583045244, "flos": 23524221150720.0, "grad_norm": 1.963871477864965, "language_loss": 0.7294277, "learning_rate": 3.1354883659537594e-06, "loss": 0.75473154, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.23913574, "step": 5465, "time_per_iteration": 2.8993887901306152 }, { "auxiliary_loss_clip": 0.01499489, "auxiliary_loss_mlp": 0.01045149, "balance_loss_clip": 1.30689001, "balance_loss_mlp": 1.01993608, "epoch": 0.3286336990831204, "flos": 21004783660800.0, "grad_norm": 1.5148101675395698, "language_loss": 0.8334291, "learning_rate": 3.1351677376066567e-06, "loss": 0.85887551, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.2520752, "step": 5466, "time_per_iteration": 2.928316831588745 }, { "auxiliary_loss_clip": 0.01500226, "auxiliary_loss_mlp": 0.01039829, "balance_loss_clip": 1.30840063, "balance_loss_mlp": 1.01698804, "epoch": 0.32869382233578837, "flos": 23669113559040.0, "grad_norm": 2.3779425129734744, "language_loss": 0.79768419, "learning_rate": 3.134847066213879e-06, "loss": 0.82308471, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 1.91992188, "router_z_loss_mlp": 0.22827148, "step": 5467, "time_per_iteration": 2.9820942878723145 }, { "auxiliary_loss_clip": 0.01502633, "auxiliary_loss_mlp": 0.01041579, "balance_loss_clip": 1.30914712, "balance_loss_mlp": 1.01787972, "epoch": 0.32875394558845633, "flos": 25347094657920.0, "grad_norm": 1.6942416045187358, "language_loss": 0.75067806, "learning_rate": 3.134526351787587e-06, "loss": 0.77612019, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.23706055, "step": 5468, "time_per_iteration": 4.501052141189575 }, { "auxiliary_loss_clip": 0.01514672, "auxiliary_loss_mlp": 0.01049971, "balance_loss_clip": 1.31634426, "balance_loss_mlp": 1.02320814, "epoch": 0.3288140688411243, "flos": 14911366688640.0, "grad_norm": 8.4800559162438, "language_loss": 0.79307592, "learning_rate": 3.134205594339942e-06, "loss": 0.81872237, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 0.26757812, "step": 5469, "time_per_iteration": 2.8568835258483887 }, { "auxiliary_loss_clip": 0.01495726, "auxiliary_loss_mlp": 0.01043596, "balance_loss_clip": 1.30409932, "balance_loss_mlp": 1.02054048, "epoch": 0.32887419209379226, "flos": 18560461818240.0, "grad_norm": 1.7941850164777267, "language_loss": 0.82783198, "learning_rate": 3.133884793883107e-06, "loss": 0.85322529, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.23059082, "step": 5470, "time_per_iteration": 4.264109373092651 }, { "auxiliary_loss_clip": 0.01487198, "auxiliary_loss_mlp": 0.01045099, "balance_loss_clip": 1.29602516, "balance_loss_mlp": 1.02233005, "epoch": 0.3289343153464602, "flos": 48122602554240.0, "grad_norm": 1.9066202395115286, "language_loss": 0.68927217, "learning_rate": 3.1335639504292478e-06, "loss": 0.71459508, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.22790527, "step": 5471, "time_per_iteration": 3.119304895401001 }, { "auxiliary_loss_clip": 0.01518322, "auxiliary_loss_mlp": 0.01051874, "balance_loss_clip": 1.31959188, "balance_loss_mlp": 1.02608931, "epoch": 0.3289944385991282, "flos": 27611884558080.0, "grad_norm": 1.880925464627752, "language_loss": 0.65712148, "learning_rate": 3.1332430639905288e-06, "loss": 0.68282342, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 1.98730469, "router_z_loss_mlp": 0.25793457, "step": 5472, "time_per_iteration": 4.3394129276275635 }, { "auxiliary_loss_clip": 0.01499208, "auxiliary_loss_mlp": 0.01044078, "balance_loss_clip": 1.30488253, "balance_loss_mlp": 1.01876974, "epoch": 0.32905456185179616, "flos": 20129864100480.0, "grad_norm": 1.7598214677053028, "language_loss": 0.89464569, "learning_rate": 3.13292213457912e-06, "loss": 0.92007852, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.25292969, "step": 5473, "time_per_iteration": 2.909392833709717 }, { "auxiliary_loss_clip": 0.01500096, "auxiliary_loss_mlp": 0.01047548, "balance_loss_clip": 1.30540586, "balance_loss_mlp": 1.0214529, "epoch": 0.3291146851044642, "flos": 23189300006400.0, "grad_norm": 2.1565696749155565, "language_loss": 0.79576689, "learning_rate": 3.1326011622071903e-06, "loss": 0.82124329, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2611084, "step": 5474, "time_per_iteration": 2.862248182296753 }, { "auxiliary_loss_clip": 0.01286075, "auxiliary_loss_mlp": 0.01105278, "balance_loss_clip": 1.16967762, "balance_loss_mlp": 1.07399774, "epoch": 0.32917480835713214, "flos": 67652509599360.0, "grad_norm": 0.8428656893570078, "language_loss": 0.60251576, "learning_rate": 3.132280146886911e-06, "loss": 0.62642932, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 0.3125, "step": 5475, "time_per_iteration": 3.3473572731018066 }, { "auxiliary_loss_clip": 0.01510483, "auxiliary_loss_mlp": 0.01046858, "balance_loss_clip": 1.31252456, "balance_loss_mlp": 1.02060783, "epoch": 0.3292349316098001, "flos": 27976061105280.0, "grad_norm": 2.5338870786224694, "language_loss": 0.77492034, "learning_rate": 3.131959088630455e-06, "loss": 0.80049384, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 1.97753906, "router_z_loss_mlp": 0.26257324, "step": 5476, "time_per_iteration": 2.91557240486145 }, { "auxiliary_loss_clip": 0.01496337, "auxiliary_loss_mlp": 0.01048888, "balance_loss_clip": 1.30502999, "balance_loss_mlp": 1.0246172, "epoch": 0.3292950548624681, "flos": 20272584758400.0, "grad_norm": 1.9027148884680487, "language_loss": 0.75443137, "learning_rate": 3.131637987449997e-06, "loss": 0.77988362, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.24279785, "step": 5477, "time_per_iteration": 2.89992356300354 }, { "auxiliary_loss_clip": 0.01468334, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.28180289, "balance_loss_mlp": 1.01506388, "epoch": 0.32935517811513604, "flos": 20822582275200.0, "grad_norm": 2.620283962676017, "language_loss": 0.76777864, "learning_rate": 3.131316843357713e-06, "loss": 0.79284889, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.23620605, "step": 5478, "time_per_iteration": 2.86607027053833 }, { "auxiliary_loss_clip": 0.01483455, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 1.29631531, "balance_loss_mlp": 1.0150193, "epoch": 0.329415301367804, "flos": 18450932860800.0, "grad_norm": 1.732896632645341, "language_loss": 0.81192327, "learning_rate": 3.1309956563657807e-06, "loss": 0.83714044, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.23266602, "step": 5479, "time_per_iteration": 2.8352530002593994 }, { "auxiliary_loss_clip": 0.01278669, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.16016567, "balance_loss_mlp": 1.02249944, "epoch": 0.32947542462047197, "flos": 66357074436480.0, "grad_norm": 0.7465146669616054, "language_loss": 0.5656755, "learning_rate": 3.1306744264863804e-06, "loss": 0.58892179, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.234375, "step": 5480, "time_per_iteration": 3.443718910217285 }, { "auxiliary_loss_clip": 0.01489035, "auxiliary_loss_mlp": 0.01048398, "balance_loss_clip": 1.29775238, "balance_loss_mlp": 1.02366245, "epoch": 0.32953554787313993, "flos": 23231947870080.0, "grad_norm": 1.7748061346057895, "language_loss": 0.78535223, "learning_rate": 3.1303531537316915e-06, "loss": 0.81072664, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.24719238, "step": 5481, "time_per_iteration": 2.8862950801849365 }, { "auxiliary_loss_clip": 0.01497996, "auxiliary_loss_mlp": 0.01046617, "balance_loss_clip": 1.30388677, "balance_loss_mlp": 1.0228585, "epoch": 0.3295956711258079, "flos": 27019103443200.0, "grad_norm": 1.7678688295761356, "language_loss": 0.78698111, "learning_rate": 3.130031838113899e-06, "loss": 0.81242728, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.2376709, "step": 5482, "time_per_iteration": 2.9602763652801514 }, { "auxiliary_loss_clip": 0.01507473, "auxiliary_loss_mlp": 0.01051028, "balance_loss_clip": 1.31193137, "balance_loss_mlp": 1.02682805, "epoch": 0.32965579437847586, "flos": 19181005257600.0, "grad_norm": 1.733154641005759, "language_loss": 0.74982369, "learning_rate": 3.129710479645185e-06, "loss": 0.77540863, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 1.95605469, "router_z_loss_mlp": 0.24206543, "step": 5483, "time_per_iteration": 2.959923028945923 }, { "auxiliary_loss_clip": 0.01492424, "auxiliary_loss_mlp": 0.01051287, "balance_loss_clip": 1.3003366, "balance_loss_mlp": 1.02705216, "epoch": 0.32971591763114383, "flos": 30494819923200.0, "grad_norm": 1.6544760745359672, "language_loss": 0.76297891, "learning_rate": 3.1293890783377366e-06, "loss": 0.78841603, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.2421875, "step": 5484, "time_per_iteration": 2.946425437927246 }, { "auxiliary_loss_clip": 0.01491026, "auxiliary_loss_mlp": 0.01052601, "balance_loss_clip": 1.29965031, "balance_loss_mlp": 1.02802038, "epoch": 0.3297760408838118, "flos": 16298205626880.0, "grad_norm": 1.9354436082571662, "language_loss": 0.7288326, "learning_rate": 3.129067634203742e-06, "loss": 0.75426888, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.24621582, "step": 5485, "time_per_iteration": 2.8884856700897217 }, { "auxiliary_loss_clip": 0.01488261, "auxiliary_loss_mlp": 0.01049583, "balance_loss_clip": 1.3004024, "balance_loss_mlp": 1.02609921, "epoch": 0.32983616413647976, "flos": 29542205761920.0, "grad_norm": 1.6868574425659706, "language_loss": 0.81224537, "learning_rate": 3.128746147255388e-06, "loss": 0.83762378, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.23498535, "step": 5486, "time_per_iteration": 2.9129762649536133 }, { "auxiliary_loss_clip": 0.01502015, "auxiliary_loss_mlp": 0.01051608, "balance_loss_clip": 1.31204295, "balance_loss_mlp": 1.02856517, "epoch": 0.3298962873891478, "flos": 20641150051200.0, "grad_norm": 8.241651619497196, "language_loss": 0.85246831, "learning_rate": 3.1284246175048683e-06, "loss": 0.87800455, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.23046875, "step": 5487, "time_per_iteration": 2.8778603076934814 }, { "auxiliary_loss_clip": 0.01510028, "auxiliary_loss_mlp": 0.01056481, "balance_loss_clip": 1.31499732, "balance_loss_mlp": 1.03150654, "epoch": 0.32995641064181574, "flos": 14984265340800.0, "grad_norm": 2.249790025290847, "language_loss": 0.75569022, "learning_rate": 3.1281030449643735e-06, "loss": 0.78135526, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.24987793, "step": 5488, "time_per_iteration": 3.04264235496521 }, { "auxiliary_loss_clip": 0.01497044, "auxiliary_loss_mlp": 0.01054813, "balance_loss_clip": 1.30467916, "balance_loss_mlp": 1.03016019, "epoch": 0.3300165338944837, "flos": 18670759937280.0, "grad_norm": 2.43678389434333, "language_loss": 0.72985041, "learning_rate": 3.127781429646098e-06, "loss": 0.75536901, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.2467041, "step": 5489, "time_per_iteration": 3.0880839824676514 }, { "auxiliary_loss_clip": 0.01499688, "auxiliary_loss_mlp": 0.01044579, "balance_loss_clip": 1.30891347, "balance_loss_mlp": 1.02104664, "epoch": 0.3300766571471517, "flos": 25592240839680.0, "grad_norm": 4.789586345656304, "language_loss": 0.90425789, "learning_rate": 3.127459771562238e-06, "loss": 0.92970061, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.23547363, "step": 5490, "time_per_iteration": 2.8745641708374023 }, { "auxiliary_loss_clip": 0.01482434, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.29275775, "balance_loss_mlp": 1.02077377, "epoch": 0.33013678039981964, "flos": 11370714641280.0, "grad_norm": 2.1894375104112704, "language_loss": 0.83969396, "learning_rate": 3.1271380707249907e-06, "loss": 0.86495715, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.2310791, "step": 5491, "time_per_iteration": 2.864011287689209 }, { "auxiliary_loss_clip": 0.01495683, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.30456567, "balance_loss_mlp": 1.02271116, "epoch": 0.3301969036524876, "flos": 24830469820800.0, "grad_norm": 2.083382881281837, "language_loss": 0.77865267, "learning_rate": 3.126816327146554e-06, "loss": 0.80407822, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.24169922, "step": 5492, "time_per_iteration": 2.9445090293884277 }, { "auxiliary_loss_clip": 0.01502609, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.30656552, "balance_loss_mlp": 1.02084303, "epoch": 0.33025702690515557, "flos": 15969663999360.0, "grad_norm": 3.5862048078061775, "language_loss": 0.76565647, "learning_rate": 3.12649454083913e-06, "loss": 0.79113257, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 1.9609375, "router_z_loss_mlp": 0.24133301, "step": 5493, "time_per_iteration": 2.899712085723877 }, { "auxiliary_loss_clip": 0.01276001, "auxiliary_loss_mlp": 0.01055001, "balance_loss_clip": 1.16355443, "balance_loss_mlp": 1.02372038, "epoch": 0.33031715015782354, "flos": 59446298568960.0, "grad_norm": 0.7896118684436624, "language_loss": 0.53931975, "learning_rate": 3.12617271181492e-06, "loss": 0.56262982, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.3125, "step": 5494, "time_per_iteration": 3.333669424057007 }, { "auxiliary_loss_clip": 0.01505908, "auxiliary_loss_mlp": 0.01045205, "balance_loss_clip": 1.31280756, "balance_loss_mlp": 1.021029, "epoch": 0.3303772734104915, "flos": 23194503158400.0, "grad_norm": 1.880385432093056, "language_loss": 0.87636709, "learning_rate": 3.1258508400861276e-06, "loss": 0.90187824, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.24194336, "step": 5495, "time_per_iteration": 2.9754135608673096 }, { "auxiliary_loss_clip": 0.01502848, "auxiliary_loss_mlp": 0.01048063, "balance_loss_clip": 1.30751669, "balance_loss_mlp": 1.02097821, "epoch": 0.33043739666315947, "flos": 33084577111680.0, "grad_norm": 1.9749240192229451, "language_loss": 0.74816823, "learning_rate": 3.1255289256649587e-06, "loss": 0.77367735, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 0.27099609, "step": 5496, "time_per_iteration": 2.9954674243927 }, { "auxiliary_loss_clip": 0.01491699, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.30207264, "balance_loss_mlp": 1.0175724, "epoch": 0.33049751991582743, "flos": 24905042530560.0, "grad_norm": 2.180883293504036, "language_loss": 0.73512149, "learning_rate": 3.1252069685636196e-06, "loss": 0.76045656, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.2421875, "step": 5497, "time_per_iteration": 2.9547500610351562 }, { "auxiliary_loss_clip": 0.0149804, "auxiliary_loss_mlp": 0.01040791, "balance_loss_clip": 1.30863428, "balance_loss_mlp": 1.01723492, "epoch": 0.3305576431684954, "flos": 29472519490560.0, "grad_norm": 2.2325434249205074, "language_loss": 0.82092321, "learning_rate": 3.124884968794321e-06, "loss": 0.84631151, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.2355957, "step": 5498, "time_per_iteration": 2.9362800121307373 }, { "auxiliary_loss_clip": 0.01501008, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.30696726, "balance_loss_mlp": 1.01760364, "epoch": 0.33061776642116336, "flos": 22641067036800.0, "grad_norm": 2.040336272019396, "language_loss": 0.7666803, "learning_rate": 3.12456292636927e-06, "loss": 0.79211462, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.24829102, "step": 5499, "time_per_iteration": 2.884650230407715 }, { "auxiliary_loss_clip": 0.01494068, "auxiliary_loss_mlp": 0.0103801, "balance_loss_clip": 1.30261469, "balance_loss_mlp": 1.01418042, "epoch": 0.3306778896738313, "flos": 25787517972480.0, "grad_norm": 1.541483116706843, "language_loss": 0.7957356, "learning_rate": 3.124240841300681e-06, "loss": 0.82105637, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.23815918, "step": 5500, "time_per_iteration": 4.288754940032959 }, { "auxiliary_loss_clip": 0.01505524, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.31191134, "balance_loss_mlp": 1.01578498, "epoch": 0.33073801292649935, "flos": 36954539948160.0, "grad_norm": 2.236103787474974, "language_loss": 0.67284137, "learning_rate": 3.1239187136007665e-06, "loss": 0.69829714, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 1.9375, "router_z_loss_mlp": 0.24279785, "step": 5501, "time_per_iteration": 3.0692596435546875 }, { "auxiliary_loss_clip": 0.01501561, "auxiliary_loss_mlp": 0.01049527, "balance_loss_clip": 1.30891538, "balance_loss_mlp": 1.02415919, "epoch": 0.3307981361791673, "flos": 12975073171200.0, "grad_norm": 2.479691276403042, "language_loss": 0.78425163, "learning_rate": 3.1235965432817417e-06, "loss": 0.80976248, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.25390625, "step": 5502, "time_per_iteration": 4.217082977294922 }, { "auxiliary_loss_clip": 0.01511523, "auxiliary_loss_mlp": 0.01045956, "balance_loss_clip": 1.31814432, "balance_loss_mlp": 1.02086282, "epoch": 0.3308582594318353, "flos": 25385308531200.0, "grad_norm": 1.7335875634098998, "language_loss": 0.73224354, "learning_rate": 3.123274330355824e-06, "loss": 0.75781834, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.25134277, "step": 5503, "time_per_iteration": 2.909001350402832 }, { "auxiliary_loss_clip": 0.01486228, "auxiliary_loss_mlp": 0.01043275, "balance_loss_clip": 1.29525316, "balance_loss_mlp": 1.01869452, "epoch": 0.33091838268450324, "flos": 26479693209600.0, "grad_norm": 2.8003981361778303, "language_loss": 0.75608897, "learning_rate": 3.12295207483523e-06, "loss": 0.78138399, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.24584961, "step": 5504, "time_per_iteration": 2.9777297973632812 }, { "auxiliary_loss_clip": 0.0148874, "auxiliary_loss_mlp": 0.01045823, "balance_loss_clip": 1.29814315, "balance_loss_mlp": 1.02324462, "epoch": 0.3309785059371712, "flos": 24981515521920.0, "grad_norm": 1.6332632477629518, "language_loss": 0.70753288, "learning_rate": 3.1226297767321816e-06, "loss": 0.73287851, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.22595215, "step": 5505, "time_per_iteration": 4.339970827102661 }, { "auxiliary_loss_clip": 0.01493398, "auxiliary_loss_mlp": 0.01052199, "balance_loss_clip": 1.30413222, "balance_loss_mlp": 1.02866721, "epoch": 0.3310386291898392, "flos": 20455826774400.0, "grad_norm": 1.8889255308854678, "language_loss": 0.82462907, "learning_rate": 3.122307436058899e-06, "loss": 0.85008502, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.23522949, "step": 5506, "time_per_iteration": 2.9033901691436768 }, { "auxiliary_loss_clip": 0.01496619, "auxiliary_loss_mlp": 0.0104632, "balance_loss_clip": 1.30488026, "balance_loss_mlp": 1.02225208, "epoch": 0.33109875244250714, "flos": 23192376652800.0, "grad_norm": 1.9358742727891696, "language_loss": 0.79731125, "learning_rate": 3.121985052827606e-06, "loss": 0.82274067, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.24084473, "step": 5507, "time_per_iteration": 2.895332098007202 }, { "auxiliary_loss_clip": 0.01491163, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.30039024, "balance_loss_mlp": 1.02556884, "epoch": 0.3311588756951751, "flos": 24178499228160.0, "grad_norm": 2.023297746423264, "language_loss": 0.72484231, "learning_rate": 3.1216626270505274e-06, "loss": 0.75025165, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.2421875, "step": 5508, "time_per_iteration": 4.273925065994263 }, { "auxiliary_loss_clip": 0.01482953, "auxiliary_loss_mlp": 0.01042083, "balance_loss_clip": 1.29661322, "balance_loss_mlp": 1.02001774, "epoch": 0.33121899894784307, "flos": 28156995636480.0, "grad_norm": 2.379950796173046, "language_loss": 0.72565031, "learning_rate": 3.12134015873989e-06, "loss": 0.75090075, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.22070312, "step": 5509, "time_per_iteration": 2.888230562210083 }, { "auxiliary_loss_clip": 0.01488687, "auxiliary_loss_mlp": 0.01041217, "balance_loss_clip": 1.29951489, "balance_loss_mlp": 1.01837599, "epoch": 0.33127912220051103, "flos": 29579017046400.0, "grad_norm": 1.8181885874476666, "language_loss": 0.73830944, "learning_rate": 3.121017647907921e-06, "loss": 0.76360852, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.2286377, "step": 5510, "time_per_iteration": 2.955430746078491 }, { "auxiliary_loss_clip": 0.01481679, "auxiliary_loss_mlp": 0.01046051, "balance_loss_clip": 1.2925874, "balance_loss_mlp": 1.02305532, "epoch": 0.331339245453179, "flos": 14436982512000.0, "grad_norm": 2.0671870588529235, "language_loss": 0.88013697, "learning_rate": 3.1206950945668508e-06, "loss": 0.90541428, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.22998047, "step": 5511, "time_per_iteration": 2.811195135116577 }, { "auxiliary_loss_clip": 0.01467956, "auxiliary_loss_mlp": 0.01042937, "balance_loss_clip": 1.28639698, "balance_loss_mlp": 1.02053738, "epoch": 0.33139936870584696, "flos": 20897019250560.0, "grad_norm": 1.7232927297574487, "language_loss": 0.7416966, "learning_rate": 3.12037249872891e-06, "loss": 0.76680553, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.22387695, "step": 5512, "time_per_iteration": 2.9063351154327393 }, { "auxiliary_loss_clip": 0.01472882, "auxiliary_loss_mlp": 0.01046845, "balance_loss_clip": 1.28642035, "balance_loss_mlp": 1.02145314, "epoch": 0.33145949195851493, "flos": 36298316344320.0, "grad_norm": 2.9305704900878404, "language_loss": 0.74027205, "learning_rate": 3.1200498604063317e-06, "loss": 0.76546925, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.25402832, "step": 5513, "time_per_iteration": 2.9898509979248047 }, { "auxiliary_loss_clip": 0.01495163, "auxiliary_loss_mlp": 0.01044604, "balance_loss_clip": 1.30242133, "balance_loss_mlp": 1.02088094, "epoch": 0.33151961521118295, "flos": 14287022686080.0, "grad_norm": 1.8678627210777785, "language_loss": 0.69892496, "learning_rate": 3.1197271796113507e-06, "loss": 0.72432268, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.23730469, "step": 5514, "time_per_iteration": 2.8323986530303955 }, { "auxiliary_loss_clip": 0.01497084, "auxiliary_loss_mlp": 0.01054692, "balance_loss_clip": 1.3045311, "balance_loss_mlp": 1.02986073, "epoch": 0.3315797384638509, "flos": 20783327771520.0, "grad_norm": 4.1184658548474795, "language_loss": 0.67573577, "learning_rate": 3.1194044563562026e-06, "loss": 0.70125353, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.24841309, "step": 5515, "time_per_iteration": 2.8454108238220215 }, { "auxiliary_loss_clip": 0.01506806, "auxiliary_loss_mlp": 0.0104311, "balance_loss_clip": 1.31332612, "balance_loss_mlp": 1.01919687, "epoch": 0.3316398617165189, "flos": 24689785178880.0, "grad_norm": 3.5017898227395383, "language_loss": 0.69713706, "learning_rate": 3.1190816906531257e-06, "loss": 0.72263616, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.23913574, "step": 5516, "time_per_iteration": 2.903982639312744 }, { "auxiliary_loss_clip": 0.01499634, "auxiliary_loss_mlp": 0.01046803, "balance_loss_clip": 1.30555868, "balance_loss_mlp": 1.02380705, "epoch": 0.33169998496918685, "flos": 18597137368320.0, "grad_norm": 2.364697395287522, "language_loss": 0.81609118, "learning_rate": 3.118758882514359e-06, "loss": 0.84155554, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.22998047, "step": 5517, "time_per_iteration": 2.854605197906494 }, { "auxiliary_loss_clip": 0.01465222, "auxiliary_loss_mlp": 0.01044385, "balance_loss_clip": 1.2820487, "balance_loss_mlp": 1.02169919, "epoch": 0.3317601082218548, "flos": 20203305690240.0, "grad_norm": 1.66165754560778, "language_loss": 0.75290138, "learning_rate": 3.118436031952143e-06, "loss": 0.77799743, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.22680664, "step": 5518, "time_per_iteration": 2.852267265319824 }, { "auxiliary_loss_clip": 0.01287218, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.17178583, "balance_loss_mlp": 1.01699376, "epoch": 0.3318202314745228, "flos": 69006383061120.0, "grad_norm": 0.625001371018603, "language_loss": 0.5432682, "learning_rate": 3.1181131389787206e-06, "loss": 0.56655157, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.24121094, "step": 5519, "time_per_iteration": 3.5325875282287598 }, { "auxiliary_loss_clip": 0.01488704, "auxiliary_loss_mlp": 0.01041171, "balance_loss_clip": 1.30038309, "balance_loss_mlp": 1.01783013, "epoch": 0.33188035472719074, "flos": 21508287505920.0, "grad_norm": 2.8777742518074074, "language_loss": 0.7930547, "learning_rate": 3.117790203606336e-06, "loss": 0.81835347, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.23327637, "step": 5520, "time_per_iteration": 2.876889944076538 }, { "auxiliary_loss_clip": 0.01478694, "auxiliary_loss_mlp": 0.01043543, "balance_loss_clip": 1.29201329, "balance_loss_mlp": 1.0209409, "epoch": 0.3319404779798587, "flos": 28880643271680.0, "grad_norm": 1.8334972481450111, "language_loss": 0.76647294, "learning_rate": 3.1174672258472344e-06, "loss": 0.79169536, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.22595215, "step": 5521, "time_per_iteration": 2.9918320178985596 }, { "auxiliary_loss_clip": 0.01492576, "auxiliary_loss_mlp": 0.01046591, "balance_loss_clip": 1.30200219, "balance_loss_mlp": 1.02220023, "epoch": 0.33200060123252667, "flos": 23087553154560.0, "grad_norm": 2.1045297355758326, "language_loss": 0.71247727, "learning_rate": 3.117144205713664e-06, "loss": 0.73786896, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.24365234, "step": 5522, "time_per_iteration": 3.0218424797058105 }, { "auxiliary_loss_clip": 0.01480911, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.29337335, "balance_loss_mlp": 1.02445269, "epoch": 0.33206072448519464, "flos": 21152436001920.0, "grad_norm": 1.8519489448444226, "language_loss": 0.74915779, "learning_rate": 3.1168211432178735e-06, "loss": 0.77443862, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.22741699, "step": 5523, "time_per_iteration": 2.917095422744751 }, { "auxiliary_loss_clip": 0.01479266, "auxiliary_loss_mlp": 0.01041402, "balance_loss_clip": 1.29230487, "balance_loss_mlp": 1.01738131, "epoch": 0.3321208477378626, "flos": 13086954858240.0, "grad_norm": 1.8375580380587146, "language_loss": 0.82312369, "learning_rate": 3.116498038372114e-06, "loss": 0.84833038, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.2401123, "step": 5524, "time_per_iteration": 2.8747291564941406 }, { "auxiliary_loss_clip": 0.01479688, "auxiliary_loss_mlp": 0.01041446, "balance_loss_clip": 1.29284048, "balance_loss_mlp": 1.01771164, "epoch": 0.33218097099053057, "flos": 21225379898880.0, "grad_norm": 1.5949591998226134, "language_loss": 0.83633453, "learning_rate": 3.116174891188636e-06, "loss": 0.86154586, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.23730469, "step": 5525, "time_per_iteration": 2.865344762802124 }, { "auxiliary_loss_clip": 0.01278135, "auxiliary_loss_mlp": 0.01019595, "balance_loss_clip": 1.16222978, "balance_loss_mlp": 0.99775618, "epoch": 0.33224109424319853, "flos": 64381526415360.0, "grad_norm": 0.7724121775373993, "language_loss": 0.52711022, "learning_rate": 3.1158517016796945e-06, "loss": 0.55008751, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.21875, "step": 5526, "time_per_iteration": 3.3488171100616455 }, { "auxiliary_loss_clip": 0.01497189, "auxiliary_loss_mlp": 0.01040799, "balance_loss_clip": 1.3044039, "balance_loss_mlp": 1.01777995, "epoch": 0.33230121749586655, "flos": 17354466921600.0, "grad_norm": 2.061833455328111, "language_loss": 0.78849041, "learning_rate": 3.1155284698575445e-06, "loss": 0.81387031, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.23046875, "step": 5527, "time_per_iteration": 2.829340934753418 }, { "auxiliary_loss_clip": 0.01482088, "auxiliary_loss_mlp": 0.01043178, "balance_loss_clip": 1.29509449, "balance_loss_mlp": 1.01993227, "epoch": 0.3323613407485345, "flos": 21007091145600.0, "grad_norm": 2.019821475984275, "language_loss": 0.73710847, "learning_rate": 3.1152051957344434e-06, "loss": 0.76236117, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.23266602, "step": 5528, "time_per_iteration": 2.8849806785583496 }, { "auxiliary_loss_clip": 0.0148844, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.29845464, "balance_loss_mlp": 1.01879835, "epoch": 0.3324214640012025, "flos": 13160532182400.0, "grad_norm": 4.183396210935699, "language_loss": 0.84072047, "learning_rate": 3.1148818793226497e-06, "loss": 0.86601996, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.22705078, "step": 5529, "time_per_iteration": 2.87052059173584 }, { "auxiliary_loss_clip": 0.01507956, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.31383991, "balance_loss_mlp": 1.01750708, "epoch": 0.33248158725387045, "flos": 22283767699200.0, "grad_norm": 1.9643627269463968, "language_loss": 0.70735854, "learning_rate": 3.114558520634423e-06, "loss": 0.73284596, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 1.94140625, "router_z_loss_mlp": 0.23291016, "step": 5530, "time_per_iteration": 2.8942863941192627 }, { "auxiliary_loss_clip": 0.01495373, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.30343556, "balance_loss_mlp": 1.02107382, "epoch": 0.3325417105065384, "flos": 20750814743040.0, "grad_norm": 2.522753677392257, "language_loss": 0.76969457, "learning_rate": 3.1142351196820256e-06, "loss": 0.795093, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.23400879, "step": 5531, "time_per_iteration": 2.943782091140747 }, { "auxiliary_loss_clip": 0.01503949, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.31108522, "balance_loss_mlp": 1.02063251, "epoch": 0.3326018337592064, "flos": 24801304907520.0, "grad_norm": 1.9099913082059983, "language_loss": 0.73881197, "learning_rate": 3.1139116764777206e-06, "loss": 0.76429582, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.23803711, "step": 5532, "time_per_iteration": 2.975524663925171 }, { "auxiliary_loss_clip": 0.01483334, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.29458642, "balance_loss_mlp": 1.01165891, "epoch": 0.33266195701187434, "flos": 14509835919360.0, "grad_norm": 1.868930217584913, "language_loss": 0.67013073, "learning_rate": 3.1135881910337735e-06, "loss": 0.69529474, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.21411133, "step": 5533, "time_per_iteration": 2.843625068664551 }, { "auxiliary_loss_clip": 0.01486225, "auxiliary_loss_mlp": 0.01038206, "balance_loss_clip": 1.29763329, "balance_loss_mlp": 1.01522255, "epoch": 0.3327220802645423, "flos": 15312761723520.0, "grad_norm": 1.75535778742511, "language_loss": 0.71995223, "learning_rate": 3.113264663362451e-06, "loss": 0.74519658, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.2298584, "step": 5534, "time_per_iteration": 4.282532215118408 }, { "auxiliary_loss_clip": 0.01490755, "auxiliary_loss_mlp": 0.01044204, "balance_loss_clip": 1.30102086, "balance_loss_mlp": 1.01947999, "epoch": 0.3327822035172103, "flos": 23488088538240.0, "grad_norm": 2.043015297970604, "language_loss": 0.67278391, "learning_rate": 3.1129410934760204e-06, "loss": 0.69813347, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.24719238, "step": 5535, "time_per_iteration": 2.907738208770752 }, { "auxiliary_loss_clip": 0.01488351, "auxiliary_loss_mlp": 0.0103818, "balance_loss_clip": 1.29771948, "balance_loss_mlp": 1.01568508, "epoch": 0.33284232676987824, "flos": 25385489510400.0, "grad_norm": 2.182666457450704, "language_loss": 0.73486614, "learning_rate": 3.1126174813867517e-06, "loss": 0.76013142, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.22509766, "step": 5536, "time_per_iteration": 2.896592378616333 }, { "auxiliary_loss_clip": 0.01478651, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.28908837, "balance_loss_mlp": 1.01353681, "epoch": 0.3329024500225462, "flos": 23704431765120.0, "grad_norm": 1.5226506075550612, "language_loss": 0.82123673, "learning_rate": 3.112293827106917e-06, "loss": 0.84638774, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.22900391, "step": 5537, "time_per_iteration": 4.2844343185424805 }, { "auxiliary_loss_clip": 0.01497234, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.3044548, "balance_loss_mlp": 1.01685321, "epoch": 0.33296257327521417, "flos": 31735816312320.0, "grad_norm": 1.7630378477576538, "language_loss": 0.72491419, "learning_rate": 3.111970130648789e-06, "loss": 0.75028014, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.22497559, "step": 5538, "time_per_iteration": 2.9599335193634033 }, { "auxiliary_loss_clip": 0.0147901, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.29199791, "balance_loss_mlp": 1.01229382, "epoch": 0.33302269652788213, "flos": 22754260823040.0, "grad_norm": 1.7795640962343158, "language_loss": 0.75359017, "learning_rate": 3.1116463920246424e-06, "loss": 0.77873516, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.23205566, "step": 5539, "time_per_iteration": 2.8669655323028564 }, { "auxiliary_loss_clip": 0.01511241, "auxiliary_loss_mlp": 0.01044819, "balance_loss_clip": 1.31547916, "balance_loss_mlp": 1.02220452, "epoch": 0.33308281978055015, "flos": 11481193739520.0, "grad_norm": 2.387810449503002, "language_loss": 0.72154963, "learning_rate": 3.1113226112467527e-06, "loss": 0.74711025, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.22619629, "step": 5540, "time_per_iteration": 4.2471396923065186 }, { "auxiliary_loss_clip": 0.01486203, "auxiliary_loss_mlp": 0.01038128, "balance_loss_clip": 1.29683423, "balance_loss_mlp": 1.01606274, "epoch": 0.3331429430332181, "flos": 38227008735360.0, "grad_norm": 1.6533121805799553, "language_loss": 0.61735594, "learning_rate": 3.1109987883273983e-06, "loss": 0.64259923, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.22045898, "step": 5541, "time_per_iteration": 3.063832998275757 }, { "auxiliary_loss_clip": 0.0149539, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.30460024, "balance_loss_mlp": 1.02063739, "epoch": 0.3332030662858861, "flos": 22538912981760.0, "grad_norm": 1.8274586290128774, "language_loss": 0.69574612, "learning_rate": 3.1106749232788584e-06, "loss": 0.72114718, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.24108887, "step": 5542, "time_per_iteration": 3.0555944442749023 }, { "auxiliary_loss_clip": 0.01500524, "auxiliary_loss_mlp": 0.0105525, "balance_loss_clip": 1.30947137, "balance_loss_mlp": 1.0315156, "epoch": 0.33326318953855405, "flos": 16006339549440.0, "grad_norm": 1.6511774841213918, "language_loss": 0.75772238, "learning_rate": 3.110351016113414e-06, "loss": 0.78328013, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.23742676, "step": 5543, "time_per_iteration": 4.235898733139038 }, { "auxiliary_loss_clip": 0.01504277, "auxiliary_loss_mlp": 0.01048608, "balance_loss_clip": 1.31070852, "balance_loss_mlp": 1.02645898, "epoch": 0.333323312791222, "flos": 25604592670080.0, "grad_norm": 1.6575317564310843, "language_loss": 0.76131165, "learning_rate": 3.110027066843348e-06, "loss": 0.78684056, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.22143555, "step": 5544, "time_per_iteration": 2.919095039367676 }, { "auxiliary_loss_clip": 0.01489388, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.30156946, "balance_loss_mlp": 1.02392173, "epoch": 0.33338343604389, "flos": 25130570451840.0, "grad_norm": 1.8448209776438744, "language_loss": 0.71702027, "learning_rate": 3.1097030754809456e-06, "loss": 0.74237633, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.22302246, "step": 5545, "time_per_iteration": 2.882535934448242 }, { "auxiliary_loss_clip": 0.01494443, "auxiliary_loss_mlp": 0.01049145, "balance_loss_clip": 1.30686164, "balance_loss_mlp": 1.02706802, "epoch": 0.33344355929655795, "flos": 16956329512320.0, "grad_norm": 2.7145273840593704, "language_loss": 0.70071411, "learning_rate": 3.1093790420384894e-06, "loss": 0.72614992, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.22058105, "step": 5546, "time_per_iteration": 2.845655918121338 }, { "auxiliary_loss_clip": 0.0150527, "auxiliary_loss_mlp": 0.01052443, "balance_loss_clip": 1.31169844, "balance_loss_mlp": 1.03055668, "epoch": 0.3335036825492259, "flos": 27900583499520.0, "grad_norm": 2.459649323264281, "language_loss": 0.65905219, "learning_rate": 3.1090549665282702e-06, "loss": 0.68462932, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.21887207, "step": 5547, "time_per_iteration": 2.901700496673584 }, { "auxiliary_loss_clip": 0.01499758, "auxiliary_loss_mlp": 0.01042489, "balance_loss_clip": 1.3097769, "balance_loss_mlp": 1.0221045, "epoch": 0.3335638058018939, "flos": 16187997997440.0, "grad_norm": 2.091477255296078, "language_loss": 0.86510539, "learning_rate": 3.1087308489625742e-06, "loss": 0.89052784, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.20385742, "step": 5548, "time_per_iteration": 2.8573436737060547 }, { "auxiliary_loss_clip": 0.01501402, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.30882096, "balance_loss_mlp": 1.02962375, "epoch": 0.33362392905456184, "flos": 39910916903040.0, "grad_norm": 1.8280044371970818, "language_loss": 0.74958384, "learning_rate": 3.1084066893536945e-06, "loss": 0.77512294, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.22888184, "step": 5549, "time_per_iteration": 3.0063424110412598 }, { "auxiliary_loss_clip": 0.01496851, "auxiliary_loss_mlp": 0.01051362, "balance_loss_clip": 1.30551779, "balance_loss_mlp": 1.0285219, "epoch": 0.3336840523072298, "flos": 44286329111040.0, "grad_norm": 1.9667321875173644, "language_loss": 0.69643283, "learning_rate": 3.108082487713921e-06, "loss": 0.72191495, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.22814941, "step": 5550, "time_per_iteration": 3.0446786880493164 }, { "auxiliary_loss_clip": 0.01506906, "auxiliary_loss_mlp": 0.0104897, "balance_loss_clip": 1.31551969, "balance_loss_mlp": 1.02739275, "epoch": 0.33374417555989777, "flos": 15094020522240.0, "grad_norm": 2.802139785023421, "language_loss": 0.60778379, "learning_rate": 3.1077582440555495e-06, "loss": 0.6333425, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.21582031, "step": 5551, "time_per_iteration": 2.8327903747558594 }, { "auxiliary_loss_clip": 0.0148695, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.29869092, "balance_loss_mlp": 1.01934636, "epoch": 0.33380429881256574, "flos": 15857013150720.0, "grad_norm": 1.7387355789424657, "language_loss": 0.71453953, "learning_rate": 3.1074339583908746e-06, "loss": 0.73982608, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22338867, "step": 5552, "time_per_iteration": 2.912032127380371 }, { "auxiliary_loss_clip": 0.01498259, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.3096348, "balance_loss_mlp": 1.02033186, "epoch": 0.33386442206523376, "flos": 13488259403520.0, "grad_norm": 1.9444661959562608, "language_loss": 0.83840346, "learning_rate": 3.107109630732192e-06, "loss": 0.86380965, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.22045898, "step": 5553, "time_per_iteration": 2.8933253288269043 }, { "auxiliary_loss_clip": 0.01509694, "auxiliary_loss_mlp": 0.01046874, "balance_loss_clip": 1.31905437, "balance_loss_mlp": 1.02421284, "epoch": 0.3339245453179017, "flos": 16699329192960.0, "grad_norm": 11.370747445795025, "language_loss": 0.82088095, "learning_rate": 3.1067852610918017e-06, "loss": 0.84644669, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22668457, "step": 5554, "time_per_iteration": 2.8687338829040527 }, { "auxiliary_loss_clip": 0.01502762, "auxiliary_loss_mlp": 0.0104202, "balance_loss_clip": 1.31067967, "balance_loss_mlp": 1.01982355, "epoch": 0.3339846685705697, "flos": 24621999189120.0, "grad_norm": 1.5088948886781604, "language_loss": 0.82665765, "learning_rate": 3.1064608494820032e-06, "loss": 0.8521055, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.22192383, "step": 5555, "time_per_iteration": 2.9074978828430176 }, { "auxiliary_loss_clip": 0.01497614, "auxiliary_loss_mlp": 0.01042259, "balance_loss_clip": 1.3073554, "balance_loss_mlp": 1.01999044, "epoch": 0.33404479182323765, "flos": 30965448781440.0, "grad_norm": 2.0141091886374647, "language_loss": 0.75388616, "learning_rate": 3.106136395915099e-06, "loss": 0.77928489, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22277832, "step": 5556, "time_per_iteration": 2.9525091648101807 }, { "auxiliary_loss_clip": 0.01481267, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.29508436, "balance_loss_mlp": 1.0135926, "epoch": 0.3341049150759056, "flos": 23523678213120.0, "grad_norm": 1.897365274272503, "language_loss": 0.83075196, "learning_rate": 3.105811900403391e-06, "loss": 0.85590643, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.20605469, "step": 5557, "time_per_iteration": 2.875737190246582 }, { "auxiliary_loss_clip": 0.01501702, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.31147122, "balance_loss_mlp": 1.01685715, "epoch": 0.3341650383285736, "flos": 24038040810240.0, "grad_norm": 1.5438172767335567, "language_loss": 0.81180692, "learning_rate": 3.1054873629591855e-06, "loss": 0.83721608, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22387695, "step": 5558, "time_per_iteration": 2.9527387619018555 }, { "auxiliary_loss_clip": 0.01502775, "auxiliary_loss_mlp": 0.01045181, "balance_loss_clip": 1.31183112, "balance_loss_mlp": 1.02406895, "epoch": 0.33422516158124155, "flos": 24913277084160.0, "grad_norm": 2.179234405352386, "language_loss": 0.82373852, "learning_rate": 3.105162783594788e-06, "loss": 0.84921813, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.21105957, "step": 5559, "time_per_iteration": 2.88466477394104 }, { "auxiliary_loss_clip": 0.01488411, "auxiliary_loss_mlp": 0.01044246, "balance_loss_clip": 1.30213356, "balance_loss_mlp": 1.02097607, "epoch": 0.3342852848339095, "flos": 18342716002560.0, "grad_norm": 1.9147359598255298, "language_loss": 0.72302079, "learning_rate": 3.1048381623225074e-06, "loss": 0.74834728, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.23242188, "step": 5560, "time_per_iteration": 2.860154151916504 }, { "auxiliary_loss_clip": 0.01512727, "auxiliary_loss_mlp": 0.01044116, "balance_loss_clip": 1.31968379, "balance_loss_mlp": 1.02147758, "epoch": 0.3343454080865775, "flos": 30059192557440.0, "grad_norm": 1.4406989913154584, "language_loss": 0.75714958, "learning_rate": 3.1045134991546526e-06, "loss": 0.782718, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.22644043, "step": 5561, "time_per_iteration": 3.025322198867798 }, { "auxiliary_loss_clip": 0.01504826, "auxiliary_loss_mlp": 0.01043419, "balance_loss_clip": 1.31451297, "balance_loss_mlp": 1.02099609, "epoch": 0.33440553133924544, "flos": 16407417870720.0, "grad_norm": 1.7723786096291922, "language_loss": 0.70091093, "learning_rate": 3.1041887941035355e-06, "loss": 0.72639334, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.22436523, "step": 5562, "time_per_iteration": 2.8286075592041016 }, { "auxiliary_loss_clip": 0.01494958, "auxiliary_loss_mlp": 0.01043297, "balance_loss_clip": 1.30516005, "balance_loss_mlp": 1.02201784, "epoch": 0.3344656545919134, "flos": 24252393265920.0, "grad_norm": 1.6690501622296605, "language_loss": 0.66132474, "learning_rate": 3.1038640471814685e-06, "loss": 0.68670726, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.21276855, "step": 5563, "time_per_iteration": 2.926013231277466 }, { "auxiliary_loss_clip": 0.01507279, "auxiliary_loss_mlp": 0.01047658, "balance_loss_clip": 1.31448793, "balance_loss_mlp": 1.0242337, "epoch": 0.3345257778445814, "flos": 52136824371840.0, "grad_norm": 1.4829779258292959, "language_loss": 0.74560082, "learning_rate": 3.103539258400766e-06, "loss": 0.77115023, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.23413086, "step": 5564, "time_per_iteration": 3.1650631427764893 }, { "auxiliary_loss_clip": 0.01279809, "auxiliary_loss_mlp": 0.01051148, "balance_loss_clip": 1.1628319, "balance_loss_mlp": 1.02711546, "epoch": 0.33458590109724934, "flos": 68076418561920.0, "grad_norm": 0.7957323507333102, "language_loss": 0.55538666, "learning_rate": 3.103214427773745e-06, "loss": 0.57869625, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.24023438, "step": 5565, "time_per_iteration": 3.3232290744781494 }, { "auxiliary_loss_clip": 0.01490349, "auxiliary_loss_mlp": 0.01050566, "balance_loss_clip": 1.30307245, "balance_loss_mlp": 1.02652144, "epoch": 0.3346460243499173, "flos": 37428878880000.0, "grad_norm": 1.8671753329327845, "language_loss": 0.65621215, "learning_rate": 3.102889555312721e-06, "loss": 0.68162125, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.24047852, "step": 5566, "time_per_iteration": 3.0259647369384766 }, { "auxiliary_loss_clip": 0.014908, "auxiliary_loss_mlp": 0.01048957, "balance_loss_clip": 1.30476046, "balance_loss_mlp": 1.02528214, "epoch": 0.3347061476025853, "flos": 18706530591360.0, "grad_norm": 1.816648156960533, "language_loss": 0.78114271, "learning_rate": 3.102564641030016e-06, "loss": 0.80654037, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.23657227, "step": 5567, "time_per_iteration": 2.834923028945923 }, { "auxiliary_loss_clip": 0.01497512, "auxiliary_loss_mlp": 0.01046116, "balance_loss_clip": 1.30657172, "balance_loss_mlp": 1.02337086, "epoch": 0.3347662708552533, "flos": 13925787050880.0, "grad_norm": 1.7976267539439292, "language_loss": 0.77689457, "learning_rate": 3.102239684937949e-06, "loss": 0.80233085, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 1.90917969, "router_z_loss_mlp": 0.22766113, "step": 5568, "time_per_iteration": 2.86480712890625 }, { "auxiliary_loss_clip": 0.01500487, "auxiliary_loss_mlp": 0.0104417, "balance_loss_clip": 1.30942881, "balance_loss_mlp": 1.02099562, "epoch": 0.33482639410792125, "flos": 19758538874880.0, "grad_norm": 1.982322748473177, "language_loss": 0.71599859, "learning_rate": 3.101914687048842e-06, "loss": 0.74144512, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.23181152, "step": 5569, "time_per_iteration": 4.278224468231201 }, { "auxiliary_loss_clip": 0.01498276, "auxiliary_loss_mlp": 0.01043065, "balance_loss_clip": 1.30742288, "balance_loss_mlp": 1.01956916, "epoch": 0.3348865173605892, "flos": 16110484375680.0, "grad_norm": 2.8018263535646866, "language_loss": 0.9070226, "learning_rate": 3.10158964737502e-06, "loss": 0.93243599, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.23498535, "step": 5570, "time_per_iteration": 2.877017021179199 }, { "auxiliary_loss_clip": 0.01490519, "auxiliary_loss_mlp": 0.01037497, "balance_loss_clip": 1.30411994, "balance_loss_mlp": 1.01527596, "epoch": 0.3349466406132572, "flos": 25019638905600.0, "grad_norm": 1.790863179291985, "language_loss": 0.80483353, "learning_rate": 3.101264565928808e-06, "loss": 0.83011371, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.22229004, "step": 5571, "time_per_iteration": 2.915003776550293 }, { "auxiliary_loss_clip": 0.01284645, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.16647577, "balance_loss_mlp": 1.00410545, "epoch": 0.33500676386592515, "flos": 54351129778560.0, "grad_norm": 0.8989683165746324, "language_loss": 0.56065637, "learning_rate": 3.1009394427225335e-06, "loss": 0.58376229, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 1.1796875, "router_z_loss_mlp": 0.21875, "step": 5572, "time_per_iteration": 3.3182129859924316 }, { "auxiliary_loss_clip": 0.0147973, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.29359913, "balance_loss_mlp": 1.02083862, "epoch": 0.3350668871185931, "flos": 26808642040320.0, "grad_norm": 4.966680394649299, "language_loss": 0.80082309, "learning_rate": 3.1006142777685257e-06, "loss": 0.82605088, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.22216797, "step": 5573, "time_per_iteration": 4.45736289024353 }, { "auxiliary_loss_clip": 0.0148784, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.29860139, "balance_loss_mlp": 1.01879859, "epoch": 0.3351270103712611, "flos": 33523507347840.0, "grad_norm": 5.349249583591129, "language_loss": 0.73465735, "learning_rate": 3.1002890710791133e-06, "loss": 0.7599659, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.24169922, "step": 5574, "time_per_iteration": 2.9686388969421387 }, { "auxiliary_loss_clip": 0.01470696, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.28623033, "balance_loss_mlp": 1.01770949, "epoch": 0.33518713362392905, "flos": 26517680858880.0, "grad_norm": 1.7010641810418943, "language_loss": 0.89022529, "learning_rate": 3.0999638226666287e-06, "loss": 0.9153347, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.22558594, "step": 5575, "time_per_iteration": 4.391767263412476 }, { "auxiliary_loss_clip": 0.01521175, "auxiliary_loss_mlp": 0.01050181, "balance_loss_clip": 1.32244241, "balance_loss_mlp": 1.02638674, "epoch": 0.335247256876597, "flos": 17239689567360.0, "grad_norm": 2.188569298175893, "language_loss": 0.83095932, "learning_rate": 3.0996385325434063e-06, "loss": 0.85667294, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 1.98828125, "router_z_loss_mlp": 0.23815918, "step": 5576, "time_per_iteration": 2.916335344314575 }, { "auxiliary_loss_clip": 0.01487603, "auxiliary_loss_mlp": 0.01041762, "balance_loss_clip": 1.29638505, "balance_loss_mlp": 1.01875448, "epoch": 0.335307380129265, "flos": 25640272834560.0, "grad_norm": 2.8362113850668984, "language_loss": 0.75001848, "learning_rate": 3.0993132007217806e-06, "loss": 0.77531213, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.2298584, "step": 5577, "time_per_iteration": 2.8779680728912354 }, { "auxiliary_loss_clip": 0.01494204, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.30505824, "balance_loss_mlp": 1.02316165, "epoch": 0.33536750338193294, "flos": 19688897848320.0, "grad_norm": 7.830351104763126, "language_loss": 0.82672703, "learning_rate": 3.0989878272140883e-06, "loss": 0.85213608, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.23547363, "step": 5578, "time_per_iteration": 4.274037599563599 }, { "auxiliary_loss_clip": 0.01478156, "auxiliary_loss_mlp": 0.01039968, "balance_loss_clip": 1.29402697, "balance_loss_mlp": 1.01824808, "epoch": 0.3354276266346009, "flos": 18341720616960.0, "grad_norm": 2.1696088496013446, "language_loss": 0.72716665, "learning_rate": 3.0986624120326676e-06, "loss": 0.75234783, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.21716309, "step": 5579, "time_per_iteration": 2.8392436504364014 }, { "auxiliary_loss_clip": 0.01501882, "auxiliary_loss_mlp": 0.01042604, "balance_loss_clip": 1.31090307, "balance_loss_mlp": 1.02007353, "epoch": 0.3354877498872689, "flos": 17867607909120.0, "grad_norm": 2.425485373418438, "language_loss": 0.82306349, "learning_rate": 3.0983369551898573e-06, "loss": 0.8485083, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.2253418, "step": 5580, "time_per_iteration": 2.895878314971924 }, { "auxiliary_loss_clip": 0.01493739, "auxiliary_loss_mlp": 0.01042167, "balance_loss_clip": 1.30304945, "balance_loss_mlp": 1.0184207, "epoch": 0.3355478731399369, "flos": 24728541989760.0, "grad_norm": 1.7185060167676829, "language_loss": 0.77945316, "learning_rate": 3.0980114566980003e-06, "loss": 0.80481219, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.23742676, "step": 5581, "time_per_iteration": 2.887403726577759 }, { "auxiliary_loss_clip": 0.01511857, "auxiliary_loss_mlp": 0.01047632, "balance_loss_clip": 1.3156476, "balance_loss_mlp": 1.02224004, "epoch": 0.33560799639260486, "flos": 16882797432960.0, "grad_norm": 2.416711095709031, "language_loss": 0.75195754, "learning_rate": 3.0976859165694384e-06, "loss": 0.77755249, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.25378418, "step": 5582, "time_per_iteration": 2.8443257808685303 }, { "auxiliary_loss_clip": 0.01502871, "auxiliary_loss_mlp": 0.01049492, "balance_loss_clip": 1.30946231, "balance_loss_mlp": 1.0261631, "epoch": 0.3356681196452728, "flos": 18342851736960.0, "grad_norm": 1.579977905362513, "language_loss": 0.83566988, "learning_rate": 3.0973603348165166e-06, "loss": 0.86119348, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.23327637, "step": 5583, "time_per_iteration": 2.854748010635376 }, { "auxiliary_loss_clip": 0.01490166, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.30223227, "balance_loss_mlp": 1.02645361, "epoch": 0.3357282428979408, "flos": 34763825064960.0, "grad_norm": 2.142456765041388, "language_loss": 0.7862941, "learning_rate": 3.097034711451581e-06, "loss": 0.81168985, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.22973633, "step": 5584, "time_per_iteration": 2.9743659496307373 }, { "auxiliary_loss_clip": 0.01503821, "auxiliary_loss_mlp": 0.0104594, "balance_loss_clip": 1.31069756, "balance_loss_mlp": 1.02295661, "epoch": 0.33578836615060875, "flos": 21590099383680.0, "grad_norm": 1.5937990432554863, "language_loss": 0.76548672, "learning_rate": 3.0967090464869795e-06, "loss": 0.79098433, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.22961426, "step": 5585, "time_per_iteration": 2.930433988571167 }, { "auxiliary_loss_clip": 0.01477496, "auxiliary_loss_mlp": 0.01044733, "balance_loss_clip": 1.29048347, "balance_loss_mlp": 1.02085507, "epoch": 0.3358484894032767, "flos": 24539734863360.0, "grad_norm": 1.5980373630293525, "language_loss": 0.78713608, "learning_rate": 3.0963833399350608e-06, "loss": 0.81235838, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.2388916, "step": 5586, "time_per_iteration": 2.9004650115966797 }, { "auxiliary_loss_clip": 0.01527319, "auxiliary_loss_mlp": 0.01045494, "balance_loss_clip": 1.32831383, "balance_loss_mlp": 1.02173567, "epoch": 0.3359086126559447, "flos": 22465788105600.0, "grad_norm": 1.8845980368162183, "language_loss": 0.82255739, "learning_rate": 3.0960575918081756e-06, "loss": 0.84828556, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 1.99023438, "router_z_loss_mlp": 0.23742676, "step": 5587, "time_per_iteration": 2.907565116882324 }, { "auxiliary_loss_clip": 0.01487071, "auxiliary_loss_mlp": 0.01053526, "balance_loss_clip": 1.3029182, "balance_loss_mlp": 1.03105521, "epoch": 0.33596873590861265, "flos": 16552038810240.0, "grad_norm": 1.9687631794041052, "language_loss": 0.6869905, "learning_rate": 3.095731802118677e-06, "loss": 0.71239638, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.22473145, "step": 5588, "time_per_iteration": 2.9055745601654053 }, { "auxiliary_loss_clip": 0.01509781, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.31734848, "balance_loss_mlp": 1.0306077, "epoch": 0.3360288591612806, "flos": 31188307259520.0, "grad_norm": 1.8837376881026326, "language_loss": 0.70731616, "learning_rate": 3.095405970878919e-06, "loss": 0.73295617, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.23596191, "step": 5589, "time_per_iteration": 2.9352684020996094 }, { "auxiliary_loss_clip": 0.01500356, "auxiliary_loss_mlp": 0.01049138, "balance_loss_clip": 1.30754697, "balance_loss_mlp": 1.02613091, "epoch": 0.3360889824139486, "flos": 23707644145920.0, "grad_norm": 1.741791823278173, "language_loss": 0.67536825, "learning_rate": 3.0950800981012567e-06, "loss": 0.70086318, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.23010254, "step": 5590, "time_per_iteration": 2.86493182182312 }, { "auxiliary_loss_clip": 0.01486797, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.30118465, "balance_loss_mlp": 1.02234173, "epoch": 0.33614910566661654, "flos": 19327571723520.0, "grad_norm": 2.2337028070713854, "language_loss": 0.7460373, "learning_rate": 3.094754183798047e-06, "loss": 0.77136451, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.23583984, "step": 5591, "time_per_iteration": 2.8862602710723877 }, { "auxiliary_loss_clip": 0.01485429, "auxiliary_loss_mlp": 0.01047021, "balance_loss_clip": 1.29823864, "balance_loss_mlp": 1.02370346, "epoch": 0.3362092289192845, "flos": 16480995194880.0, "grad_norm": 2.3026444583770753, "language_loss": 0.70273882, "learning_rate": 3.0944282279816493e-06, "loss": 0.72806334, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.2331543, "step": 5592, "time_per_iteration": 2.9333269596099854 }, { "auxiliary_loss_clip": 0.01501391, "auxiliary_loss_mlp": 0.01038467, "balance_loss_clip": 1.31454587, "balance_loss_mlp": 1.01660395, "epoch": 0.33626935217195253, "flos": 24254293547520.0, "grad_norm": 2.1598410497734455, "language_loss": 0.77670956, "learning_rate": 3.094102230664423e-06, "loss": 0.80210817, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.21862793, "step": 5593, "time_per_iteration": 2.9173765182495117 }, { "auxiliary_loss_clip": 0.0151696, "auxiliary_loss_mlp": 0.01048333, "balance_loss_clip": 1.32037377, "balance_loss_mlp": 1.0238837, "epoch": 0.3363294754246205, "flos": 19728152352000.0, "grad_norm": 2.5693088034531866, "language_loss": 0.72927207, "learning_rate": 3.093776191858731e-06, "loss": 0.75492501, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 1.96777344, "router_z_loss_mlp": 0.24462891, "step": 5594, "time_per_iteration": 2.8508689403533936 }, { "auxiliary_loss_clip": 0.01507923, "auxiliary_loss_mlp": 0.01045552, "balance_loss_clip": 1.31604946, "balance_loss_mlp": 1.02126932, "epoch": 0.33638959867728846, "flos": 22605160648320.0, "grad_norm": 2.4775397685689424, "language_loss": 0.80401522, "learning_rate": 3.0934501115769363e-06, "loss": 0.82954997, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.24279785, "step": 5595, "time_per_iteration": 2.902956247329712 }, { "auxiliary_loss_clip": 0.01504889, "auxiliary_loss_mlp": 0.01041085, "balance_loss_clip": 1.31626105, "balance_loss_mlp": 1.01895952, "epoch": 0.3364497219299564, "flos": 21004195478400.0, "grad_norm": 1.7581381187443923, "language_loss": 0.82669365, "learning_rate": 3.0931239898314037e-06, "loss": 0.8521533, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.22131348, "step": 5596, "time_per_iteration": 2.921912908554077 }, { "auxiliary_loss_clip": 0.01508034, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.31729031, "balance_loss_mlp": 1.01518917, "epoch": 0.3365098451826244, "flos": 25239556471680.0, "grad_norm": 1.706245805180351, "language_loss": 0.76656747, "learning_rate": 3.0927978266344995e-06, "loss": 0.79202294, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.2232666, "step": 5597, "time_per_iteration": 2.9426043033599854 }, { "auxiliary_loss_clip": 0.01506685, "auxiliary_loss_mlp": 0.01040379, "balance_loss_clip": 1.31773317, "balance_loss_mlp": 1.01719296, "epoch": 0.33656996843529235, "flos": 24582427971840.0, "grad_norm": 1.8752354213523363, "language_loss": 0.79172641, "learning_rate": 3.0924716219985916e-06, "loss": 0.81719708, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.23205566, "step": 5598, "time_per_iteration": 2.900599956512451 }, { "auxiliary_loss_clip": 0.0152873, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.32896376, "balance_loss_mlp": 1.01919341, "epoch": 0.3366300916879603, "flos": 44106797168640.0, "grad_norm": 1.4759902649897094, "language_loss": 0.65109068, "learning_rate": 3.0921453759360514e-06, "loss": 0.67680037, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 1.99511719, "router_z_loss_mlp": 0.23059082, "step": 5599, "time_per_iteration": 3.0955095291137695 }, { "auxiliary_loss_clip": 0.0151804, "auxiliary_loss_mlp": 0.01044441, "balance_loss_clip": 1.32079124, "balance_loss_mlp": 1.0199312, "epoch": 0.3366902149406283, "flos": 13887437443200.0, "grad_norm": 2.4652502912247978, "language_loss": 0.83111119, "learning_rate": 3.091819088459249e-06, "loss": 0.85673594, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 1.97167969, "router_z_loss_mlp": 0.24523926, "step": 5600, "time_per_iteration": 2.9089560508728027 }, { "auxiliary_loss_clip": 0.01502937, "auxiliary_loss_mlp": 0.01044787, "balance_loss_clip": 1.3089509, "balance_loss_mlp": 1.02028942, "epoch": 0.33675033819329625, "flos": 16261711056000.0, "grad_norm": 2.177427231852065, "language_loss": 0.84167743, "learning_rate": 3.0914927595805573e-06, "loss": 0.8671546, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24499512, "step": 5601, "time_per_iteration": 2.8733246326446533 }, { "auxiliary_loss_clip": 0.01477478, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.29369664, "balance_loss_mlp": 1.01640487, "epoch": 0.3368104614459642, "flos": 17064636860160.0, "grad_norm": 1.590471910268448, "language_loss": 0.84492207, "learning_rate": 3.0911663893123507e-06, "loss": 0.8700875, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22668457, "step": 5602, "time_per_iteration": 2.9372410774230957 }, { "auxiliary_loss_clip": 0.01497032, "auxiliary_loss_mlp": 0.0104018, "balance_loss_clip": 1.30747414, "balance_loss_mlp": 1.01682687, "epoch": 0.3368705846986322, "flos": 17867698398720.0, "grad_norm": 1.7731344280748333, "language_loss": 0.70652318, "learning_rate": 3.0908399776670048e-06, "loss": 0.73189527, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.23352051, "step": 5603, "time_per_iteration": 2.8852651119232178 }, { "auxiliary_loss_clip": 0.01513101, "auxiliary_loss_mlp": 0.01040168, "balance_loss_clip": 1.31960511, "balance_loss_mlp": 1.01751876, "epoch": 0.33693070795130015, "flos": 22939719834240.0, "grad_norm": 1.4936656255818428, "language_loss": 0.83733654, "learning_rate": 3.090513524656898e-06, "loss": 0.86286926, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 1.93457031, "router_z_loss_mlp": 0.2265625, "step": 5604, "time_per_iteration": 4.25240159034729 }, { "auxiliary_loss_clip": 0.01499992, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.3080039, "balance_loss_mlp": 1.01579654, "epoch": 0.3369908312039681, "flos": 22027446051840.0, "grad_norm": 1.5983134903781098, "language_loss": 0.74555922, "learning_rate": 3.090187030294409e-06, "loss": 0.77095509, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.23815918, "step": 5605, "time_per_iteration": 2.8519127368927 }, { "auxiliary_loss_clip": 0.01514965, "auxiliary_loss_mlp": 0.01043141, "balance_loss_clip": 1.31942999, "balance_loss_mlp": 1.02000213, "epoch": 0.33705095445663613, "flos": 11809463898240.0, "grad_norm": 3.0050129119680515, "language_loss": 0.84639478, "learning_rate": 3.089860494591919e-06, "loss": 0.87197584, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 1.95703125, "router_z_loss_mlp": 0.23156738, "step": 5606, "time_per_iteration": 2.8195576667785645 }, { "auxiliary_loss_clip": 0.01485581, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.2958647, "balance_loss_mlp": 1.01800346, "epoch": 0.3371110777093041, "flos": 25057355086080.0, "grad_norm": 1.5634217945434674, "language_loss": 0.68605489, "learning_rate": 3.089533917561809e-06, "loss": 0.71132171, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.23095703, "step": 5607, "time_per_iteration": 2.8716025352478027 }, { "auxiliary_loss_clip": 0.01513621, "auxiliary_loss_mlp": 0.01042744, "balance_loss_clip": 1.31767678, "balance_loss_mlp": 1.01902127, "epoch": 0.33717120096197206, "flos": 26590443776640.0, "grad_norm": 7.505445397663734, "language_loss": 0.71994984, "learning_rate": 3.089207299216464e-06, "loss": 0.74551356, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 1.95996094, "router_z_loss_mlp": 0.23730469, "step": 5608, "time_per_iteration": 4.286521673202515 }, { "auxiliary_loss_clip": 0.01490519, "auxiliary_loss_mlp": 0.01046187, "balance_loss_clip": 1.29944289, "balance_loss_mlp": 1.02259517, "epoch": 0.33723132421464, "flos": 15166873929600.0, "grad_norm": 1.7569239525437061, "language_loss": 0.8027342, "learning_rate": 3.088880639568269e-06, "loss": 0.82810122, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.23608398, "step": 5609, "time_per_iteration": 2.8362104892730713 }, { "auxiliary_loss_clip": 0.01507396, "auxiliary_loss_mlp": 0.01050679, "balance_loss_clip": 1.31552565, "balance_loss_mlp": 1.02614594, "epoch": 0.337291447467308, "flos": 23445350184960.0, "grad_norm": 1.608650980386966, "language_loss": 0.82936156, "learning_rate": 3.0885539386296114e-06, "loss": 0.85494232, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.24523926, "step": 5610, "time_per_iteration": 4.381227731704712 }, { "auxiliary_loss_clip": 0.01465079, "auxiliary_loss_mlp": 0.01040537, "balance_loss_clip": 1.28113532, "balance_loss_mlp": 1.01723194, "epoch": 0.33735157071997596, "flos": 17247019224960.0, "grad_norm": 3.0665662529194644, "language_loss": 0.83303165, "learning_rate": 3.088227196412879e-06, "loss": 0.85808784, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23291016, "step": 5611, "time_per_iteration": 2.9353599548339844 }, { "auxiliary_loss_clip": 0.01492578, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.30247784, "balance_loss_mlp": 1.01764965, "epoch": 0.3374116939726439, "flos": 28269239281920.0, "grad_norm": 1.602858353500534, "language_loss": 0.80241299, "learning_rate": 3.0879004129304626e-06, "loss": 0.82776809, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.25280762, "step": 5612, "time_per_iteration": 4.40021276473999 }, { "auxiliary_loss_clip": 0.0147924, "auxiliary_loss_mlp": 0.01043729, "balance_loss_clip": 1.28967798, "balance_loss_mlp": 1.02022076, "epoch": 0.3374718172253119, "flos": 35933234901120.0, "grad_norm": 2.7126295468181274, "language_loss": 0.71527076, "learning_rate": 3.087573588194753e-06, "loss": 0.74050045, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.23510742, "step": 5613, "time_per_iteration": 2.989490032196045 }, { "auxiliary_loss_clip": 0.0149223, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.29990816, "balance_loss_mlp": 1.01399064, "epoch": 0.33753194047797985, "flos": 18195742333440.0, "grad_norm": 1.8217999154837716, "language_loss": 0.80400467, "learning_rate": 3.087246722218144e-06, "loss": 0.82930064, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.23376465, "step": 5614, "time_per_iteration": 2.9208898544311523 }, { "auxiliary_loss_clip": 0.0149086, "auxiliary_loss_mlp": 0.01045155, "balance_loss_clip": 1.29937661, "balance_loss_mlp": 1.02025199, "epoch": 0.3375920637306478, "flos": 23159320686720.0, "grad_norm": 1.8487572850533522, "language_loss": 0.91431975, "learning_rate": 3.086919815013031e-06, "loss": 0.93967986, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.24938965, "step": 5615, "time_per_iteration": 2.9683144092559814 }, { "auxiliary_loss_clip": 0.01471147, "auxiliary_loss_mlp": 0.01041552, "balance_loss_clip": 1.28445148, "balance_loss_mlp": 1.01825869, "epoch": 0.3376521869833158, "flos": 23122554647040.0, "grad_norm": 1.6931177507054995, "language_loss": 0.81725466, "learning_rate": 3.086592866591809e-06, "loss": 0.84238166, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.23291016, "step": 5616, "time_per_iteration": 2.874188184738159 }, { "auxiliary_loss_clip": 0.01496783, "auxiliary_loss_mlp": 0.01043235, "balance_loss_clip": 1.30235064, "balance_loss_mlp": 1.01872563, "epoch": 0.33771231023598375, "flos": 19282209171840.0, "grad_norm": 1.9622051089556607, "language_loss": 0.84608501, "learning_rate": 3.0862658769668774e-06, "loss": 0.87148517, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.2454834, "step": 5617, "time_per_iteration": 2.849902629852295 }, { "auxiliary_loss_clip": 0.01458358, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.27250612, "balance_loss_mlp": 1.01619565, "epoch": 0.3377724334886517, "flos": 18159247762560.0, "grad_norm": 4.501089883826148, "language_loss": 0.8066563, "learning_rate": 3.0859388461506343e-06, "loss": 0.83164907, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.24755859, "step": 5618, "time_per_iteration": 2.8378500938415527 }, { "auxiliary_loss_clip": 0.0148243, "auxiliary_loss_mlp": 0.0104124, "balance_loss_clip": 1.29293394, "balance_loss_mlp": 1.01750517, "epoch": 0.3378325567413197, "flos": 25787110769280.0, "grad_norm": 1.6222809228721988, "language_loss": 0.72674775, "learning_rate": 3.085611774155481e-06, "loss": 0.75198448, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.23730469, "step": 5619, "time_per_iteration": 2.8863916397094727 }, { "auxiliary_loss_clip": 0.01471133, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.28056979, "balance_loss_mlp": 1.01700377, "epoch": 0.3378926799939877, "flos": 21325090734720.0, "grad_norm": 4.032453665593643, "language_loss": 0.7101295, "learning_rate": 3.085284660993821e-06, "loss": 0.73524725, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.23632812, "step": 5620, "time_per_iteration": 2.8823471069335938 }, { "auxiliary_loss_clip": 0.01466783, "auxiliary_loss_mlp": 0.010416, "balance_loss_clip": 1.28025532, "balance_loss_mlp": 1.0183661, "epoch": 0.33795280324665566, "flos": 24910833864960.0, "grad_norm": 2.8151348334880377, "language_loss": 0.69108844, "learning_rate": 3.084957506678058e-06, "loss": 0.71617228, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.23242188, "step": 5621, "time_per_iteration": 2.8856420516967773 }, { "auxiliary_loss_clip": 0.01463889, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.2798624, "balance_loss_mlp": 1.01607919, "epoch": 0.33801292649932363, "flos": 24764674602240.0, "grad_norm": 2.1091160503576325, "language_loss": 0.8337574, "learning_rate": 3.0846303112205975e-06, "loss": 0.85878718, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22998047, "step": 5622, "time_per_iteration": 2.9631879329681396 }, { "auxiliary_loss_clip": 0.01464445, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.28007114, "balance_loss_mlp": 1.01903713, "epoch": 0.3380730497519916, "flos": 26735155205760.0, "grad_norm": 1.5814974355175244, "language_loss": 0.74435991, "learning_rate": 3.0843030746338464e-06, "loss": 0.76942956, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.23474121, "step": 5623, "time_per_iteration": 2.903228759765625 }, { "auxiliary_loss_clip": 0.01281748, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.16733217, "balance_loss_mlp": 1.01138103, "epoch": 0.33813317300465956, "flos": 70068480935040.0, "grad_norm": 0.7458005393977092, "language_loss": 0.54982281, "learning_rate": 3.083975796930215e-06, "loss": 0.57298875, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.234375, "step": 5624, "time_per_iteration": 3.5268478393554688 }, { "auxiliary_loss_clip": 0.01474638, "auxiliary_loss_mlp": 0.01045825, "balance_loss_clip": 1.28425026, "balance_loss_mlp": 1.02088618, "epoch": 0.3381932962573275, "flos": 24107908060800.0, "grad_norm": 2.0732333976514727, "language_loss": 0.73591948, "learning_rate": 3.083648478122111e-06, "loss": 0.76112413, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.24938965, "step": 5625, "time_per_iteration": 2.8666694164276123 }, { "auxiliary_loss_clip": 0.01486659, "auxiliary_loss_mlp": 0.01044207, "balance_loss_clip": 1.29174829, "balance_loss_mlp": 1.01920879, "epoch": 0.3382534195099955, "flos": 19286824141440.0, "grad_norm": 2.34659655416355, "language_loss": 0.72163194, "learning_rate": 3.0833211182219497e-06, "loss": 0.74694061, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.24975586, "step": 5626, "time_per_iteration": 2.873349905014038 }, { "auxiliary_loss_clip": 0.01456606, "auxiliary_loss_mlp": 0.010413, "balance_loss_clip": 1.27292836, "balance_loss_mlp": 1.01658821, "epoch": 0.33831354276266346, "flos": 25236570314880.0, "grad_norm": 1.4486268979061603, "language_loss": 0.81329215, "learning_rate": 3.0829937172421425e-06, "loss": 0.83827114, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.24694824, "step": 5627, "time_per_iteration": 2.956404209136963 }, { "auxiliary_loss_clip": 0.01478635, "auxiliary_loss_mlp": 0.01049525, "balance_loss_clip": 1.28847587, "balance_loss_mlp": 1.02505136, "epoch": 0.3383736660153314, "flos": 23122283178240.0, "grad_norm": 2.3106722815195706, "language_loss": 0.81307137, "learning_rate": 3.0826662751951055e-06, "loss": 0.83835292, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.24475098, "step": 5628, "time_per_iteration": 2.9071643352508545 }, { "auxiliary_loss_clip": 0.01486933, "auxiliary_loss_mlp": 0.01044473, "balance_loss_clip": 1.29379845, "balance_loss_mlp": 1.02051187, "epoch": 0.3384337892679994, "flos": 23487590845440.0, "grad_norm": 2.34634782118513, "language_loss": 0.7879926, "learning_rate": 3.082338792093254e-06, "loss": 0.81330669, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.23925781, "step": 5629, "time_per_iteration": 2.9267890453338623 }, { "auxiliary_loss_clip": 0.01493533, "auxiliary_loss_mlp": 0.01045698, "balance_loss_clip": 1.29884434, "balance_loss_mlp": 1.02046156, "epoch": 0.33849391252066735, "flos": 19434838440960.0, "grad_norm": 2.201945194696313, "language_loss": 0.85129207, "learning_rate": 3.0820112679490074e-06, "loss": 0.87668431, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 1.94824219, "router_z_loss_mlp": 0.25244141, "step": 5630, "time_per_iteration": 2.8542301654815674 }, { "auxiliary_loss_clip": 0.01486898, "auxiliary_loss_mlp": 0.01050109, "balance_loss_clip": 1.29739773, "balance_loss_mlp": 1.02576602, "epoch": 0.3385540357733353, "flos": 21073565036160.0, "grad_norm": 3.8768844157239486, "language_loss": 0.7199896, "learning_rate": 3.0816837027747857e-06, "loss": 0.74535966, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.24328613, "step": 5631, "time_per_iteration": 2.848780632019043 }, { "auxiliary_loss_clip": 0.01291091, "auxiliary_loss_mlp": 0.01030784, "balance_loss_clip": 1.17724133, "balance_loss_mlp": 1.00751424, "epoch": 0.3386141590260033, "flos": 69234019505280.0, "grad_norm": 0.8518126736593178, "language_loss": 0.56114417, "learning_rate": 3.0813560965830084e-06, "loss": 0.58436292, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.23242188, "step": 5632, "time_per_iteration": 3.456843137741089 }, { "auxiliary_loss_clip": 0.01479207, "auxiliary_loss_mlp": 0.01045803, "balance_loss_clip": 1.28824103, "balance_loss_mlp": 1.02199674, "epoch": 0.3386742822786713, "flos": 25530246184320.0, "grad_norm": 7.3419413449353, "language_loss": 0.80787325, "learning_rate": 3.0810284493861005e-06, "loss": 0.83312333, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.23815918, "step": 5633, "time_per_iteration": 2.901444911956787 }, { "auxiliary_loss_clip": 0.01479688, "auxiliary_loss_mlp": 0.01053852, "balance_loss_clip": 1.29046726, "balance_loss_mlp": 1.029176, "epoch": 0.33873440553133927, "flos": 23633614373760.0, "grad_norm": 2.195714893013156, "language_loss": 0.60281575, "learning_rate": 3.0807007611964855e-06, "loss": 0.62815112, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.24658203, "step": 5634, "time_per_iteration": 2.9059152603149414 }, { "auxiliary_loss_clip": 0.01484593, "auxiliary_loss_mlp": 0.01050703, "balance_loss_clip": 1.29516017, "balance_loss_mlp": 1.02675378, "epoch": 0.33879452878400723, "flos": 17096833175040.0, "grad_norm": 1.682067570515668, "language_loss": 0.93264091, "learning_rate": 3.080373032026589e-06, "loss": 0.95799387, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.23950195, "step": 5635, "time_per_iteration": 2.8619163036346436 }, { "auxiliary_loss_clip": 0.0144925, "auxiliary_loss_mlp": 0.01040655, "balance_loss_clip": 1.26659346, "balance_loss_mlp": 1.01684928, "epoch": 0.3388546520366752, "flos": 15750470350080.0, "grad_norm": 2.557706050168856, "language_loss": 0.76617914, "learning_rate": 3.0800452618888386e-06, "loss": 0.79107821, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.23803711, "step": 5636, "time_per_iteration": 2.856569528579712 }, { "auxiliary_loss_clip": 0.01471316, "auxiliary_loss_mlp": 0.01046032, "balance_loss_clip": 1.28375208, "balance_loss_mlp": 1.02178478, "epoch": 0.33891477528934316, "flos": 22428841086720.0, "grad_norm": 1.5686089456504813, "language_loss": 0.84516633, "learning_rate": 3.0797174507956637e-06, "loss": 0.87033987, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.2421875, "step": 5637, "time_per_iteration": 2.914940118789673 }, { "auxiliary_loss_clip": 0.014954, "auxiliary_loss_mlp": 0.01041101, "balance_loss_clip": 1.30302072, "balance_loss_mlp": 1.01456463, "epoch": 0.3389748985420111, "flos": 17283740019840.0, "grad_norm": 2.0673830989690254, "language_loss": 0.71836853, "learning_rate": 3.079389598759495e-06, "loss": 0.74373353, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.265625, "step": 5638, "time_per_iteration": 2.827458143234253 }, { "auxiliary_loss_clip": 0.0147359, "auxiliary_loss_mlp": 0.01042664, "balance_loss_clip": 1.28713298, "balance_loss_mlp": 1.01817846, "epoch": 0.3390350217946791, "flos": 27756279273600.0, "grad_norm": 1.8164592188503164, "language_loss": 0.81532168, "learning_rate": 3.079061705792765e-06, "loss": 0.8404842, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.24487305, "step": 5639, "time_per_iteration": 4.323322057723999 }, { "auxiliary_loss_clip": 0.01495055, "auxiliary_loss_mlp": 0.01048195, "balance_loss_clip": 1.29963672, "balance_loss_mlp": 1.02381694, "epoch": 0.33909514504734706, "flos": 20349555442560.0, "grad_norm": 2.2337803366822997, "language_loss": 0.68309188, "learning_rate": 3.078733771907907e-06, "loss": 0.70852435, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 1.95410156, "router_z_loss_mlp": 0.24377441, "step": 5640, "time_per_iteration": 2.8533785343170166 }, { "auxiliary_loss_clip": 0.01477558, "auxiliary_loss_mlp": 0.01044215, "balance_loss_clip": 1.28728306, "balance_loss_mlp": 1.02003956, "epoch": 0.339155268300015, "flos": 14838241812480.0, "grad_norm": 1.5757477659519648, "language_loss": 0.7064532, "learning_rate": 3.0784057971173554e-06, "loss": 0.73167098, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.24182129, "step": 5641, "time_per_iteration": 2.828672409057617 }, { "auxiliary_loss_clip": 0.01488961, "auxiliary_loss_mlp": 0.01043822, "balance_loss_clip": 1.29821157, "balance_loss_mlp": 1.01988459, "epoch": 0.339215391552683, "flos": 26079157825920.0, "grad_norm": 3.2285427114005687, "language_loss": 0.88904691, "learning_rate": 3.0780777814335483e-06, "loss": 0.91437471, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.23937988, "step": 5642, "time_per_iteration": 4.379562139511108 }, { "auxiliary_loss_clip": 0.01455236, "auxiliary_loss_mlp": 0.01039451, "balance_loss_clip": 1.27309465, "balance_loss_mlp": 1.01681328, "epoch": 0.33927551480535095, "flos": 14582870305920.0, "grad_norm": 1.7030647023084968, "language_loss": 0.84499443, "learning_rate": 3.077749724868924e-06, "loss": 0.86994135, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22644043, "step": 5643, "time_per_iteration": 2.8090620040893555 }, { "auxiliary_loss_clip": 0.01457387, "auxiliary_loss_mlp": 0.01047019, "balance_loss_clip": 1.27053225, "balance_loss_mlp": 1.02323651, "epoch": 0.3393356380580189, "flos": 23816132472960.0, "grad_norm": 1.4568864701298074, "language_loss": 0.78201097, "learning_rate": 3.077421627435922e-06, "loss": 0.807055, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.23803711, "step": 5644, "time_per_iteration": 2.845407247543335 }, { "auxiliary_loss_clip": 0.01471214, "auxiliary_loss_mlp": 0.01046016, "balance_loss_clip": 1.28287673, "balance_loss_mlp": 1.02222204, "epoch": 0.3393957613106869, "flos": 17356819651200.0, "grad_norm": 3.527305927047716, "language_loss": 0.64130557, "learning_rate": 3.0770934891469832e-06, "loss": 0.66647792, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.23815918, "step": 5645, "time_per_iteration": 4.256519079208374 }, { "auxiliary_loss_clip": 0.0146081, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.27494669, "balance_loss_mlp": 1.01892221, "epoch": 0.3394558845633549, "flos": 28444337233920.0, "grad_norm": 2.137041465473129, "language_loss": 0.77758664, "learning_rate": 3.076765310014552e-06, "loss": 0.80261946, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.2355957, "step": 5646, "time_per_iteration": 2.9298095703125 }, { "auxiliary_loss_clip": 0.01482706, "auxiliary_loss_mlp": 0.01049273, "balance_loss_clip": 1.29069662, "balance_loss_mlp": 1.02509689, "epoch": 0.33951600781602287, "flos": 22096046448000.0, "grad_norm": 2.1464963687433514, "language_loss": 0.79856282, "learning_rate": 3.0764370900510727e-06, "loss": 0.82388258, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.24194336, "step": 5647, "time_per_iteration": 4.312011480331421 }, { "auxiliary_loss_clip": 0.01469594, "auxiliary_loss_mlp": 0.01037461, "balance_loss_clip": 1.28144574, "balance_loss_mlp": 1.01382136, "epoch": 0.33957613106869083, "flos": 23888759656320.0, "grad_norm": 3.1612707002397533, "language_loss": 0.78149372, "learning_rate": 3.0761088292689904e-06, "loss": 0.80656427, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.23632812, "step": 5648, "time_per_iteration": 2.882059097290039 }, { "auxiliary_loss_clip": 0.0128101, "auxiliary_loss_mlp": 0.01055305, "balance_loss_clip": 1.16927004, "balance_loss_mlp": 1.0303185, "epoch": 0.3396362543213588, "flos": 71276421358080.0, "grad_norm": 0.791635093803938, "language_loss": 0.56435156, "learning_rate": 3.075780527680754e-06, "loss": 0.58771473, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.25, "step": 5649, "time_per_iteration": 3.383288621902466 }, { "auxiliary_loss_clip": 0.01482063, "auxiliary_loss_mlp": 0.0104785, "balance_loss_clip": 1.29292655, "balance_loss_mlp": 1.02407932, "epoch": 0.33969637757402676, "flos": 25932681849600.0, "grad_norm": 1.771498634179408, "language_loss": 0.86186337, "learning_rate": 3.0754521852988117e-06, "loss": 0.88716251, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.23779297, "step": 5650, "time_per_iteration": 2.8908934593200684 }, { "auxiliary_loss_clip": 0.01473701, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.28701162, "balance_loss_mlp": 1.01615441, "epoch": 0.33975650082669473, "flos": 35275156260480.0, "grad_norm": 1.7491881525770792, "language_loss": 0.71860164, "learning_rate": 3.0751238021356152e-06, "loss": 0.74372607, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22619629, "step": 5651, "time_per_iteration": 3.0218536853790283 }, { "auxiliary_loss_clip": 0.01484725, "auxiliary_loss_mlp": 0.01040968, "balance_loss_clip": 1.29607177, "balance_loss_mlp": 1.01654243, "epoch": 0.3398166240793627, "flos": 16653785662080.0, "grad_norm": 1.8218861325226992, "language_loss": 0.8234551, "learning_rate": 3.074795378203616e-06, "loss": 0.84871209, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.2442627, "step": 5652, "time_per_iteration": 2.8534984588623047 }, { "auxiliary_loss_clip": 0.01494309, "auxiliary_loss_mlp": 0.01042474, "balance_loss_clip": 1.30264974, "balance_loss_mlp": 1.01844144, "epoch": 0.33987674733203066, "flos": 24073132792320.0, "grad_norm": 1.9158474929143987, "language_loss": 0.78072107, "learning_rate": 3.0744669135152685e-06, "loss": 0.80608892, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 1.91601562, "router_z_loss_mlp": 0.2401123, "step": 5653, "time_per_iteration": 2.889430522918701 }, { "auxiliary_loss_clip": 0.01485058, "auxiliary_loss_mlp": 0.01038848, "balance_loss_clip": 1.29757595, "balance_loss_mlp": 1.0155189, "epoch": 0.3399368705846986, "flos": 13255763783040.0, "grad_norm": 2.664875844177617, "language_loss": 0.86429167, "learning_rate": 3.0741384080830278e-06, "loss": 0.88953066, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.23339844, "step": 5654, "time_per_iteration": 2.8123180866241455 }, { "auxiliary_loss_clip": 0.01480181, "auxiliary_loss_mlp": 0.01042881, "balance_loss_clip": 1.29260087, "balance_loss_mlp": 1.01784658, "epoch": 0.3399969938373666, "flos": 27023311209600.0, "grad_norm": 3.1129361722910995, "language_loss": 0.66014516, "learning_rate": 3.073809861919351e-06, "loss": 0.68537581, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.25048828, "step": 5655, "time_per_iteration": 2.930741310119629 }, { "auxiliary_loss_clip": 0.0147668, "auxiliary_loss_mlp": 0.01041621, "balance_loss_clip": 1.28967953, "balance_loss_mlp": 1.01875651, "epoch": 0.34005711709003456, "flos": 28562236479360.0, "grad_norm": 1.6766612306817017, "language_loss": 0.77004075, "learning_rate": 3.073481275036697e-06, "loss": 0.79522377, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.22875977, "step": 5656, "time_per_iteration": 2.9428625106811523 }, { "auxiliary_loss_clip": 0.01492402, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.29797256, "balance_loss_mlp": 1.0194757, "epoch": 0.3401172403427025, "flos": 21627136892160.0, "grad_norm": 1.5790339475110198, "language_loss": 0.83966804, "learning_rate": 3.073152647447525e-06, "loss": 0.86501873, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 1.94335938, "router_z_loss_mlp": 0.23205566, "step": 5657, "time_per_iteration": 2.966322183609009 }, { "auxiliary_loss_clip": 0.01475755, "auxiliary_loss_mlp": 0.01046593, "balance_loss_clip": 1.28778434, "balance_loss_mlp": 1.02256048, "epoch": 0.3401773635953705, "flos": 25897001685120.0, "grad_norm": 1.7102921781068996, "language_loss": 0.86354464, "learning_rate": 3.0728239791642976e-06, "loss": 0.88876808, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.24035645, "step": 5658, "time_per_iteration": 2.9267985820770264 }, { "auxiliary_loss_clip": 0.01281277, "auxiliary_loss_mlp": 0.01057874, "balance_loss_clip": 1.16242576, "balance_loss_mlp": 1.03002703, "epoch": 0.3402374868480385, "flos": 65538629665920.0, "grad_norm": 0.835436640682227, "language_loss": 0.60187316, "learning_rate": 3.072495270199477e-06, "loss": 0.62526464, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 1.1875, "router_z_loss_mlp": 0.27929688, "step": 5659, "time_per_iteration": 3.405350685119629 }, { "auxiliary_loss_clip": 0.01470867, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.28704429, "balance_loss_mlp": 1.01842964, "epoch": 0.34029761010070647, "flos": 24071096776320.0, "grad_norm": 2.0854100201600936, "language_loss": 0.68604004, "learning_rate": 3.0721665205655284e-06, "loss": 0.71116388, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23083496, "step": 5660, "time_per_iteration": 2.847632646560669 }, { "auxiliary_loss_clip": 0.01469714, "auxiliary_loss_mlp": 0.01044098, "balance_loss_clip": 1.2825048, "balance_loss_mlp": 1.01822996, "epoch": 0.34035773335337444, "flos": 27611160641280.0, "grad_norm": 1.6201914164856577, "language_loss": 0.6807304, "learning_rate": 3.071837730274918e-06, "loss": 0.70586854, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.25866699, "step": 5661, "time_per_iteration": 2.9140677452087402 }, { "auxiliary_loss_clip": 0.01460438, "auxiliary_loss_mlp": 0.01040782, "balance_loss_clip": 1.27607298, "balance_loss_mlp": 1.01747692, "epoch": 0.3404178566060424, "flos": 20822310806400.0, "grad_norm": 1.6077214553871748, "language_loss": 0.8039422, "learning_rate": 3.071508899340113e-06, "loss": 0.82895446, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.23303223, "step": 5662, "time_per_iteration": 2.8768222332000732 }, { "auxiliary_loss_clip": 0.01463926, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.27880287, "balance_loss_mlp": 1.0183816, "epoch": 0.34047797985871037, "flos": 26844819897600.0, "grad_norm": 6.659045946765524, "language_loss": 0.74671102, "learning_rate": 3.0711800277735833e-06, "loss": 0.77179253, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.25842285, "step": 5663, "time_per_iteration": 2.9183075428009033 }, { "auxiliary_loss_clip": 0.01456881, "auxiliary_loss_mlp": 0.0103685, "balance_loss_clip": 1.27412581, "balance_loss_mlp": 1.0137713, "epoch": 0.34053810311137833, "flos": 19692110229120.0, "grad_norm": 1.6302768428766548, "language_loss": 0.86977762, "learning_rate": 3.0708511155877997e-06, "loss": 0.89471495, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.23071289, "step": 5664, "time_per_iteration": 2.8136227130889893 }, { "auxiliary_loss_clip": 0.01486269, "auxiliary_loss_mlp": 0.01042194, "balance_loss_clip": 1.29762924, "balance_loss_mlp": 1.01878095, "epoch": 0.3405982263640463, "flos": 21735715708800.0, "grad_norm": 2.0449476594805196, "language_loss": 0.69940042, "learning_rate": 3.070522162795235e-06, "loss": 0.72468507, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.23413086, "step": 5665, "time_per_iteration": 2.869515895843506 }, { "auxiliary_loss_clip": 0.01486682, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.29728627, "balance_loss_mlp": 1.01987743, "epoch": 0.34065834961671426, "flos": 18050895169920.0, "grad_norm": 2.9142886150750584, "language_loss": 0.74203491, "learning_rate": 3.0701931694083626e-06, "loss": 0.76734495, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.2442627, "step": 5666, "time_per_iteration": 2.8744821548461914 }, { "auxiliary_loss_clip": 0.01496518, "auxiliary_loss_mlp": 0.01043648, "balance_loss_clip": 1.30482244, "balance_loss_mlp": 1.01944852, "epoch": 0.3407184728693822, "flos": 21407219326080.0, "grad_norm": 1.4860344330328579, "language_loss": 0.74012572, "learning_rate": 3.0698641354396576e-06, "loss": 0.76552737, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 1.91796875, "router_z_loss_mlp": 0.24182129, "step": 5667, "time_per_iteration": 2.89290189743042 }, { "auxiliary_loss_clip": 0.01287877, "auxiliary_loss_mlp": 0.01027863, "balance_loss_clip": 1.17445886, "balance_loss_mlp": 1.00020683, "epoch": 0.3407785961220502, "flos": 68721077479680.0, "grad_norm": 0.843616422277839, "language_loss": 0.63311768, "learning_rate": 3.069535060901597e-06, "loss": 0.65627503, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.27734375, "step": 5668, "time_per_iteration": 3.552546977996826 }, { "auxiliary_loss_clip": 0.01472336, "auxiliary_loss_mlp": 0.01042671, "balance_loss_clip": 1.28427541, "balance_loss_mlp": 1.01791131, "epoch": 0.34083871937471816, "flos": 14072534496000.0, "grad_norm": 1.9886836776935644, "language_loss": 0.73513985, "learning_rate": 3.0692059458066596e-06, "loss": 0.76028991, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.24743652, "step": 5669, "time_per_iteration": 2.8623647689819336 }, { "auxiliary_loss_clip": 0.01485754, "auxiliary_loss_mlp": 0.0104077, "balance_loss_clip": 1.29633951, "balance_loss_mlp": 1.01605844, "epoch": 0.3408988426273861, "flos": 17092489674240.0, "grad_norm": 1.6782448775393117, "language_loss": 0.80917978, "learning_rate": 3.0688767901673265e-06, "loss": 0.834445, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.24743652, "step": 5670, "time_per_iteration": 2.9192001819610596 }, { "auxiliary_loss_clip": 0.01491173, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.2978456, "balance_loss_mlp": 1.01382291, "epoch": 0.3409589658800541, "flos": 24035190387840.0, "grad_norm": 1.626262312891787, "language_loss": 0.7771191, "learning_rate": 3.068547593996078e-06, "loss": 0.80240864, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 1.93261719, "router_z_loss_mlp": 0.23974609, "step": 5671, "time_per_iteration": 2.915644645690918 }, { "auxiliary_loss_clip": 0.01495552, "auxiliary_loss_mlp": 0.01039148, "balance_loss_clip": 1.30551636, "balance_loss_mlp": 1.01436484, "epoch": 0.34101908913272205, "flos": 21151847819520.0, "grad_norm": 1.7406258004641717, "language_loss": 0.74744332, "learning_rate": 3.0682183573053974e-06, "loss": 0.77279031, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.2479248, "step": 5672, "time_per_iteration": 2.8889925479888916 }, { "auxiliary_loss_clip": 0.01488731, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.29881787, "balance_loss_mlp": 1.01451182, "epoch": 0.3410792123853901, "flos": 15709496544000.0, "grad_norm": 1.9063653933772846, "language_loss": 0.74652445, "learning_rate": 3.06788908010777e-06, "loss": 0.7717911, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.234375, "step": 5673, "time_per_iteration": 2.843393087387085 }, { "auxiliary_loss_clip": 0.01469866, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.28531384, "balance_loss_mlp": 1.01481247, "epoch": 0.34113933563805804, "flos": 23045493473280.0, "grad_norm": 1.7794753818839084, "language_loss": 0.80820584, "learning_rate": 3.067559762415682e-06, "loss": 0.83329082, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.23803711, "step": 5674, "time_per_iteration": 2.862316370010376 }, { "auxiliary_loss_clip": 0.01284063, "auxiliary_loss_mlp": 0.01051494, "balance_loss_clip": 1.17345786, "balance_loss_mlp": 1.02593541, "epoch": 0.341199458890726, "flos": 69641613077760.0, "grad_norm": 0.7913267172623748, "language_loss": 0.56092203, "learning_rate": 3.0672304042416198e-06, "loss": 0.58427751, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.25585938, "step": 5675, "time_per_iteration": 4.93659782409668 }, { "auxiliary_loss_clip": 0.01459039, "auxiliary_loss_mlp": 0.01041137, "balance_loss_clip": 1.27603066, "balance_loss_mlp": 1.01749802, "epoch": 0.34125958214339397, "flos": 22356575861760.0, "grad_norm": 1.6167539201099466, "language_loss": 0.80236822, "learning_rate": 3.0669010055980734e-06, "loss": 0.82736999, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.23608398, "step": 5676, "time_per_iteration": 2.8936855792999268 }, { "auxiliary_loss_clip": 0.01485225, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.29581547, "balance_loss_mlp": 1.01632619, "epoch": 0.34131970539606193, "flos": 21882010705920.0, "grad_norm": 1.7855992639328806, "language_loss": 0.86369747, "learning_rate": 3.0665715664975357e-06, "loss": 0.88895345, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.24047852, "step": 5677, "time_per_iteration": 2.8445372581481934 }, { "auxiliary_loss_clip": 0.01483345, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.29563975, "balance_loss_mlp": 1.01341748, "epoch": 0.3413798286487299, "flos": 24946333050240.0, "grad_norm": 2.5599036953417236, "language_loss": 0.80529404, "learning_rate": 3.0662420869524966e-06, "loss": 0.83051115, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.24926758, "step": 5678, "time_per_iteration": 4.256150484085083 }, { "auxiliary_loss_clip": 0.0148572, "auxiliary_loss_mlp": 0.01039844, "balance_loss_clip": 1.29695201, "balance_loss_mlp": 1.01677752, "epoch": 0.34143995190139786, "flos": 25385308531200.0, "grad_norm": 1.6765933931444794, "language_loss": 0.76034236, "learning_rate": 3.0659125669754506e-06, "loss": 0.78559798, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.23059082, "step": 5679, "time_per_iteration": 2.910964012145996 }, { "auxiliary_loss_clip": 0.01272063, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.16572595, "balance_loss_mlp": 1.00214064, "epoch": 0.34150007515406583, "flos": 67817038250880.0, "grad_norm": 0.7245207366502291, "language_loss": 0.59533644, "learning_rate": 3.0655830065788923e-06, "loss": 0.61831594, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.23730469, "step": 5680, "time_per_iteration": 4.885948657989502 }, { "auxiliary_loss_clip": 0.01476597, "auxiliary_loss_mlp": 0.01040793, "balance_loss_clip": 1.29057467, "balance_loss_mlp": 1.01751173, "epoch": 0.3415601984067338, "flos": 20311929751680.0, "grad_norm": 1.8683275951768128, "language_loss": 0.73426342, "learning_rate": 3.0652534057753206e-06, "loss": 0.75943732, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23266602, "step": 5681, "time_per_iteration": 2.9201807975769043 }, { "auxiliary_loss_clip": 0.01478079, "auxiliary_loss_mlp": 0.01046591, "balance_loss_clip": 1.29317081, "balance_loss_mlp": 1.02321374, "epoch": 0.34162032165940176, "flos": 26042301296640.0, "grad_norm": 2.324714334292036, "language_loss": 0.72474849, "learning_rate": 3.064923764577233e-06, "loss": 0.74999517, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.23388672, "step": 5682, "time_per_iteration": 4.366451263427734 }, { "auxiliary_loss_clip": 0.01486725, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.2999475, "balance_loss_mlp": 1.01636744, "epoch": 0.3416804449120697, "flos": 28814983787520.0, "grad_norm": 1.4587130835137663, "language_loss": 0.8503682, "learning_rate": 3.0645940829971295e-06, "loss": 0.87562907, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22998047, "step": 5683, "time_per_iteration": 2.918466567993164 }, { "auxiliary_loss_clip": 0.01494125, "auxiliary_loss_mlp": 0.01040818, "balance_loss_clip": 1.30516529, "balance_loss_mlp": 1.01629686, "epoch": 0.3417405681647377, "flos": 22611856878720.0, "grad_norm": 1.789634343145235, "language_loss": 0.71960211, "learning_rate": 3.0642643610475116e-06, "loss": 0.74495149, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.24511719, "step": 5684, "time_per_iteration": 2.905488967895508 }, { "auxiliary_loss_clip": 0.01478977, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.29628515, "balance_loss_mlp": 1.01859617, "epoch": 0.34180069141740566, "flos": 24726641708160.0, "grad_norm": 3.963055919160837, "language_loss": 0.76132274, "learning_rate": 3.0639345987408823e-06, "loss": 0.78652763, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.22924805, "step": 5685, "time_per_iteration": 2.9006502628326416 }, { "auxiliary_loss_clip": 0.01472371, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.29062247, "balance_loss_mlp": 1.01976562, "epoch": 0.3418608146700737, "flos": 30530997780480.0, "grad_norm": 1.6396355001175116, "language_loss": 0.71397448, "learning_rate": 3.0636047960897468e-06, "loss": 0.73911935, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.22363281, "step": 5686, "time_per_iteration": 2.9380719661712646 }, { "auxiliary_loss_clip": 0.01492894, "auxiliary_loss_mlp": 0.01044625, "balance_loss_clip": 1.30431521, "balance_loss_mlp": 1.02098608, "epoch": 0.34192093792274164, "flos": 15130741317120.0, "grad_norm": 1.887302226930833, "language_loss": 0.78336215, "learning_rate": 3.06327495310661e-06, "loss": 0.8087374, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.23632812, "step": 5687, "time_per_iteration": 2.882664442062378 }, { "auxiliary_loss_clip": 0.01479658, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.29762793, "balance_loss_mlp": 1.01838422, "epoch": 0.3419810611754096, "flos": 13195488430080.0, "grad_norm": 2.265846173989185, "language_loss": 0.88120115, "learning_rate": 3.062945069803981e-06, "loss": 0.90642905, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.24743652, "step": 5688, "time_per_iteration": 2.8851053714752197 }, { "auxiliary_loss_clip": 0.014977, "auxiliary_loss_mlp": 0.01047401, "balance_loss_clip": 1.30654192, "balance_loss_mlp": 1.02274895, "epoch": 0.34204118442807757, "flos": 19545589008000.0, "grad_norm": 1.692954002554797, "language_loss": 0.80701518, "learning_rate": 3.0626151461943684e-06, "loss": 0.83246613, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.24658203, "step": 5689, "time_per_iteration": 2.8464603424072266 }, { "auxiliary_loss_clip": 0.01494846, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.30701113, "balance_loss_mlp": 1.02053714, "epoch": 0.34210130768074554, "flos": 15203142276480.0, "grad_norm": 4.480820520340829, "language_loss": 0.74462903, "learning_rate": 3.0622851822902834e-06, "loss": 0.77003038, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.24719238, "step": 5690, "time_per_iteration": 2.8449182510375977 }, { "auxiliary_loss_clip": 0.01492243, "auxiliary_loss_mlp": 0.01040112, "balance_loss_clip": 1.30490994, "balance_loss_mlp": 1.01696134, "epoch": 0.3421614309334135, "flos": 24946966477440.0, "grad_norm": 1.8252067677347612, "language_loss": 0.7761569, "learning_rate": 3.061955178104237e-06, "loss": 0.80148047, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.23132324, "step": 5691, "time_per_iteration": 3.080873727798462 }, { "auxiliary_loss_clip": 0.01473079, "auxiliary_loss_mlp": 0.01044519, "balance_loss_clip": 1.28906584, "balance_loss_mlp": 1.02108216, "epoch": 0.34222155418608147, "flos": 21918776745600.0, "grad_norm": 1.6027155699983473, "language_loss": 0.69645917, "learning_rate": 3.0616251336487447e-06, "loss": 0.72163516, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.234375, "step": 5692, "time_per_iteration": 2.885075092315674 }, { "auxiliary_loss_clip": 0.0149018, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.30196989, "balance_loss_mlp": 1.01822209, "epoch": 0.34228167743874943, "flos": 18123205639680.0, "grad_norm": 2.1158466212015647, "language_loss": 0.74178374, "learning_rate": 3.06129504893632e-06, "loss": 0.76711243, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.24475098, "step": 5693, "time_per_iteration": 2.8556039333343506 }, { "auxiliary_loss_clip": 0.01473583, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.29139328, "balance_loss_mlp": 1.02432108, "epoch": 0.3423418006914174, "flos": 21298640509440.0, "grad_norm": 2.0834179777481503, "language_loss": 0.77085865, "learning_rate": 3.0609649239794813e-06, "loss": 0.79607195, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.23413086, "step": 5694, "time_per_iteration": 2.8845224380493164 }, { "auxiliary_loss_clip": 0.01472868, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.29101706, "balance_loss_mlp": 1.01756883, "epoch": 0.34240192394408536, "flos": 19831663751040.0, "grad_norm": 1.8319180590360948, "language_loss": 0.80729592, "learning_rate": 3.060634758790747e-06, "loss": 0.83242595, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.22570801, "step": 5695, "time_per_iteration": 2.9100561141967773 }, { "auxiliary_loss_clip": 0.01476536, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.29110658, "balance_loss_mlp": 1.01774943, "epoch": 0.3424620471967533, "flos": 24546340604160.0, "grad_norm": 7.313005194273893, "language_loss": 0.74529189, "learning_rate": 3.060304553382635e-06, "loss": 0.77047461, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.23999023, "step": 5696, "time_per_iteration": 2.968601703643799 }, { "auxiliary_loss_clip": 0.0148179, "auxiliary_loss_mlp": 0.01050972, "balance_loss_clip": 1.29698968, "balance_loss_mlp": 1.02736843, "epoch": 0.3425221704494213, "flos": 25860009421440.0, "grad_norm": 1.8748157644889791, "language_loss": 0.714957, "learning_rate": 3.0599743077676685e-06, "loss": 0.74028462, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.23608398, "step": 5697, "time_per_iteration": 3.000431537628174 }, { "auxiliary_loss_clip": 0.01480074, "auxiliary_loss_mlp": 0.0103818, "balance_loss_clip": 1.29667258, "balance_loss_mlp": 1.01543498, "epoch": 0.34258229370208926, "flos": 21549713760000.0, "grad_norm": 1.7608423269418494, "language_loss": 0.82846022, "learning_rate": 3.05964402195837e-06, "loss": 0.8536427, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.22753906, "step": 5698, "time_per_iteration": 2.860071897506714 }, { "auxiliary_loss_clip": 0.01494558, "auxiliary_loss_mlp": 0.01047724, "balance_loss_clip": 1.30705237, "balance_loss_mlp": 1.02347636, "epoch": 0.3426424169547573, "flos": 23661512432640.0, "grad_norm": 1.8674801584303693, "language_loss": 0.70214844, "learning_rate": 3.0593136959672645e-06, "loss": 0.72757125, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.24279785, "step": 5699, "time_per_iteration": 2.8833255767822266 }, { "auxiliary_loss_clip": 0.01481973, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.2971468, "balance_loss_mlp": 1.01513863, "epoch": 0.34270254020742524, "flos": 24655688582400.0, "grad_norm": 2.2221592487681767, "language_loss": 0.73987657, "learning_rate": 3.058983329806877e-06, "loss": 0.76507384, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22607422, "step": 5700, "time_per_iteration": 2.952270984649658 }, { "auxiliary_loss_clip": 0.01492169, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.30772364, "balance_loss_mlp": 1.01784897, "epoch": 0.3427626634600932, "flos": 21006729187200.0, "grad_norm": 1.7776962679820854, "language_loss": 0.82386231, "learning_rate": 3.0586529234897354e-06, "loss": 0.84919369, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.23120117, "step": 5701, "time_per_iteration": 2.8703997135162354 }, { "auxiliary_loss_clip": 0.01486146, "auxiliary_loss_mlp": 0.01044253, "balance_loss_clip": 1.30032587, "balance_loss_mlp": 1.02138877, "epoch": 0.3428227867127612, "flos": 21443668652160.0, "grad_norm": 1.6643539192209136, "language_loss": 0.72538102, "learning_rate": 3.0583224770283694e-06, "loss": 0.75068498, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.2286377, "step": 5702, "time_per_iteration": 2.853252410888672 }, { "auxiliary_loss_clip": 0.01251748, "auxiliary_loss_mlp": 0.01020829, "balance_loss_clip": 1.14721239, "balance_loss_mlp": 1.00080156, "epoch": 0.34288290996542914, "flos": 55760392154880.0, "grad_norm": 0.7799765185120751, "language_loss": 0.57669407, "learning_rate": 3.057991990435309e-06, "loss": 0.59941989, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.20019531, "step": 5703, "time_per_iteration": 3.251143217086792 }, { "auxiliary_loss_clip": 0.01493765, "auxiliary_loss_mlp": 0.01043369, "balance_loss_clip": 1.30653358, "balance_loss_mlp": 1.01885962, "epoch": 0.3429430332180971, "flos": 20166358671360.0, "grad_norm": 2.0198636519643944, "language_loss": 0.76164848, "learning_rate": 3.057661463723086e-06, "loss": 0.78701979, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.24499512, "step": 5704, "time_per_iteration": 2.8523623943328857 }, { "auxiliary_loss_clip": 0.01478742, "auxiliary_loss_mlp": 0.01047902, "balance_loss_clip": 1.29599118, "balance_loss_mlp": 1.02428651, "epoch": 0.34300315647076507, "flos": 17974376933760.0, "grad_norm": 2.0695864268058597, "language_loss": 0.73681819, "learning_rate": 3.0573308969042346e-06, "loss": 0.7620846, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.23632812, "step": 5705, "time_per_iteration": 2.860525131225586 }, { "auxiliary_loss_clip": 0.01486183, "auxiliary_loss_mlp": 0.01042138, "balance_loss_clip": 1.30105817, "balance_loss_mlp": 1.01749754, "epoch": 0.34306327972343303, "flos": 22095910713600.0, "grad_norm": 2.044513158405664, "language_loss": 0.80285633, "learning_rate": 3.057000289991289e-06, "loss": 0.82813954, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.24658203, "step": 5706, "time_per_iteration": 2.884566307067871 }, { "auxiliary_loss_clip": 0.01493564, "auxiliary_loss_mlp": 0.01041244, "balance_loss_clip": 1.3038497, "balance_loss_mlp": 1.01777148, "epoch": 0.343123402976101, "flos": 18451973491200.0, "grad_norm": 1.8858513992844508, "language_loss": 0.83709687, "learning_rate": 3.056669642996787e-06, "loss": 0.862445, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.23474121, "step": 5707, "time_per_iteration": 2.8469135761260986 }, { "auxiliary_loss_clip": 0.01485376, "auxiliary_loss_mlp": 0.01042524, "balance_loss_clip": 1.30076635, "balance_loss_mlp": 1.0189321, "epoch": 0.34318352622876896, "flos": 17172129801600.0, "grad_norm": 1.500566844224646, "language_loss": 0.75750196, "learning_rate": 3.056338955933266e-06, "loss": 0.78278095, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.23583984, "step": 5708, "time_per_iteration": 2.866607189178467 }, { "auxiliary_loss_clip": 0.0147062, "auxiliary_loss_mlp": 0.01040921, "balance_loss_clip": 1.28808653, "balance_loss_mlp": 1.01814008, "epoch": 0.34324364948143693, "flos": 26699520286080.0, "grad_norm": 1.5235453183406196, "language_loss": 0.81740677, "learning_rate": 3.0560082288132662e-06, "loss": 0.84252214, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.22766113, "step": 5709, "time_per_iteration": 2.9027554988861084 }, { "auxiliary_loss_clip": 0.01487335, "auxiliary_loss_mlp": 0.01047784, "balance_loss_clip": 1.3000505, "balance_loss_mlp": 1.02284527, "epoch": 0.3433037727341049, "flos": 21261467266560.0, "grad_norm": 1.9488103026208399, "language_loss": 0.79678798, "learning_rate": 3.055677461649329e-06, "loss": 0.82213914, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.24951172, "step": 5710, "time_per_iteration": 4.2722327709198 }, { "auxiliary_loss_clip": 0.01486733, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.29706621, "balance_loss_mlp": 1.0135386, "epoch": 0.34336389598677286, "flos": 20638661587200.0, "grad_norm": 1.7754777628390912, "language_loss": 0.71572816, "learning_rate": 3.055346654453996e-06, "loss": 0.740978, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.24707031, "step": 5711, "time_per_iteration": 2.8505163192749023 }, { "auxiliary_loss_clip": 0.01466748, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.28311801, "balance_loss_mlp": 1.01811814, "epoch": 0.3434240192394409, "flos": 14546828183040.0, "grad_norm": 1.7005521764814537, "language_loss": 0.68399847, "learning_rate": 3.055015807239812e-06, "loss": 0.70908809, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.2409668, "step": 5712, "time_per_iteration": 2.9147815704345703 }, { "auxiliary_loss_clip": 0.01261446, "auxiliary_loss_mlp": 0.01053607, "balance_loss_clip": 1.15592074, "balance_loss_mlp": 1.03386617, "epoch": 0.34348414249210885, "flos": 58076770406400.0, "grad_norm": 0.8506168200074447, "language_loss": 0.58204055, "learning_rate": 3.0546849200193226e-06, "loss": 0.60519111, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 0.19726562, "step": 5713, "time_per_iteration": 4.738008737564087 }, { "auxiliary_loss_clip": 0.01487975, "auxiliary_loss_mlp": 0.0104274, "balance_loss_clip": 1.30220628, "balance_loss_mlp": 1.02046025, "epoch": 0.3435442657447768, "flos": 20714229682560.0, "grad_norm": 1.6290913914929215, "language_loss": 0.81837648, "learning_rate": 3.054353992805076e-06, "loss": 0.8436836, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22290039, "step": 5714, "time_per_iteration": 2.8812825679779053 }, { "auxiliary_loss_clip": 0.01482389, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.29707336, "balance_loss_mlp": 1.01432502, "epoch": 0.3436043889974448, "flos": 22940669975040.0, "grad_norm": 2.3356131706779584, "language_loss": 0.72924566, "learning_rate": 3.05402302560962e-06, "loss": 0.75443196, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.21923828, "step": 5715, "time_per_iteration": 2.8737645149230957 }, { "auxiliary_loss_clip": 0.01258677, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.15124726, "balance_loss_mlp": 1.00647545, "epoch": 0.34366451225011274, "flos": 58434675909120.0, "grad_norm": 0.9101243748589947, "language_loss": 0.66107607, "learning_rate": 3.053692018445505e-06, "loss": 0.68394125, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.21386719, "step": 5716, "time_per_iteration": 4.772834062576294 }, { "auxiliary_loss_clip": 0.01458134, "auxiliary_loss_mlp": 0.01041274, "balance_loss_clip": 1.27784848, "balance_loss_mlp": 1.01869535, "epoch": 0.3437246355027807, "flos": 15604718290560.0, "grad_norm": 1.8812413620421808, "language_loss": 0.74749094, "learning_rate": 3.0533609713252838e-06, "loss": 0.77248502, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22583008, "step": 5717, "time_per_iteration": 2.862823724746704 }, { "auxiliary_loss_clip": 0.01475337, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.29166365, "balance_loss_mlp": 1.01405931, "epoch": 0.34378475875544867, "flos": 27683606845440.0, "grad_norm": 1.94888562481502, "language_loss": 0.76475859, "learning_rate": 3.0530298842615077e-06, "loss": 0.78987461, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.2220459, "step": 5718, "time_per_iteration": 4.343626976013184 }, { "auxiliary_loss_clip": 0.01477319, "auxiliary_loss_mlp": 0.01041697, "balance_loss_clip": 1.2904973, "balance_loss_mlp": 1.01816475, "epoch": 0.34384488200811664, "flos": 31444131214080.0, "grad_norm": 1.9107268943222022, "language_loss": 0.647008, "learning_rate": 3.052698757266734e-06, "loss": 0.67219818, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23535156, "step": 5719, "time_per_iteration": 2.952838182449341 }, { "auxiliary_loss_clip": 0.0147778, "auxiliary_loss_mlp": 0.01040657, "balance_loss_clip": 1.29100251, "balance_loss_mlp": 1.01673198, "epoch": 0.3439050052607846, "flos": 24910698130560.0, "grad_norm": 1.797083032639574, "language_loss": 0.74605501, "learning_rate": 3.0523675903535183e-06, "loss": 0.7712394, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23937988, "step": 5720, "time_per_iteration": 2.915485382080078 }, { "auxiliary_loss_clip": 0.01479269, "auxiliary_loss_mlp": 0.01037501, "balance_loss_clip": 1.29412198, "balance_loss_mlp": 1.01481509, "epoch": 0.34396512851345257, "flos": 18159112028160.0, "grad_norm": 1.6701827873180657, "language_loss": 0.74628866, "learning_rate": 3.0520363835344173e-06, "loss": 0.77145636, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22692871, "step": 5721, "time_per_iteration": 2.8276572227478027 }, { "auxiliary_loss_clip": 0.01479699, "auxiliary_loss_mlp": 0.01044796, "balance_loss_clip": 1.29534388, "balance_loss_mlp": 1.02219427, "epoch": 0.34402525176612053, "flos": 16043060344320.0, "grad_norm": 2.316545739366996, "language_loss": 0.8126182, "learning_rate": 3.051705136821992e-06, "loss": 0.83786315, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22619629, "step": 5722, "time_per_iteration": 2.846906900405884 }, { "auxiliary_loss_clip": 0.01467925, "auxiliary_loss_mlp": 0.01038837, "balance_loss_clip": 1.28573656, "balance_loss_mlp": 1.01754594, "epoch": 0.3440853750187885, "flos": 21188523369600.0, "grad_norm": 1.7429046550709189, "language_loss": 0.82292736, "learning_rate": 3.051373850228801e-06, "loss": 0.84799504, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.21313477, "step": 5723, "time_per_iteration": 2.8604414463043213 }, { "auxiliary_loss_clip": 0.01476312, "auxiliary_loss_mlp": 0.0104546, "balance_loss_clip": 1.2917031, "balance_loss_mlp": 1.01862597, "epoch": 0.34414549827145646, "flos": 12685650312960.0, "grad_norm": 2.2805962533966566, "language_loss": 0.82440972, "learning_rate": 3.0510425237674096e-06, "loss": 0.84962749, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.26818848, "step": 5724, "time_per_iteration": 2.8361175060272217 }, { "auxiliary_loss_clip": 0.01469977, "auxiliary_loss_mlp": 0.01040462, "balance_loss_clip": 1.28581595, "balance_loss_mlp": 1.01781249, "epoch": 0.3442056215241244, "flos": 31296840831360.0, "grad_norm": 1.8619518198957918, "language_loss": 0.69955611, "learning_rate": 3.05071115745038e-06, "loss": 0.72466052, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22631836, "step": 5725, "time_per_iteration": 2.9467291831970215 }, { "auxiliary_loss_clip": 0.01498043, "auxiliary_loss_mlp": 0.01043411, "balance_loss_clip": 1.30954003, "balance_loss_mlp": 1.01933074, "epoch": 0.34426574477679245, "flos": 23377609440000.0, "grad_norm": 1.4170113584557122, "language_loss": 0.70602894, "learning_rate": 3.0503797512902773e-06, "loss": 0.73144346, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.24084473, "step": 5726, "time_per_iteration": 2.882237672805786 }, { "auxiliary_loss_clip": 0.01491839, "auxiliary_loss_mlp": 0.01044712, "balance_loss_clip": 1.30476427, "balance_loss_mlp": 1.02327776, "epoch": 0.3443258680294604, "flos": 24546204869760.0, "grad_norm": 1.9208719122408142, "language_loss": 0.73868763, "learning_rate": 3.0500483052996703e-06, "loss": 0.76405317, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21435547, "step": 5727, "time_per_iteration": 2.9142730236053467 }, { "auxiliary_loss_clip": 0.01481104, "auxiliary_loss_mlp": 0.01046624, "balance_loss_clip": 1.29666114, "balance_loss_mlp": 1.02490401, "epoch": 0.3443859912821284, "flos": 20239800261120.0, "grad_norm": 2.0590785042808957, "language_loss": 0.89226019, "learning_rate": 3.0497168194911257e-06, "loss": 0.91753745, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.21740723, "step": 5728, "time_per_iteration": 2.8307361602783203 }, { "auxiliary_loss_clip": 0.01479638, "auxiliary_loss_mlp": 0.01047157, "balance_loss_clip": 1.29584014, "balance_loss_mlp": 1.02504396, "epoch": 0.34444611453479634, "flos": 24327237444480.0, "grad_norm": 2.097829760254631, "language_loss": 0.71353674, "learning_rate": 3.0493852938772143e-06, "loss": 0.73880464, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22119141, "step": 5729, "time_per_iteration": 2.8734922409057617 }, { "auxiliary_loss_clip": 0.01472183, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.28995001, "balance_loss_mlp": 1.02016675, "epoch": 0.3445062377874643, "flos": 16992371635200.0, "grad_norm": 1.8736306421339783, "language_loss": 0.75093538, "learning_rate": 3.0490537284705078e-06, "loss": 0.77609134, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.23242188, "step": 5730, "time_per_iteration": 2.8149631023406982 }, { "auxiliary_loss_clip": 0.01478406, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.29514623, "balance_loss_mlp": 1.02171791, "epoch": 0.3445663610401323, "flos": 20312291710080.0, "grad_norm": 3.7489692215942574, "language_loss": 0.81116503, "learning_rate": 3.048722123283578e-06, "loss": 0.83640105, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23486328, "step": 5731, "time_per_iteration": 2.9419095516204834 }, { "auxiliary_loss_clip": 0.01487835, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.30267119, "balance_loss_mlp": 1.01940119, "epoch": 0.34462648429280024, "flos": 15896584368000.0, "grad_norm": 2.8000631531280664, "language_loss": 0.79265344, "learning_rate": 3.0483904783290006e-06, "loss": 0.81794035, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.21459961, "step": 5732, "time_per_iteration": 2.8550000190734863 }, { "auxiliary_loss_clip": 0.01253171, "auxiliary_loss_mlp": 0.01081038, "balance_loss_clip": 1.14761519, "balance_loss_mlp": 1.06043899, "epoch": 0.3446866075454682, "flos": 59341520315520.0, "grad_norm": 0.7611188657171202, "language_loss": 0.5359503, "learning_rate": 3.0480587936193505e-06, "loss": 0.55929232, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 0.20605469, "step": 5733, "time_per_iteration": 3.372556686401367 }, { "auxiliary_loss_clip": 0.01483317, "auxiliary_loss_mlp": 0.01042522, "balance_loss_clip": 1.29688585, "balance_loss_mlp": 1.01862073, "epoch": 0.34474673079813617, "flos": 22353363480960.0, "grad_norm": 2.6901644518049017, "language_loss": 0.84003592, "learning_rate": 3.047727069167207e-06, "loss": 0.86529422, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.2388916, "step": 5734, "time_per_iteration": 2.871182441711426 }, { "auxiliary_loss_clip": 0.01490691, "auxiliary_loss_mlp": 0.01040991, "balance_loss_clip": 1.30347383, "balance_loss_mlp": 1.01719689, "epoch": 0.34480685405080413, "flos": 27680937402240.0, "grad_norm": 1.7684844875088794, "language_loss": 0.94077313, "learning_rate": 3.0473953049851478e-06, "loss": 0.9660899, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23791504, "step": 5735, "time_per_iteration": 2.885958671569824 }, { "auxiliary_loss_clip": 0.01497156, "auxiliary_loss_mlp": 0.01038845, "balance_loss_clip": 1.30963016, "balance_loss_mlp": 1.01624274, "epoch": 0.3448669773034721, "flos": 22466331043200.0, "grad_norm": 1.7490086777712723, "language_loss": 0.7769556, "learning_rate": 3.0470635010857533e-06, "loss": 0.80231559, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.22595215, "step": 5736, "time_per_iteration": 2.839071750640869 }, { "auxiliary_loss_clip": 0.0148963, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.30303907, "balance_loss_mlp": 1.01496649, "epoch": 0.34492710055614006, "flos": 24946921232640.0, "grad_norm": 1.4812093884193605, "language_loss": 0.79298025, "learning_rate": 3.0467316574816064e-06, "loss": 0.81825823, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.23217773, "step": 5737, "time_per_iteration": 2.8906710147857666 }, { "auxiliary_loss_clip": 0.0150445, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.31295419, "balance_loss_mlp": 1.01543236, "epoch": 0.34498722380880803, "flos": 20130497527680.0, "grad_norm": 1.961297394370401, "language_loss": 0.72684181, "learning_rate": 3.0463997741852893e-06, "loss": 0.75227463, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.23376465, "step": 5738, "time_per_iteration": 2.8215975761413574 }, { "auxiliary_loss_clip": 0.01491585, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.30378127, "balance_loss_mlp": 1.01437485, "epoch": 0.34504734706147605, "flos": 28449585630720.0, "grad_norm": 1.7324288078210495, "language_loss": 0.82168853, "learning_rate": 3.046067851209389e-06, "loss": 0.84697545, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.22753906, "step": 5739, "time_per_iteration": 2.896411895751953 }, { "auxiliary_loss_clip": 0.0149123, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.30521858, "balance_loss_mlp": 1.01683354, "epoch": 0.345107470314144, "flos": 22684710286080.0, "grad_norm": 1.9173009141619501, "language_loss": 0.84159964, "learning_rate": 3.0457358885664898e-06, "loss": 0.86691868, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.23828125, "step": 5740, "time_per_iteration": 2.861753463745117 }, { "auxiliary_loss_clip": 0.0149362, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.30902338, "balance_loss_mlp": 1.0158906, "epoch": 0.345167593566812, "flos": 20640290400000.0, "grad_norm": 1.9726559805583839, "language_loss": 0.78051275, "learning_rate": 3.045403886269181e-06, "loss": 0.80583715, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.22924805, "step": 5741, "time_per_iteration": 2.8760039806365967 }, { "auxiliary_loss_clip": 0.01493842, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.30385518, "balance_loss_mlp": 1.0148989, "epoch": 0.34522771681947995, "flos": 26225724291840.0, "grad_norm": 2.2623598862933076, "language_loss": 0.77624166, "learning_rate": 3.045071844330053e-06, "loss": 0.80155694, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.22753906, "step": 5742, "time_per_iteration": 2.992497682571411 }, { "auxiliary_loss_clip": 0.01480088, "auxiliary_loss_mlp": 0.01038265, "balance_loss_clip": 1.29656124, "balance_loss_mlp": 1.01452994, "epoch": 0.3452878400721479, "flos": 19071657279360.0, "grad_norm": 2.1069843493224414, "language_loss": 0.7715745, "learning_rate": 3.0447397627616955e-06, "loss": 0.79675806, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23730469, "step": 5743, "time_per_iteration": 2.888165235519409 }, { "auxiliary_loss_clip": 0.01484311, "auxiliary_loss_mlp": 0.01042854, "balance_loss_clip": 1.30026627, "balance_loss_mlp": 1.01946545, "epoch": 0.3453479633248159, "flos": 27941693040000.0, "grad_norm": 1.5291127370394664, "language_loss": 0.71237713, "learning_rate": 3.0444076415767016e-06, "loss": 0.73764884, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23413086, "step": 5744, "time_per_iteration": 2.9155471324920654 }, { "auxiliary_loss_clip": 0.01470856, "auxiliary_loss_mlp": 0.0104009, "balance_loss_clip": 1.28907025, "balance_loss_mlp": 1.01735651, "epoch": 0.34540808657748384, "flos": 19614641852160.0, "grad_norm": 1.6596394436697124, "language_loss": 0.80640578, "learning_rate": 3.044075480787665e-06, "loss": 0.83151525, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.22729492, "step": 5745, "time_per_iteration": 4.210557699203491 }, { "auxiliary_loss_clip": 0.01494639, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.30725682, "balance_loss_mlp": 1.01472449, "epoch": 0.3454682098301518, "flos": 20421141995520.0, "grad_norm": 1.8715411327252032, "language_loss": 0.90165317, "learning_rate": 3.043743280407182e-06, "loss": 0.92697597, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.22900391, "step": 5746, "time_per_iteration": 2.834477186203003 }, { "auxiliary_loss_clip": 0.01509693, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.31929672, "balance_loss_mlp": 1.01764607, "epoch": 0.34552833308281977, "flos": 21335180325120.0, "grad_norm": 1.9068009637882408, "language_loss": 0.66481394, "learning_rate": 3.043411040447849e-06, "loss": 0.69032705, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.23986816, "step": 5747, "time_per_iteration": 2.8847992420196533 }, { "auxiliary_loss_clip": 0.01487599, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.30365694, "balance_loss_mlp": 1.018628, "epoch": 0.34558845633548774, "flos": 36256482887040.0, "grad_norm": 1.4837471102806254, "language_loss": 0.73322713, "learning_rate": 3.043078760922264e-06, "loss": 0.75851202, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.22277832, "step": 5748, "time_per_iteration": 4.436941385269165 }, { "auxiliary_loss_clip": 0.01471668, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.29126644, "balance_loss_mlp": 1.01324904, "epoch": 0.3456485795881557, "flos": 22459453833600.0, "grad_norm": 1.5430070985842557, "language_loss": 0.76527661, "learning_rate": 3.042746441843029e-06, "loss": 0.79034197, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.21630859, "step": 5749, "time_per_iteration": 2.8982269763946533 }, { "auxiliary_loss_clip": 0.01255633, "auxiliary_loss_mlp": 0.01022735, "balance_loss_clip": 1.14954615, "balance_loss_mlp": 1.00328052, "epoch": 0.34570870284082367, "flos": 62033748272640.0, "grad_norm": 0.881660689215939, "language_loss": 0.62774605, "learning_rate": 3.0424140832227437e-06, "loss": 0.65052974, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.19433594, "step": 5750, "time_per_iteration": 4.687304496765137 }, { "auxiliary_loss_clip": 0.01471462, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.29214263, "balance_loss_mlp": 1.01663947, "epoch": 0.34576882609349163, "flos": 22792022248320.0, "grad_norm": 1.6810878772500837, "language_loss": 0.81789094, "learning_rate": 3.042081685074012e-06, "loss": 0.84300154, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22961426, "step": 5751, "time_per_iteration": 2.9845314025878906 }, { "auxiliary_loss_clip": 0.01478686, "auxiliary_loss_mlp": 0.01039919, "balance_loss_clip": 1.29653907, "balance_loss_mlp": 1.01655436, "epoch": 0.34582894934615965, "flos": 12356339523840.0, "grad_norm": 2.202883925539081, "language_loss": 0.85703117, "learning_rate": 3.041749247409439e-06, "loss": 0.88221729, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.23352051, "step": 5752, "time_per_iteration": 2.8346107006073 }, { "auxiliary_loss_clip": 0.01252876, "auxiliary_loss_mlp": 0.01022806, "balance_loss_clip": 1.14784169, "balance_loss_mlp": 1.00239706, "epoch": 0.3458890725988276, "flos": 70196813746560.0, "grad_norm": 0.7377490357669844, "language_loss": 0.63167304, "learning_rate": 3.0414167702416296e-06, "loss": 0.65442985, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.20410156, "step": 5753, "time_per_iteration": 4.633998155593872 }, { "auxiliary_loss_clip": 0.01481393, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.29796886, "balance_loss_mlp": 1.01384842, "epoch": 0.3459491958514956, "flos": 17101493389440.0, "grad_norm": 1.7617481685003882, "language_loss": 0.72330743, "learning_rate": 3.0410842535831914e-06, "loss": 0.74848366, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.22375488, "step": 5754, "time_per_iteration": 2.9359426498413086 }, { "auxiliary_loss_clip": 0.01509707, "auxiliary_loss_mlp": 0.01038837, "balance_loss_clip": 1.31998038, "balance_loss_mlp": 1.01600814, "epoch": 0.34600931910416355, "flos": 16658762590080.0, "grad_norm": 1.7876717986096986, "language_loss": 0.73788029, "learning_rate": 3.0407516974467343e-06, "loss": 0.76336575, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.22814941, "step": 5755, "time_per_iteration": 2.8450236320495605 }, { "auxiliary_loss_clip": 0.01487707, "auxiliary_loss_mlp": 0.01042558, "balance_loss_clip": 1.30407655, "balance_loss_mlp": 1.01956201, "epoch": 0.3460694423568315, "flos": 38560663025280.0, "grad_norm": 2.1304945593061726, "language_loss": 0.72793311, "learning_rate": 3.040419101844869e-06, "loss": 0.7532357, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.2298584, "step": 5756, "time_per_iteration": 3.019719362258911 }, { "auxiliary_loss_clip": 0.01254974, "auxiliary_loss_mlp": 0.0102118, "balance_loss_clip": 1.14960492, "balance_loss_mlp": 0.99934065, "epoch": 0.3461295656094995, "flos": 72115389285120.0, "grad_norm": 0.7590716851070733, "language_loss": 0.62631851, "learning_rate": 3.040086466790207e-06, "loss": 0.6490801, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.21875, "step": 5757, "time_per_iteration": 3.3244504928588867 }, { "auxiliary_loss_clip": 0.01256386, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.14938974, "balance_loss_mlp": 1.01326299, "epoch": 0.34618968886216744, "flos": 65487747248640.0, "grad_norm": 0.8260900918288759, "language_loss": 0.59337348, "learning_rate": 3.039753792295362e-06, "loss": 0.61627597, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.20605469, "step": 5758, "time_per_iteration": 3.332646131515503 }, { "auxiliary_loss_clip": 0.01473741, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.2941525, "balance_loss_mlp": 1.01484311, "epoch": 0.3462498121148354, "flos": 23481799511040.0, "grad_norm": 1.6515240148667716, "language_loss": 0.72738576, "learning_rate": 3.0394210783729487e-06, "loss": 0.75249183, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.22009277, "step": 5759, "time_per_iteration": 3.0097854137420654 }, { "auxiliary_loss_clip": 0.01477409, "auxiliary_loss_mlp": 0.01041788, "balance_loss_clip": 1.29610896, "balance_loss_mlp": 1.01839876, "epoch": 0.3463099353675034, "flos": 24181530629760.0, "grad_norm": 1.6936561280311544, "language_loss": 0.84536099, "learning_rate": 3.0390883250355836e-06, "loss": 0.87055296, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23376465, "step": 5760, "time_per_iteration": 3.0449624061584473 }, { "auxiliary_loss_clip": 0.01255624, "auxiliary_loss_mlp": 0.01020015, "balance_loss_clip": 1.14809585, "balance_loss_mlp": 0.99884397, "epoch": 0.34637005862017134, "flos": 63725692032000.0, "grad_norm": 0.8393070658327962, "language_loss": 0.56715119, "learning_rate": 3.0387555322958865e-06, "loss": 0.58990759, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.21191406, "step": 5761, "time_per_iteration": 3.3759799003601074 }, { "auxiliary_loss_clip": 0.01468697, "auxiliary_loss_mlp": 0.01041337, "balance_loss_clip": 1.28823292, "balance_loss_mlp": 1.01795959, "epoch": 0.3464301818728393, "flos": 13151709446400.0, "grad_norm": 2.4827338971348127, "language_loss": 0.95889151, "learning_rate": 3.038422700166474e-06, "loss": 0.98399192, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.23364258, "step": 5762, "time_per_iteration": 2.887610912322998 }, { "auxiliary_loss_clip": 0.01495655, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.30729938, "balance_loss_mlp": 1.01727724, "epoch": 0.34649030512550727, "flos": 29327219879040.0, "grad_norm": 1.5401867044969124, "language_loss": 0.70618534, "learning_rate": 3.0380898286599692e-06, "loss": 0.7315439, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.22912598, "step": 5763, "time_per_iteration": 2.9568610191345215 }, { "auxiliary_loss_clip": 0.01504963, "auxiliary_loss_mlp": 0.01043207, "balance_loss_clip": 1.31436515, "balance_loss_mlp": 1.01783943, "epoch": 0.34655042837817523, "flos": 23740971580800.0, "grad_norm": 2.727962863978151, "language_loss": 0.84262002, "learning_rate": 3.0377569177889945e-06, "loss": 0.86810178, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.2532959, "step": 5764, "time_per_iteration": 2.9098241329193115 }, { "auxiliary_loss_clip": 0.01475918, "auxiliary_loss_mlp": 0.01037434, "balance_loss_clip": 1.29402208, "balance_loss_mlp": 1.01442695, "epoch": 0.34661055163084326, "flos": 22064302581120.0, "grad_norm": 2.5936655139975104, "language_loss": 0.69419324, "learning_rate": 3.0374239675661722e-06, "loss": 0.71932667, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23034668, "step": 5765, "time_per_iteration": 2.8823161125183105 }, { "auxiliary_loss_clip": 0.01493383, "auxiliary_loss_mlp": 0.01042762, "balance_loss_clip": 1.31128132, "balance_loss_mlp": 1.01906347, "epoch": 0.3466706748835112, "flos": 21809338277760.0, "grad_norm": 2.036136529250617, "language_loss": 0.77542233, "learning_rate": 3.03709097800413e-06, "loss": 0.80078375, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.23718262, "step": 5766, "time_per_iteration": 2.872681140899658 }, { "auxiliary_loss_clip": 0.01482722, "auxiliary_loss_mlp": 0.01043207, "balance_loss_clip": 1.30078423, "balance_loss_mlp": 1.02074814, "epoch": 0.3467307981361792, "flos": 19470563850240.0, "grad_norm": 2.2476449288716096, "language_loss": 0.7443248, "learning_rate": 3.0367579491154943e-06, "loss": 0.76958406, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.2244873, "step": 5767, "time_per_iteration": 2.876159191131592 }, { "auxiliary_loss_clip": 0.01483492, "auxiliary_loss_mlp": 0.01046324, "balance_loss_clip": 1.30066299, "balance_loss_mlp": 1.02195799, "epoch": 0.34679092138884715, "flos": 24838342416000.0, "grad_norm": 1.9472699982290949, "language_loss": 0.79156113, "learning_rate": 3.036424880912893e-06, "loss": 0.81685925, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.24365234, "step": 5768, "time_per_iteration": 2.9208943843841553 }, { "auxiliary_loss_clip": 0.01257133, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.15064418, "balance_loss_mlp": 1.01378059, "epoch": 0.3468510446415151, "flos": 63263569196160.0, "grad_norm": 0.7752184876031226, "language_loss": 0.57570904, "learning_rate": 3.036091773408956e-06, "loss": 0.59861463, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.19628906, "step": 5769, "time_per_iteration": 3.504361391067505 }, { "auxiliary_loss_clip": 0.01522073, "auxiliary_loss_mlp": 0.01044683, "balance_loss_clip": 1.32983327, "balance_loss_mlp": 1.01989913, "epoch": 0.3469111678941831, "flos": 12125789429760.0, "grad_norm": 3.1377386917143704, "language_loss": 0.88190985, "learning_rate": 3.0357586266163154e-06, "loss": 0.9075774, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.2479248, "step": 5770, "time_per_iteration": 2.9669764041900635 }, { "auxiliary_loss_clip": 0.01267378, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.15830183, "balance_loss_mlp": 1.0112313, "epoch": 0.34697129114685105, "flos": 65961814711680.0, "grad_norm": 0.7661166109854397, "language_loss": 0.59807628, "learning_rate": 3.0354254405476036e-06, "loss": 0.62111986, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.2578125, "step": 5771, "time_per_iteration": 3.1558072566986084 }, { "auxiliary_loss_clip": 0.0149202, "auxiliary_loss_mlp": 0.01048643, "balance_loss_clip": 1.3084197, "balance_loss_mlp": 1.02440763, "epoch": 0.347031414399519, "flos": 34466167653120.0, "grad_norm": 1.901584474939687, "language_loss": 0.7296192, "learning_rate": 3.0350922152154557e-06, "loss": 0.7550258, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.2421875, "step": 5772, "time_per_iteration": 2.9714596271514893 }, { "auxiliary_loss_clip": 0.01480477, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.29645634, "balance_loss_mlp": 1.01719737, "epoch": 0.347091537652187, "flos": 26955706199040.0, "grad_norm": 1.5316851163332588, "language_loss": 0.77748036, "learning_rate": 3.034758950632507e-06, "loss": 0.80271399, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.25720215, "step": 5773, "time_per_iteration": 2.9201712608337402 }, { "auxiliary_loss_clip": 0.0148869, "auxiliary_loss_mlp": 0.01046321, "balance_loss_clip": 1.30181086, "balance_loss_mlp": 1.02134633, "epoch": 0.34715166090485494, "flos": 21152119288320.0, "grad_norm": 2.558925385116559, "language_loss": 0.71099335, "learning_rate": 3.034425646811396e-06, "loss": 0.73634338, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.24963379, "step": 5774, "time_per_iteration": 2.860607147216797 }, { "auxiliary_loss_clip": 0.01480174, "auxiliary_loss_mlp": 0.01049925, "balance_loss_clip": 1.29843497, "balance_loss_mlp": 1.02423525, "epoch": 0.3472117841575229, "flos": 23488676720640.0, "grad_norm": 1.6686673785489854, "language_loss": 0.77330315, "learning_rate": 3.0340923037647602e-06, "loss": 0.79860413, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.25683594, "step": 5775, "time_per_iteration": 2.8855786323547363 }, { "auxiliary_loss_clip": 0.01495811, "auxiliary_loss_mlp": 0.01045644, "balance_loss_clip": 1.3060503, "balance_loss_mlp": 1.02069342, "epoch": 0.34727190741019087, "flos": 17501531080320.0, "grad_norm": 2.039570194456537, "language_loss": 0.80256343, "learning_rate": 3.0337589215052404e-06, "loss": 0.82797796, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.24963379, "step": 5776, "time_per_iteration": 2.824976682662964 }, { "auxiliary_loss_clip": 0.01268954, "auxiliary_loss_mlp": 0.01048512, "balance_loss_clip": 1.15978956, "balance_loss_mlp": 1.01780379, "epoch": 0.34733203066285884, "flos": 65299410552960.0, "grad_norm": 0.8452380062567858, "language_loss": 0.63427079, "learning_rate": 3.033425500045478e-06, "loss": 0.65744543, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.30664062, "step": 5777, "time_per_iteration": 3.3874659538269043 }, { "auxiliary_loss_clip": 0.01496779, "auxiliary_loss_mlp": 0.01046212, "balance_loss_clip": 1.30945683, "balance_loss_mlp": 1.02115452, "epoch": 0.3473921539155268, "flos": 28670679561600.0, "grad_norm": 1.7539914791184454, "language_loss": 0.65548873, "learning_rate": 3.033092039398119e-06, "loss": 0.68091869, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.25048828, "step": 5778, "time_per_iteration": 2.90110182762146 }, { "auxiliary_loss_clip": 0.01490535, "auxiliary_loss_mlp": 0.01046025, "balance_loss_clip": 1.3016398, "balance_loss_mlp": 1.02143192, "epoch": 0.3474522771681948, "flos": 40849912379520.0, "grad_norm": 1.7736775741039934, "language_loss": 0.73450303, "learning_rate": 3.0327585395758046e-06, "loss": 0.75986862, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.24572754, "step": 5779, "time_per_iteration": 4.484851837158203 }, { "auxiliary_loss_clip": 0.01500397, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.31059682, "balance_loss_mlp": 1.02209198, "epoch": 0.3475124004208628, "flos": 24619420235520.0, "grad_norm": 32.956691510034204, "language_loss": 0.63438809, "learning_rate": 3.0324250005911837e-06, "loss": 0.65986621, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.2532959, "step": 5780, "time_per_iteration": 2.9484384059906006 }, { "auxiliary_loss_clip": 0.0149508, "auxiliary_loss_mlp": 0.01040457, "balance_loss_clip": 1.30964625, "balance_loss_mlp": 1.01660347, "epoch": 0.34757252367353075, "flos": 22721657304960.0, "grad_norm": 1.6981513809640765, "language_loss": 0.72209764, "learning_rate": 3.0320914224569033e-06, "loss": 0.74745303, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.23876953, "step": 5781, "time_per_iteration": 2.9209048748016357 }, { "auxiliary_loss_clip": 0.01483963, "auxiliary_loss_mlp": 0.01039828, "balance_loss_clip": 1.29662383, "balance_loss_mlp": 1.0136373, "epoch": 0.3476326469261987, "flos": 19837590819840.0, "grad_norm": 1.9442807314373183, "language_loss": 0.78121793, "learning_rate": 3.031757805185612e-06, "loss": 0.80645579, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.2623291, "step": 5782, "time_per_iteration": 2.8724257946014404 }, { "auxiliary_loss_clip": 0.0146792, "auxiliary_loss_mlp": 0.0104068, "balance_loss_clip": 1.28341043, "balance_loss_mlp": 1.01549149, "epoch": 0.3476927701788667, "flos": 19947662714880.0, "grad_norm": 2.377371544962961, "language_loss": 0.6413368, "learning_rate": 3.0314241487899622e-06, "loss": 0.66642278, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.2520752, "step": 5783, "time_per_iteration": 2.863662004470825 }, { "auxiliary_loss_clip": 0.01464458, "auxiliary_loss_mlp": 0.01039716, "balance_loss_clip": 1.28381038, "balance_loss_mlp": 1.01596951, "epoch": 0.34775289343153465, "flos": 20743394595840.0, "grad_norm": 1.9806170003858385, "language_loss": 0.88741064, "learning_rate": 3.031090453282605e-06, "loss": 0.9124524, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.23742676, "step": 5784, "time_per_iteration": 4.320150852203369 }, { "auxiliary_loss_clip": 0.01471972, "auxiliary_loss_mlp": 0.01037895, "balance_loss_clip": 1.28986144, "balance_loss_mlp": 1.01274228, "epoch": 0.3478130166842026, "flos": 19364383008000.0, "grad_norm": 1.5676845720706745, "language_loss": 0.82079482, "learning_rate": 3.0307567186761946e-06, "loss": 0.8458935, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.25134277, "step": 5785, "time_per_iteration": 4.486806392669678 }, { "auxiliary_loss_clip": 0.014731, "auxiliary_loss_mlp": 0.0104366, "balance_loss_clip": 1.28898692, "balance_loss_mlp": 1.01903188, "epoch": 0.3478731399368706, "flos": 22060502017920.0, "grad_norm": 1.9797706371229755, "language_loss": 0.81560671, "learning_rate": 3.0304229449833862e-06, "loss": 0.8407743, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.24645996, "step": 5786, "time_per_iteration": 2.8967173099517822 }, { "auxiliary_loss_clip": 0.01464623, "auxiliary_loss_mlp": 0.01041694, "balance_loss_clip": 1.28421998, "balance_loss_mlp": 1.01568222, "epoch": 0.34793326318953854, "flos": 18050668945920.0, "grad_norm": 1.5498976683645185, "language_loss": 0.75718582, "learning_rate": 3.030089132216836e-06, "loss": 0.78224897, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.2598877, "step": 5787, "time_per_iteration": 2.828270196914673 }, { "auxiliary_loss_clip": 0.01478903, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.29383111, "balance_loss_mlp": 1.01323485, "epoch": 0.3479933864422065, "flos": 29326586451840.0, "grad_norm": 1.694649206176931, "language_loss": 0.82095355, "learning_rate": 3.029755280389203e-06, "loss": 0.84611952, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.24438477, "step": 5788, "time_per_iteration": 4.362186908721924 }, { "auxiliary_loss_clip": 0.01514335, "auxiliary_loss_mlp": 0.01044075, "balance_loss_clip": 1.32314348, "balance_loss_mlp": 1.0197562, "epoch": 0.3480535096948745, "flos": 20130361793280.0, "grad_norm": 1.8760531155038436, "language_loss": 0.8639853, "learning_rate": 3.029421389513147e-06, "loss": 0.8895694, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.24316406, "step": 5789, "time_per_iteration": 2.8584489822387695 }, { "auxiliary_loss_clip": 0.01493981, "auxiliary_loss_mlp": 0.0104168, "balance_loss_clip": 1.30630028, "balance_loss_mlp": 1.017802, "epoch": 0.34811363294754244, "flos": 18557475661440.0, "grad_norm": 1.6973234789267944, "language_loss": 0.85362184, "learning_rate": 3.029087459601328e-06, "loss": 0.87897849, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.23864746, "step": 5790, "time_per_iteration": 2.85018253326416 }, { "auxiliary_loss_clip": 0.01487966, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.30260825, "balance_loss_mlp": 1.01547849, "epoch": 0.3481737562002104, "flos": 26881631182080.0, "grad_norm": 1.9785380606081096, "language_loss": 0.82511878, "learning_rate": 3.0287534906664097e-06, "loss": 0.8504082, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.25512695, "step": 5791, "time_per_iteration": 2.8835055828094482 }, { "auxiliary_loss_clip": 0.01500249, "auxiliary_loss_mlp": 0.01041456, "balance_loss_clip": 1.30972266, "balance_loss_mlp": 1.01695848, "epoch": 0.3482338794528784, "flos": 28919264348160.0, "grad_norm": 1.6887311194472383, "language_loss": 0.78363359, "learning_rate": 3.028419482721056e-06, "loss": 0.80905068, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.24499512, "step": 5792, "time_per_iteration": 2.9002573490142822 }, { "auxiliary_loss_clip": 0.01493428, "auxiliary_loss_mlp": 0.01035852, "balance_loss_clip": 1.30720329, "balance_loss_mlp": 1.01246309, "epoch": 0.3482940027055464, "flos": 22211050026240.0, "grad_norm": 1.5549442868245533, "language_loss": 0.82656598, "learning_rate": 3.0280854357779325e-06, "loss": 0.85185885, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.23400879, "step": 5793, "time_per_iteration": 2.848363161087036 }, { "auxiliary_loss_clip": 0.01493271, "auxiliary_loss_mlp": 0.01051171, "balance_loss_clip": 1.3034327, "balance_loss_mlp": 1.02633929, "epoch": 0.34835412595821436, "flos": 20312291710080.0, "grad_norm": 1.7874065512553536, "language_loss": 0.77206039, "learning_rate": 3.027751349849706e-06, "loss": 0.79750484, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.24829102, "step": 5794, "time_per_iteration": 2.9665229320526123 }, { "auxiliary_loss_clip": 0.01479203, "auxiliary_loss_mlp": 0.0104186, "balance_loss_clip": 1.2937237, "balance_loss_mlp": 1.01779163, "epoch": 0.3484142492108823, "flos": 20459582092800.0, "grad_norm": 1.8032035988739517, "language_loss": 0.58212912, "learning_rate": 3.0274172249490456e-06, "loss": 0.60733974, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24084473, "step": 5795, "time_per_iteration": 2.8366358280181885 }, { "auxiliary_loss_clip": 0.01486186, "auxiliary_loss_mlp": 0.01044407, "balance_loss_clip": 1.30176485, "balance_loss_mlp": 1.02207971, "epoch": 0.3484743724635503, "flos": 24363641525760.0, "grad_norm": 1.9301709124779443, "language_loss": 0.83530235, "learning_rate": 3.0270830610886213e-06, "loss": 0.86060822, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22314453, "step": 5796, "time_per_iteration": 2.9322516918182373 }, { "auxiliary_loss_clip": 0.0147506, "auxiliary_loss_mlp": 0.0104234, "balance_loss_clip": 1.29412782, "balance_loss_mlp": 1.01828325, "epoch": 0.34853449571621825, "flos": 24363732015360.0, "grad_norm": 1.596500129474699, "language_loss": 0.84293962, "learning_rate": 3.0267488582811033e-06, "loss": 0.86811364, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.24060059, "step": 5797, "time_per_iteration": 2.8796439170837402 }, { "auxiliary_loss_clip": 0.0147896, "auxiliary_loss_mlp": 0.01043451, "balance_loss_clip": 1.29545295, "balance_loss_mlp": 1.01937103, "epoch": 0.3485946189688862, "flos": 27278094533760.0, "grad_norm": 1.7835352488005007, "language_loss": 0.74103093, "learning_rate": 3.026414616539167e-06, "loss": 0.76625502, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.24072266, "step": 5798, "time_per_iteration": 2.9132137298583984 }, { "auxiliary_loss_clip": 0.0149655, "auxiliary_loss_mlp": 0.01044543, "balance_loss_clip": 1.30791879, "balance_loss_mlp": 1.02093935, "epoch": 0.3486547422215542, "flos": 20166222936960.0, "grad_norm": 1.9918282048014941, "language_loss": 0.76994634, "learning_rate": 3.026080335875485e-06, "loss": 0.79535723, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.23583984, "step": 5799, "time_per_iteration": 2.8885583877563477 }, { "auxiliary_loss_clip": 0.01496371, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.30955791, "balance_loss_mlp": 1.01713884, "epoch": 0.34871486547422215, "flos": 20240071729920.0, "grad_norm": 1.63881968724741, "language_loss": 0.76706201, "learning_rate": 3.025746016302734e-06, "loss": 0.79243362, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.2364502, "step": 5800, "time_per_iteration": 2.8590195178985596 }, { "auxiliary_loss_clip": 0.01498623, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.30994153, "balance_loss_mlp": 1.01536691, "epoch": 0.3487749887268901, "flos": 44068538050560.0, "grad_norm": 1.7612909833439905, "language_loss": 0.6818192, "learning_rate": 3.025411657833591e-06, "loss": 0.70720923, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.25036621, "step": 5801, "time_per_iteration": 3.0323290824890137 }, { "auxiliary_loss_clip": 0.01482374, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.29918075, "balance_loss_mlp": 1.01881242, "epoch": 0.3488351119795581, "flos": 23305660928640.0, "grad_norm": 1.9521044323633399, "language_loss": 0.77486813, "learning_rate": 3.025077260480735e-06, "loss": 0.80011666, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.23657227, "step": 5802, "time_per_iteration": 2.830148458480835 }, { "auxiliary_loss_clip": 0.01469966, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.29152417, "balance_loss_mlp": 1.0206356, "epoch": 0.34889523523222604, "flos": 19943545438080.0, "grad_norm": 2.3019190832190857, "language_loss": 0.79316026, "learning_rate": 3.0247428242568474e-06, "loss": 0.81830484, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.23852539, "step": 5803, "time_per_iteration": 2.8416848182678223 }, { "auxiliary_loss_clip": 0.01500886, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.31144166, "balance_loss_mlp": 1.01785135, "epoch": 0.348955358484894, "flos": 30458506331520.0, "grad_norm": 3.6035080563630553, "language_loss": 0.6936658, "learning_rate": 3.0244083491746085e-06, "loss": 0.71908879, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.2355957, "step": 5804, "time_per_iteration": 2.9415807723999023 }, { "auxiliary_loss_clip": 0.01473141, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.29288268, "balance_loss_mlp": 1.0233444, "epoch": 0.349015481737562, "flos": 18007885347840.0, "grad_norm": 1.7682332235216662, "language_loss": 0.7746222, "learning_rate": 3.024073835246702e-06, "loss": 0.79982555, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.23852539, "step": 5805, "time_per_iteration": 2.7975432872772217 }, { "auxiliary_loss_clip": 0.01494502, "auxiliary_loss_mlp": 0.01046769, "balance_loss_clip": 1.30901206, "balance_loss_mlp": 1.02198565, "epoch": 0.34907560499023, "flos": 27209856096000.0, "grad_norm": 2.0578394169969014, "language_loss": 0.68848068, "learning_rate": 3.023739282485814e-06, "loss": 0.71389341, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.24780273, "step": 5806, "time_per_iteration": 2.9179153442382812 }, { "auxiliary_loss_clip": 0.01488631, "auxiliary_loss_mlp": 0.0104295, "balance_loss_clip": 1.30316353, "balance_loss_mlp": 1.01845288, "epoch": 0.34913572824289796, "flos": 30239041213440.0, "grad_norm": 1.6596084849070676, "language_loss": 0.72746509, "learning_rate": 3.023404690904629e-06, "loss": 0.75278091, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.24511719, "step": 5807, "time_per_iteration": 2.912523031234741 }, { "auxiliary_loss_clip": 0.01501593, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.31259882, "balance_loss_mlp": 1.01703084, "epoch": 0.3491958514955659, "flos": 29984122154880.0, "grad_norm": 2.051775504452474, "language_loss": 0.75620878, "learning_rate": 3.0230700605158364e-06, "loss": 0.78164065, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.24560547, "step": 5808, "time_per_iteration": 2.985081434249878 }, { "auxiliary_loss_clip": 0.01474004, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.29572821, "balance_loss_mlp": 1.02067018, "epoch": 0.3492559747482339, "flos": 22793379592320.0, "grad_norm": 1.715538195900244, "language_loss": 0.84631151, "learning_rate": 3.0227353913321238e-06, "loss": 0.87148166, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22338867, "step": 5809, "time_per_iteration": 2.8417210578918457 }, { "auxiliary_loss_clip": 0.01464614, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.28584957, "balance_loss_mlp": 1.01509571, "epoch": 0.34931609800090185, "flos": 26079248315520.0, "grad_norm": 2.895152867168358, "language_loss": 0.82106841, "learning_rate": 3.0224006833661835e-06, "loss": 0.84610212, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.2364502, "step": 5810, "time_per_iteration": 2.8990931510925293 }, { "auxiliary_loss_clip": 0.01479292, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.29439056, "balance_loss_mlp": 1.01613069, "epoch": 0.3493762212535698, "flos": 29254004513280.0, "grad_norm": 3.343551757252438, "language_loss": 0.76501614, "learning_rate": 3.0220659366307057e-06, "loss": 0.79021478, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.24462891, "step": 5811, "time_per_iteration": 2.9344804286956787 }, { "auxiliary_loss_clip": 0.01480274, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.29622912, "balance_loss_mlp": 1.01631355, "epoch": 0.3494363445062378, "flos": 27137545626240.0, "grad_norm": 1.6274871532126767, "language_loss": 0.80928564, "learning_rate": 3.021731151138386e-06, "loss": 0.83451056, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.25927734, "step": 5812, "time_per_iteration": 2.873048782348633 }, { "auxiliary_loss_clip": 0.01483859, "auxiliary_loss_mlp": 0.0104091, "balance_loss_clip": 1.29849386, "balance_loss_mlp": 1.01656711, "epoch": 0.34949646775890575, "flos": 12283802830080.0, "grad_norm": 2.0223834592284087, "language_loss": 0.70383334, "learning_rate": 3.021396326901918e-06, "loss": 0.72908103, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.24328613, "step": 5813, "time_per_iteration": 2.833826780319214 }, { "auxiliary_loss_clip": 0.01460506, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.27924597, "balance_loss_mlp": 1.01482606, "epoch": 0.3495565910115737, "flos": 17174437286400.0, "grad_norm": 5.260107837109985, "language_loss": 0.77650338, "learning_rate": 3.0210614639339998e-06, "loss": 0.80150229, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.2454834, "step": 5814, "time_per_iteration": 4.304182291030884 }, { "auxiliary_loss_clip": 0.01483242, "auxiliary_loss_mlp": 0.0104634, "balance_loss_clip": 1.29707599, "balance_loss_mlp": 1.01997113, "epoch": 0.3496167142642417, "flos": 26476028380800.0, "grad_norm": 1.4970898519533578, "language_loss": 0.85383606, "learning_rate": 3.020726562247328e-06, "loss": 0.87913179, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.26342773, "step": 5815, "time_per_iteration": 2.866560220718384 }, { "auxiliary_loss_clip": 0.01482687, "auxiliary_loss_mlp": 0.01043538, "balance_loss_clip": 1.29687619, "balance_loss_mlp": 1.01907635, "epoch": 0.34967683751690964, "flos": 17422252911360.0, "grad_norm": 1.9954745982641804, "language_loss": 0.77968359, "learning_rate": 3.0203916218546024e-06, "loss": 0.80494583, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.24450684, "step": 5816, "time_per_iteration": 2.821442127227783 }, { "auxiliary_loss_clip": 0.0149818, "auxiliary_loss_mlp": 0.01046592, "balance_loss_clip": 1.30964196, "balance_loss_mlp": 1.02195108, "epoch": 0.3497369607695776, "flos": 22609866107520.0, "grad_norm": 2.436867859210501, "language_loss": 0.60047019, "learning_rate": 3.0200566427685246e-06, "loss": 0.62591791, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.24633789, "step": 5817, "time_per_iteration": 2.8496599197387695 }, { "auxiliary_loss_clip": 0.01258697, "auxiliary_loss_mlp": 0.01053349, "balance_loss_clip": 1.15017128, "balance_loss_mlp": 1.02855384, "epoch": 0.34979708402224563, "flos": 68558992047360.0, "grad_norm": 0.8881255823635612, "language_loss": 0.59858012, "learning_rate": 3.0197216250017975e-06, "loss": 0.62170064, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 0.24707031, "step": 5818, "time_per_iteration": 3.3916971683502197 }, { "auxiliary_loss_clip": 0.01476813, "auxiliary_loss_mlp": 0.01043411, "balance_loss_clip": 1.29482925, "balance_loss_mlp": 1.01648188, "epoch": 0.3498572072749136, "flos": 18998984851200.0, "grad_norm": 1.7159791302729104, "language_loss": 0.84057033, "learning_rate": 3.019386568567123e-06, "loss": 0.8657726, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.26928711, "step": 5819, "time_per_iteration": 4.2695276737213135 }, { "auxiliary_loss_clip": 0.01483072, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.29838967, "balance_loss_mlp": 1.0163089, "epoch": 0.34991733052758156, "flos": 27830444780160.0, "grad_norm": 4.371674132950581, "language_loss": 0.71639347, "learning_rate": 3.0190514734772083e-06, "loss": 0.74163806, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.25097656, "step": 5820, "time_per_iteration": 4.367142200469971 }, { "auxiliary_loss_clip": 0.01483584, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.2975173, "balance_loss_mlp": 1.01304078, "epoch": 0.3499774537802495, "flos": 33597401385600.0, "grad_norm": 1.5898853106826576, "language_loss": 0.70515692, "learning_rate": 3.018716339744759e-06, "loss": 0.7303654, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.24230957, "step": 5821, "time_per_iteration": 2.9403603076934814 }, { "auxiliary_loss_clip": 0.01487364, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.298931, "balance_loss_mlp": 1.01545393, "epoch": 0.3500375770329175, "flos": 23487093152640.0, "grad_norm": 2.0849954271644107, "language_loss": 0.74694699, "learning_rate": 3.0183811673824842e-06, "loss": 0.77223295, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.25793457, "step": 5822, "time_per_iteration": 2.8512659072875977 }, { "auxiliary_loss_clip": 0.01480565, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.29296947, "balance_loss_mlp": 1.01939428, "epoch": 0.35009770028558546, "flos": 19035388932480.0, "grad_norm": 1.670891566374546, "language_loss": 0.78921872, "learning_rate": 3.018045956403094e-06, "loss": 0.81447732, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.25915527, "step": 5823, "time_per_iteration": 4.203794717788696 }, { "auxiliary_loss_clip": 0.01265647, "auxiliary_loss_mlp": 0.01046057, "balance_loss_clip": 1.15590394, "balance_loss_mlp": 1.01420426, "epoch": 0.3501578235382534, "flos": 68382717730560.0, "grad_norm": 0.7157619799083322, "language_loss": 0.59281552, "learning_rate": 3.017710706819298e-06, "loss": 0.61593252, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.31835938, "step": 5824, "time_per_iteration": 3.379009485244751 }, { "auxiliary_loss_clip": 0.01474157, "auxiliary_loss_mlp": 0.01047685, "balance_loss_clip": 1.28828335, "balance_loss_mlp": 1.02019548, "epoch": 0.3502179467909214, "flos": 21260607615360.0, "grad_norm": 1.8142939836322582, "language_loss": 0.85409707, "learning_rate": 3.017375418643811e-06, "loss": 0.87931556, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.27514648, "step": 5825, "time_per_iteration": 2.9218318462371826 }, { "auxiliary_loss_clip": 0.01482748, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.29733169, "balance_loss_mlp": 1.01993871, "epoch": 0.35027807004358935, "flos": 11949198399360.0, "grad_norm": 3.3022007426212667, "language_loss": 0.84061795, "learning_rate": 3.0170400918893464e-06, "loss": 0.8659147, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.27001953, "step": 5826, "time_per_iteration": 2.9781434535980225 }, { "auxiliary_loss_clip": 0.01489121, "auxiliary_loss_mlp": 0.01045607, "balance_loss_clip": 1.30203605, "balance_loss_mlp": 1.0175209, "epoch": 0.3503381932962573, "flos": 21480977629440.0, "grad_norm": 2.0039370448866283, "language_loss": 0.81436312, "learning_rate": 3.0167047265686186e-06, "loss": 0.83971041, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.28063965, "step": 5827, "time_per_iteration": 2.879667043685913 }, { "auxiliary_loss_clip": 0.01481975, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.29873323, "balance_loss_mlp": 1.02096033, "epoch": 0.3503983165489253, "flos": 21260879084160.0, "grad_norm": 1.9744332845597816, "language_loss": 0.71906567, "learning_rate": 3.0163693226943467e-06, "loss": 0.74435842, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.26318359, "step": 5828, "time_per_iteration": 2.867096424102783 }, { "auxiliary_loss_clip": 0.01485485, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.29890347, "balance_loss_mlp": 1.02135491, "epoch": 0.35045843980159325, "flos": 27826463237760.0, "grad_norm": 1.8491423749502007, "language_loss": 0.80347228, "learning_rate": 3.016033880279248e-06, "loss": 0.82883507, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.29467773, "step": 5829, "time_per_iteration": 2.9218502044677734 }, { "auxiliary_loss_clip": 0.01491771, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.30132842, "balance_loss_mlp": 1.01847351, "epoch": 0.3505185630542612, "flos": 25932093667200.0, "grad_norm": 2.0167690593499983, "language_loss": 0.72743106, "learning_rate": 3.0156983993360417e-06, "loss": 0.7528196, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.28637695, "step": 5830, "time_per_iteration": 2.939810276031494 }, { "auxiliary_loss_clip": 0.01474814, "auxiliary_loss_mlp": 0.01048712, "balance_loss_clip": 1.2900331, "balance_loss_mlp": 1.02048349, "epoch": 0.35057868630692923, "flos": 20531440114560.0, "grad_norm": 3.050356908586686, "language_loss": 0.89690924, "learning_rate": 3.0153628798774513e-06, "loss": 0.92214447, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.2824707, "step": 5831, "time_per_iteration": 2.828636646270752 }, { "auxiliary_loss_clip": 0.01486628, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.30059195, "balance_loss_mlp": 1.01891232, "epoch": 0.3506388095595972, "flos": 20458269993600.0, "grad_norm": 4.087055671865954, "language_loss": 0.79430199, "learning_rate": 3.0150273219161985e-06, "loss": 0.81961101, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.25378418, "step": 5832, "time_per_iteration": 2.856583833694458 }, { "auxiliary_loss_clip": 0.01494621, "auxiliary_loss_mlp": 0.01045174, "balance_loss_clip": 1.30464816, "balance_loss_mlp": 1.01885295, "epoch": 0.35069893281226516, "flos": 23119342266240.0, "grad_norm": 3.0278512129681956, "language_loss": 0.71996754, "learning_rate": 3.014691725465008e-06, "loss": 0.7453655, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.26342773, "step": 5833, "time_per_iteration": 2.853951930999756 }, { "auxiliary_loss_clip": 0.01466049, "auxiliary_loss_mlp": 0.01041642, "balance_loss_clip": 1.28521836, "balance_loss_mlp": 1.01608336, "epoch": 0.35075905606493313, "flos": 27283523909760.0, "grad_norm": 1.4390089222419462, "language_loss": 0.81538808, "learning_rate": 3.014356090536606e-06, "loss": 0.84046507, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.25561523, "step": 5834, "time_per_iteration": 2.890451431274414 }, { "auxiliary_loss_clip": 0.0147622, "auxiliary_loss_mlp": 0.01045197, "balance_loss_clip": 1.28947341, "balance_loss_mlp": 1.01736212, "epoch": 0.3508191793176011, "flos": 19136185643520.0, "grad_norm": 2.946875217344705, "language_loss": 0.84043133, "learning_rate": 3.0140204171437183e-06, "loss": 0.86564553, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.27832031, "step": 5835, "time_per_iteration": 2.836390733718872 }, { "auxiliary_loss_clip": 0.01474219, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.28975415, "balance_loss_mlp": 1.01788032, "epoch": 0.35087930257026906, "flos": 25568776771200.0, "grad_norm": 2.5390373631838283, "language_loss": 0.77832675, "learning_rate": 3.0136847052990754e-06, "loss": 0.80350661, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.2590332, "step": 5836, "time_per_iteration": 2.87524676322937 }, { "auxiliary_loss_clip": 0.01470562, "auxiliary_loss_mlp": 0.0103751, "balance_loss_clip": 1.28864169, "balance_loss_mlp": 1.01112843, "epoch": 0.350939425822937, "flos": 18013269479040.0, "grad_norm": 2.1148938738832506, "language_loss": 0.78671002, "learning_rate": 3.0133489550154074e-06, "loss": 0.81179076, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.26403809, "step": 5837, "time_per_iteration": 2.8404996395111084 }, { "auxiliary_loss_clip": 0.01472947, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.28831744, "balance_loss_mlp": 1.01704192, "epoch": 0.350999549075605, "flos": 22283360496000.0, "grad_norm": 2.0071055511052713, "language_loss": 0.68766999, "learning_rate": 3.0130131663054442e-06, "loss": 0.71283281, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.26306152, "step": 5838, "time_per_iteration": 2.8751027584075928 }, { "auxiliary_loss_clip": 0.01470116, "auxiliary_loss_mlp": 0.01044397, "balance_loss_clip": 1.28693306, "balance_loss_mlp": 1.01728916, "epoch": 0.35105967232827295, "flos": 14400397451520.0, "grad_norm": 2.519553506472713, "language_loss": 0.84130007, "learning_rate": 3.0126773391819215e-06, "loss": 0.86644518, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.27111816, "step": 5839, "time_per_iteration": 2.918013334274292 }, { "auxiliary_loss_clip": 0.01483034, "auxiliary_loss_mlp": 0.0104553, "balance_loss_clip": 1.29301751, "balance_loss_mlp": 1.01925635, "epoch": 0.3511197955809409, "flos": 25093170984960.0, "grad_norm": 1.6951662623978128, "language_loss": 0.6016863, "learning_rate": 3.012341473657572e-06, "loss": 0.62697196, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.26257324, "step": 5840, "time_per_iteration": 2.962437868118286 }, { "auxiliary_loss_clip": 0.01485443, "auxiliary_loss_mlp": 0.01044976, "balance_loss_clip": 1.29742503, "balance_loss_mlp": 1.02059734, "epoch": 0.3511799188336089, "flos": 25894965669120.0, "grad_norm": 2.164954471130695, "language_loss": 0.89077485, "learning_rate": 3.0120055697451322e-06, "loss": 0.91607904, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.24401855, "step": 5841, "time_per_iteration": 2.928478240966797 }, { "auxiliary_loss_clip": 0.01501013, "auxiliary_loss_mlp": 0.01048324, "balance_loss_clip": 1.30873382, "balance_loss_mlp": 1.02042866, "epoch": 0.35124004208627685, "flos": 20093233795200.0, "grad_norm": 2.289754685816453, "language_loss": 0.76500303, "learning_rate": 3.0116696274573406e-06, "loss": 0.79049635, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.27905273, "step": 5842, "time_per_iteration": 2.9654524326324463 }, { "auxiliary_loss_clip": 0.01488667, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.3003006, "balance_loss_mlp": 1.0152657, "epoch": 0.3513001653389448, "flos": 17791904079360.0, "grad_norm": 3.014944404441442, "language_loss": 0.70112145, "learning_rate": 3.0113336468069346e-06, "loss": 0.72642004, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.25927734, "step": 5843, "time_per_iteration": 2.831038475036621 }, { "auxiliary_loss_clip": 0.01488446, "auxiliary_loss_mlp": 0.01045825, "balance_loss_clip": 1.30245638, "balance_loss_mlp": 1.01961112, "epoch": 0.3513602885916128, "flos": 29398942166400.0, "grad_norm": 2.0044682392108526, "language_loss": 0.66020375, "learning_rate": 3.010997627806655e-06, "loss": 0.68554646, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.26220703, "step": 5844, "time_per_iteration": 2.8809878826141357 }, { "auxiliary_loss_clip": 0.014896, "auxiliary_loss_mlp": 0.01055587, "balance_loss_clip": 1.30238056, "balance_loss_mlp": 1.03021884, "epoch": 0.3514204118442808, "flos": 16188405200640.0, "grad_norm": 4.163975009452507, "language_loss": 0.76056373, "learning_rate": 3.010661570469245e-06, "loss": 0.78601551, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.25354004, "step": 5845, "time_per_iteration": 2.8514504432678223 }, { "auxiliary_loss_clip": 0.01470979, "auxiliary_loss_mlp": 0.0105628, "balance_loss_clip": 1.28844428, "balance_loss_mlp": 1.03073359, "epoch": 0.35148053509694877, "flos": 23843306615040.0, "grad_norm": 3.1248103295757184, "language_loss": 0.73524076, "learning_rate": 3.0103254748074465e-06, "loss": 0.76051337, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.25549316, "step": 5846, "time_per_iteration": 2.817150115966797 }, { "auxiliary_loss_clip": 0.01491046, "auxiliary_loss_mlp": 0.0105785, "balance_loss_clip": 1.30248725, "balance_loss_mlp": 1.03237522, "epoch": 0.35154065834961673, "flos": 20999851977600.0, "grad_norm": 1.9776035468846587, "language_loss": 0.76152563, "learning_rate": 3.0099893408340046e-06, "loss": 0.7870146, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.25500488, "step": 5847, "time_per_iteration": 2.8636062145233154 }, { "auxiliary_loss_clip": 0.01493066, "auxiliary_loss_mlp": 0.010554, "balance_loss_clip": 1.30288398, "balance_loss_mlp": 1.03022325, "epoch": 0.3516007816022847, "flos": 33268497799680.0, "grad_norm": 2.0901566579614648, "language_loss": 0.733239, "learning_rate": 3.009653168561666e-06, "loss": 0.75872368, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.25195312, "step": 5848, "time_per_iteration": 2.9602010250091553 }, { "auxiliary_loss_clip": 0.01487957, "auxiliary_loss_mlp": 0.01059564, "balance_loss_clip": 1.29860187, "balance_loss_mlp": 1.03180003, "epoch": 0.35166090485495266, "flos": 11733895802880.0, "grad_norm": 2.26898933247515, "language_loss": 0.91494161, "learning_rate": 3.009316958003178e-06, "loss": 0.94041681, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.27746582, "step": 5849, "time_per_iteration": 4.188556671142578 }, { "auxiliary_loss_clip": 0.01476453, "auxiliary_loss_mlp": 0.01061684, "balance_loss_clip": 1.29036713, "balance_loss_mlp": 1.03668582, "epoch": 0.3517210281076206, "flos": 22648803897600.0, "grad_norm": 1.9500409910933438, "language_loss": 0.75355315, "learning_rate": 3.0089807091712897e-06, "loss": 0.77893448, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.25012207, "step": 5850, "time_per_iteration": 2.8783211708068848 }, { "auxiliary_loss_clip": 0.01474134, "auxiliary_loss_mlp": 0.01055576, "balance_loss_clip": 1.28929281, "balance_loss_mlp": 1.0298028, "epoch": 0.3517811513602886, "flos": 21332194168320.0, "grad_norm": 6.339216106173975, "language_loss": 0.77091318, "learning_rate": 3.0086444220787515e-06, "loss": 0.79621029, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.25793457, "step": 5851, "time_per_iteration": 2.8392467498779297 }, { "auxiliary_loss_clip": 0.01491996, "auxiliary_loss_mlp": 0.01055511, "balance_loss_clip": 1.30592203, "balance_loss_mlp": 1.02904677, "epoch": 0.35184127461295656, "flos": 21042771310080.0, "grad_norm": 1.7769904412893267, "language_loss": 0.87896705, "learning_rate": 3.0083080967383165e-06, "loss": 0.90444207, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.26452637, "step": 5852, "time_per_iteration": 2.835381269454956 }, { "auxiliary_loss_clip": 0.01481575, "auxiliary_loss_mlp": 0.01050353, "balance_loss_clip": 1.29583383, "balance_loss_mlp": 1.02450824, "epoch": 0.3519013978656245, "flos": 22465380902400.0, "grad_norm": 2.5184283918732833, "language_loss": 0.6821894, "learning_rate": 3.007971733162737e-06, "loss": 0.70750868, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.25854492, "step": 5853, "time_per_iteration": 2.8290586471557617 }, { "auxiliary_loss_clip": 0.01488291, "auxiliary_loss_mlp": 0.01052472, "balance_loss_clip": 1.30046427, "balance_loss_mlp": 1.02591181, "epoch": 0.3519615211182925, "flos": 13123087470720.0, "grad_norm": 1.8600908742861302, "language_loss": 0.82190526, "learning_rate": 3.0076353313647686e-06, "loss": 0.84731293, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.265625, "step": 5854, "time_per_iteration": 4.218945741653442 }, { "auxiliary_loss_clip": 0.01468745, "auxiliary_loss_mlp": 0.01050159, "balance_loss_clip": 1.28625667, "balance_loss_mlp": 1.02542305, "epoch": 0.35202164437096045, "flos": 19144420197120.0, "grad_norm": 1.4757812721698564, "language_loss": 0.74183887, "learning_rate": 3.0072988913571666e-06, "loss": 0.76702791, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.24755859, "step": 5855, "time_per_iteration": 4.312297582626343 }, { "auxiliary_loss_clip": 0.01479109, "auxiliary_loss_mlp": 0.01049026, "balance_loss_clip": 1.29403973, "balance_loss_mlp": 1.02468324, "epoch": 0.3520817676236284, "flos": 26553496757760.0, "grad_norm": 2.065417367006445, "language_loss": 0.72163904, "learning_rate": 3.006962413152691e-06, "loss": 0.74692035, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.24353027, "step": 5856, "time_per_iteration": 2.8791329860687256 }, { "auxiliary_loss_clip": 0.01504131, "auxiliary_loss_mlp": 0.01051872, "balance_loss_clip": 1.31385112, "balance_loss_mlp": 1.02500224, "epoch": 0.3521418908762964, "flos": 44909270524800.0, "grad_norm": 1.7172873592265194, "language_loss": 0.62498695, "learning_rate": 3.0066258967640987e-06, "loss": 0.65054697, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.26855469, "step": 5857, "time_per_iteration": 3.0565097332000732 }, { "auxiliary_loss_clip": 0.01487765, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.30164599, "balance_loss_mlp": 1.02045405, "epoch": 0.3522020141289644, "flos": 20195433095040.0, "grad_norm": 1.700694967385964, "language_loss": 0.73884118, "learning_rate": 3.006289342204152e-06, "loss": 0.76418954, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.26647949, "step": 5858, "time_per_iteration": 4.250205755233765 }, { "auxiliary_loss_clip": 0.01489972, "auxiliary_loss_mlp": 0.01045369, "balance_loss_clip": 1.30420291, "balance_loss_mlp": 1.02044284, "epoch": 0.35226213738163237, "flos": 27575525721600.0, "grad_norm": 1.61345897219296, "language_loss": 0.77023566, "learning_rate": 3.0059527494856126e-06, "loss": 0.79558909, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.24926758, "step": 5859, "time_per_iteration": 2.9078209400177 }, { "auxiliary_loss_clip": 0.01519147, "auxiliary_loss_mlp": 0.01052499, "balance_loss_clip": 1.32589364, "balance_loss_mlp": 1.02518821, "epoch": 0.35232226063430033, "flos": 22976621608320.0, "grad_norm": 1.8097286990554327, "language_loss": 0.72204751, "learning_rate": 3.0056161186212435e-06, "loss": 0.74776393, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.27331543, "step": 5860, "time_per_iteration": 2.863739252090454 }, { "auxiliary_loss_clip": 0.01517028, "auxiliary_loss_mlp": 0.01051637, "balance_loss_clip": 1.32302904, "balance_loss_mlp": 1.02470732, "epoch": 0.3523823838869683, "flos": 19176707001600.0, "grad_norm": 2.3405483741150124, "language_loss": 0.67910755, "learning_rate": 3.005279449623811e-06, "loss": 0.70479417, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.26916504, "step": 5861, "time_per_iteration": 2.8162944316864014 }, { "auxiliary_loss_clip": 0.01492347, "auxiliary_loss_mlp": 0.01054637, "balance_loss_clip": 1.30743694, "balance_loss_mlp": 1.02897096, "epoch": 0.35244250713963626, "flos": 17939918378880.0, "grad_norm": 1.8419124172669643, "language_loss": 0.67037773, "learning_rate": 3.0049427425060815e-06, "loss": 0.69584757, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.25695801, "step": 5862, "time_per_iteration": 2.8115737438201904 }, { "auxiliary_loss_clip": 0.01498538, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.30960906, "balance_loss_mlp": 1.02854538, "epoch": 0.35250263039230423, "flos": 21442085084160.0, "grad_norm": 1.9489105762344714, "language_loss": 0.78352082, "learning_rate": 3.0046059972808215e-06, "loss": 0.80904973, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.25830078, "step": 5863, "time_per_iteration": 2.825699806213379 }, { "auxiliary_loss_clip": 0.01493111, "auxiliary_loss_mlp": 0.0104537, "balance_loss_clip": 1.30479622, "balance_loss_mlp": 1.02108693, "epoch": 0.3525627536449722, "flos": 27428144849280.0, "grad_norm": 2.26425635605047, "language_loss": 0.762312, "learning_rate": 3.0042692139608024e-06, "loss": 0.78769684, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.24304199, "step": 5864, "time_per_iteration": 2.909738540649414 }, { "auxiliary_loss_clip": 0.01490113, "auxiliary_loss_mlp": 0.01058731, "balance_loss_clip": 1.30311036, "balance_loss_mlp": 1.03416228, "epoch": 0.35262287689764016, "flos": 24800400011520.0, "grad_norm": 2.3225588354543243, "language_loss": 0.80402017, "learning_rate": 3.003932392558793e-06, "loss": 0.8295086, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.24572754, "step": 5865, "time_per_iteration": 2.9233973026275635 }, { "auxiliary_loss_clip": 0.01514893, "auxiliary_loss_mlp": 0.01045269, "balance_loss_clip": 1.32198393, "balance_loss_mlp": 1.01950812, "epoch": 0.3526830001503081, "flos": 17830525155840.0, "grad_norm": 2.0839971332904743, "language_loss": 0.81974858, "learning_rate": 3.0035955330875677e-06, "loss": 0.84535015, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.25769043, "step": 5866, "time_per_iteration": 2.8572540283203125 }, { "auxiliary_loss_clip": 0.01524369, "auxiliary_loss_mlp": 0.01050689, "balance_loss_clip": 1.32804823, "balance_loss_mlp": 1.02433181, "epoch": 0.3527431234029761, "flos": 18087389740800.0, "grad_norm": 3.1920081057223384, "language_loss": 0.85226274, "learning_rate": 3.0032586355598986e-06, "loss": 0.87801331, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 1.96191406, "router_z_loss_mlp": 0.26367188, "step": 5867, "time_per_iteration": 2.804725408554077 }, { "auxiliary_loss_clip": 0.01509994, "auxiliary_loss_mlp": 0.01059592, "balance_loss_clip": 1.3196156, "balance_loss_mlp": 1.0316968, "epoch": 0.35280324665564405, "flos": 19436783967360.0, "grad_norm": 2.1584447880871735, "language_loss": 0.74672031, "learning_rate": 3.0029216999885613e-06, "loss": 0.77241617, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.27929688, "step": 5868, "time_per_iteration": 2.848109245300293 }, { "auxiliary_loss_clip": 0.01518176, "auxiliary_loss_mlp": 0.01064291, "balance_loss_clip": 1.32666194, "balance_loss_mlp": 1.03781426, "epoch": 0.352863369908312, "flos": 21513219189120.0, "grad_norm": 1.5754360744581988, "language_loss": 0.623514, "learning_rate": 3.0025847263863327e-06, "loss": 0.64933872, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.26501465, "step": 5869, "time_per_iteration": 2.8394765853881836 }, { "auxiliary_loss_clip": 0.01509613, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.31973147, "balance_loss_mlp": 1.02789474, "epoch": 0.35292349316098, "flos": 22319628842880.0, "grad_norm": 1.7835937249507423, "language_loss": 0.75319719, "learning_rate": 3.0022477147659917e-06, "loss": 0.77882493, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.25280762, "step": 5870, "time_per_iteration": 2.90159273147583 }, { "auxiliary_loss_clip": 0.01500058, "auxiliary_loss_mlp": 0.01054378, "balance_loss_clip": 1.31302094, "balance_loss_mlp": 1.02835476, "epoch": 0.352983616413648, "flos": 33122790984960.0, "grad_norm": 1.3666863348004208, "language_loss": 0.72364175, "learning_rate": 3.001910665140316e-06, "loss": 0.7491861, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.26037598, "step": 5871, "time_per_iteration": 2.9779415130615234 }, { "auxiliary_loss_clip": 0.01479638, "auxiliary_loss_mlp": 0.01050555, "balance_loss_clip": 1.29764462, "balance_loss_mlp": 1.02659416, "epoch": 0.35304373966631597, "flos": 18705580450560.0, "grad_norm": 2.2035402299621305, "language_loss": 0.74318624, "learning_rate": 3.0015735775220873e-06, "loss": 0.76848817, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23962402, "step": 5872, "time_per_iteration": 2.835054874420166 }, { "auxiliary_loss_clip": 0.01504122, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.31952739, "balance_loss_mlp": 1.03078771, "epoch": 0.35310386291898394, "flos": 23374668528000.0, "grad_norm": 1.7587036871253636, "language_loss": 0.83342397, "learning_rate": 3.001236451924089e-06, "loss": 0.85902071, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.24768066, "step": 5873, "time_per_iteration": 2.859938621520996 }, { "auxiliary_loss_clip": 0.01522028, "auxiliary_loss_mlp": 0.01056643, "balance_loss_clip": 1.32966447, "balance_loss_mlp": 1.03034532, "epoch": 0.3531639861716519, "flos": 24472582300800.0, "grad_norm": 2.963680412476417, "language_loss": 0.67558604, "learning_rate": 3.000899288359104e-06, "loss": 0.70137274, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.26318359, "step": 5874, "time_per_iteration": 2.8979969024658203 }, { "auxiliary_loss_clip": 0.01283499, "auxiliary_loss_mlp": 0.01107543, "balance_loss_clip": 1.17316484, "balance_loss_mlp": 1.07492769, "epoch": 0.35322410942431987, "flos": 70341298951680.0, "grad_norm": 0.793463254094825, "language_loss": 0.61495793, "learning_rate": 3.000562086839917e-06, "loss": 0.63886833, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 1.1015625, "router_z_loss_mlp": 0.32617188, "step": 5875, "time_per_iteration": 3.24753999710083 }, { "auxiliary_loss_clip": 0.01497028, "auxiliary_loss_mlp": 0.01050032, "balance_loss_clip": 1.31091356, "balance_loss_mlp": 1.02512968, "epoch": 0.35328423267698783, "flos": 19828406125440.0, "grad_norm": 1.7777694925622165, "language_loss": 0.80684143, "learning_rate": 3.0002248473793163e-06, "loss": 0.83231205, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.24890137, "step": 5876, "time_per_iteration": 2.849835157394409 }, { "auxiliary_loss_clip": 0.01280479, "auxiliary_loss_mlp": 0.01057711, "balance_loss_clip": 1.1708988, "balance_loss_mlp": 1.02414155, "epoch": 0.3533443559296558, "flos": 60852801012480.0, "grad_norm": 0.6813134684430068, "language_loss": 0.56816685, "learning_rate": 2.999887569990088e-06, "loss": 0.59154874, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.3359375, "step": 5877, "time_per_iteration": 3.3602020740509033 }, { "auxiliary_loss_clip": 0.01505947, "auxiliary_loss_mlp": 0.01046182, "balance_loss_clip": 1.31864405, "balance_loss_mlp": 1.02037358, "epoch": 0.35340447918232376, "flos": 24766982087040.0, "grad_norm": 1.5498905983008981, "language_loss": 0.72994941, "learning_rate": 2.999550254685024e-06, "loss": 0.75547069, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.25793457, "step": 5878, "time_per_iteration": 2.9558303356170654 }, { "auxiliary_loss_clip": 0.01497737, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.31081486, "balance_loss_mlp": 1.019243, "epoch": 0.3534646024349917, "flos": 21805673448960.0, "grad_norm": 2.2759176422691447, "language_loss": 0.79269099, "learning_rate": 2.9992129014769136e-06, "loss": 0.81811559, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.25463867, "step": 5879, "time_per_iteration": 2.880070209503174 }, { "auxiliary_loss_clip": 0.01528937, "auxiliary_loss_mlp": 0.01048714, "balance_loss_clip": 1.33589935, "balance_loss_mlp": 1.02360892, "epoch": 0.3535247256876597, "flos": 20021963955840.0, "grad_norm": 2.0691336969778384, "language_loss": 0.64097035, "learning_rate": 2.9988755103785493e-06, "loss": 0.66674691, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.2512207, "step": 5880, "time_per_iteration": 2.918210506439209 }, { "auxiliary_loss_clip": 0.01501981, "auxiliary_loss_mlp": 0.0105583, "balance_loss_clip": 1.31214261, "balance_loss_mlp": 1.02977109, "epoch": 0.35358484894032766, "flos": 18197325901440.0, "grad_norm": 2.4463970431712947, "language_loss": 0.66168326, "learning_rate": 2.998538081402727e-06, "loss": 0.6872614, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.26086426, "step": 5881, "time_per_iteration": 2.9047420024871826 }, { "auxiliary_loss_clip": 0.01483541, "auxiliary_loss_mlp": 0.01055478, "balance_loss_clip": 1.30229783, "balance_loss_mlp": 1.02959752, "epoch": 0.3536449721929956, "flos": 22830643324800.0, "grad_norm": 1.4363513776401182, "language_loss": 0.76675379, "learning_rate": 2.998200614562239e-06, "loss": 0.79214406, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.25878906, "step": 5882, "time_per_iteration": 2.9823551177978516 }, { "auxiliary_loss_clip": 0.01504032, "auxiliary_loss_mlp": 0.01065144, "balance_loss_clip": 1.3155576, "balance_loss_mlp": 1.03777373, "epoch": 0.3537050954456636, "flos": 26443515352320.0, "grad_norm": 2.0686311879512327, "language_loss": 0.71367866, "learning_rate": 2.9978631098698847e-06, "loss": 0.73937041, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.27404785, "step": 5883, "time_per_iteration": 2.927515983581543 }, { "auxiliary_loss_clip": 0.01507916, "auxiliary_loss_mlp": 0.01054144, "balance_loss_clip": 1.31574082, "balance_loss_mlp": 1.02958667, "epoch": 0.3537652186983316, "flos": 17204823809280.0, "grad_norm": 1.770351041889775, "language_loss": 0.79293251, "learning_rate": 2.9975255673384614e-06, "loss": 0.81855309, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.2454834, "step": 5884, "time_per_iteration": 4.264991521835327 }, { "auxiliary_loss_clip": 0.01490552, "auxiliary_loss_mlp": 0.01056544, "balance_loss_clip": 1.30623221, "balance_loss_mlp": 1.03267801, "epoch": 0.3538253419509996, "flos": 19546222435200.0, "grad_norm": 2.844431623998467, "language_loss": 0.75971007, "learning_rate": 2.9971879869807673e-06, "loss": 0.78518111, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.23864746, "step": 5885, "time_per_iteration": 2.8482789993286133 }, { "auxiliary_loss_clip": 0.01498821, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.3098712, "balance_loss_mlp": 1.02471745, "epoch": 0.35388546520366754, "flos": 12135698040960.0, "grad_norm": 4.037867910935463, "language_loss": 0.85051203, "learning_rate": 2.996850368809606e-06, "loss": 0.87599504, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.24780273, "step": 5886, "time_per_iteration": 2.843576669692993 }, { "auxiliary_loss_clip": 0.01473085, "auxiliary_loss_mlp": 0.01053318, "balance_loss_clip": 1.28962338, "balance_loss_mlp": 1.02705598, "epoch": 0.3539455884563355, "flos": 19686454629120.0, "grad_norm": 2.13003910941335, "language_loss": 0.79395592, "learning_rate": 2.9965127128377787e-06, "loss": 0.81921995, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.26293945, "step": 5887, "time_per_iteration": 2.831603765487671 }, { "auxiliary_loss_clip": 0.01483764, "auxiliary_loss_mlp": 0.01051009, "balance_loss_clip": 1.29829979, "balance_loss_mlp": 1.02539146, "epoch": 0.35400571170900347, "flos": 18079743369600.0, "grad_norm": 1.7324122832185167, "language_loss": 0.66553766, "learning_rate": 2.996175019078089e-06, "loss": 0.69088537, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.25610352, "step": 5888, "time_per_iteration": 2.8590457439422607 }, { "auxiliary_loss_clip": 0.01496206, "auxiliary_loss_mlp": 0.01046937, "balance_loss_clip": 1.31059289, "balance_loss_mlp": 1.02243972, "epoch": 0.35406583496167143, "flos": 26079293560320.0, "grad_norm": 1.7020038476034263, "language_loss": 0.77402514, "learning_rate": 2.9958372875433437e-06, "loss": 0.79945654, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24487305, "step": 5889, "time_per_iteration": 4.297436714172363 }, { "auxiliary_loss_clip": 0.01499354, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.31512165, "balance_loss_mlp": 1.02310562, "epoch": 0.3541259582143394, "flos": 19802046389760.0, "grad_norm": 1.9173843034413605, "language_loss": 0.82130742, "learning_rate": 2.9954995182463478e-06, "loss": 0.84677553, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.24365234, "step": 5890, "time_per_iteration": 4.271352291107178 }, { "auxiliary_loss_clip": 0.01479438, "auxiliary_loss_mlp": 0.01041242, "balance_loss_clip": 1.29597616, "balance_loss_mlp": 1.01743543, "epoch": 0.35418608146700736, "flos": 24032068496640.0, "grad_norm": 1.6837926980078042, "language_loss": 0.8031795, "learning_rate": 2.99516171119991e-06, "loss": 0.82838631, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23803711, "step": 5891, "time_per_iteration": 2.8578197956085205 }, { "auxiliary_loss_clip": 0.01486467, "auxiliary_loss_mlp": 0.01049432, "balance_loss_clip": 1.30254412, "balance_loss_mlp": 1.02317011, "epoch": 0.35424620471967533, "flos": 12393422277120.0, "grad_norm": 1.872007149987236, "language_loss": 0.74067748, "learning_rate": 2.9948238664168415e-06, "loss": 0.76603645, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.26269531, "step": 5892, "time_per_iteration": 2.869631290435791 }, { "auxiliary_loss_clip": 0.01489939, "auxiliary_loss_mlp": 0.01042338, "balance_loss_clip": 1.30521214, "balance_loss_mlp": 1.01761436, "epoch": 0.3543063279723433, "flos": 19680753784320.0, "grad_norm": 2.0776294752082127, "language_loss": 0.67133772, "learning_rate": 2.9944859839099518e-06, "loss": 0.69666052, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.24731445, "step": 5893, "time_per_iteration": 4.252424478530884 }, { "auxiliary_loss_clip": 0.01488806, "auxiliary_loss_mlp": 0.01046957, "balance_loss_clip": 1.30468106, "balance_loss_mlp": 1.02186394, "epoch": 0.35436645122501126, "flos": 21919500662400.0, "grad_norm": 9.169872654723694, "language_loss": 0.7042048, "learning_rate": 2.9941480636920533e-06, "loss": 0.72956252, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.25085449, "step": 5894, "time_per_iteration": 2.8282227516174316 }, { "auxiliary_loss_clip": 0.0149021, "auxiliary_loss_mlp": 0.01042554, "balance_loss_clip": 1.30797732, "balance_loss_mlp": 1.01924872, "epoch": 0.3544265744776792, "flos": 21727888358400.0, "grad_norm": 4.437390629711494, "language_loss": 0.74903846, "learning_rate": 2.9938101057759615e-06, "loss": 0.77436614, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.23303223, "step": 5895, "time_per_iteration": 2.868025541305542 }, { "auxiliary_loss_clip": 0.01488294, "auxiliary_loss_mlp": 0.01046054, "balance_loss_clip": 1.30522513, "balance_loss_mlp": 1.02191448, "epoch": 0.3544866977303472, "flos": 21222619966080.0, "grad_norm": 1.721417544074894, "language_loss": 0.84947592, "learning_rate": 2.993472110174491e-06, "loss": 0.8748194, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.24121094, "step": 5896, "time_per_iteration": 2.841447591781616 }, { "auxiliary_loss_clip": 0.01487409, "auxiliary_loss_mlp": 0.0105497, "balance_loss_clip": 1.30417311, "balance_loss_mlp": 1.02925706, "epoch": 0.35454682098301515, "flos": 29322469175040.0, "grad_norm": 1.6643706111055712, "language_loss": 0.71523643, "learning_rate": 2.9931340769004576e-06, "loss": 0.74066019, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.25708008, "step": 5897, "time_per_iteration": 2.9384734630584717 }, { "auxiliary_loss_clip": 0.01482238, "auxiliary_loss_mlp": 0.01050259, "balance_loss_clip": 1.29811811, "balance_loss_mlp": 1.02560699, "epoch": 0.3546069442356832, "flos": 24327327934080.0, "grad_norm": 2.284167427759106, "language_loss": 0.81890249, "learning_rate": 2.9927960059666816e-06, "loss": 0.84422743, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.24658203, "step": 5898, "time_per_iteration": 2.8800716400146484 }, { "auxiliary_loss_clip": 0.01472294, "auxiliary_loss_mlp": 0.01050779, "balance_loss_clip": 1.29169273, "balance_loss_mlp": 1.02715182, "epoch": 0.35466706748835114, "flos": 22867680833280.0, "grad_norm": 3.1403215947964114, "language_loss": 0.74938047, "learning_rate": 2.9924578973859804e-06, "loss": 0.77461118, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.2364502, "step": 5899, "time_per_iteration": 2.8744659423828125 }, { "auxiliary_loss_clip": 0.01491676, "auxiliary_loss_mlp": 0.01040056, "balance_loss_clip": 1.3045609, "balance_loss_mlp": 1.01610637, "epoch": 0.3547271907410191, "flos": 28341911710080.0, "grad_norm": 1.6556085375803948, "language_loss": 0.80288851, "learning_rate": 2.9921197511711763e-06, "loss": 0.82820582, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.23962402, "step": 5900, "time_per_iteration": 2.9583418369293213 }, { "auxiliary_loss_clip": 0.01487031, "auxiliary_loss_mlp": 0.0104772, "balance_loss_clip": 1.30242682, "balance_loss_mlp": 1.02271008, "epoch": 0.35478731399368707, "flos": 23524673598720.0, "grad_norm": 1.814098865853592, "language_loss": 0.82971072, "learning_rate": 2.991781567335093e-06, "loss": 0.85505825, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.25024414, "step": 5901, "time_per_iteration": 2.8718202114105225 }, { "auxiliary_loss_clip": 0.01495756, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.30826449, "balance_loss_mlp": 1.01737368, "epoch": 0.35484743724635504, "flos": 18633450960000.0, "grad_norm": 1.7676723276022392, "language_loss": 0.76795542, "learning_rate": 2.9914433458905525e-06, "loss": 0.79332793, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.24108887, "step": 5902, "time_per_iteration": 2.8588809967041016 }, { "auxiliary_loss_clip": 0.01489991, "auxiliary_loss_mlp": 0.01044549, "balance_loss_clip": 1.30486929, "balance_loss_mlp": 1.0220542, "epoch": 0.354907560499023, "flos": 17393495201280.0, "grad_norm": 1.8421894349485655, "language_loss": 0.7232731, "learning_rate": 2.991105086850381e-06, "loss": 0.74861848, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.22473145, "step": 5903, "time_per_iteration": 2.883305072784424 }, { "auxiliary_loss_clip": 0.01497753, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.30822682, "balance_loss_mlp": 1.01104307, "epoch": 0.35496768375169097, "flos": 19217952276480.0, "grad_norm": 2.3379969520385884, "language_loss": 0.74530089, "learning_rate": 2.9907667902274053e-06, "loss": 0.77062607, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.23706055, "step": 5904, "time_per_iteration": 2.8320400714874268 }, { "auxiliary_loss_clip": 0.0150183, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.31425881, "balance_loss_mlp": 1.0212239, "epoch": 0.35502780700435893, "flos": 18341946840960.0, "grad_norm": 2.7787966889918296, "language_loss": 0.79144704, "learning_rate": 2.9904284560344536e-06, "loss": 0.81690478, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.22717285, "step": 5905, "time_per_iteration": 2.9491379261016846 }, { "auxiliary_loss_clip": 0.01460841, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.2855742, "balance_loss_mlp": 1.01534748, "epoch": 0.3550879302570269, "flos": 15456477767040.0, "grad_norm": 1.9159593793637681, "language_loss": 0.73243093, "learning_rate": 2.990090084284356e-06, "loss": 0.75741756, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.22497559, "step": 5906, "time_per_iteration": 2.8441784381866455 }, { "auxiliary_loss_clip": 0.01495123, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.30722153, "balance_loss_mlp": 1.01202607, "epoch": 0.35514805350969486, "flos": 21988734485760.0, "grad_norm": 1.9711710997907057, "language_loss": 0.76146978, "learning_rate": 2.9897516749899426e-06, "loss": 0.78676701, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22570801, "step": 5907, "time_per_iteration": 2.856736660003662 }, { "auxiliary_loss_clip": 0.01491189, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.30665731, "balance_loss_mlp": 1.01836443, "epoch": 0.3552081767623628, "flos": 29874005015040.0, "grad_norm": 2.1991219501888657, "language_loss": 0.76448536, "learning_rate": 2.989413228164047e-06, "loss": 0.78981787, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.23681641, "step": 5908, "time_per_iteration": 2.90158748626709 }, { "auxiliary_loss_clip": 0.01484715, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.30006981, "balance_loss_mlp": 1.01387048, "epoch": 0.3552683000150308, "flos": 26443424862720.0, "grad_norm": 2.051483521906333, "language_loss": 0.69092464, "learning_rate": 2.989074743819502e-06, "loss": 0.71612883, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.21838379, "step": 5909, "time_per_iteration": 2.878732204437256 }, { "auxiliary_loss_clip": 0.01471291, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.29473257, "balance_loss_mlp": 1.0172112, "epoch": 0.35532842326769876, "flos": 19793902325760.0, "grad_norm": 2.6026918679261013, "language_loss": 0.79087538, "learning_rate": 2.988736221969144e-06, "loss": 0.8159883, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.22790527, "step": 5910, "time_per_iteration": 2.8257815837860107 }, { "auxiliary_loss_clip": 0.01500003, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.31033671, "balance_loss_mlp": 1.01290739, "epoch": 0.3553885465203668, "flos": 17248421813760.0, "grad_norm": 2.1628292399089024, "language_loss": 0.71833539, "learning_rate": 2.98839766262581e-06, "loss": 0.7437073, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.24316406, "step": 5911, "time_per_iteration": 2.8181076049804688 }, { "auxiliary_loss_clip": 0.01473868, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.29352283, "balance_loss_mlp": 1.01708162, "epoch": 0.35544866977303474, "flos": 14941255518720.0, "grad_norm": 2.277521934796158, "language_loss": 0.87831986, "learning_rate": 2.9880590658023366e-06, "loss": 0.90345472, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.22546387, "step": 5912, "time_per_iteration": 2.8057937622070312 }, { "auxiliary_loss_clip": 0.01490641, "auxiliary_loss_mlp": 0.01040251, "balance_loss_clip": 1.30704141, "balance_loss_mlp": 1.01714826, "epoch": 0.3555087930257027, "flos": 19765642308480.0, "grad_norm": 2.1198469082448823, "language_loss": 0.78188682, "learning_rate": 2.9877204315115646e-06, "loss": 0.80719578, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.2310791, "step": 5913, "time_per_iteration": 2.921937942504883 }, { "auxiliary_loss_clip": 0.01479531, "auxiliary_loss_mlp": 0.01041288, "balance_loss_clip": 1.29816782, "balance_loss_mlp": 1.01762486, "epoch": 0.3555689162783707, "flos": 21077863292160.0, "grad_norm": 1.628324258800963, "language_loss": 0.83311939, "learning_rate": 2.9873817597663353e-06, "loss": 0.85832763, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.23681641, "step": 5914, "time_per_iteration": 2.872138738632202 }, { "auxiliary_loss_clip": 0.01504103, "auxiliary_loss_mlp": 0.01042565, "balance_loss_clip": 1.31717002, "balance_loss_mlp": 1.01914036, "epoch": 0.35562903953103864, "flos": 33081726689280.0, "grad_norm": 2.3834601736837246, "language_loss": 0.70938003, "learning_rate": 2.98704305057949e-06, "loss": 0.73484671, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23425293, "step": 5915, "time_per_iteration": 2.9198110103607178 }, { "auxiliary_loss_clip": 0.01480528, "auxiliary_loss_mlp": 0.01038757, "balance_loss_clip": 1.29671001, "balance_loss_mlp": 1.01586938, "epoch": 0.3556891627837066, "flos": 20567436992640.0, "grad_norm": 1.6802350603370486, "language_loss": 0.76923537, "learning_rate": 2.9867043039638737e-06, "loss": 0.79442823, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.22888184, "step": 5916, "time_per_iteration": 2.8342831134796143 }, { "auxiliary_loss_clip": 0.01488882, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.30312181, "balance_loss_mlp": 1.01605213, "epoch": 0.35574928603637457, "flos": 20712827093760.0, "grad_norm": 4.928583385049272, "language_loss": 0.88884342, "learning_rate": 2.986365519932332e-06, "loss": 0.91410911, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.21655273, "step": 5917, "time_per_iteration": 2.8293392658233643 }, { "auxiliary_loss_clip": 0.01485884, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.30149853, "balance_loss_mlp": 1.01399553, "epoch": 0.35580940928904253, "flos": 15202825562880.0, "grad_norm": 2.5556823901701287, "language_loss": 0.76184261, "learning_rate": 2.98602669849771e-06, "loss": 0.78705949, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.21801758, "step": 5918, "time_per_iteration": 4.227437257766724 }, { "auxiliary_loss_clip": 0.01261132, "auxiliary_loss_mlp": 0.01027029, "balance_loss_clip": 1.152179, "balance_loss_mlp": 1.00204301, "epoch": 0.3558695325417105, "flos": 58665524734080.0, "grad_norm": 0.9167161447229929, "language_loss": 0.63940626, "learning_rate": 2.985687839672857e-06, "loss": 0.66228789, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.25, "step": 5919, "time_per_iteration": 3.1191210746765137 }, { "auxiliary_loss_clip": 0.01502784, "auxiliary_loss_mlp": 0.01041148, "balance_loss_clip": 1.31262481, "balance_loss_mlp": 1.01961899, "epoch": 0.35592965579437846, "flos": 22028441437440.0, "grad_norm": 3.346441450886939, "language_loss": 0.74437094, "learning_rate": 2.9853489434706223e-06, "loss": 0.7698102, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.21520996, "step": 5920, "time_per_iteration": 2.8364038467407227 }, { "auxiliary_loss_clip": 0.01500167, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.31489301, "balance_loss_mlp": 1.01791608, "epoch": 0.35598977904704643, "flos": 23378107132800.0, "grad_norm": 1.913080206465589, "language_loss": 0.78585702, "learning_rate": 2.985010009903857e-06, "loss": 0.81126541, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22766113, "step": 5921, "time_per_iteration": 2.9208571910858154 }, { "auxiliary_loss_clip": 0.01496714, "auxiliary_loss_mlp": 0.01043424, "balance_loss_clip": 1.30879152, "balance_loss_mlp": 1.02132285, "epoch": 0.3560499022997144, "flos": 17794302053760.0, "grad_norm": 2.0833746316153636, "language_loss": 0.68443751, "learning_rate": 2.9846710389854133e-06, "loss": 0.70983887, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.22106934, "step": 5922, "time_per_iteration": 2.834113121032715 }, { "auxiliary_loss_clip": 0.01494551, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.30949008, "balance_loss_mlp": 1.01670408, "epoch": 0.35611002555238236, "flos": 20750136071040.0, "grad_norm": 1.9121156687299066, "language_loss": 0.80188262, "learning_rate": 2.9843320307281454e-06, "loss": 0.8272146, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21923828, "step": 5923, "time_per_iteration": 2.836017370223999 }, { "auxiliary_loss_clip": 0.01491713, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.30623889, "balance_loss_mlp": 1.01933718, "epoch": 0.3561701488050504, "flos": 19471242522240.0, "grad_norm": 4.451716274599384, "language_loss": 0.85741365, "learning_rate": 2.983992985144908e-06, "loss": 0.8827492, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.22509766, "step": 5924, "time_per_iteration": 4.217506408691406 }, { "auxiliary_loss_clip": 0.0149385, "auxiliary_loss_mlp": 0.01042157, "balance_loss_clip": 1.30822492, "balance_loss_mlp": 1.02032971, "epoch": 0.35623027205771834, "flos": 30787455162240.0, "grad_norm": 3.9600483476531676, "language_loss": 0.77972496, "learning_rate": 2.9836539022485578e-06, "loss": 0.805085, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.21826172, "step": 5925, "time_per_iteration": 4.381460189819336 }, { "auxiliary_loss_clip": 0.0148603, "auxiliary_loss_mlp": 0.01041264, "balance_loss_clip": 1.30037475, "balance_loss_mlp": 1.01882887, "epoch": 0.3562903953103863, "flos": 16989430723200.0, "grad_norm": 2.0368867902616046, "language_loss": 0.77653539, "learning_rate": 2.9833147820519535e-06, "loss": 0.80180836, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22424316, "step": 5926, "time_per_iteration": 2.798799991607666 }, { "auxiliary_loss_clip": 0.01505815, "auxiliary_loss_mlp": 0.01042633, "balance_loss_clip": 1.31581306, "balance_loss_mlp": 1.01993537, "epoch": 0.3563505185630543, "flos": 23849821866240.0, "grad_norm": 2.042205538791556, "language_loss": 0.70062435, "learning_rate": 2.9829756245679544e-06, "loss": 0.72610885, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.22692871, "step": 5927, "time_per_iteration": 2.8645682334899902 }, { "auxiliary_loss_clip": 0.01487209, "auxiliary_loss_mlp": 0.01049667, "balance_loss_clip": 1.30420089, "balance_loss_mlp": 1.02806592, "epoch": 0.35641064181572224, "flos": 22283677209600.0, "grad_norm": 2.3450775708416076, "language_loss": 0.80593276, "learning_rate": 2.9826364298094212e-06, "loss": 0.83130145, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.21618652, "step": 5928, "time_per_iteration": 4.197360277175903 }, { "auxiliary_loss_clip": 0.01480179, "auxiliary_loss_mlp": 0.01045547, "balance_loss_clip": 1.29539418, "balance_loss_mlp": 1.02314758, "epoch": 0.3564707650683902, "flos": 23011170652800.0, "grad_norm": 1.3791347546277852, "language_loss": 0.82401001, "learning_rate": 2.982297197789215e-06, "loss": 0.84926724, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22399902, "step": 5929, "time_per_iteration": 2.8482091426849365 }, { "auxiliary_loss_clip": 0.01467279, "auxiliary_loss_mlp": 0.01040045, "balance_loss_clip": 1.28688848, "balance_loss_mlp": 1.01805067, "epoch": 0.35653088832105817, "flos": 14692489752960.0, "grad_norm": 2.4558171581397055, "language_loss": 0.71307117, "learning_rate": 2.981957928520201e-06, "loss": 0.73814446, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.2199707, "step": 5930, "time_per_iteration": 2.8549370765686035 }, { "auxiliary_loss_clip": 0.01489352, "auxiliary_loss_mlp": 0.01043873, "balance_loss_clip": 1.30110312, "balance_loss_mlp": 1.02168798, "epoch": 0.35659101157372614, "flos": 23487500355840.0, "grad_norm": 2.3432339328826925, "language_loss": 0.69077611, "learning_rate": 2.981618622015244e-06, "loss": 0.71610844, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.22192383, "step": 5931, "time_per_iteration": 2.849738121032715 }, { "auxiliary_loss_clip": 0.01470148, "auxiliary_loss_mlp": 0.01048192, "balance_loss_clip": 1.28859377, "balance_loss_mlp": 1.02507782, "epoch": 0.3566511348263941, "flos": 26589991328640.0, "grad_norm": 1.6084686599236901, "language_loss": 0.68441129, "learning_rate": 2.981279278287211e-06, "loss": 0.70959473, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23132324, "step": 5932, "time_per_iteration": 2.902076482772827 }, { "auxiliary_loss_clip": 0.01475153, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.2937088, "balance_loss_mlp": 1.01940882, "epoch": 0.35671125807906207, "flos": 13122725512320.0, "grad_norm": 2.5205406756795, "language_loss": 0.80427086, "learning_rate": 2.980939897348969e-06, "loss": 0.82943189, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.21533203, "step": 5933, "time_per_iteration": 2.853135824203491 }, { "auxiliary_loss_clip": 0.0148536, "auxiliary_loss_mlp": 0.01046872, "balance_loss_clip": 1.29931355, "balance_loss_mlp": 1.02304244, "epoch": 0.35677138133173003, "flos": 33013669230720.0, "grad_norm": 1.659074760625499, "language_loss": 0.70133281, "learning_rate": 2.980600479213388e-06, "loss": 0.72665519, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23815918, "step": 5934, "time_per_iteration": 2.976716995239258 }, { "auxiliary_loss_clip": 0.01504508, "auxiliary_loss_mlp": 0.01041441, "balance_loss_clip": 1.31167364, "balance_loss_mlp": 1.01796877, "epoch": 0.356831504584398, "flos": 20787761761920.0, "grad_norm": 1.8157790590120326, "language_loss": 0.72107339, "learning_rate": 2.9802610238933384e-06, "loss": 0.74653292, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.23474121, "step": 5935, "time_per_iteration": 2.9108774662017822 }, { "auxiliary_loss_clip": 0.01470488, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.28655457, "balance_loss_mlp": 1.0156188, "epoch": 0.35689162783706596, "flos": 12172826039040.0, "grad_norm": 2.032077837321027, "language_loss": 0.78488827, "learning_rate": 2.979921531401692e-06, "loss": 0.80997366, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.22460938, "step": 5936, "time_per_iteration": 2.816220998764038 }, { "auxiliary_loss_clip": 0.014721, "auxiliary_loss_mlp": 0.01040693, "balance_loss_clip": 1.29011512, "balance_loss_mlp": 1.01846027, "epoch": 0.356951751089734, "flos": 23851903127040.0, "grad_norm": 1.4297885648484168, "language_loss": 0.65164179, "learning_rate": 2.9795820017513242e-06, "loss": 0.67676973, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22216797, "step": 5937, "time_per_iteration": 2.8711416721343994 }, { "auxiliary_loss_clip": 0.01483739, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.29760134, "balance_loss_mlp": 1.01433527, "epoch": 0.35701187434240195, "flos": 11727878244480.0, "grad_norm": 2.2070968330277503, "language_loss": 0.79530442, "learning_rate": 2.9792424349551073e-06, "loss": 0.82051241, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.22729492, "step": 5938, "time_per_iteration": 2.8140807151794434 }, { "auxiliary_loss_clip": 0.01477665, "auxiliary_loss_mlp": 0.01042353, "balance_loss_clip": 1.29407287, "balance_loss_mlp": 1.02002501, "epoch": 0.3570719975950699, "flos": 24909386031360.0, "grad_norm": 1.5429947458018223, "language_loss": 0.81119919, "learning_rate": 2.9789028310259202e-06, "loss": 0.83639932, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.2232666, "step": 5939, "time_per_iteration": 2.888073444366455 }, { "auxiliary_loss_clip": 0.01505286, "auxiliary_loss_mlp": 0.01038023, "balance_loss_clip": 1.31143785, "balance_loss_mlp": 1.01552784, "epoch": 0.3571321208477379, "flos": 26005987704960.0, "grad_norm": 1.7576658151324345, "language_loss": 0.80104977, "learning_rate": 2.9785631899766395e-06, "loss": 0.82648289, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 1.93652344, "router_z_loss_mlp": 0.22485352, "step": 5940, "time_per_iteration": 2.878488779067993 }, { "auxiliary_loss_clip": 0.01478578, "auxiliary_loss_mlp": 0.01038628, "balance_loss_clip": 1.29047501, "balance_loss_mlp": 1.01629996, "epoch": 0.35719224410040584, "flos": 14509790674560.0, "grad_norm": 2.30464604201849, "language_loss": 0.73329687, "learning_rate": 2.9782235118201443e-06, "loss": 0.75846893, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22314453, "step": 5941, "time_per_iteration": 2.835662603378296 }, { "auxiliary_loss_clip": 0.01493737, "auxiliary_loss_mlp": 0.01041081, "balance_loss_clip": 1.30876851, "balance_loss_mlp": 1.01872909, "epoch": 0.3572523673530738, "flos": 31187854811520.0, "grad_norm": 2.891491454387655, "language_loss": 0.65288311, "learning_rate": 2.9778837965693154e-06, "loss": 0.6782313, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22338867, "step": 5942, "time_per_iteration": 2.934222936630249 }, { "auxiliary_loss_clip": 0.01480825, "auxiliary_loss_mlp": 0.0104255, "balance_loss_clip": 1.2948699, "balance_loss_mlp": 1.01897025, "epoch": 0.3573124906057418, "flos": 15860542245120.0, "grad_norm": 1.77306445642961, "language_loss": 0.74614733, "learning_rate": 2.9775440442370354e-06, "loss": 0.77138108, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.23571777, "step": 5943, "time_per_iteration": 2.8524742126464844 }, { "auxiliary_loss_clip": 0.01283448, "auxiliary_loss_mlp": 0.01053642, "balance_loss_clip": 1.1767211, "balance_loss_mlp": 1.03409219, "epoch": 0.35737261385840974, "flos": 60848231287680.0, "grad_norm": 0.8156437494343916, "language_loss": 0.60808945, "learning_rate": 2.9772042548361867e-06, "loss": 0.63146043, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.1953125, "step": 5944, "time_per_iteration": 3.4341330528259277 }, { "auxiliary_loss_clip": 0.01475728, "auxiliary_loss_mlp": 0.01045444, "balance_loss_clip": 1.29140449, "balance_loss_mlp": 1.0224489, "epoch": 0.3574327371110777, "flos": 18853097057280.0, "grad_norm": 1.6608294610844703, "language_loss": 0.73252904, "learning_rate": 2.976864428379655e-06, "loss": 0.75774074, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.22998047, "step": 5945, "time_per_iteration": 2.8418240547180176 }, { "auxiliary_loss_clip": 0.01469253, "auxiliary_loss_mlp": 0.01042381, "balance_loss_clip": 1.28532958, "balance_loss_mlp": 1.01845574, "epoch": 0.35749286036374567, "flos": 23560037049600.0, "grad_norm": 2.206165827475419, "language_loss": 0.81428039, "learning_rate": 2.976524564880326e-06, "loss": 0.83939672, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.23937988, "step": 5946, "time_per_iteration": 2.8385822772979736 }, { "auxiliary_loss_clip": 0.01504571, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.31681025, "balance_loss_mlp": 1.01764655, "epoch": 0.35755298361641363, "flos": 21115308003840.0, "grad_norm": 1.3715599140902572, "language_loss": 0.69977987, "learning_rate": 2.9761846643510882e-06, "loss": 0.72522175, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.21960449, "step": 5947, "time_per_iteration": 2.902299404144287 }, { "auxiliary_loss_clip": 0.01466021, "auxiliary_loss_mlp": 0.0104447, "balance_loss_clip": 1.28617311, "balance_loss_mlp": 1.02246356, "epoch": 0.3576131068690816, "flos": 19254446847360.0, "grad_norm": 2.1595837366012147, "language_loss": 0.76357025, "learning_rate": 2.9758447268048297e-06, "loss": 0.78867513, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.2199707, "step": 5948, "time_per_iteration": 2.8406879901885986 }, { "auxiliary_loss_clip": 0.01486047, "auxiliary_loss_mlp": 0.01044466, "balance_loss_clip": 1.30057752, "balance_loss_mlp": 1.02231646, "epoch": 0.35767323012174956, "flos": 28665612144000.0, "grad_norm": 1.7538443878487264, "language_loss": 0.71686298, "learning_rate": 2.9755047522544415e-06, "loss": 0.74216807, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22143555, "step": 5949, "time_per_iteration": 2.9003255367279053 }, { "auxiliary_loss_clip": 0.01485913, "auxiliary_loss_mlp": 0.01044079, "balance_loss_clip": 1.30041456, "balance_loss_mlp": 1.02337205, "epoch": 0.35773335337441753, "flos": 17092399184640.0, "grad_norm": 1.9660650844725822, "language_loss": 0.78762442, "learning_rate": 2.9751647407128154e-06, "loss": 0.81292427, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.20715332, "step": 5950, "time_per_iteration": 2.8292899131774902 }, { "auxiliary_loss_clip": 0.01493292, "auxiliary_loss_mlp": 0.01043798, "balance_loss_clip": 1.30627346, "balance_loss_mlp": 1.02175641, "epoch": 0.35779347662708555, "flos": 15897217795200.0, "grad_norm": 1.8773067881355587, "language_loss": 0.7350589, "learning_rate": 2.9748246921928445e-06, "loss": 0.7604298, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22045898, "step": 5951, "time_per_iteration": 2.880941390991211 }, { "auxiliary_loss_clip": 0.01510751, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.31962538, "balance_loss_mlp": 1.02339649, "epoch": 0.3578535998797535, "flos": 28670815296000.0, "grad_norm": 2.1354967458706433, "language_loss": 0.70975113, "learning_rate": 2.9744846067074236e-06, "loss": 0.7353127, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.21984863, "step": 5952, "time_per_iteration": 2.9058597087860107 }, { "auxiliary_loss_clip": 0.01484844, "auxiliary_loss_mlp": 0.01047392, "balance_loss_clip": 1.30174279, "balance_loss_mlp": 1.02537429, "epoch": 0.3579137231324215, "flos": 37866587506560.0, "grad_norm": 1.7194579061305273, "language_loss": 0.70728195, "learning_rate": 2.974144484269449e-06, "loss": 0.73260432, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22033691, "step": 5953, "time_per_iteration": 4.3763508796691895 }, { "auxiliary_loss_clip": 0.01489007, "auxiliary_loss_mlp": 0.01044437, "balance_loss_clip": 1.30340576, "balance_loss_mlp": 1.02241945, "epoch": 0.35797384638508944, "flos": 22357209288960.0, "grad_norm": 1.5181384774718423, "language_loss": 0.67377681, "learning_rate": 2.9738043248918175e-06, "loss": 0.69911122, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22009277, "step": 5954, "time_per_iteration": 2.866511106491089 }, { "auxiliary_loss_clip": 0.01491888, "auxiliary_loss_mlp": 0.01048703, "balance_loss_clip": 1.31073666, "balance_loss_mlp": 1.02661395, "epoch": 0.3580339696377574, "flos": 13597652626560.0, "grad_norm": 2.2414895295217048, "language_loss": 0.76282048, "learning_rate": 2.9734641285874282e-06, "loss": 0.78822643, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.2208252, "step": 5955, "time_per_iteration": 2.860464096069336 }, { "auxiliary_loss_clip": 0.01482986, "auxiliary_loss_mlp": 0.01047739, "balance_loss_clip": 1.30202663, "balance_loss_mlp": 1.02614975, "epoch": 0.3580940928904254, "flos": 23778552026880.0, "grad_norm": 2.584515097728016, "language_loss": 0.76368213, "learning_rate": 2.973123895369182e-06, "loss": 0.78898931, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21594238, "step": 5956, "time_per_iteration": 2.9444873332977295 }, { "auxiliary_loss_clip": 0.01479622, "auxiliary_loss_mlp": 0.01047208, "balance_loss_clip": 1.30131876, "balance_loss_mlp": 1.02574992, "epoch": 0.35815421614309334, "flos": 19473323783040.0, "grad_norm": 1.889453265257825, "language_loss": 0.74332869, "learning_rate": 2.9727836252499805e-06, "loss": 0.76859701, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.21472168, "step": 5957, "time_per_iteration": 2.8872735500335693 }, { "auxiliary_loss_clip": 0.01496449, "auxiliary_loss_mlp": 0.0105164, "balance_loss_clip": 1.31120825, "balance_loss_mlp": 1.03115988, "epoch": 0.3582143393957613, "flos": 23378740560000.0, "grad_norm": 2.009572949899705, "language_loss": 0.72166222, "learning_rate": 2.972443318242726e-06, "loss": 0.74714309, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.20483398, "step": 5958, "time_per_iteration": 2.9787826538085938 }, { "auxiliary_loss_clip": 0.01483385, "auxiliary_loss_mlp": 0.01046636, "balance_loss_clip": 1.30329835, "balance_loss_mlp": 1.0245229, "epoch": 0.35827446264842927, "flos": 26334529332480.0, "grad_norm": 1.8203149964298313, "language_loss": 0.89421296, "learning_rate": 2.972102974360324e-06, "loss": 0.91951311, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22106934, "step": 5959, "time_per_iteration": 4.326797723770142 }, { "auxiliary_loss_clip": 0.01492559, "auxiliary_loss_mlp": 0.01046098, "balance_loss_clip": 1.31028676, "balance_loss_mlp": 1.02475905, "epoch": 0.35833458590109724, "flos": 30459049269120.0, "grad_norm": 1.6103273068722281, "language_loss": 0.59376442, "learning_rate": 2.971762593615679e-06, "loss": 0.619151, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.21337891, "step": 5960, "time_per_iteration": 4.470661163330078 }, { "auxiliary_loss_clip": 0.01486271, "auxiliary_loss_mlp": 0.01047095, "balance_loss_clip": 1.30306721, "balance_loss_mlp": 1.02477884, "epoch": 0.3583947091537652, "flos": 14838060833280.0, "grad_norm": 1.880272712120269, "language_loss": 0.77433181, "learning_rate": 2.9714221760216993e-06, "loss": 0.79966545, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.22314453, "step": 5961, "time_per_iteration": 2.894744873046875 }, { "auxiliary_loss_clip": 0.0149072, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.30680418, "balance_loss_mlp": 1.02100062, "epoch": 0.35845483240643317, "flos": 34253941703040.0, "grad_norm": 2.024029564631576, "language_loss": 0.71140575, "learning_rate": 2.971081721591294e-06, "loss": 0.73674411, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22119141, "step": 5962, "time_per_iteration": 3.031308889389038 }, { "auxiliary_loss_clip": 0.01486048, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 1.30493164, "balance_loss_mlp": 1.01902378, "epoch": 0.35851495565910113, "flos": 20970008392320.0, "grad_norm": 1.5701342569419792, "language_loss": 0.75035942, "learning_rate": 2.9707412303373716e-06, "loss": 0.77561939, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.20910645, "step": 5963, "time_per_iteration": 4.249804496765137 }, { "auxiliary_loss_clip": 0.01499251, "auxiliary_loss_mlp": 0.01046962, "balance_loss_clip": 1.31642973, "balance_loss_mlp": 1.02452683, "epoch": 0.35857507891176915, "flos": 22320081290880.0, "grad_norm": 1.5903193486383551, "language_loss": 0.79304427, "learning_rate": 2.9704007022728447e-06, "loss": 0.81850642, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.22436523, "step": 5964, "time_per_iteration": 2.8704819679260254 }, { "auxiliary_loss_clip": 0.01492649, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.30660701, "balance_loss_mlp": 1.01905894, "epoch": 0.3586352021644371, "flos": 23378333356800.0, "grad_norm": 2.7079500986767817, "language_loss": 0.67707086, "learning_rate": 2.970060137410626e-06, "loss": 0.70240903, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.22119141, "step": 5965, "time_per_iteration": 2.8442797660827637 }, { "auxiliary_loss_clip": 0.01487041, "auxiliary_loss_mlp": 0.01041203, "balance_loss_clip": 1.30531955, "balance_loss_mlp": 1.01823115, "epoch": 0.3586953254171051, "flos": 27859202490240.0, "grad_norm": 1.572995959585258, "language_loss": 0.80110067, "learning_rate": 2.9697195357636294e-06, "loss": 0.82638311, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.22961426, "step": 5966, "time_per_iteration": 2.9373557567596436 }, { "auxiliary_loss_clip": 0.01490374, "auxiliary_loss_mlp": 0.01044635, "balance_loss_clip": 1.30628479, "balance_loss_mlp": 1.02181816, "epoch": 0.35875544866977305, "flos": 19509999333120.0, "grad_norm": 2.407760084263722, "language_loss": 0.92467141, "learning_rate": 2.9693788973447715e-06, "loss": 0.95002151, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.22827148, "step": 5967, "time_per_iteration": 2.9103174209594727 }, { "auxiliary_loss_clip": 0.0148151, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.29564428, "balance_loss_mlp": 1.01613045, "epoch": 0.358815571922441, "flos": 21481249098240.0, "grad_norm": 1.6960847891276656, "language_loss": 0.80918187, "learning_rate": 2.9690382221669682e-06, "loss": 0.83438766, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22949219, "step": 5968, "time_per_iteration": 2.828022003173828 }, { "auxiliary_loss_clip": 0.01496921, "auxiliary_loss_mlp": 0.01049277, "balance_loss_clip": 1.31130219, "balance_loss_mlp": 1.02544701, "epoch": 0.358875695175109, "flos": 21845470890240.0, "grad_norm": 2.0794903990954823, "language_loss": 0.85847002, "learning_rate": 2.9686975102431384e-06, "loss": 0.88393199, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.23828125, "step": 5969, "time_per_iteration": 2.87953782081604 }, { "auxiliary_loss_clip": 0.01490783, "auxiliary_loss_mlp": 0.01041304, "balance_loss_clip": 1.30901122, "balance_loss_mlp": 1.02006125, "epoch": 0.35893581842777694, "flos": 32023474623360.0, "grad_norm": 2.1749556858516534, "language_loss": 0.73019499, "learning_rate": 2.968356761586202e-06, "loss": 0.75551587, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21240234, "step": 5970, "time_per_iteration": 2.987529993057251 }, { "auxiliary_loss_clip": 0.01488511, "auxiliary_loss_mlp": 0.01040108, "balance_loss_clip": 1.30528069, "balance_loss_mlp": 1.0182693, "epoch": 0.3589959416804449, "flos": 20495624215680.0, "grad_norm": 1.6739271661964252, "language_loss": 0.80937016, "learning_rate": 2.9680159762090805e-06, "loss": 0.83465636, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.21838379, "step": 5971, "time_per_iteration": 2.8597004413604736 }, { "auxiliary_loss_clip": 0.01500039, "auxiliary_loss_mlp": 0.01041085, "balance_loss_clip": 1.31184816, "balance_loss_mlp": 1.01800561, "epoch": 0.3590560649331129, "flos": 16189400586240.0, "grad_norm": 1.7113892545255982, "language_loss": 0.79579908, "learning_rate": 2.967675154124696e-06, "loss": 0.82121027, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.23071289, "step": 5972, "time_per_iteration": 2.821305751800537 }, { "auxiliary_loss_clip": 0.01496178, "auxiliary_loss_mlp": 0.01042794, "balance_loss_clip": 1.31036615, "balance_loss_mlp": 1.02121711, "epoch": 0.35911618818578084, "flos": 20384918893440.0, "grad_norm": 2.0982417181111592, "language_loss": 0.8193934, "learning_rate": 2.9673342953459722e-06, "loss": 0.84478313, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.21594238, "step": 5973, "time_per_iteration": 2.855057716369629 }, { "auxiliary_loss_clip": 0.01274594, "auxiliary_loss_mlp": 0.01065437, "balance_loss_clip": 1.16429627, "balance_loss_mlp": 1.04150009, "epoch": 0.3591763114384488, "flos": 41258881278720.0, "grad_norm": 0.937385527100441, "language_loss": 0.56788468, "learning_rate": 2.9669933998858355e-06, "loss": 0.59128499, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 1.1015625, "router_z_loss_mlp": 0.23925781, "step": 5974, "time_per_iteration": 3.221466064453125 }, { "auxiliary_loss_clip": 0.01490101, "auxiliary_loss_mlp": 0.01043675, "balance_loss_clip": 1.30669689, "balance_loss_mlp": 1.02221704, "epoch": 0.35923643469111677, "flos": 18704132616960.0, "grad_norm": 2.4374672598232894, "language_loss": 0.702465, "learning_rate": 2.9666524677572114e-06, "loss": 0.72780281, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21484375, "step": 5975, "time_per_iteration": 2.8233208656311035 }, { "auxiliary_loss_clip": 0.0148847, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.30470824, "balance_loss_mlp": 1.01943016, "epoch": 0.35929655794378473, "flos": 25020905760000.0, "grad_norm": 2.1039657682169013, "language_loss": 0.8082428, "learning_rate": 2.96631149897303e-06, "loss": 0.83354628, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.22460938, "step": 5976, "time_per_iteration": 2.92484188079834 }, { "auxiliary_loss_clip": 0.01477252, "auxiliary_loss_mlp": 0.01046077, "balance_loss_clip": 1.29442537, "balance_loss_mlp": 1.02219987, "epoch": 0.35935668119645275, "flos": 14983722403200.0, "grad_norm": 2.0275011794583984, "language_loss": 0.80002618, "learning_rate": 2.9659704935462194e-06, "loss": 0.82525945, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.23852539, "step": 5977, "time_per_iteration": 2.812204122543335 }, { "auxiliary_loss_clip": 0.01472746, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.29199219, "balance_loss_mlp": 1.01504135, "epoch": 0.3594168044491207, "flos": 21187889942400.0, "grad_norm": 1.8909056252313554, "language_loss": 0.81374747, "learning_rate": 2.9656294514897102e-06, "loss": 0.83885026, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.22497559, "step": 5978, "time_per_iteration": 2.888421058654785 }, { "auxiliary_loss_clip": 0.01497265, "auxiliary_loss_mlp": 0.01041053, "balance_loss_clip": 1.31067848, "balance_loss_mlp": 1.01893973, "epoch": 0.3594769277017887, "flos": 27683471111040.0, "grad_norm": 1.81120802292484, "language_loss": 0.6855104, "learning_rate": 2.965288372816436e-06, "loss": 0.71089357, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.22106934, "step": 5979, "time_per_iteration": 2.988886833190918 }, { "auxiliary_loss_clip": 0.01486913, "auxiliary_loss_mlp": 0.01043987, "balance_loss_clip": 1.30287552, "balance_loss_mlp": 1.02186203, "epoch": 0.35953705095445665, "flos": 23012527996800.0, "grad_norm": 2.413972486205155, "language_loss": 0.68391323, "learning_rate": 2.9649472575393296e-06, "loss": 0.70922226, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22131348, "step": 5980, "time_per_iteration": 2.897665023803711 }, { "auxiliary_loss_clip": 0.01519686, "auxiliary_loss_mlp": 0.01053067, "balance_loss_clip": 1.32744837, "balance_loss_mlp": 1.02949905, "epoch": 0.3595971742071246, "flos": 25523323729920.0, "grad_norm": 1.8092605196932077, "language_loss": 0.71995735, "learning_rate": 2.964606105671327e-06, "loss": 0.74568498, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.23571777, "step": 5981, "time_per_iteration": 2.962467670440674 }, { "auxiliary_loss_clip": 0.01502061, "auxiliary_loss_mlp": 0.01051471, "balance_loss_clip": 1.3135556, "balance_loss_mlp": 1.02765322, "epoch": 0.3596572974597926, "flos": 29874774176640.0, "grad_norm": 1.7206835342983149, "language_loss": 0.71864879, "learning_rate": 2.9642649172253635e-06, "loss": 0.74418414, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.23815918, "step": 5982, "time_per_iteration": 3.0216333866119385 }, { "auxiliary_loss_clip": 0.01477365, "auxiliary_loss_mlp": 0.01048331, "balance_loss_clip": 1.2980938, "balance_loss_mlp": 1.02614617, "epoch": 0.35971742071246054, "flos": 23122961850240.0, "grad_norm": 1.662769111284471, "language_loss": 0.76723337, "learning_rate": 2.9639236922143786e-06, "loss": 0.79249036, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22167969, "step": 5983, "time_per_iteration": 2.878126859664917 }, { "auxiliary_loss_clip": 0.01511149, "auxiliary_loss_mlp": 0.01054675, "balance_loss_clip": 1.32289028, "balance_loss_mlp": 1.03140569, "epoch": 0.3597775439651285, "flos": 16733561523840.0, "grad_norm": 2.117915875455009, "language_loss": 0.7658453, "learning_rate": 2.96358243065131e-06, "loss": 0.79150355, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.23278809, "step": 5984, "time_per_iteration": 2.840381383895874 }, { "auxiliary_loss_clip": 0.01485698, "auxiliary_loss_mlp": 0.01048311, "balance_loss_clip": 1.30466104, "balance_loss_mlp": 1.02706838, "epoch": 0.3598376672177965, "flos": 19729057248000.0, "grad_norm": 1.865716805996135, "language_loss": 0.87485266, "learning_rate": 2.9632411325490993e-06, "loss": 0.90019274, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21228027, "step": 5985, "time_per_iteration": 2.8272838592529297 }, { "auxiliary_loss_clip": 0.01489688, "auxiliary_loss_mlp": 0.01045294, "balance_loss_clip": 1.3085382, "balance_loss_mlp": 1.02405071, "epoch": 0.35989779047046444, "flos": 17320506059520.0, "grad_norm": 1.4576524482883215, "language_loss": 0.7358681, "learning_rate": 2.9628997979206884e-06, "loss": 0.76121795, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21252441, "step": 5986, "time_per_iteration": 2.8298404216766357 }, { "auxiliary_loss_clip": 0.01520025, "auxiliary_loss_mlp": 0.01053291, "balance_loss_clip": 1.32899809, "balance_loss_mlp": 1.03128493, "epoch": 0.3599579137231324, "flos": 22721566815360.0, "grad_norm": 1.8413381032419178, "language_loss": 0.74763393, "learning_rate": 2.9625584267790204e-06, "loss": 0.77336705, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.22009277, "step": 5987, "time_per_iteration": 2.85229754447937 }, { "auxiliary_loss_clip": 0.01499747, "auxiliary_loss_mlp": 0.01049338, "balance_loss_clip": 1.31347215, "balance_loss_mlp": 1.02792764, "epoch": 0.36001803697580037, "flos": 20969736923520.0, "grad_norm": 5.557195787198832, "language_loss": 0.70809519, "learning_rate": 2.9622170191370404e-06, "loss": 0.73358607, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.21411133, "step": 5988, "time_per_iteration": 4.283057689666748 }, { "auxiliary_loss_clip": 0.01503994, "auxiliary_loss_mlp": 0.01054621, "balance_loss_clip": 1.31612587, "balance_loss_mlp": 1.0320431, "epoch": 0.36007816022846834, "flos": 20495307502080.0, "grad_norm": 1.7321591849179958, "language_loss": 0.73957825, "learning_rate": 2.9618755750076953e-06, "loss": 0.76516438, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.22583008, "step": 5989, "time_per_iteration": 2.8868801593780518 }, { "auxiliary_loss_clip": 0.01491715, "auxiliary_loss_mlp": 0.01047326, "balance_loss_clip": 1.30843019, "balance_loss_mlp": 1.02617824, "epoch": 0.36013828348113636, "flos": 28012827144960.0, "grad_norm": 1.5499160553451723, "language_loss": 0.80542225, "learning_rate": 2.961534094403931e-06, "loss": 0.83081269, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.21142578, "step": 5990, "time_per_iteration": 2.911135673522949 }, { "auxiliary_loss_clip": 0.01489874, "auxiliary_loss_mlp": 0.01049376, "balance_loss_clip": 1.30538452, "balance_loss_mlp": 1.02745295, "epoch": 0.3601984067338043, "flos": 20091288268800.0, "grad_norm": 1.5957317966349969, "language_loss": 0.84539437, "learning_rate": 2.961192577338698e-06, "loss": 0.87078679, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.21923828, "step": 5991, "time_per_iteration": 2.8812973499298096 }, { "auxiliary_loss_clip": 0.01498956, "auxiliary_loss_mlp": 0.01049322, "balance_loss_clip": 1.31004298, "balance_loss_mlp": 1.02792442, "epoch": 0.3602585299864723, "flos": 18624990182400.0, "grad_norm": 1.8219980675124852, "language_loss": 0.76301718, "learning_rate": 2.9608510238249463e-06, "loss": 0.78849995, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.21398926, "step": 5992, "time_per_iteration": 2.8238253593444824 }, { "auxiliary_loss_clip": 0.01492937, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.30932975, "balance_loss_mlp": 1.029212, "epoch": 0.36031865323914025, "flos": 19582400292480.0, "grad_norm": 1.9451218457664603, "language_loss": 0.78169882, "learning_rate": 2.960509433875627e-06, "loss": 0.8071503, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23010254, "step": 5993, "time_per_iteration": 2.854663848876953 }, { "auxiliary_loss_clip": 0.01487543, "auxiliary_loss_mlp": 0.01049163, "balance_loss_clip": 1.30133951, "balance_loss_mlp": 1.02638245, "epoch": 0.3603787764918082, "flos": 17498771147520.0, "grad_norm": 3.679889935585441, "language_loss": 0.75882024, "learning_rate": 2.9601678075036943e-06, "loss": 0.78418732, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.22766113, "step": 5994, "time_per_iteration": 4.268512964248657 }, { "auxiliary_loss_clip": 0.01499258, "auxiliary_loss_mlp": 0.01042932, "balance_loss_clip": 1.31139731, "balance_loss_mlp": 1.02102149, "epoch": 0.3604388997444762, "flos": 15531819638400.0, "grad_norm": 2.0562246084855267, "language_loss": 0.70488656, "learning_rate": 2.9598261447221024e-06, "loss": 0.73030853, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.21911621, "step": 5995, "time_per_iteration": 4.23672080039978 }, { "auxiliary_loss_clip": 0.0150898, "auxiliary_loss_mlp": 0.01051889, "balance_loss_clip": 1.32044816, "balance_loss_mlp": 1.02995491, "epoch": 0.36049902299714415, "flos": 17319691653120.0, "grad_norm": 2.5461852925319537, "language_loss": 0.82792377, "learning_rate": 2.9594844455438057e-06, "loss": 0.85353243, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.21948242, "step": 5996, "time_per_iteration": 2.8254098892211914 }, { "auxiliary_loss_clip": 0.01499573, "auxiliary_loss_mlp": 0.01044571, "balance_loss_clip": 1.31444621, "balance_loss_mlp": 1.02270854, "epoch": 0.3605591462498121, "flos": 17064908328960.0, "grad_norm": 1.5472151844505635, "language_loss": 0.73631191, "learning_rate": 2.959142709981763e-06, "loss": 0.76175344, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21862793, "step": 5997, "time_per_iteration": 2.9526751041412354 }, { "auxiliary_loss_clip": 0.01476889, "auxiliary_loss_mlp": 0.01042143, "balance_loss_clip": 1.29508293, "balance_loss_mlp": 1.0204711, "epoch": 0.3606192695024801, "flos": 16845624190080.0, "grad_norm": 2.4186230848539205, "language_loss": 0.70902014, "learning_rate": 2.9588009380489337e-06, "loss": 0.73421043, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21679688, "step": 5998, "time_per_iteration": 4.274650573730469 }, { "auxiliary_loss_clip": 0.0147316, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.29001379, "balance_loss_mlp": 1.01474512, "epoch": 0.36067939275514804, "flos": 12137055384960.0, "grad_norm": 2.651976858957742, "language_loss": 0.78432447, "learning_rate": 2.9584591297582758e-06, "loss": 0.80943286, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.22937012, "step": 5999, "time_per_iteration": 2.7896034717559814 }, { "auxiliary_loss_clip": 0.01497739, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.31211114, "balance_loss_mlp": 1.01904464, "epoch": 0.360739516007816, "flos": 18050668945920.0, "grad_norm": 5.085688751135163, "language_loss": 0.78998792, "learning_rate": 2.9581172851227516e-06, "loss": 0.81537902, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22302246, "step": 6000, "time_per_iteration": 2.8993637561798096 }, { "auxiliary_loss_clip": 0.01479879, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.29653084, "balance_loss_mlp": 1.01637948, "epoch": 0.360799639260484, "flos": 18558516291840.0, "grad_norm": 1.7224950187403487, "language_loss": 0.79688394, "learning_rate": 2.9577754041553243e-06, "loss": 0.82208467, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23828125, "step": 6001, "time_per_iteration": 2.889944314956665 }, { "auxiliary_loss_clip": 0.01469869, "auxiliary_loss_mlp": 0.01038817, "balance_loss_clip": 1.28965604, "balance_loss_mlp": 1.01660872, "epoch": 0.36085976251315194, "flos": 19691341067520.0, "grad_norm": 1.9769917908939385, "language_loss": 0.8401531, "learning_rate": 2.9574334868689575e-06, "loss": 0.86523998, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.22216797, "step": 6002, "time_per_iteration": 2.840131998062134 }, { "auxiliary_loss_clip": 0.01451063, "auxiliary_loss_mlp": 0.01033771, "balance_loss_clip": 1.27608585, "balance_loss_mlp": 1.01194382, "epoch": 0.3609198857658199, "flos": 24207528407040.0, "grad_norm": 2.1826438725327506, "language_loss": 0.92234683, "learning_rate": 2.9570915332766165e-06, "loss": 0.94719517, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.21838379, "step": 6003, "time_per_iteration": 2.8664186000823975 }, { "auxiliary_loss_clip": 0.01255104, "auxiliary_loss_mlp": 0.01031325, "balance_loss_clip": 1.1455977, "balance_loss_mlp": 1.01139283, "epoch": 0.3609800090184879, "flos": 57144380670720.0, "grad_norm": 0.869269092296618, "language_loss": 0.5346365, "learning_rate": 2.9567495433912693e-06, "loss": 0.55750084, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.19921875, "step": 6004, "time_per_iteration": 3.2506120204925537 }, { "auxiliary_loss_clip": 0.01480038, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.29481912, "balance_loss_mlp": 1.01821768, "epoch": 0.3610401322711559, "flos": 20820365280000.0, "grad_norm": 2.1921643610064003, "language_loss": 0.7850076, "learning_rate": 2.956407517225883e-06, "loss": 0.81023431, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.2442627, "step": 6005, "time_per_iteration": 2.88390851020813 }, { "auxiliary_loss_clip": 0.01467112, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.28617859, "balance_loss_mlp": 1.01630664, "epoch": 0.36110025552382385, "flos": 13707136339200.0, "grad_norm": 1.9836770032587356, "language_loss": 0.79772711, "learning_rate": 2.956065454793429e-06, "loss": 0.82279468, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.2331543, "step": 6006, "time_per_iteration": 2.838115692138672 }, { "auxiliary_loss_clip": 0.01478501, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.29485846, "balance_loss_mlp": 1.01530337, "epoch": 0.3611603787764918, "flos": 22465064188800.0, "grad_norm": 1.8919396841344198, "language_loss": 0.85496676, "learning_rate": 2.955723356106876e-06, "loss": 0.88014907, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.24414062, "step": 6007, "time_per_iteration": 2.8564505577087402 }, { "auxiliary_loss_clip": 0.01504027, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.31156743, "balance_loss_mlp": 1.01805568, "epoch": 0.3612205020291598, "flos": 20896431068160.0, "grad_norm": 2.018110748395329, "language_loss": 0.73032677, "learning_rate": 2.955381221179198e-06, "loss": 0.75578058, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.23291016, "step": 6008, "time_per_iteration": 2.826467514038086 }, { "auxiliary_loss_clip": 0.01485481, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.3007201, "balance_loss_mlp": 1.01850057, "epoch": 0.36128062528182775, "flos": 15750379860480.0, "grad_norm": 2.361592914507676, "language_loss": 0.84362662, "learning_rate": 2.955039050023368e-06, "loss": 0.86889255, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.22631836, "step": 6009, "time_per_iteration": 2.840806245803833 }, { "auxiliary_loss_clip": 0.01483452, "auxiliary_loss_mlp": 0.01042579, "balance_loss_clip": 1.29866838, "balance_loss_mlp": 1.01880908, "epoch": 0.3613407485344957, "flos": 16773404209920.0, "grad_norm": 2.460389416092129, "language_loss": 0.76953375, "learning_rate": 2.954696842652362e-06, "loss": 0.79479408, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.2376709, "step": 6010, "time_per_iteration": 2.8667469024658203 }, { "auxiliary_loss_clip": 0.01485212, "auxiliary_loss_mlp": 0.01043363, "balance_loss_clip": 1.30084503, "balance_loss_mlp": 1.0200336, "epoch": 0.3614008717871637, "flos": 20379625251840.0, "grad_norm": 5.359851133144061, "language_loss": 0.835504, "learning_rate": 2.9543545990791554e-06, "loss": 0.86078978, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.23327637, "step": 6011, "time_per_iteration": 2.8594775199890137 }, { "auxiliary_loss_clip": 0.01499994, "auxiliary_loss_mlp": 0.01050077, "balance_loss_clip": 1.30966067, "balance_loss_mlp": 1.02457809, "epoch": 0.36146099503983165, "flos": 22785054549120.0, "grad_norm": 2.0202564358939483, "language_loss": 0.62913543, "learning_rate": 2.954012319316727e-06, "loss": 0.6546362, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.25488281, "step": 6012, "time_per_iteration": 2.8527724742889404 }, { "auxiliary_loss_clip": 0.01462264, "auxiliary_loss_mlp": 0.01041049, "balance_loss_clip": 1.28174961, "balance_loss_mlp": 1.01850629, "epoch": 0.3615211182924996, "flos": 23005379318400.0, "grad_norm": 1.7624527857848, "language_loss": 0.84157717, "learning_rate": 2.9536700033780565e-06, "loss": 0.86661029, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22558594, "step": 6013, "time_per_iteration": 2.863112449645996 }, { "auxiliary_loss_clip": 0.01466984, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.28336537, "balance_loss_mlp": 1.0165503, "epoch": 0.3615812415451676, "flos": 16655143006080.0, "grad_norm": 2.242357097041817, "language_loss": 0.92338067, "learning_rate": 2.9533276512761228e-06, "loss": 0.94844782, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23181152, "step": 6014, "time_per_iteration": 2.818052291870117 }, { "auxiliary_loss_clip": 0.01462698, "auxiliary_loss_mlp": 0.01043967, "balance_loss_clip": 1.28275335, "balance_loss_mlp": 1.02101874, "epoch": 0.36164136479783554, "flos": 21328846053120.0, "grad_norm": 1.6116860931682282, "language_loss": 0.74907541, "learning_rate": 2.95298526302391e-06, "loss": 0.77414203, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22961426, "step": 6015, "time_per_iteration": 2.875652551651001 }, { "auxiliary_loss_clip": 0.01486155, "auxiliary_loss_mlp": 0.01038997, "balance_loss_clip": 1.30130613, "balance_loss_mlp": 1.01507151, "epoch": 0.3617014880505035, "flos": 24179992306560.0, "grad_norm": 3.44407887045406, "language_loss": 0.66040599, "learning_rate": 2.9526428386344e-06, "loss": 0.6856575, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.23901367, "step": 6016, "time_per_iteration": 2.93022084236145 }, { "auxiliary_loss_clip": 0.01481329, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.29680371, "balance_loss_mlp": 1.01579642, "epoch": 0.3617616113031715, "flos": 39027943768320.0, "grad_norm": 1.7281043797984248, "language_loss": 0.7265287, "learning_rate": 2.9523003781205785e-06, "loss": 0.7517423, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.2421875, "step": 6017, "time_per_iteration": 3.027055025100708 }, { "auxiliary_loss_clip": 0.01478899, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.2924211, "balance_loss_mlp": 1.01402569, "epoch": 0.3618217345558395, "flos": 12138548463360.0, "grad_norm": 1.984576165815701, "language_loss": 0.74565673, "learning_rate": 2.9519578814954307e-06, "loss": 0.77081323, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.22741699, "step": 6018, "time_per_iteration": 2.8565731048583984 }, { "auxiliary_loss_clip": 0.0144867, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.27234447, "balance_loss_mlp": 1.01478374, "epoch": 0.36188185780850746, "flos": 24945428154240.0, "grad_norm": 1.5514589197786195, "language_loss": 0.6952678, "learning_rate": 2.9516153487719448e-06, "loss": 0.72014296, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.24072266, "step": 6019, "time_per_iteration": 2.9547243118286133 }, { "auxiliary_loss_clip": 0.01486761, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.30085254, "balance_loss_mlp": 1.01502705, "epoch": 0.3619419810611754, "flos": 20968424824320.0, "grad_norm": 1.8075555641827235, "language_loss": 0.7680167, "learning_rate": 2.95127277996311e-06, "loss": 0.79326046, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.22595215, "step": 6020, "time_per_iteration": 2.8417208194732666 }, { "auxiliary_loss_clip": 0.01477576, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.29336619, "balance_loss_mlp": 1.01662099, "epoch": 0.3620021043138434, "flos": 22539184450560.0, "grad_norm": 2.3496219508852967, "language_loss": 0.74396271, "learning_rate": 2.9509301750819156e-06, "loss": 0.76913577, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.23132324, "step": 6021, "time_per_iteration": 2.923877716064453 }, { "auxiliary_loss_clip": 0.01465787, "auxiliary_loss_mlp": 0.01041691, "balance_loss_clip": 1.28388858, "balance_loss_mlp": 1.01863623, "epoch": 0.36206222756651135, "flos": 15605215983360.0, "grad_norm": 1.6613944900577262, "language_loss": 0.82073897, "learning_rate": 2.9505875341413533e-06, "loss": 0.84581381, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.23071289, "step": 6022, "time_per_iteration": 2.842090129852295 }, { "auxiliary_loss_clip": 0.01454331, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.27783966, "balance_loss_mlp": 1.01318312, "epoch": 0.3621223508191793, "flos": 23597798474880.0, "grad_norm": 1.7398921930124356, "language_loss": 0.82129693, "learning_rate": 2.950244857154417e-06, "loss": 0.84620303, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.2310791, "step": 6023, "time_per_iteration": 2.857262134552002 }, { "auxiliary_loss_clip": 0.01481871, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.29528308, "balance_loss_mlp": 1.01322508, "epoch": 0.3621824740718473, "flos": 22320126535680.0, "grad_norm": 1.6578131460468306, "language_loss": 0.80939245, "learning_rate": 2.9499021441341e-06, "loss": 0.83458358, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.24035645, "step": 6024, "time_per_iteration": 4.258174896240234 }, { "auxiliary_loss_clip": 0.01450895, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.27328265, "balance_loss_mlp": 1.01411557, "epoch": 0.36224259732451525, "flos": 16772318334720.0, "grad_norm": 1.8344432753428062, "language_loss": 0.75758427, "learning_rate": 2.9495593950933997e-06, "loss": 0.78246403, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.22998047, "step": 6025, "time_per_iteration": 2.849942922592163 }, { "auxiliary_loss_clip": 0.01462465, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.28141975, "balance_loss_mlp": 1.01581466, "epoch": 0.3623027205771832, "flos": 23160361317120.0, "grad_norm": 1.6778386719154663, "language_loss": 0.73606706, "learning_rate": 2.9492166100453107e-06, "loss": 0.76108897, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.23937988, "step": 6026, "time_per_iteration": 2.8670058250427246 }, { "auxiliary_loss_clip": 0.01487545, "auxiliary_loss_mlp": 0.01045057, "balance_loss_clip": 1.29971075, "balance_loss_mlp": 1.02052343, "epoch": 0.3623628438298512, "flos": 28561060114560.0, "grad_norm": 1.9549438957194902, "language_loss": 0.79277408, "learning_rate": 2.948873789002833e-06, "loss": 0.81810009, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.24511719, "step": 6027, "time_per_iteration": 2.9297337532043457 }, { "auxiliary_loss_clip": 0.01463469, "auxiliary_loss_mlp": 0.01042362, "balance_loss_clip": 1.28014421, "balance_loss_mlp": 1.0183053, "epoch": 0.36242296708251914, "flos": 25496194832640.0, "grad_norm": 1.8062734011625254, "language_loss": 0.68703246, "learning_rate": 2.9485309319789667e-06, "loss": 0.71209073, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.24072266, "step": 6028, "time_per_iteration": 2.9004640579223633 }, { "auxiliary_loss_clip": 0.01468405, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.28793144, "balance_loss_mlp": 1.01643872, "epoch": 0.3624830903351871, "flos": 16299201012480.0, "grad_norm": 1.7769836831604713, "language_loss": 0.86191285, "learning_rate": 2.9481880389867117e-06, "loss": 0.88699049, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22937012, "step": 6029, "time_per_iteration": 4.284532308578491 }, { "auxiliary_loss_clip": 0.01464272, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.28487945, "balance_loss_mlp": 1.0168035, "epoch": 0.36254321358785513, "flos": 18305452270080.0, "grad_norm": 1.6361069531614483, "language_loss": 0.73559022, "learning_rate": 2.9478451100390714e-06, "loss": 0.7606352, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.23413086, "step": 6030, "time_per_iteration": 4.263043403625488 }, { "auxiliary_loss_clip": 0.01483187, "auxiliary_loss_mlp": 0.01042347, "balance_loss_clip": 1.29708982, "balance_loss_mlp": 1.017766, "epoch": 0.3626033368405231, "flos": 14873198060160.0, "grad_norm": 2.1758758588396123, "language_loss": 0.75513828, "learning_rate": 2.94750214514905e-06, "loss": 0.7803936, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.24597168, "step": 6031, "time_per_iteration": 2.9086179733276367 }, { "auxiliary_loss_clip": 0.01463608, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.28213191, "balance_loss_mlp": 1.01567864, "epoch": 0.36266346009319106, "flos": 22315964014080.0, "grad_norm": 1.687661347278102, "language_loss": 0.74576586, "learning_rate": 2.9471591443296516e-06, "loss": 0.77079266, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.23388672, "step": 6032, "time_per_iteration": 2.852872610092163 }, { "auxiliary_loss_clip": 0.01480281, "auxiliary_loss_mlp": 0.01044397, "balance_loss_clip": 1.29647541, "balance_loss_mlp": 1.02167547, "epoch": 0.362723583345859, "flos": 18231196273920.0, "grad_norm": 1.9718548503373239, "language_loss": 0.78449929, "learning_rate": 2.946816107593884e-06, "loss": 0.80974609, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22741699, "step": 6033, "time_per_iteration": 4.240483045578003 }, { "auxiliary_loss_clip": 0.01276045, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.168185, "balance_loss_mlp": 1.01356399, "epoch": 0.362783706598527, "flos": 68530415316480.0, "grad_norm": 0.7963741240708653, "language_loss": 0.64847732, "learning_rate": 2.9464730349547547e-06, "loss": 0.67158413, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.2109375, "step": 6034, "time_per_iteration": 3.4088032245635986 }, { "auxiliary_loss_clip": 0.01458953, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.28095245, "balance_loss_mlp": 1.01852059, "epoch": 0.36284382985119495, "flos": 26587367130240.0, "grad_norm": 1.4877683545041624, "language_loss": 0.90268493, "learning_rate": 2.946129926425273e-06, "loss": 0.92768306, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.22314453, "step": 6035, "time_per_iteration": 2.8808484077453613 }, { "auxiliary_loss_clip": 0.01480802, "auxiliary_loss_mlp": 0.01049265, "balance_loss_clip": 1.29475784, "balance_loss_mlp": 1.02630568, "epoch": 0.3629039531038629, "flos": 20166313426560.0, "grad_norm": 1.870229554374374, "language_loss": 0.74795854, "learning_rate": 2.9457867820184496e-06, "loss": 0.77325922, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.22961426, "step": 6036, "time_per_iteration": 2.8843064308166504 }, { "auxiliary_loss_clip": 0.01479912, "auxiliary_loss_mlp": 0.01043526, "balance_loss_clip": 1.29221439, "balance_loss_mlp": 1.02082837, "epoch": 0.3629640763565309, "flos": 18635215507200.0, "grad_norm": 2.419788597548329, "language_loss": 0.76860595, "learning_rate": 2.945443601747297e-06, "loss": 0.79384029, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.22705078, "step": 6037, "time_per_iteration": 2.8315038681030273 }, { "auxiliary_loss_clip": 0.01463974, "auxiliary_loss_mlp": 0.01045092, "balance_loss_clip": 1.28535855, "balance_loss_mlp": 1.02234674, "epoch": 0.36302419960919885, "flos": 19580454766080.0, "grad_norm": 1.5647508732641637, "language_loss": 0.79273653, "learning_rate": 2.945100385624828e-06, "loss": 0.81782722, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22741699, "step": 6038, "time_per_iteration": 2.8738584518432617 }, { "auxiliary_loss_clip": 0.01266466, "auxiliary_loss_mlp": 0.01020096, "balance_loss_clip": 1.15898192, "balance_loss_mlp": 0.99968714, "epoch": 0.3630843228618668, "flos": 63828949944960.0, "grad_norm": 0.8698185729433199, "language_loss": 0.63379496, "learning_rate": 2.9447571336640573e-06, "loss": 0.65666056, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.20410156, "step": 6039, "time_per_iteration": 3.445794105529785 }, { "auxiliary_loss_clip": 0.01462844, "auxiliary_loss_mlp": 0.01048094, "balance_loss_clip": 1.28222537, "balance_loss_mlp": 1.02516985, "epoch": 0.3631444461145348, "flos": 21845154176640.0, "grad_norm": 2.1198778475037785, "language_loss": 0.72425091, "learning_rate": 2.944413845878002e-06, "loss": 0.74936026, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22924805, "step": 6040, "time_per_iteration": 2.8730623722076416 }, { "auxiliary_loss_clip": 0.01488327, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.30243993, "balance_loss_mlp": 1.02179754, "epoch": 0.36320456936720275, "flos": 21731055494400.0, "grad_norm": 1.712448659894353, "language_loss": 0.82057816, "learning_rate": 2.9440705222796783e-06, "loss": 0.84590518, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.22583008, "step": 6041, "time_per_iteration": 2.981795072555542 }, { "auxiliary_loss_clip": 0.01480383, "auxiliary_loss_mlp": 0.0104475, "balance_loss_clip": 1.29464853, "balance_loss_mlp": 1.02018118, "epoch": 0.3632646926198707, "flos": 17027825575680.0, "grad_norm": 1.9466610298853078, "language_loss": 0.85515106, "learning_rate": 2.943727162882107e-06, "loss": 0.88040245, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.24560547, "step": 6042, "time_per_iteration": 2.8340020179748535 }, { "auxiliary_loss_clip": 0.01468734, "auxiliary_loss_mlp": 0.01050413, "balance_loss_clip": 1.28815436, "balance_loss_mlp": 1.02748883, "epoch": 0.36332481587253873, "flos": 23341567317120.0, "grad_norm": 1.5911113367821486, "language_loss": 0.78785408, "learning_rate": 2.9433837676983064e-06, "loss": 0.8130455, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.22900391, "step": 6043, "time_per_iteration": 2.879533052444458 }, { "auxiliary_loss_clip": 0.01471594, "auxiliary_loss_mlp": 0.01045321, "balance_loss_clip": 1.29185247, "balance_loss_mlp": 1.02165794, "epoch": 0.3633849391252067, "flos": 10750668894720.0, "grad_norm": 3.199068266606343, "language_loss": 0.66350454, "learning_rate": 2.943040336741298e-06, "loss": 0.68867373, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.23657227, "step": 6044, "time_per_iteration": 2.8355069160461426 }, { "auxiliary_loss_clip": 0.01472464, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.29114687, "balance_loss_mlp": 1.01555276, "epoch": 0.36344506237787466, "flos": 25860326135040.0, "grad_norm": 1.7399331878188895, "language_loss": 0.81384593, "learning_rate": 2.9426968700241066e-06, "loss": 0.83895737, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23132324, "step": 6045, "time_per_iteration": 2.866396188735962 }, { "auxiliary_loss_clip": 0.01473581, "auxiliary_loss_mlp": 0.01045489, "balance_loss_clip": 1.29105449, "balance_loss_mlp": 1.02257657, "epoch": 0.3635051856305426, "flos": 30166233050880.0, "grad_norm": 2.0290333141777825, "language_loss": 0.65480602, "learning_rate": 2.942353367559755e-06, "loss": 0.67999673, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.22912598, "step": 6046, "time_per_iteration": 2.9372034072875977 }, { "auxiliary_loss_clip": 0.0147493, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.29199815, "balance_loss_mlp": 1.0191493, "epoch": 0.3635653088832106, "flos": 22208063869440.0, "grad_norm": 1.515138129407602, "language_loss": 0.78014731, "learning_rate": 2.9420098293612692e-06, "loss": 0.80532503, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.23706055, "step": 6047, "time_per_iteration": 2.8611257076263428 }, { "auxiliary_loss_clip": 0.0148731, "auxiliary_loss_mlp": 0.01044595, "balance_loss_clip": 1.29734409, "balance_loss_mlp": 1.02100325, "epoch": 0.36362543213587856, "flos": 24796916161920.0, "grad_norm": 1.567852203170704, "language_loss": 0.80343771, "learning_rate": 2.9416662554416767e-06, "loss": 0.82875681, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.23596191, "step": 6048, "time_per_iteration": 2.8817338943481445 }, { "auxiliary_loss_clip": 0.01243168, "auxiliary_loss_mlp": 0.01041516, "balance_loss_clip": 1.13901424, "balance_loss_mlp": 1.0227288, "epoch": 0.3636855553885465, "flos": 62558879132160.0, "grad_norm": 0.7782096075010976, "language_loss": 0.52622092, "learning_rate": 2.9413226458140054e-06, "loss": 0.54906774, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.1875, "step": 6049, "time_per_iteration": 3.418602466583252 }, { "auxiliary_loss_clip": 0.01482058, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.29695344, "balance_loss_mlp": 1.01595163, "epoch": 0.3637456786412145, "flos": 24071187265920.0, "grad_norm": 1.8279309815169458, "language_loss": 0.87224412, "learning_rate": 2.9409790004912845e-06, "loss": 0.89745915, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.23510742, "step": 6050, "time_per_iteration": 2.865732431411743 }, { "auxiliary_loss_clip": 0.01465641, "auxiliary_loss_mlp": 0.01043711, "balance_loss_clip": 1.28501678, "balance_loss_mlp": 1.02104974, "epoch": 0.36380580189388245, "flos": 16700369823360.0, "grad_norm": 1.8590703131379729, "language_loss": 0.79378641, "learning_rate": 2.940635319486546e-06, "loss": 0.81887996, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22668457, "step": 6051, "time_per_iteration": 2.904627561569214 }, { "auxiliary_loss_clip": 0.01473583, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.29010475, "balance_loss_mlp": 1.01810217, "epoch": 0.3638659251465504, "flos": 25123693242240.0, "grad_norm": 2.057356218906122, "language_loss": 0.83085883, "learning_rate": 2.940291602812822e-06, "loss": 0.85600621, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.23059082, "step": 6052, "time_per_iteration": 2.8706858158111572 }, { "auxiliary_loss_clip": 0.01464847, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.287678, "balance_loss_mlp": 1.01980948, "epoch": 0.3639260483992184, "flos": 23013387648000.0, "grad_norm": 1.5533857229964654, "language_loss": 0.72903085, "learning_rate": 2.939947850483145e-06, "loss": 0.75410533, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.22802734, "step": 6053, "time_per_iteration": 2.9413135051727295 }, { "auxiliary_loss_clip": 0.01240773, "auxiliary_loss_mlp": 0.01017751, "balance_loss_clip": 1.13818884, "balance_loss_mlp": 1.00163364, "epoch": 0.36398617165188635, "flos": 70745544408960.0, "grad_norm": 0.766787507091847, "language_loss": 0.6124922, "learning_rate": 2.9396040625105532e-06, "loss": 0.63507736, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.16113281, "step": 6054, "time_per_iteration": 3.3966116905212402 }, { "auxiliary_loss_clip": 0.01484942, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.29997361, "balance_loss_mlp": 1.03077459, "epoch": 0.3640462949045543, "flos": 22245282357120.0, "grad_norm": 3.511688798022756, "language_loss": 0.76344198, "learning_rate": 2.9392602389080802e-06, "loss": 0.78884697, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.24829102, "step": 6055, "time_per_iteration": 2.8474414348602295 }, { "auxiliary_loss_clip": 0.0148898, "auxiliary_loss_mlp": 0.01049402, "balance_loss_clip": 1.30562568, "balance_loss_mlp": 1.02603662, "epoch": 0.3641064181572223, "flos": 21553514323200.0, "grad_norm": 1.744908593797914, "language_loss": 0.75914299, "learning_rate": 2.938916379688765e-06, "loss": 0.78452682, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23364258, "step": 6056, "time_per_iteration": 2.900407552719116 }, { "auxiliary_loss_clip": 0.01477012, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.29593289, "balance_loss_mlp": 1.02312744, "epoch": 0.3641665414098903, "flos": 22283496230400.0, "grad_norm": 1.8468259289296993, "language_loss": 0.81692141, "learning_rate": 2.9385724848656468e-06, "loss": 0.84215879, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.23596191, "step": 6057, "time_per_iteration": 2.869539737701416 }, { "auxiliary_loss_clip": 0.01468295, "auxiliary_loss_mlp": 0.01056062, "balance_loss_clip": 1.28940392, "balance_loss_mlp": 1.03222072, "epoch": 0.36422666466255826, "flos": 28341368772480.0, "grad_norm": 2.0067268963520113, "language_loss": 0.81259072, "learning_rate": 2.9382285544517647e-06, "loss": 0.83783424, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.23840332, "step": 6058, "time_per_iteration": 2.921931028366089 }, { "auxiliary_loss_clip": 0.014817, "auxiliary_loss_mlp": 0.01049996, "balance_loss_clip": 1.29790759, "balance_loss_mlp": 1.02715492, "epoch": 0.36428678791522623, "flos": 24181168671360.0, "grad_norm": 1.790003326347346, "language_loss": 0.86089849, "learning_rate": 2.9378845884601636e-06, "loss": 0.88621545, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.22827148, "step": 6059, "time_per_iteration": 4.396906614303589 }, { "auxiliary_loss_clip": 0.01479142, "auxiliary_loss_mlp": 0.01050503, "balance_loss_clip": 1.29467463, "balance_loss_mlp": 1.02590966, "epoch": 0.3643469111678942, "flos": 22538641512960.0, "grad_norm": 1.5274394792978823, "language_loss": 0.88725948, "learning_rate": 2.937540586903884e-06, "loss": 0.91255593, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.24597168, "step": 6060, "time_per_iteration": 2.855309247970581 }, { "auxiliary_loss_clip": 0.01487539, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.30134571, "balance_loss_mlp": 1.02712798, "epoch": 0.36440703442056216, "flos": 19435833826560.0, "grad_norm": 1.9514311047955741, "language_loss": 0.67698783, "learning_rate": 2.937196549795971e-06, "loss": 0.70238006, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.24572754, "step": 6061, "time_per_iteration": 2.874579429626465 }, { "auxiliary_loss_clip": 0.01503455, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.31636405, "balance_loss_mlp": 1.02475214, "epoch": 0.3644671576732301, "flos": 18049492581120.0, "grad_norm": 2.2317472470628297, "language_loss": 0.77517521, "learning_rate": 2.9368524771494718e-06, "loss": 0.80068892, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23168945, "step": 6062, "time_per_iteration": 2.8331878185272217 }, { "auxiliary_loss_clip": 0.0147259, "auxiliary_loss_mlp": 0.01045129, "balance_loss_clip": 1.29266512, "balance_loss_mlp": 1.02243209, "epoch": 0.3645272809258981, "flos": 21552609427200.0, "grad_norm": 1.6750971410909206, "language_loss": 0.73876131, "learning_rate": 2.936508368977432e-06, "loss": 0.76393843, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.22692871, "step": 6063, "time_per_iteration": 2.895822048187256 }, { "auxiliary_loss_clip": 0.01461647, "auxiliary_loss_mlp": 0.01043731, "balance_loss_clip": 1.2826947, "balance_loss_mlp": 1.02053308, "epoch": 0.36458740417856605, "flos": 22756975511040.0, "grad_norm": 1.957146427272074, "language_loss": 0.68836403, "learning_rate": 2.936164225292901e-06, "loss": 0.71341777, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.23193359, "step": 6064, "time_per_iteration": 4.294676303863525 }, { "auxiliary_loss_clip": 0.01486788, "auxiliary_loss_mlp": 0.01045652, "balance_loss_clip": 1.30218863, "balance_loss_mlp": 1.02315736, "epoch": 0.364647527431234, "flos": 26151830254080.0, "grad_norm": 11.46844714887566, "language_loss": 0.74899793, "learning_rate": 2.9358200461089297e-06, "loss": 0.77432233, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22497559, "step": 6065, "time_per_iteration": 4.2820234298706055 }, { "auxiliary_loss_clip": 0.01486595, "auxiliary_loss_mlp": 0.01046513, "balance_loss_clip": 1.29962635, "balance_loss_mlp": 1.02239728, "epoch": 0.364707650683902, "flos": 31042374220800.0, "grad_norm": 1.8957278342203059, "language_loss": 0.75540525, "learning_rate": 2.9354758314385676e-06, "loss": 0.78073633, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.2409668, "step": 6066, "time_per_iteration": 2.895617961883545 }, { "auxiliary_loss_clip": 0.01471715, "auxiliary_loss_mlp": 0.01042832, "balance_loss_clip": 1.2917465, "balance_loss_mlp": 1.01872814, "epoch": 0.36476777393656995, "flos": 19582264558080.0, "grad_norm": 2.145310713997748, "language_loss": 0.77669394, "learning_rate": 2.9351315812948684e-06, "loss": 0.80183941, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.2409668, "step": 6067, "time_per_iteration": 2.8569114208221436 }, { "auxiliary_loss_clip": 0.01472778, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.29296219, "balance_loss_mlp": 1.02103233, "epoch": 0.3648278971892379, "flos": 17757807482880.0, "grad_norm": 2.3654277471745, "language_loss": 0.727579, "learning_rate": 2.934787295690886e-06, "loss": 0.7527436, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22631836, "step": 6068, "time_per_iteration": 4.258959531784058 }, { "auxiliary_loss_clip": 0.01479125, "auxiliary_loss_mlp": 0.0104269, "balance_loss_clip": 1.2944541, "balance_loss_mlp": 1.019122, "epoch": 0.3648880204419059, "flos": 17940144602880.0, "grad_norm": 2.2156814521262786, "language_loss": 0.75082046, "learning_rate": 2.9344429746396755e-06, "loss": 0.77603865, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.23583984, "step": 6069, "time_per_iteration": 2.811143636703491 }, { "auxiliary_loss_clip": 0.01476056, "auxiliary_loss_mlp": 0.01043196, "balance_loss_clip": 1.29200244, "balance_loss_mlp": 1.01937842, "epoch": 0.3649481436945739, "flos": 22648577673600.0, "grad_norm": 3.3868239640756355, "language_loss": 0.67693484, "learning_rate": 2.9340986181542945e-06, "loss": 0.70212734, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.23840332, "step": 6070, "time_per_iteration": 2.929018020629883 }, { "auxiliary_loss_clip": 0.01458485, "auxiliary_loss_mlp": 0.01038948, "balance_loss_clip": 1.28013706, "balance_loss_mlp": 1.01597595, "epoch": 0.36500826694724187, "flos": 21589556446080.0, "grad_norm": 1.6406391542435967, "language_loss": 0.75150836, "learning_rate": 2.9337542262477994e-06, "loss": 0.77648264, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22961426, "step": 6071, "time_per_iteration": 2.8490374088287354 }, { "auxiliary_loss_clip": 0.0147129, "auxiliary_loss_mlp": 0.01036383, "balance_loss_clip": 1.28992307, "balance_loss_mlp": 1.01285148, "epoch": 0.36506839019990983, "flos": 13780623173760.0, "grad_norm": 1.650595082254123, "language_loss": 0.89443535, "learning_rate": 2.9334097989332506e-06, "loss": 0.91951203, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.2355957, "step": 6072, "time_per_iteration": 2.826725959777832 }, { "auxiliary_loss_clip": 0.01464296, "auxiliary_loss_mlp": 0.0103853, "balance_loss_clip": 1.28349233, "balance_loss_mlp": 1.01611853, "epoch": 0.3651285134525778, "flos": 17283920999040.0, "grad_norm": 2.0714057345681622, "language_loss": 0.74107492, "learning_rate": 2.9330653362237094e-06, "loss": 0.76610321, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.22399902, "step": 6073, "time_per_iteration": 2.8510191440582275 }, { "auxiliary_loss_clip": 0.0149006, "auxiliary_loss_mlp": 0.01047065, "balance_loss_clip": 1.30467105, "balance_loss_mlp": 1.02195978, "epoch": 0.36518863670524576, "flos": 21917962339200.0, "grad_norm": 6.355038488664238, "language_loss": 0.67629021, "learning_rate": 2.932720838132236e-06, "loss": 0.70166147, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.25109863, "step": 6074, "time_per_iteration": 2.8272197246551514 }, { "auxiliary_loss_clip": 0.01465034, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.28464282, "balance_loss_mlp": 1.01500738, "epoch": 0.3652487599579137, "flos": 27132659187840.0, "grad_norm": 1.6526563472596172, "language_loss": 0.73647147, "learning_rate": 2.9323763046718954e-06, "loss": 0.76150042, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22851562, "step": 6075, "time_per_iteration": 2.906740188598633 }, { "auxiliary_loss_clip": 0.0148401, "auxiliary_loss_mlp": 0.01044115, "balance_loss_clip": 1.29781389, "balance_loss_mlp": 1.02023721, "epoch": 0.3653088832105817, "flos": 19765054126080.0, "grad_norm": 2.6624526058001883, "language_loss": 0.9053483, "learning_rate": 2.9320317358557524e-06, "loss": 0.93062955, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23876953, "step": 6076, "time_per_iteration": 2.8243727684020996 }, { "auxiliary_loss_clip": 0.01467436, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.28710794, "balance_loss_mlp": 1.01456833, "epoch": 0.36536900646324966, "flos": 13122725512320.0, "grad_norm": 1.8572592422497634, "language_loss": 0.71577537, "learning_rate": 2.931687131696872e-06, "loss": 0.74082422, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22900391, "step": 6077, "time_per_iteration": 2.8374552726745605 }, { "auxiliary_loss_clip": 0.0124919, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.14271927, "balance_loss_mlp": 1.00559199, "epoch": 0.3654291297159176, "flos": 71135193778560.0, "grad_norm": 0.7393010657666114, "language_loss": 0.61834848, "learning_rate": 2.9313424922083224e-06, "loss": 0.64107466, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.17871094, "step": 6078, "time_per_iteration": 3.4477672576904297 }, { "auxiliary_loss_clip": 0.014695, "auxiliary_loss_mlp": 0.01037812, "balance_loss_clip": 1.28728557, "balance_loss_mlp": 1.01470923, "epoch": 0.3654892529685856, "flos": 23626510940160.0, "grad_norm": 2.0505189947443094, "language_loss": 0.79122794, "learning_rate": 2.930997817403173e-06, "loss": 0.81630111, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23095703, "step": 6079, "time_per_iteration": 2.837953805923462 }, { "auxiliary_loss_clip": 0.01480574, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.29653478, "balance_loss_mlp": 1.01628971, "epoch": 0.36554937622125355, "flos": 43486298974080.0, "grad_norm": 2.3256870484318033, "language_loss": 0.63427025, "learning_rate": 2.9306531072944913e-06, "loss": 0.65947139, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.2322998, "step": 6080, "time_per_iteration": 3.0391178131103516 }, { "auxiliary_loss_clip": 0.01491874, "auxiliary_loss_mlp": 0.01044809, "balance_loss_clip": 1.3063395, "balance_loss_mlp": 1.01998997, "epoch": 0.3656094994739215, "flos": 23305027501440.0, "grad_norm": 2.1714781034600503, "language_loss": 0.69401622, "learning_rate": 2.930308361895352e-06, "loss": 0.71938312, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.24816895, "step": 6081, "time_per_iteration": 2.8528311252593994 }, { "auxiliary_loss_clip": 0.01506484, "auxiliary_loss_mlp": 0.01047743, "balance_loss_clip": 1.3152777, "balance_loss_mlp": 1.02405643, "epoch": 0.3656696227265895, "flos": 24582789930240.0, "grad_norm": 2.277096047306174, "language_loss": 0.75547802, "learning_rate": 2.9299635812188257e-06, "loss": 0.78102028, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.23706055, "step": 6082, "time_per_iteration": 2.9119865894317627 }, { "auxiliary_loss_clip": 0.01481483, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.29745471, "balance_loss_mlp": 1.01948535, "epoch": 0.3657297459792575, "flos": 27939973737600.0, "grad_norm": 1.648029347850574, "language_loss": 0.83408499, "learning_rate": 2.929618765277987e-06, "loss": 0.85931504, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.22009277, "step": 6083, "time_per_iteration": 2.8939313888549805 }, { "auxiliary_loss_clip": 0.01254201, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 1.15137053, "balance_loss_mlp": 1.01121545, "epoch": 0.36578986923192547, "flos": 67420466426880.0, "grad_norm": 0.8191703717760468, "language_loss": 0.59444797, "learning_rate": 2.9292739140859125e-06, "loss": 0.61725658, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.15429688, "step": 6084, "time_per_iteration": 3.4789204597473145 }, { "auxiliary_loss_clip": 0.01471946, "auxiliary_loss_mlp": 0.01039456, "balance_loss_clip": 1.29062939, "balance_loss_mlp": 1.01710427, "epoch": 0.36584999248459343, "flos": 20236497390720.0, "grad_norm": 2.013644125550603, "language_loss": 0.73285282, "learning_rate": 2.9289290276556767e-06, "loss": 0.75796688, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22338867, "step": 6085, "time_per_iteration": 2.8460867404937744 }, { "auxiliary_loss_clip": 0.01481511, "auxiliary_loss_mlp": 0.01043025, "balance_loss_clip": 1.29709053, "balance_loss_mlp": 1.0207926, "epoch": 0.3659101157372614, "flos": 19071793013760.0, "grad_norm": 1.8604244251289053, "language_loss": 0.79172313, "learning_rate": 2.9285841060003604e-06, "loss": 0.81696844, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.22229004, "step": 6086, "time_per_iteration": 2.817906618118286 }, { "auxiliary_loss_clip": 0.01464198, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.28603935, "balance_loss_mlp": 1.01900983, "epoch": 0.36597023898992936, "flos": 30823044837120.0, "grad_norm": 2.3537324566808495, "language_loss": 0.78043139, "learning_rate": 2.9282391491330416e-06, "loss": 0.8054806, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21716309, "step": 6087, "time_per_iteration": 2.9251058101654053 }, { "auxiliary_loss_clip": 0.01483786, "auxiliary_loss_mlp": 0.01041496, "balance_loss_clip": 1.29643345, "balance_loss_mlp": 1.01821446, "epoch": 0.36603036224259733, "flos": 20531485359360.0, "grad_norm": 2.043838675917153, "language_loss": 0.71894681, "learning_rate": 2.9278941570668002e-06, "loss": 0.74419963, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.23291016, "step": 6088, "time_per_iteration": 2.8366916179656982 }, { "auxiliary_loss_clip": 0.01509634, "auxiliary_loss_mlp": 0.01041359, "balance_loss_clip": 1.31551826, "balance_loss_mlp": 1.01733875, "epoch": 0.3660904854952653, "flos": 38344365043200.0, "grad_norm": 1.5843145128747842, "language_loss": 0.80418748, "learning_rate": 2.92754912981472e-06, "loss": 0.82969737, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 1.94238281, "router_z_loss_mlp": 0.24023438, "step": 6089, "time_per_iteration": 3.0262606143951416 }, { "auxiliary_loss_clip": 0.01468672, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.28809226, "balance_loss_mlp": 1.01566613, "epoch": 0.36615060874793326, "flos": 21845651869440.0, "grad_norm": 1.7674714646462293, "language_loss": 0.71823144, "learning_rate": 2.927204067389884e-06, "loss": 0.74329835, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22351074, "step": 6090, "time_per_iteration": 2.874279499053955 }, { "auxiliary_loss_clip": 0.01467134, "auxiliary_loss_mlp": 0.01043369, "balance_loss_clip": 1.28846288, "balance_loss_mlp": 1.02163768, "epoch": 0.3662107320006012, "flos": 16590252683520.0, "grad_norm": 2.1299366959225514, "language_loss": 0.74878818, "learning_rate": 2.9268589698053763e-06, "loss": 0.77389318, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21728516, "step": 6091, "time_per_iteration": 2.820748805999756 }, { "auxiliary_loss_clip": 0.01490347, "auxiliary_loss_mlp": 0.01041478, "balance_loss_clip": 1.3062166, "balance_loss_mlp": 1.01800525, "epoch": 0.3662708552532692, "flos": 20967836641920.0, "grad_norm": 1.922906075692372, "language_loss": 0.74037707, "learning_rate": 2.926513837074284e-06, "loss": 0.76569533, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.23498535, "step": 6092, "time_per_iteration": 2.8433773517608643 }, { "auxiliary_loss_clip": 0.01487833, "auxiliary_loss_mlp": 0.01045885, "balance_loss_clip": 1.30095446, "balance_loss_mlp": 1.02297354, "epoch": 0.36633097850593715, "flos": 21911311353600.0, "grad_norm": 2.5271108205858575, "language_loss": 0.79188573, "learning_rate": 2.9261686692096942e-06, "loss": 0.81722295, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.22912598, "step": 6093, "time_per_iteration": 2.818272590637207 }, { "auxiliary_loss_clip": 0.01483586, "auxiliary_loss_mlp": 0.01042205, "balance_loss_clip": 1.29771066, "balance_loss_mlp": 1.0184468, "epoch": 0.3663911017586051, "flos": 32867057520000.0, "grad_norm": 4.814153986388782, "language_loss": 0.75186938, "learning_rate": 2.925823466224696e-06, "loss": 0.77712727, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.23754883, "step": 6094, "time_per_iteration": 4.345716714859009 }, { "auxiliary_loss_clip": 0.0147772, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.29248476, "balance_loss_mlp": 1.01641309, "epoch": 0.3664512250112731, "flos": 27283207196160.0, "grad_norm": 1.8627425238640913, "language_loss": 0.80273056, "learning_rate": 2.9254782281323785e-06, "loss": 0.82791531, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.24353027, "step": 6095, "time_per_iteration": 2.905019998550415 }, { "auxiliary_loss_clip": 0.01498429, "auxiliary_loss_mlp": 0.01040843, "balance_loss_clip": 1.31050277, "balance_loss_mlp": 1.01617873, "epoch": 0.3665113482639411, "flos": 17793170933760.0, "grad_norm": 2.9713807511714267, "language_loss": 0.74826097, "learning_rate": 2.925132954945834e-06, "loss": 0.77365375, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.24682617, "step": 6096, "time_per_iteration": 2.9319705963134766 }, { "auxiliary_loss_clip": 0.01482053, "auxiliary_loss_mlp": 0.0103868, "balance_loss_clip": 1.29476893, "balance_loss_mlp": 1.01579225, "epoch": 0.36657147151660907, "flos": 27865989210240.0, "grad_norm": 2.8295043118273093, "language_loss": 0.68195665, "learning_rate": 2.924787646678155e-06, "loss": 0.70716405, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.22888184, "step": 6097, "time_per_iteration": 2.8986992835998535 }, { "auxiliary_loss_clip": 0.01486401, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.30197597, "balance_loss_mlp": 1.01680827, "epoch": 0.36663159476927704, "flos": 25384856083200.0, "grad_norm": 1.664446240658449, "language_loss": 0.78559554, "learning_rate": 2.9244423033424365e-06, "loss": 0.81087065, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.24291992, "step": 6098, "time_per_iteration": 2.922243118286133 }, { "auxiliary_loss_clip": 0.0147617, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.29466474, "balance_loss_mlp": 1.01602042, "epoch": 0.366691718021945, "flos": 21366155030400.0, "grad_norm": 3.9109827466188194, "language_loss": 0.74436277, "learning_rate": 2.9240969249517723e-06, "loss": 0.76951104, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22631836, "step": 6099, "time_per_iteration": 4.374940395355225 }, { "auxiliary_loss_clip": 0.01466271, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.28643334, "balance_loss_mlp": 1.01768398, "epoch": 0.36675184127461297, "flos": 16809129619200.0, "grad_norm": 1.777181724547024, "language_loss": 0.85892975, "learning_rate": 2.9237515115192602e-06, "loss": 0.88400024, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.23071289, "step": 6100, "time_per_iteration": 4.255775690078735 }, { "auxiliary_loss_clip": 0.01499709, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.31027234, "balance_loss_mlp": 1.0159806, "epoch": 0.36681196452728093, "flos": 21916107302400.0, "grad_norm": 1.5962772499557765, "language_loss": 0.7202062, "learning_rate": 2.9234060630579992e-06, "loss": 0.74558413, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.22106934, "step": 6101, "time_per_iteration": 2.855139970779419 }, { "auxiliary_loss_clip": 0.01482895, "auxiliary_loss_mlp": 0.0104219, "balance_loss_clip": 1.29658461, "balance_loss_mlp": 1.01831269, "epoch": 0.3668720877799489, "flos": 17721403401600.0, "grad_norm": 2.8292541672192533, "language_loss": 0.77291501, "learning_rate": 2.9230605795810865e-06, "loss": 0.79816586, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.23876953, "step": 6102, "time_per_iteration": 2.7939679622650146 }, { "auxiliary_loss_clip": 0.01504232, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.3138901, "balance_loss_mlp": 1.02144659, "epoch": 0.36693221103261686, "flos": 47061635800320.0, "grad_norm": 1.474537139716108, "language_loss": 0.71099162, "learning_rate": 2.922715061101625e-06, "loss": 0.73649085, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.24243164, "step": 6103, "time_per_iteration": 4.444256544113159 }, { "auxiliary_loss_clip": 0.01481409, "auxiliary_loss_mlp": 0.01043468, "balance_loss_clip": 1.29537439, "balance_loss_mlp": 1.01966178, "epoch": 0.3669923342852848, "flos": 15969392530560.0, "grad_norm": 1.9420866401388122, "language_loss": 0.72913063, "learning_rate": 2.922369507632716e-06, "loss": 0.75437945, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.23791504, "step": 6104, "time_per_iteration": 2.8346498012542725 }, { "auxiliary_loss_clip": 0.01479757, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.29479742, "balance_loss_mlp": 1.01887107, "epoch": 0.3670524575379528, "flos": 19984021551360.0, "grad_norm": 2.1009134905568656, "language_loss": 0.82646203, "learning_rate": 2.9220239191874617e-06, "loss": 0.85168505, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.23681641, "step": 6105, "time_per_iteration": 2.835418939590454 }, { "auxiliary_loss_clip": 0.01502148, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.31082094, "balance_loss_mlp": 1.02041507, "epoch": 0.36711258079062076, "flos": 25714031137920.0, "grad_norm": 5.173844524264317, "language_loss": 0.81124353, "learning_rate": 2.9216782957789692e-06, "loss": 0.83671224, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.24316406, "step": 6106, "time_per_iteration": 2.888395071029663 }, { "auxiliary_loss_clip": 0.01254585, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.14627671, "balance_loss_mlp": 1.01712406, "epoch": 0.3671727040432887, "flos": 60804198817920.0, "grad_norm": 0.6906588824839662, "language_loss": 0.59263891, "learning_rate": 2.9213326374203426e-06, "loss": 0.61553532, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.1796875, "step": 6107, "time_per_iteration": 3.437012195587158 }, { "auxiliary_loss_clip": 0.01471422, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.29004145, "balance_loss_mlp": 1.01773953, "epoch": 0.3672328272959567, "flos": 18670578958080.0, "grad_norm": 1.5179904414639913, "language_loss": 0.75278032, "learning_rate": 2.92098694412469e-06, "loss": 0.7778933, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.22131348, "step": 6108, "time_per_iteration": 2.881842851638794 }, { "auxiliary_loss_clip": 0.01493263, "auxiliary_loss_mlp": 0.01040638, "balance_loss_clip": 1.30627275, "balance_loss_mlp": 1.0179646, "epoch": 0.3672929505486247, "flos": 15057344972160.0, "grad_norm": 2.3436450650063643, "language_loss": 0.74807531, "learning_rate": 2.9206412159051213e-06, "loss": 0.77341437, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.2265625, "step": 6109, "time_per_iteration": 2.8985915184020996 }, { "auxiliary_loss_clip": 0.01475288, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.2921952, "balance_loss_mlp": 1.01516628, "epoch": 0.3673530738012927, "flos": 20597642536320.0, "grad_norm": 2.7194641725695208, "language_loss": 0.53773522, "learning_rate": 2.920295452774744e-06, "loss": 0.56285554, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.21582031, "step": 6110, "time_per_iteration": 2.9379734992980957 }, { "auxiliary_loss_clip": 0.01476347, "auxiliary_loss_mlp": 0.01046857, "balance_loss_clip": 1.29558563, "balance_loss_mlp": 1.02314615, "epoch": 0.36741319705396064, "flos": 21699945054720.0, "grad_norm": 1.5529081773761995, "language_loss": 0.81270838, "learning_rate": 2.919949654746672e-06, "loss": 0.83794045, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.23718262, "step": 6111, "time_per_iteration": 2.8541338443756104 }, { "auxiliary_loss_clip": 0.01468314, "auxiliary_loss_mlp": 0.01040414, "balance_loss_clip": 1.28768396, "balance_loss_mlp": 1.01808631, "epoch": 0.3674733203066286, "flos": 29874321728640.0, "grad_norm": 1.6617354929515382, "language_loss": 0.72951168, "learning_rate": 2.9196038218340163e-06, "loss": 0.75459892, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.2232666, "step": 6112, "time_per_iteration": 2.961596727371216 }, { "auxiliary_loss_clip": 0.01468343, "auxiliary_loss_mlp": 0.01040522, "balance_loss_clip": 1.2868365, "balance_loss_mlp": 1.01783609, "epoch": 0.36753344355929657, "flos": 18265428604800.0, "grad_norm": 1.8096496584819008, "language_loss": 0.85382366, "learning_rate": 2.919257954049892e-06, "loss": 0.87891233, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22680664, "step": 6113, "time_per_iteration": 2.918994903564453 }, { "auxiliary_loss_clip": 0.01484352, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.29831648, "balance_loss_mlp": 1.01520908, "epoch": 0.36759356681196453, "flos": 25312002675840.0, "grad_norm": 1.7822425853438681, "language_loss": 0.79698771, "learning_rate": 2.918912051407413e-06, "loss": 0.82221675, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23352051, "step": 6114, "time_per_iteration": 2.889345169067383 }, { "auxiliary_loss_clip": 0.01488052, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.29990172, "balance_loss_mlp": 1.01833725, "epoch": 0.3676536900646325, "flos": 21042861799680.0, "grad_norm": 1.8163666979231712, "language_loss": 0.67991114, "learning_rate": 2.918566113919698e-06, "loss": 0.70522237, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.24719238, "step": 6115, "time_per_iteration": 2.8165950775146484 }, { "auxiliary_loss_clip": 0.0146637, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.28630054, "balance_loss_mlp": 1.01466596, "epoch": 0.36771381331730046, "flos": 16296848282880.0, "grad_norm": 2.564451650831288, "language_loss": 0.77520978, "learning_rate": 2.9182201415998636e-06, "loss": 0.80024987, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22949219, "step": 6116, "time_per_iteration": 2.849198818206787 }, { "auxiliary_loss_clip": 0.01482487, "auxiliary_loss_mlp": 0.01050984, "balance_loss_clip": 1.29676878, "balance_loss_mlp": 1.02832294, "epoch": 0.36777393656996843, "flos": 22320217025280.0, "grad_norm": 1.9326075307191937, "language_loss": 0.63541031, "learning_rate": 2.9178741344610286e-06, "loss": 0.66074502, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22668457, "step": 6117, "time_per_iteration": 2.817072868347168 }, { "auxiliary_loss_clip": 0.01469643, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.28871989, "balance_loss_mlp": 1.02075672, "epoch": 0.3678340598226364, "flos": 26845453324800.0, "grad_norm": 1.7988275151676472, "language_loss": 0.74282086, "learning_rate": 2.9175280925163156e-06, "loss": 0.76794529, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.22033691, "step": 6118, "time_per_iteration": 2.87461519241333 }, { "auxiliary_loss_clip": 0.01492968, "auxiliary_loss_mlp": 0.01046769, "balance_loss_clip": 1.3037976, "balance_loss_mlp": 1.02214074, "epoch": 0.36789418307530436, "flos": 21771576852480.0, "grad_norm": 1.6055185409422474, "language_loss": 0.7311554, "learning_rate": 2.9171820157788445e-06, "loss": 0.75655276, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.24633789, "step": 6119, "time_per_iteration": 2.8904404640197754 }, { "auxiliary_loss_clip": 0.01481443, "auxiliary_loss_mlp": 0.0104511, "balance_loss_clip": 1.29847813, "balance_loss_mlp": 1.02199507, "epoch": 0.3679543063279723, "flos": 15932581246080.0, "grad_norm": 1.7860672992474638, "language_loss": 0.81593716, "learning_rate": 2.9168359042617404e-06, "loss": 0.84120274, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.23120117, "step": 6120, "time_per_iteration": 2.80971622467041 }, { "auxiliary_loss_clip": 0.01477064, "auxiliary_loss_mlp": 0.01051191, "balance_loss_clip": 1.29390478, "balance_loss_mlp": 1.02855277, "epoch": 0.3680144295806403, "flos": 24285087273600.0, "grad_norm": 2.5607043876292632, "language_loss": 0.65166867, "learning_rate": 2.916489757978126e-06, "loss": 0.67695117, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22631836, "step": 6121, "time_per_iteration": 2.896505355834961 }, { "auxiliary_loss_clip": 0.01476395, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.29389739, "balance_loss_mlp": 1.02954304, "epoch": 0.36807455283330826, "flos": 26115290438400.0, "grad_norm": 2.094244634220413, "language_loss": 0.72165358, "learning_rate": 2.9161435769411286e-06, "loss": 0.74694109, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.22827148, "step": 6122, "time_per_iteration": 2.8586266040802 }, { "auxiliary_loss_clip": 0.01461592, "auxiliary_loss_mlp": 0.01048838, "balance_loss_clip": 1.28337455, "balance_loss_mlp": 1.02631903, "epoch": 0.3681346760859763, "flos": 24655824316800.0, "grad_norm": 1.7472504122657813, "language_loss": 0.70330942, "learning_rate": 2.915797361163875e-06, "loss": 0.7284137, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.22521973, "step": 6123, "time_per_iteration": 2.8912243843078613 }, { "auxiliary_loss_clip": 0.01496385, "auxiliary_loss_mlp": 0.0105226, "balance_loss_clip": 1.3077116, "balance_loss_mlp": 1.0286448, "epoch": 0.36819479933864424, "flos": 23888895390720.0, "grad_norm": 3.3791018998367566, "language_loss": 0.75047755, "learning_rate": 2.9154511106594933e-06, "loss": 0.77596402, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.23620605, "step": 6124, "time_per_iteration": 2.8506994247436523 }, { "auxiliary_loss_clip": 0.01482104, "auxiliary_loss_mlp": 0.01055036, "balance_loss_clip": 1.29677916, "balance_loss_mlp": 1.03106332, "epoch": 0.3682549225913122, "flos": 25564252291200.0, "grad_norm": 2.1740603969701957, "language_loss": 0.75427401, "learning_rate": 2.915104825441114e-06, "loss": 0.77964544, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.23986816, "step": 6125, "time_per_iteration": 2.8641669750213623 }, { "auxiliary_loss_clip": 0.01499679, "auxiliary_loss_mlp": 0.01050808, "balance_loss_clip": 1.31134629, "balance_loss_mlp": 1.02589321, "epoch": 0.36831504584398017, "flos": 16955469861120.0, "grad_norm": 1.8557380661578575, "language_loss": 0.7941553, "learning_rate": 2.9147585055218686e-06, "loss": 0.81966019, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.24914551, "step": 6126, "time_per_iteration": 2.821234941482544 }, { "auxiliary_loss_clip": 0.01495913, "auxiliary_loss_mlp": 0.01048656, "balance_loss_clip": 1.30525744, "balance_loss_mlp": 1.02402711, "epoch": 0.36837516909664814, "flos": 19874583083520.0, "grad_norm": 2.287705119854965, "language_loss": 0.67334235, "learning_rate": 2.914412150914888e-06, "loss": 0.69878805, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.24645996, "step": 6127, "time_per_iteration": 2.9021427631378174 }, { "auxiliary_loss_clip": 0.01492813, "auxiliary_loss_mlp": 0.01051451, "balance_loss_clip": 1.30528641, "balance_loss_mlp": 1.02657187, "epoch": 0.3684352923493161, "flos": 37639385527680.0, "grad_norm": 2.0236871871878908, "language_loss": 0.71451163, "learning_rate": 2.9140657616333074e-06, "loss": 0.73995429, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.2487793, "step": 6128, "time_per_iteration": 3.0019876956939697 }, { "auxiliary_loss_clip": 0.01473872, "auxiliary_loss_mlp": 0.01046315, "balance_loss_clip": 1.28975415, "balance_loss_mlp": 1.02217543, "epoch": 0.36849541560198407, "flos": 14473703306880.0, "grad_norm": 1.9062124232330893, "language_loss": 0.76246905, "learning_rate": 2.9137193376902614e-06, "loss": 0.78767091, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.24133301, "step": 6129, "time_per_iteration": 4.250041484832764 }, { "auxiliary_loss_clip": 0.01483275, "auxiliary_loss_mlp": 0.01041212, "balance_loss_clip": 1.29745889, "balance_loss_mlp": 1.01697719, "epoch": 0.36855553885465203, "flos": 25780505028480.0, "grad_norm": 1.573525590842421, "language_loss": 0.85355967, "learning_rate": 2.9133728790988868e-06, "loss": 0.87880456, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.24230957, "step": 6130, "time_per_iteration": 2.8736064434051514 }, { "auxiliary_loss_clip": 0.01280268, "auxiliary_loss_mlp": 0.01040689, "balance_loss_clip": 1.16259313, "balance_loss_mlp": 1.01770592, "epoch": 0.36861566210732, "flos": 65084017466880.0, "grad_norm": 0.810176330522543, "language_loss": 0.60288447, "learning_rate": 2.913026385872321e-06, "loss": 0.62609404, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.22949219, "step": 6131, "time_per_iteration": 3.465580463409424 }, { "auxiliary_loss_clip": 0.01467133, "auxiliary_loss_mlp": 0.01040503, "balance_loss_clip": 1.28456676, "balance_loss_mlp": 1.01700711, "epoch": 0.36867578535998796, "flos": 30966308432640.0, "grad_norm": 1.7846892854947753, "language_loss": 0.7381863, "learning_rate": 2.9126798580237034e-06, "loss": 0.76326269, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.23510742, "step": 6132, "time_per_iteration": 2.931349754333496 }, { "auxiliary_loss_clip": 0.0151051, "auxiliary_loss_mlp": 0.0105053, "balance_loss_clip": 1.31787062, "balance_loss_mlp": 1.02648592, "epoch": 0.3687359086126559, "flos": 28849125628800.0, "grad_norm": 1.956673245535813, "language_loss": 0.74875855, "learning_rate": 2.9123332955661736e-06, "loss": 0.774369, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 1.92675781, "router_z_loss_mlp": 0.24072266, "step": 6133, "time_per_iteration": 2.906123161315918 }, { "auxiliary_loss_clip": 0.01460194, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.28021598, "balance_loss_mlp": 1.02048564, "epoch": 0.3687960318653239, "flos": 21406540654080.0, "grad_norm": 2.737547403012392, "language_loss": 0.72600615, "learning_rate": 2.911986698512874e-06, "loss": 0.75105304, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.24023438, "step": 6134, "time_per_iteration": 2.922503709793091 }, { "auxiliary_loss_clip": 0.01477209, "auxiliary_loss_mlp": 0.01044624, "balance_loss_clip": 1.29377532, "balance_loss_mlp": 1.02164102, "epoch": 0.36885615511799186, "flos": 20275570915200.0, "grad_norm": 41.4552009670501, "language_loss": 0.75773656, "learning_rate": 2.9116400668769477e-06, "loss": 0.78295493, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.22973633, "step": 6135, "time_per_iteration": 5.69055438041687 }, { "auxiliary_loss_clip": 0.01286783, "auxiliary_loss_mlp": 0.01025564, "balance_loss_clip": 1.17026424, "balance_loss_mlp": 1.00486934, "epoch": 0.3689162783706599, "flos": 63115962099840.0, "grad_norm": 0.8316487855638516, "language_loss": 0.58884859, "learning_rate": 2.9112934006715376e-06, "loss": 0.61197203, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.20703125, "step": 6136, "time_per_iteration": 3.3092172145843506 }, { "auxiliary_loss_clip": 0.01470881, "auxiliary_loss_mlp": 0.01050511, "balance_loss_clip": 1.28740335, "balance_loss_mlp": 1.0260613, "epoch": 0.36897640162332784, "flos": 10969048137600.0, "grad_norm": 2.150763428173469, "language_loss": 0.80162644, "learning_rate": 2.9109466999097918e-06, "loss": 0.8268404, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.24462891, "step": 6137, "time_per_iteration": 2.8111350536346436 }, { "auxiliary_loss_clip": 0.01473273, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.29058719, "balance_loss_mlp": 1.02708769, "epoch": 0.3690365248759958, "flos": 20714048703360.0, "grad_norm": 1.876863393325964, "language_loss": 0.75020713, "learning_rate": 2.9105999646048552e-06, "loss": 0.77544743, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.23657227, "step": 6138, "time_per_iteration": 4.339171648025513 }, { "auxiliary_loss_clip": 0.01480315, "auxiliary_loss_mlp": 0.01051351, "balance_loss_clip": 1.29255569, "balance_loss_mlp": 1.0272944, "epoch": 0.3690966481286638, "flos": 31837427429760.0, "grad_norm": 2.4379030167686486, "language_loss": 0.66788483, "learning_rate": 2.9102531947698764e-06, "loss": 0.69320154, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.24072266, "step": 6139, "time_per_iteration": 2.9106504917144775 }, { "auxiliary_loss_clip": 0.01460106, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.27983975, "balance_loss_mlp": 1.02619529, "epoch": 0.36915677138133174, "flos": 13122635022720.0, "grad_norm": 2.014060410433049, "language_loss": 0.71779168, "learning_rate": 2.909906390418006e-06, "loss": 0.74290079, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.24597168, "step": 6140, "time_per_iteration": 2.9279351234436035 }, { "auxiliary_loss_clip": 0.01282059, "auxiliary_loss_mlp": 0.01048542, "balance_loss_clip": 1.1667881, "balance_loss_mlp": 1.02574873, "epoch": 0.3692168946339997, "flos": 68719539156480.0, "grad_norm": 0.7533367682662114, "language_loss": 0.59386629, "learning_rate": 2.9095595515623934e-06, "loss": 0.6171723, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.22753906, "step": 6141, "time_per_iteration": 3.431361198425293 }, { "auxiliary_loss_clip": 0.01479791, "auxiliary_loss_mlp": 0.0104183, "balance_loss_clip": 1.29563761, "balance_loss_mlp": 1.01839328, "epoch": 0.36927701788666767, "flos": 22027988989440.0, "grad_norm": 1.6000270253544508, "language_loss": 0.76054627, "learning_rate": 2.909212678216192e-06, "loss": 0.78576249, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.234375, "step": 6142, "time_per_iteration": 2.8373348712921143 }, { "auxiliary_loss_clip": 0.01461456, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.27944684, "balance_loss_mlp": 1.02358699, "epoch": 0.36933714113933563, "flos": 21845697114240.0, "grad_norm": 1.8338464152562803, "language_loss": 0.77267879, "learning_rate": 2.908865770392555e-06, "loss": 0.79775929, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23010254, "step": 6143, "time_per_iteration": 2.8635764122009277 }, { "auxiliary_loss_clip": 0.01469924, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.2875278, "balance_loss_mlp": 1.01631391, "epoch": 0.3693972643920036, "flos": 23701083649920.0, "grad_norm": 1.9196779693834203, "language_loss": 0.82645273, "learning_rate": 2.9085188281046364e-06, "loss": 0.85154355, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.22839355, "step": 6144, "time_per_iteration": 2.866054058074951 }, { "auxiliary_loss_clip": 0.01472124, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.28674436, "balance_loss_mlp": 1.0229634, "epoch": 0.36945738764467156, "flos": 22867002161280.0, "grad_norm": 2.1112319876498615, "language_loss": 0.7868005, "learning_rate": 2.908171851365593e-06, "loss": 0.81198412, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.23266602, "step": 6145, "time_per_iteration": 2.8703463077545166 }, { "auxiliary_loss_clip": 0.01470397, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.28414559, "balance_loss_mlp": 1.01802659, "epoch": 0.36951751089733953, "flos": 16624123056000.0, "grad_norm": 2.3622876489875466, "language_loss": 0.77778411, "learning_rate": 2.9078248401885815e-06, "loss": 0.80291867, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.25036621, "step": 6146, "time_per_iteration": 2.8842873573303223 }, { "auxiliary_loss_clip": 0.0148833, "auxiliary_loss_mlp": 0.01052409, "balance_loss_clip": 1.3007381, "balance_loss_mlp": 1.02832842, "epoch": 0.3695776341500075, "flos": 18923326266240.0, "grad_norm": 1.6700099551924126, "language_loss": 0.81471956, "learning_rate": 2.907477794586761e-06, "loss": 0.84012699, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.24084473, "step": 6147, "time_per_iteration": 2.8260788917541504 }, { "auxiliary_loss_clip": 0.01477842, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.29044998, "balance_loss_mlp": 1.02199578, "epoch": 0.36963775740267546, "flos": 20816881430400.0, "grad_norm": 1.7917875964956451, "language_loss": 0.84251684, "learning_rate": 2.9071307145732926e-06, "loss": 0.86776495, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.25, "step": 6148, "time_per_iteration": 2.8560986518859863 }, { "auxiliary_loss_clip": 0.01479921, "auxiliary_loss_mlp": 0.01059501, "balance_loss_clip": 1.29667211, "balance_loss_mlp": 1.03521836, "epoch": 0.3696978806553435, "flos": 26071511454720.0, "grad_norm": 1.9430812109723135, "language_loss": 0.75107992, "learning_rate": 2.9067836001613357e-06, "loss": 0.77647412, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.24291992, "step": 6149, "time_per_iteration": 2.877223014831543 }, { "auxiliary_loss_clip": 0.01488239, "auxiliary_loss_mlp": 0.01061173, "balance_loss_clip": 1.30025649, "balance_loss_mlp": 1.03413677, "epoch": 0.36975800390801145, "flos": 26845272345600.0, "grad_norm": 12.049238490939379, "language_loss": 0.72102016, "learning_rate": 2.906436451364054e-06, "loss": 0.74651432, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.27050781, "step": 6150, "time_per_iteration": 2.9266157150268555 }, { "auxiliary_loss_clip": 0.01484547, "auxiliary_loss_mlp": 0.01049723, "balance_loss_clip": 1.29911494, "balance_loss_mlp": 1.02643013, "epoch": 0.3698181271606794, "flos": 21152616981120.0, "grad_norm": 1.5273973720671126, "language_loss": 0.82401466, "learning_rate": 2.906089268194611e-06, "loss": 0.84935737, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.23278809, "step": 6151, "time_per_iteration": 2.834444046020508 }, { "auxiliary_loss_clip": 0.01265911, "auxiliary_loss_mlp": 0.0104613, "balance_loss_clip": 1.15208375, "balance_loss_mlp": 1.02476811, "epoch": 0.3698782504133474, "flos": 66772070173440.0, "grad_norm": 0.792460423193385, "language_loss": 0.63163978, "learning_rate": 2.9057420506661726e-06, "loss": 0.65476024, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.21386719, "step": 6152, "time_per_iteration": 3.4524428844451904 }, { "auxiliary_loss_clip": 0.01464254, "auxiliary_loss_mlp": 0.0104403, "balance_loss_clip": 1.28494549, "balance_loss_mlp": 1.02118945, "epoch": 0.36993837366601534, "flos": 24320948417280.0, "grad_norm": 2.8054195003551627, "language_loss": 0.7078284, "learning_rate": 2.9053947987919044e-06, "loss": 0.73291123, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22839355, "step": 6153, "time_per_iteration": 2.901646137237549 }, { "auxiliary_loss_clip": 0.01487248, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.29969525, "balance_loss_mlp": 1.01834846, "epoch": 0.3699984969186833, "flos": 24358936066560.0, "grad_norm": 1.7397056766040822, "language_loss": 0.73307765, "learning_rate": 2.9050475125849755e-06, "loss": 0.75837314, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.23962402, "step": 6154, "time_per_iteration": 2.888070821762085 }, { "auxiliary_loss_clip": 0.01491631, "auxiliary_loss_mlp": 0.01042152, "balance_loss_clip": 1.30499113, "balance_loss_mlp": 1.0187993, "epoch": 0.37005862017135127, "flos": 19838812429440.0, "grad_norm": 2.1119159318500547, "language_loss": 0.68424183, "learning_rate": 2.9047001920585534e-06, "loss": 0.70957971, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.23376465, "step": 6155, "time_per_iteration": 2.8335988521575928 }, { "auxiliary_loss_clip": 0.01484952, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.29896903, "balance_loss_mlp": 1.01708007, "epoch": 0.37011874342401924, "flos": 19583395678080.0, "grad_norm": 1.8246113844097485, "language_loss": 0.68883437, "learning_rate": 2.9043528372258097e-06, "loss": 0.71408749, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23278809, "step": 6156, "time_per_iteration": 2.865536689758301 }, { "auxiliary_loss_clip": 0.0147291, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.29064059, "balance_loss_mlp": 1.01558423, "epoch": 0.3701788666766872, "flos": 20383606794240.0, "grad_norm": 2.109264771324506, "language_loss": 0.82412326, "learning_rate": 2.904005448099916e-06, "loss": 0.84923792, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.22961426, "step": 6157, "time_per_iteration": 2.8589301109313965 }, { "auxiliary_loss_clip": 0.01500458, "auxiliary_loss_mlp": 0.01047792, "balance_loss_clip": 1.30767083, "balance_loss_mlp": 1.02328277, "epoch": 0.37023898992935517, "flos": 15349301539200.0, "grad_norm": 2.7438730371875044, "language_loss": 0.77665997, "learning_rate": 2.9036580246940444e-06, "loss": 0.80214244, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.24499512, "step": 6158, "time_per_iteration": 2.820526123046875 }, { "auxiliary_loss_clip": 0.01506114, "auxiliary_loss_mlp": 0.01041208, "balance_loss_clip": 1.31476188, "balance_loss_mlp": 1.01693702, "epoch": 0.37029911318202313, "flos": 19583576657280.0, "grad_norm": 5.490514350651382, "language_loss": 0.69832599, "learning_rate": 2.9033105670213708e-06, "loss": 0.72379923, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.24291992, "step": 6159, "time_per_iteration": 2.830812931060791 }, { "auxiliary_loss_clip": 0.01482631, "auxiliary_loss_mlp": 0.01045011, "balance_loss_clip": 1.29644871, "balance_loss_mlp": 1.02159834, "epoch": 0.3703592364346911, "flos": 26224457437440.0, "grad_norm": 3.109444447070503, "language_loss": 0.71774, "learning_rate": 2.9029630750950697e-06, "loss": 0.74301642, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23425293, "step": 6160, "time_per_iteration": 2.927227735519409 }, { "auxiliary_loss_clip": 0.01470143, "auxiliary_loss_mlp": 0.01037938, "balance_loss_clip": 1.28902268, "balance_loss_mlp": 1.01494288, "epoch": 0.37041935968735906, "flos": 20058096568320.0, "grad_norm": 1.7020736677870327, "language_loss": 0.79883558, "learning_rate": 2.9026155489283176e-06, "loss": 0.82391632, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.23010254, "step": 6161, "time_per_iteration": 2.844646692276001 }, { "auxiliary_loss_clip": 0.01485571, "auxiliary_loss_mlp": 0.01043153, "balance_loss_clip": 1.29836345, "balance_loss_mlp": 1.01956105, "epoch": 0.3704794829400271, "flos": 24144402631680.0, "grad_norm": 1.6523075584211366, "language_loss": 0.79853642, "learning_rate": 2.902267988534295e-06, "loss": 0.82382369, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.23608398, "step": 6162, "time_per_iteration": 2.913203001022339 }, { "auxiliary_loss_clip": 0.01485157, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.29939342, "balance_loss_mlp": 1.02050257, "epoch": 0.37053960619269505, "flos": 14875188831360.0, "grad_norm": 2.1644467725274, "language_loss": 0.80256826, "learning_rate": 2.9019203939261783e-06, "loss": 0.82785839, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.23376465, "step": 6163, "time_per_iteration": 2.8338193893432617 }, { "auxiliary_loss_clip": 0.01491985, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.30505979, "balance_loss_mlp": 1.01580262, "epoch": 0.370599729445363, "flos": 21371539161600.0, "grad_norm": 1.818074604580883, "language_loss": 0.69128847, "learning_rate": 2.9015727651171507e-06, "loss": 0.71659076, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22436523, "step": 6164, "time_per_iteration": 4.2895588874816895 }, { "auxiliary_loss_clip": 0.01490231, "auxiliary_loss_mlp": 0.01042409, "balance_loss_clip": 1.30309319, "balance_loss_mlp": 1.01904416, "epoch": 0.370659852698031, "flos": 26839119052800.0, "grad_norm": 2.1376517024201416, "language_loss": 0.83792478, "learning_rate": 2.9012251021203935e-06, "loss": 0.86325121, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.23376465, "step": 6165, "time_per_iteration": 2.9309298992156982 }, { "auxiliary_loss_clip": 0.01499703, "auxiliary_loss_mlp": 0.0104329, "balance_loss_clip": 1.30883145, "balance_loss_mlp": 1.01823187, "epoch": 0.37071997595069894, "flos": 19108378074240.0, "grad_norm": 1.9851063981824921, "language_loss": 0.69938815, "learning_rate": 2.9008774049490896e-06, "loss": 0.72481811, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.25073242, "step": 6166, "time_per_iteration": 2.8714940547943115 }, { "auxiliary_loss_clip": 0.01267523, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.16007531, "balance_loss_mlp": 1.03275955, "epoch": 0.3707800992033669, "flos": 52202366104320.0, "grad_norm": 0.7975691393019588, "language_loss": 0.57127386, "learning_rate": 2.9005296736164244e-06, "loss": 0.59449506, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.21875, "step": 6167, "time_per_iteration": 3.1985366344451904 }, { "auxiliary_loss_clip": 0.01481138, "auxiliary_loss_mlp": 0.01038059, "balance_loss_clip": 1.29744291, "balance_loss_mlp": 1.0148375, "epoch": 0.3708402224560349, "flos": 19911439612800.0, "grad_norm": 3.279379565318279, "language_loss": 0.7625128, "learning_rate": 2.900181908135584e-06, "loss": 0.78770483, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.23217773, "step": 6168, "time_per_iteration": 2.889181613922119 }, { "auxiliary_loss_clip": 0.01489708, "auxiliary_loss_mlp": 0.01039111, "balance_loss_clip": 1.30341697, "balance_loss_mlp": 1.01576996, "epoch": 0.37090034570870284, "flos": 20016308355840.0, "grad_norm": 1.7352618232866452, "language_loss": 0.74910963, "learning_rate": 2.899834108519755e-06, "loss": 0.77439779, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23339844, "step": 6169, "time_per_iteration": 2.951138734817505 }, { "auxiliary_loss_clip": 0.01483888, "auxiliary_loss_mlp": 0.01041237, "balance_loss_clip": 1.30023098, "balance_loss_mlp": 1.01799083, "epoch": 0.3709604689613708, "flos": 24145533751680.0, "grad_norm": 1.8149309976035821, "language_loss": 0.80442917, "learning_rate": 2.899486274782127e-06, "loss": 0.82968044, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23242188, "step": 6170, "time_per_iteration": 5.730466842651367 }, { "auxiliary_loss_clip": 0.0148506, "auxiliary_loss_mlp": 0.01043157, "balance_loss_clip": 1.29716897, "balance_loss_mlp": 1.01941109, "epoch": 0.37102059221403877, "flos": 23885999723520.0, "grad_norm": 2.9327156642793653, "language_loss": 0.77551508, "learning_rate": 2.8991384069358885e-06, "loss": 0.80079722, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.23754883, "step": 6171, "time_per_iteration": 2.887183904647827 }, { "auxiliary_loss_clip": 0.0149158, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.30446005, "balance_loss_mlp": 1.0183835, "epoch": 0.37108071546670673, "flos": 14509292981760.0, "grad_norm": 2.6305826895437696, "language_loss": 0.81225383, "learning_rate": 2.898790504994232e-06, "loss": 0.83758783, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.23449707, "step": 6172, "time_per_iteration": 2.834150552749634 }, { "auxiliary_loss_clip": 0.01506019, "auxiliary_loss_mlp": 0.01044323, "balance_loss_clip": 1.31650424, "balance_loss_mlp": 1.0211606, "epoch": 0.3711408387193747, "flos": 34574339266560.0, "grad_norm": 2.4048301281959006, "language_loss": 0.60245776, "learning_rate": 2.89844256897035e-06, "loss": 0.62796116, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.23168945, "step": 6173, "time_per_iteration": 4.437655448913574 }, { "auxiliary_loss_clip": 0.01498127, "auxiliary_loss_mlp": 0.01049834, "balance_loss_clip": 1.30935049, "balance_loss_mlp": 1.02601576, "epoch": 0.37120096197204266, "flos": 17319465429120.0, "grad_norm": 1.9950311302324908, "language_loss": 0.81942016, "learning_rate": 2.898094598877435e-06, "loss": 0.84489977, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.23840332, "step": 6174, "time_per_iteration": 2.8204665184020996 }, { "auxiliary_loss_clip": 0.01486478, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.30335474, "balance_loss_mlp": 1.02154303, "epoch": 0.37126108522471063, "flos": 30676161657600.0, "grad_norm": 2.2234879232751212, "language_loss": 0.80441749, "learning_rate": 2.8977465947286826e-06, "loss": 0.82971919, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.22155762, "step": 6175, "time_per_iteration": 2.929279327392578 }, { "auxiliary_loss_clip": 0.01490717, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.3054899, "balance_loss_mlp": 1.02520227, "epoch": 0.37132120847737865, "flos": 25166567329920.0, "grad_norm": 2.1870697543388817, "language_loss": 0.89341241, "learning_rate": 2.89739855653729e-06, "loss": 0.91879851, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.22680664, "step": 6176, "time_per_iteration": 2.886245012283325 }, { "auxiliary_loss_clip": 0.01498121, "auxiliary_loss_mlp": 0.01051239, "balance_loss_clip": 1.31185138, "balance_loss_mlp": 1.02841032, "epoch": 0.3713813317300466, "flos": 21222936679680.0, "grad_norm": 1.5024782705148505, "language_loss": 0.74299872, "learning_rate": 2.8970504843164546e-06, "loss": 0.76849228, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.22839355, "step": 6177, "time_per_iteration": 2.861433982849121 }, { "auxiliary_loss_clip": 0.0149371, "auxiliary_loss_mlp": 0.01054588, "balance_loss_clip": 1.30800986, "balance_loss_mlp": 1.03278422, "epoch": 0.3714414549827146, "flos": 21626684444160.0, "grad_norm": 2.2456372938563907, "language_loss": 0.76167989, "learning_rate": 2.896702378079374e-06, "loss": 0.78716284, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.21789551, "step": 6178, "time_per_iteration": 2.826676845550537 }, { "auxiliary_loss_clip": 0.01514105, "auxiliary_loss_mlp": 0.01054395, "balance_loss_clip": 1.32756138, "balance_loss_mlp": 1.03112507, "epoch": 0.37150157823538255, "flos": 19981261618560.0, "grad_norm": 2.412703778219655, "language_loss": 0.72449785, "learning_rate": 2.8963542378392502e-06, "loss": 0.75018287, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.23278809, "step": 6179, "time_per_iteration": 2.8697168827056885 }, { "auxiliary_loss_clip": 0.01508593, "auxiliary_loss_mlp": 0.0105248, "balance_loss_clip": 1.31964016, "balance_loss_mlp": 1.02886486, "epoch": 0.3715617014880505, "flos": 24870583975680.0, "grad_norm": 1.8345234204854466, "language_loss": 0.7087447, "learning_rate": 2.896006063609283e-06, "loss": 0.73435545, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.23620605, "step": 6180, "time_per_iteration": 2.9819300174713135 }, { "auxiliary_loss_clip": 0.01501812, "auxiliary_loss_mlp": 0.01050072, "balance_loss_clip": 1.31676054, "balance_loss_mlp": 1.02766109, "epoch": 0.3716218247407185, "flos": 20458812931200.0, "grad_norm": 1.8431508703197537, "language_loss": 0.78725576, "learning_rate": 2.8956578554026767e-06, "loss": 0.81277466, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.22424316, "step": 6181, "time_per_iteration": 2.8477768898010254 }, { "auxiliary_loss_clip": 0.01491423, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.30702055, "balance_loss_mlp": 1.02721632, "epoch": 0.37168194799338644, "flos": 24143995428480.0, "grad_norm": 3.0170805478235048, "language_loss": 0.79933149, "learning_rate": 2.8953096132326343e-06, "loss": 0.82474303, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.22497559, "step": 6182, "time_per_iteration": 2.894883155822754 }, { "auxiliary_loss_clip": 0.01288046, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.17472088, "balance_loss_mlp": 1.01366067, "epoch": 0.3717420712460544, "flos": 67441007566080.0, "grad_norm": 0.7818530899177816, "language_loss": 0.57529604, "learning_rate": 2.894961337112362e-06, "loss": 0.59852105, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.20800781, "step": 6183, "time_per_iteration": 3.324918031692505 }, { "auxiliary_loss_clip": 0.01523308, "auxiliary_loss_mlp": 0.0105369, "balance_loss_clip": 1.32811081, "balance_loss_mlp": 1.02968097, "epoch": 0.37180219449872237, "flos": 22385605040640.0, "grad_norm": 2.3164009933047267, "language_loss": 0.77247167, "learning_rate": 2.894613027055066e-06, "loss": 0.79824167, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 1.94921875, "router_z_loss_mlp": 0.23999023, "step": 6184, "time_per_iteration": 2.8766307830810547 }, { "auxiliary_loss_clip": 0.01484881, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.30089152, "balance_loss_mlp": 1.02099061, "epoch": 0.37186231775139034, "flos": 21879431752320.0, "grad_norm": 1.8049261052586905, "language_loss": 0.72855425, "learning_rate": 2.894264683073954e-06, "loss": 0.75384492, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23205566, "step": 6185, "time_per_iteration": 2.86906099319458 }, { "auxiliary_loss_clip": 0.01486031, "auxiliary_loss_mlp": 0.01041625, "balance_loss_clip": 1.3030138, "balance_loss_mlp": 1.01783061, "epoch": 0.3719224410040583, "flos": 22424407096320.0, "grad_norm": 1.5510360307707731, "language_loss": 0.77777874, "learning_rate": 2.8939163051822363e-06, "loss": 0.80305529, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.23803711, "step": 6186, "time_per_iteration": 2.868987560272217 }, { "auxiliary_loss_clip": 0.01497186, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.30568421, "balance_loss_mlp": 1.01845527, "epoch": 0.37198256425672627, "flos": 25161002219520.0, "grad_norm": 1.7543539281375307, "language_loss": 0.8449136, "learning_rate": 2.8935678933931224e-06, "loss": 0.87031269, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.24230957, "step": 6187, "time_per_iteration": 2.9073140621185303 }, { "auxiliary_loss_clip": 0.01480248, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.29668283, "balance_loss_mlp": 1.01643729, "epoch": 0.37204268750939423, "flos": 21147187605120.0, "grad_norm": 1.8633987288567233, "language_loss": 0.84850496, "learning_rate": 2.893219447719824e-06, "loss": 0.87370396, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23193359, "step": 6188, "time_per_iteration": 2.8571369647979736 }, { "auxiliary_loss_clip": 0.01490443, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.30440712, "balance_loss_mlp": 1.01619017, "epoch": 0.37210281076206225, "flos": 21516748283520.0, "grad_norm": 1.8776773539198146, "language_loss": 0.66719586, "learning_rate": 2.8928709681755548e-06, "loss": 0.69250607, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.24389648, "step": 6189, "time_per_iteration": 2.926896095275879 }, { "auxiliary_loss_clip": 0.0148734, "auxiliary_loss_mlp": 0.01040276, "balance_loss_clip": 1.30027604, "balance_loss_mlp": 1.01590919, "epoch": 0.3721629340147302, "flos": 17356774406400.0, "grad_norm": 1.9844421944634887, "language_loss": 0.85042906, "learning_rate": 2.8925224547735293e-06, "loss": 0.87570518, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.24389648, "step": 6190, "time_per_iteration": 2.87680983543396 }, { "auxiliary_loss_clip": 0.01495853, "auxiliary_loss_mlp": 0.01038667, "balance_loss_clip": 1.30411828, "balance_loss_mlp": 1.01412201, "epoch": 0.3722230572673982, "flos": 16440247612800.0, "grad_norm": 2.54891119821353, "language_loss": 0.89768422, "learning_rate": 2.8921739075269633e-06, "loss": 0.92302948, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 1.91699219, "router_z_loss_mlp": 0.2454834, "step": 6191, "time_per_iteration": 2.867191791534424 }, { "auxiliary_loss_clip": 0.01499773, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.30757999, "balance_loss_mlp": 1.01637852, "epoch": 0.37228318052006615, "flos": 22685026999680.0, "grad_norm": 2.3957749309522356, "language_loss": 0.74188596, "learning_rate": 2.891825326449073e-06, "loss": 0.76730597, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.25842285, "step": 6192, "time_per_iteration": 2.9041192531585693 }, { "auxiliary_loss_clip": 0.0148693, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.29999709, "balance_loss_mlp": 1.0156281, "epoch": 0.3723433037727341, "flos": 25276548735360.0, "grad_norm": 2.2795738233590326, "language_loss": 0.80363429, "learning_rate": 2.8914767115530766e-06, "loss": 0.82888734, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22741699, "step": 6193, "time_per_iteration": 2.911593437194824 }, { "auxiliary_loss_clip": 0.01484774, "auxiliary_loss_mlp": 0.010385, "balance_loss_clip": 1.29517698, "balance_loss_mlp": 1.01539731, "epoch": 0.3724034270254021, "flos": 10531746714240.0, "grad_norm": 2.318332380573544, "language_loss": 0.85841548, "learning_rate": 2.891128062852194e-06, "loss": 0.88364822, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.23120117, "step": 6194, "time_per_iteration": 2.828993558883667 }, { "auxiliary_loss_clip": 0.01490587, "auxiliary_loss_mlp": 0.01040554, "balance_loss_clip": 1.30170703, "balance_loss_mlp": 1.01715279, "epoch": 0.37246355027807004, "flos": 20275797139200.0, "grad_norm": 16.940796464073316, "language_loss": 0.784567, "learning_rate": 2.890779380359646e-06, "loss": 0.80987835, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.23388672, "step": 6195, "time_per_iteration": 2.81355357170105 }, { "auxiliary_loss_clip": 0.01475465, "auxiliary_loss_mlp": 0.01041069, "balance_loss_clip": 1.29079998, "balance_loss_mlp": 1.01589227, "epoch": 0.372523673530738, "flos": 19510089822720.0, "grad_norm": 1.5750324391620143, "language_loss": 0.79839784, "learning_rate": 2.890430664088655e-06, "loss": 0.82356322, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.25195312, "step": 6196, "time_per_iteration": 2.8635315895080566 }, { "auxiliary_loss_clip": 0.01477139, "auxiliary_loss_mlp": 0.01042338, "balance_loss_clip": 1.29207027, "balance_loss_mlp": 1.01948595, "epoch": 0.372583796783406, "flos": 16773087496320.0, "grad_norm": 2.1439339302685174, "language_loss": 0.84947181, "learning_rate": 2.890081914052443e-06, "loss": 0.87466669, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.22851562, "step": 6197, "time_per_iteration": 2.8197293281555176 }, { "auxiliary_loss_clip": 0.01459691, "auxiliary_loss_mlp": 0.01046297, "balance_loss_clip": 1.27647388, "balance_loss_mlp": 1.02069068, "epoch": 0.37264392003607394, "flos": 22648215715200.0, "grad_norm": 1.5685881664075663, "language_loss": 0.65433162, "learning_rate": 2.889733130264237e-06, "loss": 0.6793915, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.25585938, "step": 6198, "time_per_iteration": 2.844416618347168 }, { "auxiliary_loss_clip": 0.01472627, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.28849363, "balance_loss_mlp": 1.01942921, "epoch": 0.3727040432887419, "flos": 19982302248960.0, "grad_norm": 1.45337022416199, "language_loss": 0.74561405, "learning_rate": 2.889384312737261e-06, "loss": 0.77078009, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.24511719, "step": 6199, "time_per_iteration": 4.26497745513916 }, { "auxiliary_loss_clip": 0.01475511, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.29194403, "balance_loss_mlp": 1.01465738, "epoch": 0.37276416654140987, "flos": 63918960410880.0, "grad_norm": 1.7318776150680975, "language_loss": 0.81691313, "learning_rate": 2.889035461484742e-06, "loss": 0.84204495, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23010254, "step": 6200, "time_per_iteration": 3.2291831970214844 }, { "auxiliary_loss_clip": 0.01478486, "auxiliary_loss_mlp": 0.01042859, "balance_loss_clip": 1.29250979, "balance_loss_mlp": 1.01795673, "epoch": 0.37282428979407783, "flos": 39800573539200.0, "grad_norm": 2.0999174373101592, "language_loss": 0.6114282, "learning_rate": 2.88868657651991e-06, "loss": 0.63664168, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.24914551, "step": 6201, "time_per_iteration": 3.011847496032715 }, { "auxiliary_loss_clip": 0.01499207, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.30978966, "balance_loss_mlp": 1.015136, "epoch": 0.37288441304674586, "flos": 22718806882560.0, "grad_norm": 1.802365694864444, "language_loss": 0.73977643, "learning_rate": 2.8883376578559934e-06, "loss": 0.76515973, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.23999023, "step": 6202, "time_per_iteration": 2.8908324241638184 }, { "auxiliary_loss_clip": 0.01463767, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.28001642, "balance_loss_mlp": 1.01709223, "epoch": 0.3729445362994138, "flos": 18779474488320.0, "grad_norm": 2.0396676359201433, "language_loss": 0.7423718, "learning_rate": 2.8879887055062243e-06, "loss": 0.76742703, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.24645996, "step": 6203, "time_per_iteration": 2.894284725189209 }, { "auxiliary_loss_clip": 0.01474079, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.29001999, "balance_loss_mlp": 1.01631391, "epoch": 0.3730046595520818, "flos": 22466557267200.0, "grad_norm": 1.7494041016272246, "language_loss": 0.82961446, "learning_rate": 2.8876397194838353e-06, "loss": 0.85473835, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.2199707, "step": 6204, "time_per_iteration": 2.902602195739746 }, { "auxiliary_loss_clip": 0.0148227, "auxiliary_loss_mlp": 0.01041071, "balance_loss_clip": 1.29434276, "balance_loss_mlp": 1.01784873, "epoch": 0.37306478280474975, "flos": 24327056465280.0, "grad_norm": 1.75052083102906, "language_loss": 0.75189584, "learning_rate": 2.8872906998020577e-06, "loss": 0.77712929, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.2322998, "step": 6205, "time_per_iteration": 4.295569658279419 }, { "auxiliary_loss_clip": 0.01471258, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.2858882, "balance_loss_mlp": 1.01578593, "epoch": 0.3731249060574177, "flos": 15823640471040.0, "grad_norm": 3.06495370469463, "language_loss": 0.78434873, "learning_rate": 2.886941646474128e-06, "loss": 0.80945742, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.23791504, "step": 6206, "time_per_iteration": 4.268855810165405 }, { "auxiliary_loss_clip": 0.01481774, "auxiliary_loss_mlp": 0.01033679, "balance_loss_clip": 1.29566431, "balance_loss_mlp": 1.01001644, "epoch": 0.3731850293100857, "flos": 19837455085440.0, "grad_norm": 2.097872523146228, "language_loss": 0.93687749, "learning_rate": 2.886592559513283e-06, "loss": 0.96203208, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.23669434, "step": 6207, "time_per_iteration": 2.8502519130706787 }, { "auxiliary_loss_clip": 0.01490504, "auxiliary_loss_mlp": 0.01036594, "balance_loss_clip": 1.3005631, "balance_loss_mlp": 1.01428974, "epoch": 0.37324515256275365, "flos": 19071657279360.0, "grad_norm": 2.118253559430879, "language_loss": 0.84525496, "learning_rate": 2.886243438932759e-06, "loss": 0.87052596, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.22302246, "step": 6208, "time_per_iteration": 4.269124984741211 }, { "auxiliary_loss_clip": 0.01474283, "auxiliary_loss_mlp": 0.01039224, "balance_loss_clip": 1.28673291, "balance_loss_mlp": 1.0155611, "epoch": 0.3733052758154216, "flos": 20714093948160.0, "grad_norm": 7.338840638390395, "language_loss": 0.74009168, "learning_rate": 2.8858942847457953e-06, "loss": 0.76522672, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.2364502, "step": 6209, "time_per_iteration": 2.8707008361816406 }, { "auxiliary_loss_clip": 0.0148704, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.30142415, "balance_loss_mlp": 1.01848161, "epoch": 0.3733653990680896, "flos": 20203124711040.0, "grad_norm": 3.4950351420881454, "language_loss": 0.71758181, "learning_rate": 2.8855450969656305e-06, "loss": 0.7428838, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24682617, "step": 6210, "time_per_iteration": 2.9039204120635986 }, { "auxiliary_loss_clip": 0.01492708, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.30502534, "balance_loss_mlp": 1.01726723, "epoch": 0.37342552232075754, "flos": 20349374463360.0, "grad_norm": 2.0542228386393298, "language_loss": 0.7838124, "learning_rate": 2.8851958756055073e-06, "loss": 0.80914766, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.23547363, "step": 6211, "time_per_iteration": 3.0203518867492676 }, { "auxiliary_loss_clip": 0.01488506, "auxiliary_loss_mlp": 0.01043951, "balance_loss_clip": 1.30033886, "balance_loss_mlp": 1.02015734, "epoch": 0.3734856455734255, "flos": 35531568397440.0, "grad_norm": 1.5075568308864071, "language_loss": 0.74231577, "learning_rate": 2.884846620678668e-06, "loss": 0.76764035, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.23791504, "step": 6212, "time_per_iteration": 2.991227626800537 }, { "auxiliary_loss_clip": 0.01528496, "auxiliary_loss_mlp": 0.01050183, "balance_loss_clip": 1.33062077, "balance_loss_mlp": 1.02550626, "epoch": 0.37354576882609347, "flos": 21152209777920.0, "grad_norm": 1.8949435571362097, "language_loss": 0.82338262, "learning_rate": 2.884497332198356e-06, "loss": 0.84916937, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 1.97851562, "router_z_loss_mlp": 0.24694824, "step": 6213, "time_per_iteration": 2.8980672359466553 }, { "auxiliary_loss_clip": 0.01484715, "auxiliary_loss_mlp": 0.01043489, "balance_loss_clip": 1.29688668, "balance_loss_mlp": 1.01921844, "epoch": 0.37360589207876144, "flos": 21516703038720.0, "grad_norm": 5.064910822873359, "language_loss": 0.79535091, "learning_rate": 2.8841480101778167e-06, "loss": 0.82063293, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.24279785, "step": 6214, "time_per_iteration": 2.9481146335601807 }, { "auxiliary_loss_clip": 0.01481096, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.2959069, "balance_loss_mlp": 1.01859188, "epoch": 0.37366601533142946, "flos": 38450002947840.0, "grad_norm": 1.8001624781589776, "language_loss": 0.85406125, "learning_rate": 2.883798654630296e-06, "loss": 0.87928522, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.22705078, "step": 6215, "time_per_iteration": 3.0597071647644043 }, { "auxiliary_loss_clip": 0.01497893, "auxiliary_loss_mlp": 0.01044977, "balance_loss_clip": 1.3079505, "balance_loss_mlp": 1.01984775, "epoch": 0.3737261385840974, "flos": 18450073209600.0, "grad_norm": 2.0636303279241686, "language_loss": 0.68660545, "learning_rate": 2.8834492655690423e-06, "loss": 0.71203417, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.25134277, "step": 6216, "time_per_iteration": 2.9039924144744873 }, { "auxiliary_loss_clip": 0.01483401, "auxiliary_loss_mlp": 0.01041104, "balance_loss_clip": 1.29542851, "balance_loss_mlp": 1.01746464, "epoch": 0.3737862618367654, "flos": 22940172282240.0, "grad_norm": 2.027560576425042, "language_loss": 0.67462504, "learning_rate": 2.883099843007303e-06, "loss": 0.69987011, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.2364502, "step": 6217, "time_per_iteration": 2.8553824424743652 }, { "auxiliary_loss_clip": 0.0150187, "auxiliary_loss_mlp": 0.01040638, "balance_loss_clip": 1.31146801, "balance_loss_mlp": 1.01786947, "epoch": 0.37384638508943335, "flos": 15416951794560.0, "grad_norm": 1.7549830352504139, "language_loss": 0.8159157, "learning_rate": 2.88275038695833e-06, "loss": 0.84134078, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.2277832, "step": 6218, "time_per_iteration": 2.8806140422821045 }, { "auxiliary_loss_clip": 0.01482164, "auxiliary_loss_mlp": 0.01043763, "balance_loss_clip": 1.29900408, "balance_loss_mlp": 1.02006471, "epoch": 0.3739065083421013, "flos": 24291738259200.0, "grad_norm": 1.4583770779151362, "language_loss": 0.79183143, "learning_rate": 2.8824008974353736e-06, "loss": 0.81709075, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.23706055, "step": 6219, "time_per_iteration": 2.908015251159668 }, { "auxiliary_loss_clip": 0.0148554, "auxiliary_loss_mlp": 0.01042987, "balance_loss_clip": 1.30171561, "balance_loss_mlp": 1.0188117, "epoch": 0.3739666315947693, "flos": 23013161424000.0, "grad_norm": 3.174996278980663, "language_loss": 0.77798331, "learning_rate": 2.8820513744516866e-06, "loss": 0.80326861, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.24157715, "step": 6220, "time_per_iteration": 2.8773181438446045 }, { "auxiliary_loss_clip": 0.01498513, "auxiliary_loss_mlp": 0.01040253, "balance_loss_clip": 1.30928981, "balance_loss_mlp": 1.0153501, "epoch": 0.37402675484743725, "flos": 19400606110080.0, "grad_norm": 2.8349866279381297, "language_loss": 0.83631879, "learning_rate": 2.8817018180205235e-06, "loss": 0.86170644, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.24926758, "step": 6221, "time_per_iteration": 2.8724052906036377 }, { "auxiliary_loss_clip": 0.01484065, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.29875433, "balance_loss_mlp": 1.01716506, "epoch": 0.3740868781001052, "flos": 17134323131520.0, "grad_norm": 1.8033213631844327, "language_loss": 0.7736702, "learning_rate": 2.8813522281551387e-06, "loss": 0.79892075, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.23828125, "step": 6222, "time_per_iteration": 2.8508198261260986 }, { "auxiliary_loss_clip": 0.0149281, "auxiliary_loss_mlp": 0.01043741, "balance_loss_clip": 1.30653572, "balance_loss_mlp": 1.01948237, "epoch": 0.3741470013527732, "flos": 20052033765120.0, "grad_norm": 2.0498855903716535, "language_loss": 0.71708632, "learning_rate": 2.881002604868789e-06, "loss": 0.74245185, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.24243164, "step": 6223, "time_per_iteration": 2.962596893310547 }, { "auxiliary_loss_clip": 0.01488459, "auxiliary_loss_mlp": 0.01041531, "balance_loss_clip": 1.30395222, "balance_loss_mlp": 1.01768959, "epoch": 0.37420712460544114, "flos": 36909494110080.0, "grad_norm": 2.4537101037388247, "language_loss": 0.69817924, "learning_rate": 2.8806529481747325e-06, "loss": 0.72347915, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.23852539, "step": 6224, "time_per_iteration": 2.985929250717163 }, { "auxiliary_loss_clip": 0.01475389, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.29258728, "balance_loss_mlp": 1.01804078, "epoch": 0.3742672478581091, "flos": 22211592963840.0, "grad_norm": 1.8919437232950167, "language_loss": 0.70679736, "learning_rate": 2.880303258086228e-06, "loss": 0.73196912, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.23730469, "step": 6225, "time_per_iteration": 2.8942956924438477 }, { "auxiliary_loss_clip": 0.01476523, "auxiliary_loss_mlp": 0.01041735, "balance_loss_clip": 1.29387724, "balance_loss_mlp": 1.01763058, "epoch": 0.3743273711107771, "flos": 24692409377280.0, "grad_norm": 2.062999437648166, "language_loss": 0.80222988, "learning_rate": 2.879953534616536e-06, "loss": 0.82741249, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.2409668, "step": 6226, "time_per_iteration": 2.8829362392425537 }, { "auxiliary_loss_clip": 0.01487916, "auxiliary_loss_mlp": 0.01039042, "balance_loss_clip": 1.30160117, "balance_loss_mlp": 1.01562893, "epoch": 0.37438749436344504, "flos": 24469641388800.0, "grad_norm": 1.9190242268320272, "language_loss": 0.68350285, "learning_rate": 2.879603777778917e-06, "loss": 0.70877242, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.23425293, "step": 6227, "time_per_iteration": 2.9140465259552 }, { "auxiliary_loss_clip": 0.01481889, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.29735494, "balance_loss_mlp": 1.01600134, "epoch": 0.374447617616113, "flos": 21808750095360.0, "grad_norm": 1.7461725242324144, "language_loss": 0.84199232, "learning_rate": 2.879253987586635e-06, "loss": 0.86720943, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.23828125, "step": 6228, "time_per_iteration": 2.94978404045105 }, { "auxiliary_loss_clip": 0.01477908, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.29480124, "balance_loss_mlp": 1.02166212, "epoch": 0.374507740868781, "flos": 17977317845760.0, "grad_norm": 1.6777159827182282, "language_loss": 0.75395256, "learning_rate": 2.8789041640529535e-06, "loss": 0.77918857, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.24023438, "step": 6229, "time_per_iteration": 2.8197741508483887 }, { "auxiliary_loss_clip": 0.01489671, "auxiliary_loss_mlp": 0.0104517, "balance_loss_clip": 1.30456829, "balance_loss_mlp": 1.01995754, "epoch": 0.374567864121449, "flos": 16114058714880.0, "grad_norm": 1.9001972606036852, "language_loss": 0.84526777, "learning_rate": 2.8785543071911383e-06, "loss": 0.87061614, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.25231934, "step": 6230, "time_per_iteration": 2.8428471088409424 }, { "auxiliary_loss_clip": 0.01498524, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.31126952, "balance_loss_mlp": 1.02305877, "epoch": 0.37462798737411696, "flos": 25783536430080.0, "grad_norm": 1.7185989442059706, "language_loss": 0.73995811, "learning_rate": 2.878204417014456e-06, "loss": 0.76542765, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.25378418, "step": 6231, "time_per_iteration": 2.946723699569702 }, { "auxiliary_loss_clip": 0.01495537, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.30821323, "balance_loss_mlp": 1.02050209, "epoch": 0.3746881106267849, "flos": 16663241825280.0, "grad_norm": 2.499735754780122, "language_loss": 0.7457273, "learning_rate": 2.8778544935361735e-06, "loss": 0.77113867, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.25085449, "step": 6232, "time_per_iteration": 2.8472650051116943 }, { "auxiliary_loss_clip": 0.01497983, "auxiliary_loss_mlp": 0.01042059, "balance_loss_clip": 1.31055832, "balance_loss_mlp": 1.01782417, "epoch": 0.3747482338794529, "flos": 26189365455360.0, "grad_norm": 1.6224972946686627, "language_loss": 0.77717704, "learning_rate": 2.877504536769561e-06, "loss": 0.8025775, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.24243164, "step": 6233, "time_per_iteration": 2.962507486343384 }, { "auxiliary_loss_clip": 0.0148819, "auxiliary_loss_mlp": 0.01045067, "balance_loss_clip": 1.30305636, "balance_loss_mlp": 1.02087998, "epoch": 0.37480835713212085, "flos": 12028657547520.0, "grad_norm": 1.9004702594580773, "language_loss": 0.69540024, "learning_rate": 2.8771545467278883e-06, "loss": 0.72073281, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.24194336, "step": 6234, "time_per_iteration": 4.338593244552612 }, { "auxiliary_loss_clip": 0.01490764, "auxiliary_loss_mlp": 0.01045019, "balance_loss_clip": 1.30716193, "balance_loss_mlp": 1.02136838, "epoch": 0.3748684803847888, "flos": 19687857217920.0, "grad_norm": 2.0653449654108336, "language_loss": 0.83401132, "learning_rate": 2.8768045234244276e-06, "loss": 0.85936916, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.23657227, "step": 6235, "time_per_iteration": 2.8565175533294678 }, { "auxiliary_loss_clip": 0.01492903, "auxiliary_loss_mlp": 0.01045305, "balance_loss_clip": 1.30625033, "balance_loss_mlp": 1.02137995, "epoch": 0.3749286036374568, "flos": 20530716197760.0, "grad_norm": 2.060620757842387, "language_loss": 0.78777492, "learning_rate": 2.8764544668724517e-06, "loss": 0.81315696, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.23950195, "step": 6236, "time_per_iteration": 2.879055976867676 }, { "auxiliary_loss_clip": 0.01513617, "auxiliary_loss_mlp": 0.0104952, "balance_loss_clip": 1.32349658, "balance_loss_mlp": 1.02497447, "epoch": 0.37498872689012475, "flos": 20714455906560.0, "grad_norm": 2.4011591754247026, "language_loss": 0.7453987, "learning_rate": 2.876104377085234e-06, "loss": 0.77103007, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.24560547, "step": 6237, "time_per_iteration": 2.830744743347168 }, { "auxiliary_loss_clip": 0.015017, "auxiliary_loss_mlp": 0.01043055, "balance_loss_clip": 1.31204534, "balance_loss_mlp": 1.01922488, "epoch": 0.3750488501427927, "flos": 21583493642880.0, "grad_norm": 1.9397113197964924, "language_loss": 0.93801355, "learning_rate": 2.8757542540760508e-06, "loss": 0.96346116, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.23852539, "step": 6238, "time_per_iteration": 2.913250684738159 }, { "auxiliary_loss_clip": 0.01490315, "auxiliary_loss_mlp": 0.01041288, "balance_loss_clip": 1.30350804, "balance_loss_mlp": 1.01689756, "epoch": 0.3751089733954607, "flos": 15932174042880.0, "grad_norm": 2.051205418480865, "language_loss": 0.71657622, "learning_rate": 2.8754040978581777e-06, "loss": 0.74189222, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.2442627, "step": 6239, "time_per_iteration": 2.826160192489624 }, { "auxiliary_loss_clip": 0.01511967, "auxiliary_loss_mlp": 0.01048306, "balance_loss_clip": 1.3225944, "balance_loss_mlp": 1.02320027, "epoch": 0.37516909664812864, "flos": 36298678302720.0, "grad_norm": 1.6390202687710338, "language_loss": 0.66141242, "learning_rate": 2.875053908444895e-06, "loss": 0.68701512, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.25109863, "step": 6240, "time_per_iteration": 4.402130365371704 }, { "auxiliary_loss_clip": 0.01496166, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.30910492, "balance_loss_mlp": 1.01262379, "epoch": 0.3752292199007966, "flos": 13523622854400.0, "grad_norm": 2.1455124300871535, "language_loss": 0.77361584, "learning_rate": 2.8747036858494795e-06, "loss": 0.79894549, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.24182129, "step": 6241, "time_per_iteration": 4.242900133132935 }, { "auxiliary_loss_clip": 0.01502923, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.31410551, "balance_loss_mlp": 1.01793838, "epoch": 0.3752893431534646, "flos": 27209358403200.0, "grad_norm": 9.088576073310552, "language_loss": 0.84297776, "learning_rate": 2.874353430085213e-06, "loss": 0.8684355, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.24902344, "step": 6242, "time_per_iteration": 2.929429531097412 }, { "auxiliary_loss_clip": 0.01499395, "auxiliary_loss_mlp": 0.01046086, "balance_loss_clip": 1.31042981, "balance_loss_mlp": 1.02247047, "epoch": 0.3753494664061326, "flos": 30019711829760.0, "grad_norm": 2.5693555863303756, "language_loss": 0.68382072, "learning_rate": 2.8740031411653766e-06, "loss": 0.70927548, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.23596191, "step": 6243, "time_per_iteration": 4.361286163330078 }, { "auxiliary_loss_clip": 0.01499037, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.31063199, "balance_loss_mlp": 1.01997864, "epoch": 0.37540958965880056, "flos": 24472582300800.0, "grad_norm": 2.037182300578338, "language_loss": 0.84641457, "learning_rate": 2.8736528191032535e-06, "loss": 0.87186205, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.25756836, "step": 6244, "time_per_iteration": 2.8807318210601807 }, { "auxiliary_loss_clip": 0.01480778, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.29865932, "balance_loss_mlp": 1.01248634, "epoch": 0.3754697129114685, "flos": 16517037317760.0, "grad_norm": 2.366047557475479, "language_loss": 0.83522654, "learning_rate": 2.8733024639121277e-06, "loss": 0.86038798, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.22888184, "step": 6245, "time_per_iteration": 2.8794474601745605 }, { "auxiliary_loss_clip": 0.01499212, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.31103277, "balance_loss_mlp": 1.01745319, "epoch": 0.3755298361641365, "flos": 19400334641280.0, "grad_norm": 2.176140213940743, "language_loss": 0.64547455, "learning_rate": 2.8729520756052853e-06, "loss": 0.6709044, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.26342773, "step": 6246, "time_per_iteration": 2.835742712020874 }, { "auxiliary_loss_clip": 0.01508084, "auxiliary_loss_mlp": 0.01041571, "balance_loss_clip": 1.31708026, "balance_loss_mlp": 1.01676393, "epoch": 0.37558995941680445, "flos": 14728531875840.0, "grad_norm": 1.9616735000033112, "language_loss": 0.75814724, "learning_rate": 2.8726016541960124e-06, "loss": 0.78364378, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.24780273, "step": 6247, "time_per_iteration": 2.856764078140259 }, { "auxiliary_loss_clip": 0.01501114, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.31228328, "balance_loss_mlp": 1.01938808, "epoch": 0.3756500826694724, "flos": 21700035544320.0, "grad_norm": 2.9936173171274967, "language_loss": 0.56940192, "learning_rate": 2.872251199697598e-06, "loss": 0.59485626, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.24951172, "step": 6248, "time_per_iteration": 2.8775525093078613 }, { "auxiliary_loss_clip": 0.01492458, "auxiliary_loss_mlp": 0.01042469, "balance_loss_clip": 1.30584836, "balance_loss_mlp": 1.0180074, "epoch": 0.3757102059221404, "flos": 26516956942080.0, "grad_norm": 2.841584551611176, "language_loss": 0.84677184, "learning_rate": 2.8719007121233297e-06, "loss": 0.8721211, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.24450684, "step": 6249, "time_per_iteration": 2.968430995941162 }, { "auxiliary_loss_clip": 0.01505676, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.31769633, "balance_loss_mlp": 1.01670289, "epoch": 0.37577032917480835, "flos": 37351184279040.0, "grad_norm": 1.5449061588489856, "language_loss": 0.69005388, "learning_rate": 2.8715501914864993e-06, "loss": 0.7155332, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.2557373, "step": 6250, "time_per_iteration": 3.0259289741516113 }, { "auxiliary_loss_clip": 0.01499781, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.31186461, "balance_loss_mlp": 1.01923096, "epoch": 0.3758304524274763, "flos": 21918731500800.0, "grad_norm": 1.9476038553374257, "language_loss": 0.78868824, "learning_rate": 2.8711996378003987e-06, "loss": 0.81411511, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.23681641, "step": 6251, "time_per_iteration": 2.9016366004943848 }, { "auxiliary_loss_clip": 0.01492817, "auxiliary_loss_mlp": 0.01045424, "balance_loss_clip": 1.30726039, "balance_loss_mlp": 1.02117658, "epoch": 0.3758905756801443, "flos": 36581585909760.0, "grad_norm": 4.087154238135045, "language_loss": 0.5859949, "learning_rate": 2.8708490510783203e-06, "loss": 0.6113773, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.24255371, "step": 6252, "time_per_iteration": 3.012312173843384 }, { "auxiliary_loss_clip": 0.01508212, "auxiliary_loss_mlp": 0.01041944, "balance_loss_clip": 1.31755543, "balance_loss_mlp": 1.0177443, "epoch": 0.37595069893281224, "flos": 24537834581760.0, "grad_norm": 1.7310930017170976, "language_loss": 0.89677149, "learning_rate": 2.8704984313335584e-06, "loss": 0.92227304, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.24194336, "step": 6253, "time_per_iteration": 2.8733460903167725 }, { "auxiliary_loss_clip": 0.01495845, "auxiliary_loss_mlp": 0.01041677, "balance_loss_clip": 1.31232488, "balance_loss_mlp": 1.01732278, "epoch": 0.3760108221854802, "flos": 16443550483200.0, "grad_norm": 2.636902154729331, "language_loss": 0.7733866, "learning_rate": 2.8701477785794097e-06, "loss": 0.79876184, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.24365234, "step": 6254, "time_per_iteration": 2.963459014892578 }, { "auxiliary_loss_clip": 0.0150739, "auxiliary_loss_mlp": 0.01042593, "balance_loss_clip": 1.31858468, "balance_loss_mlp": 1.0186795, "epoch": 0.37607094543814823, "flos": 13779627788160.0, "grad_norm": 2.3303713474248915, "language_loss": 0.63347876, "learning_rate": 2.869797092829169e-06, "loss": 0.65897858, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.23937988, "step": 6255, "time_per_iteration": 2.8558382987976074 }, { "auxiliary_loss_clip": 0.01513555, "auxiliary_loss_mlp": 0.01048509, "balance_loss_clip": 1.32242918, "balance_loss_mlp": 1.0233916, "epoch": 0.3761310686908162, "flos": 19865579368320.0, "grad_norm": 2.725237920363147, "language_loss": 0.7519393, "learning_rate": 2.869446374096135e-06, "loss": 0.77755994, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.25097656, "step": 6256, "time_per_iteration": 2.8765554428100586 }, { "auxiliary_loss_clip": 0.01503184, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.31324291, "balance_loss_mlp": 1.02028954, "epoch": 0.37619119194348416, "flos": 12758594209920.0, "grad_norm": 1.7616335621484713, "language_loss": 0.72244978, "learning_rate": 2.8690956223936088e-06, "loss": 0.74794197, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.25769043, "step": 6257, "time_per_iteration": 2.8745737075805664 }, { "auxiliary_loss_clip": 0.01495588, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.30758643, "balance_loss_mlp": 1.01839519, "epoch": 0.3762513151961521, "flos": 17539609219200.0, "grad_norm": 1.6800473014014758, "language_loss": 0.85162795, "learning_rate": 2.868744837734889e-06, "loss": 0.87700129, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.23364258, "step": 6258, "time_per_iteration": 2.8813345432281494 }, { "auxiliary_loss_clip": 0.01501426, "auxiliary_loss_mlp": 0.01045768, "balance_loss_clip": 1.31591177, "balance_loss_mlp": 1.02218878, "epoch": 0.3763114384488201, "flos": 23626420450560.0, "grad_norm": 1.424318213160496, "language_loss": 0.81498766, "learning_rate": 2.868394020133277e-06, "loss": 0.84045964, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.23608398, "step": 6259, "time_per_iteration": 2.8640594482421875 }, { "auxiliary_loss_clip": 0.01505292, "auxiliary_loss_mlp": 0.01051701, "balance_loss_clip": 1.31283426, "balance_loss_mlp": 1.02656019, "epoch": 0.37637156170148806, "flos": 25416916663680.0, "grad_norm": 2.4918341229501424, "language_loss": 0.72519898, "learning_rate": 2.8680431696020783e-06, "loss": 0.7507689, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.25109863, "step": 6260, "time_per_iteration": 2.957181453704834 }, { "auxiliary_loss_clip": 0.01519347, "auxiliary_loss_mlp": 0.01048629, "balance_loss_clip": 1.32627857, "balance_loss_mlp": 1.02462053, "epoch": 0.376431684954156, "flos": 23451412988160.0, "grad_norm": 1.7077504175954468, "language_loss": 0.79048055, "learning_rate": 2.867692286154594e-06, "loss": 0.81616032, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 1.93066406, "router_z_loss_mlp": 0.23999023, "step": 6261, "time_per_iteration": 2.8986258506774902 }, { "auxiliary_loss_clip": 0.01514263, "auxiliary_loss_mlp": 0.01053642, "balance_loss_clip": 1.32284045, "balance_loss_mlp": 1.0292871, "epoch": 0.376491808206824, "flos": 34217854335360.0, "grad_norm": 1.8695750093158916, "language_loss": 0.81270945, "learning_rate": 2.867341369804132e-06, "loss": 0.83838856, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.24353027, "step": 6262, "time_per_iteration": 2.973497152328491 }, { "auxiliary_loss_clip": 0.01494072, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.30748057, "balance_loss_mlp": 1.02201557, "epoch": 0.37655193145949195, "flos": 35198276065920.0, "grad_norm": 1.850487327649705, "language_loss": 0.81328011, "learning_rate": 2.866990420563998e-06, "loss": 0.83867478, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23376465, "step": 6263, "time_per_iteration": 2.972407579421997 }, { "auxiliary_loss_clip": 0.01506651, "auxiliary_loss_mlp": 0.01054547, "balance_loss_clip": 1.31859446, "balance_loss_mlp": 1.03149199, "epoch": 0.3766120547121599, "flos": 16770056094720.0, "grad_norm": 1.7327716063353824, "language_loss": 0.80566013, "learning_rate": 2.866639438447501e-06, "loss": 0.83127213, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.23059082, "step": 6264, "time_per_iteration": 2.863201856613159 }, { "auxiliary_loss_clip": 0.01492386, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.30497432, "balance_loss_mlp": 1.02878618, "epoch": 0.3766721779648279, "flos": 23560896700800.0, "grad_norm": 2.4227128787627223, "language_loss": 0.74706584, "learning_rate": 2.8662884234679497e-06, "loss": 0.77251315, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.2355957, "step": 6265, "time_per_iteration": 2.8631815910339355 }, { "auxiliary_loss_clip": 0.01486504, "auxiliary_loss_mlp": 0.01056211, "balance_loss_clip": 1.30424809, "balance_loss_mlp": 1.03239298, "epoch": 0.37673230121749585, "flos": 29140267789440.0, "grad_norm": 1.7320405620081623, "language_loss": 0.68791664, "learning_rate": 2.865937375638654e-06, "loss": 0.7133438, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.23840332, "step": 6266, "time_per_iteration": 2.9923810958862305 }, { "auxiliary_loss_clip": 0.01523977, "auxiliary_loss_mlp": 0.01058111, "balance_loss_clip": 1.32972884, "balance_loss_mlp": 1.03311324, "epoch": 0.3767924244701638, "flos": 28158307735680.0, "grad_norm": 2.840935502202564, "language_loss": 0.63225144, "learning_rate": 2.8655862949729264e-06, "loss": 0.65807235, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 1.94433594, "router_z_loss_mlp": 0.25, "step": 6267, "time_per_iteration": 2.9897868633270264 }, { "auxiliary_loss_clip": 0.01293687, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.17374337, "balance_loss_mlp": 1.01398969, "epoch": 0.37685254772283183, "flos": 60825536380800.0, "grad_norm": 0.7390293407275279, "language_loss": 0.58926851, "learning_rate": 2.8652351814840795e-06, "loss": 0.61255604, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 0.2109375, "step": 6268, "time_per_iteration": 3.495623826980591 }, { "auxiliary_loss_clip": 0.01498132, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.30958843, "balance_loss_mlp": 1.01903975, "epoch": 0.3769126709754998, "flos": 26044020599040.0, "grad_norm": 1.5895486098531133, "language_loss": 0.65256011, "learning_rate": 2.8648840351854283e-06, "loss": 0.67797101, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.23925781, "step": 6269, "time_per_iteration": 4.404542684555054 }, { "auxiliary_loss_clip": 0.0149016, "auxiliary_loss_mlp": 0.01049225, "balance_loss_clip": 1.3066113, "balance_loss_mlp": 1.02506089, "epoch": 0.37697279422816776, "flos": 23588613780480.0, "grad_norm": 1.7153717506849646, "language_loss": 0.71803874, "learning_rate": 2.8645328560902874e-06, "loss": 0.74343258, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.24169922, "step": 6270, "time_per_iteration": 3.0313608646392822 }, { "auxiliary_loss_clip": 0.01299885, "auxiliary_loss_mlp": 0.01038664, "balance_loss_clip": 1.17728877, "balance_loss_mlp": 1.01577556, "epoch": 0.3770329174808357, "flos": 64777446829440.0, "grad_norm": 1.0412935320573684, "language_loss": 0.56066954, "learning_rate": 2.8641816442119746e-06, "loss": 0.58405501, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 1.2265625, "router_z_loss_mlp": 0.22851562, "step": 6271, "time_per_iteration": 3.3044159412384033 }, { "auxiliary_loss_clip": 0.01486381, "auxiliary_loss_mlp": 0.01047268, "balance_loss_clip": 1.30220902, "balance_loss_mlp": 1.02280641, "epoch": 0.3770930407335037, "flos": 21845335155840.0, "grad_norm": 1.737235804307271, "language_loss": 0.8045187, "learning_rate": 2.8638303995638066e-06, "loss": 0.8298552, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.24475098, "step": 6272, "time_per_iteration": 2.9144816398620605 }, { "auxiliary_loss_clip": 0.0148178, "auxiliary_loss_mlp": 0.0104436, "balance_loss_clip": 1.29897988, "balance_loss_mlp": 1.02153158, "epoch": 0.37715316398617166, "flos": 22758287610240.0, "grad_norm": 1.863581001924852, "language_loss": 0.74959183, "learning_rate": 2.863479122159103e-06, "loss": 0.77485323, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.22839355, "step": 6273, "time_per_iteration": 2.974905014038086 }, { "auxiliary_loss_clip": 0.01505338, "auxiliary_loss_mlp": 0.01052399, "balance_loss_clip": 1.32060742, "balance_loss_mlp": 1.02935612, "epoch": 0.3772132872388396, "flos": 18923371511040.0, "grad_norm": 1.9128324775568435, "language_loss": 0.72273326, "learning_rate": 2.8631278120111858e-06, "loss": 0.74831057, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.23010254, "step": 6274, "time_per_iteration": 2.8780627250671387 }, { "auxiliary_loss_clip": 0.01501047, "auxiliary_loss_mlp": 0.01045013, "balance_loss_clip": 1.31187022, "balance_loss_mlp": 1.02201772, "epoch": 0.3772734104915076, "flos": 17354738390400.0, "grad_norm": 2.0471710890898214, "language_loss": 0.84758896, "learning_rate": 2.8627764691333742e-06, "loss": 0.8730495, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.22998047, "step": 6275, "time_per_iteration": 4.233761310577393 }, { "auxiliary_loss_clip": 0.01476912, "auxiliary_loss_mlp": 0.01041837, "balance_loss_clip": 1.2964282, "balance_loss_mlp": 1.01830506, "epoch": 0.37733353374417555, "flos": 32354911918080.0, "grad_norm": 1.6634433540517763, "language_loss": 0.76099259, "learning_rate": 2.8624250935389935e-06, "loss": 0.78618008, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.23547363, "step": 6276, "time_per_iteration": 4.4118475914001465 }, { "auxiliary_loss_clip": 0.01509437, "auxiliary_loss_mlp": 0.01044416, "balance_loss_clip": 1.32188106, "balance_loss_mlp": 1.01973939, "epoch": 0.3773936569968435, "flos": 23369284396800.0, "grad_norm": 1.9498564572269774, "language_loss": 0.8634991, "learning_rate": 2.862073685241366e-06, "loss": 0.88903761, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.2467041, "step": 6277, "time_per_iteration": 2.8479421138763428 }, { "auxiliary_loss_clip": 0.01488072, "auxiliary_loss_mlp": 0.01045693, "balance_loss_clip": 1.30706799, "balance_loss_mlp": 1.0218873, "epoch": 0.3774537802495115, "flos": 21475774477440.0, "grad_norm": 2.150628373204088, "language_loss": 0.78836995, "learning_rate": 2.861722244253818e-06, "loss": 0.81370759, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.23803711, "step": 6278, "time_per_iteration": 4.2657694816589355 }, { "auxiliary_loss_clip": 0.01512862, "auxiliary_loss_mlp": 0.01047577, "balance_loss_clip": 1.32320523, "balance_loss_mlp": 1.02323401, "epoch": 0.37751390350217945, "flos": 24984727902720.0, "grad_norm": 2.4849684982907547, "language_loss": 0.83440816, "learning_rate": 2.8613707705896767e-06, "loss": 0.86001253, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.2434082, "step": 6279, "time_per_iteration": 2.95794677734375 }, { "auxiliary_loss_clip": 0.0149884, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.31184363, "balance_loss_mlp": 1.02371395, "epoch": 0.3775740267548474, "flos": 27830716248960.0, "grad_norm": 1.7448450091451106, "language_loss": 0.75343734, "learning_rate": 2.861019264262269e-06, "loss": 0.77888596, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22302246, "step": 6280, "time_per_iteration": 2.9447779655456543 }, { "auxiliary_loss_clip": 0.01489106, "auxiliary_loss_mlp": 0.0104295, "balance_loss_clip": 1.30833101, "balance_loss_mlp": 1.02021646, "epoch": 0.3776341500075154, "flos": 22575498042240.0, "grad_norm": 1.3958221733913045, "language_loss": 0.76732111, "learning_rate": 2.8606677252849242e-06, "loss": 0.79264164, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22729492, "step": 6281, "time_per_iteration": 2.971968173980713 }, { "auxiliary_loss_clip": 0.01484246, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.29872, "balance_loss_mlp": 1.01392221, "epoch": 0.3776942732601834, "flos": 23087960357760.0, "grad_norm": 1.9573894619322283, "language_loss": 0.84761548, "learning_rate": 2.860316153670974e-06, "loss": 0.87283266, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.23535156, "step": 6282, "time_per_iteration": 2.904425859451294 }, { "auxiliary_loss_clip": 0.01478819, "auxiliary_loss_mlp": 0.01039027, "balance_loss_clip": 1.29811502, "balance_loss_mlp": 1.01557899, "epoch": 0.37775439651285136, "flos": 21734041651200.0, "grad_norm": 1.6829077469904408, "language_loss": 0.70802069, "learning_rate": 2.8599645494337484e-06, "loss": 0.73319912, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.23461914, "step": 6283, "time_per_iteration": 2.870537519454956 }, { "auxiliary_loss_clip": 0.01491939, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.30774665, "balance_loss_mlp": 1.01752901, "epoch": 0.37781451976551933, "flos": 23998288613760.0, "grad_norm": 3.0054314339758585, "language_loss": 0.7769891, "learning_rate": 2.859612912586581e-06, "loss": 0.80231947, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.23583984, "step": 6284, "time_per_iteration": 2.9045557975769043 }, { "auxiliary_loss_clip": 0.01506584, "auxiliary_loss_mlp": 0.0104137, "balance_loss_clip": 1.31633782, "balance_loss_mlp": 1.01581132, "epoch": 0.3778746430181873, "flos": 13733948522880.0, "grad_norm": 2.108059479660192, "language_loss": 0.86797404, "learning_rate": 2.8592612431428055e-06, "loss": 0.8934536, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.2557373, "step": 6285, "time_per_iteration": 2.838348627090454 }, { "auxiliary_loss_clip": 0.01503446, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.31564879, "balance_loss_mlp": 1.01450801, "epoch": 0.37793476627085526, "flos": 19469070771840.0, "grad_norm": 1.83376686348102, "language_loss": 0.8514328, "learning_rate": 2.858909541115758e-06, "loss": 0.87686223, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.24975586, "step": 6286, "time_per_iteration": 2.849339246749878 }, { "auxiliary_loss_clip": 0.01500518, "auxiliary_loss_mlp": 0.0104328, "balance_loss_clip": 1.31412864, "balance_loss_mlp": 1.01943803, "epoch": 0.3779948895235232, "flos": 10714129079040.0, "grad_norm": 2.2508748612862925, "language_loss": 0.8267011, "learning_rate": 2.858557806518775e-06, "loss": 0.85213906, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.23864746, "step": 6287, "time_per_iteration": 2.904369592666626 }, { "auxiliary_loss_clip": 0.01493647, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.30965114, "balance_loss_mlp": 1.01687515, "epoch": 0.3780550127761912, "flos": 22320262270080.0, "grad_norm": 2.523627155133825, "language_loss": 0.7459048, "learning_rate": 2.8582060393651927e-06, "loss": 0.77124238, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.23217773, "step": 6288, "time_per_iteration": 2.9245071411132812 }, { "auxiliary_loss_clip": 0.01492659, "auxiliary_loss_mlp": 0.01040577, "balance_loss_clip": 1.30845451, "balance_loss_mlp": 1.01768887, "epoch": 0.37811513602885916, "flos": 28962681373440.0, "grad_norm": 3.101994682723379, "language_loss": 0.76464105, "learning_rate": 2.857854239668352e-06, "loss": 0.78997338, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22912598, "step": 6289, "time_per_iteration": 2.9749672412872314 }, { "auxiliary_loss_clip": 0.01492654, "auxiliary_loss_mlp": 0.01038132, "balance_loss_clip": 1.30865836, "balance_loss_mlp": 1.0149101, "epoch": 0.3781752592815271, "flos": 23123459543040.0, "grad_norm": 1.9083296580109173, "language_loss": 0.75318021, "learning_rate": 2.857502407441593e-06, "loss": 0.77848804, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.23217773, "step": 6290, "time_per_iteration": 2.930063486099243 }, { "auxiliary_loss_clip": 0.01511063, "auxiliary_loss_mlp": 0.01044292, "balance_loss_clip": 1.32038283, "balance_loss_mlp": 1.01928186, "epoch": 0.3782353825341951, "flos": 19765506574080.0, "grad_norm": 2.378515784087982, "language_loss": 0.81079257, "learning_rate": 2.8571505426982566e-06, "loss": 0.83634615, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.24987793, "step": 6291, "time_per_iteration": 2.836290121078491 }, { "auxiliary_loss_clip": 0.01498644, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.31072056, "balance_loss_mlp": 1.01639724, "epoch": 0.37829550578686305, "flos": 22060049569920.0, "grad_norm": 1.7275238160964965, "language_loss": 0.77105832, "learning_rate": 2.8567986454516854e-06, "loss": 0.79644704, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.23828125, "step": 6292, "time_per_iteration": 2.865901231765747 }, { "auxiliary_loss_clip": 0.01489913, "auxiliary_loss_mlp": 0.01044535, "balance_loss_clip": 1.30530286, "balance_loss_mlp": 1.02052605, "epoch": 0.378355629039531, "flos": 16478597220480.0, "grad_norm": 2.6144260879290506, "language_loss": 0.71318614, "learning_rate": 2.856446715715224e-06, "loss": 0.73853058, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.24023438, "step": 6293, "time_per_iteration": 2.8297388553619385 }, { "auxiliary_loss_clip": 0.01479412, "auxiliary_loss_mlp": 0.01041685, "balance_loss_clip": 1.29755437, "balance_loss_mlp": 1.0184747, "epoch": 0.378415752292199, "flos": 19984473999360.0, "grad_norm": 2.77373987588886, "language_loss": 0.71909189, "learning_rate": 2.8560947535022173e-06, "loss": 0.74430287, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.23217773, "step": 6294, "time_per_iteration": 2.8517003059387207 }, { "auxiliary_loss_clip": 0.01508409, "auxiliary_loss_mlp": 0.01039868, "balance_loss_clip": 1.31623173, "balance_loss_mlp": 1.01641965, "epoch": 0.378475875544867, "flos": 14655497489280.0, "grad_norm": 2.379615641316863, "language_loss": 0.84536791, "learning_rate": 2.855742758826011e-06, "loss": 0.87085068, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 1.92382812, "router_z_loss_mlp": 0.23449707, "step": 6295, "time_per_iteration": 2.9297478199005127 }, { "auxiliary_loss_clip": 0.01486794, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.30196357, "balance_loss_mlp": 1.0160898, "epoch": 0.37853599879753497, "flos": 26662166064000.0, "grad_norm": 1.851365068893925, "language_loss": 0.72114676, "learning_rate": 2.8553907316999547e-06, "loss": 0.74640375, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.22790527, "step": 6296, "time_per_iteration": 2.9938693046569824 }, { "auxiliary_loss_clip": 0.01471678, "auxiliary_loss_mlp": 0.01047245, "balance_loss_clip": 1.29386497, "balance_loss_mlp": 1.02290213, "epoch": 0.37859612205020293, "flos": 17321048997120.0, "grad_norm": 1.7775700553814144, "language_loss": 0.77849269, "learning_rate": 2.855038672137396e-06, "loss": 0.80368185, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.2434082, "step": 6297, "time_per_iteration": 2.8632941246032715 }, { "auxiliary_loss_clip": 0.01480756, "auxiliary_loss_mlp": 0.01047803, "balance_loss_clip": 1.296556, "balance_loss_mlp": 1.02416444, "epoch": 0.3786562453028709, "flos": 18228753054720.0, "grad_norm": 1.7659371238271087, "language_loss": 0.80089307, "learning_rate": 2.854686580151684e-06, "loss": 0.82617855, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.23632812, "step": 6298, "time_per_iteration": 2.848825454711914 }, { "auxiliary_loss_clip": 0.01476184, "auxiliary_loss_mlp": 0.01043691, "balance_loss_clip": 1.29573655, "balance_loss_mlp": 1.02135086, "epoch": 0.37871636855553886, "flos": 21224701226880.0, "grad_norm": 1.76346846229619, "language_loss": 0.85384357, "learning_rate": 2.8543344557561722e-06, "loss": 0.87904227, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22338867, "step": 6299, "time_per_iteration": 2.8439624309539795 }, { "auxiliary_loss_clip": 0.01489216, "auxiliary_loss_mlp": 0.0104883, "balance_loss_clip": 1.30369997, "balance_loss_mlp": 1.02470231, "epoch": 0.3787764918082068, "flos": 20960733208320.0, "grad_norm": 2.0882033785166025, "language_loss": 0.76774424, "learning_rate": 2.8539822989642116e-06, "loss": 0.79312468, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24121094, "step": 6300, "time_per_iteration": 2.8400354385375977 }, { "auxiliary_loss_clip": 0.01522195, "auxiliary_loss_mlp": 0.01053403, "balance_loss_clip": 1.33129478, "balance_loss_mlp": 1.02776134, "epoch": 0.3788366150608748, "flos": 17316072069120.0, "grad_norm": 1.9763783903963708, "language_loss": 0.83410978, "learning_rate": 2.8536301097891577e-06, "loss": 0.85986567, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.25646973, "step": 6301, "time_per_iteration": 2.8780391216278076 }, { "auxiliary_loss_clip": 0.0148689, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.30301666, "balance_loss_mlp": 1.02485788, "epoch": 0.37889673831354276, "flos": 24320405479680.0, "grad_norm": 1.846665985209703, "language_loss": 0.68111312, "learning_rate": 2.8532778882443636e-06, "loss": 0.70648146, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.25085449, "step": 6302, "time_per_iteration": 2.8968801498413086 }, { "auxiliary_loss_clip": 0.01486866, "auxiliary_loss_mlp": 0.01051439, "balance_loss_clip": 1.30341995, "balance_loss_mlp": 1.02704859, "epoch": 0.3789568615662107, "flos": 26693774196480.0, "grad_norm": 1.837946965443948, "language_loss": 0.693416, "learning_rate": 2.8529256343431867e-06, "loss": 0.718799, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.24401855, "step": 6303, "time_per_iteration": 2.9194233417510986 }, { "auxiliary_loss_clip": 0.01496156, "auxiliary_loss_mlp": 0.01049258, "balance_loss_clip": 1.3100872, "balance_loss_mlp": 1.02621496, "epoch": 0.3790169848188787, "flos": 23595310010880.0, "grad_norm": 1.6210615702942242, "language_loss": 0.78282309, "learning_rate": 2.8525733480989846e-06, "loss": 0.80827725, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23034668, "step": 6304, "time_per_iteration": 4.380652189254761 }, { "auxiliary_loss_clip": 0.01508922, "auxiliary_loss_mlp": 0.01052566, "balance_loss_clip": 1.32090342, "balance_loss_mlp": 1.02840185, "epoch": 0.37907710807154665, "flos": 18445367750400.0, "grad_norm": 2.3062887680939808, "language_loss": 0.81303567, "learning_rate": 2.8522210295251146e-06, "loss": 0.83865052, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.24169922, "step": 6305, "time_per_iteration": 2.8929603099823 }, { "auxiliary_loss_clip": 0.01301115, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.18528533, "balance_loss_mlp": 1.01330018, "epoch": 0.3791372313242146, "flos": 50132808092160.0, "grad_norm": 0.9779997288408186, "language_loss": 0.64549124, "learning_rate": 2.8518686786349387e-06, "loss": 0.66887194, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.23632812, "step": 6306, "time_per_iteration": 3.35331654548645 }, { "auxiliary_loss_clip": 0.01483497, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.29993582, "balance_loss_mlp": 1.02638268, "epoch": 0.3791973545768826, "flos": 24327282689280.0, "grad_norm": 1.6372744533336887, "language_loss": 0.74138665, "learning_rate": 2.851516295441817e-06, "loss": 0.76672882, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.24316406, "step": 6307, "time_per_iteration": 2.974104404449463 }, { "auxiliary_loss_clip": 0.01497025, "auxiliary_loss_mlp": 0.01046581, "balance_loss_clip": 1.31253469, "balance_loss_mlp": 1.02245307, "epoch": 0.3792574778295506, "flos": 21589873159680.0, "grad_norm": 1.5694221928728536, "language_loss": 0.78825802, "learning_rate": 2.851163879959112e-06, "loss": 0.81369412, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.24133301, "step": 6308, "time_per_iteration": 2.964582681655884 }, { "auxiliary_loss_clip": 0.01479545, "auxiliary_loss_mlp": 0.01045129, "balance_loss_clip": 1.29441297, "balance_loss_mlp": 1.02163315, "epoch": 0.37931760108221857, "flos": 22282772313600.0, "grad_norm": 2.151287477825613, "language_loss": 0.73140073, "learning_rate": 2.8508114322001876e-06, "loss": 0.75664753, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.23510742, "step": 6309, "time_per_iteration": 2.9911539554595947 }, { "auxiliary_loss_clip": 0.0148185, "auxiliary_loss_mlp": 0.01044617, "balance_loss_clip": 1.29965687, "balance_loss_mlp": 1.02135921, "epoch": 0.37937772433488653, "flos": 19692562677120.0, "grad_norm": 1.4037327349070574, "language_loss": 0.79247475, "learning_rate": 2.8504589521784083e-06, "loss": 0.81773943, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.23242188, "step": 6310, "time_per_iteration": 4.351290941238403 }, { "auxiliary_loss_clip": 0.0148257, "auxiliary_loss_mlp": 0.01043283, "balance_loss_clip": 1.29979253, "balance_loss_mlp": 1.01948869, "epoch": 0.3794378475875545, "flos": 19108830522240.0, "grad_norm": 1.7913933566749083, "language_loss": 0.77620691, "learning_rate": 2.8501064399071403e-06, "loss": 0.80146545, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.23791504, "step": 6311, "time_per_iteration": 4.268390655517578 }, { "auxiliary_loss_clip": 0.01476927, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.29531479, "balance_loss_mlp": 1.02009523, "epoch": 0.37949797084022246, "flos": 20349374463360.0, "grad_norm": 1.5414059300766683, "language_loss": 0.7104162, "learning_rate": 2.8497538953997504e-06, "loss": 0.73561263, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22607422, "step": 6312, "time_per_iteration": 2.8651866912841797 }, { "auxiliary_loss_clip": 0.01292442, "auxiliary_loss_mlp": 0.01030988, "balance_loss_clip": 1.17888284, "balance_loss_mlp": 1.00466657, "epoch": 0.37955809409289043, "flos": 64001559432960.0, "grad_norm": 0.7779416941232136, "language_loss": 0.56191075, "learning_rate": 2.849401318669608e-06, "loss": 0.58514506, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.26367188, "step": 6313, "time_per_iteration": 4.712616443634033 }, { "auxiliary_loss_clip": 0.01481284, "auxiliary_loss_mlp": 0.01043183, "balance_loss_clip": 1.2986573, "balance_loss_mlp": 1.01990175, "epoch": 0.3796182173455584, "flos": 31553252968320.0, "grad_norm": 2.69652168432199, "language_loss": 0.72442245, "learning_rate": 2.849048709730083e-06, "loss": 0.74966711, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.23266602, "step": 6314, "time_per_iteration": 2.931257486343384 }, { "auxiliary_loss_clip": 0.01485669, "auxiliary_loss_mlp": 0.01042285, "balance_loss_clip": 1.29875124, "balance_loss_mlp": 1.01882434, "epoch": 0.37967834059822636, "flos": 12138503218560.0, "grad_norm": 3.9492798174339825, "language_loss": 0.74301887, "learning_rate": 2.848696068594545e-06, "loss": 0.76829839, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23461914, "step": 6315, "time_per_iteration": 2.827350616455078 }, { "auxiliary_loss_clip": 0.0148523, "auxiliary_loss_mlp": 0.01043911, "balance_loss_clip": 1.30392504, "balance_loss_mlp": 1.020594, "epoch": 0.3797384638508943, "flos": 39363181626240.0, "grad_norm": 1.880523513036837, "language_loss": 0.71885109, "learning_rate": 2.8483433952763677e-06, "loss": 0.74414253, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.2331543, "step": 6316, "time_per_iteration": 3.0200612545013428 }, { "auxiliary_loss_clip": 0.01476502, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.29441857, "balance_loss_mlp": 1.01422024, "epoch": 0.3797985871035623, "flos": 34066175207040.0, "grad_norm": 1.8390991751123245, "language_loss": 0.66421998, "learning_rate": 2.847990689788923e-06, "loss": 0.68935299, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22570801, "step": 6317, "time_per_iteration": 2.9777965545654297 }, { "auxiliary_loss_clip": 0.01462691, "auxiliary_loss_mlp": 0.01038939, "balance_loss_clip": 1.28376007, "balance_loss_mlp": 1.01627719, "epoch": 0.37985871035623026, "flos": 23232671786880.0, "grad_norm": 2.6903485652199457, "language_loss": 0.87492704, "learning_rate": 2.8476379521455877e-06, "loss": 0.89994335, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22631836, "step": 6318, "time_per_iteration": 2.8897392749786377 }, { "auxiliary_loss_clip": 0.01475997, "auxiliary_loss_mlp": 0.01047835, "balance_loss_clip": 1.29273415, "balance_loss_mlp": 1.02508998, "epoch": 0.3799188336088982, "flos": 18124336759680.0, "grad_norm": 2.0673632539962887, "language_loss": 0.77809173, "learning_rate": 2.8472851823597354e-06, "loss": 0.80333006, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.22741699, "step": 6319, "time_per_iteration": 2.8912525177001953 }, { "auxiliary_loss_clip": 0.01473587, "auxiliary_loss_mlp": 0.01044529, "balance_loss_clip": 1.29311597, "balance_loss_mlp": 1.0221417, "epoch": 0.3799789568615662, "flos": 21881965461120.0, "grad_norm": 1.8209029079271601, "language_loss": 0.64310622, "learning_rate": 2.846932380444744e-06, "loss": 0.66828746, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.22387695, "step": 6320, "time_per_iteration": 2.887160062789917 }, { "auxiliary_loss_clip": 0.01460315, "auxiliary_loss_mlp": 0.01039148, "balance_loss_clip": 1.28033614, "balance_loss_mlp": 1.01665354, "epoch": 0.3800390801142342, "flos": 32975726826240.0, "grad_norm": 2.061216799892788, "language_loss": 0.72133195, "learning_rate": 2.846579546413992e-06, "loss": 0.74632657, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22485352, "step": 6321, "time_per_iteration": 2.987090587615967 }, { "auxiliary_loss_clip": 0.01478408, "auxiliary_loss_mlp": 0.01041925, "balance_loss_clip": 1.29300857, "balance_loss_mlp": 1.01783299, "epoch": 0.38009920336690217, "flos": 26918487711360.0, "grad_norm": 2.0401838505336145, "language_loss": 0.75894856, "learning_rate": 2.846226680280859e-06, "loss": 0.78415191, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24108887, "step": 6322, "time_per_iteration": 2.9191226959228516 }, { "auxiliary_loss_clip": 0.01468667, "auxiliary_loss_mlp": 0.01042628, "balance_loss_clip": 1.28786731, "balance_loss_mlp": 1.02075326, "epoch": 0.38015932661957014, "flos": 22498301134080.0, "grad_norm": 5.517411947282556, "language_loss": 0.86031461, "learning_rate": 2.845873782058725e-06, "loss": 0.88542753, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21875, "step": 6323, "time_per_iteration": 2.8930017948150635 }, { "auxiliary_loss_clip": 0.01468551, "auxiliary_loss_mlp": 0.010479, "balance_loss_clip": 1.28498495, "balance_loss_mlp": 1.023808, "epoch": 0.3802194498722381, "flos": 21990996725760.0, "grad_norm": 2.841858588498901, "language_loss": 0.73756182, "learning_rate": 2.845520851760973e-06, "loss": 0.76272631, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.2409668, "step": 6324, "time_per_iteration": 2.852933645248413 }, { "auxiliary_loss_clip": 0.01476405, "auxiliary_loss_mlp": 0.01041803, "balance_loss_clip": 1.29211807, "balance_loss_mlp": 1.01933169, "epoch": 0.38027957312490607, "flos": 21334863611520.0, "grad_norm": 1.7910219206002573, "language_loss": 0.85399032, "learning_rate": 2.8451678894009847e-06, "loss": 0.87917244, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22460938, "step": 6325, "time_per_iteration": 2.886261463165283 }, { "auxiliary_loss_clip": 0.01467986, "auxiliary_loss_mlp": 0.01044429, "balance_loss_clip": 1.28766572, "balance_loss_mlp": 1.02168345, "epoch": 0.38033969637757403, "flos": 16700053109760.0, "grad_norm": 1.6237585432195174, "language_loss": 0.80420876, "learning_rate": 2.8448148949921465e-06, "loss": 0.82933295, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.22753906, "step": 6326, "time_per_iteration": 2.8814303874969482 }, { "auxiliary_loss_clip": 0.01460226, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.28150141, "balance_loss_mlp": 1.02234352, "epoch": 0.380399819630242, "flos": 36224648530560.0, "grad_norm": 1.878557043865977, "language_loss": 0.74190414, "learning_rate": 2.844461868547842e-06, "loss": 0.76695931, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.22937012, "step": 6327, "time_per_iteration": 2.97017765045166 }, { "auxiliary_loss_clip": 0.01464805, "auxiliary_loss_mlp": 0.01048025, "balance_loss_clip": 1.28400671, "balance_loss_mlp": 1.02457643, "epoch": 0.38045994288290996, "flos": 21298957223040.0, "grad_norm": 1.4578241424633058, "language_loss": 0.8362661, "learning_rate": 2.844108810081459e-06, "loss": 0.86139441, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.23449707, "step": 6328, "time_per_iteration": 2.909496784210205 }, { "auxiliary_loss_clip": 0.01453916, "auxiliary_loss_mlp": 0.01047132, "balance_loss_clip": 1.27340674, "balance_loss_mlp": 1.02427959, "epoch": 0.38052006613557793, "flos": 20932608925440.0, "grad_norm": 2.160231016989018, "language_loss": 0.62428194, "learning_rate": 2.843755719606385e-06, "loss": 0.64929247, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.2286377, "step": 6329, "time_per_iteration": 2.859618902206421 }, { "auxiliary_loss_clip": 0.01463386, "auxiliary_loss_mlp": 0.01045756, "balance_loss_clip": 1.28376389, "balance_loss_mlp": 1.02326107, "epoch": 0.3805801893882459, "flos": 20999128060800.0, "grad_norm": 1.9589070342352326, "language_loss": 0.56691611, "learning_rate": 2.8434025971360104e-06, "loss": 0.59200746, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22497559, "step": 6330, "time_per_iteration": 2.896533489227295 }, { "auxiliary_loss_clip": 0.01448263, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.27367938, "balance_loss_mlp": 1.0186218, "epoch": 0.38064031264091386, "flos": 25570043625600.0, "grad_norm": 1.9569099947078032, "language_loss": 0.66493446, "learning_rate": 2.8430494426837243e-06, "loss": 0.68981028, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.20703125, "step": 6331, "time_per_iteration": 2.9298412799835205 }, { "auxiliary_loss_clip": 0.01473406, "auxiliary_loss_mlp": 0.01048753, "balance_loss_clip": 1.29232454, "balance_loss_mlp": 1.02569818, "epoch": 0.3807004358935818, "flos": 15094246746240.0, "grad_norm": 1.6508784327364194, "language_loss": 0.77519333, "learning_rate": 2.842696256262919e-06, "loss": 0.80041486, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.23046875, "step": 6332, "time_per_iteration": 2.8665952682495117 }, { "auxiliary_loss_clip": 0.01473626, "auxiliary_loss_mlp": 0.01047247, "balance_loss_clip": 1.29052782, "balance_loss_mlp": 1.02277291, "epoch": 0.3807605591462498, "flos": 16408141787520.0, "grad_norm": 1.7719742929967748, "language_loss": 0.82400548, "learning_rate": 2.842343037886987e-06, "loss": 0.8492142, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.24499512, "step": 6333, "time_per_iteration": 2.8203916549682617 }, { "auxiliary_loss_clip": 0.01462649, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.28264165, "balance_loss_mlp": 1.01827097, "epoch": 0.3808206823989178, "flos": 29068364522880.0, "grad_norm": 1.5720893709163775, "language_loss": 0.86965901, "learning_rate": 2.8419897875693226e-06, "loss": 0.89468753, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.21936035, "step": 6334, "time_per_iteration": 2.9206113815307617 }, { "auxiliary_loss_clip": 0.01464243, "auxiliary_loss_mlp": 0.01041918, "balance_loss_clip": 1.28375411, "balance_loss_mlp": 1.02038836, "epoch": 0.3808808056515858, "flos": 15714337737600.0, "grad_norm": 1.8470292245351432, "language_loss": 0.80183828, "learning_rate": 2.841636505323321e-06, "loss": 0.82689989, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.21520996, "step": 6335, "time_per_iteration": 2.8583874702453613 }, { "auxiliary_loss_clip": 0.01474972, "auxiliary_loss_mlp": 0.01043141, "balance_loss_clip": 1.29245949, "balance_loss_mlp": 1.01878679, "epoch": 0.38094092890425374, "flos": 20714410661760.0, "grad_norm": 2.4029274401277623, "language_loss": 0.73110998, "learning_rate": 2.8412831911623795e-06, "loss": 0.75629109, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.24353027, "step": 6336, "time_per_iteration": 2.844435930252075 }, { "auxiliary_loss_clip": 0.01466313, "auxiliary_loss_mlp": 0.01040899, "balance_loss_clip": 1.28693366, "balance_loss_mlp": 1.0199182, "epoch": 0.3810010521569217, "flos": 20677735111680.0, "grad_norm": 2.0467374791921062, "language_loss": 0.70442438, "learning_rate": 2.840929845099894e-06, "loss": 0.72949648, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.20983887, "step": 6337, "time_per_iteration": 2.8668885231018066 }, { "auxiliary_loss_clip": 0.01462534, "auxiliary_loss_mlp": 0.01037856, "balance_loss_clip": 1.28285766, "balance_loss_mlp": 1.01538563, "epoch": 0.38106117540958967, "flos": 31839010997760.0, "grad_norm": 2.492931007081314, "language_loss": 0.64546967, "learning_rate": 2.8405764671492652e-06, "loss": 0.67047358, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.22473145, "step": 6338, "time_per_iteration": 2.9405808448791504 }, { "auxiliary_loss_clip": 0.01481611, "auxiliary_loss_mlp": 0.01043117, "balance_loss_clip": 1.29696095, "balance_loss_mlp": 1.02045548, "epoch": 0.38112129866225763, "flos": 16909700106240.0, "grad_norm": 2.1070997019757636, "language_loss": 0.70236492, "learning_rate": 2.8402230573238923e-06, "loss": 0.7276122, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22644043, "step": 6339, "time_per_iteration": 4.353609085083008 }, { "auxiliary_loss_clip": 0.01461085, "auxiliary_loss_mlp": 0.01038311, "balance_loss_clip": 1.28233004, "balance_loss_mlp": 1.0164361, "epoch": 0.3811814219149256, "flos": 20897154984960.0, "grad_norm": 2.596607238801787, "language_loss": 0.69215888, "learning_rate": 2.839869615637177e-06, "loss": 0.71715283, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21875, "step": 6340, "time_per_iteration": 2.8762898445129395 }, { "auxiliary_loss_clip": 0.01466505, "auxiliary_loss_mlp": 0.01036916, "balance_loss_clip": 1.28342152, "balance_loss_mlp": 1.01477933, "epoch": 0.38124154516759357, "flos": 16699510172160.0, "grad_norm": 1.8116489750737765, "language_loss": 0.90523756, "learning_rate": 2.839516142102522e-06, "loss": 0.93027186, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22143555, "step": 6341, "time_per_iteration": 2.870163679122925 }, { "auxiliary_loss_clip": 0.01477731, "auxiliary_loss_mlp": 0.01040606, "balance_loss_clip": 1.29344785, "balance_loss_mlp": 1.01776552, "epoch": 0.38130166842026153, "flos": 19691250577920.0, "grad_norm": 1.6369134641032932, "language_loss": 0.75738603, "learning_rate": 2.83916263673333e-06, "loss": 0.78256941, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.22839355, "step": 6342, "time_per_iteration": 2.8650383949279785 }, { "auxiliary_loss_clip": 0.01460929, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.27946889, "balance_loss_mlp": 1.0130899, "epoch": 0.3813617916729295, "flos": 22208109114240.0, "grad_norm": 1.6826225159628223, "language_loss": 0.84288824, "learning_rate": 2.838809099543007e-06, "loss": 0.86784261, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.2142334, "step": 6343, "time_per_iteration": 2.8838319778442383 }, { "auxiliary_loss_clip": 0.01474394, "auxiliary_loss_mlp": 0.01041496, "balance_loss_clip": 1.29084575, "balance_loss_mlp": 1.01947832, "epoch": 0.38142191492559746, "flos": 19105708631040.0, "grad_norm": 1.6514449183137576, "language_loss": 0.78218079, "learning_rate": 2.838455530544959e-06, "loss": 0.80733967, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22009277, "step": 6344, "time_per_iteration": 2.835404872894287 }, { "auxiliary_loss_clip": 0.01465735, "auxiliary_loss_mlp": 0.010451, "balance_loss_clip": 1.28465557, "balance_loss_mlp": 1.02332056, "epoch": 0.3814820381782654, "flos": 24108450998400.0, "grad_norm": 2.0604302654532365, "language_loss": 0.74195874, "learning_rate": 2.838101929752593e-06, "loss": 0.76706713, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.21801758, "step": 6345, "time_per_iteration": 2.880025863647461 }, { "auxiliary_loss_clip": 0.01471216, "auxiliary_loss_mlp": 0.01045327, "balance_loss_clip": 1.29380429, "balance_loss_mlp": 1.02285624, "epoch": 0.3815421614309334, "flos": 15786919676160.0, "grad_norm": 2.154503130461109, "language_loss": 0.70894259, "learning_rate": 2.8377482971793187e-06, "loss": 0.73410797, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.22473145, "step": 6346, "time_per_iteration": 4.262303352355957 }, { "auxiliary_loss_clip": 0.01482377, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.29789686, "balance_loss_mlp": 1.01639438, "epoch": 0.38160228468360136, "flos": 19908815414400.0, "grad_norm": 2.043776113635593, "language_loss": 0.76333809, "learning_rate": 2.8373946328385437e-06, "loss": 0.78854531, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.21960449, "step": 6347, "time_per_iteration": 4.265958070755005 }, { "auxiliary_loss_clip": 0.01462471, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.2802335, "balance_loss_mlp": 1.01292467, "epoch": 0.3816624079362694, "flos": 19290353235840.0, "grad_norm": 1.5027810037054412, "language_loss": 0.75407803, "learning_rate": 2.8370409367436813e-06, "loss": 0.77904612, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.21398926, "step": 6348, "time_per_iteration": 4.3215861320495605 }, { "auxiliary_loss_clip": 0.01477944, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.292539, "balance_loss_mlp": 1.0164268, "epoch": 0.38172253118893734, "flos": 21187166025600.0, "grad_norm": 1.8083599871118465, "language_loss": 0.88211155, "learning_rate": 2.836687208908142e-06, "loss": 0.90726876, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.21350098, "step": 6349, "time_per_iteration": 2.8707215785980225 }, { "auxiliary_loss_clip": 0.01471971, "auxiliary_loss_mlp": 0.01043299, "balance_loss_clip": 1.28897691, "balance_loss_mlp": 1.02079201, "epoch": 0.3817826544416053, "flos": 17537935161600.0, "grad_norm": 2.0622007583519593, "language_loss": 0.77618313, "learning_rate": 2.836333449345341e-06, "loss": 0.80133581, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22485352, "step": 6350, "time_per_iteration": 2.9012835025787354 }, { "auxiliary_loss_clip": 0.01467086, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.28642046, "balance_loss_mlp": 1.01204348, "epoch": 0.38184277769427327, "flos": 16335288380160.0, "grad_norm": 2.420915241355584, "language_loss": 0.77453232, "learning_rate": 2.8359796580686907e-06, "loss": 0.79956108, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.23730469, "step": 6351, "time_per_iteration": 2.8255538940429688 }, { "auxiliary_loss_clip": 0.01475522, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.29215169, "balance_loss_mlp": 1.01927733, "epoch": 0.38190290094694124, "flos": 30455067726720.0, "grad_norm": 1.7615366536542985, "language_loss": 0.74813509, "learning_rate": 2.8356258350916085e-06, "loss": 0.77331388, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.23095703, "step": 6352, "time_per_iteration": 2.9712560176849365 }, { "auxiliary_loss_clip": 0.01468406, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.28897178, "balance_loss_mlp": 1.01396465, "epoch": 0.3819630241996092, "flos": 14218015086720.0, "grad_norm": 3.0205535118185796, "language_loss": 0.64851499, "learning_rate": 2.8352719804275104e-06, "loss": 0.67354131, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.20263672, "step": 6353, "time_per_iteration": 2.8374369144439697 }, { "auxiliary_loss_clip": 0.01475284, "auxiliary_loss_mlp": 0.01044414, "balance_loss_clip": 1.29307485, "balance_loss_mlp": 1.0191412, "epoch": 0.38202314745227717, "flos": 25020589046400.0, "grad_norm": 1.6824461027623239, "language_loss": 0.84072638, "learning_rate": 2.834918094089816e-06, "loss": 0.86592335, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.25280762, "step": 6354, "time_per_iteration": 2.897923707962036 }, { "auxiliary_loss_clip": 0.01462945, "auxiliary_loss_mlp": 0.01039474, "balance_loss_clip": 1.28588033, "balance_loss_mlp": 1.0182426, "epoch": 0.38208327070494513, "flos": 20824482556800.0, "grad_norm": 1.6516875772862567, "language_loss": 0.81248164, "learning_rate": 2.834564176091943e-06, "loss": 0.83750576, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.21252441, "step": 6355, "time_per_iteration": 2.885236978530884 }, { "auxiliary_loss_clip": 0.01473151, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.29184294, "balance_loss_mlp": 1.02081203, "epoch": 0.3821433939576131, "flos": 22647899001600.0, "grad_norm": 1.8765918338451968, "language_loss": 0.76048672, "learning_rate": 2.8342102264473125e-06, "loss": 0.78563964, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21337891, "step": 6356, "time_per_iteration": 2.8902339935302734 }, { "auxiliary_loss_clip": 0.01475655, "auxiliary_loss_mlp": 0.01041776, "balance_loss_clip": 1.293257, "balance_loss_mlp": 1.0205332, "epoch": 0.38220351721028106, "flos": 26881088244480.0, "grad_norm": 1.7237475732783842, "language_loss": 0.82314515, "learning_rate": 2.833856245169348e-06, "loss": 0.84831941, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.21252441, "step": 6357, "time_per_iteration": 2.9008283615112305 }, { "auxiliary_loss_clip": 0.01486109, "auxiliary_loss_mlp": 0.01038336, "balance_loss_clip": 1.30160284, "balance_loss_mlp": 1.01539969, "epoch": 0.38226364046294903, "flos": 23377835664000.0, "grad_norm": 1.6804313389302816, "language_loss": 0.78944242, "learning_rate": 2.8335022322714695e-06, "loss": 0.81468689, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.22912598, "step": 6358, "time_per_iteration": 2.863264799118042 }, { "auxiliary_loss_clip": 0.01489531, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.3028338, "balance_loss_mlp": 1.01657534, "epoch": 0.382323763715617, "flos": 19655706147840.0, "grad_norm": 8.292777523418867, "language_loss": 0.79915798, "learning_rate": 2.8331481877671036e-06, "loss": 0.82443398, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21496582, "step": 6359, "time_per_iteration": 2.905219078063965 }, { "auxiliary_loss_clip": 0.01474437, "auxiliary_loss_mlp": 0.01037165, "balance_loss_clip": 1.2943716, "balance_loss_mlp": 1.01486135, "epoch": 0.38238388696828496, "flos": 54143030384640.0, "grad_norm": 1.81840829472201, "language_loss": 0.70293808, "learning_rate": 2.8327941116696754e-06, "loss": 0.72805411, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.22302246, "step": 6360, "time_per_iteration": 3.1570260524749756 }, { "auxiliary_loss_clip": 0.01473441, "auxiliary_loss_mlp": 0.01035378, "balance_loss_clip": 1.2932837, "balance_loss_mlp": 1.01455212, "epoch": 0.382444010220953, "flos": 24946921232640.0, "grad_norm": 1.811804490322789, "language_loss": 0.79069668, "learning_rate": 2.83244000399261e-06, "loss": 0.81578487, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.20849609, "step": 6361, "time_per_iteration": 2.950392723083496 }, { "auxiliary_loss_clip": 0.01456551, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.27816916, "balance_loss_mlp": 1.01389229, "epoch": 0.38250413347362094, "flos": 42351935875200.0, "grad_norm": 2.0255515638387136, "language_loss": 0.65992773, "learning_rate": 2.832085864749337e-06, "loss": 0.68483651, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.2043457, "step": 6362, "time_per_iteration": 3.0583200454711914 }, { "auxiliary_loss_clip": 0.01489736, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.30497468, "balance_loss_mlp": 1.01837277, "epoch": 0.3825642567262889, "flos": 16297753178880.0, "grad_norm": 1.680242374311346, "language_loss": 0.82882106, "learning_rate": 2.8317316939532848e-06, "loss": 0.85412216, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.21984863, "step": 6363, "time_per_iteration": 2.8550002574920654 }, { "auxiliary_loss_clip": 0.01475338, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.29662347, "balance_loss_mlp": 1.02053392, "epoch": 0.3826243799789569, "flos": 45669503220480.0, "grad_norm": 1.653008496440779, "language_loss": 0.59863764, "learning_rate": 2.8313774916178825e-06, "loss": 0.62381762, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.22131348, "step": 6364, "time_per_iteration": 3.1291422843933105 }, { "auxiliary_loss_clip": 0.01492701, "auxiliary_loss_mlp": 0.0104136, "balance_loss_clip": 1.30532014, "balance_loss_mlp": 1.0212729, "epoch": 0.38268450323162484, "flos": 25312545613440.0, "grad_norm": 1.9906214832280595, "language_loss": 0.70165265, "learning_rate": 2.8310232577565635e-06, "loss": 0.72699326, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.20080566, "step": 6365, "time_per_iteration": 2.9093146324157715 }, { "auxiliary_loss_clip": 0.01495504, "auxiliary_loss_mlp": 0.01039124, "balance_loss_clip": 1.30513847, "balance_loss_mlp": 1.01730847, "epoch": 0.3827446264842928, "flos": 21846240051840.0, "grad_norm": 1.7433582491962958, "language_loss": 0.74365854, "learning_rate": 2.830668992382758e-06, "loss": 0.76900476, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.21801758, "step": 6366, "time_per_iteration": 2.8879058361053467 }, { "auxiliary_loss_clip": 0.01491589, "auxiliary_loss_mlp": 0.01042596, "balance_loss_clip": 1.30520916, "balance_loss_mlp": 1.02169871, "epoch": 0.38280474973696077, "flos": 25744417660800.0, "grad_norm": 2.0890637451863263, "language_loss": 0.69884437, "learning_rate": 2.830314695509902e-06, "loss": 0.72418618, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.2088623, "step": 6367, "time_per_iteration": 2.9425206184387207 }, { "auxiliary_loss_clip": 0.01462811, "auxiliary_loss_mlp": 0.01036461, "balance_loss_clip": 1.28536391, "balance_loss_mlp": 1.01505113, "epoch": 0.38286487298962874, "flos": 24905675957760.0, "grad_norm": 2.17103859511807, "language_loss": 0.65749955, "learning_rate": 2.82996036715143e-06, "loss": 0.68249226, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.21411133, "step": 6368, "time_per_iteration": 2.8961455821990967 }, { "auxiliary_loss_clip": 0.01474303, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.2930057, "balance_loss_mlp": 1.02289832, "epoch": 0.3829249962422967, "flos": 28554182904960.0, "grad_norm": 1.3500827258370098, "language_loss": 0.68742144, "learning_rate": 2.8296060073207763e-06, "loss": 0.71260566, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21228027, "step": 6369, "time_per_iteration": 2.949284791946411 }, { "auxiliary_loss_clip": 0.0147638, "auxiliary_loss_mlp": 0.01038137, "balance_loss_clip": 1.29476738, "balance_loss_mlp": 1.01698923, "epoch": 0.38298511949496467, "flos": 21481339587840.0, "grad_norm": 2.405714402298814, "language_loss": 0.78738487, "learning_rate": 2.8292516160313804e-06, "loss": 0.81253004, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21130371, "step": 6370, "time_per_iteration": 2.885477066040039 }, { "auxiliary_loss_clip": 0.01488802, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.30485296, "balance_loss_mlp": 1.01608479, "epoch": 0.38304524274763263, "flos": 31690906208640.0, "grad_norm": 3.775614122888807, "language_loss": 0.65018928, "learning_rate": 2.8288971932966805e-06, "loss": 0.67544746, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.20922852, "step": 6371, "time_per_iteration": 2.9398391246795654 }, { "auxiliary_loss_clip": 0.01503975, "auxiliary_loss_mlp": 0.01043041, "balance_loss_clip": 1.31298292, "balance_loss_mlp": 1.02054596, "epoch": 0.3831053660003006, "flos": 25086203285760.0, "grad_norm": 1.8248677146460808, "language_loss": 0.73462999, "learning_rate": 2.8285427391301155e-06, "loss": 0.76010013, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 1.91113281, "router_z_loss_mlp": 0.22485352, "step": 6372, "time_per_iteration": 2.884978771209717 }, { "auxiliary_loss_clip": 0.01485398, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.298226, "balance_loss_mlp": 1.01620245, "epoch": 0.38316548925296856, "flos": 23269618805760.0, "grad_norm": 1.9728709977103525, "language_loss": 0.85973251, "learning_rate": 2.8281882535451266e-06, "loss": 0.88495398, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.20544434, "step": 6373, "time_per_iteration": 2.8574302196502686 }, { "auxiliary_loss_clip": 0.01482395, "auxiliary_loss_mlp": 0.01039765, "balance_loss_clip": 1.29651833, "balance_loss_mlp": 1.01854563, "epoch": 0.3832256125056366, "flos": 34436912250240.0, "grad_norm": 5.050226645035196, "language_loss": 0.75684094, "learning_rate": 2.8278337365551567e-06, "loss": 0.78206253, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.21228027, "step": 6374, "time_per_iteration": 4.462501764297485 }, { "auxiliary_loss_clip": 0.01480882, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.29431736, "balance_loss_mlp": 1.01848757, "epoch": 0.38328573575830455, "flos": 21772888951680.0, "grad_norm": 2.67722952043786, "language_loss": 0.76968169, "learning_rate": 2.8274791881736485e-06, "loss": 0.79488021, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.20471191, "step": 6375, "time_per_iteration": 2.8899521827697754 }, { "auxiliary_loss_clip": 0.01488062, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.3023392, "balance_loss_mlp": 1.01739478, "epoch": 0.3833458590109725, "flos": 17387839601280.0, "grad_norm": 2.08649993253791, "language_loss": 0.74226201, "learning_rate": 2.8271246084140457e-06, "loss": 0.76752937, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.21276855, "step": 6376, "time_per_iteration": 2.8692827224731445 }, { "auxiliary_loss_clip": 0.01483432, "auxiliary_loss_mlp": 0.01040467, "balance_loss_clip": 1.30110979, "balance_loss_mlp": 1.01798415, "epoch": 0.3834059822636405, "flos": 29436613102080.0, "grad_norm": 2.02066135701997, "language_loss": 0.68934357, "learning_rate": 2.826769997289796e-06, "loss": 0.71458256, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22473145, "step": 6377, "time_per_iteration": 2.9613442420959473 }, { "auxiliary_loss_clip": 0.01495446, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.30729866, "balance_loss_mlp": 1.01460958, "epoch": 0.38346610551630844, "flos": 21480751405440.0, "grad_norm": 2.1675152170038685, "language_loss": 0.74037856, "learning_rate": 2.826415354814344e-06, "loss": 0.765692, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.21289062, "step": 6378, "time_per_iteration": 2.869659185409546 }, { "auxiliary_loss_clip": 0.01484316, "auxiliary_loss_mlp": 0.01043577, "balance_loss_clip": 1.29913902, "balance_loss_mlp": 1.02222693, "epoch": 0.3835262287689764, "flos": 27572132361600.0, "grad_norm": 1.7491966164074595, "language_loss": 0.69906712, "learning_rate": 2.8260606810011396e-06, "loss": 0.72434604, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21362305, "step": 6379, "time_per_iteration": 3.0107345581054688 }, { "auxiliary_loss_clip": 0.01470977, "auxiliary_loss_mlp": 0.01039392, "balance_loss_clip": 1.29042792, "balance_loss_mlp": 1.01872087, "epoch": 0.3835863520216444, "flos": 15532226841600.0, "grad_norm": 1.9180832646828803, "language_loss": 0.83871889, "learning_rate": 2.8257059758636315e-06, "loss": 0.86382258, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.20678711, "step": 6380, "time_per_iteration": 2.867462158203125 }, { "auxiliary_loss_clip": 0.01481472, "auxiliary_loss_mlp": 0.01038301, "balance_loss_clip": 1.29960668, "balance_loss_mlp": 1.01716506, "epoch": 0.38364647527431234, "flos": 21914116531200.0, "grad_norm": 1.5021757788203036, "language_loss": 0.81776273, "learning_rate": 2.8253512394152697e-06, "loss": 0.84296048, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21142578, "step": 6381, "time_per_iteration": 4.306679010391235 }, { "auxiliary_loss_clip": 0.01277233, "auxiliary_loss_mlp": 0.01018558, "balance_loss_clip": 1.17606103, "balance_loss_mlp": 1.00196409, "epoch": 0.3837065985269803, "flos": 65563089120000.0, "grad_norm": 0.8005831697450573, "language_loss": 0.60620368, "learning_rate": 2.8249964716695068e-06, "loss": 0.6291616, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.16601562, "step": 6382, "time_per_iteration": 4.752147674560547 }, { "auxiliary_loss_clip": 0.0149424, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.30550528, "balance_loss_mlp": 1.01624417, "epoch": 0.38376672177964827, "flos": 28268243896320.0, "grad_norm": 16.06141120732858, "language_loss": 0.68488938, "learning_rate": 2.824641672639794e-06, "loss": 0.71020842, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.21411133, "step": 6383, "time_per_iteration": 4.2904627323150635 }, { "auxiliary_loss_clip": 0.01494766, "auxiliary_loss_mlp": 0.01042895, "balance_loss_clip": 1.30922294, "balance_loss_mlp": 1.02147305, "epoch": 0.38382684503231623, "flos": 20641104806400.0, "grad_norm": 1.7034770266698918, "language_loss": 0.74829996, "learning_rate": 2.824286842339587e-06, "loss": 0.77367663, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.21411133, "step": 6384, "time_per_iteration": 2.8940584659576416 }, { "auxiliary_loss_clip": 0.01486878, "auxiliary_loss_mlp": 0.01041339, "balance_loss_clip": 1.30405641, "balance_loss_mlp": 1.02042973, "epoch": 0.3838869682849842, "flos": 19614279893760.0, "grad_norm": 1.3806273808939558, "language_loss": 0.76592529, "learning_rate": 2.823931980782341e-06, "loss": 0.79120749, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.20898438, "step": 6385, "time_per_iteration": 2.9040303230285645 }, { "auxiliary_loss_clip": 0.01272182, "auxiliary_loss_mlp": 0.01021681, "balance_loss_clip": 1.17353058, "balance_loss_mlp": 1.00461042, "epoch": 0.38394709153765216, "flos": 56581081182720.0, "grad_norm": 0.8932560006944583, "language_loss": 0.67099303, "learning_rate": 2.82357708798151e-06, "loss": 0.6939317, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.17089844, "step": 6386, "time_per_iteration": 3.2035484313964844 }, { "auxiliary_loss_clip": 0.0147972, "auxiliary_loss_mlp": 0.01040199, "balance_loss_clip": 1.29776537, "balance_loss_mlp": 1.01983857, "epoch": 0.3840072147903202, "flos": 15897398774400.0, "grad_norm": 1.7286873706384767, "language_loss": 0.73764527, "learning_rate": 2.8232221639505547e-06, "loss": 0.76284444, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.20373535, "step": 6387, "time_per_iteration": 2.8605823516845703 }, { "auxiliary_loss_clip": 0.01464733, "auxiliary_loss_mlp": 0.01048822, "balance_loss_clip": 1.28585386, "balance_loss_mlp": 1.02726889, "epoch": 0.38406733804298815, "flos": 28229125127040.0, "grad_norm": 1.5945548895896784, "language_loss": 0.8176989, "learning_rate": 2.822867208702932e-06, "loss": 0.84283447, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.21557617, "step": 6388, "time_per_iteration": 2.9260332584381104 }, { "auxiliary_loss_clip": 0.01470352, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.28783333, "balance_loss_mlp": 1.02635086, "epoch": 0.3841274612956561, "flos": 18232779841920.0, "grad_norm": 2.015058797881041, "language_loss": 0.77203172, "learning_rate": 2.8225122222521026e-06, "loss": 0.79720128, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.20263672, "step": 6389, "time_per_iteration": 2.9214928150177 }, { "auxiliary_loss_clip": 0.0149465, "auxiliary_loss_mlp": 0.01048808, "balance_loss_clip": 1.30683494, "balance_loss_mlp": 1.02674246, "epoch": 0.3841875845483241, "flos": 19802589327360.0, "grad_norm": 3.0266721421039446, "language_loss": 0.77224058, "learning_rate": 2.8221572046115273e-06, "loss": 0.79767513, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.22058105, "step": 6390, "time_per_iteration": 2.8680434226989746 }, { "auxiliary_loss_clip": 0.01492606, "auxiliary_loss_mlp": 0.01050198, "balance_loss_clip": 1.30423748, "balance_loss_mlp": 1.02813232, "epoch": 0.38424770780099204, "flos": 29910997278720.0, "grad_norm": 1.5965653904454025, "language_loss": 0.70734018, "learning_rate": 2.821802155794668e-06, "loss": 0.73276818, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.2208252, "step": 6391, "time_per_iteration": 2.9442131519317627 }, { "auxiliary_loss_clip": 0.01487042, "auxiliary_loss_mlp": 0.01047873, "balance_loss_clip": 1.30125988, "balance_loss_mlp": 1.02647495, "epoch": 0.38430783105366, "flos": 20823487171200.0, "grad_norm": 2.21306105392205, "language_loss": 0.84963465, "learning_rate": 2.8214470758149884e-06, "loss": 0.87498373, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.21398926, "step": 6392, "time_per_iteration": 2.8224666118621826 }, { "auxiliary_loss_clip": 0.01486384, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.3004477, "balance_loss_mlp": 1.0218755, "epoch": 0.384367954306328, "flos": 11005678442880.0, "grad_norm": 2.16649539158351, "language_loss": 0.6267308, "learning_rate": 2.8210919646859536e-06, "loss": 0.65202504, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.21166992, "step": 6393, "time_per_iteration": 2.863619804382324 }, { "auxiliary_loss_clip": 0.01495852, "auxiliary_loss_mlp": 0.01043709, "balance_loss_clip": 1.30570877, "balance_loss_mlp": 1.02232325, "epoch": 0.38442807755899594, "flos": 25348994939520.0, "grad_norm": 3.347236496506489, "language_loss": 0.72067714, "learning_rate": 2.820736822421029e-06, "loss": 0.74607271, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.21398926, "step": 6394, "time_per_iteration": 2.871183156967163 }, { "auxiliary_loss_clip": 0.01495665, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.30557239, "balance_loss_mlp": 1.02367949, "epoch": 0.3844882008116639, "flos": 21079763573760.0, "grad_norm": 2.2096223223753033, "language_loss": 0.82889557, "learning_rate": 2.8203816490336822e-06, "loss": 0.85430849, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.21960449, "step": 6395, "time_per_iteration": 2.865556001663208 }, { "auxiliary_loss_clip": 0.01484459, "auxiliary_loss_mlp": 0.01042022, "balance_loss_clip": 1.29866266, "balance_loss_mlp": 1.02077854, "epoch": 0.38454832406433187, "flos": 17971074063360.0, "grad_norm": 2.4723800507032534, "language_loss": 0.71483266, "learning_rate": 2.8200264445373813e-06, "loss": 0.7400974, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.21240234, "step": 6396, "time_per_iteration": 2.826753616333008 }, { "auxiliary_loss_clip": 0.01264789, "auxiliary_loss_mlp": 0.01038708, "balance_loss_clip": 1.16112924, "balance_loss_mlp": 1.01820374, "epoch": 0.38460844731699984, "flos": 67958582544000.0, "grad_norm": 0.905978046724653, "language_loss": 0.59718275, "learning_rate": 2.8196712089455954e-06, "loss": 0.62021774, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.20507812, "step": 6397, "time_per_iteration": 3.411128282546997 }, { "auxiliary_loss_clip": 0.01473044, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.2935921, "balance_loss_mlp": 1.01874804, "epoch": 0.3846685705696678, "flos": 25860235645440.0, "grad_norm": 1.7890644238007474, "language_loss": 0.85283506, "learning_rate": 2.819315942271794e-06, "loss": 0.87797362, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22094727, "step": 6398, "time_per_iteration": 2.905503988265991 }, { "auxiliary_loss_clip": 0.01474371, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.29311275, "balance_loss_mlp": 1.01662087, "epoch": 0.38472869382233577, "flos": 16298974788480.0, "grad_norm": 2.310695778619935, "language_loss": 0.80692118, "learning_rate": 2.8189606445294515e-06, "loss": 0.83204186, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.21081543, "step": 6399, "time_per_iteration": 2.8995730876922607 }, { "auxiliary_loss_clip": 0.01479613, "auxiliary_loss_mlp": 0.01036508, "balance_loss_clip": 1.29436743, "balance_loss_mlp": 1.01423943, "epoch": 0.38478881707500373, "flos": 19362211257600.0, "grad_norm": 1.7904550922867954, "language_loss": 0.68989146, "learning_rate": 2.818605315732038e-06, "loss": 0.7150526, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22253418, "step": 6400, "time_per_iteration": 2.8898890018463135 }, { "auxiliary_loss_clip": 0.01499457, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.31277633, "balance_loss_mlp": 1.01925778, "epoch": 0.38484894032767175, "flos": 24870945934080.0, "grad_norm": 1.924636558672334, "language_loss": 0.73872423, "learning_rate": 2.81824995589303e-06, "loss": 0.76412624, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.21484375, "step": 6401, "time_per_iteration": 2.943347454071045 }, { "auxiliary_loss_clip": 0.01475964, "auxiliary_loss_mlp": 0.01038858, "balance_loss_clip": 1.28969836, "balance_loss_mlp": 1.01706636, "epoch": 0.3849090635803397, "flos": 14509971653760.0, "grad_norm": 2.2330143138180767, "language_loss": 0.73316121, "learning_rate": 2.8178945650259012e-06, "loss": 0.75830942, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.21801758, "step": 6402, "time_per_iteration": 2.840742826461792 }, { "auxiliary_loss_clip": 0.01470295, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.28916001, "balance_loss_mlp": 1.01408958, "epoch": 0.3849691868330077, "flos": 18525234101760.0, "grad_norm": 2.0501697592489925, "language_loss": 0.84400964, "learning_rate": 2.817539143144128e-06, "loss": 0.86905485, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.20117188, "step": 6403, "time_per_iteration": 2.828883171081543 }, { "auxiliary_loss_clip": 0.0146918, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.2881546, "balance_loss_mlp": 1.01687443, "epoch": 0.38502931008567565, "flos": 21626367730560.0, "grad_norm": 2.0556815213181987, "language_loss": 0.83827353, "learning_rate": 2.817183690261189e-06, "loss": 0.86334491, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.21081543, "step": 6404, "time_per_iteration": 2.8430607318878174 }, { "auxiliary_loss_clip": 0.01477119, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.29161692, "balance_loss_mlp": 1.01365066, "epoch": 0.3850894333383436, "flos": 25427458702080.0, "grad_norm": 1.6604351362397647, "language_loss": 0.70625466, "learning_rate": 2.816828206390563e-06, "loss": 0.73137617, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21374512, "step": 6405, "time_per_iteration": 2.957427978515625 }, { "auxiliary_loss_clip": 0.01465562, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.28691435, "balance_loss_mlp": 1.01422071, "epoch": 0.3851495565910116, "flos": 20236949838720.0, "grad_norm": 7.750699712934738, "language_loss": 0.80087835, "learning_rate": 2.816472691545729e-06, "loss": 0.82588524, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.20898438, "step": 6406, "time_per_iteration": 2.8318138122558594 }, { "auxiliary_loss_clip": 0.0147823, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.29527712, "balance_loss_mlp": 1.01748812, "epoch": 0.38520967984367954, "flos": 16517127807360.0, "grad_norm": 2.2339889523404746, "language_loss": 0.8527959, "learning_rate": 2.8161171457401694e-06, "loss": 0.87796593, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21264648, "step": 6407, "time_per_iteration": 2.796774387359619 }, { "auxiliary_loss_clip": 0.01273419, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.16713905, "balance_loss_mlp": 1.01327932, "epoch": 0.3852698030963475, "flos": 61343744785920.0, "grad_norm": 0.8711012872483209, "language_loss": 0.65034127, "learning_rate": 2.815761568987365e-06, "loss": 0.67338943, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.18164062, "step": 6408, "time_per_iteration": 3.391498327255249 }, { "auxiliary_loss_clip": 0.01465939, "auxiliary_loss_mlp": 0.01041551, "balance_loss_clip": 1.28284502, "balance_loss_mlp": 1.02005732, "epoch": 0.3853299263490155, "flos": 22903225263360.0, "grad_norm": 1.5121160488118142, "language_loss": 0.74127185, "learning_rate": 2.8154059613008e-06, "loss": 0.76634675, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.21496582, "step": 6409, "time_per_iteration": 4.253864765167236 }, { "auxiliary_loss_clip": 0.01496298, "auxiliary_loss_mlp": 0.01041802, "balance_loss_clip": 1.30494666, "balance_loss_mlp": 1.01987982, "epoch": 0.38539004960168344, "flos": 20057055937920.0, "grad_norm": 3.029212081743284, "language_loss": 0.72221071, "learning_rate": 2.81505032269396e-06, "loss": 0.74759173, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.21936035, "step": 6410, "time_per_iteration": 2.837019205093384 }, { "auxiliary_loss_clip": 0.01268011, "auxiliary_loss_mlp": 0.01025794, "balance_loss_clip": 1.16594756, "balance_loss_mlp": 1.00881898, "epoch": 0.3854501728543514, "flos": 68765263666560.0, "grad_norm": 0.6853781857179113, "language_loss": 0.60369277, "learning_rate": 2.81469465318033e-06, "loss": 0.6266309, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.16992188, "step": 6411, "time_per_iteration": 3.4090776443481445 }, { "auxiliary_loss_clip": 0.01481759, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.29769957, "balance_loss_mlp": 1.01555717, "epoch": 0.38551029610701937, "flos": 20494628830080.0, "grad_norm": 1.8030010635662714, "language_loss": 0.78187984, "learning_rate": 2.814338952773397e-06, "loss": 0.80706191, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.20874023, "step": 6412, "time_per_iteration": 2.851182222366333 }, { "auxiliary_loss_clip": 0.01493418, "auxiliary_loss_mlp": 0.01041505, "balance_loss_clip": 1.30618358, "balance_loss_mlp": 1.01833081, "epoch": 0.38557041935968733, "flos": 23480849370240.0, "grad_norm": 1.741526109935132, "language_loss": 0.78794944, "learning_rate": 2.8139832214866493e-06, "loss": 0.81329864, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.23156738, "step": 6413, "time_per_iteration": 2.8615612983703613 }, { "auxiliary_loss_clip": 0.01268687, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.1639086, "balance_loss_mlp": 1.0054673, "epoch": 0.38563054261235535, "flos": 63994546488960.0, "grad_norm": 0.808082180624448, "language_loss": 0.61353862, "learning_rate": 2.813627459333576e-06, "loss": 0.63651657, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.23632812, "step": 6414, "time_per_iteration": 3.15305495262146 }, { "auxiliary_loss_clip": 0.01497018, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.31028819, "balance_loss_mlp": 1.01769578, "epoch": 0.3856906658650233, "flos": 23998333858560.0, "grad_norm": 1.9997209864550294, "language_loss": 0.78261054, "learning_rate": 2.8132716663276685e-06, "loss": 0.80796891, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21142578, "step": 6415, "time_per_iteration": 2.8667171001434326 }, { "auxiliary_loss_clip": 0.01472682, "auxiliary_loss_mlp": 0.01032647, "balance_loss_clip": 1.29573774, "balance_loss_mlp": 1.01281011, "epoch": 0.3857507891176913, "flos": 25017738624000.0, "grad_norm": 1.6405510485679282, "language_loss": 0.80620372, "learning_rate": 2.8129158424824173e-06, "loss": 0.8312571, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.19836426, "step": 6416, "time_per_iteration": 5.788764238357544 }, { "auxiliary_loss_clip": 0.01470687, "auxiliary_loss_mlp": 0.01039661, "balance_loss_clip": 1.28910518, "balance_loss_mlp": 1.01945448, "epoch": 0.38581091237035925, "flos": 21545008300800.0, "grad_norm": 2.1913133322661817, "language_loss": 0.80297852, "learning_rate": 2.8125599878113155e-06, "loss": 0.82808197, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20202637, "step": 6417, "time_per_iteration": 2.8774495124816895 }, { "auxiliary_loss_clip": 0.01475952, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.29567242, "balance_loss_mlp": 1.02829742, "epoch": 0.3858710356230272, "flos": 17392726039680.0, "grad_norm": 2.3703855328507473, "language_loss": 0.81162536, "learning_rate": 2.8122041023278583e-06, "loss": 0.83687127, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.20349121, "step": 6418, "time_per_iteration": 2.8629744052886963 }, { "auxiliary_loss_clip": 0.01467859, "auxiliary_loss_mlp": 0.0104244, "balance_loss_clip": 1.28882205, "balance_loss_mlp": 1.02260363, "epoch": 0.3859311588756952, "flos": 20349283973760.0, "grad_norm": 1.7139350076482205, "language_loss": 0.811517, "learning_rate": 2.8118481860455407e-06, "loss": 0.83662003, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.19848633, "step": 6419, "time_per_iteration": 4.2436363697052 }, { "auxiliary_loss_clip": 0.01467196, "auxiliary_loss_mlp": 0.01041701, "balance_loss_clip": 1.28735328, "balance_loss_mlp": 1.02004123, "epoch": 0.38599128212836314, "flos": 26331859889280.0, "grad_norm": 2.4743976699248873, "language_loss": 0.68580019, "learning_rate": 2.8114922389778573e-06, "loss": 0.71088916, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.21655273, "step": 6420, "time_per_iteration": 2.894181728363037 }, { "auxiliary_loss_clip": 0.01467594, "auxiliary_loss_mlp": 0.01052363, "balance_loss_clip": 1.29165626, "balance_loss_mlp": 1.03256202, "epoch": 0.3860514053810311, "flos": 13561927217280.0, "grad_norm": 1.8505806163753193, "language_loss": 0.81345087, "learning_rate": 2.8111362611383076e-06, "loss": 0.83865047, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.19812012, "step": 6421, "time_per_iteration": 2.83227801322937 }, { "auxiliary_loss_clip": 0.01472555, "auxiliary_loss_mlp": 0.01043713, "balance_loss_clip": 1.29076171, "balance_loss_mlp": 1.02248192, "epoch": 0.3861115286336991, "flos": 20962678734720.0, "grad_norm": 2.1295929851740936, "language_loss": 0.73341775, "learning_rate": 2.8107802525403886e-06, "loss": 0.75858039, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.21240234, "step": 6422, "time_per_iteration": 2.8425204753875732 }, { "auxiliary_loss_clip": 0.01447738, "auxiliary_loss_mlp": 0.01042561, "balance_loss_clip": 1.27344131, "balance_loss_mlp": 1.02076983, "epoch": 0.38617165188636704, "flos": 16371375747840.0, "grad_norm": 1.6019010590450193, "language_loss": 0.6753881, "learning_rate": 2.8104242131976025e-06, "loss": 0.70029116, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21801758, "step": 6423, "time_per_iteration": 2.968427896499634 }, { "auxiliary_loss_clip": 0.01483089, "auxiliary_loss_mlp": 0.01046953, "balance_loss_clip": 1.3000555, "balance_loss_mlp": 1.02613902, "epoch": 0.386231775139035, "flos": 34800545859840.0, "grad_norm": 2.108289612882626, "language_loss": 0.70426214, "learning_rate": 2.810068143123449e-06, "loss": 0.72956258, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.20800781, "step": 6424, "time_per_iteration": 2.97904896736145 }, { "auxiliary_loss_clip": 0.0146359, "auxiliary_loss_mlp": 0.01046369, "balance_loss_clip": 1.28626776, "balance_loss_mlp": 1.02476835, "epoch": 0.38629189839170297, "flos": 21736258646400.0, "grad_norm": 1.5996756437961153, "language_loss": 0.73315668, "learning_rate": 2.809712042331429e-06, "loss": 0.75825632, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.21618652, "step": 6425, "time_per_iteration": 2.872779607772827 }, { "auxiliary_loss_clip": 0.01482738, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.29841113, "balance_loss_mlp": 1.02671039, "epoch": 0.38635202164437094, "flos": 27934001424000.0, "grad_norm": 2.1870348304321117, "language_loss": 0.81823647, "learning_rate": 2.8093559108350484e-06, "loss": 0.84354043, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.20947266, "step": 6426, "time_per_iteration": 2.888880729675293 }, { "auxiliary_loss_clip": 0.01478724, "auxiliary_loss_mlp": 0.01049546, "balance_loss_clip": 1.29671085, "balance_loss_mlp": 1.02774262, "epoch": 0.38641214489703896, "flos": 23597119802880.0, "grad_norm": 2.2207847060981103, "language_loss": 0.76010597, "learning_rate": 2.80899974864781e-06, "loss": 0.78538871, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.21813965, "step": 6427, "time_per_iteration": 2.8851401805877686 }, { "auxiliary_loss_clip": 0.01468154, "auxiliary_loss_mlp": 0.01050359, "balance_loss_clip": 1.28958833, "balance_loss_mlp": 1.0288893, "epoch": 0.3864722681497069, "flos": 12648974762880.0, "grad_norm": 2.293264408205655, "language_loss": 0.70690644, "learning_rate": 2.8086435557832203e-06, "loss": 0.73209155, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.21472168, "step": 6428, "time_per_iteration": 2.886946439743042 }, { "auxiliary_loss_clip": 0.01463613, "auxiliary_loss_mlp": 0.01048967, "balance_loss_clip": 1.28399777, "balance_loss_mlp": 1.02752101, "epoch": 0.3865323914023749, "flos": 17606807026560.0, "grad_norm": 2.4469024319812784, "language_loss": 0.85138118, "learning_rate": 2.8082873322547863e-06, "loss": 0.87650698, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.21447754, "step": 6429, "time_per_iteration": 2.8583502769470215 }, { "auxiliary_loss_clip": 0.01473688, "auxiliary_loss_mlp": 0.01046, "balance_loss_clip": 1.29339623, "balance_loss_mlp": 1.0241251, "epoch": 0.38659251465504285, "flos": 18488196593280.0, "grad_norm": 1.9011376055613178, "language_loss": 0.82138145, "learning_rate": 2.807931078076015e-06, "loss": 0.8465783, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.21875, "step": 6430, "time_per_iteration": 2.8387465476989746 }, { "auxiliary_loss_clip": 0.0127948, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.16495252, "balance_loss_mlp": 1.01131022, "epoch": 0.3866526379077108, "flos": 64198012930560.0, "grad_norm": 0.7260824743711669, "language_loss": 0.58921963, "learning_rate": 2.807574793260416e-06, "loss": 0.61236215, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.234375, "step": 6431, "time_per_iteration": 3.388615131378174 }, { "auxiliary_loss_clip": 0.01473144, "auxiliary_loss_mlp": 0.01042897, "balance_loss_clip": 1.28821278, "balance_loss_mlp": 1.0212245, "epoch": 0.3867127611603788, "flos": 14395375278720.0, "grad_norm": 2.5628167451923765, "language_loss": 0.80467409, "learning_rate": 2.8072184778215004e-06, "loss": 0.82983446, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.2166748, "step": 6432, "time_per_iteration": 2.843686580657959 }, { "auxiliary_loss_clip": 0.01487968, "auxiliary_loss_mlp": 0.01047551, "balance_loss_clip": 1.30038512, "balance_loss_mlp": 1.02506781, "epoch": 0.38677288441304675, "flos": 20020244653440.0, "grad_norm": 3.332195892725776, "language_loss": 0.82287371, "learning_rate": 2.806862131772779e-06, "loss": 0.84822887, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.22485352, "step": 6433, "time_per_iteration": 2.8364133834838867 }, { "auxiliary_loss_clip": 0.01476982, "auxiliary_loss_mlp": 0.01043994, "balance_loss_clip": 1.2938385, "balance_loss_mlp": 1.02109385, "epoch": 0.3868330076657147, "flos": 22247092149120.0, "grad_norm": 2.216064351870715, "language_loss": 0.71774495, "learning_rate": 2.806505755127765e-06, "loss": 0.74295473, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.22888184, "step": 6434, "time_per_iteration": 2.8981528282165527 }, { "auxiliary_loss_clip": 0.01487767, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.29846489, "balance_loss_mlp": 1.01758432, "epoch": 0.3868931309183827, "flos": 16736185722240.0, "grad_norm": 2.028682790579075, "language_loss": 0.78933549, "learning_rate": 2.806149347899972e-06, "loss": 0.81460965, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.22070312, "step": 6435, "time_per_iteration": 2.8888416290283203 }, { "auxiliary_loss_clip": 0.01458023, "auxiliary_loss_mlp": 0.01035678, "balance_loss_clip": 1.27911603, "balance_loss_mlp": 1.01332593, "epoch": 0.38695325417105064, "flos": 22685026999680.0, "grad_norm": 2.0275445829562564, "language_loss": 0.79927707, "learning_rate": 2.805792910102915e-06, "loss": 0.8242141, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22363281, "step": 6436, "time_per_iteration": 2.90470027923584 }, { "auxiliary_loss_clip": 0.01457309, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.28068256, "balance_loss_mlp": 1.01480377, "epoch": 0.3870133774237186, "flos": 23122328423040.0, "grad_norm": 1.640191650980483, "language_loss": 0.77635419, "learning_rate": 2.8054364417501093e-06, "loss": 0.80128896, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.21374512, "step": 6437, "time_per_iteration": 2.8718676567077637 }, { "auxiliary_loss_clip": 0.01463732, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.28614783, "balance_loss_mlp": 1.01419663, "epoch": 0.3870735006763866, "flos": 17684592117120.0, "grad_norm": 2.1851073485926094, "language_loss": 0.8260628, "learning_rate": 2.805079942855074e-06, "loss": 0.85106003, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21777344, "step": 6438, "time_per_iteration": 2.82076358795166 }, { "auxiliary_loss_clip": 0.01476631, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.29487753, "balance_loss_mlp": 1.01727939, "epoch": 0.38713362392905454, "flos": 23306475335040.0, "grad_norm": 1.4521267372192577, "language_loss": 0.76461482, "learning_rate": 2.804723413431326e-06, "loss": 0.78978568, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.23181152, "step": 6439, "time_per_iteration": 2.933702230453491 }, { "auxiliary_loss_clip": 0.01455923, "auxiliary_loss_mlp": 0.01039541, "balance_loss_clip": 1.28051472, "balance_loss_mlp": 1.01727295, "epoch": 0.38719374718172256, "flos": 21040644804480.0, "grad_norm": 1.6393280996453956, "language_loss": 0.74339986, "learning_rate": 2.8043668534923855e-06, "loss": 0.76835454, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.22265625, "step": 6440, "time_per_iteration": 2.860747814178467 }, { "auxiliary_loss_clip": 0.01478727, "auxiliary_loss_mlp": 0.01040457, "balance_loss_clip": 1.29251587, "balance_loss_mlp": 1.01702034, "epoch": 0.3872538704343905, "flos": 19619256821760.0, "grad_norm": 1.9689011216711991, "language_loss": 0.82791698, "learning_rate": 2.804010263051774e-06, "loss": 0.85310876, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.234375, "step": 6441, "time_per_iteration": 2.8553576469421387 }, { "auxiliary_loss_clip": 0.0145344, "auxiliary_loss_mlp": 0.01047035, "balance_loss_clip": 1.275002, "balance_loss_mlp": 1.0253861, "epoch": 0.3873139936870585, "flos": 17538975792000.0, "grad_norm": 2.6065766299653146, "language_loss": 0.8242296, "learning_rate": 2.8036536421230118e-06, "loss": 0.8492344, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.21655273, "step": 6442, "time_per_iteration": 2.860387086868286 }, { "auxiliary_loss_clip": 0.01464416, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.28522718, "balance_loss_mlp": 1.01726389, "epoch": 0.38737411693972645, "flos": 17795840376960.0, "grad_norm": 1.7481945186990149, "language_loss": 0.85071969, "learning_rate": 2.803296990719624e-06, "loss": 0.8757571, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22070312, "step": 6443, "time_per_iteration": 2.882420539855957 }, { "auxiliary_loss_clip": 0.01291873, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.17577314, "balance_loss_mlp": 1.02577293, "epoch": 0.3874342401923944, "flos": 58329336735360.0, "grad_norm": 0.7771330312485016, "language_loss": 0.50312662, "learning_rate": 2.8029403088551327e-06, "loss": 0.52653384, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 1.1640625, "router_z_loss_mlp": 0.23046875, "step": 6444, "time_per_iteration": 4.82112455368042 }, { "auxiliary_loss_clip": 0.01450054, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.27650774, "balance_loss_mlp": 1.0175643, "epoch": 0.3874943634450624, "flos": 17720769974400.0, "grad_norm": 2.322886185799783, "language_loss": 0.7956928, "learning_rate": 2.802583596543065e-06, "loss": 0.82058012, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.21118164, "step": 6445, "time_per_iteration": 2.856750011444092 }, { "auxiliary_loss_clip": 0.01468063, "auxiliary_loss_mlp": 0.01039607, "balance_loss_clip": 1.29000449, "balance_loss_mlp": 1.01781547, "epoch": 0.38755448669773035, "flos": 19254220623360.0, "grad_norm": 2.7073919445990993, "language_loss": 0.82057106, "learning_rate": 2.8022268537969474e-06, "loss": 0.84564781, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.21789551, "step": 6446, "time_per_iteration": 2.900074005126953 }, { "auxiliary_loss_clip": 0.01465203, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.28638029, "balance_loss_mlp": 1.01369643, "epoch": 0.3876146099503983, "flos": 20603841073920.0, "grad_norm": 1.9318680785826423, "language_loss": 0.77886713, "learning_rate": 2.801870080630306e-06, "loss": 0.80387813, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22180176, "step": 6447, "time_per_iteration": 2.8529326915740967 }, { "auxiliary_loss_clip": 0.01449531, "auxiliary_loss_mlp": 0.01035807, "balance_loss_clip": 1.27591097, "balance_loss_mlp": 1.0150764, "epoch": 0.3876747332030663, "flos": 19290443725440.0, "grad_norm": 1.4745814281482696, "language_loss": 0.76953954, "learning_rate": 2.801513277056671e-06, "loss": 0.79439294, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20715332, "step": 6448, "time_per_iteration": 2.842885732650757 }, { "auxiliary_loss_clip": 0.01480814, "auxiliary_loss_mlp": 0.01034734, "balance_loss_clip": 1.30317831, "balance_loss_mlp": 1.01419389, "epoch": 0.38773485645573424, "flos": 18953350830720.0, "grad_norm": 2.1134305871008094, "language_loss": 0.76715136, "learning_rate": 2.8011564430895725e-06, "loss": 0.79230678, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.20544434, "step": 6449, "time_per_iteration": 2.8640687465667725 }, { "auxiliary_loss_clip": 0.01473864, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.28976667, "balance_loss_mlp": 1.01572824, "epoch": 0.3877949797084022, "flos": 23080540210560.0, "grad_norm": 1.7698323328407812, "language_loss": 0.7871573, "learning_rate": 2.800799578742542e-06, "loss": 0.81227469, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22143555, "step": 6450, "time_per_iteration": 2.935988187789917 }, { "auxiliary_loss_clip": 0.01491692, "auxiliary_loss_mlp": 0.01041437, "balance_loss_clip": 1.30458879, "balance_loss_mlp": 1.0198009, "epoch": 0.3878551029610702, "flos": 29107347557760.0, "grad_norm": 2.212877986935538, "language_loss": 0.78377247, "learning_rate": 2.8004426840291106e-06, "loss": 0.80910373, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.21643066, "step": 6451, "time_per_iteration": 4.314770698547363 }, { "auxiliary_loss_clip": 0.01451492, "auxiliary_loss_mlp": 0.01033167, "balance_loss_clip": 1.27736223, "balance_loss_mlp": 1.0110898, "epoch": 0.38791522621373814, "flos": 21006276739200.0, "grad_norm": 3.698004062761702, "language_loss": 0.77440166, "learning_rate": 2.800085758962812e-06, "loss": 0.79924822, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.22070312, "step": 6452, "time_per_iteration": 2.869380474090576 }, { "auxiliary_loss_clip": 0.0146, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.28212309, "balance_loss_mlp": 1.01779008, "epoch": 0.3879753494664061, "flos": 15495234577920.0, "grad_norm": 1.7316537615798895, "language_loss": 0.80512798, "learning_rate": 2.799728803557182e-06, "loss": 0.83011782, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21203613, "step": 6453, "time_per_iteration": 4.285577297210693 }, { "auxiliary_loss_clip": 0.01480769, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.29570472, "balance_loss_mlp": 1.01373744, "epoch": 0.3880354727190741, "flos": 22063985867520.0, "grad_norm": 2.1151465641588554, "language_loss": 0.72580135, "learning_rate": 2.7993718178257555e-06, "loss": 0.75095737, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21081543, "step": 6454, "time_per_iteration": 2.8691494464874268 }, { "auxiliary_loss_clip": 0.01481033, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.29784381, "balance_loss_mlp": 1.01492906, "epoch": 0.3880955959717421, "flos": 20350188869760.0, "grad_norm": 1.9676575668173608, "language_loss": 0.78671861, "learning_rate": 2.7990148017820694e-06, "loss": 0.81189871, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.22058105, "step": 6455, "time_per_iteration": 2.8559696674346924 }, { "auxiliary_loss_clip": 0.01473937, "auxiliary_loss_mlp": 0.01034416, "balance_loss_clip": 1.29308701, "balance_loss_mlp": 1.01246953, "epoch": 0.38815571922441006, "flos": 23085652872960.0, "grad_norm": 2.1914610183734173, "language_loss": 0.76643538, "learning_rate": 2.798657755439662e-06, "loss": 0.79151893, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21960449, "step": 6456, "time_per_iteration": 2.8931961059570312 }, { "auxiliary_loss_clip": 0.01475755, "auxiliary_loss_mlp": 0.01035306, "balance_loss_clip": 1.29378259, "balance_loss_mlp": 1.01439679, "epoch": 0.388215842477078, "flos": 20786630641920.0, "grad_norm": 2.2015591730440103, "language_loss": 0.61973786, "learning_rate": 2.7983006788120726e-06, "loss": 0.64484847, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.20922852, "step": 6457, "time_per_iteration": 2.852195978164673 }, { "auxiliary_loss_clip": 0.01473364, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.29011035, "balance_loss_mlp": 1.01494551, "epoch": 0.388275965729746, "flos": 20457681811200.0, "grad_norm": 2.436070967150388, "language_loss": 0.80303782, "learning_rate": 2.797943571912841e-06, "loss": 0.82814735, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.2265625, "step": 6458, "time_per_iteration": 2.8481314182281494 }, { "auxiliary_loss_clip": 0.01473268, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.29112697, "balance_loss_mlp": 1.01625156, "epoch": 0.38833608898241395, "flos": 27904112593920.0, "grad_norm": 1.81315455275797, "language_loss": 0.82588696, "learning_rate": 2.797586434755509e-06, "loss": 0.85100329, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22119141, "step": 6459, "time_per_iteration": 2.9705734252929688 }, { "auxiliary_loss_clip": 0.01455555, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.27912235, "balance_loss_mlp": 1.0121038, "epoch": 0.3883962122350819, "flos": 18085217990400.0, "grad_norm": 1.8099352372326558, "language_loss": 0.6368261, "learning_rate": 2.7972292673536202e-06, "loss": 0.66170746, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.20471191, "step": 6460, "time_per_iteration": 2.839108943939209 }, { "auxiliary_loss_clip": 0.01463363, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.2850225, "balance_loss_mlp": 1.01524258, "epoch": 0.3884563354877499, "flos": 23632528498560.0, "grad_norm": 3.7580611129221584, "language_loss": 0.86891162, "learning_rate": 2.796872069720717e-06, "loss": 0.89391202, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.2142334, "step": 6461, "time_per_iteration": 2.8754096031188965 }, { "auxiliary_loss_clip": 0.01473467, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.29247034, "balance_loss_mlp": 1.01732922, "epoch": 0.38851645874041785, "flos": 27464865644160.0, "grad_norm": 2.2714153902233365, "language_loss": 0.72698379, "learning_rate": 2.7965148418703456e-06, "loss": 0.75210965, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21777344, "step": 6462, "time_per_iteration": 2.8975532054901123 }, { "auxiliary_loss_clip": 0.01470805, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.28996825, "balance_loss_mlp": 1.01568377, "epoch": 0.3885765819930858, "flos": 25238877799680.0, "grad_norm": 6.6917940898368125, "language_loss": 0.77156401, "learning_rate": 2.796157583816052e-06, "loss": 0.79665226, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.22351074, "step": 6463, "time_per_iteration": 2.922329902648926 }, { "auxiliary_loss_clip": 0.01489555, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.30257404, "balance_loss_mlp": 1.02100778, "epoch": 0.3886367052457538, "flos": 16955469861120.0, "grad_norm": 1.8702766843642906, "language_loss": 0.71651065, "learning_rate": 2.795800295571382e-06, "loss": 0.74184632, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.23034668, "step": 6464, "time_per_iteration": 2.909059762954712 }, { "auxiliary_loss_clip": 0.01463859, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.28506041, "balance_loss_mlp": 1.00924802, "epoch": 0.38869682849842174, "flos": 27163452913920.0, "grad_norm": 2.018986641119819, "language_loss": 0.70529342, "learning_rate": 2.7954429771498858e-06, "loss": 0.7302388, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.21435547, "step": 6465, "time_per_iteration": 2.8694539070129395 }, { "auxiliary_loss_clip": 0.0146733, "auxiliary_loss_mlp": 0.01042124, "balance_loss_clip": 1.2856704, "balance_loss_mlp": 1.01874709, "epoch": 0.3887569517510897, "flos": 21072026712960.0, "grad_norm": 2.6850146818168303, "language_loss": 0.78791839, "learning_rate": 2.7950856285651117e-06, "loss": 0.81301296, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.23376465, "step": 6466, "time_per_iteration": 2.83375883102417 }, { "auxiliary_loss_clip": 0.01474671, "auxiliary_loss_mlp": 0.01038525, "balance_loss_clip": 1.29096472, "balance_loss_mlp": 1.01636422, "epoch": 0.38881707500375773, "flos": 29509421264640.0, "grad_norm": 1.7837472509113252, "language_loss": 0.69781858, "learning_rate": 2.794728249830611e-06, "loss": 0.72295052, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.22167969, "step": 6467, "time_per_iteration": 3.0356523990631104 }, { "auxiliary_loss_clip": 0.01473144, "auxiliary_loss_mlp": 0.01046356, "balance_loss_clip": 1.289891, "balance_loss_mlp": 1.02405155, "epoch": 0.3888771982564257, "flos": 17495830235520.0, "grad_norm": 2.6897972993320693, "language_loss": 0.84835529, "learning_rate": 2.794370840959936e-06, "loss": 0.8735503, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.22302246, "step": 6468, "time_per_iteration": 3.009885787963867 }, { "auxiliary_loss_clip": 0.01467082, "auxiliary_loss_mlp": 0.01043319, "balance_loss_clip": 1.28749359, "balance_loss_mlp": 1.02245724, "epoch": 0.38893732150909366, "flos": 21951877956480.0, "grad_norm": 2.052947770134962, "language_loss": 0.85111868, "learning_rate": 2.7940134019666383e-06, "loss": 0.87622261, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.20861816, "step": 6469, "time_per_iteration": 2.87727952003479 }, { "auxiliary_loss_clip": 0.01471598, "auxiliary_loss_mlp": 0.0104336, "balance_loss_clip": 1.2905519, "balance_loss_mlp": 1.02041197, "epoch": 0.3889974447617616, "flos": 24286127904000.0, "grad_norm": 1.6731880781646202, "language_loss": 0.75288224, "learning_rate": 2.793655932864273e-06, "loss": 0.77803177, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.22949219, "step": 6470, "time_per_iteration": 2.922835350036621 }, { "auxiliary_loss_clip": 0.01463586, "auxiliary_loss_mlp": 0.01042444, "balance_loss_clip": 1.28213191, "balance_loss_mlp": 1.02011669, "epoch": 0.3890575680144296, "flos": 25678350973440.0, "grad_norm": 1.6386236340461162, "language_loss": 0.75468874, "learning_rate": 2.7932984336663953e-06, "loss": 0.77974904, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.2232666, "step": 6471, "time_per_iteration": 2.895298957824707 }, { "auxiliary_loss_clip": 0.01473805, "auxiliary_loss_mlp": 0.01046171, "balance_loss_clip": 1.29270124, "balance_loss_mlp": 1.02445054, "epoch": 0.38911769126709755, "flos": 22865147124480.0, "grad_norm": 2.2776537363550586, "language_loss": 0.68737334, "learning_rate": 2.792940904386562e-06, "loss": 0.71257311, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21728516, "step": 6472, "time_per_iteration": 2.8691658973693848 }, { "auxiliary_loss_clip": 0.01471348, "auxiliary_loss_mlp": 0.01049088, "balance_loss_clip": 1.29006004, "balance_loss_mlp": 1.02658105, "epoch": 0.3891778145197655, "flos": 25458523896960.0, "grad_norm": 1.6502567750175128, "language_loss": 0.76723725, "learning_rate": 2.7925833450383293e-06, "loss": 0.79244161, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.22521973, "step": 6473, "time_per_iteration": 2.877331018447876 }, { "auxiliary_loss_clip": 0.01479081, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.29568315, "balance_loss_mlp": 1.01939034, "epoch": 0.3892379377724335, "flos": 14036130414720.0, "grad_norm": 3.1481225693589185, "language_loss": 0.72411156, "learning_rate": 2.792225755635257e-06, "loss": 0.74932933, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23278809, "step": 6474, "time_per_iteration": 2.826059341430664 }, { "auxiliary_loss_clip": 0.01467027, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.28581154, "balance_loss_mlp": 1.0200479, "epoch": 0.38929806102510145, "flos": 20167173077760.0, "grad_norm": 1.5594234756638636, "language_loss": 0.69724584, "learning_rate": 2.7918681361909046e-06, "loss": 0.72233307, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.21630859, "step": 6475, "time_per_iteration": 2.8886568546295166 }, { "auxiliary_loss_clip": 0.01484171, "auxiliary_loss_mlp": 0.01047089, "balance_loss_clip": 1.29821014, "balance_loss_mlp": 1.02486825, "epoch": 0.3893581842777694, "flos": 22174148252160.0, "grad_norm": 3.4826287002597947, "language_loss": 0.76736051, "learning_rate": 2.7915104867188332e-06, "loss": 0.79267311, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.22216797, "step": 6476, "time_per_iteration": 2.8204667568206787 }, { "auxiliary_loss_clip": 0.01269792, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.15640378, "balance_loss_mlp": 1.00188696, "epoch": 0.3894183075304374, "flos": 67334555255040.0, "grad_norm": 0.8198489608675659, "language_loss": 0.58249038, "learning_rate": 2.7911528072326055e-06, "loss": 0.60545039, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 1.1328125, "router_z_loss_mlp": 0.24316406, "step": 6477, "time_per_iteration": 3.3603296279907227 }, { "auxiliary_loss_clip": 0.01479656, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.2941792, "balance_loss_mlp": 1.01441455, "epoch": 0.38947843078310534, "flos": 18555620624640.0, "grad_norm": 1.9362290964443059, "language_loss": 0.78700054, "learning_rate": 2.7907950977457832e-06, "loss": 0.81218445, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.24328613, "step": 6478, "time_per_iteration": 2.8588738441467285 }, { "auxiliary_loss_clip": 0.01453724, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.27361417, "balance_loss_mlp": 1.01469135, "epoch": 0.3895385540357733, "flos": 14612668646400.0, "grad_norm": 20.425634629816475, "language_loss": 0.83245409, "learning_rate": 2.7904373582719317e-06, "loss": 0.85735661, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.21838379, "step": 6479, "time_per_iteration": 4.239387273788452 }, { "auxiliary_loss_clip": 0.01459382, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.28029537, "balance_loss_mlp": 1.01574111, "epoch": 0.38959867728844133, "flos": 19984926447360.0, "grad_norm": 1.8188356817462135, "language_loss": 0.81275952, "learning_rate": 2.790079588824617e-06, "loss": 0.83773696, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.22631836, "step": 6480, "time_per_iteration": 2.9282755851745605 }, { "auxiliary_loss_clip": 0.01452399, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.27439296, "balance_loss_mlp": 1.01161742, "epoch": 0.3896588005411093, "flos": 22681769374080.0, "grad_norm": 1.6642284044167857, "language_loss": 0.83807117, "learning_rate": 2.7897217894174038e-06, "loss": 0.86292791, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.21655273, "step": 6481, "time_per_iteration": 2.979628324508667 }, { "auxiliary_loss_clip": 0.01452093, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.27674007, "balance_loss_mlp": 1.01597905, "epoch": 0.38971892379377726, "flos": 21005824291200.0, "grad_norm": 1.8126122118030907, "language_loss": 0.76101524, "learning_rate": 2.789363960063863e-06, "loss": 0.78590763, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21166992, "step": 6482, "time_per_iteration": 2.911851644515991 }, { "auxiliary_loss_clip": 0.01464236, "auxiliary_loss_mlp": 0.01041872, "balance_loss_clip": 1.28133214, "balance_loss_mlp": 1.02085507, "epoch": 0.3897790470464452, "flos": 22538686757760.0, "grad_norm": 2.3329985177127925, "language_loss": 0.79808515, "learning_rate": 2.78900610077756e-06, "loss": 0.82314622, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21008301, "step": 6483, "time_per_iteration": 2.8610291481018066 }, { "auxiliary_loss_clip": 0.01451322, "auxiliary_loss_mlp": 0.01028988, "balance_loss_clip": 1.27089453, "balance_loss_mlp": 1.0071609, "epoch": 0.3898391702991132, "flos": 26220113936640.0, "grad_norm": 1.6891186744655102, "language_loss": 0.80901259, "learning_rate": 2.788648211572067e-06, "loss": 0.83381569, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.21850586, "step": 6484, "time_per_iteration": 2.884709596633911 }, { "auxiliary_loss_clip": 0.01466922, "auxiliary_loss_mlp": 0.01041446, "balance_loss_clip": 1.28651452, "balance_loss_mlp": 1.01874852, "epoch": 0.38989929355178116, "flos": 21074469932160.0, "grad_norm": 1.5690804142810402, "language_loss": 0.78686631, "learning_rate": 2.7882902924609557e-06, "loss": 0.81194997, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.22705078, "step": 6485, "time_per_iteration": 2.8519203662872314 }, { "auxiliary_loss_clip": 0.01465191, "auxiliary_loss_mlp": 0.01037081, "balance_loss_clip": 1.28090131, "balance_loss_mlp": 1.01503956, "epoch": 0.3899594168044491, "flos": 25495154202240.0, "grad_norm": 2.938284277479139, "language_loss": 0.86117804, "learning_rate": 2.7879323434577965e-06, "loss": 0.88620079, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22045898, "step": 6486, "time_per_iteration": 5.717637538909912 }, { "auxiliary_loss_clip": 0.01471592, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.28603804, "balance_loss_mlp": 1.011217, "epoch": 0.3900195400571171, "flos": 31151857933440.0, "grad_norm": 2.5520437927577055, "language_loss": 0.8616637, "learning_rate": 2.7875743645761645e-06, "loss": 0.88671315, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.22143555, "step": 6487, "time_per_iteration": 2.8867156505584717 }, { "auxiliary_loss_clip": 0.01446172, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.26829076, "balance_loss_mlp": 1.01286888, "epoch": 0.39007966330978505, "flos": 20239528792320.0, "grad_norm": 1.6136897087334714, "language_loss": 0.74004883, "learning_rate": 2.787216355829633e-06, "loss": 0.76486373, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.22460938, "step": 6488, "time_per_iteration": 4.255650997161865 }, { "auxiliary_loss_clip": 0.0146968, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.28528798, "balance_loss_mlp": 1.01846433, "epoch": 0.390139786562453, "flos": 22539003471360.0, "grad_norm": 1.6973016367126375, "language_loss": 0.68697822, "learning_rate": 2.786858317231779e-06, "loss": 0.71207547, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.21582031, "step": 6489, "time_per_iteration": 2.8851261138916016 }, { "auxiliary_loss_clip": 0.01449179, "auxiliary_loss_mlp": 0.0103702, "balance_loss_clip": 1.27143812, "balance_loss_mlp": 1.01566935, "epoch": 0.390199909815121, "flos": 26444239269120.0, "grad_norm": 1.6614095441877768, "language_loss": 0.81629241, "learning_rate": 2.7865002487961788e-06, "loss": 0.8411544, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21337891, "step": 6490, "time_per_iteration": 2.861380100250244 }, { "auxiliary_loss_clip": 0.01462105, "auxiliary_loss_mlp": 0.01038832, "balance_loss_clip": 1.28049338, "balance_loss_mlp": 1.0168972, "epoch": 0.39026003306778895, "flos": 17283423306240.0, "grad_norm": 2.6496128451993566, "language_loss": 0.90749961, "learning_rate": 2.7861421505364104e-06, "loss": 0.93250895, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.21923828, "step": 6491, "time_per_iteration": 2.8403480052948 }, { "auxiliary_loss_clip": 0.01467062, "auxiliary_loss_mlp": 0.01044725, "balance_loss_clip": 1.28364277, "balance_loss_mlp": 1.02226639, "epoch": 0.3903201563204569, "flos": 24542947244160.0, "grad_norm": 1.8984510525372758, "language_loss": 0.79254913, "learning_rate": 2.7857840224660523e-06, "loss": 0.81766701, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22460938, "step": 6492, "time_per_iteration": 2.9087772369384766 }, { "auxiliary_loss_clip": 0.01454749, "auxiliary_loss_mlp": 0.01037222, "balance_loss_clip": 1.27522135, "balance_loss_mlp": 1.01439393, "epoch": 0.39038027957312493, "flos": 23778190068480.0, "grad_norm": 2.4098980113312622, "language_loss": 0.74625432, "learning_rate": 2.7854258645986857e-06, "loss": 0.77117395, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22839355, "step": 6493, "time_per_iteration": 2.86013126373291 }, { "auxiliary_loss_clip": 0.01494065, "auxiliary_loss_mlp": 0.01040747, "balance_loss_clip": 1.30416441, "balance_loss_mlp": 1.01772761, "epoch": 0.3904404028257929, "flos": 14108712353280.0, "grad_norm": 12.707937005054411, "language_loss": 0.77330816, "learning_rate": 2.7850676769478916e-06, "loss": 0.79865628, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.22998047, "step": 6494, "time_per_iteration": 2.8610501289367676 }, { "auxiliary_loss_clip": 0.01495229, "auxiliary_loss_mlp": 0.01047392, "balance_loss_clip": 1.30292869, "balance_loss_mlp": 1.02425337, "epoch": 0.39050052607846086, "flos": 16918884800640.0, "grad_norm": 2.837665955404574, "language_loss": 0.75787115, "learning_rate": 2.7847094595272525e-06, "loss": 0.78329742, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.23144531, "step": 6495, "time_per_iteration": 2.8506293296813965 }, { "auxiliary_loss_clip": 0.01463353, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.28470898, "balance_loss_mlp": 1.01910806, "epoch": 0.39056064933112883, "flos": 25925306947200.0, "grad_norm": 1.5961995683225885, "language_loss": 0.68052977, "learning_rate": 2.784351212350352e-06, "loss": 0.70557535, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22106934, "step": 6496, "time_per_iteration": 2.9213039875030518 }, { "auxiliary_loss_clip": 0.01246077, "auxiliary_loss_mlp": 0.01022988, "balance_loss_clip": 1.1387701, "balance_loss_mlp": 1.00086308, "epoch": 0.3906207725837968, "flos": 60055892766720.0, "grad_norm": 0.6614577911319535, "language_loss": 0.54061711, "learning_rate": 2.783992935430775e-06, "loss": 0.56330776, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.22167969, "step": 6497, "time_per_iteration": 3.4821202754974365 }, { "auxiliary_loss_clip": 0.0147616, "auxiliary_loss_mlp": 0.01038603, "balance_loss_clip": 1.29337895, "balance_loss_mlp": 1.01647735, "epoch": 0.39068089583646476, "flos": 21078406229760.0, "grad_norm": 2.0237171771950404, "language_loss": 0.70241982, "learning_rate": 2.7836346287821068e-06, "loss": 0.72756743, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.22143555, "step": 6498, "time_per_iteration": 2.9563822746276855 }, { "auxiliary_loss_clip": 0.01245281, "auxiliary_loss_mlp": 0.01020022, "balance_loss_clip": 1.13839436, "balance_loss_mlp": 1.00247467, "epoch": 0.3907410190891327, "flos": 70480734721920.0, "grad_norm": 0.7328468323146559, "language_loss": 0.51780653, "learning_rate": 2.783276292417936e-06, "loss": 0.54045951, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.17578125, "step": 6499, "time_per_iteration": 3.4432950019836426 }, { "auxiliary_loss_clip": 0.01477106, "auxiliary_loss_mlp": 0.01042215, "balance_loss_clip": 1.29118872, "balance_loss_mlp": 1.01857555, "epoch": 0.3908011423418007, "flos": 27973436906880.0, "grad_norm": 1.6003017893856029, "language_loss": 0.74879336, "learning_rate": 2.7829179263518487e-06, "loss": 0.77398658, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23657227, "step": 6500, "time_per_iteration": 2.979611396789551 }, { "auxiliary_loss_clip": 0.01479284, "auxiliary_loss_mlp": 0.01045425, "balance_loss_clip": 1.29217124, "balance_loss_mlp": 1.02232242, "epoch": 0.39086126559446865, "flos": 24472763280000.0, "grad_norm": 1.8510884974050967, "language_loss": 0.6942327, "learning_rate": 2.7825595305974354e-06, "loss": 0.7194798, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.2310791, "step": 6501, "time_per_iteration": 2.8669726848602295 }, { "auxiliary_loss_clip": 0.01469837, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 1.2880547, "balance_loss_mlp": 1.02168059, "epoch": 0.3909213888471366, "flos": 16949090344320.0, "grad_norm": 1.720408861246598, "language_loss": 0.79274988, "learning_rate": 2.782201105168287e-06, "loss": 0.81788188, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.21716309, "step": 6502, "time_per_iteration": 2.853104591369629 }, { "auxiliary_loss_clip": 0.01454593, "auxiliary_loss_mlp": 0.01038286, "balance_loss_clip": 1.2766819, "balance_loss_mlp": 1.01669765, "epoch": 0.3909815120998046, "flos": 29290363349760.0, "grad_norm": 2.2260472964275695, "language_loss": 0.80701578, "learning_rate": 2.7818426500779932e-06, "loss": 0.83194458, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.21582031, "step": 6503, "time_per_iteration": 2.8915133476257324 }, { "auxiliary_loss_clip": 0.01458157, "auxiliary_loss_mlp": 0.01040399, "balance_loss_clip": 1.28059912, "balance_loss_mlp": 1.01816654, "epoch": 0.39104163535247255, "flos": 18960092305920.0, "grad_norm": 2.1948605446885585, "language_loss": 0.72240686, "learning_rate": 2.7814841653401485e-06, "loss": 0.74739242, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.22229004, "step": 6504, "time_per_iteration": 2.865107536315918 }, { "auxiliary_loss_clip": 0.01463038, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.28204083, "balance_loss_mlp": 1.01395261, "epoch": 0.3911017586051405, "flos": 26334619822080.0, "grad_norm": 1.6587255387775324, "language_loss": 0.83796978, "learning_rate": 2.7811256509683454e-06, "loss": 0.86296797, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.22814941, "step": 6505, "time_per_iteration": 2.8531925678253174 }, { "auxiliary_loss_clip": 0.01466117, "auxiliary_loss_mlp": 0.01038941, "balance_loss_clip": 1.28561759, "balance_loss_mlp": 1.0145154, "epoch": 0.3911618818578085, "flos": 21845923338240.0, "grad_norm": 2.908049162175065, "language_loss": 0.72020316, "learning_rate": 2.7807671069761797e-06, "loss": 0.74525368, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.24414062, "step": 6506, "time_per_iteration": 2.8197293281555176 }, { "auxiliary_loss_clip": 0.0145306, "auxiliary_loss_mlp": 0.01039433, "balance_loss_clip": 1.27710593, "balance_loss_mlp": 1.0170337, "epoch": 0.3912220051104765, "flos": 16367575184640.0, "grad_norm": 1.8452941956639932, "language_loss": 0.76196462, "learning_rate": 2.7804085333772477e-06, "loss": 0.78688949, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.22399902, "step": 6507, "time_per_iteration": 2.800933599472046 }, { "auxiliary_loss_clip": 0.01246192, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.13445079, "balance_loss_mlp": 1.00186241, "epoch": 0.39128212836314447, "flos": 71082320590080.0, "grad_norm": 0.7557618663356298, "language_loss": 0.56528008, "learning_rate": 2.7800499301851446e-06, "loss": 0.58802766, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.26757812, "step": 6508, "time_per_iteration": 3.4869067668914795 }, { "auxiliary_loss_clip": 0.0145485, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.27594733, "balance_loss_mlp": 1.01568258, "epoch": 0.39134225161581243, "flos": 20339692076160.0, "grad_norm": 2.1393188476974334, "language_loss": 0.77129066, "learning_rate": 2.779691297413471e-06, "loss": 0.79622036, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.2244873, "step": 6509, "time_per_iteration": 2.8593108654022217 }, { "auxiliary_loss_clip": 0.01479067, "auxiliary_loss_mlp": 0.01041819, "balance_loss_clip": 1.29573584, "balance_loss_mlp": 1.01810861, "epoch": 0.3914023748684804, "flos": 17026965924480.0, "grad_norm": 3.617374419375828, "language_loss": 0.83670205, "learning_rate": 2.779332635075825e-06, "loss": 0.86191094, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.23693848, "step": 6510, "time_per_iteration": 2.982682466506958 }, { "auxiliary_loss_clip": 0.01479391, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.29542947, "balance_loss_mlp": 1.01335347, "epoch": 0.39146249812114836, "flos": 18414257310720.0, "grad_norm": 4.242789570282186, "language_loss": 0.77992827, "learning_rate": 2.7789739431858073e-06, "loss": 0.80508196, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22631836, "step": 6511, "time_per_iteration": 2.846534013748169 }, { "auxiliary_loss_clip": 0.01248754, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.13660741, "balance_loss_mlp": 0.99964011, "epoch": 0.3915226213738163, "flos": 67671964863360.0, "grad_norm": 0.7173514472524468, "language_loss": 0.57781255, "learning_rate": 2.7786152217570196e-06, "loss": 0.60053116, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.234375, "step": 6512, "time_per_iteration": 3.3698952198028564 }, { "auxiliary_loss_clip": 0.01473038, "auxiliary_loss_mlp": 0.01037559, "balance_loss_clip": 1.29186249, "balance_loss_mlp": 1.01352644, "epoch": 0.3915827446264843, "flos": 26370480965760.0, "grad_norm": 1.6755171546132923, "language_loss": 0.70543802, "learning_rate": 2.7782564708030647e-06, "loss": 0.73054397, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.24023438, "step": 6513, "time_per_iteration": 2.8852343559265137 }, { "auxiliary_loss_clip": 0.01490079, "auxiliary_loss_mlp": 0.01038175, "balance_loss_clip": 1.30126214, "balance_loss_mlp": 1.01559663, "epoch": 0.39164286787915226, "flos": 21953732993280.0, "grad_norm": 6.981320307495923, "language_loss": 0.76992071, "learning_rate": 2.7778976903375464e-06, "loss": 0.79520321, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.22583008, "step": 6514, "time_per_iteration": 4.2583982944488525 }, { "auxiliary_loss_clip": 0.01462652, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.28275394, "balance_loss_mlp": 1.01501727, "epoch": 0.3917029911318202, "flos": 16408684725120.0, "grad_norm": 1.792733649073772, "language_loss": 0.78699082, "learning_rate": 2.7775388803740693e-06, "loss": 0.81199384, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.22631836, "step": 6515, "time_per_iteration": 2.8541603088378906 }, { "auxiliary_loss_clip": 0.01454579, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.277246, "balance_loss_mlp": 1.01483083, "epoch": 0.3917631143844882, "flos": 26222376176640.0, "grad_norm": 1.369738973240342, "language_loss": 0.80450237, "learning_rate": 2.7771800409262406e-06, "loss": 0.82941318, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.21679688, "step": 6516, "time_per_iteration": 2.874918222427368 }, { "auxiliary_loss_clip": 0.01460763, "auxiliary_loss_mlp": 0.01039018, "balance_loss_clip": 1.27860594, "balance_loss_mlp": 1.01611829, "epoch": 0.39182323763715615, "flos": 18556706499840.0, "grad_norm": 2.15663250984783, "language_loss": 0.71346545, "learning_rate": 2.7768211720076665e-06, "loss": 0.73846322, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22888184, "step": 6517, "time_per_iteration": 2.857529878616333 }, { "auxiliary_loss_clip": 0.01477906, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.29398489, "balance_loss_mlp": 1.01871586, "epoch": 0.3918833608898241, "flos": 34326840355200.0, "grad_norm": 1.6455553102558327, "language_loss": 0.72382867, "learning_rate": 2.776462273631956e-06, "loss": 0.7490145, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.21960449, "step": 6518, "time_per_iteration": 2.9520936012268066 }, { "auxiliary_loss_clip": 0.01482304, "auxiliary_loss_mlp": 0.01042854, "balance_loss_clip": 1.29638338, "balance_loss_mlp": 1.01960862, "epoch": 0.3919434841424921, "flos": 36953318338560.0, "grad_norm": 1.753587491936517, "language_loss": 0.6210888, "learning_rate": 2.7761033458127177e-06, "loss": 0.64634037, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.2322998, "step": 6519, "time_per_iteration": 2.9504072666168213 }, { "auxiliary_loss_clip": 0.01504056, "auxiliary_loss_mlp": 0.01041148, "balance_loss_clip": 1.31318772, "balance_loss_mlp": 1.01837873, "epoch": 0.3920036073951601, "flos": 23518203592320.0, "grad_norm": 8.15818327060903, "language_loss": 0.68571562, "learning_rate": 2.775744388563563e-06, "loss": 0.71116763, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.22766113, "step": 6520, "time_per_iteration": 2.9840166568756104 }, { "auxiliary_loss_clip": 0.01478665, "auxiliary_loss_mlp": 0.01038354, "balance_loss_clip": 1.29522336, "balance_loss_mlp": 1.01605034, "epoch": 0.39206373064782807, "flos": 18415343185920.0, "grad_norm": 1.712994880313037, "language_loss": 0.79395235, "learning_rate": 2.775385401898104e-06, "loss": 0.81912255, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22314453, "step": 6521, "time_per_iteration": 4.256750583648682 }, { "auxiliary_loss_clip": 0.01491442, "auxiliary_loss_mlp": 0.01041033, "balance_loss_clip": 1.30240369, "balance_loss_mlp": 1.01689339, "epoch": 0.39212385390049603, "flos": 12320749848960.0, "grad_norm": 2.2838888590056157, "language_loss": 0.71719515, "learning_rate": 2.775026385829952e-06, "loss": 0.74251986, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.24145508, "step": 6522, "time_per_iteration": 4.262572288513184 }, { "auxiliary_loss_clip": 0.01478958, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.29362535, "balance_loss_mlp": 1.01813519, "epoch": 0.392183977153164, "flos": 19728650044800.0, "grad_norm": 5.776711946182707, "language_loss": 0.78119576, "learning_rate": 2.774667340372722e-06, "loss": 0.80638444, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.21789551, "step": 6523, "time_per_iteration": 4.235956192016602 }, { "auxiliary_loss_clip": 0.01486248, "auxiliary_loss_mlp": 0.01040997, "balance_loss_clip": 1.29976964, "balance_loss_mlp": 1.01912212, "epoch": 0.39224410040583196, "flos": 33157611498240.0, "grad_norm": 2.144101962473702, "language_loss": 0.62729132, "learning_rate": 2.7743082655400293e-06, "loss": 0.65256381, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.21887207, "step": 6524, "time_per_iteration": 2.944766044616699 }, { "auxiliary_loss_clip": 0.01483324, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.29438162, "balance_loss_mlp": 1.01293349, "epoch": 0.39230422365849993, "flos": 27793995454080.0, "grad_norm": 3.153439739620719, "language_loss": 0.74745744, "learning_rate": 2.773949161345489e-06, "loss": 0.77265334, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.23327637, "step": 6525, "time_per_iteration": 2.863583564758301 }, { "auxiliary_loss_clip": 0.01487576, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.29956925, "balance_loss_mlp": 1.01732385, "epoch": 0.3923643469111679, "flos": 17940732785280.0, "grad_norm": 2.4825014791628206, "language_loss": 0.81652153, "learning_rate": 2.773590027802719e-06, "loss": 0.84179074, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22033691, "step": 6526, "time_per_iteration": 2.8307366371154785 }, { "auxiliary_loss_clip": 0.01507555, "auxiliary_loss_mlp": 0.01039732, "balance_loss_clip": 1.31958461, "balance_loss_mlp": 1.01857233, "epoch": 0.39242447016383586, "flos": 24069784677120.0, "grad_norm": 2.068573661272539, "language_loss": 0.70861399, "learning_rate": 2.7732308649253383e-06, "loss": 0.73408675, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.21179199, "step": 6527, "time_per_iteration": 2.8415517807006836 }, { "auxiliary_loss_clip": 0.0148445, "auxiliary_loss_mlp": 0.0103822, "balance_loss_clip": 1.30081272, "balance_loss_mlp": 1.01624942, "epoch": 0.3924845934165038, "flos": 10669490444160.0, "grad_norm": 2.353052875278004, "language_loss": 0.83746958, "learning_rate": 2.772871672726965e-06, "loss": 0.86269635, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.21972656, "step": 6528, "time_per_iteration": 2.810439348220825 }, { "auxiliary_loss_clip": 0.01479589, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.29753983, "balance_loss_mlp": 1.01751733, "epoch": 0.3925447166691718, "flos": 31257676817280.0, "grad_norm": 1.5860588978559498, "language_loss": 0.69345576, "learning_rate": 2.7725124512212205e-06, "loss": 0.71865213, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22521973, "step": 6529, "time_per_iteration": 2.9746310710906982 }, { "auxiliary_loss_clip": 0.01485064, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.29735506, "balance_loss_mlp": 1.02068818, "epoch": 0.39260483992183975, "flos": 29424623230080.0, "grad_norm": 2.632745526743396, "language_loss": 0.81275624, "learning_rate": 2.7721532004217267e-06, "loss": 0.83804286, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.22912598, "step": 6530, "time_per_iteration": 2.8779969215393066 }, { "auxiliary_loss_clip": 0.01479276, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.29565322, "balance_loss_mlp": 1.01944017, "epoch": 0.3926649631745077, "flos": 22868314260480.0, "grad_norm": 1.9031338238280056, "language_loss": 0.76709366, "learning_rate": 2.7717939203421063e-06, "loss": 0.79231209, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.23120117, "step": 6531, "time_per_iteration": 2.8764700889587402 }, { "auxiliary_loss_clip": 0.01252326, "auxiliary_loss_mlp": 0.0104516, "balance_loss_clip": 1.14402032, "balance_loss_mlp": 1.02541852, "epoch": 0.3927250864271757, "flos": 63921828816000.0, "grad_norm": 0.8297735564892434, "language_loss": 0.60433906, "learning_rate": 2.7714346109959822e-06, "loss": 0.62731391, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 0.19726562, "step": 6532, "time_per_iteration": 3.200252056121826 }, { "auxiliary_loss_clip": 0.01256339, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.14762747, "balance_loss_mlp": 1.00539672, "epoch": 0.3927852096798437, "flos": 68943483509760.0, "grad_norm": 0.7844141063688981, "language_loss": 0.556095, "learning_rate": 2.771075272396981e-06, "loss": 0.57893836, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.22558594, "step": 6533, "time_per_iteration": 3.3633668422698975 }, { "auxiliary_loss_clip": 0.01497272, "auxiliary_loss_mlp": 0.01037545, "balance_loss_clip": 1.31052828, "balance_loss_mlp": 1.01634967, "epoch": 0.39284533293251167, "flos": 29727981486720.0, "grad_norm": 1.9367288234962818, "language_loss": 0.77596992, "learning_rate": 2.7707159045587284e-06, "loss": 0.80131805, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.21191406, "step": 6534, "time_per_iteration": 2.9465172290802 }, { "auxiliary_loss_clip": 0.01496993, "auxiliary_loss_mlp": 0.01038887, "balance_loss_clip": 1.30748987, "balance_loss_mlp": 1.01665449, "epoch": 0.39290545618517964, "flos": 18561321469440.0, "grad_norm": 2.0171849265588944, "language_loss": 0.79380453, "learning_rate": 2.770356507494851e-06, "loss": 0.81916326, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.22241211, "step": 6535, "time_per_iteration": 2.8136367797851562 }, { "auxiliary_loss_clip": 0.0147328, "auxiliary_loss_mlp": 0.01040236, "balance_loss_clip": 1.2912364, "balance_loss_mlp": 1.01834917, "epoch": 0.3929655794378476, "flos": 26260499560320.0, "grad_norm": 1.605071914258273, "language_loss": 0.68932575, "learning_rate": 2.769997081218978e-06, "loss": 0.71446097, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.21875, "step": 6536, "time_per_iteration": 2.9005329608917236 }, { "auxiliary_loss_clip": 0.01462204, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.28379941, "balance_loss_mlp": 1.01648605, "epoch": 0.39302570269051557, "flos": 29289775167360.0, "grad_norm": 1.9099635530738206, "language_loss": 0.70077354, "learning_rate": 2.769637625744738e-06, "loss": 0.72577572, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.21533203, "step": 6537, "time_per_iteration": 2.8891079425811768 }, { "auxiliary_loss_clip": 0.01478646, "auxiliary_loss_mlp": 0.0104368, "balance_loss_clip": 1.29367328, "balance_loss_mlp": 1.01980281, "epoch": 0.39308582594318353, "flos": 17356276713600.0, "grad_norm": 2.301179986908904, "language_loss": 0.79670823, "learning_rate": 2.769278141085763e-06, "loss": 0.82193154, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.2388916, "step": 6538, "time_per_iteration": 2.834627866744995 }, { "auxiliary_loss_clip": 0.01248717, "auxiliary_loss_mlp": 0.01021196, "balance_loss_clip": 1.13858294, "balance_loss_mlp": 0.99897557, "epoch": 0.3931459491958515, "flos": 61033147361280.0, "grad_norm": 0.8052640312663555, "language_loss": 0.61967766, "learning_rate": 2.768918627255683e-06, "loss": 0.64237678, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 1.1015625, "router_z_loss_mlp": 0.22265625, "step": 6539, "time_per_iteration": 3.1424150466918945 }, { "auxiliary_loss_clip": 0.01483646, "auxiliary_loss_mlp": 0.01040554, "balance_loss_clip": 1.30007553, "balance_loss_mlp": 1.0181669, "epoch": 0.39320607244851946, "flos": 39029436846720.0, "grad_norm": 2.0953723427176567, "language_loss": 0.68625259, "learning_rate": 2.7685590842681315e-06, "loss": 0.71149457, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.22387695, "step": 6540, "time_per_iteration": 2.96734881401062 }, { "auxiliary_loss_clip": 0.01484796, "auxiliary_loss_mlp": 0.01045853, "balance_loss_clip": 1.30104339, "balance_loss_mlp": 1.02252352, "epoch": 0.3932661957011874, "flos": 24690147137280.0, "grad_norm": 1.7120149454324094, "language_loss": 0.72967541, "learning_rate": 2.7681995121367433e-06, "loss": 0.75498188, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23303223, "step": 6541, "time_per_iteration": 2.8768720626831055 }, { "auxiliary_loss_clip": 0.01252999, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.14407766, "balance_loss_mlp": 1.00263786, "epoch": 0.3933263189538554, "flos": 70126132089600.0, "grad_norm": 0.8248936924620905, "language_loss": 0.60389388, "learning_rate": 2.7678399108751516e-06, "loss": 0.62668002, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.22949219, "step": 6542, "time_per_iteration": 3.129950523376465 }, { "auxiliary_loss_clip": 0.01479956, "auxiliary_loss_mlp": 0.01044746, "balance_loss_clip": 1.29591346, "balance_loss_mlp": 1.02140474, "epoch": 0.39338644220652336, "flos": 22939176896640.0, "grad_norm": 3.1261540717692604, "language_loss": 0.83070922, "learning_rate": 2.7674802804969947e-06, "loss": 0.85595626, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.23303223, "step": 6543, "time_per_iteration": 2.8621015548706055 }, { "auxiliary_loss_clip": 0.01469356, "auxiliary_loss_mlp": 0.01044965, "balance_loss_clip": 1.28551626, "balance_loss_mlp": 1.02132618, "epoch": 0.3934465654591913, "flos": 30860761017600.0, "grad_norm": 1.896485214834873, "language_loss": 0.69794786, "learning_rate": 2.767120621015908e-06, "loss": 0.72309113, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23657227, "step": 6544, "time_per_iteration": 2.91471266746521 }, { "auxiliary_loss_clip": 0.0148786, "auxiliary_loss_mlp": 0.01046209, "balance_loss_clip": 1.29942107, "balance_loss_mlp": 1.02167594, "epoch": 0.3935066887118593, "flos": 29247082058880.0, "grad_norm": 2.109915128070669, "language_loss": 0.7639848, "learning_rate": 2.76676093244553e-06, "loss": 0.78932548, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.24536133, "step": 6545, "time_per_iteration": 2.9296715259552 }, { "auxiliary_loss_clip": 0.01459401, "auxiliary_loss_mlp": 0.01045994, "balance_loss_clip": 1.28283834, "balance_loss_mlp": 1.02370214, "epoch": 0.3935668119645273, "flos": 19144601176320.0, "grad_norm": 1.5157993198059068, "language_loss": 0.75144851, "learning_rate": 2.7664012147995015e-06, "loss": 0.77650249, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.22302246, "step": 6546, "time_per_iteration": 2.8275420665740967 }, { "auxiliary_loss_clip": 0.0150677, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.31472278, "balance_loss_mlp": 1.01906037, "epoch": 0.3936269352171953, "flos": 18525550815360.0, "grad_norm": 1.9973043510342527, "language_loss": 0.82351792, "learning_rate": 2.7660414680914617e-06, "loss": 0.849015, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.2388916, "step": 6547, "time_per_iteration": 2.799750804901123 }, { "auxiliary_loss_clip": 0.01486583, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.3019762, "balance_loss_mlp": 1.01869321, "epoch": 0.39368705846986324, "flos": 15641258106240.0, "grad_norm": 2.0218766220773423, "language_loss": 0.8451618, "learning_rate": 2.7656816923350525e-06, "loss": 0.87044573, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.23132324, "step": 6548, "time_per_iteration": 2.858471155166626 }, { "auxiliary_loss_clip": 0.01471623, "auxiliary_loss_mlp": 0.01038332, "balance_loss_clip": 1.2897172, "balance_loss_mlp": 1.01729107, "epoch": 0.3937471817225312, "flos": 21336085221120.0, "grad_norm": 1.7513204856908495, "language_loss": 0.73388767, "learning_rate": 2.7653218875439174e-06, "loss": 0.75898719, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.21044922, "step": 6549, "time_per_iteration": 4.360385179519653 }, { "auxiliary_loss_clip": 0.01500762, "auxiliary_loss_mlp": 0.0104547, "balance_loss_clip": 1.31430566, "balance_loss_mlp": 1.02162838, "epoch": 0.39380730497519917, "flos": 20786313928320.0, "grad_norm": 1.5107616511517212, "language_loss": 0.78474641, "learning_rate": 2.764962053731699e-06, "loss": 0.81020874, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.23852539, "step": 6550, "time_per_iteration": 2.8533008098602295 }, { "auxiliary_loss_clip": 0.01480528, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.29686213, "balance_loss_mlp": 1.01629782, "epoch": 0.39386742822786713, "flos": 21618042687360.0, "grad_norm": 2.0100223811213067, "language_loss": 0.81656879, "learning_rate": 2.7646021909120434e-06, "loss": 0.84176332, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22619629, "step": 6551, "time_per_iteration": 2.8873531818389893 }, { "auxiliary_loss_clip": 0.01469733, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.28538704, "balance_loss_mlp": 1.01772761, "epoch": 0.3939275514805351, "flos": 12420324950400.0, "grad_norm": 2.1774937406382877, "language_loss": 0.80593526, "learning_rate": 2.764242299098596e-06, "loss": 0.83103597, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22607422, "step": 6552, "time_per_iteration": 2.9269766807556152 }, { "auxiliary_loss_clip": 0.01491072, "auxiliary_loss_mlp": 0.01046622, "balance_loss_clip": 1.30464625, "balance_loss_mlp": 1.02447248, "epoch": 0.39398767473320306, "flos": 18561321469440.0, "grad_norm": 2.154720883761838, "language_loss": 0.71704704, "learning_rate": 2.763882378305003e-06, "loss": 0.74242395, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.22167969, "step": 6553, "time_per_iteration": 2.889822244644165 }, { "auxiliary_loss_clip": 0.01467518, "auxiliary_loss_mlp": 0.010431, "balance_loss_clip": 1.28547549, "balance_loss_mlp": 1.02029526, "epoch": 0.39404779798587103, "flos": 29319302039040.0, "grad_norm": 2.0590786757959294, "language_loss": 0.6506564, "learning_rate": 2.7635224285449144e-06, "loss": 0.67576253, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22802734, "step": 6554, "time_per_iteration": 2.925755739212036 }, { "auxiliary_loss_clip": 0.01472356, "auxiliary_loss_mlp": 0.01042834, "balance_loss_clip": 1.28873515, "balance_loss_mlp": 1.02030349, "epoch": 0.394107921238539, "flos": 34910436775680.0, "grad_norm": 23.005876682661086, "language_loss": 0.80254412, "learning_rate": 2.7631624498319796e-06, "loss": 0.82769597, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.2253418, "step": 6555, "time_per_iteration": 2.9472358226776123 }, { "auxiliary_loss_clip": 0.01486856, "auxiliary_loss_mlp": 0.01038624, "balance_loss_clip": 1.30029416, "balance_loss_mlp": 1.01481807, "epoch": 0.39416804449120696, "flos": 25091994620160.0, "grad_norm": 1.630655908738108, "language_loss": 0.72679985, "learning_rate": 2.7628024421798473e-06, "loss": 0.75205469, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.23815918, "step": 6556, "time_per_iteration": 4.28685736656189 }, { "auxiliary_loss_clip": 0.01475483, "auxiliary_loss_mlp": 0.01035065, "balance_loss_clip": 1.29115808, "balance_loss_mlp": 1.01276064, "epoch": 0.3942281677438749, "flos": 32319367488000.0, "grad_norm": 2.3214319040599323, "language_loss": 0.83904105, "learning_rate": 2.7624424056021705e-06, "loss": 0.86414647, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22302246, "step": 6557, "time_per_iteration": 4.37511134147644 }, { "auxiliary_loss_clip": 0.01481856, "auxiliary_loss_mlp": 0.01039075, "balance_loss_clip": 1.29715168, "balance_loss_mlp": 1.0165925, "epoch": 0.3942882909965429, "flos": 24947464170240.0, "grad_norm": 2.064544376955438, "language_loss": 0.81300175, "learning_rate": 2.7620823401126004e-06, "loss": 0.83821112, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.22473145, "step": 6558, "time_per_iteration": 4.268579721450806 }, { "auxiliary_loss_clip": 0.0147978, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.29711282, "balance_loss_mlp": 1.01409864, "epoch": 0.39434841424921085, "flos": 11882272060800.0, "grad_norm": 2.2100435787577952, "language_loss": 0.7218855, "learning_rate": 2.761722245724792e-06, "loss": 0.74704957, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.2253418, "step": 6559, "time_per_iteration": 2.7939467430114746 }, { "auxiliary_loss_clip": 0.01503229, "auxiliary_loss_mlp": 0.01040983, "balance_loss_clip": 1.3119272, "balance_loss_mlp": 1.01830983, "epoch": 0.3944085375018789, "flos": 16370018403840.0, "grad_norm": 1.9738544126592652, "language_loss": 0.80983186, "learning_rate": 2.7613621224524003e-06, "loss": 0.83527398, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.22668457, "step": 6560, "time_per_iteration": 2.799424648284912 }, { "auxiliary_loss_clip": 0.01487102, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.30070114, "balance_loss_mlp": 1.01745558, "epoch": 0.39446866075454684, "flos": 10640280286080.0, "grad_norm": 1.994206091552577, "language_loss": 0.8379246, "learning_rate": 2.7610019703090803e-06, "loss": 0.8632018, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.23156738, "step": 6561, "time_per_iteration": 2.846052646636963 }, { "auxiliary_loss_clip": 0.01468628, "auxiliary_loss_mlp": 0.01035069, "balance_loss_clip": 1.28457093, "balance_loss_mlp": 1.01251507, "epoch": 0.3945287840072148, "flos": 18196782963840.0, "grad_norm": 2.422890216631079, "language_loss": 0.81424081, "learning_rate": 2.7606417893084887e-06, "loss": 0.83927774, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22546387, "step": 6562, "time_per_iteration": 2.8391945362091064 }, { "auxiliary_loss_clip": 0.01468561, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.28732133, "balance_loss_mlp": 1.01691651, "epoch": 0.39458890725988277, "flos": 23050379911680.0, "grad_norm": 1.5404717174373483, "language_loss": 0.82061183, "learning_rate": 2.7602815794642853e-06, "loss": 0.84569311, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.22644043, "step": 6563, "time_per_iteration": 2.9049787521362305 }, { "auxiliary_loss_clip": 0.01475575, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.29013693, "balance_loss_mlp": 1.01649344, "epoch": 0.39464903051255074, "flos": 17166293222400.0, "grad_norm": 2.4617212188495645, "language_loss": 0.70996672, "learning_rate": 2.759921340790127e-06, "loss": 0.73511082, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.2232666, "step": 6564, "time_per_iteration": 2.8074395656585693 }, { "auxiliary_loss_clip": 0.01481578, "auxiliary_loss_mlp": 0.01038625, "balance_loss_clip": 1.29474294, "balance_loss_mlp": 1.01568913, "epoch": 0.3947091537652187, "flos": 15897715488000.0, "grad_norm": 2.6602540727915747, "language_loss": 0.84789193, "learning_rate": 2.759561073299676e-06, "loss": 0.87309396, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.22937012, "step": 6565, "time_per_iteration": 2.837442636489868 }, { "auxiliary_loss_clip": 0.014725, "auxiliary_loss_mlp": 0.01035268, "balance_loss_clip": 1.28869128, "balance_loss_mlp": 1.01319051, "epoch": 0.39476927701788667, "flos": 18553086915840.0, "grad_norm": 1.7655644726715483, "language_loss": 0.84606266, "learning_rate": 2.7592007770065937e-06, "loss": 0.87114036, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.22058105, "step": 6566, "time_per_iteration": 2.793774127960205 }, { "auxiliary_loss_clip": 0.01501007, "auxiliary_loss_mlp": 0.01040933, "balance_loss_clip": 1.30888116, "balance_loss_mlp": 1.01918876, "epoch": 0.39482940027055463, "flos": 22286527632000.0, "grad_norm": 3.186992962660655, "language_loss": 0.78571385, "learning_rate": 2.7588404519245403e-06, "loss": 0.81113321, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 1.921875, "router_z_loss_mlp": 0.21740723, "step": 6567, "time_per_iteration": 2.861283540725708 }, { "auxiliary_loss_clip": 0.01457389, "auxiliary_loss_mlp": 0.01039224, "balance_loss_clip": 1.27805948, "balance_loss_mlp": 1.01761198, "epoch": 0.3948895235232226, "flos": 14765976587520.0, "grad_norm": 2.446334283916939, "language_loss": 0.80780041, "learning_rate": 2.758480098067182e-06, "loss": 0.83276647, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.21618652, "step": 6568, "time_per_iteration": 2.8019163608551025 }, { "auxiliary_loss_clip": 0.01473127, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.2886132, "balance_loss_mlp": 1.0169065, "epoch": 0.39494964677589056, "flos": 22576176714240.0, "grad_norm": 1.5783561122097896, "language_loss": 0.85738629, "learning_rate": 2.7581197154481816e-06, "loss": 0.88251048, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22387695, "step": 6569, "time_per_iteration": 2.871307134628296 }, { "auxiliary_loss_clip": 0.01473724, "auxiliary_loss_mlp": 0.0104082, "balance_loss_clip": 1.29112279, "balance_loss_mlp": 1.01869512, "epoch": 0.3950097700285585, "flos": 22972685310720.0, "grad_norm": 1.8149179879007349, "language_loss": 0.75317466, "learning_rate": 2.7577593040812066e-06, "loss": 0.77832013, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.22131348, "step": 6570, "time_per_iteration": 2.866025447845459 }, { "auxiliary_loss_clip": 0.01470264, "auxiliary_loss_mlp": 0.01038107, "balance_loss_clip": 1.28562343, "balance_loss_mlp": 1.01659012, "epoch": 0.3950698932812265, "flos": 20605062683520.0, "grad_norm": 1.5865369347785425, "language_loss": 0.80531251, "learning_rate": 2.757398863979922e-06, "loss": 0.83039629, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.21520996, "step": 6571, "time_per_iteration": 2.835641384124756 }, { "auxiliary_loss_clip": 0.01470822, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.28788424, "balance_loss_mlp": 1.01684654, "epoch": 0.39513001653389446, "flos": 20385416586240.0, "grad_norm": 2.0891886773397683, "language_loss": 0.78874087, "learning_rate": 2.757038395157997e-06, "loss": 0.81383646, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21887207, "step": 6572, "time_per_iteration": 2.8703367710113525 }, { "auxiliary_loss_clip": 0.0147982, "auxiliary_loss_mlp": 0.01037033, "balance_loss_clip": 1.29361761, "balance_loss_mlp": 1.01428771, "epoch": 0.3951901397865625, "flos": 26473494672000.0, "grad_norm": 2.1750907042735776, "language_loss": 0.7514416, "learning_rate": 2.7566778976291002e-06, "loss": 0.7766102, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.22729492, "step": 6573, "time_per_iteration": 2.8805041313171387 }, { "auxiliary_loss_clip": 0.01479869, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.29626048, "balance_loss_mlp": 1.01453102, "epoch": 0.39525026303923044, "flos": 43854638042880.0, "grad_norm": 1.593476149063084, "language_loss": 0.68314344, "learning_rate": 2.7563173714069017e-06, "loss": 0.70829594, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.20849609, "step": 6574, "time_per_iteration": 3.124318838119507 }, { "auxiliary_loss_clip": 0.01477643, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.29043925, "balance_loss_mlp": 1.0130024, "epoch": 0.3953103862918984, "flos": 18049854539520.0, "grad_norm": 2.3150344167708927, "language_loss": 0.7323705, "learning_rate": 2.755956816505072e-06, "loss": 0.75750107, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.22412109, "step": 6575, "time_per_iteration": 2.8557560443878174 }, { "auxiliary_loss_clip": 0.0149352, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.30404568, "balance_loss_mlp": 1.01971316, "epoch": 0.3953705095445664, "flos": 16983051206400.0, "grad_norm": 3.0166831190399694, "language_loss": 0.74060446, "learning_rate": 2.7555962329372845e-06, "loss": 0.76596677, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.22998047, "step": 6576, "time_per_iteration": 2.907925605773926 }, { "auxiliary_loss_clip": 0.01474956, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.28951383, "balance_loss_mlp": 1.01694441, "epoch": 0.39543063279723434, "flos": 17417592696960.0, "grad_norm": 2.33763481752221, "language_loss": 0.8438313, "learning_rate": 2.7552356207172124e-06, "loss": 0.86895657, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.20629883, "step": 6577, "time_per_iteration": 2.895383358001709 }, { "auxiliary_loss_clip": 0.01482899, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.29922807, "balance_loss_mlp": 1.01761007, "epoch": 0.3954907560499023, "flos": 22794555957120.0, "grad_norm": 2.4946180715393633, "language_loss": 0.91537964, "learning_rate": 2.75487497985853e-06, "loss": 0.94061226, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22753906, "step": 6578, "time_per_iteration": 2.9213907718658447 }, { "auxiliary_loss_clip": 0.01488697, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.29864359, "balance_loss_mlp": 1.01853585, "epoch": 0.39555087930257027, "flos": 21954366420480.0, "grad_norm": 1.9331173304684939, "language_loss": 0.79063666, "learning_rate": 2.7545143103749117e-06, "loss": 0.81594592, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.23718262, "step": 6579, "time_per_iteration": 2.952899694442749 }, { "auxiliary_loss_clip": 0.01492287, "auxiliary_loss_mlp": 0.01037414, "balance_loss_clip": 1.30243576, "balance_loss_mlp": 1.0136317, "epoch": 0.39561100255523823, "flos": 20412274014720.0, "grad_norm": 2.25203564265753, "language_loss": 0.69400918, "learning_rate": 2.754153612280037e-06, "loss": 0.71930623, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.23803711, "step": 6580, "time_per_iteration": 2.8576860427856445 }, { "auxiliary_loss_clip": 0.01472771, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.28704071, "balance_loss_mlp": 1.01515555, "epoch": 0.3956711258079062, "flos": 27976106350080.0, "grad_norm": 1.929766114834269, "language_loss": 0.59708238, "learning_rate": 2.7537928855875797e-06, "loss": 0.62218136, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.21960449, "step": 6581, "time_per_iteration": 2.9328601360321045 }, { "auxiliary_loss_clip": 0.01480082, "auxiliary_loss_mlp": 0.01038704, "balance_loss_clip": 1.29438698, "balance_loss_mlp": 1.01595938, "epoch": 0.39573124906057416, "flos": 14436303840000.0, "grad_norm": 2.257470849691793, "language_loss": 0.7048713, "learning_rate": 2.7534321303112224e-06, "loss": 0.73005921, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22753906, "step": 6582, "time_per_iteration": 2.936013698577881 }, { "auxiliary_loss_clip": 0.01484426, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.29732001, "balance_loss_mlp": 1.01636255, "epoch": 0.39579137231324213, "flos": 18742844183040.0, "grad_norm": 4.850254105393422, "language_loss": 0.77062416, "learning_rate": 2.753071346464642e-06, "loss": 0.79584634, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.2142334, "step": 6583, "time_per_iteration": 2.8234238624572754 }, { "auxiliary_loss_clip": 0.01477311, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.29092383, "balance_loss_mlp": 1.01446509, "epoch": 0.3958514955659101, "flos": 17685497013120.0, "grad_norm": 1.4997864119641209, "language_loss": 0.66754079, "learning_rate": 2.7527105340615207e-06, "loss": 0.69266254, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.20410156, "step": 6584, "time_per_iteration": 2.836475372314453 }, { "auxiliary_loss_clip": 0.01496704, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.30636358, "balance_loss_mlp": 1.02332783, "epoch": 0.39591161881857806, "flos": 29319573507840.0, "grad_norm": 2.2869198702366065, "language_loss": 0.7339831, "learning_rate": 2.7523496931155413e-06, "loss": 0.75940454, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 1.90136719, "router_z_loss_mlp": 0.22106934, "step": 6585, "time_per_iteration": 4.310010671615601 }, { "auxiliary_loss_clip": 0.01474366, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.28697908, "balance_loss_mlp": 1.01495767, "epoch": 0.3959717420712461, "flos": 25781862372480.0, "grad_norm": 2.756985163476723, "language_loss": 0.742046, "learning_rate": 2.7519888236403856e-06, "loss": 0.76715052, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.21118164, "step": 6586, "time_per_iteration": 2.9039525985717773 }, { "auxiliary_loss_clip": 0.01484702, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.29637241, "balance_loss_mlp": 1.01690173, "epoch": 0.39603186532391405, "flos": 20933740045440.0, "grad_norm": 1.6122831067467862, "language_loss": 0.72106218, "learning_rate": 2.7516279256497382e-06, "loss": 0.74630213, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.22399902, "step": 6587, "time_per_iteration": 2.8939497470855713 }, { "auxiliary_loss_clip": 0.01242106, "auxiliary_loss_mlp": 0.01024492, "balance_loss_clip": 1.13783884, "balance_loss_mlp": 1.00475109, "epoch": 0.396091988576582, "flos": 54906357709440.0, "grad_norm": 0.8740181847071061, "language_loss": 0.61278713, "learning_rate": 2.751266999157285e-06, "loss": 0.6354531, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 0.19726562, "step": 6588, "time_per_iteration": 3.2782557010650635 }, { "auxiliary_loss_clip": 0.01493082, "auxiliary_loss_mlp": 0.01039621, "balance_loss_clip": 1.30390477, "balance_loss_mlp": 1.01823509, "epoch": 0.39615211182925, "flos": 20712284156160.0, "grad_norm": 1.6979532649248734, "language_loss": 0.81773698, "learning_rate": 2.7509060441767115e-06, "loss": 0.84306395, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.21374512, "step": 6589, "time_per_iteration": 2.9053702354431152 }, { "auxiliary_loss_clip": 0.01490885, "auxiliary_loss_mlp": 0.01037302, "balance_loss_clip": 1.30111742, "balance_loss_mlp": 1.01491439, "epoch": 0.39621223508191794, "flos": 21003652540800.0, "grad_norm": 2.645348759630068, "language_loss": 0.70878124, "learning_rate": 2.7505450607217057e-06, "loss": 0.73406315, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.22375488, "step": 6590, "time_per_iteration": 2.954557418823242 }, { "auxiliary_loss_clip": 0.01477142, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.29195988, "balance_loss_mlp": 1.01655281, "epoch": 0.3962723583345859, "flos": 23379554966400.0, "grad_norm": 1.7765237536725684, "language_loss": 0.75894034, "learning_rate": 2.750184048805956e-06, "loss": 0.78409082, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21350098, "step": 6591, "time_per_iteration": 2.9496283531188965 }, { "auxiliary_loss_clip": 0.0148147, "auxiliary_loss_mlp": 0.01039473, "balance_loss_clip": 1.2946527, "balance_loss_mlp": 1.0171212, "epoch": 0.39633248158725387, "flos": 25125593523840.0, "grad_norm": 2.2327032530817634, "language_loss": 0.7880711, "learning_rate": 2.749823008443152e-06, "loss": 0.81328052, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22338867, "step": 6592, "time_per_iteration": 5.74885368347168 }, { "auxiliary_loss_clip": 0.01467612, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.28506815, "balance_loss_mlp": 1.01444125, "epoch": 0.39639260483992184, "flos": 39802654800000.0, "grad_norm": 2.0519178569021395, "language_loss": 0.69530213, "learning_rate": 2.7494619396469843e-06, "loss": 0.72033763, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.21459961, "step": 6593, "time_per_iteration": 4.4713521003723145 }, { "auxiliary_loss_clip": 0.01486005, "auxiliary_loss_mlp": 0.01044371, "balance_loss_clip": 1.29662621, "balance_loss_mlp": 1.02234125, "epoch": 0.3964527280925898, "flos": 17355779020800.0, "grad_norm": 2.6506204407076286, "language_loss": 0.78044224, "learning_rate": 2.7491008424311452e-06, "loss": 0.80574596, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.22033691, "step": 6594, "time_per_iteration": 2.94380784034729 }, { "auxiliary_loss_clip": 0.01245823, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.13871956, "balance_loss_mlp": 1.016343, "epoch": 0.39651285134525777, "flos": 71751981899520.0, "grad_norm": 1.1493737908206458, "language_loss": 0.63134992, "learning_rate": 2.7487397168093265e-06, "loss": 0.65414989, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 1.0703125, "router_z_loss_mlp": 0.17871094, "step": 6595, "time_per_iteration": 3.4127001762390137 }, { "auxiliary_loss_clip": 0.01497954, "auxiliary_loss_mlp": 0.01041919, "balance_loss_clip": 1.30718112, "balance_loss_mlp": 1.01986563, "epoch": 0.39657297459792573, "flos": 25786748810880.0, "grad_norm": 2.0759823633828485, "language_loss": 0.64069873, "learning_rate": 2.748378562795223e-06, "loss": 0.66609746, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.22070312, "step": 6596, "time_per_iteration": 2.916390895843506 }, { "auxiliary_loss_clip": 0.01457862, "auxiliary_loss_mlp": 0.01038727, "balance_loss_clip": 1.27720165, "balance_loss_mlp": 1.01804376, "epoch": 0.3966330978505937, "flos": 20275661404800.0, "grad_norm": 1.7861893724144253, "language_loss": 0.79292023, "learning_rate": 2.7480173804025293e-06, "loss": 0.81788617, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.20666504, "step": 6597, "time_per_iteration": 2.9172351360321045 }, { "auxiliary_loss_clip": 0.01495618, "auxiliary_loss_mlp": 0.01040225, "balance_loss_clip": 1.30506778, "balance_loss_mlp": 1.01798081, "epoch": 0.39669322110326166, "flos": 20640878582400.0, "grad_norm": 2.263292847172142, "language_loss": 0.68497074, "learning_rate": 2.747656169644941e-06, "loss": 0.71032917, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 0.22253418, "step": 6598, "time_per_iteration": 2.8455846309661865 }, { "auxiliary_loss_clip": 0.01482165, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.29377866, "balance_loss_mlp": 1.01869726, "epoch": 0.3967533443559297, "flos": 21736122912000.0, "grad_norm": 2.6715068887304625, "language_loss": 0.79484105, "learning_rate": 2.747294930536157e-06, "loss": 0.82005817, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.20837402, "step": 6599, "time_per_iteration": 2.9381017684936523 }, { "auxiliary_loss_clip": 0.0148589, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.29776335, "balance_loss_mlp": 1.01604748, "epoch": 0.39681346760859765, "flos": 25495199447040.0, "grad_norm": 2.2958424263427366, "language_loss": 0.73252112, "learning_rate": 2.7469336630898737e-06, "loss": 0.75777233, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.23193359, "step": 6600, "time_per_iteration": 2.885343551635742 }, { "auxiliary_loss_clip": 0.0147746, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.28987539, "balance_loss_mlp": 1.01558876, "epoch": 0.3968735908612656, "flos": 20969374965120.0, "grad_norm": 2.210982761197086, "language_loss": 0.86426091, "learning_rate": 2.746572367319791e-06, "loss": 0.8894043, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.2130127, "step": 6601, "time_per_iteration": 2.85606050491333 }, { "auxiliary_loss_clip": 0.01494752, "auxiliary_loss_mlp": 0.01044533, "balance_loss_clip": 1.30188775, "balance_loss_mlp": 1.02116823, "epoch": 0.3969337141139336, "flos": 10714219568640.0, "grad_norm": 2.438144760481022, "language_loss": 0.70808136, "learning_rate": 2.7462110432396095e-06, "loss": 0.73347425, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 1.9296875, "router_z_loss_mlp": 0.23376465, "step": 6602, "time_per_iteration": 2.796848773956299 }, { "auxiliary_loss_clip": 0.01485633, "auxiliary_loss_mlp": 0.01045793, "balance_loss_clip": 1.29501867, "balance_loss_mlp": 1.02397752, "epoch": 0.39699383736660154, "flos": 17600246530560.0, "grad_norm": 3.0046105078852, "language_loss": 0.85038489, "learning_rate": 2.7458496908630305e-06, "loss": 0.87569916, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 1.90722656, "router_z_loss_mlp": 0.21826172, "step": 6603, "time_per_iteration": 2.8304295539855957 }, { "auxiliary_loss_clip": 0.0148068, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.29388535, "balance_loss_mlp": 1.01693797, "epoch": 0.3970539606192695, "flos": 17795071215360.0, "grad_norm": 1.8195238822985256, "language_loss": 0.7349658, "learning_rate": 2.7454883102037563e-06, "loss": 0.7601546, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.21264648, "step": 6604, "time_per_iteration": 2.843843698501587 }, { "auxiliary_loss_clip": 0.01464191, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.28384614, "balance_loss_mlp": 1.02029848, "epoch": 0.3971140838719375, "flos": 24799992808320.0, "grad_norm": 1.7586239919664313, "language_loss": 0.82954371, "learning_rate": 2.745126901275491e-06, "loss": 0.85459781, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.20910645, "step": 6605, "time_per_iteration": 2.861524820327759 }, { "auxiliary_loss_clip": 0.01470834, "auxiliary_loss_mlp": 0.01036659, "balance_loss_clip": 1.28614497, "balance_loss_mlp": 1.01495075, "epoch": 0.39717420712460544, "flos": 24254293547520.0, "grad_norm": 2.778611059543577, "language_loss": 0.7488209, "learning_rate": 2.7447654640919383e-06, "loss": 0.77389586, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.21716309, "step": 6606, "time_per_iteration": 2.990558385848999 }, { "auxiliary_loss_clip": 0.01490021, "auxiliary_loss_mlp": 0.01035966, "balance_loss_clip": 1.29993939, "balance_loss_mlp": 1.01491332, "epoch": 0.3972343303772734, "flos": 25895644341120.0, "grad_norm": 1.832985078083235, "language_loss": 0.74589497, "learning_rate": 2.744403998666805e-06, "loss": 0.77115488, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 1.90234375, "router_z_loss_mlp": 0.21069336, "step": 6607, "time_per_iteration": 2.870893716812134 }, { "auxiliary_loss_clip": 0.01489745, "auxiliary_loss_mlp": 0.01041019, "balance_loss_clip": 1.30014467, "balance_loss_mlp": 1.01951337, "epoch": 0.39729445362994137, "flos": 45639840614400.0, "grad_norm": 1.4567900588699736, "language_loss": 0.68663007, "learning_rate": 2.744042505013797e-06, "loss": 0.71193773, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.21508789, "step": 6608, "time_per_iteration": 3.0299103260040283 }, { "auxiliary_loss_clip": 0.01492896, "auxiliary_loss_mlp": 0.0104249, "balance_loss_clip": 1.30121541, "balance_loss_mlp": 1.02005506, "epoch": 0.39735457688260933, "flos": 20203712893440.0, "grad_norm": 1.9873138562668455, "language_loss": 0.75270557, "learning_rate": 2.7436809831466233e-06, "loss": 0.77805942, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.22412109, "step": 6609, "time_per_iteration": 2.878965139389038 }, { "auxiliary_loss_clip": 0.01496928, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.30827749, "balance_loss_mlp": 1.0234983, "epoch": 0.3974147001352773, "flos": 23341567317120.0, "grad_norm": 1.9057705259741453, "language_loss": 0.7240485, "learning_rate": 2.7433194330789927e-06, "loss": 0.74947309, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.22021484, "step": 6610, "time_per_iteration": 2.8512966632843018 }, { "auxiliary_loss_clip": 0.01467685, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.2845794, "balance_loss_mlp": 1.01976228, "epoch": 0.39747482338794526, "flos": 21698451976320.0, "grad_norm": 1.6649936205138955, "language_loss": 0.79294044, "learning_rate": 2.7429578548246133e-06, "loss": 0.81803608, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22106934, "step": 6611, "time_per_iteration": 2.8537490367889404 }, { "auxiliary_loss_clip": 0.01483943, "auxiliary_loss_mlp": 0.01045236, "balance_loss_clip": 1.29548335, "balance_loss_mlp": 1.0237422, "epoch": 0.3975349466406133, "flos": 30999952581120.0, "grad_norm": 1.8694837516679714, "language_loss": 0.7991693, "learning_rate": 2.7425962483971985e-06, "loss": 0.8244611, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.21484375, "step": 6612, "time_per_iteration": 2.9016566276550293 }, { "auxiliary_loss_clip": 0.01229635, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.12363505, "balance_loss_mlp": 1.02018976, "epoch": 0.39759506989328125, "flos": 63714172590720.0, "grad_norm": 0.8613701226834495, "language_loss": 0.65054369, "learning_rate": 2.742234613810459e-06, "loss": 0.6732403, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.19824219, "step": 6613, "time_per_iteration": 3.2614665031433105 }, { "auxiliary_loss_clip": 0.01483428, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.29706907, "balance_loss_mlp": 1.0191592, "epoch": 0.3976551931459492, "flos": 23706286801920.0, "grad_norm": 7.858383440013012, "language_loss": 0.72740179, "learning_rate": 2.741872951078109e-06, "loss": 0.75264716, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21936035, "step": 6614, "time_per_iteration": 2.8597917556762695 }, { "auxiliary_loss_clip": 0.01480504, "auxiliary_loss_mlp": 0.01040293, "balance_loss_clip": 1.29509568, "balance_loss_mlp": 1.01825142, "epoch": 0.3977153163986172, "flos": 15678205125120.0, "grad_norm": 1.85362720576193, "language_loss": 0.82140237, "learning_rate": 2.741511260213862e-06, "loss": 0.84661031, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22033691, "step": 6615, "time_per_iteration": 2.944305896759033 }, { "auxiliary_loss_clip": 0.01480675, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.29292452, "balance_loss_mlp": 1.01274276, "epoch": 0.39777543965128515, "flos": 14072941699200.0, "grad_norm": 1.9785460549099283, "language_loss": 0.68180203, "learning_rate": 2.741149541231434e-06, "loss": 0.70694387, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.20776367, "step": 6616, "time_per_iteration": 2.892889976501465 }, { "auxiliary_loss_clip": 0.01501956, "auxiliary_loss_mlp": 0.01040161, "balance_loss_clip": 1.30936933, "balance_loss_mlp": 1.01858413, "epoch": 0.3978355629039531, "flos": 23377835664000.0, "grad_norm": 2.8706099509194205, "language_loss": 0.8475318, "learning_rate": 2.740787794144541e-06, "loss": 0.87295288, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 1.92871094, "router_z_loss_mlp": 0.21582031, "step": 6617, "time_per_iteration": 2.959622383117676 }, { "auxiliary_loss_clip": 0.01476434, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.29456091, "balance_loss_mlp": 1.01972795, "epoch": 0.3978956861566211, "flos": 19072200216960.0, "grad_norm": 1.7105717562858966, "language_loss": 0.72671384, "learning_rate": 2.7404260189669e-06, "loss": 0.75187761, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.20214844, "step": 6618, "time_per_iteration": 2.8881735801696777 }, { "auxiliary_loss_clip": 0.01488423, "auxiliary_loss_mlp": 0.01037668, "balance_loss_clip": 1.30045462, "balance_loss_mlp": 1.01400518, "epoch": 0.39795580940928904, "flos": 30240081843840.0, "grad_norm": 1.790417154422876, "language_loss": 0.66451579, "learning_rate": 2.740064215712231e-06, "loss": 0.68977672, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.23681641, "step": 6619, "time_per_iteration": 4.334418058395386 }, { "auxiliary_loss_clip": 0.01230265, "auxiliary_loss_mlp": 0.01042204, "balance_loss_clip": 1.12538528, "balance_loss_mlp": 1.02284455, "epoch": 0.398015932661957, "flos": 69878316464640.0, "grad_norm": 0.777801330185096, "language_loss": 0.58349729, "learning_rate": 2.7397023843942527e-06, "loss": 0.60622203, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.19335938, "step": 6620, "time_per_iteration": 3.320657730102539 }, { "auxiliary_loss_clip": 0.01474988, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.29166722, "balance_loss_mlp": 1.01425147, "epoch": 0.39807605591462497, "flos": 20167580280960.0, "grad_norm": 1.6852235727119147, "language_loss": 0.79696846, "learning_rate": 2.739340525026686e-06, "loss": 0.82206458, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.20373535, "step": 6621, "time_per_iteration": 2.863731861114502 }, { "auxiliary_loss_clip": 0.01479312, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.29639149, "balance_loss_mlp": 1.01398361, "epoch": 0.39813617916729294, "flos": 21151531105920.0, "grad_norm": 6.062456004448127, "language_loss": 0.78872299, "learning_rate": 2.738978637623252e-06, "loss": 0.81387019, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21411133, "step": 6622, "time_per_iteration": 2.8547496795654297 }, { "auxiliary_loss_clip": 0.01475272, "auxiliary_loss_mlp": 0.010381, "balance_loss_clip": 1.29009533, "balance_loss_mlp": 1.01556993, "epoch": 0.3981963024199609, "flos": 18997537017600.0, "grad_norm": 1.5576856212578405, "language_loss": 0.7594732, "learning_rate": 2.738616722197674e-06, "loss": 0.78460693, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.2253418, "step": 6623, "time_per_iteration": 2.8832640647888184 }, { "auxiliary_loss_clip": 0.0148873, "auxiliary_loss_mlp": 0.01042453, "balance_loss_clip": 1.30217016, "balance_loss_mlp": 1.02080488, "epoch": 0.39825642567262887, "flos": 16582153864320.0, "grad_norm": 2.184057595019245, "language_loss": 0.80569506, "learning_rate": 2.7382547787636766e-06, "loss": 0.83100688, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21630859, "step": 6624, "time_per_iteration": 2.844261407852173 }, { "auxiliary_loss_clip": 0.01504825, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.31328404, "balance_loss_mlp": 1.01800215, "epoch": 0.39831654892529683, "flos": 22209873661440.0, "grad_norm": 1.9848074252639605, "language_loss": 0.8485114, "learning_rate": 2.7378928073349832e-06, "loss": 0.87396294, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 1.91503906, "router_z_loss_mlp": 0.2232666, "step": 6625, "time_per_iteration": 2.832756757736206 }, { "auxiliary_loss_clip": 0.0148327, "auxiliary_loss_mlp": 0.01045963, "balance_loss_clip": 1.29913759, "balance_loss_mlp": 1.02312231, "epoch": 0.39837667217796485, "flos": 10494980674560.0, "grad_norm": 2.378175900333542, "language_loss": 0.87445927, "learning_rate": 2.737530807925321e-06, "loss": 0.89975166, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.22827148, "step": 6626, "time_per_iteration": 4.214043378829956 }, { "auxiliary_loss_clip": 0.01476268, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.28993678, "balance_loss_mlp": 1.01819015, "epoch": 0.3984367954306328, "flos": 17973653016960.0, "grad_norm": 2.5889033414007265, "language_loss": 0.84414101, "learning_rate": 2.737168780548417e-06, "loss": 0.86929744, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.21203613, "step": 6627, "time_per_iteration": 4.299873352050781 }, { "auxiliary_loss_clip": 0.01465167, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.28178573, "balance_loss_mlp": 1.01810312, "epoch": 0.3984969186833008, "flos": 22721340591360.0, "grad_norm": 1.7826922125057385, "language_loss": 0.83824605, "learning_rate": 2.736806725217998e-06, "loss": 0.86328626, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.20751953, "step": 6628, "time_per_iteration": 2.887967348098755 }, { "auxiliary_loss_clip": 0.0149903, "auxiliary_loss_mlp": 0.0104256, "balance_loss_clip": 1.31046557, "balance_loss_mlp": 1.02041054, "epoch": 0.39855704193596875, "flos": 23416637719680.0, "grad_norm": 1.6635885480747885, "language_loss": 0.72011828, "learning_rate": 2.7364446419477945e-06, "loss": 0.74553418, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.22131348, "step": 6629, "time_per_iteration": 4.316423654556274 }, { "auxiliary_loss_clip": 0.01455903, "auxiliary_loss_mlp": 0.01042552, "balance_loss_clip": 1.27737856, "balance_loss_mlp": 1.02090323, "epoch": 0.3986171651886367, "flos": 21261919714560.0, "grad_norm": 1.8646172401037568, "language_loss": 0.81034625, "learning_rate": 2.7360825307515366e-06, "loss": 0.83533078, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.21643066, "step": 6630, "time_per_iteration": 2.8564646244049072 }, { "auxiliary_loss_clip": 0.01473077, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.28734756, "balance_loss_mlp": 1.01463461, "epoch": 0.3986772884413047, "flos": 12466366174080.0, "grad_norm": 3.164461314559046, "language_loss": 0.76318049, "learning_rate": 2.7357203916429555e-06, "loss": 0.78827775, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22009277, "step": 6631, "time_per_iteration": 2.8109452724456787 }, { "auxiliary_loss_clip": 0.01471353, "auxiliary_loss_mlp": 0.01041347, "balance_loss_clip": 1.28606606, "balance_loss_mlp": 1.01931727, "epoch": 0.39873741169397264, "flos": 19655570413440.0, "grad_norm": 1.765926667132695, "language_loss": 0.72772241, "learning_rate": 2.735358224635783e-06, "loss": 0.75284946, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22045898, "step": 6632, "time_per_iteration": 2.8458011150360107 }, { "auxiliary_loss_clip": 0.01455904, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.27507925, "balance_loss_mlp": 1.01946974, "epoch": 0.3987975349466406, "flos": 21693565537920.0, "grad_norm": 2.317838278960125, "language_loss": 0.75535572, "learning_rate": 2.7349960297437533e-06, "loss": 0.78033036, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22106934, "step": 6633, "time_per_iteration": 2.82204532623291 }, { "auxiliary_loss_clip": 0.01471947, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.28611743, "balance_loss_mlp": 1.0189898, "epoch": 0.3988576581993086, "flos": 23924394576000.0, "grad_norm": 1.7783172905490863, "language_loss": 0.81580746, "learning_rate": 2.7346338069806e-06, "loss": 0.84093815, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.22131348, "step": 6634, "time_per_iteration": 2.8431789875030518 }, { "auxiliary_loss_clip": 0.01483419, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.29750264, "balance_loss_mlp": 1.01694584, "epoch": 0.39891778145197654, "flos": 18158885804160.0, "grad_norm": 1.9091389275848272, "language_loss": 0.76069069, "learning_rate": 2.7342715563600597e-06, "loss": 0.78591245, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.21826172, "step": 6635, "time_per_iteration": 2.867730140686035 }, { "auxiliary_loss_clip": 0.01504446, "auxiliary_loss_mlp": 0.01045434, "balance_loss_clip": 1.31119013, "balance_loss_mlp": 1.02178264, "epoch": 0.3989779047046445, "flos": 22603893793920.0, "grad_norm": 4.217272985903355, "language_loss": 0.66805291, "learning_rate": 2.733909277895868e-06, "loss": 0.69355178, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 1.93359375, "router_z_loss_mlp": 0.23669434, "step": 6636, "time_per_iteration": 2.83748459815979 }, { "auxiliary_loss_clip": 0.01468044, "auxiliary_loss_mlp": 0.01042482, "balance_loss_clip": 1.28586817, "balance_loss_mlp": 1.02054739, "epoch": 0.39903802795731247, "flos": 18086258620800.0, "grad_norm": 1.7708653506718106, "language_loss": 0.82918596, "learning_rate": 2.733546971601763e-06, "loss": 0.85429126, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.21948242, "step": 6637, "time_per_iteration": 2.8742995262145996 }, { "auxiliary_loss_clip": 0.01238473, "auxiliary_loss_mlp": 0.01054747, "balance_loss_clip": 1.12651479, "balance_loss_mlp": 1.02937937, "epoch": 0.39909815120998043, "flos": 70475305345920.0, "grad_norm": 0.7382101704866447, "language_loss": 0.53240436, "learning_rate": 2.733184637491484e-06, "loss": 0.55533653, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 1.125, "router_z_loss_mlp": 0.25390625, "step": 6638, "time_per_iteration": 3.4348180294036865 }, { "auxiliary_loss_clip": 0.01470872, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.28621352, "balance_loss_mlp": 1.01752186, "epoch": 0.39915827446264845, "flos": 18557837619840.0, "grad_norm": 1.6269036989430905, "language_loss": 0.75785476, "learning_rate": 2.732822275578769e-06, "loss": 0.78295922, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22058105, "step": 6639, "time_per_iteration": 2.853752613067627 }, { "auxiliary_loss_clip": 0.01457374, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.27832484, "balance_loss_mlp": 1.01677322, "epoch": 0.3992183977153164, "flos": 29909006507520.0, "grad_norm": 1.6659626558589975, "language_loss": 0.76696181, "learning_rate": 2.7324598858773603e-06, "loss": 0.7919209, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21777344, "step": 6640, "time_per_iteration": 2.9235072135925293 }, { "auxiliary_loss_clip": 0.01470857, "auxiliary_loss_mlp": 0.01038716, "balance_loss_clip": 1.28719008, "balance_loss_mlp": 1.01687729, "epoch": 0.3992785209679844, "flos": 22575317063040.0, "grad_norm": 2.182998791851487, "language_loss": 0.82788908, "learning_rate": 2.7320974684009996e-06, "loss": 0.85298479, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.21850586, "step": 6641, "time_per_iteration": 2.8876326084136963 }, { "auxiliary_loss_clip": 0.01480655, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.29634559, "balance_loss_mlp": 1.01584029, "epoch": 0.39933864422065235, "flos": 19692155473920.0, "grad_norm": 1.8584846787098261, "language_loss": 0.77461982, "learning_rate": 2.7317350231634288e-06, "loss": 0.79981011, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.22546387, "step": 6642, "time_per_iteration": 2.834116220474243 }, { "auxiliary_loss_clip": 0.01491198, "auxiliary_loss_mlp": 0.01038962, "balance_loss_clip": 1.30318165, "balance_loss_mlp": 1.01708698, "epoch": 0.3993987674733203, "flos": 23048615364480.0, "grad_norm": 2.2755161144694322, "language_loss": 0.72729278, "learning_rate": 2.731372550178393e-06, "loss": 0.75259435, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.21862793, "step": 6643, "time_per_iteration": 2.8666434288024902 }, { "auxiliary_loss_clip": 0.01489156, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.30207253, "balance_loss_mlp": 1.01660669, "epoch": 0.3994588907259883, "flos": 19400244151680.0, "grad_norm": 1.6609068848290167, "language_loss": 0.66779613, "learning_rate": 2.7310100494596375e-06, "loss": 0.69307423, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.22058105, "step": 6644, "time_per_iteration": 2.8829734325408936 }, { "auxiliary_loss_clip": 0.0147528, "auxiliary_loss_mlp": 0.01043043, "balance_loss_clip": 1.2902534, "balance_loss_mlp": 1.02054799, "epoch": 0.39951901397865625, "flos": 13742047342080.0, "grad_norm": 3.161573040176594, "language_loss": 0.79085034, "learning_rate": 2.730647521020907e-06, "loss": 0.8160336, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.22497559, "step": 6645, "time_per_iteration": 2.843592882156372 }, { "auxiliary_loss_clip": 0.01486723, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.29988909, "balance_loss_mlp": 1.01739109, "epoch": 0.3995791372313242, "flos": 23596893578880.0, "grad_norm": 1.569574497980234, "language_loss": 0.70484215, "learning_rate": 2.73028496487595e-06, "loss": 0.73010719, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.22363281, "step": 6646, "time_per_iteration": 2.985799551010132 }, { "auxiliary_loss_clip": 0.01474731, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.2890234, "balance_loss_mlp": 1.01502609, "epoch": 0.3996392604839922, "flos": 21364480972800.0, "grad_norm": 2.6458993570767095, "language_loss": 0.72376704, "learning_rate": 2.729922381038513e-06, "loss": 0.74887574, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.2109375, "step": 6647, "time_per_iteration": 2.90041184425354 }, { "auxiliary_loss_clip": 0.01457853, "auxiliary_loss_mlp": 0.01042173, "balance_loss_clip": 1.27904749, "balance_loss_mlp": 1.0205363, "epoch": 0.39969938373666014, "flos": 26043658640640.0, "grad_norm": 1.3933508052905788, "language_loss": 0.74451929, "learning_rate": 2.7295597695223463e-06, "loss": 0.76951957, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.21630859, "step": 6648, "time_per_iteration": 2.8953137397766113 }, { "auxiliary_loss_clip": 0.01463434, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.28053904, "balance_loss_mlp": 1.01507223, "epoch": 0.3997595069893281, "flos": 20124887172480.0, "grad_norm": 2.7597341609715156, "language_loss": 0.67365932, "learning_rate": 2.7291971303412006e-06, "loss": 0.69866872, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.2244873, "step": 6649, "time_per_iteration": 2.9237916469573975 }, { "auxiliary_loss_clip": 0.01486922, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.30365992, "balance_loss_mlp": 1.01634872, "epoch": 0.39981963024199607, "flos": 27795217063680.0, "grad_norm": 1.7225191059592717, "language_loss": 0.76483428, "learning_rate": 2.728834463508826e-06, "loss": 0.79008073, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.21374512, "step": 6650, "time_per_iteration": 2.905372381210327 }, { "auxiliary_loss_clip": 0.01479961, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.29598284, "balance_loss_mlp": 1.02021396, "epoch": 0.39987975349466404, "flos": 21954411665280.0, "grad_norm": 1.632673247646959, "language_loss": 0.72037703, "learning_rate": 2.728471769038975e-06, "loss": 0.74558866, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.21008301, "step": 6651, "time_per_iteration": 2.8894877433776855 }, { "auxiliary_loss_clip": 0.01484936, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.29770029, "balance_loss_mlp": 1.01382232, "epoch": 0.39993987674733206, "flos": 20714184437760.0, "grad_norm": 1.8464898260405884, "language_loss": 0.7432183, "learning_rate": 2.728109046945403e-06, "loss": 0.76841778, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.21191406, "step": 6652, "time_per_iteration": 2.845848798751831 }, { "auxiliary_loss_clip": 0.01242901, "auxiliary_loss_mlp": 0.01018492, "balance_loss_clip": 1.13411665, "balance_loss_mlp": 1.00037241, "epoch": 0.4, "flos": 61553572761600.0, "grad_norm": 0.8554467543259106, "language_loss": 0.60712844, "learning_rate": 2.727746297241862e-06, "loss": 0.62974238, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.18164062, "step": 6653, "time_per_iteration": 3.268691301345825 }, { "auxiliary_loss_clip": 0.01465693, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.28840423, "balance_loss_mlp": 1.01520002, "epoch": 0.400060123252668, "flos": 14510605080960.0, "grad_norm": 2.1058417312825664, "language_loss": 0.67326003, "learning_rate": 2.7273835199421085e-06, "loss": 0.69829297, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.22387695, "step": 6654, "time_per_iteration": 4.27107572555542 }, { "auxiliary_loss_clip": 0.01470751, "auxiliary_loss_mlp": 0.01039282, "balance_loss_clip": 1.2883507, "balance_loss_mlp": 1.01925468, "epoch": 0.40012024650533595, "flos": 19101184151040.0, "grad_norm": 2.1866879259848067, "language_loss": 0.90245938, "learning_rate": 2.7270207150599e-06, "loss": 0.92755973, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.20031738, "step": 6655, "time_per_iteration": 2.8469367027282715 }, { "auxiliary_loss_clip": 0.01459647, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.28270578, "balance_loss_mlp": 1.01356649, "epoch": 0.4001803697580039, "flos": 29362900043520.0, "grad_norm": 1.8283643725139043, "language_loss": 0.73775256, "learning_rate": 2.7266578826089917e-06, "loss": 0.76269084, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.20605469, "step": 6656, "time_per_iteration": 3.0052499771118164 }, { "auxiliary_loss_clip": 0.01476339, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.29233885, "balance_loss_mlp": 1.01592505, "epoch": 0.4002404930106719, "flos": 20929034586240.0, "grad_norm": 1.5939524436443075, "language_loss": 0.74558878, "learning_rate": 2.726295022603144e-06, "loss": 0.77072465, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21325684, "step": 6657, "time_per_iteration": 3.0338175296783447 }, { "auxiliary_loss_clip": 0.01478657, "auxiliary_loss_mlp": 0.01047558, "balance_loss_clip": 1.2950654, "balance_loss_mlp": 1.02569461, "epoch": 0.40030061626333985, "flos": 28418248967040.0, "grad_norm": 1.9719515102529566, "language_loss": 0.79962915, "learning_rate": 2.725932135056117e-06, "loss": 0.82489133, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21875, "step": 6658, "time_per_iteration": 2.9869658946990967 }, { "auxiliary_loss_clip": 0.01490082, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.30514145, "balance_loss_mlp": 1.01517892, "epoch": 0.4003607395160078, "flos": 25932681849600.0, "grad_norm": 1.9022246397826297, "language_loss": 0.78404963, "learning_rate": 2.72556921998167e-06, "loss": 0.80931234, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.21020508, "step": 6659, "time_per_iteration": 2.893192768096924 }, { "auxiliary_loss_clip": 0.01451068, "auxiliary_loss_mlp": 0.01036747, "balance_loss_clip": 1.2757709, "balance_loss_mlp": 1.0172683, "epoch": 0.4004208627686758, "flos": 20776179093120.0, "grad_norm": 1.7262817845156733, "language_loss": 0.73602653, "learning_rate": 2.7252062773935662e-06, "loss": 0.76090467, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.19470215, "step": 6660, "time_per_iteration": 2.802809476852417 }, { "auxiliary_loss_clip": 0.01472411, "auxiliary_loss_mlp": 0.01039863, "balance_loss_clip": 1.29066014, "balance_loss_mlp": 1.01957345, "epoch": 0.40048098602134374, "flos": 24692002174080.0, "grad_norm": 2.1694569915491693, "language_loss": 0.71975219, "learning_rate": 2.7248433073055674e-06, "loss": 0.74487495, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.20275879, "step": 6661, "time_per_iteration": 5.795670986175537 }, { "auxiliary_loss_clip": 0.01480144, "auxiliary_loss_mlp": 0.01042465, "balance_loss_clip": 1.29481411, "balance_loss_mlp": 1.02123356, "epoch": 0.4005411092740117, "flos": 23196177216000.0, "grad_norm": 1.7070350022625265, "language_loss": 0.76172233, "learning_rate": 2.724480309731437e-06, "loss": 0.78694844, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21228027, "step": 6662, "time_per_iteration": 2.8437774181365967 }, { "auxiliary_loss_clip": 0.014788, "auxiliary_loss_mlp": 0.01041321, "balance_loss_clip": 1.29248118, "balance_loss_mlp": 1.01937509, "epoch": 0.4006012325266797, "flos": 17529248160000.0, "grad_norm": 10.065590615343613, "language_loss": 0.67108524, "learning_rate": 2.7241172846849417e-06, "loss": 0.6962865, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.21948242, "step": 6663, "time_per_iteration": 4.187743663787842 }, { "auxiliary_loss_clip": 0.01481331, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.29712796, "balance_loss_mlp": 1.02158356, "epoch": 0.40066135577934764, "flos": 19864674472320.0, "grad_norm": 1.9451263806523116, "language_loss": 0.86974078, "learning_rate": 2.7237542321798455e-06, "loss": 0.89497149, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.20153809, "step": 6664, "time_per_iteration": 2.8048341274261475 }, { "auxiliary_loss_clip": 0.01462833, "auxiliary_loss_mlp": 0.01042005, "balance_loss_clip": 1.27993488, "balance_loss_mlp": 1.02086926, "epoch": 0.40072147903201566, "flos": 18159021538560.0, "grad_norm": 2.2835635234663014, "language_loss": 0.84999514, "learning_rate": 2.723391152229917e-06, "loss": 0.87504351, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.21142578, "step": 6665, "time_per_iteration": 2.8077855110168457 }, { "auxiliary_loss_clip": 0.01474227, "auxiliary_loss_mlp": 0.01042582, "balance_loss_clip": 1.29024017, "balance_loss_mlp": 1.02152967, "epoch": 0.4007816022846836, "flos": 18670533713280.0, "grad_norm": 1.5993297310547436, "language_loss": 0.79036885, "learning_rate": 2.7230280448489236e-06, "loss": 0.81553692, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21057129, "step": 6666, "time_per_iteration": 2.8053057193756104 }, { "auxiliary_loss_clip": 0.01483709, "auxiliary_loss_mlp": 0.01043593, "balance_loss_clip": 1.30019951, "balance_loss_mlp": 1.02288675, "epoch": 0.4008417255373516, "flos": 25714302606720.0, "grad_norm": 2.129987297038019, "language_loss": 0.74658793, "learning_rate": 2.7226649100506333e-06, "loss": 0.77186096, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.20690918, "step": 6667, "time_per_iteration": 2.9194271564483643 }, { "auxiliary_loss_clip": 0.01484188, "auxiliary_loss_mlp": 0.01042442, "balance_loss_clip": 1.29836512, "balance_loss_mlp": 1.02007842, "epoch": 0.40090184879001955, "flos": 22869354890880.0, "grad_norm": 1.4785743934221591, "language_loss": 0.75959212, "learning_rate": 2.7223017478488183e-06, "loss": 0.78485847, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22363281, "step": 6668, "time_per_iteration": 2.92448091506958 }, { "auxiliary_loss_clip": 0.01473914, "auxiliary_loss_mlp": 0.01045146, "balance_loss_clip": 1.29526401, "balance_loss_mlp": 1.0250597, "epoch": 0.4009619720426875, "flos": 29071848372480.0, "grad_norm": 1.9384936607106849, "language_loss": 0.83097982, "learning_rate": 2.721938558257248e-06, "loss": 0.85617042, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.20092773, "step": 6669, "time_per_iteration": 2.9238102436065674 }, { "auxiliary_loss_clip": 0.01232523, "auxiliary_loss_mlp": 0.01057885, "balance_loss_clip": 1.12750137, "balance_loss_mlp": 1.03709543, "epoch": 0.4010220952953555, "flos": 66091703829120.0, "grad_norm": 0.7126103941881856, "language_loss": 0.53425461, "learning_rate": 2.721575341289695e-06, "loss": 0.55715871, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.20800781, "step": 6670, "time_per_iteration": 3.5269203186035156 }, { "auxiliary_loss_clip": 0.01460311, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.28039575, "balance_loss_mlp": 1.01517344, "epoch": 0.40108221854802345, "flos": 29654766120960.0, "grad_norm": 2.464883817648517, "language_loss": 0.8889159, "learning_rate": 2.7212120969599333e-06, "loss": 0.91387409, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.20361328, "step": 6671, "time_per_iteration": 2.9420790672302246 }, { "auxiliary_loss_clip": 0.01479199, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.29518342, "balance_loss_mlp": 1.01919854, "epoch": 0.4011423418006914, "flos": 19936894452480.0, "grad_norm": 1.8761896874970936, "language_loss": 0.79331154, "learning_rate": 2.720848825281736e-06, "loss": 0.81851, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21435547, "step": 6672, "time_per_iteration": 2.798285722732544 }, { "auxiliary_loss_clip": 0.01474744, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.29322028, "balance_loss_mlp": 1.01827383, "epoch": 0.4012024650533594, "flos": 20093957712000.0, "grad_norm": 2.6273031015903894, "language_loss": 0.64092982, "learning_rate": 2.72048552626888e-06, "loss": 0.66606605, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.20617676, "step": 6673, "time_per_iteration": 2.8736982345581055 }, { "auxiliary_loss_clip": 0.01477865, "auxiliary_loss_mlp": 0.01043172, "balance_loss_clip": 1.29435158, "balance_loss_mlp": 1.02202439, "epoch": 0.40126258830602735, "flos": 21707365201920.0, "grad_norm": 1.4972153984140504, "language_loss": 0.80677742, "learning_rate": 2.7201221999351402e-06, "loss": 0.8319878, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21154785, "step": 6674, "time_per_iteration": 3.0009572505950928 }, { "auxiliary_loss_clip": 0.01500717, "auxiliary_loss_mlp": 0.01044141, "balance_loss_clip": 1.31118917, "balance_loss_mlp": 1.0232439, "epoch": 0.4013227115586953, "flos": 12027797896320.0, "grad_norm": 2.7085134546218037, "language_loss": 0.82937866, "learning_rate": 2.719758846294294e-06, "loss": 0.85482728, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.20898438, "step": 6675, "time_per_iteration": 2.880319356918335 }, { "auxiliary_loss_clip": 0.01489832, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.30657399, "balance_loss_mlp": 1.02665806, "epoch": 0.4013828348113633, "flos": 25458523896960.0, "grad_norm": 1.8761928932883938, "language_loss": 0.94483048, "learning_rate": 2.71939546536012e-06, "loss": 0.97020072, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.20532227, "step": 6676, "time_per_iteration": 2.866199493408203 }, { "auxiliary_loss_clip": 0.01503532, "auxiliary_loss_mlp": 0.01042733, "balance_loss_clip": 1.31468689, "balance_loss_mlp": 1.02091765, "epoch": 0.40144295806403124, "flos": 18590803096320.0, "grad_norm": 2.5962251281424855, "language_loss": 0.8073945, "learning_rate": 2.719032057146399e-06, "loss": 0.83285713, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.21801758, "step": 6677, "time_per_iteration": 2.8110363483428955 }, { "auxiliary_loss_clip": 0.01489705, "auxiliary_loss_mlp": 0.01046134, "balance_loss_clip": 1.30779505, "balance_loss_mlp": 1.02608323, "epoch": 0.4015030813166992, "flos": 22940624730240.0, "grad_norm": 2.4053656652233273, "language_loss": 0.83951962, "learning_rate": 2.71866862166691e-06, "loss": 0.86487806, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.20043945, "step": 6678, "time_per_iteration": 2.8696792125701904 }, { "auxiliary_loss_clip": 0.01482476, "auxiliary_loss_mlp": 0.01044203, "balance_loss_clip": 1.30265594, "balance_loss_mlp": 1.02481937, "epoch": 0.4015632045693672, "flos": 20604836459520.0, "grad_norm": 2.1416901898409537, "language_loss": 0.64882517, "learning_rate": 2.718305158935434e-06, "loss": 0.67409194, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.19384766, "step": 6679, "time_per_iteration": 2.80134654045105 }, { "auxiliary_loss_clip": 0.01487941, "auxiliary_loss_mlp": 0.01042258, "balance_loss_clip": 1.30878448, "balance_loss_mlp": 1.02217078, "epoch": 0.4016233278220352, "flos": 23448924524160.0, "grad_norm": 1.8991285917949285, "language_loss": 0.79431272, "learning_rate": 2.7179416689657554e-06, "loss": 0.81961471, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.20080566, "step": 6680, "time_per_iteration": 2.865013599395752 }, { "auxiliary_loss_clip": 0.01510541, "auxiliary_loss_mlp": 0.01050589, "balance_loss_clip": 1.32144356, "balance_loss_mlp": 1.03040683, "epoch": 0.40168345107470316, "flos": 21440230047360.0, "grad_norm": 1.607129267713683, "language_loss": 0.76416707, "learning_rate": 2.7175781517716556e-06, "loss": 0.78977841, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.20178223, "step": 6681, "time_per_iteration": 2.8391401767730713 }, { "auxiliary_loss_clip": 0.01520511, "auxiliary_loss_mlp": 0.01044141, "balance_loss_clip": 1.33288074, "balance_loss_mlp": 1.02426922, "epoch": 0.4017435743273711, "flos": 22867590343680.0, "grad_norm": 1.8243811582227518, "language_loss": 0.65037274, "learning_rate": 2.7172146073669213e-06, "loss": 0.67601931, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.1986084, "step": 6682, "time_per_iteration": 2.8541107177734375 }, { "auxiliary_loss_clip": 0.01491764, "auxiliary_loss_mlp": 0.01041927, "balance_loss_clip": 1.30697632, "balance_loss_mlp": 1.02259123, "epoch": 0.4018036975800391, "flos": 28634275480320.0, "grad_norm": 1.7882211876797014, "language_loss": 0.74197483, "learning_rate": 2.716851035765337e-06, "loss": 0.76731169, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.19311523, "step": 6683, "time_per_iteration": 2.883040189743042 }, { "auxiliary_loss_clip": 0.01493136, "auxiliary_loss_mlp": 0.01049566, "balance_loss_clip": 1.30928719, "balance_loss_mlp": 1.02889478, "epoch": 0.40186382083270705, "flos": 26662437532800.0, "grad_norm": 1.7217613311229025, "language_loss": 0.74187088, "learning_rate": 2.7164874369806896e-06, "loss": 0.76729786, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.20678711, "step": 6684, "time_per_iteration": 2.8782949447631836 }, { "auxiliary_loss_clip": 0.01263456, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.15838552, "balance_loss_mlp": 1.01312065, "epoch": 0.401923944085375, "flos": 59286656355840.0, "grad_norm": 0.8122807106893122, "language_loss": 0.60465515, "learning_rate": 2.716123811026767e-06, "loss": 0.62760592, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.18457031, "step": 6685, "time_per_iteration": 3.4571034908294678 }, { "auxiliary_loss_clip": 0.0152232, "auxiliary_loss_mlp": 0.01040787, "balance_loss_clip": 1.33183217, "balance_loss_mlp": 1.02030659, "epoch": 0.401984067338043, "flos": 16991602473600.0, "grad_norm": 1.7309799682477942, "language_loss": 0.70801795, "learning_rate": 2.715760157917357e-06, "loss": 0.73364902, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.20495605, "step": 6686, "time_per_iteration": 2.996462106704712 }, { "auxiliary_loss_clip": 0.01504838, "auxiliary_loss_mlp": 0.01040069, "balance_loss_clip": 1.32230091, "balance_loss_mlp": 1.01998186, "epoch": 0.40204419059071095, "flos": 24983189579520.0, "grad_norm": 1.9347437369316967, "language_loss": 0.7523343, "learning_rate": 2.7153964776662504e-06, "loss": 0.77778333, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.20092773, "step": 6687, "time_per_iteration": 2.928790807723999 }, { "auxiliary_loss_clip": 0.01514428, "auxiliary_loss_mlp": 0.01038332, "balance_loss_clip": 1.32829285, "balance_loss_mlp": 1.01776826, "epoch": 0.4021043138433789, "flos": 23487998048640.0, "grad_norm": 2.2130038946116937, "language_loss": 0.72032773, "learning_rate": 2.7150327702872385e-06, "loss": 0.74585533, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.20544434, "step": 6688, "time_per_iteration": 2.874418020248413 }, { "auxiliary_loss_clip": 0.0152194, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.3314085, "balance_loss_mlp": 1.01984024, "epoch": 0.4021644370960469, "flos": 26007209314560.0, "grad_norm": 1.9118396420392236, "language_loss": 0.65859687, "learning_rate": 2.7146690357941112e-06, "loss": 0.6842221, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.20739746, "step": 6689, "time_per_iteration": 4.382193565368652 }, { "auxiliary_loss_clip": 0.01507731, "auxiliary_loss_mlp": 0.01035366, "balance_loss_clip": 1.31916797, "balance_loss_mlp": 1.01488554, "epoch": 0.40222456034871484, "flos": 13595480876160.0, "grad_norm": 2.3653991643020778, "language_loss": 0.74260378, "learning_rate": 2.7143052742006632e-06, "loss": 0.76803476, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.20471191, "step": 6690, "time_per_iteration": 2.806926965713501 }, { "auxiliary_loss_clip": 0.01497973, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.3151238, "balance_loss_mlp": 1.01498616, "epoch": 0.4022846836013828, "flos": 24288209164800.0, "grad_norm": 2.0921031246078425, "language_loss": 0.75374281, "learning_rate": 2.7139414855206872e-06, "loss": 0.77907294, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.20056152, "step": 6691, "time_per_iteration": 2.8615708351135254 }, { "auxiliary_loss_clip": 0.01528521, "auxiliary_loss_mlp": 0.01041171, "balance_loss_clip": 1.34095466, "balance_loss_mlp": 1.02059531, "epoch": 0.40234480685405083, "flos": 20159798175360.0, "grad_norm": 1.9068642676576706, "language_loss": 0.73176038, "learning_rate": 2.7135776697679785e-06, "loss": 0.75745726, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.20568848, "step": 6692, "time_per_iteration": 2.874814748764038 }, { "auxiliary_loss_clip": 0.01496864, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.31299973, "balance_loss_mlp": 1.01712012, "epoch": 0.4024049301067188, "flos": 22940624730240.0, "grad_norm": 1.7046168766509167, "language_loss": 0.84624511, "learning_rate": 2.7132138269563333e-06, "loss": 0.87158537, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.20031738, "step": 6693, "time_per_iteration": 2.9002187252044678 }, { "auxiliary_loss_clip": 0.0151063, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.32593131, "balance_loss_mlp": 1.01687765, "epoch": 0.40246505335938676, "flos": 36042175676160.0, "grad_norm": 1.9663378410614787, "language_loss": 0.72012538, "learning_rate": 2.7128499570995483e-06, "loss": 0.74560821, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.20788574, "step": 6694, "time_per_iteration": 2.986011505126953 }, { "auxiliary_loss_clip": 0.01502912, "auxiliary_loss_mlp": 0.01038564, "balance_loss_clip": 1.31861818, "balance_loss_mlp": 1.01662946, "epoch": 0.4025251766120547, "flos": 20603886318720.0, "grad_norm": 2.3877834560454163, "language_loss": 0.68744278, "learning_rate": 2.7124860602114212e-06, "loss": 0.71285748, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.21948242, "step": 6695, "time_per_iteration": 2.924715757369995 }, { "auxiliary_loss_clip": 0.01507525, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.3247149, "balance_loss_mlp": 1.01660776, "epoch": 0.4025852998647227, "flos": 64545114205440.0, "grad_norm": 1.94391292170846, "language_loss": 0.80385906, "learning_rate": 2.7121221363057515e-06, "loss": 0.82929301, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.19262695, "step": 6696, "time_per_iteration": 6.218676567077637 }, { "auxiliary_loss_clip": 0.0151408, "auxiliary_loss_mlp": 0.01042582, "balance_loss_clip": 1.32894182, "balance_loss_mlp": 1.02147031, "epoch": 0.40264542311739066, "flos": 20895978620160.0, "grad_norm": 1.6927536094756572, "language_loss": 0.71477532, "learning_rate": 2.7117581853963393e-06, "loss": 0.7403419, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.21105957, "step": 6697, "time_per_iteration": 2.9198110103607178 }, { "auxiliary_loss_clip": 0.01486751, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.30635405, "balance_loss_mlp": 1.01899695, "epoch": 0.4027055463700586, "flos": 26261404456320.0, "grad_norm": 2.264080722303738, "language_loss": 0.62005782, "learning_rate": 2.711394207496984e-06, "loss": 0.64531457, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.19921875, "step": 6698, "time_per_iteration": 4.397491693496704 }, { "auxiliary_loss_clip": 0.01496079, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.31332994, "balance_loss_mlp": 1.01153207, "epoch": 0.4027656696227266, "flos": 20641150051200.0, "grad_norm": 2.117889960337718, "language_loss": 0.78013569, "learning_rate": 2.711030202621491e-06, "loss": 0.80541682, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.20495605, "step": 6699, "time_per_iteration": 2.857994318008423 }, { "auxiliary_loss_clip": 0.01479958, "auxiliary_loss_mlp": 0.01033011, "balance_loss_clip": 1.30006766, "balance_loss_mlp": 1.01304305, "epoch": 0.40282579287539455, "flos": 22356485372160.0, "grad_norm": 1.733222213363098, "language_loss": 0.81216168, "learning_rate": 2.7106661707836605e-06, "loss": 0.83729136, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.19958496, "step": 6700, "time_per_iteration": 2.90022349357605 }, { "auxiliary_loss_clip": 0.01505971, "auxiliary_loss_mlp": 0.01040936, "balance_loss_clip": 1.31798613, "balance_loss_mlp": 1.01802373, "epoch": 0.4028859161280625, "flos": 29286065093760.0, "grad_norm": 2.0265453675938527, "language_loss": 0.75361019, "learning_rate": 2.7103021119972977e-06, "loss": 0.77907926, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.22924805, "step": 6701, "time_per_iteration": 2.9520859718322754 }, { "auxiliary_loss_clip": 0.01488979, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.30735683, "balance_loss_mlp": 1.01139641, "epoch": 0.4029460393807305, "flos": 28634275480320.0, "grad_norm": 1.7118110110637654, "language_loss": 0.66908085, "learning_rate": 2.709938026276208e-06, "loss": 0.69428521, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.20080566, "step": 6702, "time_per_iteration": 3.04244065284729 }, { "auxiliary_loss_clip": 0.01504075, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.31788278, "balance_loss_mlp": 1.01693344, "epoch": 0.40300616263339845, "flos": 22612264081920.0, "grad_norm": 1.643798663481284, "language_loss": 0.66851985, "learning_rate": 2.7095739136341964e-06, "loss": 0.69393861, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.20874023, "step": 6703, "time_per_iteration": 2.9070193767547607 }, { "auxiliary_loss_clip": 0.01499332, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.31293154, "balance_loss_mlp": 1.01336074, "epoch": 0.4030662858860664, "flos": 25531286814720.0, "grad_norm": 1.868803690830443, "language_loss": 0.82643807, "learning_rate": 2.709209774085071e-06, "loss": 0.8517592, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.19421387, "step": 6704, "time_per_iteration": 2.9339048862457275 }, { "auxiliary_loss_clip": 0.01505079, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.3184793, "balance_loss_mlp": 1.01316261, "epoch": 0.40312640913873443, "flos": 23597210292480.0, "grad_norm": 1.552998315858976, "language_loss": 0.73951441, "learning_rate": 2.7088456076426407e-06, "loss": 0.76490927, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.21228027, "step": 6705, "time_per_iteration": 2.897552728652954 }, { "auxiliary_loss_clip": 0.01477393, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.29708087, "balance_loss_mlp": 1.01236749, "epoch": 0.4031865323914024, "flos": 20020606611840.0, "grad_norm": 2.0357857555587096, "language_loss": 0.67760682, "learning_rate": 2.708481414320713e-06, "loss": 0.70270264, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.19836426, "step": 6706, "time_per_iteration": 2.82255482673645 }, { "auxiliary_loss_clip": 0.01490063, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.30751455, "balance_loss_mlp": 1.01494396, "epoch": 0.40324665564407036, "flos": 21881377278720.0, "grad_norm": 1.3861484092173149, "language_loss": 0.72182566, "learning_rate": 2.7081171941330992e-06, "loss": 0.74708462, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.2088623, "step": 6707, "time_per_iteration": 2.8816840648651123 }, { "auxiliary_loss_clip": 0.01459056, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.28193796, "balance_loss_mlp": 1.01210189, "epoch": 0.4033067788967383, "flos": 23889347838720.0, "grad_norm": 2.171447909053771, "language_loss": 0.80779952, "learning_rate": 2.707752947093611e-06, "loss": 0.83271205, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.2010498, "step": 6708, "time_per_iteration": 2.9098479747772217 }, { "auxiliary_loss_clip": 0.01509472, "auxiliary_loss_mlp": 0.01040059, "balance_loss_clip": 1.31946659, "balance_loss_mlp": 1.01944816, "epoch": 0.4033669021494063, "flos": 17428632428160.0, "grad_norm": 2.0399004864085324, "language_loss": 0.8442266, "learning_rate": 2.70738867321606e-06, "loss": 0.86972189, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.20617676, "step": 6709, "time_per_iteration": 2.839867115020752 }, { "auxiliary_loss_clip": 0.01505262, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.31799674, "balance_loss_mlp": 1.01659632, "epoch": 0.40342702540207426, "flos": 29610987137280.0, "grad_norm": 3.428393547213459, "language_loss": 0.72043562, "learning_rate": 2.70702437251426e-06, "loss": 0.74586689, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.21264648, "step": 6710, "time_per_iteration": 2.909968137741089 }, { "auxiliary_loss_clip": 0.0147812, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.29633093, "balance_loss_mlp": 1.01934457, "epoch": 0.4034871486547422, "flos": 11290260107520.0, "grad_norm": 2.386636094677762, "language_loss": 0.85584092, "learning_rate": 2.7066600450020236e-06, "loss": 0.8810252, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20959473, "step": 6711, "time_per_iteration": 2.8466875553131104 }, { "auxiliary_loss_clip": 0.01492097, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.30768001, "balance_loss_mlp": 1.01829147, "epoch": 0.4035472719074102, "flos": 15559446228480.0, "grad_norm": 2.279021120891494, "language_loss": 0.77628732, "learning_rate": 2.706295690693168e-06, "loss": 0.8015883, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.19726562, "step": 6712, "time_per_iteration": 2.837409019470215 }, { "auxiliary_loss_clip": 0.01496185, "auxiliary_loss_mlp": 0.01038514, "balance_loss_clip": 1.31230211, "balance_loss_mlp": 1.01756883, "epoch": 0.40360739516007815, "flos": 24683360417280.0, "grad_norm": 2.309401599293232, "language_loss": 0.8030926, "learning_rate": 2.7059313096015096e-06, "loss": 0.82843959, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.20947266, "step": 6713, "time_per_iteration": 2.8454666137695312 }, { "auxiliary_loss_clip": 0.01491532, "auxiliary_loss_mlp": 0.01039182, "balance_loss_clip": 1.30470204, "balance_loss_mlp": 1.01874912, "epoch": 0.4036675184127461, "flos": 17311140385920.0, "grad_norm": 2.021421285954685, "language_loss": 0.88883716, "learning_rate": 2.705566901740865e-06, "loss": 0.91414428, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.2043457, "step": 6714, "time_per_iteration": 2.870880126953125 }, { "auxiliary_loss_clip": 0.01494954, "auxiliary_loss_mlp": 0.01043557, "balance_loss_clip": 1.31054211, "balance_loss_mlp": 1.02249312, "epoch": 0.4037276416654141, "flos": 19873270984320.0, "grad_norm": 1.6773431017631308, "language_loss": 0.69963062, "learning_rate": 2.7052024671250527e-06, "loss": 0.7250157, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.21069336, "step": 6715, "time_per_iteration": 2.831733465194702 }, { "auxiliary_loss_clip": 0.01512253, "auxiliary_loss_mlp": 0.01044162, "balance_loss_clip": 1.32259965, "balance_loss_mlp": 1.02383637, "epoch": 0.40378776491808205, "flos": 18305407025280.0, "grad_norm": 2.094132714670534, "language_loss": 0.78566146, "learning_rate": 2.704838005767892e-06, "loss": 0.81122559, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.20336914, "step": 6716, "time_per_iteration": 2.827209949493408 }, { "auxiliary_loss_clip": 0.01471514, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.29162538, "balance_loss_mlp": 1.01489043, "epoch": 0.40384788817075, "flos": 15057752175360.0, "grad_norm": 4.567862922241594, "language_loss": 0.77239263, "learning_rate": 2.7044735176832037e-06, "loss": 0.7974695, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21289062, "step": 6717, "time_per_iteration": 2.8132240772247314 }, { "auxiliary_loss_clip": 0.01246347, "auxiliary_loss_mlp": 0.01011477, "balance_loss_clip": 1.13859808, "balance_loss_mlp": 0.99669474, "epoch": 0.40390801142341803, "flos": 61958994583680.0, "grad_norm": 0.9271795436925444, "language_loss": 0.60867655, "learning_rate": 2.7041090028848084e-06, "loss": 0.63125479, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.14746094, "step": 6718, "time_per_iteration": 3.2303478717803955 }, { "auxiliary_loss_clip": 0.01498769, "auxiliary_loss_mlp": 0.01042162, "balance_loss_clip": 1.30922151, "balance_loss_mlp": 1.02034688, "epoch": 0.403968134676086, "flos": 22746795431040.0, "grad_norm": 9.076436700008227, "language_loss": 0.75332224, "learning_rate": 2.7037444613865306e-06, "loss": 0.77873158, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.21826172, "step": 6719, "time_per_iteration": 2.8524045944213867 }, { "auxiliary_loss_clip": 0.01483662, "auxiliary_loss_mlp": 0.01046369, "balance_loss_clip": 1.29939008, "balance_loss_mlp": 1.02509046, "epoch": 0.40402825792875396, "flos": 19791911554560.0, "grad_norm": 2.0482028573759616, "language_loss": 0.82989693, "learning_rate": 2.7033798932021906e-06, "loss": 0.85519731, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.21276855, "step": 6720, "time_per_iteration": 2.8408243656158447 }, { "auxiliary_loss_clip": 0.0149436, "auxiliary_loss_mlp": 0.010379, "balance_loss_clip": 1.30759621, "balance_loss_mlp": 1.01782513, "epoch": 0.40408838118142193, "flos": 19618623394560.0, "grad_norm": 1.7663266169740917, "language_loss": 0.77478904, "learning_rate": 2.7030152983456153e-06, "loss": 0.80011159, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.20080566, "step": 6721, "time_per_iteration": 2.846968412399292 }, { "auxiliary_loss_clip": 0.01480271, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.2986455, "balance_loss_mlp": 1.01797557, "epoch": 0.4041485044340899, "flos": 24436811646720.0, "grad_norm": 1.6093296873367648, "language_loss": 0.7367419, "learning_rate": 2.7026506768306304e-06, "loss": 0.76191962, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.19519043, "step": 6722, "time_per_iteration": 2.8775830268859863 }, { "auxiliary_loss_clip": 0.01478763, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.29600883, "balance_loss_mlp": 1.01794934, "epoch": 0.40420862768675786, "flos": 16767974833920.0, "grad_norm": 1.6462765185109878, "language_loss": 0.66342562, "learning_rate": 2.7022860286710602e-06, "loss": 0.68859172, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.19897461, "step": 6723, "time_per_iteration": 2.8542542457580566 }, { "auxiliary_loss_clip": 0.01501765, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.31349874, "balance_loss_mlp": 1.02367759, "epoch": 0.4042687509394258, "flos": 22501649249280.0, "grad_norm": 1.655902329675024, "language_loss": 0.74067044, "learning_rate": 2.701921353880734e-06, "loss": 0.76613683, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.21203613, "step": 6724, "time_per_iteration": 4.310163974761963 }, { "auxiliary_loss_clip": 0.01464737, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.28632331, "balance_loss_mlp": 1.01263213, "epoch": 0.4043288741920938, "flos": 30348298702080.0, "grad_norm": 1.786290430657111, "language_loss": 0.75983286, "learning_rate": 2.7015566524734787e-06, "loss": 0.7848109, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.20446777, "step": 6725, "time_per_iteration": 2.880164384841919 }, { "auxiliary_loss_clip": 0.01481448, "auxiliary_loss_mlp": 0.01036134, "balance_loss_clip": 1.29882216, "balance_loss_mlp": 1.01461685, "epoch": 0.40438899744476176, "flos": 46363307270400.0, "grad_norm": 1.6466407225544473, "language_loss": 0.772246, "learning_rate": 2.701191924463126e-06, "loss": 0.79742181, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.21508789, "step": 6726, "time_per_iteration": 3.0462794303894043 }, { "auxiliary_loss_clip": 0.01489664, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.30309725, "balance_loss_mlp": 1.01607907, "epoch": 0.4044491206974297, "flos": 13341195244800.0, "grad_norm": 2.22729347487987, "language_loss": 0.82476091, "learning_rate": 2.7008271698635054e-06, "loss": 0.850025, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.20654297, "step": 6727, "time_per_iteration": 2.8585100173950195 }, { "auxiliary_loss_clip": 0.0149793, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.31081116, "balance_loss_mlp": 1.0139935, "epoch": 0.4045092439500977, "flos": 12101284730880.0, "grad_norm": 2.312974936843572, "language_loss": 0.86404842, "learning_rate": 2.700462388688447e-06, "loss": 0.88938123, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.21350098, "step": 6728, "time_per_iteration": 3.0446431636810303 }, { "auxiliary_loss_clip": 0.01485786, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.30358601, "balance_loss_mlp": 1.01609945, "epoch": 0.40456936720276565, "flos": 21189835468800.0, "grad_norm": 1.7206042194596154, "language_loss": 0.82164228, "learning_rate": 2.700097580951786e-06, "loss": 0.84686285, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.20178223, "step": 6729, "time_per_iteration": 2.896275281906128 }, { "auxiliary_loss_clip": 0.01497027, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.31238794, "balance_loss_mlp": 1.02093816, "epoch": 0.4046294904554336, "flos": 23925932899200.0, "grad_norm": 6.016488673940383, "language_loss": 0.74610668, "learning_rate": 2.6997327466673533e-06, "loss": 0.77148873, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.20251465, "step": 6730, "time_per_iteration": 2.8608593940734863 }, { "auxiliary_loss_clip": 0.01482433, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.29969084, "balance_loss_mlp": 1.01652992, "epoch": 0.4046896137081016, "flos": 38085419197440.0, "grad_norm": 1.8244550718450188, "language_loss": 0.68340588, "learning_rate": 2.699367885848985e-06, "loss": 0.70860112, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.20556641, "step": 6731, "time_per_iteration": 5.786974191665649 }, { "auxiliary_loss_clip": 0.01481109, "auxiliary_loss_mlp": 0.01039632, "balance_loss_clip": 1.29589748, "balance_loss_mlp": 1.01884174, "epoch": 0.4047497369607696, "flos": 23626239471360.0, "grad_norm": 1.4580730815884706, "language_loss": 0.74910045, "learning_rate": 2.699002998510517e-06, "loss": 0.77430785, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.20788574, "step": 6732, "time_per_iteration": 2.8549928665161133 }, { "auxiliary_loss_clip": 0.01480576, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.29789424, "balance_loss_mlp": 1.01582932, "epoch": 0.40480986021343757, "flos": 12831357127680.0, "grad_norm": 1.752196811025774, "language_loss": 0.77904254, "learning_rate": 2.6986380846657852e-06, "loss": 0.80420715, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.20031738, "step": 6733, "time_per_iteration": 4.261539459228516 }, { "auxiliary_loss_clip": 0.01510631, "auxiliary_loss_mlp": 0.01046318, "balance_loss_clip": 1.32017875, "balance_loss_mlp": 1.02415729, "epoch": 0.40486998346610553, "flos": 23779230698880.0, "grad_norm": 1.741353442442142, "language_loss": 0.77588373, "learning_rate": 2.698273144328627e-06, "loss": 0.80145323, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.22167969, "step": 6734, "time_per_iteration": 2.8466339111328125 }, { "auxiliary_loss_clip": 0.01502551, "auxiliary_loss_mlp": 0.01044433, "balance_loss_clip": 1.31121838, "balance_loss_mlp": 1.02315462, "epoch": 0.4049301067187735, "flos": 22867092650880.0, "grad_norm": 2.2067013384939878, "language_loss": 0.65904164, "learning_rate": 2.6979081775128805e-06, "loss": 0.68451142, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.21276855, "step": 6735, "time_per_iteration": 2.8863890171051025 }, { "auxiliary_loss_clip": 0.01481395, "auxiliary_loss_mlp": 0.01042955, "balance_loss_clip": 1.29627132, "balance_loss_mlp": 1.02168787, "epoch": 0.40499022997144146, "flos": 22794284488320.0, "grad_norm": 1.6740819987225546, "language_loss": 0.8362478, "learning_rate": 2.697543184232387e-06, "loss": 0.86149132, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21264648, "step": 6736, "time_per_iteration": 2.8403313159942627 }, { "auxiliary_loss_clip": 0.01494128, "auxiliary_loss_mlp": 0.01050994, "balance_loss_clip": 1.30760956, "balance_loss_mlp": 1.02901161, "epoch": 0.4050503532241094, "flos": 23049701239680.0, "grad_norm": 1.5676897038642668, "language_loss": 0.75900376, "learning_rate": 2.6971781645009863e-06, "loss": 0.784455, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.21972656, "step": 6737, "time_per_iteration": 2.8292407989501953 }, { "auxiliary_loss_clip": 0.01481615, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.29747462, "balance_loss_mlp": 1.025069, "epoch": 0.4051104764767774, "flos": 16654735802880.0, "grad_norm": 1.903095956796571, "language_loss": 0.72516513, "learning_rate": 2.696813118332519e-06, "loss": 0.75043654, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.20446777, "step": 6738, "time_per_iteration": 2.902050018310547 }, { "auxiliary_loss_clip": 0.01478278, "auxiliary_loss_mlp": 0.01053455, "balance_loss_clip": 1.29489172, "balance_loss_mlp": 1.03225982, "epoch": 0.40517059972944536, "flos": 16366670288640.0, "grad_norm": 2.0180020728183354, "language_loss": 0.76003098, "learning_rate": 2.696448045740828e-06, "loss": 0.7853483, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.21203613, "step": 6739, "time_per_iteration": 2.815553903579712 }, { "auxiliary_loss_clip": 0.01481042, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 1.29598069, "balance_loss_mlp": 1.0223608, "epoch": 0.4052307229821133, "flos": 28815029032320.0, "grad_norm": 2.24615667945766, "language_loss": 0.75428122, "learning_rate": 2.6960829467397576e-06, "loss": 0.77954066, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.22521973, "step": 6740, "time_per_iteration": 2.866610050201416 }, { "auxiliary_loss_clip": 0.0147099, "auxiliary_loss_mlp": 0.01043652, "balance_loss_clip": 1.29019511, "balance_loss_mlp": 1.02249265, "epoch": 0.4052908462347813, "flos": 21407897998080.0, "grad_norm": 1.5410561086780905, "language_loss": 0.77613658, "learning_rate": 2.695717821343153e-06, "loss": 0.801283, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21166992, "step": 6741, "time_per_iteration": 2.854022264480591 }, { "auxiliary_loss_clip": 0.01479877, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.29510045, "balance_loss_mlp": 1.02776742, "epoch": 0.40535096948744925, "flos": 22429248289920.0, "grad_norm": 1.9634866035373009, "language_loss": 0.72352469, "learning_rate": 2.6953526695648577e-06, "loss": 0.74883556, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.23449707, "step": 6742, "time_per_iteration": 2.812648057937622 }, { "auxiliary_loss_clip": 0.01491324, "auxiliary_loss_mlp": 0.01039286, "balance_loss_clip": 1.30450833, "balance_loss_mlp": 1.01774454, "epoch": 0.4054110927401172, "flos": 17017374026880.0, "grad_norm": 2.230126803084672, "language_loss": 0.73575515, "learning_rate": 2.6949874914187202e-06, "loss": 0.76106119, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.21533203, "step": 6743, "time_per_iteration": 2.8082680702209473 }, { "auxiliary_loss_clip": 0.01500983, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.31134868, "balance_loss_mlp": 1.01924729, "epoch": 0.4054712159927852, "flos": 21624422204160.0, "grad_norm": 2.3099385217168282, "language_loss": 0.72120035, "learning_rate": 2.694622286918588e-06, "loss": 0.74662155, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.21899414, "step": 6744, "time_per_iteration": 2.820746660232544 }, { "auxiliary_loss_clip": 0.01483545, "auxiliary_loss_mlp": 0.01042007, "balance_loss_clip": 1.30034256, "balance_loss_mlp": 1.02088284, "epoch": 0.4055313392454532, "flos": 25823695829760.0, "grad_norm": 1.567338843173594, "language_loss": 0.80565536, "learning_rate": 2.6942570560783076e-06, "loss": 0.83091086, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.21130371, "step": 6745, "time_per_iteration": 2.882887125015259 }, { "auxiliary_loss_clip": 0.01480782, "auxiliary_loss_mlp": 0.01041642, "balance_loss_clip": 1.29850984, "balance_loss_mlp": 1.01851559, "epoch": 0.40559146249812117, "flos": 14145206924160.0, "grad_norm": 2.3927765922330893, "language_loss": 0.67554724, "learning_rate": 2.693891798911731e-06, "loss": 0.70077145, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.23144531, "step": 6746, "time_per_iteration": 2.7916104793548584 }, { "auxiliary_loss_clip": 0.01489812, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.30465984, "balance_loss_mlp": 1.01533592, "epoch": 0.40565158575078913, "flos": 41370654493440.0, "grad_norm": 3.039729581353956, "language_loss": 0.57424986, "learning_rate": 2.6935265154327075e-06, "loss": 0.599509, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.2076416, "step": 6747, "time_per_iteration": 2.9790310859680176 }, { "auxiliary_loss_clip": 0.01489452, "auxiliary_loss_mlp": 0.01042688, "balance_loss_clip": 1.30388331, "balance_loss_mlp": 1.02082467, "epoch": 0.4057117090034571, "flos": 28555223535360.0, "grad_norm": 1.7838519785105682, "language_loss": 0.8507942, "learning_rate": 2.693161205655089e-06, "loss": 0.87611556, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21850586, "step": 6748, "time_per_iteration": 2.9651427268981934 }, { "auxiliary_loss_clip": 0.01493812, "auxiliary_loss_mlp": 0.01038778, "balance_loss_clip": 1.30680954, "balance_loss_mlp": 1.01704657, "epoch": 0.40577183225612506, "flos": 18013043255040.0, "grad_norm": 1.7721534484483183, "language_loss": 0.82416886, "learning_rate": 2.6927958695927287e-06, "loss": 0.84949481, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.2175293, "step": 6749, "time_per_iteration": 2.818758010864258 }, { "auxiliary_loss_clip": 0.01481935, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.29864001, "balance_loss_mlp": 1.01953709, "epoch": 0.40583195550879303, "flos": 19546177190400.0, "grad_norm": 1.7075615182241006, "language_loss": 0.76137376, "learning_rate": 2.6924305072594784e-06, "loss": 0.78660458, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21606445, "step": 6750, "time_per_iteration": 2.901683807373047 }, { "auxiliary_loss_clip": 0.01507307, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.31444967, "balance_loss_mlp": 1.01656342, "epoch": 0.405892078761461, "flos": 22319583598080.0, "grad_norm": 2.8634228434763105, "language_loss": 0.74315983, "learning_rate": 2.692065118669195e-06, "loss": 0.76861674, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 1.92773438, "router_z_loss_mlp": 0.21801758, "step": 6751, "time_per_iteration": 2.8803458213806152 }, { "auxiliary_loss_clip": 0.01500197, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.31293082, "balance_loss_mlp": 1.02320647, "epoch": 0.40595220201412896, "flos": 25495516160640.0, "grad_norm": 3.509388972995175, "language_loss": 0.68113577, "learning_rate": 2.6916997038357326e-06, "loss": 0.70659304, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.2232666, "step": 6752, "time_per_iteration": 2.8837597370147705 }, { "auxiliary_loss_clip": 0.01504579, "auxiliary_loss_mlp": 0.01050287, "balance_loss_clip": 1.31399071, "balance_loss_mlp": 1.02599192, "epoch": 0.4060123252667969, "flos": 49873210836480.0, "grad_norm": 1.9053288334291985, "language_loss": 0.71629131, "learning_rate": 2.691334262772948e-06, "loss": 0.74184, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 1.90429688, "router_z_loss_mlp": 0.24267578, "step": 6753, "time_per_iteration": 3.0923516750335693 }, { "auxiliary_loss_clip": 0.01498965, "auxiliary_loss_mlp": 0.01044041, "balance_loss_clip": 1.31068087, "balance_loss_mlp": 1.02214265, "epoch": 0.4060724485194649, "flos": 21143613265920.0, "grad_norm": 1.9156957595945474, "language_loss": 0.7268995, "learning_rate": 2.690968795494699e-06, "loss": 0.75232959, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 1.88085938, "router_z_loss_mlp": 0.21899414, "step": 6754, "time_per_iteration": 2.8273680210113525 }, { "auxiliary_loss_clip": 0.01500183, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.31096637, "balance_loss_mlp": 1.02143335, "epoch": 0.40613257177213286, "flos": 21766961882880.0, "grad_norm": 1.7513738502730913, "language_loss": 0.83836019, "learning_rate": 2.690603302014844e-06, "loss": 0.8637979, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.22155762, "step": 6755, "time_per_iteration": 2.846447229385376 }, { "auxiliary_loss_clip": 0.01505582, "auxiliary_loss_mlp": 0.01043637, "balance_loss_clip": 1.31474209, "balance_loss_mlp": 1.02120161, "epoch": 0.4061926950248008, "flos": 25565609635200.0, "grad_norm": 1.8814678908606506, "language_loss": 0.71788645, "learning_rate": 2.6902377823472426e-06, "loss": 0.7433787, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.22424316, "step": 6756, "time_per_iteration": 2.8939061164855957 }, { "auxiliary_loss_clip": 0.01498267, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.30926275, "balance_loss_mlp": 1.02056038, "epoch": 0.4062528182774688, "flos": 23706241557120.0, "grad_norm": 1.7879777894184403, "language_loss": 0.80273223, "learning_rate": 2.689872236505755e-06, "loss": 0.82814062, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.22021484, "step": 6757, "time_per_iteration": 2.879807472229004 }, { "auxiliary_loss_clip": 0.01493668, "auxiliary_loss_mlp": 0.01039446, "balance_loss_clip": 1.30697358, "balance_loss_mlp": 1.017869, "epoch": 0.4063129415301368, "flos": 21736077667200.0, "grad_norm": 2.04128193174796, "language_loss": 0.79379487, "learning_rate": 2.6895066645042437e-06, "loss": 0.81912595, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.21569824, "step": 6758, "time_per_iteration": 4.262715101242065 }, { "auxiliary_loss_clip": 0.01485111, "auxiliary_loss_mlp": 0.01034403, "balance_loss_clip": 1.30163288, "balance_loss_mlp": 1.01311255, "epoch": 0.40637306478280477, "flos": 12795450739200.0, "grad_norm": 3.701213696809869, "language_loss": 0.9024868, "learning_rate": 2.6891410663565703e-06, "loss": 0.92768192, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.2130127, "step": 6759, "time_per_iteration": 2.795884132385254 }, { "auxiliary_loss_clip": 0.01506824, "auxiliary_loss_mlp": 0.01036006, "balance_loss_clip": 1.31751966, "balance_loss_mlp": 1.0147512, "epoch": 0.40643318803547274, "flos": 24035099898240.0, "grad_norm": 1.9535453080169018, "language_loss": 0.65187275, "learning_rate": 2.688775442076598e-06, "loss": 0.67730099, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.21252441, "step": 6760, "time_per_iteration": 2.893871784210205 }, { "auxiliary_loss_clip": 0.0151538, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.32619905, "balance_loss_mlp": 1.0213623, "epoch": 0.4064933112881407, "flos": 25603190081280.0, "grad_norm": 1.5124254279120817, "language_loss": 0.75462019, "learning_rate": 2.688409791678193e-06, "loss": 0.78020185, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 1.89160156, "router_z_loss_mlp": 0.2142334, "step": 6761, "time_per_iteration": 2.8730552196502686 }, { "auxiliary_loss_clip": 0.01479957, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.29891288, "balance_loss_mlp": 1.01342833, "epoch": 0.40655343454080867, "flos": 22064438315520.0, "grad_norm": 1.6038825883396162, "language_loss": 0.70640051, "learning_rate": 2.6880441151752185e-06, "loss": 0.73154795, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.21362305, "step": 6762, "time_per_iteration": 2.8499209880828857 }, { "auxiliary_loss_clip": 0.01497903, "auxiliary_loss_mlp": 0.01038926, "balance_loss_clip": 1.31134212, "balance_loss_mlp": 1.01709914, "epoch": 0.40661355779347663, "flos": 26480100412800.0, "grad_norm": 3.071849889144561, "language_loss": 0.73591775, "learning_rate": 2.6876784125815433e-06, "loss": 0.76128608, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.21813965, "step": 6763, "time_per_iteration": 2.8786327838897705 }, { "auxiliary_loss_clip": 0.01505166, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.31643355, "balance_loss_mlp": 1.01382995, "epoch": 0.4066736810461446, "flos": 13268930019840.0, "grad_norm": 1.826199436080857, "language_loss": 0.69959033, "learning_rate": 2.687312683911033e-06, "loss": 0.7249971, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.21679688, "step": 6764, "time_per_iteration": 2.8155317306518555 }, { "auxiliary_loss_clip": 0.01513126, "auxiliary_loss_mlp": 0.0104521, "balance_loss_clip": 1.32214952, "balance_loss_mlp": 1.02263141, "epoch": 0.40673380429881256, "flos": 28815345745920.0, "grad_norm": 2.0253790188856553, "language_loss": 0.91884613, "learning_rate": 2.686946929177557e-06, "loss": 0.94442952, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 1.90820312, "router_z_loss_mlp": 0.22583008, "step": 6765, "time_per_iteration": 2.9646458625793457 }, { "auxiliary_loss_clip": 0.01522106, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.32866335, "balance_loss_mlp": 1.01882815, "epoch": 0.4067939275514805, "flos": 12503041724160.0, "grad_norm": 2.3745151206344697, "language_loss": 0.7989018, "learning_rate": 2.6865811483949855e-06, "loss": 0.82453668, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 1.93164062, "router_z_loss_mlp": 0.22546387, "step": 6766, "time_per_iteration": 4.2178428173065186 }, { "auxiliary_loss_clip": 0.01502237, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.31193852, "balance_loss_mlp": 1.01866555, "epoch": 0.4068540508041485, "flos": 18779745957120.0, "grad_norm": 2.0248636514704272, "language_loss": 0.76903498, "learning_rate": 2.6862153415771867e-06, "loss": 0.79445946, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.2154541, "step": 6767, "time_per_iteration": 4.224372863769531 }, { "auxiliary_loss_clip": 0.01498637, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.31138873, "balance_loss_mlp": 1.01875937, "epoch": 0.40691417405681646, "flos": 28524746522880.0, "grad_norm": 1.689511459844688, "language_loss": 0.78754687, "learning_rate": 2.685849508738034e-06, "loss": 0.81293672, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.21582031, "step": 6768, "time_per_iteration": 4.3601367473602295 }, { "auxiliary_loss_clip": 0.01506432, "auxiliary_loss_mlp": 0.01040369, "balance_loss_clip": 1.31850135, "balance_loss_mlp": 1.01911378, "epoch": 0.4069742973094844, "flos": 20823894374400.0, "grad_norm": 2.116282490586452, "language_loss": 0.88735867, "learning_rate": 2.6854836498913995e-06, "loss": 0.91282666, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.21240234, "step": 6769, "time_per_iteration": 2.839622735977173 }, { "auxiliary_loss_clip": 0.01495614, "auxiliary_loss_mlp": 0.01038922, "balance_loss_clip": 1.31124675, "balance_loss_mlp": 1.01754785, "epoch": 0.4070344205621524, "flos": 21480163223040.0, "grad_norm": 2.362055096940512, "language_loss": 0.81727624, "learning_rate": 2.685117765051156e-06, "loss": 0.84262156, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.21386719, "step": 6770, "time_per_iteration": 2.8860654830932617 }, { "auxiliary_loss_clip": 0.01510779, "auxiliary_loss_mlp": 0.01038933, "balance_loss_clip": 1.31923819, "balance_loss_mlp": 1.01602101, "epoch": 0.4070945438148204, "flos": 26840385907200.0, "grad_norm": 1.8600695440618467, "language_loss": 0.80788243, "learning_rate": 2.6847518542311783e-06, "loss": 0.83337957, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 1.9140625, "router_z_loss_mlp": 0.22900391, "step": 6771, "time_per_iteration": 2.940472364425659 }, { "auxiliary_loss_clip": 0.01501195, "auxiliary_loss_mlp": 0.01041202, "balance_loss_clip": 1.31487727, "balance_loss_mlp": 1.02019751, "epoch": 0.4071546670674884, "flos": 26363965714560.0, "grad_norm": 1.34303116939065, "language_loss": 0.76587629, "learning_rate": 2.6843859174453417e-06, "loss": 0.7913003, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.20996094, "step": 6772, "time_per_iteration": 2.8977255821228027 }, { "auxiliary_loss_clip": 0.01506316, "auxiliary_loss_mlp": 0.01042446, "balance_loss_clip": 1.31656754, "balance_loss_mlp": 1.01979637, "epoch": 0.40721479032015634, "flos": 17904554928000.0, "grad_norm": 1.6857588973799869, "language_loss": 0.82059205, "learning_rate": 2.6840199547075218e-06, "loss": 0.84607965, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 1.8984375, "router_z_loss_mlp": 0.22644043, "step": 6773, "time_per_iteration": 2.785315990447998 }, { "auxiliary_loss_clip": 0.0124419, "auxiliary_loss_mlp": 0.01019855, "balance_loss_clip": 1.13817155, "balance_loss_mlp": 0.99830163, "epoch": 0.4072749135728243, "flos": 49880875190400.0, "grad_norm": 0.8170587861039549, "language_loss": 0.6436764, "learning_rate": 2.683653966031597e-06, "loss": 0.66631687, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.21582031, "step": 6774, "time_per_iteration": 3.239579439163208 }, { "auxiliary_loss_clip": 0.01522091, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.33061099, "balance_loss_mlp": 1.0154711, "epoch": 0.40733503682549227, "flos": 27575163763200.0, "grad_norm": 1.764698141055891, "language_loss": 0.72997159, "learning_rate": 2.683287951431446e-06, "loss": 0.75556356, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.21630859, "step": 6775, "time_per_iteration": 2.8959832191467285 }, { "auxiliary_loss_clip": 0.01512677, "auxiliary_loss_mlp": 0.01043815, "balance_loss_clip": 1.32377017, "balance_loss_mlp": 1.02279854, "epoch": 0.40739516007816023, "flos": 22137020254080.0, "grad_norm": 1.4389810988648113, "language_loss": 0.78140962, "learning_rate": 2.6829219109209474e-06, "loss": 0.80697453, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 1.88964844, "router_z_loss_mlp": 0.21008301, "step": 6776, "time_per_iteration": 2.8857216835021973 }, { "auxiliary_loss_clip": 0.01534337, "auxiliary_loss_mlp": 0.01045309, "balance_loss_clip": 1.34031522, "balance_loss_mlp": 1.02366078, "epoch": 0.4074552833308282, "flos": 23852853267840.0, "grad_norm": 2.448134985495422, "language_loss": 0.80370343, "learning_rate": 2.682555844513981e-06, "loss": 0.82949996, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 1.93945312, "router_z_loss_mlp": 0.21643066, "step": 6777, "time_per_iteration": 2.85481858253479 }, { "auxiliary_loss_clip": 0.01238603, "auxiliary_loss_mlp": 0.01016566, "balance_loss_clip": 1.13483799, "balance_loss_mlp": 0.99806517, "epoch": 0.40751540658349616, "flos": 58030792410240.0, "grad_norm": 0.692172142210136, "language_loss": 0.53244609, "learning_rate": 2.6821897522244286e-06, "loss": 0.5549978, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 0.18457031, "step": 6778, "time_per_iteration": 3.309422254562378 }, { "auxiliary_loss_clip": 0.01526536, "auxiliary_loss_mlp": 0.01048891, "balance_loss_clip": 1.33761549, "balance_loss_mlp": 1.02624071, "epoch": 0.40757552983616413, "flos": 21224429758080.0, "grad_norm": 1.8914929413390473, "language_loss": 0.8313604, "learning_rate": 2.6818236340661718e-06, "loss": 0.85711467, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.2265625, "step": 6779, "time_per_iteration": 2.8551979064941406 }, { "auxiliary_loss_clip": 0.0151916, "auxiliary_loss_mlp": 0.01043933, "balance_loss_clip": 1.33016646, "balance_loss_mlp": 1.02175987, "epoch": 0.4076356530888321, "flos": 26844774652800.0, "grad_norm": 1.4916315013375414, "language_loss": 0.76629949, "learning_rate": 2.6814574900530957e-06, "loss": 0.79193044, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.22167969, "step": 6780, "time_per_iteration": 2.864501476287842 }, { "auxiliary_loss_clip": 0.01495092, "auxiliary_loss_mlp": 0.01039344, "balance_loss_clip": 1.31175828, "balance_loss_mlp": 1.01872098, "epoch": 0.40769577634150006, "flos": 12210542219520.0, "grad_norm": 2.1816618515084634, "language_loss": 0.67393291, "learning_rate": 2.6810913201990827e-06, "loss": 0.69927728, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.2064209, "step": 6781, "time_per_iteration": 2.8452322483062744 }, { "auxiliary_loss_clip": 0.01487614, "auxiliary_loss_mlp": 0.0104199, "balance_loss_clip": 1.30254507, "balance_loss_mlp": 1.0204246, "epoch": 0.407755899594168, "flos": 33667992552960.0, "grad_norm": 1.636403228965355, "language_loss": 0.71666121, "learning_rate": 2.6807251245180183e-06, "loss": 0.74195731, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.21569824, "step": 6782, "time_per_iteration": 2.9384114742279053 }, { "auxiliary_loss_clip": 0.01506425, "auxiliary_loss_mlp": 0.01038976, "balance_loss_clip": 1.31794143, "balance_loss_mlp": 1.01761425, "epoch": 0.407816022846836, "flos": 20167037343360.0, "grad_norm": 1.7112461992755006, "language_loss": 0.82632935, "learning_rate": 2.6803589030237897e-06, "loss": 0.85178334, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.21374512, "step": 6783, "time_per_iteration": 2.8324053287506104 }, { "auxiliary_loss_clip": 0.01509502, "auxiliary_loss_mlp": 0.01045544, "balance_loss_clip": 1.32200682, "balance_loss_mlp": 1.02370453, "epoch": 0.40787614609950396, "flos": 21188749593600.0, "grad_norm": 1.4459681415758028, "language_loss": 0.81486452, "learning_rate": 2.679992655730283e-06, "loss": 0.840415, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.21801758, "step": 6784, "time_per_iteration": 2.854062795639038 }, { "auxiliary_loss_clip": 0.01515838, "auxiliary_loss_mlp": 0.0104271, "balance_loss_clip": 1.32147264, "balance_loss_mlp": 1.01959503, "epoch": 0.407936269352172, "flos": 20530037525760.0, "grad_norm": 1.5914690018963948, "language_loss": 0.66649199, "learning_rate": 2.679626382651386e-06, "loss": 0.69207752, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 1.94042969, "router_z_loss_mlp": 0.2310791, "step": 6785, "time_per_iteration": 2.858283281326294 }, { "auxiliary_loss_clip": 0.01497412, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.30997503, "balance_loss_mlp": 1.01863658, "epoch": 0.40799639260483994, "flos": 20127963818880.0, "grad_norm": 2.019470580090676, "language_loss": 0.80050027, "learning_rate": 2.679260083800989e-06, "loss": 0.82587552, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.21472168, "step": 6786, "time_per_iteration": 2.829864501953125 }, { "auxiliary_loss_clip": 0.01494072, "auxiliary_loss_mlp": 0.01041882, "balance_loss_clip": 1.30791211, "balance_loss_mlp": 1.02044845, "epoch": 0.4080565158575079, "flos": 21007453104000.0, "grad_norm": 1.8742375181532813, "language_loss": 0.82121503, "learning_rate": 2.678893759192982e-06, "loss": 0.84657454, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.2142334, "step": 6787, "time_per_iteration": 2.943730592727661 }, { "auxiliary_loss_clip": 0.01504192, "auxiliary_loss_mlp": 0.01038753, "balance_loss_clip": 1.31738925, "balance_loss_mlp": 1.0168426, "epoch": 0.40811663911017587, "flos": 19327074030720.0, "grad_norm": 1.8359308509936134, "language_loss": 0.68442965, "learning_rate": 2.678527408841255e-06, "loss": 0.70985913, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.21911621, "step": 6788, "time_per_iteration": 2.9029886722564697 }, { "auxiliary_loss_clip": 0.01493592, "auxiliary_loss_mlp": 0.01043203, "balance_loss_clip": 1.30512905, "balance_loss_mlp": 1.02124476, "epoch": 0.40817676236284384, "flos": 40640265383040.0, "grad_norm": 2.1780425832183745, "language_loss": 0.66931713, "learning_rate": 2.678161032759701e-06, "loss": 0.69468504, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 1.8828125, "router_z_loss_mlp": 0.21960449, "step": 6789, "time_per_iteration": 3.0607638359069824 }, { "auxiliary_loss_clip": 0.01483505, "auxiliary_loss_mlp": 0.01039844, "balance_loss_clip": 1.29796588, "balance_loss_mlp": 1.01452422, "epoch": 0.4082368856155118, "flos": 20531847317760.0, "grad_norm": 1.7983495064251784, "language_loss": 0.61893183, "learning_rate": 2.6777946309622123e-06, "loss": 0.64416528, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.25317383, "step": 6790, "time_per_iteration": 2.9165382385253906 }, { "auxiliary_loss_clip": 0.01487055, "auxiliary_loss_mlp": 0.01042716, "balance_loss_clip": 1.30252981, "balance_loss_mlp": 1.02031696, "epoch": 0.40829700886817977, "flos": 11432709296640.0, "grad_norm": 3.060044978260564, "language_loss": 0.71191627, "learning_rate": 2.677428203462683e-06, "loss": 0.73721397, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22412109, "step": 6791, "time_per_iteration": 2.8610119819641113 }, { "auxiliary_loss_clip": 0.01252357, "auxiliary_loss_mlp": 0.01045421, "balance_loss_clip": 1.14046538, "balance_loss_mlp": 1.02043521, "epoch": 0.40835713212084773, "flos": 67361367438720.0, "grad_norm": 0.7536062885560001, "language_loss": 0.59745848, "learning_rate": 2.6770617502750093e-06, "loss": 0.62043631, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 1.1171875, "router_z_loss_mlp": 0.24902344, "step": 6792, "time_per_iteration": 3.352713108062744 }, { "auxiliary_loss_clip": 0.01499891, "auxiliary_loss_mlp": 0.01041598, "balance_loss_clip": 1.31022573, "balance_loss_mlp": 1.01797128, "epoch": 0.4084172553735157, "flos": 21771938810880.0, "grad_norm": 1.8653781861215921, "language_loss": 0.80707496, "learning_rate": 2.6766952714130857e-06, "loss": 0.83248985, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.23632812, "step": 6793, "time_per_iteration": 2.8783676624298096 }, { "auxiliary_loss_clip": 0.01493381, "auxiliary_loss_mlp": 0.01038392, "balance_loss_clip": 1.30660594, "balance_loss_mlp": 1.01679111, "epoch": 0.40847737862618366, "flos": 27428642542080.0, "grad_norm": 1.9974359165490705, "language_loss": 0.85615277, "learning_rate": 2.6763287668908094e-06, "loss": 0.88147044, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.21594238, "step": 6794, "time_per_iteration": 4.288780212402344 }, { "auxiliary_loss_clip": 0.01493482, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.30561614, "balance_loss_mlp": 1.01740408, "epoch": 0.4085375018788516, "flos": 18596232472320.0, "grad_norm": 1.5422245037601559, "language_loss": 0.80157834, "learning_rate": 2.6759622367220788e-06, "loss": 0.826922, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.23461914, "step": 6795, "time_per_iteration": 2.8388419151306152 }, { "auxiliary_loss_clip": 0.0149902, "auxiliary_loss_mlp": 0.01041003, "balance_loss_clip": 1.30688858, "balance_loss_mlp": 1.01764941, "epoch": 0.4085976251315196, "flos": 15419621237760.0, "grad_norm": 2.4412738476014684, "language_loss": 0.70090044, "learning_rate": 2.675595680920792e-06, "loss": 0.7263006, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 1.91894531, "router_z_loss_mlp": 0.23339844, "step": 6796, "time_per_iteration": 2.824941635131836 }, { "auxiliary_loss_clip": 0.01486853, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.30021429, "balance_loss_mlp": 1.0183301, "epoch": 0.40865774838418756, "flos": 21262281672960.0, "grad_norm": 1.8967021489499423, "language_loss": 0.78455925, "learning_rate": 2.6752290995008498e-06, "loss": 0.80983377, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22253418, "step": 6797, "time_per_iteration": 2.8418192863464355 }, { "auxiliary_loss_clip": 0.0149284, "auxiliary_loss_mlp": 0.01042823, "balance_loss_clip": 1.30499482, "balance_loss_mlp": 1.02099633, "epoch": 0.4087178716368556, "flos": 13780261215360.0, "grad_norm": 2.1150561590414276, "language_loss": 0.86518002, "learning_rate": 2.6748624924761523e-06, "loss": 0.89053667, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.21838379, "step": 6798, "time_per_iteration": 2.8100321292877197 }, { "auxiliary_loss_clip": 0.01484658, "auxiliary_loss_mlp": 0.01038949, "balance_loss_clip": 1.30071568, "balance_loss_mlp": 1.0171814, "epoch": 0.40877799488952354, "flos": 23631895071360.0, "grad_norm": 1.47193709529708, "language_loss": 0.84837723, "learning_rate": 2.674495859860601e-06, "loss": 0.87361336, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21765137, "step": 6799, "time_per_iteration": 2.8991024494171143 }, { "auxiliary_loss_clip": 0.01489024, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 1.30325031, "balance_loss_mlp": 1.01529884, "epoch": 0.4088381181421915, "flos": 20927451018240.0, "grad_norm": 5.279358679648146, "language_loss": 0.84361291, "learning_rate": 2.6741292016681e-06, "loss": 0.86888695, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.23083496, "step": 6800, "time_per_iteration": 2.802234411239624 }, { "auxiliary_loss_clip": 0.01490728, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.30240321, "balance_loss_mlp": 1.01681042, "epoch": 0.4088982413948595, "flos": 13305379345920.0, "grad_norm": 2.3157314609689266, "language_loss": 0.74797857, "learning_rate": 2.6737625179125514e-06, "loss": 0.7732892, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.23522949, "step": 6801, "time_per_iteration": 4.206390857696533 }, { "auxiliary_loss_clip": 0.01494594, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.30775046, "balance_loss_mlp": 1.01902127, "epoch": 0.40895836464752744, "flos": 15275950439040.0, "grad_norm": 3.0700035496478493, "language_loss": 0.805556, "learning_rate": 2.673395808607861e-06, "loss": 0.83091617, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22387695, "step": 6802, "time_per_iteration": 4.210785150527954 }, { "auxiliary_loss_clip": 0.01496015, "auxiliary_loss_mlp": 0.01038704, "balance_loss_clip": 1.30617678, "balance_loss_mlp": 1.01548207, "epoch": 0.4090184879001954, "flos": 14509473960960.0, "grad_norm": 6.844553471056186, "language_loss": 0.77048451, "learning_rate": 2.673029073767934e-06, "loss": 0.79583168, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 1.89746094, "router_z_loss_mlp": 0.2322998, "step": 6803, "time_per_iteration": 4.302134275436401 }, { "auxiliary_loss_clip": 0.01470979, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.28506875, "balance_loss_mlp": 1.01558864, "epoch": 0.40907861115286337, "flos": 13889609193600.0, "grad_norm": 1.8426000799444895, "language_loss": 0.80055064, "learning_rate": 2.6726623134066764e-06, "loss": 0.82564002, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22387695, "step": 6804, "time_per_iteration": 2.8246982097625732 }, { "auxiliary_loss_clip": 0.01504447, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.31174302, "balance_loss_mlp": 1.02076197, "epoch": 0.40913873440553133, "flos": 28049185981440.0, "grad_norm": 1.7528053405160642, "language_loss": 0.75975156, "learning_rate": 2.672295527537998e-06, "loss": 0.78522569, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.22192383, "step": 6805, "time_per_iteration": 2.885532855987549 }, { "auxiliary_loss_clip": 0.01507801, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.31771517, "balance_loss_mlp": 1.01943529, "epoch": 0.4091988576581993, "flos": 21628629970560.0, "grad_norm": 1.584174310941899, "language_loss": 0.80180711, "learning_rate": 2.671928716175804e-06, "loss": 0.82730657, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 1.89941406, "router_z_loss_mlp": 0.22717285, "step": 6806, "time_per_iteration": 2.8513293266296387 }, { "auxiliary_loss_clip": 0.01502775, "auxiliary_loss_mlp": 0.01035675, "balance_loss_clip": 1.31216526, "balance_loss_mlp": 1.01341891, "epoch": 0.40925898091086726, "flos": 25233855626880.0, "grad_norm": 2.1597590798744815, "language_loss": 0.73273909, "learning_rate": 2.671561879334007e-06, "loss": 0.75812358, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.22253418, "step": 6807, "time_per_iteration": 2.8295576572418213 }, { "auxiliary_loss_clip": 0.01250856, "auxiliary_loss_mlp": 0.01018235, "balance_loss_clip": 1.13962078, "balance_loss_mlp": 0.99467963, "epoch": 0.40931910416353523, "flos": 68960251347840.0, "grad_norm": 0.8323223525576464, "language_loss": 0.58875138, "learning_rate": 2.6711950170265155e-06, "loss": 0.61144233, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.23535156, "step": 6808, "time_per_iteration": 3.506718873977661 }, { "auxiliary_loss_clip": 0.01486683, "auxiliary_loss_mlp": 0.01040073, "balance_loss_clip": 1.30107307, "balance_loss_mlp": 1.01739979, "epoch": 0.4093792274162032, "flos": 20198147783040.0, "grad_norm": 1.9207606173927545, "language_loss": 0.55419219, "learning_rate": 2.670828129267242e-06, "loss": 0.57945973, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22668457, "step": 6809, "time_per_iteration": 2.8651437759399414 }, { "auxiliary_loss_clip": 0.01487406, "auxiliary_loss_mlp": 0.01034912, "balance_loss_clip": 1.3009733, "balance_loss_mlp": 1.01240528, "epoch": 0.40943935066887116, "flos": 25239737450880.0, "grad_norm": 1.8486966128463767, "language_loss": 0.83430088, "learning_rate": 2.6704612160700983e-06, "loss": 0.85952407, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.22497559, "step": 6810, "time_per_iteration": 2.88028621673584 }, { "auxiliary_loss_clip": 0.01504021, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.31372058, "balance_loss_mlp": 1.01459539, "epoch": 0.4094994739215392, "flos": 23264958591360.0, "grad_norm": 2.614185002132003, "language_loss": 0.78697407, "learning_rate": 2.670094277448999e-06, "loss": 0.81240243, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 1.90332031, "router_z_loss_mlp": 0.24230957, "step": 6811, "time_per_iteration": 2.8286502361297607 }, { "auxiliary_loss_clip": 0.01492006, "auxiliary_loss_mlp": 0.01039884, "balance_loss_clip": 1.30228519, "balance_loss_mlp": 1.01474297, "epoch": 0.40955959717420715, "flos": 17390554289280.0, "grad_norm": 2.261377418804439, "language_loss": 0.71083051, "learning_rate": 2.669727313417857e-06, "loss": 0.73614943, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.25158691, "step": 6812, "time_per_iteration": 2.8374407291412354 }, { "auxiliary_loss_clip": 0.01486445, "auxiliary_loss_mlp": 0.01038296, "balance_loss_clip": 1.29963136, "balance_loss_mlp": 1.01346445, "epoch": 0.4096197204268751, "flos": 25093261474560.0, "grad_norm": 1.6067315606072514, "language_loss": 0.67658532, "learning_rate": 2.6693603239905872e-06, "loss": 0.70183265, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.24853516, "step": 6813, "time_per_iteration": 2.9135401248931885 }, { "auxiliary_loss_clip": 0.01479257, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.29398894, "balance_loss_mlp": 1.01213515, "epoch": 0.4096798436795431, "flos": 30597743139840.0, "grad_norm": 2.156529779707446, "language_loss": 0.74802125, "learning_rate": 2.6689933091811087e-06, "loss": 0.77316964, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.23449707, "step": 6814, "time_per_iteration": 2.8980822563171387 }, { "auxiliary_loss_clip": 0.01505553, "auxiliary_loss_mlp": 0.01040271, "balance_loss_clip": 1.31304431, "balance_loss_mlp": 1.01732278, "epoch": 0.40973996693221104, "flos": 24144176407680.0, "grad_norm": 2.3429798223434033, "language_loss": 0.67325747, "learning_rate": 2.6686262690033357e-06, "loss": 0.69871569, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 1.92578125, "router_z_loss_mlp": 0.22937012, "step": 6815, "time_per_iteration": 2.834437370300293 }, { "auxiliary_loss_clip": 0.01472733, "auxiliary_loss_mlp": 0.01042777, "balance_loss_clip": 1.29061389, "balance_loss_mlp": 1.02056789, "epoch": 0.409800090184879, "flos": 23999826936960.0, "grad_norm": 2.0371055127326, "language_loss": 0.77519882, "learning_rate": 2.668259203471188e-06, "loss": 0.80035388, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.22216797, "step": 6816, "time_per_iteration": 2.8439950942993164 }, { "auxiliary_loss_clip": 0.01477481, "auxiliary_loss_mlp": 0.01039297, "balance_loss_clip": 1.29079771, "balance_loss_mlp": 1.01552653, "epoch": 0.40986021343754697, "flos": 16152272588160.0, "grad_norm": 2.1315652271517016, "language_loss": 0.82718444, "learning_rate": 2.6678921125985843e-06, "loss": 0.8523522, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.23791504, "step": 6817, "time_per_iteration": 2.8048946857452393 }, { "auxiliary_loss_clip": 0.01511903, "auxiliary_loss_mlp": 0.01040801, "balance_loss_clip": 1.31659222, "balance_loss_mlp": 1.01680434, "epoch": 0.40992033669021494, "flos": 24801666865920.0, "grad_norm": 1.6946449395947591, "language_loss": 0.8141312, "learning_rate": 2.667524996399444e-06, "loss": 0.8396582, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 1.95117188, "router_z_loss_mlp": 0.23986816, "step": 6818, "time_per_iteration": 2.8447911739349365 }, { "auxiliary_loss_clip": 0.01468693, "auxiliary_loss_mlp": 0.01041651, "balance_loss_clip": 1.2835083, "balance_loss_mlp": 1.01786876, "epoch": 0.4099804599428829, "flos": 29653589756160.0, "grad_norm": 1.705287654418305, "language_loss": 0.66896433, "learning_rate": 2.66715785488769e-06, "loss": 0.69406772, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.23779297, "step": 6819, "time_per_iteration": 2.8805065155029297 }, { "auxiliary_loss_clip": 0.01508421, "auxiliary_loss_mlp": 0.01040623, "balance_loss_clip": 1.31330132, "balance_loss_mlp": 1.01731753, "epoch": 0.41004058319555087, "flos": 24837256540800.0, "grad_norm": 1.7244506237584396, "language_loss": 0.86118644, "learning_rate": 2.6667906880772428e-06, "loss": 0.88667685, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 1.94726562, "router_z_loss_mlp": 0.2331543, "step": 6820, "time_per_iteration": 2.897613525390625 }, { "auxiliary_loss_clip": 0.01477886, "auxiliary_loss_mlp": 0.01044924, "balance_loss_clip": 1.29430628, "balance_loss_mlp": 1.02178538, "epoch": 0.41010070644821883, "flos": 25748037244800.0, "grad_norm": 1.7921826800197256, "language_loss": 0.72247565, "learning_rate": 2.6664234959820256e-06, "loss": 0.74770379, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23144531, "step": 6821, "time_per_iteration": 2.9182918071746826 }, { "auxiliary_loss_clip": 0.01491708, "auxiliary_loss_mlp": 0.01045046, "balance_loss_clip": 1.30392683, "balance_loss_mlp": 1.02191973, "epoch": 0.4101608297008868, "flos": 22356213903360.0, "grad_norm": 2.3698047690408894, "language_loss": 0.74807107, "learning_rate": 2.6660562786159634e-06, "loss": 0.77343863, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.23132324, "step": 6822, "time_per_iteration": 2.881303310394287 }, { "auxiliary_loss_clip": 0.0149689, "auxiliary_loss_mlp": 0.01040206, "balance_loss_clip": 1.30860281, "balance_loss_mlp": 1.01679361, "epoch": 0.41022095295355476, "flos": 21955226071680.0, "grad_norm": 2.4246625147081526, "language_loss": 0.76946557, "learning_rate": 2.6656890359929796e-06, "loss": 0.79483652, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.23413086, "step": 6823, "time_per_iteration": 2.8137426376342773 }, { "auxiliary_loss_clip": 0.01520089, "auxiliary_loss_mlp": 0.01048429, "balance_loss_clip": 1.32630479, "balance_loss_mlp": 1.02393138, "epoch": 0.4102810762062228, "flos": 27461065080960.0, "grad_norm": 1.633870564991092, "language_loss": 0.74113715, "learning_rate": 2.665321768127001e-06, "loss": 0.76682234, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.24511719, "step": 6824, "time_per_iteration": 2.8842718601226807 }, { "auxiliary_loss_clip": 0.01510394, "auxiliary_loss_mlp": 0.0104056, "balance_loss_clip": 1.31646633, "balance_loss_mlp": 1.01703978, "epoch": 0.41034119945889075, "flos": 24510117502080.0, "grad_norm": 1.8911254934102826, "language_loss": 0.72604531, "learning_rate": 2.6649544750319548e-06, "loss": 0.75155485, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 1.93847656, "router_z_loss_mlp": 0.23510742, "step": 6825, "time_per_iteration": 2.95027494430542 }, { "auxiliary_loss_clip": 0.0149451, "auxiliary_loss_mlp": 0.01047178, "balance_loss_clip": 1.30716169, "balance_loss_mlp": 1.02476633, "epoch": 0.4104013227115587, "flos": 24363279567360.0, "grad_norm": 2.0859158041072257, "language_loss": 0.85443997, "learning_rate": 2.664587156721768e-06, "loss": 0.87985682, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.22412109, "step": 6826, "time_per_iteration": 2.861497402191162 }, { "auxiliary_loss_clip": 0.01476708, "auxiliary_loss_mlp": 0.01048106, "balance_loss_clip": 1.29345107, "balance_loss_mlp": 1.02464545, "epoch": 0.4104614459642267, "flos": 23739026054400.0, "grad_norm": 1.8082822228329583, "language_loss": 0.67315394, "learning_rate": 2.6642198132103696e-06, "loss": 0.69840205, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23461914, "step": 6827, "time_per_iteration": 2.861844778060913 }, { "auxiliary_loss_clip": 0.0147627, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.29196548, "balance_loss_mlp": 1.01977801, "epoch": 0.41052156921689464, "flos": 22138287108480.0, "grad_norm": 1.5900784303265159, "language_loss": 0.73109925, "learning_rate": 2.663852444511689e-06, "loss": 0.75629914, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23901367, "step": 6828, "time_per_iteration": 4.270995616912842 }, { "auxiliary_loss_clip": 0.0152184, "auxiliary_loss_mlp": 0.01048246, "balance_loss_clip": 1.32719588, "balance_loss_mlp": 1.02540517, "epoch": 0.4105816924695626, "flos": 20094048201600.0, "grad_norm": 2.4314770035356115, "language_loss": 0.8462798, "learning_rate": 2.6634850506396574e-06, "loss": 0.87198067, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 1.9453125, "router_z_loss_mlp": 0.22839355, "step": 6829, "time_per_iteration": 2.891573429107666 }, { "auxiliary_loss_clip": 0.01486569, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.30024505, "balance_loss_mlp": 1.0210464, "epoch": 0.4106418157222306, "flos": 18085715683200.0, "grad_norm": 2.5933670443359844, "language_loss": 0.90433812, "learning_rate": 2.663117631608206e-06, "loss": 0.92965245, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.23840332, "step": 6830, "time_per_iteration": 2.858628034591675 }, { "auxiliary_loss_clip": 0.01493384, "auxiliary_loss_mlp": 0.01046007, "balance_loss_clip": 1.30623245, "balance_loss_mlp": 1.02373886, "epoch": 0.41070193897489854, "flos": 21656528029440.0, "grad_norm": 2.420352675989182, "language_loss": 0.66549397, "learning_rate": 2.662750187431268e-06, "loss": 0.69088787, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.22265625, "step": 6831, "time_per_iteration": 2.845128297805786 }, { "auxiliary_loss_clip": 0.01486334, "auxiliary_loss_mlp": 0.01043197, "balance_loss_clip": 1.30071735, "balance_loss_mlp": 1.01987958, "epoch": 0.4107620622275665, "flos": 26658636969600.0, "grad_norm": 1.8522413161824247, "language_loss": 0.70325738, "learning_rate": 2.662382718122776e-06, "loss": 0.7285527, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.2331543, "step": 6832, "time_per_iteration": 2.9097986221313477 }, { "auxiliary_loss_clip": 0.01485424, "auxiliary_loss_mlp": 0.01047003, "balance_loss_clip": 1.29830289, "balance_loss_mlp": 1.02466285, "epoch": 0.41082218548023447, "flos": 18743613344640.0, "grad_norm": 2.3914154827605802, "language_loss": 0.74307233, "learning_rate": 2.662015223696666e-06, "loss": 0.76839662, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.22338867, "step": 6833, "time_per_iteration": 2.894451379776001 }, { "auxiliary_loss_clip": 0.01507902, "auxiliary_loss_mlp": 0.01049967, "balance_loss_clip": 1.3156383, "balance_loss_mlp": 1.02461171, "epoch": 0.41088230873290243, "flos": 22904175404160.0, "grad_norm": 1.5968088300560257, "language_loss": 0.73362124, "learning_rate": 2.6616477041668713e-06, "loss": 0.75919992, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 1.92089844, "router_z_loss_mlp": 0.25366211, "step": 6834, "time_per_iteration": 2.868238925933838 }, { "auxiliary_loss_clip": 0.01502374, "auxiliary_loss_mlp": 0.01048249, "balance_loss_clip": 1.31007266, "balance_loss_mlp": 1.02492011, "epoch": 0.4109424319855704, "flos": 24286897065600.0, "grad_norm": 2.793194308429929, "language_loss": 0.72230422, "learning_rate": 2.661280159547329e-06, "loss": 0.74781042, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 1.92285156, "router_z_loss_mlp": 0.23352051, "step": 6835, "time_per_iteration": 4.229032039642334 }, { "auxiliary_loss_clip": 0.01495309, "auxiliary_loss_mlp": 0.01047519, "balance_loss_clip": 1.30688, "balance_loss_mlp": 1.02200806, "epoch": 0.41100255523823837, "flos": 12976837718400.0, "grad_norm": 1.988134279925879, "language_loss": 0.87960798, "learning_rate": 2.660912589851978e-06, "loss": 0.90503627, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.25512695, "step": 6836, "time_per_iteration": 2.8039655685424805 }, { "auxiliary_loss_clip": 0.01468965, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.28607905, "balance_loss_mlp": 1.01846933, "epoch": 0.4110626784909064, "flos": 23155203409920.0, "grad_norm": 1.900427424252544, "language_loss": 0.69257081, "learning_rate": 2.6605449950947547e-06, "loss": 0.7177006, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.25549316, "step": 6837, "time_per_iteration": 4.246564865112305 }, { "auxiliary_loss_clip": 0.01488475, "auxiliary_loss_mlp": 0.01045209, "balance_loss_clip": 1.29932785, "balance_loss_mlp": 1.02158177, "epoch": 0.41112280174357435, "flos": 22757654183040.0, "grad_norm": 2.207777278216685, "language_loss": 0.75998211, "learning_rate": 2.660177375289599e-06, "loss": 0.78531897, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 1.890625, "router_z_loss_mlp": 0.23620605, "step": 6838, "time_per_iteration": 4.2281334400177 }, { "auxiliary_loss_clip": 0.01491048, "auxiliary_loss_mlp": 0.01040173, "balance_loss_clip": 1.30416203, "balance_loss_mlp": 1.01580644, "epoch": 0.4111829249962423, "flos": 21111416951040.0, "grad_norm": 2.04915714002317, "language_loss": 0.8284483, "learning_rate": 2.659809730450451e-06, "loss": 0.85376054, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.24365234, "step": 6839, "time_per_iteration": 2.841240167617798 }, { "auxiliary_loss_clip": 0.01488941, "auxiliary_loss_mlp": 0.01039187, "balance_loss_clip": 1.30232918, "balance_loss_mlp": 1.01589346, "epoch": 0.4112430482489103, "flos": 21515436184320.0, "grad_norm": 1.9429800355147544, "language_loss": 0.81462586, "learning_rate": 2.6594420605912523e-06, "loss": 0.83990717, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.23303223, "step": 6840, "time_per_iteration": 2.8043906688690186 }, { "auxiliary_loss_clip": 0.01472532, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.28852987, "balance_loss_mlp": 1.01434779, "epoch": 0.41130317150157825, "flos": 19578644974080.0, "grad_norm": 3.022319855016868, "language_loss": 0.68263793, "learning_rate": 2.6590743657259442e-06, "loss": 0.7077446, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.23791504, "step": 6841, "time_per_iteration": 2.845144033432007 }, { "auxiliary_loss_clip": 0.01261095, "auxiliary_loss_mlp": 0.01087266, "balance_loss_clip": 1.14619362, "balance_loss_mlp": 1.05713022, "epoch": 0.4113632947542462, "flos": 62416339453440.0, "grad_norm": 0.7778926550012077, "language_loss": 0.59730232, "learning_rate": 2.65870664586847e-06, "loss": 0.62078595, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 0.30078125, "step": 6842, "time_per_iteration": 3.4109702110290527 }, { "auxiliary_loss_clip": 0.01469282, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.28772473, "balance_loss_mlp": 1.02056646, "epoch": 0.4114234180069142, "flos": 13926782436480.0, "grad_norm": 2.1895254012432757, "language_loss": 0.70712823, "learning_rate": 2.6583389010327742e-06, "loss": 0.7322644, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.2376709, "step": 6843, "time_per_iteration": 2.851491928100586 }, { "auxiliary_loss_clip": 0.01255814, "auxiliary_loss_mlp": 0.01060661, "balance_loss_clip": 1.14188027, "balance_loss_mlp": 1.03357625, "epoch": 0.41148354125958214, "flos": 64960100663040.0, "grad_norm": 0.7407055049932092, "language_loss": 0.5368247, "learning_rate": 2.6579711312328013e-06, "loss": 0.55998945, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.27148438, "step": 6844, "time_per_iteration": 3.3211963176727295 }, { "auxiliary_loss_clip": 0.01484423, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.29965639, "balance_loss_mlp": 1.01615095, "epoch": 0.4115436645122501, "flos": 18736645645440.0, "grad_norm": 1.599714065018392, "language_loss": 0.66596085, "learning_rate": 2.6576033364824967e-06, "loss": 0.69119895, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.23205566, "step": 6845, "time_per_iteration": 2.844075918197632 }, { "auxiliary_loss_clip": 0.01495056, "auxiliary_loss_mlp": 0.01040003, "balance_loss_clip": 1.30832243, "balance_loss_mlp": 1.01639962, "epoch": 0.41160378776491807, "flos": 16261484832000.0, "grad_norm": 1.8545358261424179, "language_loss": 0.70765793, "learning_rate": 2.657235516795808e-06, "loss": 0.73300844, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 1.86621094, "router_z_loss_mlp": 0.23583984, "step": 6846, "time_per_iteration": 2.847303628921509 }, { "auxiliary_loss_clip": 0.01474085, "auxiliary_loss_mlp": 0.01040416, "balance_loss_clip": 1.29006851, "balance_loss_mlp": 1.01668191, "epoch": 0.41166391101758604, "flos": 27982983559680.0, "grad_norm": 2.0740972363976016, "language_loss": 0.65966374, "learning_rate": 2.6568676721866826e-06, "loss": 0.68480873, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.23742676, "step": 6847, "time_per_iteration": 2.9432668685913086 }, { "auxiliary_loss_clip": 0.01468088, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.28329289, "balance_loss_mlp": 1.01803613, "epoch": 0.411724034270254, "flos": 34144593724800.0, "grad_norm": 1.4479413651102355, "language_loss": 0.71447742, "learning_rate": 2.656499802669069e-06, "loss": 0.73958701, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.24853516, "step": 6848, "time_per_iteration": 2.954068183898926 }, { "auxiliary_loss_clip": 0.01255129, "auxiliary_loss_mlp": 0.01028373, "balance_loss_clip": 1.14034271, "balance_loss_mlp": 1.00186062, "epoch": 0.41178415752292197, "flos": 67956908486400.0, "grad_norm": 0.8955491248080992, "language_loss": 0.56309438, "learning_rate": 2.6561319082569174e-06, "loss": 0.58592939, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 0.265625, "step": 6849, "time_per_iteration": 3.441833019256592 }, { "auxiliary_loss_clip": 0.01469596, "auxiliary_loss_mlp": 0.01040829, "balance_loss_clip": 1.28840089, "balance_loss_mlp": 1.01656985, "epoch": 0.41184428077558993, "flos": 34327338048000.0, "grad_norm": 2.4875529014082907, "language_loss": 0.76922721, "learning_rate": 2.6557639889641783e-06, "loss": 0.79433143, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.24267578, "step": 6850, "time_per_iteration": 2.91916823387146 }, { "auxiliary_loss_clip": 0.01476821, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.29365945, "balance_loss_mlp": 1.01842976, "epoch": 0.41190440402825795, "flos": 35457583870080.0, "grad_norm": 1.6599565790335393, "language_loss": 0.68368983, "learning_rate": 2.6553960448048025e-06, "loss": 0.70889199, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.24975586, "step": 6851, "time_per_iteration": 2.9478774070739746 }, { "auxiliary_loss_clip": 0.01489035, "auxiliary_loss_mlp": 0.01045399, "balance_loss_clip": 1.30061626, "balance_loss_mlp": 1.01916122, "epoch": 0.4119645272809259, "flos": 20859574538880.0, "grad_norm": 4.093361043007292, "language_loss": 0.80608428, "learning_rate": 2.655028075792743e-06, "loss": 0.83142859, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 1.88671875, "router_z_loss_mlp": 0.26257324, "step": 6852, "time_per_iteration": 2.798630714416504 }, { "auxiliary_loss_clip": 0.01501884, "auxiliary_loss_mlp": 0.01039983, "balance_loss_clip": 1.31094241, "balance_loss_mlp": 1.01536632, "epoch": 0.4120246505335939, "flos": 27573172992000.0, "grad_norm": 2.7528250641062053, "language_loss": 0.78394663, "learning_rate": 2.6546600819419537e-06, "loss": 0.80936527, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 1.91015625, "router_z_loss_mlp": 0.24621582, "step": 6853, "time_per_iteration": 2.8998210430145264 }, { "auxiliary_loss_clip": 0.0151055, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.31571579, "balance_loss_mlp": 1.01556325, "epoch": 0.41208477378626185, "flos": 37829459508480.0, "grad_norm": 1.7192068076063796, "language_loss": 0.66882527, "learning_rate": 2.6542920632663883e-06, "loss": 0.69433427, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 1.94628906, "router_z_loss_mlp": 0.2479248, "step": 6854, "time_per_iteration": 2.9895408153533936 }, { "auxiliary_loss_clip": 0.01489289, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.30173576, "balance_loss_mlp": 1.01483643, "epoch": 0.4121448970389298, "flos": 23451232008960.0, "grad_norm": 1.8294741119445719, "language_loss": 0.8467797, "learning_rate": 2.6539240197800023e-06, "loss": 0.87206185, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.24108887, "step": 6855, "time_per_iteration": 2.825977325439453 }, { "auxiliary_loss_clip": 0.0146181, "auxiliary_loss_mlp": 0.01042846, "balance_loss_clip": 1.28019762, "balance_loss_mlp": 1.01973104, "epoch": 0.4122050202915978, "flos": 21335406549120.0, "grad_norm": 1.9175771881984625, "language_loss": 0.79817897, "learning_rate": 2.6535559514967517e-06, "loss": 0.8232255, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23120117, "step": 6856, "time_per_iteration": 2.9382219314575195 }, { "auxiliary_loss_clip": 0.01481002, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.29526663, "balance_loss_mlp": 1.01665521, "epoch": 0.41226514354426574, "flos": 17313900318720.0, "grad_norm": 2.3882862688394773, "language_loss": 0.80672741, "learning_rate": 2.6531878584305935e-06, "loss": 0.83193213, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.22814941, "step": 6857, "time_per_iteration": 2.8730032444000244 }, { "auxiliary_loss_clip": 0.01485696, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.29840446, "balance_loss_mlp": 1.01771307, "epoch": 0.4123252667969337, "flos": 17647645098240.0, "grad_norm": 4.477133480406094, "language_loss": 0.71517229, "learning_rate": 2.6528197405954873e-06, "loss": 0.74043924, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.23291016, "step": 6858, "time_per_iteration": 2.845611333847046 }, { "auxiliary_loss_clip": 0.01481609, "auxiliary_loss_mlp": 0.01045615, "balance_loss_clip": 1.2970283, "balance_loss_mlp": 1.02092671, "epoch": 0.4123853900496017, "flos": 46440730402560.0, "grad_norm": 1.5771376995442332, "language_loss": 0.59834838, "learning_rate": 2.652451598005391e-06, "loss": 0.62362063, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.24682617, "step": 6859, "time_per_iteration": 3.0960280895233154 }, { "auxiliary_loss_clip": 0.01488418, "auxiliary_loss_mlp": 0.01042702, "balance_loss_clip": 1.2987653, "balance_loss_mlp": 1.01918197, "epoch": 0.41244551330226964, "flos": 17683913445120.0, "grad_norm": 1.9663359051327043, "language_loss": 0.74357039, "learning_rate": 2.652083430674264e-06, "loss": 0.76888162, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.23522949, "step": 6860, "time_per_iteration": 2.842412233352661 }, { "auxiliary_loss_clip": 0.01472462, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.28814697, "balance_loss_mlp": 1.01788545, "epoch": 0.4125056365549376, "flos": 18701960866560.0, "grad_norm": 1.6949722307987445, "language_loss": 0.75003755, "learning_rate": 2.651715238616068e-06, "loss": 0.77516615, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22509766, "step": 6861, "time_per_iteration": 2.8212027549743652 }, { "auxiliary_loss_clip": 0.01470888, "auxiliary_loss_mlp": 0.01041382, "balance_loss_clip": 1.28818917, "balance_loss_mlp": 1.01910138, "epoch": 0.41256575980760557, "flos": 17904419193600.0, "grad_norm": 2.7392445868276383, "language_loss": 0.8079555, "learning_rate": 2.651347021844765e-06, "loss": 0.83307821, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.22290039, "step": 6862, "time_per_iteration": 2.795100450515747 }, { "auxiliary_loss_clip": 0.01479343, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.29515588, "balance_loss_mlp": 1.01840115, "epoch": 0.41262588306027354, "flos": 21991403928960.0, "grad_norm": 1.7378878457055444, "language_loss": 0.77036387, "learning_rate": 2.650978780374318e-06, "loss": 0.79556143, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22009277, "step": 6863, "time_per_iteration": 4.253396034240723 }, { "auxiliary_loss_clip": 0.0125764, "auxiliary_loss_mlp": 0.01052216, "balance_loss_clip": 1.1467793, "balance_loss_mlp": 1.02904177, "epoch": 0.41268600631294156, "flos": 53375350279680.0, "grad_norm": 0.7120610851283444, "language_loss": 0.52716279, "learning_rate": 2.650610514218691e-06, "loss": 0.55026138, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.23144531, "step": 6864, "time_per_iteration": 3.327979326248169 }, { "auxiliary_loss_clip": 0.01497462, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.30744147, "balance_loss_mlp": 1.01784968, "epoch": 0.4127461295656095, "flos": 24395023434240.0, "grad_norm": 1.7255539229402597, "language_loss": 0.73271179, "learning_rate": 2.6502422233918468e-06, "loss": 0.75809443, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.22961426, "step": 6865, "time_per_iteration": 2.866166353225708 }, { "auxiliary_loss_clip": 0.01252854, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.14252031, "balance_loss_mlp": 1.00590169, "epoch": 0.4128062528182775, "flos": 71736100974720.0, "grad_norm": 0.9256410644446629, "language_loss": 0.66651034, "learning_rate": 2.649873907907753e-06, "loss": 0.68938398, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.28515625, "step": 6866, "time_per_iteration": 3.1921327114105225 }, { "auxiliary_loss_clip": 0.01466348, "auxiliary_loss_mlp": 0.01041662, "balance_loss_clip": 1.28318715, "balance_loss_mlp": 1.01933408, "epoch": 0.41286637607094545, "flos": 17855708526720.0, "grad_norm": 2.442182167673794, "language_loss": 0.82400918, "learning_rate": 2.649505567780375e-06, "loss": 0.84908921, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.22338867, "step": 6867, "time_per_iteration": 2.9046554565429688 }, { "auxiliary_loss_clip": 0.01492797, "auxiliary_loss_mlp": 0.01050075, "balance_loss_clip": 1.30393386, "balance_loss_mlp": 1.02612615, "epoch": 0.4129264993236134, "flos": 25558641936000.0, "grad_norm": 1.9904360713065918, "language_loss": 0.78160602, "learning_rate": 2.6491372030236815e-06, "loss": 0.80703473, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.23950195, "step": 6868, "time_per_iteration": 2.9100170135498047 }, { "auxiliary_loss_clip": 0.01246431, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.13838422, "balance_loss_mlp": 1.0170939, "epoch": 0.4129866225762814, "flos": 65439715253760.0, "grad_norm": 0.877080018897909, "language_loss": 0.57936358, "learning_rate": 2.64876881365164e-06, "loss": 0.60226202, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.26367188, "step": 6869, "time_per_iteration": 3.06510591506958 }, { "auxiliary_loss_clip": 0.01473092, "auxiliary_loss_mlp": 0.01047068, "balance_loss_clip": 1.29077935, "balance_loss_mlp": 1.02311838, "epoch": 0.41304674582894935, "flos": 28888832580480.0, "grad_norm": 1.7110666604344338, "language_loss": 0.75332403, "learning_rate": 2.64840039967822e-06, "loss": 0.77852559, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.23937988, "step": 6870, "time_per_iteration": 4.305445194244385 }, { "auxiliary_loss_clip": 0.01478413, "auxiliary_loss_mlp": 0.01052422, "balance_loss_clip": 1.29201114, "balance_loss_mlp": 1.02972436, "epoch": 0.4131068690816173, "flos": 22901913164160.0, "grad_norm": 2.344985545419534, "language_loss": 0.84253383, "learning_rate": 2.6480319611173912e-06, "loss": 0.8678422, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.22680664, "step": 6871, "time_per_iteration": 2.8669683933258057 }, { "auxiliary_loss_clip": 0.01492147, "auxiliary_loss_mlp": 0.01050594, "balance_loss_clip": 1.30457711, "balance_loss_mlp": 1.02690744, "epoch": 0.4131669923342853, "flos": 26075673976320.0, "grad_norm": 1.9893437650736632, "language_loss": 0.69348121, "learning_rate": 2.6476634979831263e-06, "loss": 0.71890855, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 1.87304688, "router_z_loss_mlp": 0.23669434, "step": 6872, "time_per_iteration": 4.267404556274414 }, { "auxiliary_loss_clip": 0.01490902, "auxiliary_loss_mlp": 0.01047025, "balance_loss_clip": 1.30498171, "balance_loss_mlp": 1.02367187, "epoch": 0.41322711558695324, "flos": 19253858664960.0, "grad_norm": 1.963146595009174, "language_loss": 0.76522815, "learning_rate": 2.6472950102893964e-06, "loss": 0.79060739, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.23376465, "step": 6873, "time_per_iteration": 4.210752010345459 }, { "auxiliary_loss_clip": 0.01492228, "auxiliary_loss_mlp": 0.01046203, "balance_loss_clip": 1.30374336, "balance_loss_mlp": 1.02380359, "epoch": 0.4132872388396212, "flos": 22684710286080.0, "grad_norm": 2.278591653059523, "language_loss": 0.8436712, "learning_rate": 2.6469264980501746e-06, "loss": 0.86905551, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.22387695, "step": 6874, "time_per_iteration": 2.830596685409546 }, { "auxiliary_loss_clip": 0.01491447, "auxiliary_loss_mlp": 0.01049935, "balance_loss_clip": 1.30343723, "balance_loss_mlp": 1.02653468, "epoch": 0.4133473620922892, "flos": 20158033628160.0, "grad_norm": 1.958956667522632, "language_loss": 0.72196603, "learning_rate": 2.646557961279436e-06, "loss": 0.74737984, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.23413086, "step": 6875, "time_per_iteration": 2.830190420150757 }, { "auxiliary_loss_clip": 0.01460331, "auxiliary_loss_mlp": 0.01051017, "balance_loss_clip": 1.28374803, "balance_loss_mlp": 1.03002429, "epoch": 0.41340748534495714, "flos": 24253252917120.0, "grad_norm": 1.531374672648363, "language_loss": 0.83347952, "learning_rate": 2.646189399991154e-06, "loss": 0.85859305, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.20996094, "step": 6876, "time_per_iteration": 2.872391939163208 }, { "auxiliary_loss_clip": 0.01493477, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.30371583, "balance_loss_mlp": 1.02099931, "epoch": 0.41346760859762516, "flos": 14400261717120.0, "grad_norm": 2.3666888319883275, "language_loss": 0.67204446, "learning_rate": 2.6458208141993048e-06, "loss": 0.6974138, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 1.89648438, "router_z_loss_mlp": 0.22460938, "step": 6877, "time_per_iteration": 2.8142096996307373 }, { "auxiliary_loss_clip": 0.0148267, "auxiliary_loss_mlp": 0.01041894, "balance_loss_clip": 1.29741311, "balance_loss_mlp": 1.01955414, "epoch": 0.4135277318502931, "flos": 22502192186880.0, "grad_norm": 2.151678750815433, "language_loss": 0.77510905, "learning_rate": 2.6454522039178668e-06, "loss": 0.8003546, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.22338867, "step": 6878, "time_per_iteration": 2.972215414047241 }, { "auxiliary_loss_clip": 0.0147639, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.29042625, "balance_loss_mlp": 1.02071917, "epoch": 0.4135878551029611, "flos": 22428841086720.0, "grad_norm": 2.1443786362273833, "language_loss": 0.8104406, "learning_rate": 2.6450835691608154e-06, "loss": 0.83563578, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.22399902, "step": 6879, "time_per_iteration": 2.9080541133880615 }, { "auxiliary_loss_clip": 0.01482621, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.29728162, "balance_loss_mlp": 1.0139426, "epoch": 0.41364797835562905, "flos": 27064737463680.0, "grad_norm": 1.6899336145076975, "language_loss": 0.8556205, "learning_rate": 2.6447149099421315e-06, "loss": 0.88080275, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21679688, "step": 6880, "time_per_iteration": 2.9099271297454834 }, { "auxiliary_loss_clip": 0.01497324, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.31036496, "balance_loss_mlp": 1.01216626, "epoch": 0.413708101608297, "flos": 22978250421120.0, "grad_norm": 1.6306094437850742, "language_loss": 0.71441436, "learning_rate": 2.6443462262757927e-06, "loss": 0.73971725, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.20776367, "step": 6881, "time_per_iteration": 2.8861281871795654 }, { "auxiliary_loss_clip": 0.01472447, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.29374957, "balance_loss_mlp": 1.01932311, "epoch": 0.413768224860965, "flos": 13341783427200.0, "grad_norm": 1.9254513433758464, "language_loss": 0.82200646, "learning_rate": 2.6439775181757805e-06, "loss": 0.84714204, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21789551, "step": 6882, "time_per_iteration": 2.8194236755371094 }, { "auxiliary_loss_clip": 0.01506984, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.31642306, "balance_loss_mlp": 1.0172137, "epoch": 0.41382834811363295, "flos": 20823803884800.0, "grad_norm": 2.39409409242022, "language_loss": 0.70973873, "learning_rate": 2.643608785656077e-06, "loss": 0.73521245, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.23168945, "step": 6883, "time_per_iteration": 2.849210500717163 }, { "auxiliary_loss_clip": 0.01488981, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.30304372, "balance_loss_mlp": 1.01460969, "epoch": 0.4138884713663009, "flos": 20676694481280.0, "grad_norm": 2.2724040049877807, "language_loss": 0.7598722, "learning_rate": 2.643240028730663e-06, "loss": 0.78512436, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.21618652, "step": 6884, "time_per_iteration": 2.847496271133423 }, { "auxiliary_loss_clip": 0.0149666, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.30699754, "balance_loss_mlp": 1.01798332, "epoch": 0.4139485946189689, "flos": 29067776340480.0, "grad_norm": 1.4649851508786178, "language_loss": 0.76812267, "learning_rate": 2.642871247413523e-06, "loss": 0.7935009, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.23181152, "step": 6885, "time_per_iteration": 2.9419925212860107 }, { "auxiliary_loss_clip": 0.01480984, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.29431677, "balance_loss_mlp": 1.01791775, "epoch": 0.41400871787163684, "flos": 24436404443520.0, "grad_norm": 2.2244528028441044, "language_loss": 0.70695424, "learning_rate": 2.6425024417186414e-06, "loss": 0.73215491, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.21154785, "step": 6886, "time_per_iteration": 2.8810081481933594 }, { "auxiliary_loss_clip": 0.01500476, "auxiliary_loss_mlp": 0.01035708, "balance_loss_clip": 1.31193352, "balance_loss_mlp": 1.01361871, "epoch": 0.4140688411243048, "flos": 19473550007040.0, "grad_norm": 1.5063437157537314, "language_loss": 0.76156199, "learning_rate": 2.642133611660002e-06, "loss": 0.78692383, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.22094727, "step": 6887, "time_per_iteration": 2.8531312942504883 }, { "auxiliary_loss_clip": 0.0147587, "auxiliary_loss_mlp": 0.0103867, "balance_loss_clip": 1.29134655, "balance_loss_mlp": 1.01680744, "epoch": 0.4141289643769728, "flos": 19321961368320.0, "grad_norm": 2.0862045365132196, "language_loss": 0.71131694, "learning_rate": 2.641764757251592e-06, "loss": 0.73646235, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.21850586, "step": 6888, "time_per_iteration": 2.7990520000457764 }, { "auxiliary_loss_clip": 0.01479406, "auxiliary_loss_mlp": 0.01044267, "balance_loss_clip": 1.29449272, "balance_loss_mlp": 1.02128303, "epoch": 0.41418908762964074, "flos": 16735507050240.0, "grad_norm": 1.8321557567320765, "language_loss": 0.76738489, "learning_rate": 2.6413958785073976e-06, "loss": 0.79262161, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22998047, "step": 6889, "time_per_iteration": 2.7860748767852783 }, { "auxiliary_loss_clip": 0.01482644, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.29830194, "balance_loss_mlp": 1.01754928, "epoch": 0.41424921088230876, "flos": 25306573299840.0, "grad_norm": 1.6306885033708345, "language_loss": 0.80409825, "learning_rate": 2.6410269754414074e-06, "loss": 0.82933569, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.23547363, "step": 6890, "time_per_iteration": 2.8611912727355957 }, { "auxiliary_loss_clip": 0.01465247, "auxiliary_loss_mlp": 0.01041765, "balance_loss_clip": 1.28518569, "balance_loss_mlp": 1.01912737, "epoch": 0.4143093341349767, "flos": 20970551329920.0, "grad_norm": 1.521467134705875, "language_loss": 0.74476725, "learning_rate": 2.6406580480676113e-06, "loss": 0.76983738, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.22631836, "step": 6891, "time_per_iteration": 2.8638806343078613 }, { "auxiliary_loss_clip": 0.01494331, "auxiliary_loss_mlp": 0.0104316, "balance_loss_clip": 1.30581307, "balance_loss_mlp": 1.01942563, "epoch": 0.4143694573876447, "flos": 22027762765440.0, "grad_norm": 2.244979056049151, "language_loss": 0.84873593, "learning_rate": 2.6402890963999963e-06, "loss": 0.87411082, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 1.88574219, "router_z_loss_mlp": 0.23754883, "step": 6892, "time_per_iteration": 2.874753713607788 }, { "auxiliary_loss_clip": 0.01465113, "auxiliary_loss_mlp": 0.01038307, "balance_loss_clip": 1.28558075, "balance_loss_mlp": 1.01570463, "epoch": 0.41442958064031266, "flos": 35710376423040.0, "grad_norm": 2.068802017375974, "language_loss": 0.70582175, "learning_rate": 2.6399201204525554e-06, "loss": 0.73085594, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.22607422, "step": 6893, "time_per_iteration": 2.9320008754730225 }, { "auxiliary_loss_clip": 0.01469798, "auxiliary_loss_mlp": 0.01039285, "balance_loss_clip": 1.28716445, "balance_loss_mlp": 1.01627803, "epoch": 0.4144897038929806, "flos": 28305598118400.0, "grad_norm": 2.1881171949293177, "language_loss": 0.73484409, "learning_rate": 2.639551120239279e-06, "loss": 0.7599349, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.23010254, "step": 6894, "time_per_iteration": 2.874009370803833 }, { "auxiliary_loss_clip": 0.01483006, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.29888237, "balance_loss_mlp": 1.01533818, "epoch": 0.4145498271456486, "flos": 11653486513920.0, "grad_norm": 2.467994921179865, "language_loss": 0.63668305, "learning_rate": 2.63918209577416e-06, "loss": 0.66188794, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22143555, "step": 6895, "time_per_iteration": 2.771549701690674 }, { "auxiliary_loss_clip": 0.01459426, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.27930975, "balance_loss_mlp": 1.01560819, "epoch": 0.41460995039831655, "flos": 27247165073280.0, "grad_norm": 1.4727994012119314, "language_loss": 0.71484256, "learning_rate": 2.638813047071192e-06, "loss": 0.73980701, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.2142334, "step": 6896, "time_per_iteration": 2.8818492889404297 }, { "auxiliary_loss_clip": 0.0148333, "auxiliary_loss_mlp": 0.01044976, "balance_loss_clip": 1.29839325, "balance_loss_mlp": 1.02280307, "epoch": 0.4146700736509845, "flos": 25933631990400.0, "grad_norm": 1.6286665016432573, "language_loss": 0.73116541, "learning_rate": 2.6384439741443696e-06, "loss": 0.75644845, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 1.84960938, "router_z_loss_mlp": 0.22167969, "step": 6897, "time_per_iteration": 2.890639543533325 }, { "auxiliary_loss_clip": 0.01482771, "auxiliary_loss_mlp": 0.01040744, "balance_loss_clip": 1.29896235, "balance_loss_mlp": 1.01883316, "epoch": 0.4147301969036525, "flos": 26844412694400.0, "grad_norm": 1.6545732595967189, "language_loss": 0.84912288, "learning_rate": 2.6380748770076873e-06, "loss": 0.874358, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.21911621, "step": 6898, "time_per_iteration": 4.335132360458374 }, { "auxiliary_loss_clip": 0.01477783, "auxiliary_loss_mlp": 0.01042309, "balance_loss_clip": 1.29238033, "balance_loss_mlp": 1.02005291, "epoch": 0.41479032015632045, "flos": 20306726599680.0, "grad_norm": 2.344241084866195, "language_loss": 0.75035512, "learning_rate": 2.6377057556751416e-06, "loss": 0.77555597, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.22241211, "step": 6899, "time_per_iteration": 2.9439637660980225 }, { "auxiliary_loss_clip": 0.01482776, "auxiliary_loss_mlp": 0.01037831, "balance_loss_clip": 1.29513884, "balance_loss_mlp": 1.01559889, "epoch": 0.4148504434089884, "flos": 25275915308160.0, "grad_norm": 1.6920190875141417, "language_loss": 0.76585865, "learning_rate": 2.6373366101607306e-06, "loss": 0.79106474, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.22216797, "step": 6900, "time_per_iteration": 2.947110652923584 }, { "auxiliary_loss_clip": 0.01458399, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.27707088, "balance_loss_mlp": 1.01271605, "epoch": 0.4149105666616564, "flos": 12829728314880.0, "grad_norm": 2.70312924849219, "language_loss": 0.82069004, "learning_rate": 2.6369674404784503e-06, "loss": 0.84562099, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.2199707, "step": 6901, "time_per_iteration": 2.816500663757324 }, { "auxiliary_loss_clip": 0.01452396, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.27164912, "balance_loss_mlp": 1.01165962, "epoch": 0.41497068991432434, "flos": 16772635048320.0, "grad_norm": 1.6240311830311718, "language_loss": 0.70454144, "learning_rate": 2.6365982466423014e-06, "loss": 0.72940129, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.21960449, "step": 6902, "time_per_iteration": 2.8383026123046875 }, { "auxiliary_loss_clip": 0.01458264, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.28006101, "balance_loss_mlp": 1.01651514, "epoch": 0.4150308131669923, "flos": 18009061712640.0, "grad_norm": 1.6322678504652326, "language_loss": 0.84044623, "learning_rate": 2.6362290286662834e-06, "loss": 0.86540002, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.20605469, "step": 6903, "time_per_iteration": 2.825498104095459 }, { "auxiliary_loss_clip": 0.01477709, "auxiliary_loss_mlp": 0.01035113, "balance_loss_clip": 1.29021764, "balance_loss_mlp": 1.01184368, "epoch": 0.41509093641966033, "flos": 30056975562240.0, "grad_norm": 2.099364021373867, "language_loss": 0.68739021, "learning_rate": 2.6358597865643968e-06, "loss": 0.71251845, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 1.87402344, "router_z_loss_mlp": 0.23254395, "step": 6904, "time_per_iteration": 2.927365303039551 }, { "auxiliary_loss_clip": 0.01477477, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.29093623, "balance_loss_mlp": 1.01308894, "epoch": 0.4151510596723283, "flos": 24290697628800.0, "grad_norm": 1.5856569311533761, "language_loss": 0.78396785, "learning_rate": 2.635490520350643e-06, "loss": 0.80908179, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.20825195, "step": 6905, "time_per_iteration": 4.293271541595459 }, { "auxiliary_loss_clip": 0.0148135, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.29516649, "balance_loss_mlp": 1.01338279, "epoch": 0.41521118292499626, "flos": 23486414480640.0, "grad_norm": 2.2893323212753716, "language_loss": 0.69413459, "learning_rate": 2.635121230039025e-06, "loss": 0.71929634, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.21447754, "step": 6906, "time_per_iteration": 4.284117937088013 }, { "auxiliary_loss_clip": 0.0147293, "auxiliary_loss_mlp": 0.0103805, "balance_loss_clip": 1.29129636, "balance_loss_mlp": 1.0170933, "epoch": 0.4152713061776642, "flos": 22135165217280.0, "grad_norm": 2.3240208409198133, "language_loss": 0.68714958, "learning_rate": 2.6347519156435467e-06, "loss": 0.71225935, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20959473, "step": 6907, "time_per_iteration": 2.804215669631958 }, { "auxiliary_loss_clip": 0.01484526, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.29960179, "balance_loss_mlp": 1.0150826, "epoch": 0.4153314294303322, "flos": 21261241042560.0, "grad_norm": 1.6852922140142488, "language_loss": 0.77666736, "learning_rate": 2.6343825771782123e-06, "loss": 0.80187309, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.2097168, "step": 6908, "time_per_iteration": 4.2858452796936035 }, { "auxiliary_loss_clip": 0.01252653, "auxiliary_loss_mlp": 0.01028903, "balance_loss_clip": 1.14677954, "balance_loss_mlp": 1.00143743, "epoch": 0.41539155268300015, "flos": 57949794938880.0, "grad_norm": 0.7771906642726056, "language_loss": 0.64911181, "learning_rate": 2.634013214657026e-06, "loss": 0.67192733, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.27539062, "step": 6909, "time_per_iteration": 3.394258737564087 }, { "auxiliary_loss_clip": 0.01461123, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.28090119, "balance_loss_mlp": 1.01681566, "epoch": 0.4154516759356681, "flos": 21912532963200.0, "grad_norm": 1.4477332482717022, "language_loss": 0.87779993, "learning_rate": 2.633643828093996e-06, "loss": 0.90278828, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.20898438, "step": 6910, "time_per_iteration": 2.868013858795166 }, { "auxiliary_loss_clip": 0.01255988, "auxiliary_loss_mlp": 0.01048283, "balance_loss_clip": 1.14746332, "balance_loss_mlp": 1.02158022, "epoch": 0.4155117991883361, "flos": 67862473309440.0, "grad_norm": 0.8458552621962913, "language_loss": 0.62203991, "learning_rate": 2.633274417503128e-06, "loss": 0.64508259, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 0.26757812, "step": 6911, "time_per_iteration": 3.2872982025146484 }, { "auxiliary_loss_clip": 0.0151258, "auxiliary_loss_mlp": 0.01043159, "balance_loss_clip": 1.3214916, "balance_loss_mlp": 1.02245259, "epoch": 0.41557192244100405, "flos": 14290732759680.0, "grad_norm": 2.671473350303384, "language_loss": 0.88278198, "learning_rate": 2.6329049828984312e-06, "loss": 0.90833938, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.20727539, "step": 6912, "time_per_iteration": 2.8015835285186768 }, { "auxiliary_loss_clip": 0.01486849, "auxiliary_loss_mlp": 0.0104264, "balance_loss_clip": 1.3021158, "balance_loss_mlp": 1.0230062, "epoch": 0.415632045693672, "flos": 24472582300800.0, "grad_norm": 1.9934335804515464, "language_loss": 0.64108634, "learning_rate": 2.632535524293914e-06, "loss": 0.66638124, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.19628906, "step": 6913, "time_per_iteration": 2.8591151237487793 }, { "auxiliary_loss_clip": 0.01472271, "auxiliary_loss_mlp": 0.01046728, "balance_loss_clip": 1.29105484, "balance_loss_mlp": 1.02592635, "epoch": 0.41569216894634, "flos": 20123529828480.0, "grad_norm": 1.702501783458402, "language_loss": 0.76291573, "learning_rate": 2.632166041703586e-06, "loss": 0.78810573, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.20812988, "step": 6914, "time_per_iteration": 2.86930251121521 }, { "auxiliary_loss_clip": 0.01481192, "auxiliary_loss_mlp": 0.01054675, "balance_loss_clip": 1.29648399, "balance_loss_mlp": 1.03314602, "epoch": 0.41575229219900794, "flos": 23807626450560.0, "grad_norm": 51.10460125866058, "language_loss": 0.88122994, "learning_rate": 2.631796535141458e-06, "loss": 0.90658867, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.2154541, "step": 6915, "time_per_iteration": 2.8269529342651367 }, { "auxiliary_loss_clip": 0.01477652, "auxiliary_loss_mlp": 0.0105588, "balance_loss_clip": 1.29412389, "balance_loss_mlp": 1.0346489, "epoch": 0.4158124154516759, "flos": 23117532474240.0, "grad_norm": 1.8723635601343491, "language_loss": 0.72166598, "learning_rate": 2.6314270046215426e-06, "loss": 0.74700129, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.21228027, "step": 6916, "time_per_iteration": 2.872739553451538 }, { "auxiliary_loss_clip": 0.01487386, "auxiliary_loss_mlp": 0.01067389, "balance_loss_clip": 1.30050254, "balance_loss_mlp": 1.0457046, "epoch": 0.41587253870434393, "flos": 24253524385920.0, "grad_norm": 1.9128601953372484, "language_loss": 0.73042518, "learning_rate": 2.631057450157852e-06, "loss": 0.75597292, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.21691895, "step": 6917, "time_per_iteration": 2.8505101203918457 }, { "auxiliary_loss_clip": 0.01475226, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.29219627, "balance_loss_mlp": 1.04331017, "epoch": 0.4159326619570119, "flos": 23892967422720.0, "grad_norm": 1.4375623847442527, "language_loss": 0.8136549, "learning_rate": 2.6306878717643988e-06, "loss": 0.83905512, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.21472168, "step": 6918, "time_per_iteration": 2.9246771335601807 }, { "auxiliary_loss_clip": 0.01487141, "auxiliary_loss_mlp": 0.01062799, "balance_loss_clip": 1.29998672, "balance_loss_mlp": 1.04072142, "epoch": 0.41599278520967986, "flos": 40641713216640.0, "grad_norm": 1.3686734367103763, "language_loss": 0.70739472, "learning_rate": 2.6303182694551995e-06, "loss": 0.73289412, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.2208252, "step": 6919, "time_per_iteration": 3.016178607940674 }, { "auxiliary_loss_clip": 0.01474998, "auxiliary_loss_mlp": 0.01057138, "balance_loss_clip": 1.2909143, "balance_loss_mlp": 1.03491759, "epoch": 0.4160529084623478, "flos": 18231965435520.0, "grad_norm": 2.386813292630589, "language_loss": 0.82383597, "learning_rate": 2.6299486432442677e-06, "loss": 0.84915739, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.22229004, "step": 6920, "time_per_iteration": 2.8413913249969482 }, { "auxiliary_loss_clip": 0.01484151, "auxiliary_loss_mlp": 0.01056841, "balance_loss_clip": 1.29764628, "balance_loss_mlp": 1.03550267, "epoch": 0.4161130317150158, "flos": 13669736872320.0, "grad_norm": 2.044282077591011, "language_loss": 0.66650283, "learning_rate": 2.6295789931456195e-06, "loss": 0.69191277, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 1.86425781, "router_z_loss_mlp": 0.21350098, "step": 6921, "time_per_iteration": 2.8346188068389893 }, { "auxiliary_loss_clip": 0.01474566, "auxiliary_loss_mlp": 0.01061958, "balance_loss_clip": 1.2917937, "balance_loss_mlp": 1.04145384, "epoch": 0.41617315496768376, "flos": 16186685898240.0, "grad_norm": 2.1138902007449554, "language_loss": 0.81754172, "learning_rate": 2.629209319173274e-06, "loss": 0.84290695, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.20507812, "step": 6922, "time_per_iteration": 2.802933931350708 }, { "auxiliary_loss_clip": 0.01483414, "auxiliary_loss_mlp": 0.01056405, "balance_loss_clip": 1.29789782, "balance_loss_mlp": 1.03493512, "epoch": 0.4162332782203517, "flos": 26224412192640.0, "grad_norm": 1.7558697755764596, "language_loss": 0.67999846, "learning_rate": 2.628839621341247e-06, "loss": 0.70539665, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21484375, "step": 6923, "time_per_iteration": 2.9095559120178223 }, { "auxiliary_loss_clip": 0.01477531, "auxiliary_loss_mlp": 0.0105999, "balance_loss_clip": 1.29531956, "balance_loss_mlp": 1.03823423, "epoch": 0.4162934014730197, "flos": 28195707202560.0, "grad_norm": 2.285550320669461, "language_loss": 0.76605278, "learning_rate": 2.6284698996635593e-06, "loss": 0.79142797, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.2175293, "step": 6924, "time_per_iteration": 2.9351460933685303 }, { "auxiliary_loss_clip": 0.01478628, "auxiliary_loss_mlp": 0.01051322, "balance_loss_clip": 1.29323554, "balance_loss_mlp": 1.02992415, "epoch": 0.41635352472568765, "flos": 19874899797120.0, "grad_norm": 1.6550886337975523, "language_loss": 0.74146307, "learning_rate": 2.62810015415423e-06, "loss": 0.76676255, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.21386719, "step": 6925, "time_per_iteration": 2.865053176879883 }, { "auxiliary_loss_clip": 0.01462434, "auxiliary_loss_mlp": 0.01044292, "balance_loss_clip": 1.27908087, "balance_loss_mlp": 1.02294183, "epoch": 0.4164136479783556, "flos": 14942522373120.0, "grad_norm": 1.909725537250393, "language_loss": 0.85100877, "learning_rate": 2.6277303848272792e-06, "loss": 0.87607598, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.21337891, "step": 6926, "time_per_iteration": 2.852017879486084 }, { "auxiliary_loss_clip": 0.01453261, "auxiliary_loss_mlp": 0.01040208, "balance_loss_clip": 1.27353692, "balance_loss_mlp": 1.01923871, "epoch": 0.4164737712310236, "flos": 21766373700480.0, "grad_norm": 1.5604733655339171, "language_loss": 0.86753726, "learning_rate": 2.6273605916967302e-06, "loss": 0.89247197, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.2097168, "step": 6927, "time_per_iteration": 2.854295253753662 }, { "auxiliary_loss_clip": 0.0146678, "auxiliary_loss_mlp": 0.01040521, "balance_loss_clip": 1.2844975, "balance_loss_mlp": 1.01826477, "epoch": 0.41653389448369155, "flos": 20749728867840.0, "grad_norm": 2.233191990431906, "language_loss": 0.73933947, "learning_rate": 2.626990774776604e-06, "loss": 0.76441246, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22277832, "step": 6928, "time_per_iteration": 2.8461570739746094 }, { "auxiliary_loss_clip": 0.0146041, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.27751517, "balance_loss_mlp": 1.01884186, "epoch": 0.4165940177363595, "flos": 24983687272320.0, "grad_norm": 1.9142636850439767, "language_loss": 0.79323101, "learning_rate": 2.6266209340809254e-06, "loss": 0.81823277, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.20922852, "step": 6929, "time_per_iteration": 2.8861517906188965 }, { "auxiliary_loss_clip": 0.0145235, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.27221715, "balance_loss_mlp": 1.01437068, "epoch": 0.41665414098902753, "flos": 20531394869760.0, "grad_norm": 1.865926894726875, "language_loss": 0.71662635, "learning_rate": 2.6262510696237182e-06, "loss": 0.74150181, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.20812988, "step": 6930, "time_per_iteration": 2.8529043197631836 }, { "auxiliary_loss_clip": 0.01474129, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.28860402, "balance_loss_mlp": 1.01898062, "epoch": 0.4167142642416955, "flos": 19692472187520.0, "grad_norm": 1.8546738627967763, "language_loss": 0.82375193, "learning_rate": 2.625881181419007e-06, "loss": 0.84889603, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.21313477, "step": 6931, "time_per_iteration": 2.8160171508789062 }, { "auxiliary_loss_clip": 0.01444953, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.26467752, "balance_loss_mlp": 1.01499426, "epoch": 0.41677438749436346, "flos": 23772896426880.0, "grad_norm": 2.161705083668155, "language_loss": 0.79546475, "learning_rate": 2.6255112694808193e-06, "loss": 0.82026929, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.20495605, "step": 6932, "time_per_iteration": 2.835158586502075 }, { "auxiliary_loss_clip": 0.014636, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.27967048, "balance_loss_mlp": 1.0153985, "epoch": 0.41683451074703143, "flos": 30422464208640.0, "grad_norm": 20.986902689540873, "language_loss": 0.83089423, "learning_rate": 2.6251413338231813e-06, "loss": 0.85590291, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21887207, "step": 6933, "time_per_iteration": 4.368880987167358 }, { "auxiliary_loss_clip": 0.01484847, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.29705071, "balance_loss_mlp": 1.01315594, "epoch": 0.4168946339996994, "flos": 21516657793920.0, "grad_norm": 1.9328715218891803, "language_loss": 0.78258759, "learning_rate": 2.624771374460121e-06, "loss": 0.80777776, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.21008301, "step": 6934, "time_per_iteration": 2.8357813358306885 }, { "auxiliary_loss_clip": 0.01470189, "auxiliary_loss_mlp": 0.01036224, "balance_loss_clip": 1.28683448, "balance_loss_mlp": 1.01508796, "epoch": 0.41695475725236736, "flos": 17647418874240.0, "grad_norm": 2.2453886195172106, "language_loss": 0.67586637, "learning_rate": 2.624401391405668e-06, "loss": 0.70093048, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21130371, "step": 6935, "time_per_iteration": 2.8146512508392334 }, { "auxiliary_loss_clip": 0.01454667, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.27430534, "balance_loss_mlp": 1.02077413, "epoch": 0.4170148805050353, "flos": 15677254984320.0, "grad_norm": 2.0788519493053337, "language_loss": 0.74133217, "learning_rate": 2.6240313846738513e-06, "loss": 0.76629949, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.21289062, "step": 6936, "time_per_iteration": 2.8332698345184326 }, { "auxiliary_loss_clip": 0.01449274, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.27065301, "balance_loss_mlp": 1.01611495, "epoch": 0.4170750037577033, "flos": 15167643091200.0, "grad_norm": 2.1428575130020566, "language_loss": 0.74944675, "learning_rate": 2.6236613542787024e-06, "loss": 0.77431071, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21020508, "step": 6937, "time_per_iteration": 2.845737934112549 }, { "auxiliary_loss_clip": 0.01467997, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.28589821, "balance_loss_mlp": 1.0216701, "epoch": 0.41713512701037125, "flos": 28780299008640.0, "grad_norm": 1.4530718571695742, "language_loss": 0.84988558, "learning_rate": 2.6232913002342518e-06, "loss": 0.87498617, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.20373535, "step": 6938, "time_per_iteration": 2.948174238204956 }, { "auxiliary_loss_clip": 0.01478745, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.29159498, "balance_loss_mlp": 1.01473045, "epoch": 0.4171952502630392, "flos": 28268741589120.0, "grad_norm": 1.8487419512897938, "language_loss": 0.7521016, "learning_rate": 2.6229212225545334e-06, "loss": 0.77725267, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.21630859, "step": 6939, "time_per_iteration": 2.9203481674194336 }, { "auxiliary_loss_clip": 0.01467879, "auxiliary_loss_mlp": 0.01039927, "balance_loss_clip": 1.28714681, "balance_loss_mlp": 1.01871967, "epoch": 0.4172553735157072, "flos": 24582427971840.0, "grad_norm": 2.7700366780817016, "language_loss": 0.76045078, "learning_rate": 2.622551121253579e-06, "loss": 0.78552884, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21203613, "step": 6940, "time_per_iteration": 4.357048273086548 }, { "auxiliary_loss_clip": 0.01481827, "auxiliary_loss_mlp": 0.01039044, "balance_loss_clip": 1.29697824, "balance_loss_mlp": 1.01782513, "epoch": 0.41731549676837515, "flos": 27055371790080.0, "grad_norm": 1.8408681868401786, "language_loss": 0.72253007, "learning_rate": 2.622180996345424e-06, "loss": 0.74773878, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.21203613, "step": 6941, "time_per_iteration": 2.895787477493286 }, { "auxiliary_loss_clip": 0.01476224, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.2907238, "balance_loss_mlp": 1.01886046, "epoch": 0.4173756200210431, "flos": 28403544407040.0, "grad_norm": 2.013371769581459, "language_loss": 0.75155663, "learning_rate": 2.621810847844104e-06, "loss": 0.77671766, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.21032715, "step": 6942, "time_per_iteration": 4.377228021621704 }, { "auxiliary_loss_clip": 0.0148712, "auxiliary_loss_mlp": 0.01041561, "balance_loss_clip": 1.29919624, "balance_loss_mlp": 1.01863718, "epoch": 0.41743574327371114, "flos": 22529954511360.0, "grad_norm": 1.956096788048457, "language_loss": 0.73660433, "learning_rate": 2.6214406757636534e-06, "loss": 0.76189113, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22937012, "step": 6943, "time_per_iteration": 4.1981213092803955 }, { "auxiliary_loss_clip": 0.0148145, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.29667068, "balance_loss_mlp": 1.0184387, "epoch": 0.4174958665263791, "flos": 30124535328000.0, "grad_norm": 1.7126117407783177, "language_loss": 0.64420986, "learning_rate": 2.621070480118111e-06, "loss": 0.66942728, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.21838379, "step": 6944, "time_per_iteration": 2.9117696285247803 }, { "auxiliary_loss_clip": 0.01470877, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.28672457, "balance_loss_mlp": 1.01507044, "epoch": 0.41755598977904707, "flos": 25273969781760.0, "grad_norm": 1.4298131554761693, "language_loss": 0.70252073, "learning_rate": 2.620700260921513e-06, "loss": 0.72758704, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.20678711, "step": 6945, "time_per_iteration": 2.8991613388061523 }, { "auxiliary_loss_clip": 0.01469244, "auxiliary_loss_mlp": 0.01039206, "balance_loss_clip": 1.28652608, "balance_loss_mlp": 1.01653218, "epoch": 0.41761611303171503, "flos": 19838088512640.0, "grad_norm": 2.349867726684987, "language_loss": 0.8195315, "learning_rate": 2.620330018187899e-06, "loss": 0.844616, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.22692871, "step": 6946, "time_per_iteration": 2.826840400695801 }, { "auxiliary_loss_clip": 0.01466835, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.28528857, "balance_loss_mlp": 1.01265335, "epoch": 0.417676236284383, "flos": 15531638659200.0, "grad_norm": 2.2101053592946225, "language_loss": 0.78553551, "learning_rate": 2.6199597519313086e-06, "loss": 0.81053078, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20031738, "step": 6947, "time_per_iteration": 2.814042091369629 }, { "auxiliary_loss_clip": 0.01483151, "auxiliary_loss_mlp": 0.01041072, "balance_loss_clip": 1.30036557, "balance_loss_mlp": 1.01996052, "epoch": 0.41773635953705096, "flos": 32536977569280.0, "grad_norm": 1.9046167915130512, "language_loss": 0.72204638, "learning_rate": 2.6195894621657825e-06, "loss": 0.74728864, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21130371, "step": 6948, "time_per_iteration": 2.9695961475372314 }, { "auxiliary_loss_clip": 0.01463306, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.28247499, "balance_loss_mlp": 1.0124588, "epoch": 0.4177964827897189, "flos": 23451458232960.0, "grad_norm": 1.4846338326293698, "language_loss": 0.77570128, "learning_rate": 2.619219148905362e-06, "loss": 0.80067354, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.21459961, "step": 6949, "time_per_iteration": 2.9076483249664307 }, { "auxiliary_loss_clip": 0.01489484, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.3012712, "balance_loss_mlp": 1.01815557, "epoch": 0.4178566060423869, "flos": 22759554464640.0, "grad_norm": 1.5961192244066145, "language_loss": 0.82606208, "learning_rate": 2.6188488121640888e-06, "loss": 0.85136175, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.22314453, "step": 6950, "time_per_iteration": 2.868067979812622 }, { "auxiliary_loss_clip": 0.01461476, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.2837944, "balance_loss_mlp": 1.01698232, "epoch": 0.41791672929505486, "flos": 26044337312640.0, "grad_norm": 1.2995413972626246, "language_loss": 0.76720595, "learning_rate": 2.618478451956007e-06, "loss": 0.79219472, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.20410156, "step": 6951, "time_per_iteration": 2.9133458137512207 }, { "auxiliary_loss_clip": 0.01496025, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.30617118, "balance_loss_mlp": 1.01491904, "epoch": 0.4179768525477228, "flos": 19576970916480.0, "grad_norm": 2.072155769672736, "language_loss": 0.73949558, "learning_rate": 2.61810806829516e-06, "loss": 0.76481748, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 1.90039062, "router_z_loss_mlp": 0.21240234, "step": 6952, "time_per_iteration": 2.913940668106079 }, { "auxiliary_loss_clip": 0.01488186, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.30328619, "balance_loss_mlp": 1.01903367, "epoch": 0.4180369758003908, "flos": 17792401772160.0, "grad_norm": 3.633599836318094, "language_loss": 0.72552836, "learning_rate": 2.617737661195593e-06, "loss": 0.75081336, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21276855, "step": 6953, "time_per_iteration": 2.840087652206421 }, { "auxiliary_loss_clip": 0.01463346, "auxiliary_loss_mlp": 0.01043201, "balance_loss_clip": 1.28504133, "balance_loss_mlp": 1.02133775, "epoch": 0.41809709905305875, "flos": 20970732309120.0, "grad_norm": 1.6945976652244814, "language_loss": 0.76557839, "learning_rate": 2.617367230671353e-06, "loss": 0.79064387, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21875, "step": 6954, "time_per_iteration": 2.8516314029693604 }, { "auxiliary_loss_clip": 0.01477502, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.29374623, "balance_loss_mlp": 1.01498151, "epoch": 0.4181572223057267, "flos": 22027672275840.0, "grad_norm": 9.045867496618138, "language_loss": 0.8536846, "learning_rate": 2.616996776736485e-06, "loss": 0.87884045, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.23120117, "step": 6955, "time_per_iteration": 2.841114044189453 }, { "auxiliary_loss_clip": 0.01472283, "auxiliary_loss_mlp": 0.01041824, "balance_loss_clip": 1.29160571, "balance_loss_mlp": 1.02068806, "epoch": 0.4182173455583947, "flos": 26255522632320.0, "grad_norm": 1.4754475660360569, "language_loss": 0.83965337, "learning_rate": 2.616626299405037e-06, "loss": 0.86479449, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.21142578, "step": 6956, "time_per_iteration": 2.875835418701172 }, { "auxiliary_loss_clip": 0.01483582, "auxiliary_loss_mlp": 0.01047647, "balance_loss_clip": 1.2974354, "balance_loss_mlp": 1.02544975, "epoch": 0.4182774688110627, "flos": 14799304022400.0, "grad_norm": 1.9607425705864332, "language_loss": 0.72399294, "learning_rate": 2.616255798691059e-06, "loss": 0.74930525, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.22192383, "step": 6957, "time_per_iteration": 2.8217484951019287 }, { "auxiliary_loss_clip": 0.01483802, "auxiliary_loss_mlp": 0.01047625, "balance_loss_clip": 1.29910171, "balance_loss_mlp": 1.0275147, "epoch": 0.41833759206373067, "flos": 20421594443520.0, "grad_norm": 2.3926090424247364, "language_loss": 0.76508433, "learning_rate": 2.6158852746085982e-06, "loss": 0.7903986, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.20117188, "step": 6958, "time_per_iteration": 2.840210199356079 }, { "auxiliary_loss_clip": 0.01475072, "auxiliary_loss_mlp": 0.01038185, "balance_loss_clip": 1.29263234, "balance_loss_mlp": 1.01634574, "epoch": 0.41839771531639863, "flos": 23665991667840.0, "grad_norm": 2.17587133155173, "language_loss": 0.77637887, "learning_rate": 2.6155147271717066e-06, "loss": 0.80151141, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.21862793, "step": 6959, "time_per_iteration": 2.891871929168701 }, { "auxiliary_loss_clip": 0.0146624, "auxiliary_loss_mlp": 0.01045702, "balance_loss_clip": 1.28369713, "balance_loss_mlp": 1.02308774, "epoch": 0.4184578385690666, "flos": 19763244334080.0, "grad_norm": 4.293041062202975, "language_loss": 0.77613914, "learning_rate": 2.6151441563944347e-06, "loss": 0.80125856, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.22595215, "step": 6960, "time_per_iteration": 2.8494880199432373 }, { "auxiliary_loss_clip": 0.01452702, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.27725089, "balance_loss_mlp": 1.01596236, "epoch": 0.41851796182173456, "flos": 20202988976640.0, "grad_norm": 1.9026062030936706, "language_loss": 0.76240593, "learning_rate": 2.614773562290835e-06, "loss": 0.78729528, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.20263672, "step": 6961, "time_per_iteration": 2.892587661743164 }, { "auxiliary_loss_clip": 0.01258455, "auxiliary_loss_mlp": 0.01042979, "balance_loss_clip": 1.14956069, "balance_loss_mlp": 1.02276075, "epoch": 0.41857808507440253, "flos": 59049111300480.0, "grad_norm": 0.8063364173127168, "language_loss": 0.54660928, "learning_rate": 2.61440294487496e-06, "loss": 0.56962359, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 0.20214844, "step": 6962, "time_per_iteration": 3.2956666946411133 }, { "auxiliary_loss_clip": 0.01481467, "auxiliary_loss_mlp": 0.01044099, "balance_loss_clip": 1.29758716, "balance_loss_mlp": 1.02214122, "epoch": 0.4186382083270705, "flos": 18488015614080.0, "grad_norm": 2.2766321197361776, "language_loss": 0.86465979, "learning_rate": 2.614032304160864e-06, "loss": 0.88991541, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.21972656, "step": 6963, "time_per_iteration": 2.810253143310547 }, { "auxiliary_loss_clip": 0.01472497, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.29071426, "balance_loss_mlp": 1.01959896, "epoch": 0.41869833157973846, "flos": 21588425326080.0, "grad_norm": 1.4629381714428389, "language_loss": 0.70801353, "learning_rate": 2.6136616401626014e-06, "loss": 0.73314714, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21264648, "step": 6964, "time_per_iteration": 2.879701852798462 }, { "auxiliary_loss_clip": 0.01465373, "auxiliary_loss_mlp": 0.01045158, "balance_loss_clip": 1.28472662, "balance_loss_mlp": 1.02435565, "epoch": 0.4187584548324064, "flos": 35530980215040.0, "grad_norm": 1.4481141809320326, "language_loss": 0.7142176, "learning_rate": 2.6132909528942273e-06, "loss": 0.7393229, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.20800781, "step": 6965, "time_per_iteration": 2.9654433727264404 }, { "auxiliary_loss_clip": 0.01462367, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.28465247, "balance_loss_mlp": 1.02035201, "epoch": 0.4188185780850744, "flos": 18663746993280.0, "grad_norm": 1.5963621322425618, "language_loss": 0.72639203, "learning_rate": 2.6129202423697997e-06, "loss": 0.75142413, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.20483398, "step": 6966, "time_per_iteration": 2.860440969467163 }, { "auxiliary_loss_clip": 0.01489192, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.30237877, "balance_loss_mlp": 1.02062654, "epoch": 0.41887870133774235, "flos": 40348218326400.0, "grad_norm": 2.1020135896272722, "language_loss": 0.71813226, "learning_rate": 2.612549508603375e-06, "loss": 0.74343115, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.20056152, "step": 6967, "time_per_iteration": 2.9962122440338135 }, { "auxiliary_loss_clip": 0.01260775, "auxiliary_loss_mlp": 0.0106169, "balance_loss_clip": 1.15326071, "balance_loss_mlp": 1.0415678, "epoch": 0.4189388245904103, "flos": 61397323908480.0, "grad_norm": 0.6927624031303851, "language_loss": 0.46415129, "learning_rate": 2.612178751609011e-06, "loss": 0.48737594, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.20117188, "step": 6968, "time_per_iteration": 4.790131568908691 }, { "auxiliary_loss_clip": 0.01491045, "auxiliary_loss_mlp": 0.01042645, "balance_loss_clip": 1.30477428, "balance_loss_mlp": 1.02118754, "epoch": 0.4189989478430783, "flos": 28226093725440.0, "grad_norm": 4.011384829411986, "language_loss": 0.7577309, "learning_rate": 2.6118079714007685e-06, "loss": 0.78306782, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.21472168, "step": 6969, "time_per_iteration": 2.972778081893921 }, { "auxiliary_loss_clip": 0.0147837, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.29505086, "balance_loss_mlp": 1.01767135, "epoch": 0.4190590710957463, "flos": 24575098314240.0, "grad_norm": 1.7767477510818215, "language_loss": 0.81700528, "learning_rate": 2.611437167992705e-06, "loss": 0.84216535, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.19970703, "step": 6970, "time_per_iteration": 2.8886237144470215 }, { "auxiliary_loss_clip": 0.01479819, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 1.29820085, "balance_loss_mlp": 1.0171324, "epoch": 0.41911919434841427, "flos": 21736077667200.0, "grad_norm": 2.0101854926999514, "language_loss": 0.83935928, "learning_rate": 2.6110663413988835e-06, "loss": 0.86454153, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21276855, "step": 6971, "time_per_iteration": 2.8384807109832764 }, { "auxiliary_loss_clip": 0.01464827, "auxiliary_loss_mlp": 0.010379, "balance_loss_clip": 1.28759885, "balance_loss_mlp": 1.01688385, "epoch": 0.41917931760108224, "flos": 17610109896960.0, "grad_norm": 2.0602187282357254, "language_loss": 0.76149315, "learning_rate": 2.6106954916333648e-06, "loss": 0.78652036, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.21020508, "step": 6972, "time_per_iteration": 2.8807787895202637 }, { "auxiliary_loss_clip": 0.01474144, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.29151726, "balance_loss_mlp": 1.01301336, "epoch": 0.4192394408537502, "flos": 37831043076480.0, "grad_norm": 2.068725423560535, "language_loss": 0.74054348, "learning_rate": 2.610324618710212e-06, "loss": 0.76562655, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.21142578, "step": 6973, "time_per_iteration": 2.9747424125671387 }, { "auxiliary_loss_clip": 0.01495943, "auxiliary_loss_mlp": 0.01051953, "balance_loss_clip": 1.30737257, "balance_loss_mlp": 1.03033996, "epoch": 0.41929956410641817, "flos": 23116899047040.0, "grad_norm": 2.207100165315424, "language_loss": 0.75391716, "learning_rate": 2.609953722643489e-06, "loss": 0.77939612, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 1.88769531, "router_z_loss_mlp": 0.21606445, "step": 6974, "time_per_iteration": 2.900017023086548 }, { "auxiliary_loss_clip": 0.01475565, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.29237962, "balance_loss_mlp": 1.01286292, "epoch": 0.41935968735908613, "flos": 22533709829760.0, "grad_norm": 1.8605671930701877, "language_loss": 0.73611575, "learning_rate": 2.609582803447259e-06, "loss": 0.76122117, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.22131348, "step": 6975, "time_per_iteration": 2.8255467414855957 }, { "auxiliary_loss_clip": 0.01477884, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.29735482, "balance_loss_mlp": 1.02058494, "epoch": 0.4194198106117541, "flos": 26881812161280.0, "grad_norm": 1.5106497502758174, "language_loss": 0.8132267, "learning_rate": 2.6092118611355885e-06, "loss": 0.83842152, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.21020508, "step": 6976, "time_per_iteration": 4.2981579303741455 }, { "auxiliary_loss_clip": 0.01474775, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.29079533, "balance_loss_mlp": 1.01307237, "epoch": 0.41947993386442206, "flos": 19912163529600.0, "grad_norm": 2.8812566517915332, "language_loss": 0.68811631, "learning_rate": 2.6088408957225425e-06, "loss": 0.71319997, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.20532227, "step": 6977, "time_per_iteration": 4.289769172668457 }, { "auxiliary_loss_clip": 0.01494509, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.3093555, "balance_loss_mlp": 1.01696265, "epoch": 0.41954005711709, "flos": 17392726039680.0, "grad_norm": 2.9154316667675504, "language_loss": 0.81678128, "learning_rate": 2.6084699072221898e-06, "loss": 0.84210104, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.20507812, "step": 6978, "time_per_iteration": 4.249008893966675 }, { "auxiliary_loss_clip": 0.01490809, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.30266118, "balance_loss_mlp": 1.01808226, "epoch": 0.419600180369758, "flos": 25012535472000.0, "grad_norm": 2.108662572138678, "language_loss": 0.83634329, "learning_rate": 2.6080988956485964e-06, "loss": 0.86164498, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.21276855, "step": 6979, "time_per_iteration": 2.860628843307495 }, { "auxiliary_loss_clip": 0.01470422, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.28805983, "balance_loss_mlp": 1.01661909, "epoch": 0.41966030362242596, "flos": 17392590305280.0, "grad_norm": 2.0394303736265296, "language_loss": 0.84031326, "learning_rate": 2.6077278610158325e-06, "loss": 0.86538851, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.20495605, "step": 6980, "time_per_iteration": 2.8149101734161377 }, { "auxiliary_loss_clip": 0.01493048, "auxiliary_loss_mlp": 0.01042595, "balance_loss_clip": 1.30573094, "balance_loss_mlp": 1.02124429, "epoch": 0.4197204268750939, "flos": 22164330130560.0, "grad_norm": 3.7157782841751343, "language_loss": 0.78961658, "learning_rate": 2.6073568033379665e-06, "loss": 0.814973, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.21350098, "step": 6981, "time_per_iteration": 2.8314263820648193 }, { "auxiliary_loss_clip": 0.01464816, "auxiliary_loss_mlp": 0.01033454, "balance_loss_clip": 1.28449512, "balance_loss_mlp": 1.01331997, "epoch": 0.4197805501277619, "flos": 22092517353600.0, "grad_norm": 1.659330205870222, "language_loss": 0.84737492, "learning_rate": 2.6069857226290696e-06, "loss": 0.87235761, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.20129395, "step": 6982, "time_per_iteration": 2.8203184604644775 }, { "auxiliary_loss_clip": 0.01492462, "auxiliary_loss_mlp": 0.01041528, "balance_loss_clip": 1.30451131, "balance_loss_mlp": 1.0191288, "epoch": 0.4198406733804299, "flos": 26443334373120.0, "grad_norm": 2.3397612971467714, "language_loss": 0.57896996, "learning_rate": 2.606614618903214e-06, "loss": 0.6043098, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 1.87792969, "router_z_loss_mlp": 0.22399902, "step": 6983, "time_per_iteration": 2.866570472717285 }, { "auxiliary_loss_clip": 0.01469857, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.28894353, "balance_loss_mlp": 1.01688838, "epoch": 0.4199007966330979, "flos": 12538540909440.0, "grad_norm": 1.9110522756041868, "language_loss": 0.82976985, "learning_rate": 2.606243492174471e-06, "loss": 0.85484564, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.20837402, "step": 6984, "time_per_iteration": 2.877274513244629 }, { "auxiliary_loss_clip": 0.01472555, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.29003048, "balance_loss_mlp": 1.01448059, "epoch": 0.41996091988576584, "flos": 21773115175680.0, "grad_norm": 3.029549289545675, "language_loss": 0.80100471, "learning_rate": 2.605872342456914e-06, "loss": 0.82608926, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.2142334, "step": 6985, "time_per_iteration": 2.882906436920166 }, { "auxiliary_loss_clip": 0.01492519, "auxiliary_loss_mlp": 0.01039031, "balance_loss_clip": 1.30203116, "balance_loss_mlp": 1.01702535, "epoch": 0.4200210431384338, "flos": 26553180044160.0, "grad_norm": 1.6346867830224485, "language_loss": 0.78715229, "learning_rate": 2.6055011697646173e-06, "loss": 0.81246775, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 1.90527344, "router_z_loss_mlp": 0.2199707, "step": 6986, "time_per_iteration": 2.8790605068206787 }, { "auxiliary_loss_clip": 0.01456653, "auxiliary_loss_mlp": 0.01040019, "balance_loss_clip": 1.27918267, "balance_loss_mlp": 1.01976514, "epoch": 0.42008116639110177, "flos": 26806741758720.0, "grad_norm": 1.5749747318342266, "language_loss": 0.72929549, "learning_rate": 2.605129974111655e-06, "loss": 0.75426221, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.20251465, "step": 6987, "time_per_iteration": 2.9014296531677246 }, { "auxiliary_loss_clip": 0.01489899, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.30509841, "balance_loss_mlp": 1.02000487, "epoch": 0.42014128964376973, "flos": 32099902369920.0, "grad_norm": 1.4226539290910547, "language_loss": 0.75629282, "learning_rate": 2.604758755512104e-06, "loss": 0.78161865, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22680664, "step": 6988, "time_per_iteration": 2.96189546585083 }, { "auxiliary_loss_clip": 0.01489384, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.30319643, "balance_loss_mlp": 1.01793051, "epoch": 0.4202014128964377, "flos": 26477883417600.0, "grad_norm": 1.610340073058688, "language_loss": 0.74859422, "learning_rate": 2.60438751398004e-06, "loss": 0.77388275, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 1.86230469, "router_z_loss_mlp": 0.2154541, "step": 6989, "time_per_iteration": 2.8682408332824707 }, { "auxiliary_loss_clip": 0.01486926, "auxiliary_loss_mlp": 0.01036394, "balance_loss_clip": 1.30002093, "balance_loss_mlp": 1.01387572, "epoch": 0.42026153614910566, "flos": 13407533400960.0, "grad_norm": 2.1342757526053613, "language_loss": 0.72604561, "learning_rate": 2.6040162495295404e-06, "loss": 0.75127888, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.2253418, "step": 6990, "time_per_iteration": 2.8554816246032715 }, { "auxiliary_loss_clip": 0.0123708, "auxiliary_loss_mlp": 0.01067683, "balance_loss_clip": 1.13369608, "balance_loss_mlp": 1.04584372, "epoch": 0.42032165940177363, "flos": 60278796489600.0, "grad_norm": 0.8340442210893206, "language_loss": 0.60463494, "learning_rate": 2.603644962174685e-06, "loss": 0.62768257, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.21875, "step": 6991, "time_per_iteration": 3.216486930847168 }, { "auxiliary_loss_clip": 0.01487786, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.30170858, "balance_loss_mlp": 1.01958942, "epoch": 0.4203817826544416, "flos": 24545933400960.0, "grad_norm": 1.7116787687561292, "language_loss": 0.83772886, "learning_rate": 2.6032736519295517e-06, "loss": 0.8630147, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.2121582, "step": 6992, "time_per_iteration": 2.9229466915130615 }, { "auxiliary_loss_clip": 0.01249257, "auxiliary_loss_mlp": 0.01028689, "balance_loss_clip": 1.13891459, "balance_loss_mlp": 1.00255859, "epoch": 0.42044190590710956, "flos": 58847002202880.0, "grad_norm": 0.8220601391908872, "language_loss": 0.65514743, "learning_rate": 2.6029023188082217e-06, "loss": 0.6779269, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 1.1015625, "router_z_loss_mlp": 0.26171875, "step": 6993, "time_per_iteration": 3.2792551517486572 }, { "auxiliary_loss_clip": 0.01505107, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.31368935, "balance_loss_mlp": 1.02253175, "epoch": 0.4205020291597775, "flos": 16444862582400.0, "grad_norm": 2.292474609036427, "language_loss": 0.84994435, "learning_rate": 2.6025309628247746e-06, "loss": 0.87544274, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 1.91308594, "router_z_loss_mlp": 0.2220459, "step": 6994, "time_per_iteration": 2.8137123584747314 }, { "auxiliary_loss_clip": 0.01465952, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.28613949, "balance_loss_mlp": 1.02260303, "epoch": 0.4205621524124455, "flos": 18414800248320.0, "grad_norm": 1.9413686532627623, "language_loss": 0.78479773, "learning_rate": 2.6021595839932934e-06, "loss": 0.80989957, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.21606445, "step": 6995, "time_per_iteration": 2.958728313446045 }, { "auxiliary_loss_clip": 0.01467161, "auxiliary_loss_mlp": 0.01038398, "balance_loss_clip": 1.28860712, "balance_loss_mlp": 1.01668978, "epoch": 0.4206222756651135, "flos": 25531015345920.0, "grad_norm": 1.684865431290551, "language_loss": 0.80347037, "learning_rate": 2.60178818232786e-06, "loss": 0.8285259, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.21728516, "step": 6996, "time_per_iteration": 2.919257640838623 }, { "auxiliary_loss_clip": 0.01482464, "auxiliary_loss_mlp": 0.01036655, "balance_loss_clip": 1.29870546, "balance_loss_mlp": 1.01570988, "epoch": 0.4206823989177815, "flos": 15312671233920.0, "grad_norm": 2.1422672535636496, "language_loss": 0.76564932, "learning_rate": 2.601416757842559e-06, "loss": 0.79084051, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.20947266, "step": 6997, "time_per_iteration": 2.831120729446411 }, { "auxiliary_loss_clip": 0.01473797, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.28897905, "balance_loss_mlp": 1.01786232, "epoch": 0.42074252217044944, "flos": 15561029796480.0, "grad_norm": 1.9669222648243108, "language_loss": 0.76430142, "learning_rate": 2.6010453105514743e-06, "loss": 0.78943974, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.22167969, "step": 6998, "time_per_iteration": 2.884909152984619 }, { "auxiliary_loss_clip": 0.01496699, "auxiliary_loss_mlp": 0.01043216, "balance_loss_clip": 1.30747998, "balance_loss_mlp": 1.01985097, "epoch": 0.4208026454231174, "flos": 26158390750080.0, "grad_norm": 1.8825937142846703, "language_loss": 0.76726645, "learning_rate": 2.60067384046869e-06, "loss": 0.7926656, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 1.89257812, "router_z_loss_mlp": 0.23364258, "step": 6999, "time_per_iteration": 2.9460692405700684 }, { "auxiliary_loss_clip": 0.01485037, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.30255699, "balance_loss_mlp": 1.01715159, "epoch": 0.42086276867578537, "flos": 23560579987200.0, "grad_norm": 2.1086304538172906, "language_loss": 0.64846218, "learning_rate": 2.600302347608295e-06, "loss": 0.67370677, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.22277832, "step": 7000, "time_per_iteration": 2.876311779022217 }, { "auxiliary_loss_clip": 0.01505175, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.31818974, "balance_loss_mlp": 1.01946127, "epoch": 0.42092289192845334, "flos": 18122391233280.0, "grad_norm": 2.7971499855369766, "language_loss": 0.76809978, "learning_rate": 2.5999308319843743e-06, "loss": 0.79356682, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.22058105, "step": 7001, "time_per_iteration": 2.8683078289031982 }, { "auxiliary_loss_clip": 0.01493342, "auxiliary_loss_mlp": 0.0104237, "balance_loss_clip": 1.3107537, "balance_loss_mlp": 1.02037644, "epoch": 0.4209830151811213, "flos": 20014996256640.0, "grad_norm": 1.4713260996602677, "language_loss": 0.87539077, "learning_rate": 2.5995592936110154e-06, "loss": 0.9007479, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.2199707, "step": 7002, "time_per_iteration": 2.821183919906616 }, { "auxiliary_loss_clip": 0.01487936, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.3047483, "balance_loss_mlp": 1.01812136, "epoch": 0.42104313843378927, "flos": 21988417772160.0, "grad_norm": 2.74588773827634, "language_loss": 0.69099939, "learning_rate": 2.5991877325023096e-06, "loss": 0.71627831, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.21813965, "step": 7003, "time_per_iteration": 4.3195414543151855 }, { "auxiliary_loss_clip": 0.0150112, "auxiliary_loss_mlp": 0.01045068, "balance_loss_clip": 1.31310153, "balance_loss_mlp": 1.02269268, "epoch": 0.42110326168645723, "flos": 25454044661760.0, "grad_norm": 2.8304112830688326, "language_loss": 0.78699446, "learning_rate": 2.598816148672344e-06, "loss": 0.81245637, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.22387695, "step": 7004, "time_per_iteration": 2.850839376449585 }, { "auxiliary_loss_clip": 0.01473248, "auxiliary_loss_mlp": 0.01043284, "balance_loss_clip": 1.29325962, "balance_loss_mlp": 1.02086091, "epoch": 0.4211633849391252, "flos": 17831746765440.0, "grad_norm": 1.5485617199741195, "language_loss": 0.68767118, "learning_rate": 2.59844454213521e-06, "loss": 0.7128365, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22424316, "step": 7005, "time_per_iteration": 2.827394723892212 }, { "auxiliary_loss_clip": 0.01488992, "auxiliary_loss_mlp": 0.01039569, "balance_loss_clip": 1.30399156, "balance_loss_mlp": 1.01826632, "epoch": 0.42122350819179316, "flos": 16289201911680.0, "grad_norm": 3.3379986156692643, "language_loss": 0.73598015, "learning_rate": 2.5980729129049994e-06, "loss": 0.7612657, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.21289062, "step": 7006, "time_per_iteration": 2.8123152256011963 }, { "auxiliary_loss_clip": 0.01491863, "auxiliary_loss_mlp": 0.01045648, "balance_loss_clip": 1.3057667, "balance_loss_mlp": 1.02357054, "epoch": 0.4212836314444611, "flos": 19655253699840.0, "grad_norm": 1.7690035117024923, "language_loss": 0.72057617, "learning_rate": 2.5977012609958033e-06, "loss": 0.74595124, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.22058105, "step": 7007, "time_per_iteration": 2.8527493476867676 }, { "auxiliary_loss_clip": 0.01493062, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.30781615, "balance_loss_mlp": 1.01815176, "epoch": 0.4213437546971291, "flos": 18378124698240.0, "grad_norm": 1.90739813346141, "language_loss": 0.8351171, "learning_rate": 2.5973295864217166e-06, "loss": 0.86044049, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.21105957, "step": 7008, "time_per_iteration": 2.82950758934021 }, { "auxiliary_loss_clip": 0.01499595, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.31433821, "balance_loss_mlp": 1.02193987, "epoch": 0.42140387794979706, "flos": 27714762529920.0, "grad_norm": 6.601080614480888, "language_loss": 0.72483325, "learning_rate": 2.596957889196831e-06, "loss": 0.75026035, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.21179199, "step": 7009, "time_per_iteration": 2.8988163471221924 }, { "auxiliary_loss_clip": 0.01494499, "auxiliary_loss_mlp": 0.01041598, "balance_loss_clip": 1.30674088, "balance_loss_mlp": 1.01994991, "epoch": 0.4214640012024651, "flos": 28158669694080.0, "grad_norm": 2.679354477922483, "language_loss": 0.6721642, "learning_rate": 2.596586169335243e-06, "loss": 0.69752508, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.21643066, "step": 7010, "time_per_iteration": 4.312077045440674 }, { "auxiliary_loss_clip": 0.01483473, "auxiliary_loss_mlp": 0.01041964, "balance_loss_clip": 1.30011582, "balance_loss_mlp": 1.01983857, "epoch": 0.42152412445513304, "flos": 23006148480000.0, "grad_norm": 1.5982273158777511, "language_loss": 0.73336387, "learning_rate": 2.5962144268510477e-06, "loss": 0.75861824, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.22119141, "step": 7011, "time_per_iteration": 2.858715534210205 }, { "auxiliary_loss_clip": 0.01258668, "auxiliary_loss_mlp": 0.01058152, "balance_loss_clip": 1.14398456, "balance_loss_mlp": 1.03431058, "epoch": 0.421584247707801, "flos": 63777931793280.0, "grad_norm": 0.8037487176202057, "language_loss": 0.54360449, "learning_rate": 2.5958426617583417e-06, "loss": 0.5667727, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 1.1484375, "router_z_loss_mlp": 0.23828125, "step": 7012, "time_per_iteration": 4.65428614616394 }, { "auxiliary_loss_clip": 0.01496618, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.31094682, "balance_loss_mlp": 1.02234519, "epoch": 0.421644370960469, "flos": 24324839470080.0, "grad_norm": 1.3460731999058508, "language_loss": 0.79333448, "learning_rate": 2.5954708740712215e-06, "loss": 0.81874645, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.22253418, "step": 7013, "time_per_iteration": 2.8751492500305176 }, { "auxiliary_loss_clip": 0.0149669, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.31057549, "balance_loss_mlp": 1.01698899, "epoch": 0.42170449421313694, "flos": 23451141519360.0, "grad_norm": 1.7597739343981018, "language_loss": 0.81911528, "learning_rate": 2.595099063803787e-06, "loss": 0.84447491, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.22302246, "step": 7014, "time_per_iteration": 4.251984596252441 }, { "auxiliary_loss_clip": 0.01486161, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.30335951, "balance_loss_mlp": 1.01937079, "epoch": 0.4217646174658049, "flos": 23705834353920.0, "grad_norm": 1.5276312301688555, "language_loss": 0.78386289, "learning_rate": 2.5947272309701354e-06, "loss": 0.80912977, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.21154785, "step": 7015, "time_per_iteration": 2.941279649734497 }, { "auxiliary_loss_clip": 0.0150906, "auxiliary_loss_mlp": 0.01045807, "balance_loss_clip": 1.32228625, "balance_loss_mlp": 1.02363384, "epoch": 0.42182474071847287, "flos": 24982013214720.0, "grad_norm": 1.3006194860310722, "language_loss": 0.82479608, "learning_rate": 2.594355375584368e-06, "loss": 0.85034478, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22192383, "step": 7016, "time_per_iteration": 2.8927063941955566 }, { "auxiliary_loss_clip": 0.01481851, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.29755306, "balance_loss_mlp": 1.02102208, "epoch": 0.42188486397114083, "flos": 22866866426880.0, "grad_norm": 2.0216304957056566, "language_loss": 0.68283767, "learning_rate": 2.593983497660586e-06, "loss": 0.70809001, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.22351074, "step": 7017, "time_per_iteration": 2.8800899982452393 }, { "auxiliary_loss_clip": 0.01259009, "auxiliary_loss_mlp": 0.01042215, "balance_loss_clip": 1.14233446, "balance_loss_mlp": 1.01036251, "epoch": 0.4219449872238088, "flos": 67008864049920.0, "grad_norm": 0.6999819402752542, "language_loss": 0.59535193, "learning_rate": 2.5936115972128895e-06, "loss": 0.61836421, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 1.171875, "router_z_loss_mlp": 0.31835938, "step": 7018, "time_per_iteration": 3.4934003353118896 }, { "auxiliary_loss_clip": 0.01500266, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.31235552, "balance_loss_mlp": 1.01971877, "epoch": 0.42200511047647676, "flos": 13123177960320.0, "grad_norm": 1.7399984355213636, "language_loss": 0.7625314, "learning_rate": 2.593239674255382e-06, "loss": 0.7879566, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.22546387, "step": 7019, "time_per_iteration": 2.8802733421325684 }, { "auxiliary_loss_clip": 0.01490527, "auxiliary_loss_mlp": 0.01045892, "balance_loss_clip": 1.30478728, "balance_loss_mlp": 1.02301598, "epoch": 0.42206523372914473, "flos": 13999273885440.0, "grad_norm": 1.9190720975663869, "language_loss": 0.70470041, "learning_rate": 2.592867728802166e-06, "loss": 0.73006463, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22888184, "step": 7020, "time_per_iteration": 2.9077346324920654 }, { "auxiliary_loss_clip": 0.01466212, "auxiliary_loss_mlp": 0.01043984, "balance_loss_clip": 1.28972268, "balance_loss_mlp": 1.02275324, "epoch": 0.4221253569818127, "flos": 21951742222080.0, "grad_norm": 1.6088626705174531, "language_loss": 0.81768298, "learning_rate": 2.592495760867347e-06, "loss": 0.842785, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.21240234, "step": 7021, "time_per_iteration": 2.8436410427093506 }, { "auxiliary_loss_clip": 0.01494564, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.3097105, "balance_loss_mlp": 1.01739645, "epoch": 0.42218548023448066, "flos": 32204092440960.0, "grad_norm": 2.590373004056052, "language_loss": 0.70636308, "learning_rate": 2.5921237704650293e-06, "loss": 0.73169911, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.21643066, "step": 7022, "time_per_iteration": 2.9064908027648926 }, { "auxiliary_loss_clip": 0.01469825, "auxiliary_loss_mlp": 0.01042911, "balance_loss_clip": 1.29404545, "balance_loss_mlp": 1.02263391, "epoch": 0.4222456034871487, "flos": 30131503027200.0, "grad_norm": 1.8180980027926745, "language_loss": 0.68076485, "learning_rate": 2.5917517576093188e-06, "loss": 0.70589221, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20263672, "step": 7023, "time_per_iteration": 2.888573408126831 }, { "auxiliary_loss_clip": 0.01470825, "auxiliary_loss_mlp": 0.01047326, "balance_loss_clip": 1.29392958, "balance_loss_mlp": 1.02469993, "epoch": 0.42230572673981664, "flos": 22138196618880.0, "grad_norm": 86.62136877134265, "language_loss": 0.70155597, "learning_rate": 2.591379722314322e-06, "loss": 0.72673744, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.22619629, "step": 7024, "time_per_iteration": 2.8557558059692383 }, { "auxiliary_loss_clip": 0.01491888, "auxiliary_loss_mlp": 0.01046972, "balance_loss_clip": 1.30906272, "balance_loss_mlp": 1.02444136, "epoch": 0.4223658499924846, "flos": 22065388456320.0, "grad_norm": 1.5703480439390414, "language_loss": 0.77760351, "learning_rate": 2.591007664594147e-06, "loss": 0.80299211, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.2253418, "step": 7025, "time_per_iteration": 2.845501184463501 }, { "auxiliary_loss_clip": 0.01477506, "auxiliary_loss_mlp": 0.01041958, "balance_loss_clip": 1.2980237, "balance_loss_mlp": 1.02072692, "epoch": 0.4224259732451526, "flos": 20419965630720.0, "grad_norm": 2.34086604854161, "language_loss": 0.80604005, "learning_rate": 2.5906355844629024e-06, "loss": 0.83123469, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.2121582, "step": 7026, "time_per_iteration": 2.8462634086608887 }, { "auxiliary_loss_clip": 0.01249087, "auxiliary_loss_mlp": 0.01023345, "balance_loss_clip": 1.1385057, "balance_loss_mlp": 0.99740553, "epoch": 0.42248609649782054, "flos": 62877150190080.0, "grad_norm": 0.7446523305026598, "language_loss": 0.62018424, "learning_rate": 2.5902634819346966e-06, "loss": 0.64290857, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.25976562, "step": 7027, "time_per_iteration": 3.4912173748016357 }, { "auxiliary_loss_clip": 0.01470628, "auxiliary_loss_mlp": 0.01042727, "balance_loss_clip": 1.29159629, "balance_loss_mlp": 1.02131736, "epoch": 0.4225462197504885, "flos": 26261087742720.0, "grad_norm": 3.8807768588098766, "language_loss": 0.72196579, "learning_rate": 2.5898913570236414e-06, "loss": 0.74709934, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.21398926, "step": 7028, "time_per_iteration": 2.9317526817321777 }, { "auxiliary_loss_clip": 0.01485412, "auxiliary_loss_mlp": 0.01039563, "balance_loss_clip": 1.301687, "balance_loss_mlp": 1.01823652, "epoch": 0.42260634300315647, "flos": 20531666338560.0, "grad_norm": 5.4552553355504525, "language_loss": 0.83495855, "learning_rate": 2.589519209743846e-06, "loss": 0.86020827, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.21325684, "step": 7029, "time_per_iteration": 2.8726391792297363 }, { "auxiliary_loss_clip": 0.0149303, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.30818379, "balance_loss_mlp": 1.02336264, "epoch": 0.42266646625582444, "flos": 24327508913280.0, "grad_norm": 2.0756842603538708, "language_loss": 0.7603488, "learning_rate": 2.589147040109424e-06, "loss": 0.78573549, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22277832, "step": 7030, "time_per_iteration": 2.8746137619018555 }, { "auxiliary_loss_clip": 0.01468579, "auxiliary_loss_mlp": 0.01043195, "balance_loss_clip": 1.28731465, "balance_loss_mlp": 1.02067685, "epoch": 0.4227265895084924, "flos": 24214088903040.0, "grad_norm": 2.08951110867092, "language_loss": 0.87615341, "learning_rate": 2.588774848134486e-06, "loss": 0.9012711, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.22509766, "step": 7031, "time_per_iteration": 2.881432056427002 }, { "auxiliary_loss_clip": 0.01481282, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.29958189, "balance_loss_mlp": 1.01813626, "epoch": 0.42278671276116037, "flos": 16918522842240.0, "grad_norm": 2.656614747530965, "language_loss": 0.74310148, "learning_rate": 2.5884026338331473e-06, "loss": 0.76831478, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21923828, "step": 7032, "time_per_iteration": 2.881918430328369 }, { "auxiliary_loss_clip": 0.01484819, "auxiliary_loss_mlp": 0.01039618, "balance_loss_clip": 1.3003087, "balance_loss_mlp": 1.01872063, "epoch": 0.42284683601382833, "flos": 25422074570880.0, "grad_norm": 1.888868450620308, "language_loss": 0.71466637, "learning_rate": 2.5880303972195222e-06, "loss": 0.73991072, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.20898438, "step": 7033, "time_per_iteration": 2.883237361907959 }, { "auxiliary_loss_clip": 0.01491012, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.30583572, "balance_loss_mlp": 1.01963902, "epoch": 0.4229069592664963, "flos": 23050922849280.0, "grad_norm": 2.042455612043299, "language_loss": 0.90749627, "learning_rate": 2.5876581383077256e-06, "loss": 0.93281734, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.21447754, "step": 7034, "time_per_iteration": 2.8584516048431396 }, { "auxiliary_loss_clip": 0.01462662, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.28473949, "balance_loss_mlp": 1.02593565, "epoch": 0.42296708251916426, "flos": 26078524398720.0, "grad_norm": 1.6007280110117255, "language_loss": 0.77524281, "learning_rate": 2.5872858571118723e-06, "loss": 0.80033731, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.20837402, "step": 7035, "time_per_iteration": 2.8498620986938477 }, { "auxiliary_loss_clip": 0.01482387, "auxiliary_loss_mlp": 0.01048949, "balance_loss_clip": 1.29946375, "balance_loss_mlp": 1.02670455, "epoch": 0.4230272057718323, "flos": 19466401328640.0, "grad_norm": 1.9194140636193495, "language_loss": 0.83340186, "learning_rate": 2.5869135536460817e-06, "loss": 0.85871518, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22241211, "step": 7036, "time_per_iteration": 2.782806873321533 }, { "auxiliary_loss_clip": 0.01466723, "auxiliary_loss_mlp": 0.01034881, "balance_loss_clip": 1.28933072, "balance_loss_mlp": 1.01362586, "epoch": 0.42308732902450025, "flos": 22393658615040.0, "grad_norm": 1.7187036248296264, "language_loss": 0.70867074, "learning_rate": 2.58654122792447e-06, "loss": 0.7336868, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.21252441, "step": 7037, "time_per_iteration": 2.8160901069641113 }, { "auxiliary_loss_clip": 0.01469409, "auxiliary_loss_mlp": 0.0103918, "balance_loss_clip": 1.28837764, "balance_loss_mlp": 1.01760268, "epoch": 0.4231474522771682, "flos": 21005055129600.0, "grad_norm": 1.7101236323065594, "language_loss": 0.78157759, "learning_rate": 2.586168879961155e-06, "loss": 0.80666339, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21594238, "step": 7038, "time_per_iteration": 2.8704192638397217 }, { "auxiliary_loss_clip": 0.01483933, "auxiliary_loss_mlp": 0.01046777, "balance_loss_clip": 1.29793704, "balance_loss_mlp": 1.02399611, "epoch": 0.4232075755298362, "flos": 14984084361600.0, "grad_norm": 1.9811879588500545, "language_loss": 0.67932767, "learning_rate": 2.585796509770259e-06, "loss": 0.70463479, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.22766113, "step": 7039, "time_per_iteration": 4.259572505950928 }, { "auxiliary_loss_clip": 0.01488543, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.30125248, "balance_loss_mlp": 1.02465332, "epoch": 0.42326769878250414, "flos": 24542721020160.0, "grad_norm": 1.57980839223373, "language_loss": 0.76103163, "learning_rate": 2.5854241173658996e-06, "loss": 0.78637195, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 1.87207031, "router_z_loss_mlp": 0.20849609, "step": 7040, "time_per_iteration": 2.868297815322876 }, { "auxiliary_loss_clip": 0.01471628, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.28978705, "balance_loss_mlp": 1.01723814, "epoch": 0.4233278220351721, "flos": 26881631182080.0, "grad_norm": 1.7125138574712695, "language_loss": 0.66394711, "learning_rate": 2.5850517027621996e-06, "loss": 0.68905723, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22143555, "step": 7041, "time_per_iteration": 2.8884286880493164 }, { "auxiliary_loss_clip": 0.01474239, "auxiliary_loss_mlp": 0.0103796, "balance_loss_clip": 1.29169416, "balance_loss_mlp": 1.01573944, "epoch": 0.4233879452878401, "flos": 42830482573440.0, "grad_norm": 1.727720612260796, "language_loss": 0.74959695, "learning_rate": 2.5846792659732803e-06, "loss": 0.77471888, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.22229004, "step": 7042, "time_per_iteration": 3.06581711769104 }, { "auxiliary_loss_clip": 0.0145834, "auxiliary_loss_mlp": 0.01040946, "balance_loss_clip": 1.28024983, "balance_loss_mlp": 1.01953626, "epoch": 0.42344806854050804, "flos": 25240054164480.0, "grad_norm": 1.4245072379942034, "language_loss": 0.82359463, "learning_rate": 2.5843068070132643e-06, "loss": 0.84858751, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21398926, "step": 7043, "time_per_iteration": 2.897966146469116 }, { "auxiliary_loss_clip": 0.01468253, "auxiliary_loss_mlp": 0.01042565, "balance_loss_clip": 1.28841209, "balance_loss_mlp": 1.01989102, "epoch": 0.423508191793176, "flos": 22787769237120.0, "grad_norm": 2.351533716102935, "language_loss": 0.65819108, "learning_rate": 2.5839343258962763e-06, "loss": 0.6832993, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22680664, "step": 7044, "time_per_iteration": 2.874068260192871 }, { "auxiliary_loss_clip": 0.01494059, "auxiliary_loss_mlp": 0.01044749, "balance_loss_clip": 1.30813003, "balance_loss_mlp": 1.02097845, "epoch": 0.42356831504584397, "flos": 34649726382720.0, "grad_norm": 1.7147354669374402, "language_loss": 0.75606638, "learning_rate": 2.5835618226364393e-06, "loss": 0.7814545, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.23803711, "step": 7045, "time_per_iteration": 2.9598186016082764 }, { "auxiliary_loss_clip": 0.01460399, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.2821368, "balance_loss_mlp": 1.0165962, "epoch": 0.42362843829851193, "flos": 17604092338560.0, "grad_norm": 2.3333162447133016, "language_loss": 0.81399447, "learning_rate": 2.5831892972478797e-06, "loss": 0.83898008, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21582031, "step": 7046, "time_per_iteration": 4.205596923828125 }, { "auxiliary_loss_clip": 0.01471902, "auxiliary_loss_mlp": 0.01039421, "balance_loss_clip": 1.2881422, "balance_loss_mlp": 1.01661611, "epoch": 0.4236885615511799, "flos": 22576086224640.0, "grad_norm": 1.707772585384806, "language_loss": 0.7761538, "learning_rate": 2.5828167497447242e-06, "loss": 0.80126703, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.22814941, "step": 7047, "time_per_iteration": 4.248044013977051 }, { "auxiliary_loss_clip": 0.01459142, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.28160167, "balance_loss_mlp": 1.01565588, "epoch": 0.42374868480384786, "flos": 26480507616000.0, "grad_norm": 1.8804945001585132, "language_loss": 0.68869686, "learning_rate": 2.582444180141098e-06, "loss": 0.71366221, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.21728516, "step": 7048, "time_per_iteration": 2.9048032760620117 }, { "auxiliary_loss_clip": 0.01481243, "auxiliary_loss_mlp": 0.01035475, "balance_loss_clip": 1.29727268, "balance_loss_mlp": 1.01190722, "epoch": 0.4238088080565159, "flos": 20378403642240.0, "grad_norm": 1.9349392738265394, "language_loss": 0.78834534, "learning_rate": 2.5820715884511307e-06, "loss": 0.8135125, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23571777, "step": 7049, "time_per_iteration": 4.268215894699097 }, { "auxiliary_loss_clip": 0.01472598, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.28891802, "balance_loss_mlp": 1.01912475, "epoch": 0.42386893130918385, "flos": 21180288816000.0, "grad_norm": 2.2114978032616883, "language_loss": 0.83840024, "learning_rate": 2.5816989746889504e-06, "loss": 0.86354679, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 1.8359375, "router_z_loss_mlp": 0.22924805, "step": 7050, "time_per_iteration": 2.8113133907318115 }, { "auxiliary_loss_clip": 0.01468392, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.28779149, "balance_loss_mlp": 1.0120163, "epoch": 0.4239290545618518, "flos": 17684501627520.0, "grad_norm": 2.0907358284418827, "language_loss": 0.74133122, "learning_rate": 2.581326338868687e-06, "loss": 0.76635975, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22436523, "step": 7051, "time_per_iteration": 2.807770013809204 }, { "auxiliary_loss_clip": 0.01471982, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.29141307, "balance_loss_mlp": 1.01270771, "epoch": 0.4239891778145198, "flos": 24324703735680.0, "grad_norm": 2.1156588994241927, "language_loss": 0.8685503, "learning_rate": 2.5809536810044706e-06, "loss": 0.89362562, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22851562, "step": 7052, "time_per_iteration": 2.8544981479644775 }, { "auxiliary_loss_clip": 0.01471036, "auxiliary_loss_mlp": 0.01041357, "balance_loss_clip": 1.28795481, "balance_loss_mlp": 1.01829004, "epoch": 0.42404930106718774, "flos": 20568160909440.0, "grad_norm": 1.4240513186670427, "language_loss": 0.73340666, "learning_rate": 2.5805810011104323e-06, "loss": 0.75853062, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.23071289, "step": 7053, "time_per_iteration": 2.8438780307769775 }, { "auxiliary_loss_clip": 0.01467737, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.28689921, "balance_loss_mlp": 1.01449656, "epoch": 0.4241094243198557, "flos": 22317819050880.0, "grad_norm": 2.106624462462933, "language_loss": 0.83029902, "learning_rate": 2.580208299200704e-06, "loss": 0.85535294, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.23156738, "step": 7054, "time_per_iteration": 2.8378076553344727 }, { "auxiliary_loss_clip": 0.01251872, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.13854742, "balance_loss_mlp": 1.00180852, "epoch": 0.4241695475725237, "flos": 70643480843520.0, "grad_norm": 0.7922161520724251, "language_loss": 0.60566294, "learning_rate": 2.5798355752894183e-06, "loss": 0.62847435, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 0.27539062, "step": 7055, "time_per_iteration": 3.304468870162964 }, { "auxiliary_loss_clip": 0.0147128, "auxiliary_loss_mlp": 0.01039026, "balance_loss_clip": 1.28785634, "balance_loss_mlp": 1.01510096, "epoch": 0.42422967082519164, "flos": 14035496987520.0, "grad_norm": 2.3412738264596022, "language_loss": 0.77340734, "learning_rate": 2.5794628293907107e-06, "loss": 0.79851037, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.23937988, "step": 7056, "time_per_iteration": 2.8280317783355713 }, { "auxiliary_loss_clip": 0.01478435, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.29188156, "balance_loss_mlp": 1.01726127, "epoch": 0.4242897940778596, "flos": 22355535231360.0, "grad_norm": 1.8557269389019724, "language_loss": 0.84869325, "learning_rate": 2.579090061518714e-06, "loss": 0.8738879, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.23779297, "step": 7057, "time_per_iteration": 2.8505892753601074 }, { "auxiliary_loss_clip": 0.01475061, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.29005933, "balance_loss_mlp": 1.01355267, "epoch": 0.42434991733052757, "flos": 22604617710720.0, "grad_norm": 2.656094765801848, "language_loss": 0.83719397, "learning_rate": 2.5787172716875642e-06, "loss": 0.86230946, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22937012, "step": 7058, "time_per_iteration": 2.8491175174713135 }, { "auxiliary_loss_clip": 0.01445298, "auxiliary_loss_mlp": 0.01036546, "balance_loss_clip": 1.26852858, "balance_loss_mlp": 1.01494563, "epoch": 0.42441004058319554, "flos": 20021104304640.0, "grad_norm": 1.7190487857609122, "language_loss": 0.81154054, "learning_rate": 2.5783444599113973e-06, "loss": 0.83635902, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21618652, "step": 7059, "time_per_iteration": 2.8908252716064453 }, { "auxiliary_loss_clip": 0.014734, "auxiliary_loss_mlp": 0.0104191, "balance_loss_clip": 1.28856611, "balance_loss_mlp": 1.01898623, "epoch": 0.4244701638358635, "flos": 11152244908800.0, "grad_norm": 14.270175961006737, "language_loss": 0.70953333, "learning_rate": 2.57797162620435e-06, "loss": 0.73468643, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.22912598, "step": 7060, "time_per_iteration": 2.822981834411621 }, { "auxiliary_loss_clip": 0.01476756, "auxiliary_loss_mlp": 0.0103776, "balance_loss_clip": 1.29251516, "balance_loss_mlp": 1.01539683, "epoch": 0.42453028708853147, "flos": 23998288613760.0, "grad_norm": 1.6085195089812157, "language_loss": 0.76445198, "learning_rate": 2.577598770580562e-06, "loss": 0.78959709, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.22338867, "step": 7061, "time_per_iteration": 2.8544015884399414 }, { "auxiliary_loss_clip": 0.01477706, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.29438961, "balance_loss_mlp": 1.01686025, "epoch": 0.42459041034119943, "flos": 18415659899520.0, "grad_norm": 2.0292926244870015, "language_loss": 0.74150854, "learning_rate": 2.5772258930541693e-06, "loss": 0.76669276, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23864746, "step": 7062, "time_per_iteration": 2.822336196899414 }, { "auxiliary_loss_clip": 0.0146176, "auxiliary_loss_mlp": 0.01044475, "balance_loss_clip": 1.27942777, "balance_loss_mlp": 1.02249336, "epoch": 0.42465053359386745, "flos": 20967384193920.0, "grad_norm": 1.6569015020194375, "language_loss": 0.67177546, "learning_rate": 2.5768529936393137e-06, "loss": 0.69683778, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.21972656, "step": 7063, "time_per_iteration": 2.825423002243042 }, { "auxiliary_loss_clip": 0.01457156, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.28017068, "balance_loss_mlp": 1.0166502, "epoch": 0.4247106568465354, "flos": 33118311749760.0, "grad_norm": 1.6302609094980036, "language_loss": 0.79077524, "learning_rate": 2.5764800723501354e-06, "loss": 0.81573236, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.21911621, "step": 7064, "time_per_iteration": 2.9116926193237305 }, { "auxiliary_loss_clip": 0.01481129, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.29585266, "balance_loss_mlp": 1.01339173, "epoch": 0.4247707800992034, "flos": 20056332021120.0, "grad_norm": 1.946044571825641, "language_loss": 0.76434374, "learning_rate": 2.5761071292007736e-06, "loss": 0.78951645, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.22741699, "step": 7065, "time_per_iteration": 2.8624989986419678 }, { "auxiliary_loss_clip": 0.01478094, "auxiliary_loss_mlp": 0.01040103, "balance_loss_clip": 1.29725456, "balance_loss_mlp": 1.01722646, "epoch": 0.42483090335187135, "flos": 22395966099840.0, "grad_norm": 1.6431681808649956, "language_loss": 0.72996271, "learning_rate": 2.5757341642053725e-06, "loss": 0.75514472, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.22888184, "step": 7066, "time_per_iteration": 2.889441967010498 }, { "auxiliary_loss_clip": 0.01474853, "auxiliary_loss_mlp": 0.01040096, "balance_loss_clip": 1.28954279, "balance_loss_mlp": 1.01561093, "epoch": 0.4248910266045393, "flos": 21365657337600.0, "grad_norm": 2.2401426937799434, "language_loss": 0.80679452, "learning_rate": 2.5753611773780745e-06, "loss": 0.83194405, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.24475098, "step": 7067, "time_per_iteration": 2.8291900157928467 }, { "auxiliary_loss_clip": 0.01243727, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.137308, "balance_loss_mlp": 1.0034945, "epoch": 0.4249511498572073, "flos": 64037330087040.0, "grad_norm": 0.9128119802923499, "language_loss": 0.63540232, "learning_rate": 2.574988168733022e-06, "loss": 0.65821785, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.34375, "step": 7068, "time_per_iteration": 3.2698163986206055 }, { "auxiliary_loss_clip": 0.01474036, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.29031348, "balance_loss_mlp": 1.0149821, "epoch": 0.42501127310987524, "flos": 19615818216960.0, "grad_norm": 1.6712598376359682, "language_loss": 0.72958124, "learning_rate": 2.574615138284361e-06, "loss": 0.75471056, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.23913574, "step": 7069, "time_per_iteration": 2.837414503097534 }, { "auxiliary_loss_clip": 0.01474219, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.28884935, "balance_loss_mlp": 1.01425552, "epoch": 0.4250713963625432, "flos": 19471378256640.0, "grad_norm": 2.3883478489198278, "language_loss": 0.79959911, "learning_rate": 2.5742420860462364e-06, "loss": 0.82471383, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.2298584, "step": 7070, "time_per_iteration": 2.87648868560791 }, { "auxiliary_loss_clip": 0.01470175, "auxiliary_loss_mlp": 0.01038631, "balance_loss_clip": 1.2881341, "balance_loss_mlp": 1.01531327, "epoch": 0.4251315196152112, "flos": 25348587736320.0, "grad_norm": 1.757575169507403, "language_loss": 0.708009, "learning_rate": 2.573869012032795e-06, "loss": 0.73309708, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23327637, "step": 7071, "time_per_iteration": 2.989698886871338 }, { "auxiliary_loss_clip": 0.01467, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.28439081, "balance_loss_mlp": 1.01505816, "epoch": 0.42519164286787914, "flos": 26370299986560.0, "grad_norm": 3.079234544391467, "language_loss": 0.72624779, "learning_rate": 2.5734959162581824e-06, "loss": 0.75128567, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.21740723, "step": 7072, "time_per_iteration": 2.8998169898986816 }, { "auxiliary_loss_clip": 0.0147643, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.28997517, "balance_loss_mlp": 1.01851654, "epoch": 0.4252517661205471, "flos": 26042165562240.0, "grad_norm": 1.801806560297093, "language_loss": 0.82587892, "learning_rate": 2.5731227987365475e-06, "loss": 0.85105741, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 1.86523438, "router_z_loss_mlp": 0.22888184, "step": 7073, "time_per_iteration": 4.302968263626099 }, { "auxiliary_loss_clip": 0.01462803, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.28333497, "balance_loss_mlp": 1.01921892, "epoch": 0.42531188937321507, "flos": 12721239987840.0, "grad_norm": 2.526784660928835, "language_loss": 0.92226684, "learning_rate": 2.5727496594820386e-06, "loss": 0.94730675, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.21972656, "step": 7074, "time_per_iteration": 2.8009183406829834 }, { "auxiliary_loss_clip": 0.01474724, "auxiliary_loss_mlp": 0.01043956, "balance_loss_clip": 1.28715611, "balance_loss_mlp": 1.01989985, "epoch": 0.42537201262588303, "flos": 22102018761600.0, "grad_norm": 1.664867803326938, "language_loss": 0.65047055, "learning_rate": 2.572376498508805e-06, "loss": 0.67565733, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.24047852, "step": 7075, "time_per_iteration": 2.84133243560791 }, { "auxiliary_loss_clip": 0.01461465, "auxiliary_loss_mlp": 0.01041701, "balance_loss_clip": 1.28327394, "balance_loss_mlp": 1.02024293, "epoch": 0.42543213587855105, "flos": 23013523382400.0, "grad_norm": 1.836824499285344, "language_loss": 0.7511422, "learning_rate": 2.5720033158309973e-06, "loss": 0.77617383, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.21459961, "step": 7076, "time_per_iteration": 2.844604969024658 }, { "auxiliary_loss_clip": 0.0148134, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.29581809, "balance_loss_mlp": 1.02133036, "epoch": 0.425492259131219, "flos": 25093306719360.0, "grad_norm": 2.1293913860918527, "language_loss": 0.79453564, "learning_rate": 2.571630111462766e-06, "loss": 0.81978714, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.22473145, "step": 7077, "time_per_iteration": 2.9756767749786377 }, { "auxiliary_loss_clip": 0.01457901, "auxiliary_loss_mlp": 0.01037841, "balance_loss_clip": 1.2822144, "balance_loss_mlp": 1.01665723, "epoch": 0.425552382383887, "flos": 22826209334400.0, "grad_norm": 1.6854221612056146, "language_loss": 0.73800761, "learning_rate": 2.571256885418265e-06, "loss": 0.76296496, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21191406, "step": 7078, "time_per_iteration": 2.9292047023773193 }, { "auxiliary_loss_clip": 0.01463414, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.28474426, "balance_loss_mlp": 1.02244973, "epoch": 0.42561250563655495, "flos": 13561610503680.0, "grad_norm": 1.6984132203027067, "language_loss": 0.81002343, "learning_rate": 2.5708836377116445e-06, "loss": 0.83511019, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.22802734, "step": 7079, "time_per_iteration": 2.7873973846435547 }, { "auxiliary_loss_clip": 0.0146893, "auxiliary_loss_mlp": 0.01043286, "balance_loss_clip": 1.28887808, "balance_loss_mlp": 1.0218997, "epoch": 0.4256726288892229, "flos": 46995976316160.0, "grad_norm": 1.509967960694907, "language_loss": 0.72754961, "learning_rate": 2.5705103683570592e-06, "loss": 0.75267172, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21386719, "step": 7080, "time_per_iteration": 3.0949535369873047 }, { "auxiliary_loss_clip": 0.01461318, "auxiliary_loss_mlp": 0.01042871, "balance_loss_clip": 1.28008914, "balance_loss_mlp": 1.02092409, "epoch": 0.4257327521418909, "flos": 23596893578880.0, "grad_norm": 2.7475255756402905, "language_loss": 0.81404763, "learning_rate": 2.5701370773686646e-06, "loss": 0.83908951, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21948242, "step": 7081, "time_per_iteration": 4.316623210906982 }, { "auxiliary_loss_clip": 0.01458734, "auxiliary_loss_mlp": 0.01047012, "balance_loss_clip": 1.28271258, "balance_loss_mlp": 1.0245527, "epoch": 0.42579287539455885, "flos": 18999437299200.0, "grad_norm": 1.6086405719613108, "language_loss": 0.81806117, "learning_rate": 2.5697637647606138e-06, "loss": 0.84311861, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.22460938, "step": 7082, "time_per_iteration": 4.196643352508545 }, { "auxiliary_loss_clip": 0.01473346, "auxiliary_loss_mlp": 0.01046157, "balance_loss_clip": 1.29193783, "balance_loss_mlp": 1.02444851, "epoch": 0.4258529986472268, "flos": 25202790432000.0, "grad_norm": 1.7702021216620596, "language_loss": 0.70296693, "learning_rate": 2.569390430547065e-06, "loss": 0.72816199, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21704102, "step": 7083, "time_per_iteration": 2.8531947135925293 }, { "auxiliary_loss_clip": 0.01243944, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.13867402, "balance_loss_mlp": 1.01961637, "epoch": 0.4259131218998948, "flos": 70002414247680.0, "grad_norm": 0.8987582894866127, "language_loss": 0.67095327, "learning_rate": 2.569017074742173e-06, "loss": 0.69385207, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.26367188, "step": 7084, "time_per_iteration": 4.859258651733398 }, { "auxiliary_loss_clip": 0.01467491, "auxiliary_loss_mlp": 0.01049991, "balance_loss_clip": 1.28674662, "balance_loss_mlp": 1.02636373, "epoch": 0.42597324515256274, "flos": 18014174375040.0, "grad_norm": 2.073264252772804, "language_loss": 0.79048955, "learning_rate": 2.5686436973600964e-06, "loss": 0.81566429, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.23620605, "step": 7085, "time_per_iteration": 2.827221393585205 }, { "auxiliary_loss_clip": 0.01495705, "auxiliary_loss_mlp": 0.01056485, "balance_loss_clip": 1.30613577, "balance_loss_mlp": 1.03285789, "epoch": 0.4260333684052307, "flos": 15167190643200.0, "grad_norm": 2.438441834141738, "language_loss": 0.77774489, "learning_rate": 2.568270298414995e-06, "loss": 0.80326676, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 1.89550781, "router_z_loss_mlp": 0.23632812, "step": 7086, "time_per_iteration": 2.7935128211975098 }, { "auxiliary_loss_clip": 0.01473715, "auxiliary_loss_mlp": 0.01042254, "balance_loss_clip": 1.29409981, "balance_loss_mlp": 1.02034283, "epoch": 0.42609349165789867, "flos": 14947544545920.0, "grad_norm": 1.8130101783300696, "language_loss": 0.81090742, "learning_rate": 2.5678968779210255e-06, "loss": 0.83606708, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.21911621, "step": 7087, "time_per_iteration": 2.8359174728393555 }, { "auxiliary_loss_clip": 0.01477646, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.29562593, "balance_loss_mlp": 1.02150822, "epoch": 0.42615361491056664, "flos": 23742328924800.0, "grad_norm": 1.6207979807323898, "language_loss": 0.66610831, "learning_rate": 2.5675234358923505e-06, "loss": 0.69133091, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.2310791, "step": 7088, "time_per_iteration": 2.8723244667053223 }, { "auxiliary_loss_clip": 0.01494108, "auxiliary_loss_mlp": 0.01050189, "balance_loss_clip": 1.30900097, "balance_loss_mlp": 1.02768207, "epoch": 0.42621373816323466, "flos": 24947237946240.0, "grad_norm": 2.1818213667225255, "language_loss": 0.69553363, "learning_rate": 2.56714997234313e-06, "loss": 0.72097659, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.22509766, "step": 7089, "time_per_iteration": 2.9019615650177 }, { "auxiliary_loss_clip": 0.0148952, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.30398643, "balance_loss_mlp": 1.02133834, "epoch": 0.4262738614159026, "flos": 13560841342080.0, "grad_norm": 3.9077710905916443, "language_loss": 0.7450248, "learning_rate": 2.566776487287525e-06, "loss": 0.77034485, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.21142578, "step": 7090, "time_per_iteration": 2.7999441623687744 }, { "auxiliary_loss_clip": 0.01491988, "auxiliary_loss_mlp": 0.01048336, "balance_loss_clip": 1.30647242, "balance_loss_mlp": 1.02643681, "epoch": 0.4263339846685706, "flos": 29759861088000.0, "grad_norm": 2.24574039437063, "language_loss": 0.75631404, "learning_rate": 2.5664029807396994e-06, "loss": 0.7817173, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.21911621, "step": 7091, "time_per_iteration": 2.8859994411468506 }, { "auxiliary_loss_clip": 0.01459933, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.28428388, "balance_loss_mlp": 1.01845634, "epoch": 0.42639410792123855, "flos": 16842276074880.0, "grad_norm": 1.8615750119415506, "language_loss": 0.83412373, "learning_rate": 2.5660294527138156e-06, "loss": 0.85911733, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.2097168, "step": 7092, "time_per_iteration": 2.8245270252227783 }, { "auxiliary_loss_clip": 0.0150104, "auxiliary_loss_mlp": 0.01044675, "balance_loss_clip": 1.31262004, "balance_loss_mlp": 1.02226329, "epoch": 0.4264542311739065, "flos": 28773783757440.0, "grad_norm": 1.5284073933417415, "language_loss": 0.74306208, "learning_rate": 2.565655903224038e-06, "loss": 0.76851928, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.22412109, "step": 7093, "time_per_iteration": 2.867769241333008 }, { "auxiliary_loss_clip": 0.01482669, "auxiliary_loss_mlp": 0.01037738, "balance_loss_clip": 1.30091023, "balance_loss_mlp": 1.01549339, "epoch": 0.4265143544265745, "flos": 24723338837760.0, "grad_norm": 2.202005778297535, "language_loss": 0.71069527, "learning_rate": 2.565282332284532e-06, "loss": 0.73589933, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22241211, "step": 7094, "time_per_iteration": 2.866036891937256 }, { "auxiliary_loss_clip": 0.01493678, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.31031775, "balance_loss_mlp": 1.02060771, "epoch": 0.42657447767924245, "flos": 21874500069120.0, "grad_norm": 1.972071129020264, "language_loss": 0.82513475, "learning_rate": 2.564908739909464e-06, "loss": 0.85051787, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.24047852, "step": 7095, "time_per_iteration": 2.799010992050171 }, { "auxiliary_loss_clip": 0.01498013, "auxiliary_loss_mlp": 0.01047034, "balance_loss_clip": 1.31170344, "balance_loss_mlp": 1.02357328, "epoch": 0.4266346009319104, "flos": 21480117978240.0, "grad_norm": 1.736118968724066, "language_loss": 0.81419003, "learning_rate": 2.5645351261129996e-06, "loss": 0.8396405, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.23461914, "step": 7096, "time_per_iteration": 2.941101551055908 }, { "auxiliary_loss_clip": 0.01501692, "auxiliary_loss_mlp": 0.01040685, "balance_loss_clip": 1.31312776, "balance_loss_mlp": 1.0183568, "epoch": 0.4266947241845784, "flos": 25530110449920.0, "grad_norm": 1.9558958838763105, "language_loss": 0.66535032, "learning_rate": 2.5641614909093066e-06, "loss": 0.69077408, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 1.88378906, "router_z_loss_mlp": 0.2232666, "step": 7097, "time_per_iteration": 2.8706889152526855 }, { "auxiliary_loss_clip": 0.01480272, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 1.29937196, "balance_loss_mlp": 1.01089644, "epoch": 0.42675484743724634, "flos": 26552275148160.0, "grad_norm": 1.7336567320563723, "language_loss": 0.74958587, "learning_rate": 2.5637878343125535e-06, "loss": 0.7747187, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.22094727, "step": 7098, "time_per_iteration": 2.888233184814453 }, { "auxiliary_loss_clip": 0.01467765, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.28886056, "balance_loss_mlp": 1.01822782, "epoch": 0.4268149706899143, "flos": 23123188074240.0, "grad_norm": 1.626764291897448, "language_loss": 0.75957477, "learning_rate": 2.5634141563369086e-06, "loss": 0.78465849, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22387695, "step": 7099, "time_per_iteration": 2.8786673545837402 }, { "auxiliary_loss_clip": 0.01494132, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.30865347, "balance_loss_mlp": 1.01583898, "epoch": 0.4268750939425823, "flos": 22715639746560.0, "grad_norm": 2.0155925480813606, "language_loss": 0.83379269, "learning_rate": 2.5630404569965432e-06, "loss": 0.85912037, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.2277832, "step": 7100, "time_per_iteration": 2.825925827026367 }, { "auxiliary_loss_clip": 0.01484212, "auxiliary_loss_mlp": 0.01039347, "balance_loss_clip": 1.30007625, "balance_loss_mlp": 1.01651835, "epoch": 0.42693521719525024, "flos": 25385941958400.0, "grad_norm": 1.38420099557558, "language_loss": 0.82477719, "learning_rate": 2.562666736305627e-06, "loss": 0.85001278, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.22814941, "step": 7101, "time_per_iteration": 2.8799118995666504 }, { "auxiliary_loss_clip": 0.0149098, "auxiliary_loss_mlp": 0.01037449, "balance_loss_clip": 1.30534983, "balance_loss_mlp": 1.01379836, "epoch": 0.42699534044791826, "flos": 18159881189760.0, "grad_norm": 2.4175712516481362, "language_loss": 0.73272902, "learning_rate": 2.5622929942783314e-06, "loss": 0.75801331, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.2364502, "step": 7102, "time_per_iteration": 2.828179121017456 }, { "auxiliary_loss_clip": 0.01469827, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.29176378, "balance_loss_mlp": 1.01471829, "epoch": 0.4270554637005862, "flos": 13706186198400.0, "grad_norm": 2.173044595388497, "language_loss": 0.83969367, "learning_rate": 2.5619192309288297e-06, "loss": 0.86475849, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.21936035, "step": 7103, "time_per_iteration": 2.798736810684204 }, { "auxiliary_loss_clip": 0.01495216, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.30886197, "balance_loss_mlp": 1.01348233, "epoch": 0.4271155869532542, "flos": 17502526465920.0, "grad_norm": 2.0836487852502095, "language_loss": 0.75479543, "learning_rate": 2.561545446271294e-06, "loss": 0.78010803, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 1.86328125, "router_z_loss_mlp": 0.22583008, "step": 7104, "time_per_iteration": 2.8120079040527344 }, { "auxiliary_loss_clip": 0.01474399, "auxiliary_loss_mlp": 0.01037463, "balance_loss_clip": 1.29226017, "balance_loss_mlp": 1.01528978, "epoch": 0.42717571020592215, "flos": 32464983813120.0, "grad_norm": 2.5076280588491175, "language_loss": 0.76446128, "learning_rate": 2.5611716403198987e-06, "loss": 0.78957987, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.22155762, "step": 7105, "time_per_iteration": 2.8916070461273193 }, { "auxiliary_loss_clip": 0.01502075, "auxiliary_loss_mlp": 0.01034304, "balance_loss_clip": 1.31800914, "balance_loss_mlp": 1.01291788, "epoch": 0.4272358334585901, "flos": 16261665811200.0, "grad_norm": 2.07420057156611, "language_loss": 0.79114771, "learning_rate": 2.560797813088819e-06, "loss": 0.81651151, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.21374512, "step": 7106, "time_per_iteration": 2.792191505432129 }, { "auxiliary_loss_clip": 0.0147742, "auxiliary_loss_mlp": 0.01039423, "balance_loss_clip": 1.29602563, "balance_loss_mlp": 1.0169158, "epoch": 0.4272959567112581, "flos": 24208976240640.0, "grad_norm": 2.7794575296211894, "language_loss": 0.81298923, "learning_rate": 2.560423964592229e-06, "loss": 0.83815765, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.22521973, "step": 7107, "time_per_iteration": 2.8419156074523926 }, { "auxiliary_loss_clip": 0.01475607, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.29452145, "balance_loss_mlp": 1.01402736, "epoch": 0.42735607996392605, "flos": 27974749006080.0, "grad_norm": 1.5731997885386806, "language_loss": 0.68677485, "learning_rate": 2.5600500948443075e-06, "loss": 0.71189523, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.22399902, "step": 7108, "time_per_iteration": 4.306857109069824 }, { "auxiliary_loss_clip": 0.01482864, "auxiliary_loss_mlp": 0.01041066, "balance_loss_clip": 1.30096769, "balance_loss_mlp": 1.01913142, "epoch": 0.427416203216594, "flos": 20303333239680.0, "grad_norm": 1.8770057054858285, "language_loss": 0.72345471, "learning_rate": 2.5596762038592294e-06, "loss": 0.748694, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21923828, "step": 7109, "time_per_iteration": 2.880417823791504 }, { "auxiliary_loss_clip": 0.01478671, "auxiliary_loss_mlp": 0.01038646, "balance_loss_clip": 1.29602146, "balance_loss_mlp": 1.01516223, "epoch": 0.427476326469262, "flos": 26955479975040.0, "grad_norm": 1.7429405070441886, "language_loss": 0.65401232, "learning_rate": 2.559302291651174e-06, "loss": 0.67918551, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.23474121, "step": 7110, "time_per_iteration": 2.881152629852295 }, { "auxiliary_loss_clip": 0.01478838, "auxiliary_loss_mlp": 0.01037728, "balance_loss_clip": 1.29659534, "balance_loss_mlp": 1.01474428, "epoch": 0.42753644972192995, "flos": 25713895403520.0, "grad_norm": 1.7312647067920897, "language_loss": 0.77229345, "learning_rate": 2.5589283582343197e-06, "loss": 0.79745913, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.22961426, "step": 7111, "time_per_iteration": 2.877923011779785 }, { "auxiliary_loss_clip": 0.014828, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.2995224, "balance_loss_mlp": 1.0164094, "epoch": 0.4275965729745979, "flos": 18775854904320.0, "grad_norm": 1.7590635471110536, "language_loss": 0.74340713, "learning_rate": 2.558554403622845e-06, "loss": 0.76861864, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.21948242, "step": 7112, "time_per_iteration": 2.8184444904327393 }, { "auxiliary_loss_clip": 0.01465311, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.28628302, "balance_loss_mlp": 1.01390254, "epoch": 0.4276566962272659, "flos": 23773756078080.0, "grad_norm": 1.6137174139734443, "language_loss": 0.72004783, "learning_rate": 2.5581804278309323e-06, "loss": 0.74505776, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21777344, "step": 7113, "time_per_iteration": 2.9155595302581787 }, { "auxiliary_loss_clip": 0.01488841, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 1.30405569, "balance_loss_mlp": 1.01776743, "epoch": 0.42771681947993384, "flos": 22502825614080.0, "grad_norm": 1.5418724869165017, "language_loss": 0.61934996, "learning_rate": 2.5578064308727617e-06, "loss": 0.64463782, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 1.84863281, "router_z_loss_mlp": 0.22167969, "step": 7114, "time_per_iteration": 2.980602741241455 }, { "auxiliary_loss_clip": 0.01492758, "auxiliary_loss_mlp": 0.01041735, "balance_loss_clip": 1.30317116, "balance_loss_mlp": 1.01776218, "epoch": 0.42777694273260186, "flos": 25055228580480.0, "grad_norm": 1.9043519815471994, "language_loss": 0.65778196, "learning_rate": 2.5574324127625153e-06, "loss": 0.68312687, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 1.89453125, "router_z_loss_mlp": 0.23986816, "step": 7115, "time_per_iteration": 4.265331745147705 }, { "auxiliary_loss_clip": 0.01475668, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.29388404, "balance_loss_mlp": 1.01162112, "epoch": 0.4278370659852698, "flos": 18670850426880.0, "grad_norm": 1.6471052860346147, "language_loss": 0.73823857, "learning_rate": 2.5570583735143753e-06, "loss": 0.76332921, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.21777344, "step": 7116, "time_per_iteration": 2.8323047161102295 }, { "auxiliary_loss_clip": 0.01455817, "auxiliary_loss_mlp": 0.0103525, "balance_loss_clip": 1.27829587, "balance_loss_mlp": 1.01324427, "epoch": 0.4278971892379378, "flos": 27319747011840.0, "grad_norm": 1.739070112535789, "language_loss": 0.70102179, "learning_rate": 2.5566843131425275e-06, "loss": 0.72593248, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22009277, "step": 7117, "time_per_iteration": 4.315077066421509 }, { "auxiliary_loss_clip": 0.0147499, "auxiliary_loss_mlp": 0.01041741, "balance_loss_clip": 1.29439211, "balance_loss_mlp": 1.01880503, "epoch": 0.42795731249060576, "flos": 12894437658240.0, "grad_norm": 2.7473168055614994, "language_loss": 0.71541446, "learning_rate": 2.5563102316611536e-06, "loss": 0.74058175, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22937012, "step": 7118, "time_per_iteration": 4.215929985046387 }, { "auxiliary_loss_clip": 0.01467739, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.28759253, "balance_loss_mlp": 1.01887369, "epoch": 0.4280174357432737, "flos": 33414521328000.0, "grad_norm": 1.8753448909109025, "language_loss": 0.75403285, "learning_rate": 2.55593612908444e-06, "loss": 0.77912533, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22631836, "step": 7119, "time_per_iteration": 2.9512226581573486 }, { "auxiliary_loss_clip": 0.01462949, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.28370535, "balance_loss_mlp": 1.01028776, "epoch": 0.4280775589959417, "flos": 18268052803200.0, "grad_norm": 2.0033717169538625, "language_loss": 0.74837142, "learning_rate": 2.555562005426573e-06, "loss": 0.77333337, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.22973633, "step": 7120, "time_per_iteration": 2.8159942626953125 }, { "auxiliary_loss_clip": 0.01475702, "auxiliary_loss_mlp": 0.01039756, "balance_loss_clip": 1.29457951, "balance_loss_mlp": 1.01688004, "epoch": 0.42813768224860965, "flos": 21481430077440.0, "grad_norm": 1.9672518694679362, "language_loss": 0.77832901, "learning_rate": 2.5551878607017385e-06, "loss": 0.80348361, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.2286377, "step": 7121, "time_per_iteration": 2.8971588611602783 }, { "auxiliary_loss_clip": 0.01450676, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.27390432, "balance_loss_mlp": 1.01625741, "epoch": 0.4281978055012776, "flos": 15677707432320.0, "grad_norm": 1.731754953481099, "language_loss": 0.86312211, "learning_rate": 2.554813694924126e-06, "loss": 0.88802516, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.23364258, "step": 7122, "time_per_iteration": 2.8223047256469727 }, { "auxiliary_loss_clip": 0.01462025, "auxiliary_loss_mlp": 0.01037242, "balance_loss_clip": 1.28247547, "balance_loss_mlp": 1.01491368, "epoch": 0.4282579287539456, "flos": 17720860464000.0, "grad_norm": 5.382276948508483, "language_loss": 0.8200078, "learning_rate": 2.554439508107921e-06, "loss": 0.84500051, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22338867, "step": 7123, "time_per_iteration": 2.8408985137939453 }, { "auxiliary_loss_clip": 0.01459769, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.28149426, "balance_loss_mlp": 1.01316607, "epoch": 0.42831805200661355, "flos": 19290488970240.0, "grad_norm": 1.5818329185226463, "language_loss": 0.81430256, "learning_rate": 2.5540653002673153e-06, "loss": 0.83925062, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.21862793, "step": 7124, "time_per_iteration": 2.876183271408081 }, { "auxiliary_loss_clip": 0.01461212, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.28026378, "balance_loss_mlp": 1.01808143, "epoch": 0.4283781752592815, "flos": 19802272613760.0, "grad_norm": 1.7350061051988042, "language_loss": 0.80686855, "learning_rate": 2.553691071416498e-06, "loss": 0.83190167, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.2401123, "step": 7125, "time_per_iteration": 2.871673583984375 }, { "auxiliary_loss_clip": 0.01448779, "auxiliary_loss_mlp": 0.01037301, "balance_loss_clip": 1.27112043, "balance_loss_mlp": 1.01523519, "epoch": 0.4284382985119495, "flos": 16516630114560.0, "grad_norm": 2.2128455848264434, "language_loss": 0.75949377, "learning_rate": 2.553316821569659e-06, "loss": 0.78435457, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.22058105, "step": 7126, "time_per_iteration": 2.8393664360046387 }, { "auxiliary_loss_clip": 0.01468541, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.2870419, "balance_loss_mlp": 1.01557136, "epoch": 0.42849842176461744, "flos": 23341069624320.0, "grad_norm": 1.630653779326275, "language_loss": 0.82348311, "learning_rate": 2.5529425507409913e-06, "loss": 0.84855235, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.22802734, "step": 7127, "time_per_iteration": 2.885897397994995 }, { "auxiliary_loss_clip": 0.01467856, "auxiliary_loss_mlp": 0.01040244, "balance_loss_clip": 1.28433836, "balance_loss_mlp": 1.01697433, "epoch": 0.4285585450172854, "flos": 17283242327040.0, "grad_norm": 2.5322786049146337, "language_loss": 0.76694316, "learning_rate": 2.5525682589446867e-06, "loss": 0.79202414, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.23278809, "step": 7128, "time_per_iteration": 2.9343371391296387 }, { "auxiliary_loss_clip": 0.01468104, "auxiliary_loss_mlp": 0.01039169, "balance_loss_clip": 1.28341174, "balance_loss_mlp": 1.01616192, "epoch": 0.42861866826995343, "flos": 24290199936000.0, "grad_norm": 2.5158771956410613, "language_loss": 0.74472439, "learning_rate": 2.552193946194937e-06, "loss": 0.76979715, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.22998047, "step": 7129, "time_per_iteration": 2.8772947788238525 }, { "auxiliary_loss_clip": 0.01462086, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.28205895, "balance_loss_mlp": 1.01398551, "epoch": 0.4286787915226214, "flos": 24363867749760.0, "grad_norm": 2.8738693730627185, "language_loss": 0.78946614, "learning_rate": 2.5518196125059394e-06, "loss": 0.81445193, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.22509766, "step": 7130, "time_per_iteration": 2.908754587173462 }, { "auxiliary_loss_clip": 0.01476166, "auxiliary_loss_mlp": 0.01042858, "balance_loss_clip": 1.29233265, "balance_loss_mlp": 1.01918304, "epoch": 0.42873891477528936, "flos": 15458287559040.0, "grad_norm": 2.2713513156266187, "language_loss": 0.74427652, "learning_rate": 2.551445257891886e-06, "loss": 0.76946676, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.23693848, "step": 7131, "time_per_iteration": 2.8172099590301514 }, { "auxiliary_loss_clip": 0.01477301, "auxiliary_loss_mlp": 0.01044301, "balance_loss_clip": 1.29410875, "balance_loss_mlp": 1.02147198, "epoch": 0.4287990380279573, "flos": 17648549994240.0, "grad_norm": 2.217428585682831, "language_loss": 0.78136009, "learning_rate": 2.551070882366973e-06, "loss": 0.80657613, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22827148, "step": 7132, "time_per_iteration": 2.828075647354126 }, { "auxiliary_loss_clip": 0.01472214, "auxiliary_loss_mlp": 0.01044373, "balance_loss_clip": 1.29017997, "balance_loss_mlp": 1.02150846, "epoch": 0.4288591612806253, "flos": 27173768728320.0, "grad_norm": 1.6607455826577207, "language_loss": 0.79082, "learning_rate": 2.550696485945397e-06, "loss": 0.81598592, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.2286377, "step": 7133, "time_per_iteration": 2.8752410411834717 }, { "auxiliary_loss_clip": 0.01476844, "auxiliary_loss_mlp": 0.010479, "balance_loss_clip": 1.29389429, "balance_loss_mlp": 1.02517915, "epoch": 0.42891928453329325, "flos": 17171496374400.0, "grad_norm": 1.9387117446939677, "language_loss": 0.755247, "learning_rate": 2.550322068641355e-06, "loss": 0.78049445, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.22705078, "step": 7134, "time_per_iteration": 2.8343286514282227 }, { "auxiliary_loss_clip": 0.01450531, "auxiliary_loss_mlp": 0.01039046, "balance_loss_clip": 1.27201891, "balance_loss_mlp": 1.01683748, "epoch": 0.4289794077859612, "flos": 18196194781440.0, "grad_norm": 1.8323264884324404, "language_loss": 0.8498317, "learning_rate": 2.5499476304690455e-06, "loss": 0.87472749, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22192383, "step": 7135, "time_per_iteration": 2.8087246417999268 }, { "auxiliary_loss_clip": 0.01450484, "auxiliary_loss_mlp": 0.01040393, "balance_loss_clip": 1.2734195, "balance_loss_mlp": 1.01861346, "epoch": 0.4290395310386292, "flos": 28268605854720.0, "grad_norm": 1.9514951797259767, "language_loss": 0.76305223, "learning_rate": 2.549573171442666e-06, "loss": 0.78796095, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.2175293, "step": 7136, "time_per_iteration": 2.8528990745544434 }, { "auxiliary_loss_clip": 0.01474231, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.28996634, "balance_loss_mlp": 1.01505589, "epoch": 0.42909965429129715, "flos": 16224537813120.0, "grad_norm": 2.9091493296078665, "language_loss": 0.80258286, "learning_rate": 2.5491986915764175e-06, "loss": 0.82769972, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.22387695, "step": 7137, "time_per_iteration": 2.7956466674804688 }, { "auxiliary_loss_clip": 0.01470202, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 1.28831506, "balance_loss_mlp": 1.01971722, "epoch": 0.4291597775439651, "flos": 23123369053440.0, "grad_norm": 1.8113042222793359, "language_loss": 0.77374321, "learning_rate": 2.548824190884499e-06, "loss": 0.79886425, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22192383, "step": 7138, "time_per_iteration": 2.881227731704712 }, { "auxiliary_loss_clip": 0.01263668, "auxiliary_loss_mlp": 0.01065081, "balance_loss_clip": 1.15347588, "balance_loss_mlp": 1.0353266, "epoch": 0.4292199007966331, "flos": 67576081852800.0, "grad_norm": 0.7923938618595718, "language_loss": 0.5628981, "learning_rate": 2.548449669381113e-06, "loss": 0.58618557, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 1.109375, "router_z_loss_mlp": 0.296875, "step": 7139, "time_per_iteration": 3.2196671962738037 }, { "auxiliary_loss_clip": 0.01446694, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.27130878, "balance_loss_mlp": 1.0189327, "epoch": 0.42928002404930105, "flos": 23009406105600.0, "grad_norm": 1.79417403514838, "language_loss": 0.81421208, "learning_rate": 2.5480751270804595e-06, "loss": 0.8390857, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.21740723, "step": 7140, "time_per_iteration": 2.902254819869995 }, { "auxiliary_loss_clip": 0.01479072, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.295017, "balance_loss_mlp": 1.01797783, "epoch": 0.429340147301969, "flos": 11551920641280.0, "grad_norm": 1.7983262637180595, "language_loss": 0.82850361, "learning_rate": 2.5477005639967424e-06, "loss": 0.85371482, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.24072266, "step": 7141, "time_per_iteration": 2.810974359512329 }, { "auxiliary_loss_clip": 0.01470063, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.28582716, "balance_loss_mlp": 1.02259135, "epoch": 0.42940027055463703, "flos": 25275915308160.0, "grad_norm": 1.6274572684533348, "language_loss": 0.87276649, "learning_rate": 2.547325980144166e-06, "loss": 0.89792341, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.23034668, "step": 7142, "time_per_iteration": 2.8382935523986816 }, { "auxiliary_loss_clip": 0.0146293, "auxiliary_loss_mlp": 0.01045689, "balance_loss_clip": 1.28620136, "balance_loss_mlp": 1.02303886, "epoch": 0.429460393807305, "flos": 23815408556160.0, "grad_norm": 1.9195663566826497, "language_loss": 0.78756875, "learning_rate": 2.5469513755369323e-06, "loss": 0.81265497, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.2265625, "step": 7143, "time_per_iteration": 4.24007248878479 }, { "auxiliary_loss_clip": 0.01482616, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.30109906, "balance_loss_mlp": 1.01956964, "epoch": 0.42952051705997296, "flos": 13926375233280.0, "grad_norm": 2.108181469895643, "language_loss": 0.78330338, "learning_rate": 2.5465767501892484e-06, "loss": 0.80852687, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20166016, "step": 7144, "time_per_iteration": 2.800973892211914 }, { "auxiliary_loss_clip": 0.01467771, "auxiliary_loss_mlp": 0.010416, "balance_loss_clip": 1.28640389, "balance_loss_mlp": 1.01867628, "epoch": 0.4295806403126409, "flos": 26771559287040.0, "grad_norm": 3.0085630947510675, "language_loss": 0.74766803, "learning_rate": 2.54620210411532e-06, "loss": 0.7727617, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.22924805, "step": 7145, "time_per_iteration": 2.873518943786621 }, { "auxiliary_loss_clip": 0.0146648, "auxiliary_loss_mlp": 0.01041336, "balance_loss_clip": 1.28429127, "balance_loss_mlp": 1.01931846, "epoch": 0.4296407635653089, "flos": 20960597473920.0, "grad_norm": 2.0135328580120184, "language_loss": 0.80101967, "learning_rate": 2.545827437329352e-06, "loss": 0.82609785, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22009277, "step": 7146, "time_per_iteration": 2.8143060207366943 }, { "auxiliary_loss_clip": 0.01460524, "auxiliary_loss_mlp": 0.0103969, "balance_loss_clip": 1.28193033, "balance_loss_mlp": 1.01845837, "epoch": 0.42970088681797686, "flos": 15860994693120.0, "grad_norm": 3.320080098259512, "language_loss": 0.84389567, "learning_rate": 2.5454527498455532e-06, "loss": 0.8688978, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21228027, "step": 7147, "time_per_iteration": 2.8341891765594482 }, { "auxiliary_loss_clip": 0.0146631, "auxiliary_loss_mlp": 0.01043344, "balance_loss_clip": 1.28674984, "balance_loss_mlp": 1.01982355, "epoch": 0.4297610100706448, "flos": 22392798963840.0, "grad_norm": 2.0797655236318056, "language_loss": 0.8778978, "learning_rate": 2.545078041678131e-06, "loss": 0.90299428, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.23522949, "step": 7148, "time_per_iteration": 2.866269588470459 }, { "auxiliary_loss_clip": 0.01456839, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.27749038, "balance_loss_mlp": 1.0170722, "epoch": 0.4298211333233128, "flos": 27936580377600.0, "grad_norm": 2.126902888038868, "language_loss": 0.78705764, "learning_rate": 2.5447033128412957e-06, "loss": 0.81202769, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.23083496, "step": 7149, "time_per_iteration": 2.9194178581237793 }, { "auxiliary_loss_clip": 0.01455679, "auxiliary_loss_mlp": 0.0104129, "balance_loss_clip": 1.2777282, "balance_loss_mlp": 1.01879513, "epoch": 0.42988125657598075, "flos": 24436268709120.0, "grad_norm": 1.7111751858188546, "language_loss": 0.8078301, "learning_rate": 2.544328563349256e-06, "loss": 0.83279979, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.22485352, "step": 7150, "time_per_iteration": 4.293681383132935 }, { "auxiliary_loss_clip": 0.01481611, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.29455614, "balance_loss_mlp": 1.01863885, "epoch": 0.4299413798286487, "flos": 15858280005120.0, "grad_norm": 1.8430121824220336, "language_loss": 0.75959623, "learning_rate": 2.5439537932162222e-06, "loss": 0.78484124, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 1.87011719, "router_z_loss_mlp": 0.24267578, "step": 7151, "time_per_iteration": 4.211425542831421 }, { "auxiliary_loss_clip": 0.01481874, "auxiliary_loss_mlp": 0.01041705, "balance_loss_clip": 1.29631197, "balance_loss_mlp": 1.01938868, "epoch": 0.4300015030813167, "flos": 22319628842880.0, "grad_norm": 1.7565109148156308, "language_loss": 0.7116462, "learning_rate": 2.543579002456406e-06, "loss": 0.73688197, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 1.85449219, "router_z_loss_mlp": 0.2232666, "step": 7152, "time_per_iteration": 2.8337037563323975 }, { "auxiliary_loss_clip": 0.01460169, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.27846849, "balance_loss_mlp": 1.01363266, "epoch": 0.43006162633398465, "flos": 34911205937280.0, "grad_norm": 1.5812781101012474, "language_loss": 0.72112632, "learning_rate": 2.54320419108402e-06, "loss": 0.7460869, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.22265625, "step": 7153, "time_per_iteration": 4.3419976234436035 }, { "auxiliary_loss_clip": 0.01455982, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.27578235, "balance_loss_mlp": 1.01518428, "epoch": 0.4301217495866526, "flos": 15970206936960.0, "grad_norm": 1.867013879317763, "language_loss": 0.79134244, "learning_rate": 2.542829359113276e-06, "loss": 0.81627786, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.22375488, "step": 7154, "time_per_iteration": 2.8343729972839355 }, { "auxiliary_loss_clip": 0.01457227, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.27785957, "balance_loss_mlp": 1.01412618, "epoch": 0.43018187283932063, "flos": 18779564977920.0, "grad_norm": 1.5379068748133056, "language_loss": 0.79804289, "learning_rate": 2.542454506558389e-06, "loss": 0.82296419, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.20776367, "step": 7155, "time_per_iteration": 2.791686773300171 }, { "auxiliary_loss_clip": 0.0144536, "auxiliary_loss_mlp": 0.01038001, "balance_loss_clip": 1.26919436, "balance_loss_mlp": 1.01568556, "epoch": 0.4302419960919886, "flos": 20159933909760.0, "grad_norm": 1.7634509702636543, "language_loss": 0.89600283, "learning_rate": 2.5420796334335723e-06, "loss": 0.92083645, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.22314453, "step": 7156, "time_per_iteration": 2.9325857162475586 }, { "auxiliary_loss_clip": 0.01472789, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.28958607, "balance_loss_mlp": 1.01574373, "epoch": 0.43030211934465656, "flos": 26444013045120.0, "grad_norm": 3.549259225605503, "language_loss": 0.84542787, "learning_rate": 2.541704739753042e-06, "loss": 0.87055081, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23779297, "step": 7157, "time_per_iteration": 2.8458869457244873 }, { "auxiliary_loss_clip": 0.01490471, "auxiliary_loss_mlp": 0.01041475, "balance_loss_clip": 1.3033973, "balance_loss_mlp": 1.01964784, "epoch": 0.43036224259732453, "flos": 24399909872640.0, "grad_norm": 1.6488935123692399, "language_loss": 0.72828031, "learning_rate": 2.5413298255310132e-06, "loss": 0.75359976, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.21826172, "step": 7158, "time_per_iteration": 2.8423197269439697 }, { "auxiliary_loss_clip": 0.01464972, "auxiliary_loss_mlp": 0.01045117, "balance_loss_clip": 1.28315854, "balance_loss_mlp": 1.02240765, "epoch": 0.4304223658499925, "flos": 17210569898880.0, "grad_norm": 1.915088365114028, "language_loss": 0.83956301, "learning_rate": 2.5409548907817034e-06, "loss": 0.8646639, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22729492, "step": 7159, "time_per_iteration": 2.8590245246887207 }, { "auxiliary_loss_clip": 0.0146295, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.28078723, "balance_loss_mlp": 1.01359403, "epoch": 0.43048248910266046, "flos": 14911230954240.0, "grad_norm": 2.6413441178456045, "language_loss": 0.84140277, "learning_rate": 2.54057993551933e-06, "loss": 0.86638862, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.22033691, "step": 7160, "time_per_iteration": 2.873429536819458 }, { "auxiliary_loss_clip": 0.01488972, "auxiliary_loss_mlp": 0.01040947, "balance_loss_clip": 1.30187678, "balance_loss_mlp": 1.01777315, "epoch": 0.4305426123553284, "flos": 21589782670080.0, "grad_norm": 1.955304100498663, "language_loss": 0.78322709, "learning_rate": 2.5402049597581116e-06, "loss": 0.80852628, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 1.87109375, "router_z_loss_mlp": 0.23193359, "step": 7161, "time_per_iteration": 2.826483964920044 }, { "auxiliary_loss_clip": 0.01464497, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.28136802, "balance_loss_mlp": 1.01399946, "epoch": 0.4306027356079964, "flos": 22611449675520.0, "grad_norm": 2.06949680006272, "language_loss": 0.73901868, "learning_rate": 2.5398299635122662e-06, "loss": 0.76402128, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.2175293, "step": 7162, "time_per_iteration": 2.827371835708618 }, { "auxiliary_loss_clip": 0.01255705, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.14774299, "balance_loss_mlp": 1.00763345, "epoch": 0.43066285886066435, "flos": 70699684164480.0, "grad_norm": 0.8087336402041331, "language_loss": 0.59115815, "learning_rate": 2.5394549467960147e-06, "loss": 0.61406422, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.2734375, "step": 7163, "time_per_iteration": 3.2481682300567627 }, { "auxiliary_loss_clip": 0.01448287, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.26974463, "balance_loss_mlp": 1.01253986, "epoch": 0.4307229821133323, "flos": 26730902194560.0, "grad_norm": 30.537524032787132, "language_loss": 0.80107653, "learning_rate": 2.5390799096235783e-06, "loss": 0.82589936, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.21459961, "step": 7164, "time_per_iteration": 2.8913636207580566 }, { "auxiliary_loss_clip": 0.01476977, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.29100204, "balance_loss_mlp": 1.01482654, "epoch": 0.4307831053660003, "flos": 26188279580160.0, "grad_norm": 1.8661804097467254, "language_loss": 0.69345951, "learning_rate": 2.538704852009177e-06, "loss": 0.71859425, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.21679688, "step": 7165, "time_per_iteration": 2.899526834487915 }, { "auxiliary_loss_clip": 0.01457735, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.27562296, "balance_loss_mlp": 1.02013206, "epoch": 0.43084322861866825, "flos": 18918032624640.0, "grad_norm": 2.766069566420504, "language_loss": 0.75847745, "learning_rate": 2.538329773967034e-06, "loss": 0.78348589, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.2298584, "step": 7166, "time_per_iteration": 2.824721336364746 }, { "auxiliary_loss_clip": 0.01452843, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.27408016, "balance_loss_mlp": 1.01490402, "epoch": 0.4309033518713362, "flos": 26444239269120.0, "grad_norm": 1.6760417028195063, "language_loss": 0.7287361, "learning_rate": 2.537954675511372e-06, "loss": 0.75363886, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.2253418, "step": 7167, "time_per_iteration": 3.0053017139434814 }, { "auxiliary_loss_clip": 0.01447732, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.27048302, "balance_loss_mlp": 1.01404214, "epoch": 0.43096347512400424, "flos": 21222710455680.0, "grad_norm": 1.6685022092669415, "language_loss": 0.79352212, "learning_rate": 2.537579556656414e-06, "loss": 0.81836677, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22692871, "step": 7168, "time_per_iteration": 2.860227346420288 }, { "auxiliary_loss_clip": 0.01450032, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.27068424, "balance_loss_mlp": 1.01765013, "epoch": 0.4310235983766722, "flos": 16547559575040.0, "grad_norm": 2.110619810157183, "language_loss": 0.8340044, "learning_rate": 2.537204417416387e-06, "loss": 0.85890841, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.22729492, "step": 7169, "time_per_iteration": 2.8525123596191406 }, { "auxiliary_loss_clip": 0.01252354, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.14466441, "balance_loss_mlp": 1.00304019, "epoch": 0.43108372162934017, "flos": 64805480622720.0, "grad_norm": 0.6714268412775781, "language_loss": 0.60833567, "learning_rate": 2.5368292578055132e-06, "loss": 0.6311509, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.26171875, "step": 7170, "time_per_iteration": 3.502706527709961 }, { "auxiliary_loss_clip": 0.01444287, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.26516986, "balance_loss_mlp": 1.01257837, "epoch": 0.43114384488200813, "flos": 13452081546240.0, "grad_norm": 3.677506397564554, "language_loss": 0.77064282, "learning_rate": 2.536454077838021e-06, "loss": 0.79543322, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22155762, "step": 7171, "time_per_iteration": 2.8230485916137695 }, { "auxiliary_loss_clip": 0.01444536, "auxiliary_loss_mlp": 0.0103707, "balance_loss_clip": 1.26610088, "balance_loss_mlp": 1.01456285, "epoch": 0.4312039681346761, "flos": 26297899027200.0, "grad_norm": 1.6313889753851782, "language_loss": 0.77950728, "learning_rate": 2.5360788775281357e-06, "loss": 0.80432332, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22509766, "step": 7172, "time_per_iteration": 2.898340940475464 }, { "auxiliary_loss_clip": 0.01456524, "auxiliary_loss_mlp": 0.01040366, "balance_loss_clip": 1.27372837, "balance_loss_mlp": 1.01560652, "epoch": 0.43126409138734406, "flos": 20386547706240.0, "grad_norm": 2.8694199877664524, "language_loss": 0.77817655, "learning_rate": 2.535703656890086e-06, "loss": 0.80314541, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.2479248, "step": 7173, "time_per_iteration": 2.8805859088897705 }, { "auxiliary_loss_clip": 0.01449976, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.27055526, "balance_loss_mlp": 1.01225305, "epoch": 0.431324214640012, "flos": 22132133815680.0, "grad_norm": 1.5039131823283207, "language_loss": 0.77074385, "learning_rate": 2.5353284159381e-06, "loss": 0.79560179, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.23571777, "step": 7174, "time_per_iteration": 2.9182989597320557 }, { "auxiliary_loss_clip": 0.01459536, "auxiliary_loss_mlp": 0.0103647, "balance_loss_clip": 1.27693129, "balance_loss_mlp": 1.01272357, "epoch": 0.43138433789268, "flos": 15239229644160.0, "grad_norm": 1.4520126121497234, "language_loss": 0.82924777, "learning_rate": 2.534953154686407e-06, "loss": 0.85420781, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.23742676, "step": 7175, "time_per_iteration": 2.865137815475464 }, { "auxiliary_loss_clip": 0.01472945, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.28433192, "balance_loss_mlp": 1.01584029, "epoch": 0.43144446114534796, "flos": 18159383496960.0, "grad_norm": 2.8337191227874823, "language_loss": 0.75042897, "learning_rate": 2.5345778731492366e-06, "loss": 0.77555752, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 1.88476562, "router_z_loss_mlp": 0.24084473, "step": 7176, "time_per_iteration": 2.7703304290771484 }, { "auxiliary_loss_clip": 0.01463936, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.27945364, "balance_loss_mlp": 1.01423073, "epoch": 0.4315045843980159, "flos": 22940172282240.0, "grad_norm": 1.7622965810288402, "language_loss": 0.74251199, "learning_rate": 2.534202571340819e-06, "loss": 0.76752603, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.2322998, "step": 7177, "time_per_iteration": 2.8688673973083496 }, { "auxiliary_loss_clip": 0.01493475, "auxiliary_loss_mlp": 0.01042067, "balance_loss_clip": 1.30081487, "balance_loss_mlp": 1.01739049, "epoch": 0.4315647076506839, "flos": 22136884519680.0, "grad_norm": 2.059031562814478, "language_loss": 0.82185721, "learning_rate": 2.533827249275387e-06, "loss": 0.84721261, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 1.92480469, "router_z_loss_mlp": 0.24682617, "step": 7178, "time_per_iteration": 4.238221645355225 }, { "auxiliary_loss_clip": 0.01444324, "auxiliary_loss_mlp": 0.01043032, "balance_loss_clip": 1.26918948, "balance_loss_mlp": 1.02008367, "epoch": 0.43162483090335185, "flos": 26882671812480.0, "grad_norm": 2.1142113328536887, "language_loss": 0.84566486, "learning_rate": 2.5334519069671725e-06, "loss": 0.87053841, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.22937012, "step": 7179, "time_per_iteration": 2.863978862762451 }, { "auxiliary_loss_clip": 0.01456261, "auxiliary_loss_mlp": 0.01039582, "balance_loss_clip": 1.27540207, "balance_loss_mlp": 1.01626468, "epoch": 0.4316849541560198, "flos": 13919859982080.0, "grad_norm": 1.79060715275152, "language_loss": 0.76440936, "learning_rate": 2.5330765444304075e-06, "loss": 0.78936779, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.23303223, "step": 7180, "time_per_iteration": 2.8634438514709473 }, { "auxiliary_loss_clip": 0.01463008, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.27842546, "balance_loss_mlp": 1.01764643, "epoch": 0.4317450774086878, "flos": 16443731462400.0, "grad_norm": 1.9332059848182244, "language_loss": 0.82280654, "learning_rate": 2.5327011616793274e-06, "loss": 0.84784997, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.23681641, "step": 7181, "time_per_iteration": 2.765810251235962 }, { "auxiliary_loss_clip": 0.01465641, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.28181672, "balance_loss_mlp": 1.02167523, "epoch": 0.4318052006613558, "flos": 20563817408640.0, "grad_norm": 1.7265897895892273, "language_loss": 0.89590466, "learning_rate": 2.532325758728165e-06, "loss": 0.92101353, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.23583984, "step": 7182, "time_per_iteration": 2.823988914489746 }, { "auxiliary_loss_clip": 0.01450608, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 1.2712152, "balance_loss_mlp": 1.01543868, "epoch": 0.43186532391402377, "flos": 22830236121600.0, "grad_norm": 1.7457539844124343, "language_loss": 0.76805341, "learning_rate": 2.5319503355911566e-06, "loss": 0.79293478, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.2208252, "step": 7183, "time_per_iteration": 2.8317618370056152 }, { "auxiliary_loss_clip": 0.01462922, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.2789557, "balance_loss_mlp": 1.0155884, "epoch": 0.43192544716669173, "flos": 25567509916800.0, "grad_norm": 1.903530138002767, "language_loss": 0.78173125, "learning_rate": 2.5315748922825393e-06, "loss": 0.80674088, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.2244873, "step": 7184, "time_per_iteration": 2.870814085006714 }, { "auxiliary_loss_clip": 0.01437387, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.26252174, "balance_loss_mlp": 1.01436472, "epoch": 0.4319855704193597, "flos": 30966534656640.0, "grad_norm": 1.532799240372351, "language_loss": 0.73850513, "learning_rate": 2.5311994288165474e-06, "loss": 0.76324683, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.22412109, "step": 7185, "time_per_iteration": 4.3542585372924805 }, { "auxiliary_loss_clip": 0.01459857, "auxiliary_loss_mlp": 0.01043507, "balance_loss_clip": 1.27512765, "balance_loss_mlp": 1.01865208, "epoch": 0.43204569367202766, "flos": 24248185499520.0, "grad_norm": 2.348599429974186, "language_loss": 0.76920176, "learning_rate": 2.530823945207421e-06, "loss": 0.79423541, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 1.84765625, "router_z_loss_mlp": 0.2487793, "step": 7186, "time_per_iteration": 4.242719650268555 }, { "auxiliary_loss_clip": 0.01457729, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.27617514, "balance_loss_mlp": 1.0164907, "epoch": 0.43210581692469563, "flos": 18416429061120.0, "grad_norm": 2.6814462154406886, "language_loss": 0.76762307, "learning_rate": 2.5304484414693962e-06, "loss": 0.79258537, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22009277, "step": 7187, "time_per_iteration": 2.8473987579345703 }, { "auxiliary_loss_clip": 0.01258641, "auxiliary_loss_mlp": 0.01055631, "balance_loss_clip": 1.15123272, "balance_loss_mlp": 1.03045392, "epoch": 0.4321659401773636, "flos": 49862388049920.0, "grad_norm": 0.85608708718504, "language_loss": 0.68325245, "learning_rate": 2.530072917616714e-06, "loss": 0.70639515, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.25195312, "step": 7188, "time_per_iteration": 3.3641107082366943 }, { "auxiliary_loss_clip": 0.01440386, "auxiliary_loss_mlp": 0.01037243, "balance_loss_clip": 1.26400781, "balance_loss_mlp": 1.01564193, "epoch": 0.43222606343003156, "flos": 17137354533120.0, "grad_norm": 1.9049255396570928, "language_loss": 0.78654552, "learning_rate": 2.529697373663614e-06, "loss": 0.81132174, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.21594238, "step": 7189, "time_per_iteration": 4.241934299468994 }, { "auxiliary_loss_clip": 0.01476422, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.28833687, "balance_loss_mlp": 1.01970482, "epoch": 0.4322861866826995, "flos": 22760866563840.0, "grad_norm": 1.7220154486559482, "language_loss": 0.72133297, "learning_rate": 2.5293218096243364e-06, "loss": 0.74652535, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 1.87988281, "router_z_loss_mlp": 0.23120117, "step": 7190, "time_per_iteration": 2.858563184738159 }, { "auxiliary_loss_clip": 0.01438597, "auxiliary_loss_mlp": 0.01038433, "balance_loss_clip": 1.26054597, "balance_loss_mlp": 1.01552129, "epoch": 0.4323463099353675, "flos": 27903524411520.0, "grad_norm": 1.4549260364841998, "language_loss": 0.80601561, "learning_rate": 2.5289462255131223e-06, "loss": 0.83078593, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.22900391, "step": 7191, "time_per_iteration": 2.900874137878418 }, { "auxiliary_loss_clip": 0.01435928, "auxiliary_loss_mlp": 0.01040313, "balance_loss_clip": 1.25794876, "balance_loss_mlp": 1.01740098, "epoch": 0.43240643318803546, "flos": 21624286469760.0, "grad_norm": 1.7165561907429043, "language_loss": 0.75745469, "learning_rate": 2.5285706213442146e-06, "loss": 0.78221709, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.22912598, "step": 7192, "time_per_iteration": 2.8999364376068115 }, { "auxiliary_loss_clip": 0.0144474, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.26535034, "balance_loss_mlp": 1.01158261, "epoch": 0.4324665564407034, "flos": 17565199793280.0, "grad_norm": 1.8506994345949868, "language_loss": 0.7957589, "learning_rate": 2.5281949971318557e-06, "loss": 0.82055604, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.23400879, "step": 7193, "time_per_iteration": 2.8175084590911865 }, { "auxiliary_loss_clip": 0.01443494, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.26212478, "balance_loss_mlp": 1.01211429, "epoch": 0.4325266796933714, "flos": 18410728216320.0, "grad_norm": 1.7975023968493737, "language_loss": 0.76384318, "learning_rate": 2.5278193528902897e-06, "loss": 0.78863257, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.23339844, "step": 7194, "time_per_iteration": 2.894209623336792 }, { "auxiliary_loss_clip": 0.01456527, "auxiliary_loss_mlp": 0.01041161, "balance_loss_clip": 1.27612448, "balance_loss_mlp": 1.01789153, "epoch": 0.4325868029460394, "flos": 22574955104640.0, "grad_norm": 2.3518663570300467, "language_loss": 0.60848236, "learning_rate": 2.5274436886337613e-06, "loss": 0.63345921, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.23278809, "step": 7195, "time_per_iteration": 2.8486146926879883 }, { "auxiliary_loss_clip": 0.01469873, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.28532648, "balance_loss_mlp": 1.01626134, "epoch": 0.43264692619870737, "flos": 14612216198400.0, "grad_norm": 2.09796925399397, "language_loss": 0.6568501, "learning_rate": 2.527068004376515e-06, "loss": 0.68196595, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.25463867, "step": 7196, "time_per_iteration": 2.8022654056549072 }, { "auxiliary_loss_clip": 0.01464764, "auxiliary_loss_mlp": 0.01043092, "balance_loss_clip": 1.28025246, "balance_loss_mlp": 1.01971519, "epoch": 0.43270704945137534, "flos": 21510233032320.0, "grad_norm": 2.159735741265863, "language_loss": 0.72921002, "learning_rate": 2.526692300132797e-06, "loss": 0.75428855, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.23400879, "step": 7197, "time_per_iteration": 2.8162379264831543 }, { "auxiliary_loss_clip": 0.01434823, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.25820196, "balance_loss_mlp": 1.01475942, "epoch": 0.4327671727040433, "flos": 25166884043520.0, "grad_norm": 1.4278633306108317, "language_loss": 0.73723245, "learning_rate": 2.5263165759168547e-06, "loss": 0.76197165, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.2434082, "step": 7198, "time_per_iteration": 2.8758440017700195 }, { "auxiliary_loss_clip": 0.01443324, "auxiliary_loss_mlp": 0.01035828, "balance_loss_clip": 1.26490593, "balance_loss_mlp": 1.0117712, "epoch": 0.43282729595671127, "flos": 25458071448960.0, "grad_norm": 1.757944557374797, "language_loss": 0.81556863, "learning_rate": 2.525940831742934e-06, "loss": 0.84036016, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.24072266, "step": 7199, "time_per_iteration": 2.927938461303711 }, { "auxiliary_loss_clip": 0.01452992, "auxiliary_loss_mlp": 0.01039977, "balance_loss_clip": 1.27408946, "balance_loss_mlp": 1.01841247, "epoch": 0.43288741920937923, "flos": 24134720244480.0, "grad_norm": 2.991178389092289, "language_loss": 0.69793463, "learning_rate": 2.525565067625286e-06, "loss": 0.72286439, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.21557617, "step": 7200, "time_per_iteration": 2.863464832305908 }, { "auxiliary_loss_clip": 0.01454647, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.27336419, "balance_loss_mlp": 1.01938581, "epoch": 0.4329475424620472, "flos": 19213427796480.0, "grad_norm": 2.0269881694742207, "language_loss": 0.87923634, "learning_rate": 2.525189283578157e-06, "loss": 0.90420085, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.22424316, "step": 7201, "time_per_iteration": 2.8250744342803955 }, { "auxiliary_loss_clip": 0.0148457, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.29628325, "balance_loss_mlp": 1.01350641, "epoch": 0.43300766571471516, "flos": 22648713408000.0, "grad_norm": 2.0582828518352656, "language_loss": 0.65718716, "learning_rate": 2.5248134796157974e-06, "loss": 0.68240219, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 1.88183594, "router_z_loss_mlp": 0.23413086, "step": 7202, "time_per_iteration": 2.822941541671753 }, { "auxiliary_loss_clip": 0.01438551, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.25939155, "balance_loss_mlp": 1.01416802, "epoch": 0.4330677889673831, "flos": 22129600106880.0, "grad_norm": 1.9070012782941839, "language_loss": 0.8280524, "learning_rate": 2.5244376557524586e-06, "loss": 0.85279417, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.21447754, "step": 7203, "time_per_iteration": 2.858931064605713 }, { "auxiliary_loss_clip": 0.01467585, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.2797749, "balance_loss_mlp": 1.01466942, "epoch": 0.4331279122200511, "flos": 23231857380480.0, "grad_norm": 2.017487532064701, "language_loss": 0.82657558, "learning_rate": 2.5240618120023912e-06, "loss": 0.85162079, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 1.87890625, "router_z_loss_mlp": 0.22290039, "step": 7204, "time_per_iteration": 2.825761079788208 }, { "auxiliary_loss_clip": 0.01451241, "auxiliary_loss_mlp": 0.01035203, "balance_loss_clip": 1.27050924, "balance_loss_mlp": 1.01256537, "epoch": 0.43318803547271906, "flos": 18268867209600.0, "grad_norm": 2.987585517509372, "language_loss": 0.74934691, "learning_rate": 2.5236859483798468e-06, "loss": 0.77421135, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.22631836, "step": 7205, "time_per_iteration": 2.825655937194824 }, { "auxiliary_loss_clip": 0.01437118, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.26254189, "balance_loss_mlp": 1.01168919, "epoch": 0.433248158725387, "flos": 27430090375680.0, "grad_norm": 2.4538834552919213, "language_loss": 0.75683594, "learning_rate": 2.5233100648990803e-06, "loss": 0.78155053, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.22631836, "step": 7206, "time_per_iteration": 2.922534465789795 }, { "auxiliary_loss_clip": 0.01442411, "auxiliary_loss_mlp": 0.01037551, "balance_loss_clip": 1.26399314, "balance_loss_mlp": 1.01485312, "epoch": 0.433308281978055, "flos": 23227423390080.0, "grad_norm": 1.746616035759977, "language_loss": 0.79804528, "learning_rate": 2.522934161574342e-06, "loss": 0.82284486, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22717285, "step": 7207, "time_per_iteration": 2.8545446395874023 }, { "auxiliary_loss_clip": 0.01470715, "auxiliary_loss_mlp": 0.01041447, "balance_loss_clip": 1.28476906, "balance_loss_mlp": 1.01853538, "epoch": 0.433368405230723, "flos": 15861085182720.0, "grad_norm": 1.7049487443937106, "language_loss": 0.81350589, "learning_rate": 2.5225582384198888e-06, "loss": 0.83862746, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 1.85742188, "router_z_loss_mlp": 0.22900391, "step": 7208, "time_per_iteration": 2.8545920848846436 }, { "auxiliary_loss_clip": 0.0145201, "auxiliary_loss_mlp": 0.01036187, "balance_loss_clip": 1.27243829, "balance_loss_mlp": 1.01458645, "epoch": 0.433428528483391, "flos": 19035434177280.0, "grad_norm": 2.2432331947440143, "language_loss": 0.7178297, "learning_rate": 2.5221822954499744e-06, "loss": 0.74271166, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.21606445, "step": 7209, "time_per_iteration": 2.801804780960083 }, { "auxiliary_loss_clip": 0.01445148, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.26599038, "balance_loss_mlp": 1.01607502, "epoch": 0.43348865173605894, "flos": 24729175416960.0, "grad_norm": 1.4243047574813827, "language_loss": 0.82213163, "learning_rate": 2.5218063326788557e-06, "loss": 0.8469767, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.23291016, "step": 7210, "time_per_iteration": 2.9203689098358154 }, { "auxiliary_loss_clip": 0.01458705, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.27802467, "balance_loss_mlp": 1.01947725, "epoch": 0.4335487749887269, "flos": 22100751907200.0, "grad_norm": 2.1004804772070695, "language_loss": 0.82723498, "learning_rate": 2.5214303501207885e-06, "loss": 0.85222578, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.2088623, "step": 7211, "time_per_iteration": 2.960676431655884 }, { "auxiliary_loss_clip": 0.01460535, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 1.27759957, "balance_loss_mlp": 1.01505446, "epoch": 0.43360889824139487, "flos": 22393206167040.0, "grad_norm": 1.8669610794742044, "language_loss": 0.76423126, "learning_rate": 2.521054347790029e-06, "loss": 0.78919899, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.21179199, "step": 7212, "time_per_iteration": 2.8220133781433105 }, { "auxiliary_loss_clip": 0.01454946, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.27569985, "balance_loss_mlp": 1.01685786, "epoch": 0.43366902149406283, "flos": 17536758796800.0, "grad_norm": 2.3307574793894266, "language_loss": 0.77331734, "learning_rate": 2.5206783257008375e-06, "loss": 0.79824167, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.20629883, "step": 7213, "time_per_iteration": 4.290258884429932 }, { "auxiliary_loss_clip": 0.01452492, "auxiliary_loss_mlp": 0.01038169, "balance_loss_clip": 1.27082276, "balance_loss_mlp": 1.017331, "epoch": 0.4337291447467308, "flos": 19031135921280.0, "grad_norm": 2.653131165822232, "language_loss": 0.65330589, "learning_rate": 2.520302283867471e-06, "loss": 0.67821252, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.20849609, "step": 7214, "time_per_iteration": 2.8240954875946045 }, { "auxiliary_loss_clip": 0.01439044, "auxiliary_loss_mlp": 0.01040349, "balance_loss_clip": 1.26443791, "balance_loss_mlp": 1.01910579, "epoch": 0.43378926799939876, "flos": 27245174302080.0, "grad_norm": 1.5769218407146535, "language_loss": 0.71788669, "learning_rate": 2.519926222304191e-06, "loss": 0.74268061, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.21240234, "step": 7215, "time_per_iteration": 2.8849449157714844 }, { "auxiliary_loss_clip": 0.014502, "auxiliary_loss_mlp": 0.01041405, "balance_loss_clip": 1.27251077, "balance_loss_mlp": 1.01956582, "epoch": 0.43384939125206673, "flos": 15969618754560.0, "grad_norm": 1.9391746025960457, "language_loss": 0.76006842, "learning_rate": 2.519550141025255e-06, "loss": 0.78498441, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21838379, "step": 7216, "time_per_iteration": 2.803990125656128 }, { "auxiliary_loss_clip": 0.01480728, "auxiliary_loss_mlp": 0.01039856, "balance_loss_clip": 1.29299068, "balance_loss_mlp": 1.01619291, "epoch": 0.4339095145047347, "flos": 21801782396160.0, "grad_norm": 2.530977357112367, "language_loss": 0.77407193, "learning_rate": 2.519174040044927e-06, "loss": 0.79927784, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 1.875, "router_z_loss_mlp": 0.23681641, "step": 7217, "time_per_iteration": 2.7782344818115234 }, { "auxiliary_loss_clip": 0.01458834, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.27726948, "balance_loss_mlp": 1.01714468, "epoch": 0.43396963775740266, "flos": 14217924597120.0, "grad_norm": 2.380846073570895, "language_loss": 0.74773538, "learning_rate": 2.5187979193774664e-06, "loss": 0.7727108, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21569824, "step": 7218, "time_per_iteration": 2.8044447898864746 }, { "auxiliary_loss_clip": 0.01457949, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.27574635, "balance_loss_mlp": 1.01317215, "epoch": 0.4340297610100706, "flos": 19728514310400.0, "grad_norm": 2.012487734199626, "language_loss": 0.70626867, "learning_rate": 2.5184217790371367e-06, "loss": 0.73119301, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.2130127, "step": 7219, "time_per_iteration": 2.814913511276245 }, { "auxiliary_loss_clip": 0.01447138, "auxiliary_loss_mlp": 0.01036357, "balance_loss_clip": 1.2692132, "balance_loss_mlp": 1.01584125, "epoch": 0.4340898842627386, "flos": 18962942728320.0, "grad_norm": 1.8173886012126734, "language_loss": 0.78339785, "learning_rate": 2.518045619038202e-06, "loss": 0.80823278, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.2052002, "step": 7220, "time_per_iteration": 4.40529727935791 }, { "auxiliary_loss_clip": 0.01446832, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.26829064, "balance_loss_mlp": 1.0159179, "epoch": 0.4341500075154066, "flos": 22028531927040.0, "grad_norm": 2.1977913592598384, "language_loss": 0.70491791, "learning_rate": 2.5176694393949243e-06, "loss": 0.72975421, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.20874023, "step": 7221, "time_per_iteration": 4.273157835006714 }, { "auxiliary_loss_clip": 0.0145886, "auxiliary_loss_mlp": 0.01039039, "balance_loss_clip": 1.27627766, "balance_loss_mlp": 1.01804662, "epoch": 0.4342101307680746, "flos": 23591871406080.0, "grad_norm": 4.127077782461437, "language_loss": 0.65580767, "learning_rate": 2.51729324012157e-06, "loss": 0.68078661, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.20983887, "step": 7222, "time_per_iteration": 2.876605749130249 }, { "auxiliary_loss_clip": 0.01460395, "auxiliary_loss_mlp": 0.01039732, "balance_loss_clip": 1.28013754, "balance_loss_mlp": 1.0177145, "epoch": 0.43427025402074254, "flos": 17977046376960.0, "grad_norm": 2.3577912232778067, "language_loss": 0.73944706, "learning_rate": 2.5169170212324053e-06, "loss": 0.76444829, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22021484, "step": 7223, "time_per_iteration": 4.165122985839844 }, { "auxiliary_loss_clip": 0.01466478, "auxiliary_loss_mlp": 0.01036615, "balance_loss_clip": 1.28325582, "balance_loss_mlp": 1.01505017, "epoch": 0.4343303772734105, "flos": 26297537068800.0, "grad_norm": 2.7004079351282733, "language_loss": 0.94515079, "learning_rate": 2.516540782741694e-06, "loss": 0.9701817, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.21557617, "step": 7224, "time_per_iteration": 2.850823402404785 }, { "auxiliary_loss_clip": 0.01447113, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.26829445, "balance_loss_mlp": 1.01733732, "epoch": 0.43439050052607847, "flos": 26845317590400.0, "grad_norm": 1.4472559503670315, "language_loss": 0.62238097, "learning_rate": 2.5161645246637056e-06, "loss": 0.6472277, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.20239258, "step": 7225, "time_per_iteration": 2.9088900089263916 }, { "auxiliary_loss_clip": 0.0145778, "auxiliary_loss_mlp": 0.01040632, "balance_loss_clip": 1.27742028, "balance_loss_mlp": 1.01980639, "epoch": 0.43445062377874644, "flos": 21407626529280.0, "grad_norm": 3.0722759939224438, "language_loss": 0.78985721, "learning_rate": 2.5157882470127054e-06, "loss": 0.81484139, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.20837402, "step": 7226, "time_per_iteration": 2.8238272666931152 }, { "auxiliary_loss_clip": 0.0143923, "auxiliary_loss_mlp": 0.01034114, "balance_loss_clip": 1.26445699, "balance_loss_mlp": 1.01338375, "epoch": 0.4345107470314144, "flos": 19911575347200.0, "grad_norm": 1.641214487040912, "language_loss": 0.85938835, "learning_rate": 2.515411949802964e-06, "loss": 0.88412178, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.20715332, "step": 7227, "time_per_iteration": 2.8198015689849854 }, { "auxiliary_loss_clip": 0.01449876, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.27221274, "balance_loss_mlp": 1.01448727, "epoch": 0.43457087028408237, "flos": 26443877310720.0, "grad_norm": 1.8572637256085816, "language_loss": 0.77458096, "learning_rate": 2.5150356330487498e-06, "loss": 0.79943907, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21459961, "step": 7228, "time_per_iteration": 2.858193874359131 }, { "auxiliary_loss_clip": 0.01443238, "auxiliary_loss_mlp": 0.01036147, "balance_loss_clip": 1.26723194, "balance_loss_mlp": 1.01458263, "epoch": 0.43463099353675033, "flos": 31881432637440.0, "grad_norm": 1.598486377591358, "language_loss": 0.81332064, "learning_rate": 2.5146592967643324e-06, "loss": 0.8381145, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21557617, "step": 7229, "time_per_iteration": 2.9410006999969482 }, { "auxiliary_loss_clip": 0.01456452, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.27736306, "balance_loss_mlp": 1.0178709, "epoch": 0.4346911167894183, "flos": 24581885034240.0, "grad_norm": 1.9523271890975424, "language_loss": 0.83005172, "learning_rate": 2.5142829409639834e-06, "loss": 0.85500449, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.20947266, "step": 7230, "time_per_iteration": 2.950418710708618 }, { "auxiliary_loss_clip": 0.01475328, "auxiliary_loss_mlp": 0.01042569, "balance_loss_clip": 1.29018688, "balance_loss_mlp": 1.02020526, "epoch": 0.43475124004208626, "flos": 17099095415040.0, "grad_norm": 2.3080174564885683, "language_loss": 0.78290355, "learning_rate": 2.513906565661973e-06, "loss": 0.80808258, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.22351074, "step": 7231, "time_per_iteration": 2.8339085578918457 }, { "auxiliary_loss_clip": 0.01440279, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.26504278, "balance_loss_mlp": 1.01269388, "epoch": 0.4348113632947542, "flos": 26115878620800.0, "grad_norm": 1.4851648702897655, "language_loss": 0.69339693, "learning_rate": 2.513530170872575e-06, "loss": 0.71812469, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.19799805, "step": 7232, "time_per_iteration": 2.903183937072754 }, { "auxiliary_loss_clip": 0.0146563, "auxiliary_loss_mlp": 0.01037998, "balance_loss_clip": 1.28291607, "balance_loss_mlp": 1.01502657, "epoch": 0.4348714865474222, "flos": 34212832162560.0, "grad_norm": 1.9060791084376187, "language_loss": 0.7248354, "learning_rate": 2.5131537566100605e-06, "loss": 0.74987167, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.22973633, "step": 7233, "time_per_iteration": 3.0739660263061523 }, { "auxiliary_loss_clip": 0.0147048, "auxiliary_loss_mlp": 0.01036184, "balance_loss_clip": 1.28678191, "balance_loss_mlp": 1.01497674, "epoch": 0.43493160980009016, "flos": 31549226181120.0, "grad_norm": 1.652442601171842, "language_loss": 0.75136667, "learning_rate": 2.5127773228887053e-06, "loss": 0.77643329, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.21203613, "step": 7234, "time_per_iteration": 2.9157984256744385 }, { "auxiliary_loss_clip": 0.01486707, "auxiliary_loss_mlp": 0.01041315, "balance_loss_clip": 1.29803896, "balance_loss_mlp": 1.01928544, "epoch": 0.4349917330527582, "flos": 24072046917120.0, "grad_norm": 1.9536236744841642, "language_loss": 0.59731674, "learning_rate": 2.512400869722782e-06, "loss": 0.62259698, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 1.88867188, "router_z_loss_mlp": 0.22009277, "step": 7235, "time_per_iteration": 2.8367245197296143 }, { "auxiliary_loss_clip": 0.01464457, "auxiliary_loss_mlp": 0.01034557, "balance_loss_clip": 1.28320837, "balance_loss_mlp": 1.01302791, "epoch": 0.43505185630542614, "flos": 30531721697280.0, "grad_norm": 1.8539179955552163, "language_loss": 0.77747208, "learning_rate": 2.512024397126566e-06, "loss": 0.80246222, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.21533203, "step": 7236, "time_per_iteration": 2.8806731700897217 }, { "auxiliary_loss_clip": 0.01446353, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.26887369, "balance_loss_mlp": 1.01401889, "epoch": 0.4351119795580941, "flos": 15741783348480.0, "grad_norm": 2.0889779575662355, "language_loss": 0.81650746, "learning_rate": 2.5116479051143345e-06, "loss": 0.84131688, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.20568848, "step": 7237, "time_per_iteration": 2.795853614807129 }, { "auxiliary_loss_clip": 0.0145068, "auxiliary_loss_mlp": 0.01038572, "balance_loss_clip": 1.2724973, "balance_loss_mlp": 1.01761532, "epoch": 0.4351721028107621, "flos": 18740400963840.0, "grad_norm": 1.4709505934868143, "language_loss": 0.63940883, "learning_rate": 2.5112713937003623e-06, "loss": 0.6643014, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.20947266, "step": 7238, "time_per_iteration": 2.8485748767852783 }, { "auxiliary_loss_clip": 0.01452674, "auxiliary_loss_mlp": 0.01040147, "balance_loss_clip": 1.27638865, "balance_loss_mlp": 1.01970279, "epoch": 0.43523222606343004, "flos": 25237384721280.0, "grad_norm": 1.709029408492606, "language_loss": 0.85933375, "learning_rate": 2.510894862898928e-06, "loss": 0.88426197, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.2043457, "step": 7239, "time_per_iteration": 2.8579494953155518 }, { "auxiliary_loss_clip": 0.01463901, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.28305161, "balance_loss_mlp": 1.01266658, "epoch": 0.435292349316098, "flos": 22718987861760.0, "grad_norm": 1.5776932938268091, "language_loss": 0.73314905, "learning_rate": 2.510518312724309e-06, "loss": 0.75811827, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.20349121, "step": 7240, "time_per_iteration": 2.8270397186279297 }, { "auxiliary_loss_clip": 0.01460926, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.27850711, "balance_loss_mlp": 1.01496339, "epoch": 0.43535247256876597, "flos": 25786975034880.0, "grad_norm": 2.454316610087799, "language_loss": 0.83157694, "learning_rate": 2.5101417431907842e-06, "loss": 0.85654783, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.21191406, "step": 7241, "time_per_iteration": 2.861924648284912 }, { "auxiliary_loss_clip": 0.01468563, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.28219187, "balance_loss_mlp": 1.01267374, "epoch": 0.43541259582143393, "flos": 17536577817600.0, "grad_norm": 2.764376800027208, "language_loss": 0.81178153, "learning_rate": 2.5097651543126345e-06, "loss": 0.8368057, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.21191406, "step": 7242, "time_per_iteration": 2.7843289375305176 }, { "auxiliary_loss_clip": 0.01468604, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.28300798, "balance_loss_mlp": 1.01366019, "epoch": 0.4354727190741019, "flos": 15203594724480.0, "grad_norm": 2.126107484002281, "language_loss": 0.70104694, "learning_rate": 2.509388546104138e-06, "loss": 0.72607207, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.20251465, "step": 7243, "time_per_iteration": 2.9414398670196533 }, { "auxiliary_loss_clip": 0.01453386, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.27645934, "balance_loss_mlp": 1.01488531, "epoch": 0.43553284232676986, "flos": 16656636084480.0, "grad_norm": 1.8080228502890152, "language_loss": 0.81729364, "learning_rate": 2.5090119185795766e-06, "loss": 0.84217304, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.19665527, "step": 7244, "time_per_iteration": 2.857069492340088 }, { "auxiliary_loss_clip": 0.01452023, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.2736423, "balance_loss_mlp": 1.01117039, "epoch": 0.43559296557943783, "flos": 23410439182080.0, "grad_norm": 1.7147545265074993, "language_loss": 0.73888588, "learning_rate": 2.508635271753234e-06, "loss": 0.76371622, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.19836426, "step": 7245, "time_per_iteration": 2.860724449157715 }, { "auxiliary_loss_clip": 0.01464699, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.28405309, "balance_loss_mlp": 1.01964712, "epoch": 0.4356530888321058, "flos": 22429248289920.0, "grad_norm": 1.6192488494429365, "language_loss": 0.78495371, "learning_rate": 2.508258605639389e-06, "loss": 0.80999994, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.20288086, "step": 7246, "time_per_iteration": 2.907010793685913 }, { "auxiliary_loss_clip": 0.01457153, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.27800512, "balance_loss_mlp": 1.0160625, "epoch": 0.43571321208477376, "flos": 21626051016960.0, "grad_norm": 1.865678192965393, "language_loss": 0.86513329, "learning_rate": 2.5078819202523275e-06, "loss": 0.89007759, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.2121582, "step": 7247, "time_per_iteration": 2.8645999431610107 }, { "auxiliary_loss_clip": 0.01468441, "auxiliary_loss_mlp": 0.01037833, "balance_loss_clip": 1.28749323, "balance_loss_mlp": 1.01668584, "epoch": 0.4357733353374418, "flos": 23997790920960.0, "grad_norm": 1.7112499758663562, "language_loss": 0.7293877, "learning_rate": 2.507505215606333e-06, "loss": 0.75445044, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.21154785, "step": 7248, "time_per_iteration": 4.282192945480347 }, { "auxiliary_loss_clip": 0.01465149, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.2860955, "balance_loss_mlp": 1.01385832, "epoch": 0.43583345859010975, "flos": 25275598594560.0, "grad_norm": 1.516199566134212, "language_loss": 0.87611353, "learning_rate": 2.5071284917156893e-06, "loss": 0.90110481, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.20117188, "step": 7249, "time_per_iteration": 2.8870763778686523 }, { "auxiliary_loss_clip": 0.0147458, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.29069901, "balance_loss_mlp": 1.02052176, "epoch": 0.4358935818427777, "flos": 23706874984320.0, "grad_norm": 1.864732020260216, "language_loss": 0.82488108, "learning_rate": 2.506751748594683e-06, "loss": 0.8500362, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.20410156, "step": 7250, "time_per_iteration": 2.8323020935058594 }, { "auxiliary_loss_clip": 0.01476222, "auxiliary_loss_mlp": 0.01034889, "balance_loss_clip": 1.29469824, "balance_loss_mlp": 1.01326442, "epoch": 0.4359537050954457, "flos": 29544603736320.0, "grad_norm": 2.0649875466068472, "language_loss": 0.86009431, "learning_rate": 2.5063749862575988e-06, "loss": 0.88520539, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21618652, "step": 7251, "time_per_iteration": 2.909106731414795 }, { "auxiliary_loss_clip": 0.01461672, "auxiliary_loss_mlp": 0.01040471, "balance_loss_clip": 1.28179586, "balance_loss_mlp": 1.01785696, "epoch": 0.43601382834811364, "flos": 22721612060160.0, "grad_norm": 1.844581942199752, "language_loss": 0.69988358, "learning_rate": 2.5059982047187245e-06, "loss": 0.72490501, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.22595215, "step": 7252, "time_per_iteration": 2.886782646179199 }, { "auxiliary_loss_clip": 0.01447263, "auxiliary_loss_mlp": 0.01035582, "balance_loss_clip": 1.27173221, "balance_loss_mlp": 1.01566219, "epoch": 0.4360739516007816, "flos": 19108016115840.0, "grad_norm": 1.6588738386835842, "language_loss": 0.84477925, "learning_rate": 2.505621403992348e-06, "loss": 0.86960769, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.19909668, "step": 7253, "time_per_iteration": 2.805887460708618 }, { "auxiliary_loss_clip": 0.01463839, "auxiliary_loss_mlp": 0.01035717, "balance_loss_clip": 1.28419495, "balance_loss_mlp": 1.01440239, "epoch": 0.43613407485344957, "flos": 23414918417280.0, "grad_norm": 1.5708163300614775, "language_loss": 0.71460873, "learning_rate": 2.505244584092757e-06, "loss": 0.73960423, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.21325684, "step": 7254, "time_per_iteration": 2.9536285400390625 }, { "auxiliary_loss_clip": 0.01448596, "auxiliary_loss_mlp": 0.01039337, "balance_loss_clip": 1.27267587, "balance_loss_mlp": 1.01859498, "epoch": 0.43619419810611754, "flos": 22647989491200.0, "grad_norm": 1.7666580781687167, "language_loss": 0.8227337, "learning_rate": 2.5048677450342406e-06, "loss": 0.8476131, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.20739746, "step": 7255, "time_per_iteration": 2.8537089824676514 }, { "auxiliary_loss_clip": 0.01472285, "auxiliary_loss_mlp": 0.0103694, "balance_loss_clip": 1.28967476, "balance_loss_mlp": 1.01685357, "epoch": 0.4362543213587855, "flos": 20057644120320.0, "grad_norm": 2.558512632804761, "language_loss": 0.77823347, "learning_rate": 2.504490886831089e-06, "loss": 0.80332577, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.20080566, "step": 7256, "time_per_iteration": 4.330026865005493 }, { "auxiliary_loss_clip": 0.01445765, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.27001202, "balance_loss_mlp": 1.01175761, "epoch": 0.43631444461145347, "flos": 21371312937600.0, "grad_norm": 1.511188513504407, "language_loss": 0.76617748, "learning_rate": 2.5041140094975922e-06, "loss": 0.7909674, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21459961, "step": 7257, "time_per_iteration": 2.840121269226074 }, { "auxiliary_loss_clip": 0.01460746, "auxiliary_loss_mlp": 0.01039905, "balance_loss_clip": 1.27919793, "balance_loss_mlp": 1.0179466, "epoch": 0.43637456786412143, "flos": 22428569617920.0, "grad_norm": 1.7307968248687207, "language_loss": 0.73704803, "learning_rate": 2.5037371130480417e-06, "loss": 0.76205444, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.21960449, "step": 7258, "time_per_iteration": 4.202048063278198 }, { "auxiliary_loss_clip": 0.01467685, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.28495336, "balance_loss_mlp": 1.02065933, "epoch": 0.4364346911167894, "flos": 28560652911360.0, "grad_norm": 1.8865507813133124, "language_loss": 0.77485031, "learning_rate": 2.5033601974967297e-06, "loss": 0.79994011, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.20654297, "step": 7259, "time_per_iteration": 2.8654587268829346 }, { "auxiliary_loss_clip": 0.01246906, "auxiliary_loss_mlp": 0.01019478, "balance_loss_clip": 1.14452434, "balance_loss_mlp": 0.99668515, "epoch": 0.43649481436945736, "flos": 62688524042880.0, "grad_norm": 0.7438014998269435, "language_loss": 0.57024562, "learning_rate": 2.5029832628579483e-06, "loss": 0.59290946, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.22753906, "step": 7260, "time_per_iteration": 3.3056180477142334 }, { "auxiliary_loss_clip": 0.01444548, "auxiliary_loss_mlp": 0.01040747, "balance_loss_clip": 1.26485288, "balance_loss_mlp": 1.01892018, "epoch": 0.4365549376221254, "flos": 30604484615040.0, "grad_norm": 2.2879742526547937, "language_loss": 0.72427654, "learning_rate": 2.5026063091459907e-06, "loss": 0.74912953, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.21826172, "step": 7261, "time_per_iteration": 2.9165055751800537 }, { "auxiliary_loss_clip": 0.01448917, "auxiliary_loss_mlp": 0.01039875, "balance_loss_clip": 1.2699945, "balance_loss_mlp": 1.01884604, "epoch": 0.43661506087479335, "flos": 17174889734400.0, "grad_norm": 1.8958679492158117, "language_loss": 0.70138067, "learning_rate": 2.5022293363751522e-06, "loss": 0.72626853, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21020508, "step": 7262, "time_per_iteration": 2.850949764251709 }, { "auxiliary_loss_clip": 0.01421375, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.25014138, "balance_loss_mlp": 1.01946735, "epoch": 0.4366751841274613, "flos": 22055841803520.0, "grad_norm": 1.7601594815285206, "language_loss": 0.80813116, "learning_rate": 2.501852344559726e-06, "loss": 0.8327359, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.19628906, "step": 7263, "time_per_iteration": 2.893087148666382 }, { "auxiliary_loss_clip": 0.0143975, "auxiliary_loss_mlp": 0.01042743, "balance_loss_clip": 1.26303291, "balance_loss_mlp": 1.02215552, "epoch": 0.4367353073801293, "flos": 16005751367040.0, "grad_norm": 2.003533480308807, "language_loss": 0.76256841, "learning_rate": 2.50147533371401e-06, "loss": 0.78739333, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.20568848, "step": 7264, "time_per_iteration": 2.8973236083984375 }, { "auxiliary_loss_clip": 0.01443187, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.26626992, "balance_loss_mlp": 1.01767683, "epoch": 0.43679543063279724, "flos": 38231849928960.0, "grad_norm": 1.7820417260261963, "language_loss": 0.62453628, "learning_rate": 2.501098303852298e-06, "loss": 0.64935732, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.21240234, "step": 7265, "time_per_iteration": 3.0061168670654297 }, { "auxiliary_loss_clip": 0.01441734, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.26464212, "balance_loss_mlp": 1.01214933, "epoch": 0.4368555538854652, "flos": 15201106260480.0, "grad_norm": 2.3847629309021814, "language_loss": 0.73773187, "learning_rate": 2.5007212549888884e-06, "loss": 0.76248288, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.2121582, "step": 7266, "time_per_iteration": 2.8127048015594482 }, { "auxiliary_loss_clip": 0.01446746, "auxiliary_loss_mlp": 0.01040862, "balance_loss_clip": 1.26863027, "balance_loss_mlp": 1.01972651, "epoch": 0.4369156771381332, "flos": 23077689788160.0, "grad_norm": 2.04715496559634, "language_loss": 0.83503985, "learning_rate": 2.5003441871380794e-06, "loss": 0.85991591, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21154785, "step": 7267, "time_per_iteration": 2.8533895015716553 }, { "auxiliary_loss_clip": 0.01433706, "auxiliary_loss_mlp": 0.01036016, "balance_loss_clip": 1.25986218, "balance_loss_mlp": 1.01528549, "epoch": 0.43697580039080114, "flos": 23451639212160.0, "grad_norm": 2.723722911866533, "language_loss": 0.75097692, "learning_rate": 2.4999671003141674e-06, "loss": 0.77567416, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.20727539, "step": 7268, "time_per_iteration": 2.84023118019104 }, { "auxiliary_loss_clip": 0.01445875, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.26695943, "balance_loss_mlp": 1.0155468, "epoch": 0.4370359236434691, "flos": 18523741023360.0, "grad_norm": 2.1991217589814838, "language_loss": 0.80503309, "learning_rate": 2.499589994531454e-06, "loss": 0.82986087, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21350098, "step": 7269, "time_per_iteration": 2.81695556640625 }, { "auxiliary_loss_clip": 0.01441429, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.26530099, "balance_loss_mlp": 1.0155437, "epoch": 0.43709604689613707, "flos": 23233078990080.0, "grad_norm": 2.4589123448996832, "language_loss": 0.75513119, "learning_rate": 2.499212869804237e-06, "loss": 0.77991998, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.21899414, "step": 7270, "time_per_iteration": 2.8459506034851074 }, { "auxiliary_loss_clip": 0.01453849, "auxiliary_loss_mlp": 0.01035044, "balance_loss_clip": 1.27427697, "balance_loss_mlp": 1.01384878, "epoch": 0.43715617014880503, "flos": 23813779743360.0, "grad_norm": 2.0771524980857374, "language_loss": 0.79803479, "learning_rate": 2.4988357261468182e-06, "loss": 0.82292378, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.21203613, "step": 7271, "time_per_iteration": 2.8872764110565186 }, { "auxiliary_loss_clip": 0.01240625, "auxiliary_loss_mlp": 0.01030707, "balance_loss_clip": 1.14078259, "balance_loss_mlp": 1.01106119, "epoch": 0.437216293401473, "flos": 61973201450880.0, "grad_norm": 0.7016885431934072, "language_loss": 0.54911906, "learning_rate": 2.4984585635734993e-06, "loss": 0.57183236, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 0.99609375, "router_z_loss_mlp": 0.19628906, "step": 7272, "time_per_iteration": 3.451115608215332 }, { "auxiliary_loss_clip": 0.01456745, "auxiliary_loss_mlp": 0.01039572, "balance_loss_clip": 1.27627611, "balance_loss_mlp": 1.01781678, "epoch": 0.43727641665414096, "flos": 21992580293760.0, "grad_norm": 1.6920170506978562, "language_loss": 0.71058053, "learning_rate": 2.498081382098581e-06, "loss": 0.73554367, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.21740723, "step": 7273, "time_per_iteration": 2.8276493549346924 }, { "auxiliary_loss_clip": 0.01450508, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.27047706, "balance_loss_mlp": 1.01527345, "epoch": 0.437336539906809, "flos": 39545473501440.0, "grad_norm": 2.123069428058822, "language_loss": 0.76993978, "learning_rate": 2.497704181736367e-06, "loss": 0.79481065, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.21313477, "step": 7274, "time_per_iteration": 3.0094635486602783 }, { "auxiliary_loss_clip": 0.0142809, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.25227571, "balance_loss_mlp": 1.01028109, "epoch": 0.43739666315947695, "flos": 17466077139840.0, "grad_norm": 1.9046044488873413, "language_loss": 0.80896145, "learning_rate": 2.49732696250116e-06, "loss": 0.83355051, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.2052002, "step": 7275, "time_per_iteration": 2.8493292331695557 }, { "auxiliary_loss_clip": 0.01444054, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.26820946, "balance_loss_mlp": 1.01414585, "epoch": 0.4374567864121449, "flos": 16366398819840.0, "grad_norm": 1.9982148612882, "language_loss": 0.81189704, "learning_rate": 2.496949724407266e-06, "loss": 0.83670092, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.22180176, "step": 7276, "time_per_iteration": 2.816157579421997 }, { "auxiliary_loss_clip": 0.01464957, "auxiliary_loss_mlp": 0.01033939, "balance_loss_clip": 1.28028202, "balance_loss_mlp": 1.01221871, "epoch": 0.4375169096648129, "flos": 30598693280640.0, "grad_norm": 1.8326460505798894, "language_loss": 0.7356683, "learning_rate": 2.496572467468988e-06, "loss": 0.76065725, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.21716309, "step": 7277, "time_per_iteration": 2.893683433532715 }, { "auxiliary_loss_clip": 0.01440424, "auxiliary_loss_mlp": 0.01036765, "balance_loss_clip": 1.26277375, "balance_loss_mlp": 1.01509297, "epoch": 0.43757703291748085, "flos": 30567944799360.0, "grad_norm": 1.9383331165489974, "language_loss": 0.73458445, "learning_rate": 2.4961951917006317e-06, "loss": 0.75935638, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.2166748, "step": 7278, "time_per_iteration": 2.916102170944214 }, { "auxiliary_loss_clip": 0.01439827, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.26402068, "balance_loss_mlp": 1.01511931, "epoch": 0.4376371561701488, "flos": 21407264570880.0, "grad_norm": 2.9668028761150618, "language_loss": 0.66841936, "learning_rate": 2.4958178971165046e-06, "loss": 0.69317734, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.20861816, "step": 7279, "time_per_iteration": 2.8302624225616455 }, { "auxiliary_loss_clip": 0.01468451, "auxiliary_loss_mlp": 0.01044569, "balance_loss_clip": 1.2837975, "balance_loss_mlp": 1.02221751, "epoch": 0.4376972794228168, "flos": 23414873172480.0, "grad_norm": 2.6564791931036282, "language_loss": 0.83113384, "learning_rate": 2.4954405837309126e-06, "loss": 0.856264, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 1.84667969, "router_z_loss_mlp": 0.22363281, "step": 7280, "time_per_iteration": 2.839761257171631 }, { "auxiliary_loss_clip": 0.01430095, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.25744009, "balance_loss_mlp": 1.01079965, "epoch": 0.43775740267548474, "flos": 22903134773760.0, "grad_norm": 1.6559866092433078, "language_loss": 0.77784175, "learning_rate": 2.4950632515581653e-06, "loss": 0.80246711, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21618652, "step": 7281, "time_per_iteration": 2.8494272232055664 }, { "auxiliary_loss_clip": 0.01449329, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.27014136, "balance_loss_mlp": 1.01426601, "epoch": 0.4378175259281527, "flos": 23304710787840.0, "grad_norm": 1.9429374273030813, "language_loss": 0.77531928, "learning_rate": 2.494685900612569e-06, "loss": 0.80016249, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.20727539, "step": 7282, "time_per_iteration": 2.8460309505462646 }, { "auxiliary_loss_clip": 0.01453597, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.27240682, "balance_loss_mlp": 1.01492763, "epoch": 0.43787764918082067, "flos": 23887040353920.0, "grad_norm": 2.501787870014843, "language_loss": 0.85920596, "learning_rate": 2.4943085309084333e-06, "loss": 0.88410753, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21630859, "step": 7283, "time_per_iteration": 4.259234189987183 }, { "auxiliary_loss_clip": 0.01464104, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.27974939, "balance_loss_mlp": 1.01588809, "epoch": 0.43793777243348864, "flos": 23998786306560.0, "grad_norm": 1.9467636863732452, "language_loss": 0.81549209, "learning_rate": 2.49393114246007e-06, "loss": 0.84051555, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.22338867, "step": 7284, "time_per_iteration": 2.8802502155303955 }, { "auxiliary_loss_clip": 0.01444316, "auxiliary_loss_mlp": 0.01040931, "balance_loss_clip": 1.26632595, "balance_loss_mlp": 1.01786399, "epoch": 0.4379978956861566, "flos": 18633134246400.0, "grad_norm": 2.43226106828972, "language_loss": 0.81770694, "learning_rate": 2.493553735281787e-06, "loss": 0.84255946, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.23059082, "step": 7285, "time_per_iteration": 2.8176732063293457 }, { "auxiliary_loss_clip": 0.0145666, "auxiliary_loss_mlp": 0.01032907, "balance_loss_clip": 1.27622366, "balance_loss_mlp": 1.01198554, "epoch": 0.43805801893882457, "flos": 21991494418560.0, "grad_norm": 2.101682044578097, "language_loss": 0.75498533, "learning_rate": 2.493176309387897e-06, "loss": 0.779881, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.20935059, "step": 7286, "time_per_iteration": 2.9041709899902344 }, { "auxiliary_loss_clip": 0.0145771, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.27436054, "balance_loss_mlp": 1.01463771, "epoch": 0.43811814219149253, "flos": 26403853645440.0, "grad_norm": 2.079649814246176, "language_loss": 0.74573505, "learning_rate": 2.492798864792712e-06, "loss": 0.77067494, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21643066, "step": 7287, "time_per_iteration": 2.957794427871704 }, { "auxiliary_loss_clip": 0.01458136, "auxiliary_loss_mlp": 0.01039796, "balance_loss_clip": 1.27682793, "balance_loss_mlp": 1.01771832, "epoch": 0.43817826544416055, "flos": 17502164507520.0, "grad_norm": 1.9479759123595193, "language_loss": 0.83205479, "learning_rate": 2.492421401510545e-06, "loss": 0.85703409, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.22058105, "step": 7288, "time_per_iteration": 2.815713882446289 }, { "auxiliary_loss_clip": 0.01463383, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.27918434, "balance_loss_mlp": 1.0137248, "epoch": 0.4382383886968285, "flos": 21591275748480.0, "grad_norm": 1.5743034588301685, "language_loss": 0.84749836, "learning_rate": 2.4920439195557093e-06, "loss": 0.87247801, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.20825195, "step": 7289, "time_per_iteration": 2.9344100952148438 }, { "auxiliary_loss_clip": 0.01484325, "auxiliary_loss_mlp": 0.01037717, "balance_loss_clip": 1.29832649, "balance_loss_mlp": 1.01660538, "epoch": 0.4382985119494965, "flos": 27934137158400.0, "grad_norm": 1.5192340254738526, "language_loss": 0.78852707, "learning_rate": 2.4916664189425183e-06, "loss": 0.81374753, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.21130371, "step": 7290, "time_per_iteration": 2.8868682384490967 }, { "auxiliary_loss_clip": 0.01455022, "auxiliary_loss_mlp": 0.01038524, "balance_loss_clip": 1.27491403, "balance_loss_mlp": 1.01772201, "epoch": 0.43835863520216445, "flos": 24947735639040.0, "grad_norm": 1.9745493325659746, "language_loss": 0.78523862, "learning_rate": 2.491288899685288e-06, "loss": 0.81017411, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.20800781, "step": 7291, "time_per_iteration": 5.8035078048706055 }, { "auxiliary_loss_clip": 0.01460133, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.27908885, "balance_loss_mlp": 1.01515639, "epoch": 0.4384187584548324, "flos": 33523235879040.0, "grad_norm": 1.653094262246065, "language_loss": 0.65559983, "learning_rate": 2.4909113617983325e-06, "loss": 0.68055868, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.20593262, "step": 7292, "time_per_iteration": 2.9235785007476807 }, { "auxiliary_loss_clip": 0.01473805, "auxiliary_loss_mlp": 0.01033172, "balance_loss_clip": 1.29024601, "balance_loss_mlp": 1.01149964, "epoch": 0.4384788817075004, "flos": 23961522574080.0, "grad_norm": 1.5140609048946356, "language_loss": 0.7550754, "learning_rate": 2.49053380529597e-06, "loss": 0.78014517, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.21679688, "step": 7293, "time_per_iteration": 4.248810768127441 }, { "auxiliary_loss_clip": 0.01465963, "auxiliary_loss_mlp": 0.01038112, "balance_loss_clip": 1.28584588, "balance_loss_mlp": 1.01667857, "epoch": 0.43853900496016834, "flos": 19107744647040.0, "grad_norm": 1.885246907312398, "language_loss": 0.80201018, "learning_rate": 2.490156230192516e-06, "loss": 0.82705092, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.2142334, "step": 7294, "time_per_iteration": 2.803452968597412 }, { "auxiliary_loss_clip": 0.01467752, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.28625619, "balance_loss_mlp": 1.01904869, "epoch": 0.4385991282128363, "flos": 13233340344960.0, "grad_norm": 1.6500728933971545, "language_loss": 0.73784626, "learning_rate": 2.4897786365022883e-06, "loss": 0.76292133, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.20703125, "step": 7295, "time_per_iteration": 2.803194761276245 }, { "auxiliary_loss_clip": 0.01479362, "auxiliary_loss_mlp": 0.01049906, "balance_loss_clip": 1.29499483, "balance_loss_mlp": 1.02757835, "epoch": 0.4386592514655043, "flos": 14328132226560.0, "grad_norm": 1.825078750184844, "language_loss": 0.76134253, "learning_rate": 2.4894010242396063e-06, "loss": 0.78663528, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.2232666, "step": 7296, "time_per_iteration": 2.878452777862549 }, { "auxiliary_loss_clip": 0.01460175, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.28013599, "balance_loss_mlp": 1.01532936, "epoch": 0.43871937471817224, "flos": 22794827425920.0, "grad_norm": 1.5750689633217083, "language_loss": 0.69699705, "learning_rate": 2.4890233934187873e-06, "loss": 0.72196352, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.21154785, "step": 7297, "time_per_iteration": 2.8298516273498535 }, { "auxiliary_loss_clip": 0.01458994, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.28016448, "balance_loss_mlp": 1.01814318, "epoch": 0.4387794979708402, "flos": 28083418312320.0, "grad_norm": 1.5003086472264369, "language_loss": 0.70694721, "learning_rate": 2.4886457440541535e-06, "loss": 0.73192739, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.2088623, "step": 7298, "time_per_iteration": 2.866270065307617 }, { "auxiliary_loss_clip": 0.01455592, "auxiliary_loss_mlp": 0.01036079, "balance_loss_clip": 1.27733231, "balance_loss_mlp": 1.01381063, "epoch": 0.43883962122350817, "flos": 26260590049920.0, "grad_norm": 1.5087363188883787, "language_loss": 0.7286793, "learning_rate": 2.4882680761600238e-06, "loss": 0.75359607, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.22290039, "step": 7299, "time_per_iteration": 2.8651773929595947 }, { "auxiliary_loss_clip": 0.01468831, "auxiliary_loss_mlp": 0.01039228, "balance_loss_clip": 1.28519011, "balance_loss_mlp": 1.01642346, "epoch": 0.43889974447617613, "flos": 25894467976320.0, "grad_norm": 1.692831211773142, "language_loss": 0.77580523, "learning_rate": 2.487890389750719e-06, "loss": 0.8008858, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.22790527, "step": 7300, "time_per_iteration": 2.889347553253174 }, { "auxiliary_loss_clip": 0.01461362, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.28041196, "balance_loss_mlp": 1.0179944, "epoch": 0.43895986772884416, "flos": 25057626554880.0, "grad_norm": 3.6764799913728217, "language_loss": 0.71890789, "learning_rate": 2.4875126848405626e-06, "loss": 0.74391222, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21081543, "step": 7301, "time_per_iteration": 2.967463731765747 }, { "auxiliary_loss_clip": 0.01478256, "auxiliary_loss_mlp": 0.01042865, "balance_loss_clip": 1.29572654, "balance_loss_mlp": 1.02089489, "epoch": 0.4390199909815121, "flos": 26005670991360.0, "grad_norm": 2.208914690990745, "language_loss": 0.71561551, "learning_rate": 2.4871349614438757e-06, "loss": 0.74082673, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.21960449, "step": 7302, "time_per_iteration": 2.8947505950927734 }, { "auxiliary_loss_clip": 0.01458356, "auxiliary_loss_mlp": 0.01043419, "balance_loss_clip": 1.27897322, "balance_loss_mlp": 1.02190161, "epoch": 0.4390801142341801, "flos": 29033951212800.0, "grad_norm": 1.6064428657511671, "language_loss": 0.82506627, "learning_rate": 2.486757219574983e-06, "loss": 0.85008395, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.21520996, "step": 7303, "time_per_iteration": 2.948883295059204 }, { "auxiliary_loss_clip": 0.01482098, "auxiliary_loss_mlp": 0.0104069, "balance_loss_clip": 1.29528975, "balance_loss_mlp": 1.01985252, "epoch": 0.43914023748684805, "flos": 33451785060480.0, "grad_norm": 2.134186872731007, "language_loss": 0.6980226, "learning_rate": 2.4863794592482067e-06, "loss": 0.72325051, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 1.86816406, "router_z_loss_mlp": 0.20861816, "step": 7304, "time_per_iteration": 2.9262404441833496 }, { "auxiliary_loss_clip": 0.01449861, "auxiliary_loss_mlp": 0.01038219, "balance_loss_clip": 1.27302098, "balance_loss_mlp": 1.01720262, "epoch": 0.439200360739516, "flos": 34545038618880.0, "grad_norm": 1.3595587217385068, "language_loss": 0.78708184, "learning_rate": 2.486001680477873e-06, "loss": 0.81196272, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.21020508, "step": 7305, "time_per_iteration": 2.9577412605285645 }, { "auxiliary_loss_clip": 0.01465125, "auxiliary_loss_mlp": 0.01040627, "balance_loss_clip": 1.28498924, "balance_loss_mlp": 1.01884699, "epoch": 0.439260483992184, "flos": 21917781360000.0, "grad_norm": 1.7414874662980093, "language_loss": 0.69169647, "learning_rate": 2.485623883278308e-06, "loss": 0.71675396, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21777344, "step": 7306, "time_per_iteration": 2.837794065475464 }, { "auxiliary_loss_clip": 0.01460824, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.2793982, "balance_loss_mlp": 1.02020013, "epoch": 0.43932060724485195, "flos": 21006321984000.0, "grad_norm": 1.8126321483119852, "language_loss": 0.63474822, "learning_rate": 2.4852460676638344e-06, "loss": 0.65978414, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.22558594, "step": 7307, "time_per_iteration": 2.81889009475708 }, { "auxiliary_loss_clip": 0.01470991, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.28688431, "balance_loss_mlp": 1.01996481, "epoch": 0.4393807304975199, "flos": 17755545242880.0, "grad_norm": 1.9648452593338537, "language_loss": 0.72768021, "learning_rate": 2.4848682336487828e-06, "loss": 0.75280684, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.21704102, "step": 7308, "time_per_iteration": 2.7916765213012695 }, { "auxiliary_loss_clip": 0.01461368, "auxiliary_loss_mlp": 0.01041255, "balance_loss_clip": 1.27679992, "balance_loss_mlp": 1.01933253, "epoch": 0.4394408537501879, "flos": 22538732002560.0, "grad_norm": 2.1683236417889615, "language_loss": 0.77347863, "learning_rate": 2.4844903812474787e-06, "loss": 0.79850489, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 1.84472656, "router_z_loss_mlp": 0.21936035, "step": 7309, "time_per_iteration": 2.8534719944000244 }, { "auxiliary_loss_clip": 0.01442829, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.26755834, "balance_loss_mlp": 1.01427317, "epoch": 0.43950097700285584, "flos": 23451096274560.0, "grad_norm": 1.806805410759518, "language_loss": 0.71513438, "learning_rate": 2.484112510474251e-06, "loss": 0.73992485, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21960449, "step": 7310, "time_per_iteration": 2.8820126056671143 }, { "auxiliary_loss_clip": 0.01479943, "auxiliary_loss_mlp": 0.01039007, "balance_loss_clip": 1.2947576, "balance_loss_mlp": 1.01760888, "epoch": 0.4395611002555238, "flos": 23189888188800.0, "grad_norm": 1.9953616556712859, "language_loss": 0.76691341, "learning_rate": 2.483734621343429e-06, "loss": 0.79210293, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 1.8515625, "router_z_loss_mlp": 0.21386719, "step": 7311, "time_per_iteration": 2.828118085861206 }, { "auxiliary_loss_clip": 0.01487476, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.30167198, "balance_loss_mlp": 1.02435684, "epoch": 0.43962122350819177, "flos": 22137517946880.0, "grad_norm": 2.106678331738435, "language_loss": 0.82711422, "learning_rate": 2.483356713869341e-06, "loss": 0.85244095, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.20837402, "step": 7312, "time_per_iteration": 2.913433790206909 }, { "auxiliary_loss_clip": 0.01460607, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.28130078, "balance_loss_mlp": 1.01749039, "epoch": 0.43968134676085974, "flos": 17429446834560.0, "grad_norm": 3.1319490910996497, "language_loss": 0.86618239, "learning_rate": 2.482978788066318e-06, "loss": 0.89116955, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.20629883, "step": 7313, "time_per_iteration": 2.8134238719940186 }, { "auxiliary_loss_clip": 0.01471337, "auxiliary_loss_mlp": 0.01043333, "balance_loss_clip": 1.28675854, "balance_loss_mlp": 1.02206564, "epoch": 0.43974147001352776, "flos": 18961856853120.0, "grad_norm": 1.851388154912592, "language_loss": 0.6853385, "learning_rate": 2.4826008439486904e-06, "loss": 0.71048516, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 1.84277344, "router_z_loss_mlp": 0.21252441, "step": 7314, "time_per_iteration": 2.8271877765655518 }, { "auxiliary_loss_clip": 0.01470993, "auxiliary_loss_mlp": 0.01043784, "balance_loss_clip": 1.28542328, "balance_loss_mlp": 1.02142024, "epoch": 0.4398015932661957, "flos": 18962942728320.0, "grad_norm": 1.8980336477172999, "language_loss": 0.77855361, "learning_rate": 2.4822228815307915e-06, "loss": 0.8037014, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22363281, "step": 7315, "time_per_iteration": 2.813729763031006 }, { "auxiliary_loss_clip": 0.01453105, "auxiliary_loss_mlp": 0.01041909, "balance_loss_clip": 1.27316523, "balance_loss_mlp": 1.02178645, "epoch": 0.4398617165188637, "flos": 24208750016640.0, "grad_norm": 2.6134878651206637, "language_loss": 0.75027937, "learning_rate": 2.4818449008269523e-06, "loss": 0.77522951, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.2010498, "step": 7316, "time_per_iteration": 2.878239393234253 }, { "auxiliary_loss_clip": 0.01461036, "auxiliary_loss_mlp": 0.01043728, "balance_loss_clip": 1.28076005, "balance_loss_mlp": 1.02191234, "epoch": 0.43992183977153165, "flos": 22246820680320.0, "grad_norm": 2.8219878672510434, "language_loss": 0.66170168, "learning_rate": 2.481466901851506e-06, "loss": 0.68674934, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.21813965, "step": 7317, "time_per_iteration": 2.8454365730285645 }, { "auxiliary_loss_clip": 0.01475159, "auxiliary_loss_mlp": 0.01042557, "balance_loss_clip": 1.29151237, "balance_loss_mlp": 1.02093256, "epoch": 0.4399819630241996, "flos": 18706666325760.0, "grad_norm": 2.047013528197896, "language_loss": 0.80749702, "learning_rate": 2.4810888846187865e-06, "loss": 0.83267415, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.21643066, "step": 7318, "time_per_iteration": 4.281556129455566 }, { "auxiliary_loss_clip": 0.0148374, "auxiliary_loss_mlp": 0.01044934, "balance_loss_clip": 1.29771817, "balance_loss_mlp": 1.0234288, "epoch": 0.4400420862768676, "flos": 23890162245120.0, "grad_norm": 1.510794318563486, "language_loss": 0.80283248, "learning_rate": 2.4807108491431283e-06, "loss": 0.82811916, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 1.86132812, "router_z_loss_mlp": 0.21520996, "step": 7319, "time_per_iteration": 2.855647563934326 }, { "auxiliary_loss_clip": 0.01464952, "auxiliary_loss_mlp": 0.01048759, "balance_loss_clip": 1.28410316, "balance_loss_mlp": 1.02646661, "epoch": 0.44010220952953555, "flos": 28049185981440.0, "grad_norm": 1.835760150995832, "language_loss": 0.81003159, "learning_rate": 2.4803327954388667e-06, "loss": 0.83516872, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.22277832, "step": 7320, "time_per_iteration": 2.897409200668335 }, { "auxiliary_loss_clip": 0.01462606, "auxiliary_loss_mlp": 0.01044273, "balance_loss_clip": 1.28107083, "balance_loss_mlp": 1.02301836, "epoch": 0.4401623327822035, "flos": 23779502167680.0, "grad_norm": 1.605227256998024, "language_loss": 0.70353973, "learning_rate": 2.4799547235203376e-06, "loss": 0.72860849, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.21276855, "step": 7321, "time_per_iteration": 2.8361306190490723 }, { "auxiliary_loss_clip": 0.01269605, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.16526961, "balance_loss_mlp": 1.00922585, "epoch": 0.4402224560348715, "flos": 70809864531840.0, "grad_norm": 0.8849412763665592, "language_loss": 0.56970537, "learning_rate": 2.4795766334018763e-06, "loss": 0.59277594, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.28320312, "step": 7322, "time_per_iteration": 3.4432075023651123 }, { "auxiliary_loss_clip": 0.01466172, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.2867651, "balance_loss_mlp": 1.02706611, "epoch": 0.44028257928753944, "flos": 22901686940160.0, "grad_norm": 2.552910071692215, "language_loss": 0.77165258, "learning_rate": 2.479198525097822e-06, "loss": 0.79678214, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.19714355, "step": 7323, "time_per_iteration": 2.875600814819336 }, { "auxiliary_loss_clip": 0.01470956, "auxiliary_loss_mlp": 0.01047383, "balance_loss_clip": 1.28863275, "balance_loss_mlp": 1.02630687, "epoch": 0.4403427025402074, "flos": 17905007376000.0, "grad_norm": 1.6179824207700244, "language_loss": 0.81295425, "learning_rate": 2.478820398622511e-06, "loss": 0.83813763, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.21057129, "step": 7324, "time_per_iteration": 2.8752851486206055 }, { "auxiliary_loss_clip": 0.01280319, "auxiliary_loss_mlp": 0.01048531, "balance_loss_clip": 1.17387342, "balance_loss_mlp": 1.01820374, "epoch": 0.4404028257928754, "flos": 69595408857600.0, "grad_norm": 0.6679495935086173, "language_loss": 0.54596478, "learning_rate": 2.478442253990283e-06, "loss": 0.56925333, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.30273438, "step": 7325, "time_per_iteration": 3.31561541557312 }, { "auxiliary_loss_clip": 0.0146834, "auxiliary_loss_mlp": 0.01040773, "balance_loss_clip": 1.28991163, "balance_loss_mlp": 1.02142537, "epoch": 0.44046294904554334, "flos": 20933604311040.0, "grad_norm": 1.6526656648177898, "language_loss": 0.70498526, "learning_rate": 2.4780640912154766e-06, "loss": 0.73007637, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.19348145, "step": 7326, "time_per_iteration": 4.273205518722534 }, { "auxiliary_loss_clip": 0.01457671, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.27975714, "balance_loss_mlp": 1.02456617, "epoch": 0.44052307229821136, "flos": 23634066821760.0, "grad_norm": 1.6635810824539783, "language_loss": 0.77039981, "learning_rate": 2.477685910312432e-06, "loss": 0.79543722, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.21496582, "step": 7327, "time_per_iteration": 2.86287260055542 }, { "auxiliary_loss_clip": 0.014645, "auxiliary_loss_mlp": 0.01043833, "balance_loss_clip": 1.28644288, "balance_loss_mlp": 1.02324533, "epoch": 0.4405831955508793, "flos": 17605223458560.0, "grad_norm": 1.9642939962902053, "language_loss": 0.84449703, "learning_rate": 2.4773077112954897e-06, "loss": 0.86958033, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.20593262, "step": 7328, "time_per_iteration": 4.242962598800659 }, { "auxiliary_loss_clip": 0.01464424, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.28637505, "balance_loss_mlp": 1.01734841, "epoch": 0.4406433188035473, "flos": 21471476221440.0, "grad_norm": 3.5023239265499373, "language_loss": 0.784356, "learning_rate": 2.4769294941789908e-06, "loss": 0.80938542, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.21166992, "step": 7329, "time_per_iteration": 3.0050745010375977 }, { "auxiliary_loss_clip": 0.01474705, "auxiliary_loss_mlp": 0.01039162, "balance_loss_clip": 1.29075146, "balance_loss_mlp": 1.01816869, "epoch": 0.44070344205621526, "flos": 22683579166080.0, "grad_norm": 2.507990785580898, "language_loss": 0.74544358, "learning_rate": 2.476551258977278e-06, "loss": 0.7705822, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 1.84082031, "router_z_loss_mlp": 0.20996094, "step": 7330, "time_per_iteration": 2.815157651901245 }, { "auxiliary_loss_clip": 0.01463145, "auxiliary_loss_mlp": 0.01037253, "balance_loss_clip": 1.28338706, "balance_loss_mlp": 1.01686835, "epoch": 0.4407635653088832, "flos": 23451774946560.0, "grad_norm": 1.9215610659451428, "language_loss": 0.75111032, "learning_rate": 2.4761730057046936e-06, "loss": 0.77611434, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.20385742, "step": 7331, "time_per_iteration": 2.881594657897949 }, { "auxiliary_loss_clip": 0.01455959, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.27834189, "balance_loss_mlp": 1.02235508, "epoch": 0.4408236885615512, "flos": 24031073111040.0, "grad_norm": 1.6134338705405693, "language_loss": 0.76716685, "learning_rate": 2.475794734375581e-06, "loss": 0.79215693, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.20690918, "step": 7332, "time_per_iteration": 2.8717565536499023 }, { "auxiliary_loss_clip": 0.0145772, "auxiliary_loss_mlp": 0.01041772, "balance_loss_clip": 1.27849817, "balance_loss_mlp": 1.02051747, "epoch": 0.44088381181421915, "flos": 12684338213760.0, "grad_norm": 1.6550566805881763, "language_loss": 0.74055439, "learning_rate": 2.475416445004285e-06, "loss": 0.7655493, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21252441, "step": 7333, "time_per_iteration": 2.7910070419311523 }, { "auxiliary_loss_clip": 0.01447035, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.27346575, "balance_loss_mlp": 1.01893115, "epoch": 0.4409439350668871, "flos": 24580120487040.0, "grad_norm": 1.6907011660081197, "language_loss": 0.80304903, "learning_rate": 2.4750381376051493e-06, "loss": 0.82791746, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20874023, "step": 7334, "time_per_iteration": 2.852411985397339 }, { "auxiliary_loss_clip": 0.01489043, "auxiliary_loss_mlp": 0.01044839, "balance_loss_clip": 1.29795182, "balance_loss_mlp": 1.02148545, "epoch": 0.4410040583195551, "flos": 22677290138880.0, "grad_norm": 1.996714422901873, "language_loss": 0.75871956, "learning_rate": 2.47465981219252e-06, "loss": 0.78405839, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 1.91210938, "router_z_loss_mlp": 0.23352051, "step": 7335, "time_per_iteration": 2.8505704402923584 }, { "auxiliary_loss_clip": 0.01454022, "auxiliary_loss_mlp": 0.01040767, "balance_loss_clip": 1.27384269, "balance_loss_mlp": 1.01941633, "epoch": 0.44106418157222305, "flos": 10859564424960.0, "grad_norm": 1.856549313641576, "language_loss": 0.73291951, "learning_rate": 2.4742814687807423e-06, "loss": 0.7578674, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21337891, "step": 7336, "time_per_iteration": 2.83628511428833 }, { "auxiliary_loss_clip": 0.01458408, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.27514219, "balance_loss_mlp": 1.02480078, "epoch": 0.441124304824891, "flos": 21736937318400.0, "grad_norm": 2.6059143752219054, "language_loss": 0.64886463, "learning_rate": 2.473903107384165e-06, "loss": 0.67390746, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.21081543, "step": 7337, "time_per_iteration": 2.810410499572754 }, { "auxiliary_loss_clip": 0.01257303, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.15471673, "balance_loss_mlp": 1.02363062, "epoch": 0.441184428077559, "flos": 63253407098880.0, "grad_norm": 0.7472164689826325, "language_loss": 0.5272482, "learning_rate": 2.473524728017134e-06, "loss": 0.55023777, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.18066406, "step": 7338, "time_per_iteration": 3.3686940670013428 }, { "auxiliary_loss_clip": 0.01475106, "auxiliary_loss_mlp": 0.01049788, "balance_loss_clip": 1.28745985, "balance_loss_mlp": 1.0269711, "epoch": 0.44124455133022694, "flos": 21187663718400.0, "grad_norm": 2.0496133249125594, "language_loss": 0.71574116, "learning_rate": 2.473146330693997e-06, "loss": 0.7409901, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 1.87597656, "router_z_loss_mlp": 0.22814941, "step": 7339, "time_per_iteration": 2.831730842590332 }, { "auxiliary_loss_clip": 0.01438831, "auxiliary_loss_mlp": 0.0104328, "balance_loss_clip": 1.26572466, "balance_loss_mlp": 1.02263284, "epoch": 0.4413046745828949, "flos": 17466982035840.0, "grad_norm": 1.5021063426106052, "language_loss": 0.70219815, "learning_rate": 2.472767915429105e-06, "loss": 0.72701931, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20654297, "step": 7340, "time_per_iteration": 2.883413076400757 }, { "auxiliary_loss_clip": 0.0126265, "auxiliary_loss_mlp": 0.01041146, "balance_loss_clip": 1.15931106, "balance_loss_mlp": 1.02254987, "epoch": 0.4413647978355629, "flos": 61611133426560.0, "grad_norm": 0.8983563546049733, "language_loss": 0.64051318, "learning_rate": 2.4723894822368054e-06, "loss": 0.66355109, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.18554688, "step": 7341, "time_per_iteration": 3.124307155609131 }, { "auxiliary_loss_clip": 0.01452173, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.27094555, "balance_loss_mlp": 1.01565969, "epoch": 0.4414249210882309, "flos": 27538488213120.0, "grad_norm": 2.125804942496564, "language_loss": 0.74457479, "learning_rate": 2.47201103113145e-06, "loss": 0.76946747, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21435547, "step": 7342, "time_per_iteration": 2.953056812286377 }, { "auxiliary_loss_clip": 0.01443644, "auxiliary_loss_mlp": 0.01041507, "balance_loss_clip": 1.26534212, "balance_loss_mlp": 1.02007353, "epoch": 0.44148504434089886, "flos": 23524537864320.0, "grad_norm": 1.845314145359759, "language_loss": 0.81025469, "learning_rate": 2.4716325621273886e-06, "loss": 0.83510619, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.2142334, "step": 7343, "time_per_iteration": 2.91367506980896 }, { "auxiliary_loss_clip": 0.01449109, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.26960182, "balance_loss_mlp": 1.01653421, "epoch": 0.4415451675935668, "flos": 21590732810880.0, "grad_norm": 8.218235463187373, "language_loss": 0.77606988, "learning_rate": 2.4712540752389725e-06, "loss": 0.80093694, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.21069336, "step": 7344, "time_per_iteration": 2.851857900619507 }, { "auxiliary_loss_clip": 0.01266231, "auxiliary_loss_mlp": 0.01024753, "balance_loss_clip": 1.16291738, "balance_loss_mlp": 1.00720561, "epoch": 0.4416052908462348, "flos": 59033456599680.0, "grad_norm": 0.7911190885519438, "language_loss": 0.63819629, "learning_rate": 2.470875570480556e-06, "loss": 0.66110611, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.17578125, "step": 7345, "time_per_iteration": 3.040727138519287 }, { "auxiliary_loss_clip": 0.01452566, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.27200794, "balance_loss_mlp": 1.01465261, "epoch": 0.44166541409890275, "flos": 26368354460160.0, "grad_norm": 1.7188739811738007, "language_loss": 0.86124289, "learning_rate": 2.470497047866489e-06, "loss": 0.88614142, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22619629, "step": 7346, "time_per_iteration": 2.9145097732543945 }, { "auxiliary_loss_clip": 0.0146037, "auxiliary_loss_mlp": 0.01038932, "balance_loss_clip": 1.27898324, "balance_loss_mlp": 1.01654434, "epoch": 0.4417255373515707, "flos": 20202084080640.0, "grad_norm": 1.6548115070186007, "language_loss": 0.80643398, "learning_rate": 2.470118507411128e-06, "loss": 0.83142698, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.22387695, "step": 7347, "time_per_iteration": 2.8458476066589355 }, { "auxiliary_loss_clip": 0.014561, "auxiliary_loss_mlp": 0.01037647, "balance_loss_clip": 1.27440739, "balance_loss_mlp": 1.01452017, "epoch": 0.4417856606042387, "flos": 17895098764800.0, "grad_norm": 1.8212717925722746, "language_loss": 0.83972061, "learning_rate": 2.4697399491288263e-06, "loss": 0.86465812, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.2310791, "step": 7348, "time_per_iteration": 2.8834269046783447 }, { "auxiliary_loss_clip": 0.01472188, "auxiliary_loss_mlp": 0.01041202, "balance_loss_clip": 1.2882005, "balance_loss_mlp": 1.01854026, "epoch": 0.44184578385690665, "flos": 27975427678080.0, "grad_norm": 1.7492855049391907, "language_loss": 0.71505284, "learning_rate": 2.469361373033938e-06, "loss": 0.74018669, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.2265625, "step": 7349, "time_per_iteration": 2.9200048446655273 }, { "auxiliary_loss_clip": 0.01461134, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.27941549, "balance_loss_mlp": 1.01520967, "epoch": 0.4419059071095746, "flos": 23378378601600.0, "grad_norm": 1.8278860292305237, "language_loss": 0.75336695, "learning_rate": 2.468982779140819e-06, "loss": 0.77835214, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.22167969, "step": 7350, "time_per_iteration": 2.899661064147949 }, { "auxiliary_loss_clip": 0.01460248, "auxiliary_loss_mlp": 0.01043885, "balance_loss_clip": 1.27958548, "balance_loss_mlp": 1.02093697, "epoch": 0.4419660303622426, "flos": 15020352708480.0, "grad_norm": 2.245050682533336, "language_loss": 0.82610518, "learning_rate": 2.468604167463827e-06, "loss": 0.85114658, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.22961426, "step": 7351, "time_per_iteration": 2.8251559734344482 }, { "auxiliary_loss_clip": 0.01439036, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.26537824, "balance_loss_mlp": 1.01350021, "epoch": 0.44202615361491054, "flos": 25382005660800.0, "grad_norm": 1.6173226283515463, "language_loss": 0.73927057, "learning_rate": 2.4682255380173176e-06, "loss": 0.7640084, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.21252441, "step": 7352, "time_per_iteration": 2.898984909057617 }, { "auxiliary_loss_clip": 0.01470095, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.28961325, "balance_loss_mlp": 1.0167774, "epoch": 0.4420862768675785, "flos": 24691866439680.0, "grad_norm": 1.8726812446777494, "language_loss": 0.87981451, "learning_rate": 2.467846890815649e-06, "loss": 0.90490031, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.21704102, "step": 7353, "time_per_iteration": 4.32417893409729 }, { "auxiliary_loss_clip": 0.01466424, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.28474295, "balance_loss_mlp": 1.01695478, "epoch": 0.44214640012024653, "flos": 19535725641600.0, "grad_norm": 2.7001572308273265, "language_loss": 0.77278095, "learning_rate": 2.4674682258731795e-06, "loss": 0.79782587, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.21105957, "step": 7354, "time_per_iteration": 2.8378207683563232 }, { "auxiliary_loss_clip": 0.01449626, "auxiliary_loss_mlp": 0.01041185, "balance_loss_clip": 1.2744031, "balance_loss_mlp": 1.01913118, "epoch": 0.4422065233729145, "flos": 47574052871040.0, "grad_norm": 1.9778980420011554, "language_loss": 0.65880609, "learning_rate": 2.467089543204268e-06, "loss": 0.68371427, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.22070312, "step": 7355, "time_per_iteration": 3.060670852661133 }, { "auxiliary_loss_clip": 0.01474169, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.2880398, "balance_loss_mlp": 1.01497722, "epoch": 0.44226664662558246, "flos": 19290353235840.0, "grad_norm": 1.7829739777635252, "language_loss": 0.78827536, "learning_rate": 2.466710842823274e-06, "loss": 0.81338573, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 0.21911621, "step": 7356, "time_per_iteration": 2.8754546642303467 }, { "auxiliary_loss_clip": 0.01468701, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.28466749, "balance_loss_mlp": 1.01550305, "epoch": 0.4423267698782504, "flos": 17830796624640.0, "grad_norm": 2.0546082264404393, "language_loss": 0.78200567, "learning_rate": 2.4663321247445577e-06, "loss": 0.80707616, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 1.83886719, "router_z_loss_mlp": 0.22851562, "step": 7357, "time_per_iteration": 2.8315320014953613 }, { "auxiliary_loss_clip": 0.01465888, "auxiliary_loss_mlp": 0.01043469, "balance_loss_clip": 1.28540373, "balance_loss_mlp": 1.02158177, "epoch": 0.4423868931309184, "flos": 29216107353600.0, "grad_norm": 1.5107866084595705, "language_loss": 0.74008155, "learning_rate": 2.465953388982481e-06, "loss": 0.7651751, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.21911621, "step": 7358, "time_per_iteration": 2.9500234127044678 }, { "auxiliary_loss_clip": 0.01471075, "auxiliary_loss_mlp": 0.01047788, "balance_loss_clip": 1.28953481, "balance_loss_mlp": 1.02432728, "epoch": 0.44244701638358636, "flos": 29724407147520.0, "grad_norm": 1.6005916053442713, "language_loss": 0.76076102, "learning_rate": 2.465574635551405e-06, "loss": 0.78594965, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23449707, "step": 7359, "time_per_iteration": 2.8910272121429443 }, { "auxiliary_loss_clip": 0.01462637, "auxiliary_loss_mlp": 0.01042039, "balance_loss_clip": 1.28419518, "balance_loss_mlp": 1.01978266, "epoch": 0.4425071396362543, "flos": 22940398506240.0, "grad_norm": 1.6986971435865195, "language_loss": 0.70742065, "learning_rate": 2.4651958644656923e-06, "loss": 0.73246741, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22253418, "step": 7360, "time_per_iteration": 2.872999429702759 }, { "auxiliary_loss_clip": 0.01472784, "auxiliary_loss_mlp": 0.01042628, "balance_loss_clip": 1.29071569, "balance_loss_mlp": 1.02014494, "epoch": 0.4425672628889223, "flos": 19801910655360.0, "grad_norm": 2.228508446786744, "language_loss": 0.70888543, "learning_rate": 2.4648170757397053e-06, "loss": 0.73403955, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.22485352, "step": 7361, "time_per_iteration": 5.574120283126831 }, { "auxiliary_loss_clip": 0.01463017, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.28110409, "balance_loss_mlp": 1.01586056, "epoch": 0.44262738614159025, "flos": 13670053585920.0, "grad_norm": 2.024176807928691, "language_loss": 0.83204174, "learning_rate": 2.464438269387809e-06, "loss": 0.85705799, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22753906, "step": 7362, "time_per_iteration": 2.895024061203003 }, { "auxiliary_loss_clip": 0.01478986, "auxiliary_loss_mlp": 0.01042545, "balance_loss_clip": 1.29130363, "balance_loss_mlp": 1.02013338, "epoch": 0.4426875093942582, "flos": 14218377045120.0, "grad_norm": 1.7485533923596728, "language_loss": 0.75234842, "learning_rate": 2.464059445424366e-06, "loss": 0.77756375, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 1.87695312, "router_z_loss_mlp": 0.22412109, "step": 7363, "time_per_iteration": 4.201362609863281 }, { "auxiliary_loss_clip": 0.01269106, "auxiliary_loss_mlp": 0.01023492, "balance_loss_clip": 1.16477394, "balance_loss_mlp": 1.00127113, "epoch": 0.4427476326469262, "flos": 70152555052800.0, "grad_norm": 0.6853232657375016, "language_loss": 0.55740917, "learning_rate": 2.463680603863743e-06, "loss": 0.58033514, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.22265625, "step": 7364, "time_per_iteration": 3.434938430786133 }, { "auxiliary_loss_clip": 0.01449067, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.27170217, "balance_loss_mlp": 1.01632428, "epoch": 0.44280775589959415, "flos": 25455447250560.0, "grad_norm": 1.641036338652434, "language_loss": 0.75919253, "learning_rate": 2.463301744720305e-06, "loss": 0.78406709, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.22070312, "step": 7365, "time_per_iteration": 2.9378321170806885 }, { "auxiliary_loss_clip": 0.01451925, "auxiliary_loss_mlp": 0.01038344, "balance_loss_clip": 1.27342212, "balance_loss_mlp": 1.01545608, "epoch": 0.4428678791522621, "flos": 22867590343680.0, "grad_norm": 1.7031800191827613, "language_loss": 0.74966252, "learning_rate": 2.4629228680084184e-06, "loss": 0.77456522, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22888184, "step": 7366, "time_per_iteration": 2.8703179359436035 }, { "auxiliary_loss_clip": 0.01464791, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.28482521, "balance_loss_mlp": 1.01695085, "epoch": 0.44292800240493013, "flos": 25823560095360.0, "grad_norm": 4.8376980688938245, "language_loss": 0.7413125, "learning_rate": 2.46254397374245e-06, "loss": 0.7663554, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22546387, "step": 7367, "time_per_iteration": 2.900465726852417 }, { "auxiliary_loss_clip": 0.01453887, "auxiliary_loss_mlp": 0.01046054, "balance_loss_clip": 1.27389717, "balance_loss_mlp": 1.02423906, "epoch": 0.4429881256575981, "flos": 32429846586240.0, "grad_norm": 1.4116551147886955, "language_loss": 0.74712372, "learning_rate": 2.4621650619367677e-06, "loss": 0.7721231, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.21813965, "step": 7368, "time_per_iteration": 2.933173418045044 }, { "auxiliary_loss_clip": 0.01460443, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.28170919, "balance_loss_mlp": 1.02138209, "epoch": 0.44304824891026606, "flos": 22173831538560.0, "grad_norm": 1.581221177181286, "language_loss": 0.80540216, "learning_rate": 2.4617861326057403e-06, "loss": 0.830441, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.22058105, "step": 7369, "time_per_iteration": 2.8809752464294434 }, { "auxiliary_loss_clip": 0.01449686, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.27189767, "balance_loss_mlp": 1.01682854, "epoch": 0.443108372162934, "flos": 25349673611520.0, "grad_norm": 1.8809713624867983, "language_loss": 0.73437309, "learning_rate": 2.461407185763737e-06, "loss": 0.75925088, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.21240234, "step": 7370, "time_per_iteration": 2.9148974418640137 }, { "auxiliary_loss_clip": 0.01445296, "auxiliary_loss_mlp": 0.01041548, "balance_loss_clip": 1.26748645, "balance_loss_mlp": 1.01916099, "epoch": 0.443168495415602, "flos": 23341295848320.0, "grad_norm": 1.7838724679952747, "language_loss": 0.71681035, "learning_rate": 2.461028221425126e-06, "loss": 0.74167883, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.22387695, "step": 7371, "time_per_iteration": 2.878207206726074 }, { "auxiliary_loss_clip": 0.01438947, "auxiliary_loss_mlp": 0.01040373, "balance_loss_clip": 1.26162624, "balance_loss_mlp": 1.01886725, "epoch": 0.44322861866826996, "flos": 21881467768320.0, "grad_norm": 2.8137954312282005, "language_loss": 0.68804896, "learning_rate": 2.4606492396042786e-06, "loss": 0.71284217, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.21508789, "step": 7372, "time_per_iteration": 2.9813220500946045 }, { "auxiliary_loss_clip": 0.01456589, "auxiliary_loss_mlp": 0.01044754, "balance_loss_clip": 1.27399468, "balance_loss_mlp": 1.02147269, "epoch": 0.4432887419209379, "flos": 20094093446400.0, "grad_norm": 1.8783385678742621, "language_loss": 0.84658051, "learning_rate": 2.4602702403155664e-06, "loss": 0.87159395, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.23291016, "step": 7373, "time_per_iteration": 2.8384690284729004 }, { "auxiliary_loss_clip": 0.01261389, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.1561656, "balance_loss_mlp": 1.01111948, "epoch": 0.4433488651736059, "flos": 70068209466240.0, "grad_norm": 0.763783441164014, "language_loss": 0.55189943, "learning_rate": 2.4598912235733604e-06, "loss": 0.57484388, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.21972656, "step": 7374, "time_per_iteration": 3.3976738452911377 }, { "auxiliary_loss_clip": 0.0143587, "auxiliary_loss_mlp": 0.01048689, "balance_loss_clip": 1.26059723, "balance_loss_mlp": 1.02481174, "epoch": 0.44340898842627385, "flos": 16289247156480.0, "grad_norm": 3.881543859124447, "language_loss": 0.83737659, "learning_rate": 2.4595121893920327e-06, "loss": 0.86222225, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.2388916, "step": 7375, "time_per_iteration": 2.8469104766845703 }, { "auxiliary_loss_clip": 0.01450379, "auxiliary_loss_mlp": 0.01049148, "balance_loss_clip": 1.26920366, "balance_loss_mlp": 1.02676034, "epoch": 0.4434691116789418, "flos": 16619146128000.0, "grad_norm": 1.734864088494111, "language_loss": 0.84093511, "learning_rate": 2.4591331377859578e-06, "loss": 0.86593032, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.22399902, "step": 7376, "time_per_iteration": 2.8033030033111572 }, { "auxiliary_loss_clip": 0.01441014, "auxiliary_loss_mlp": 0.0104565, "balance_loss_clip": 1.26331496, "balance_loss_mlp": 1.02277398, "epoch": 0.4435292349316098, "flos": 19072788399360.0, "grad_norm": 2.166263573238394, "language_loss": 0.78296113, "learning_rate": 2.4587540687695077e-06, "loss": 0.80782783, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.22875977, "step": 7377, "time_per_iteration": 2.876875400543213 }, { "auxiliary_loss_clip": 0.01423809, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.25099683, "balance_loss_mlp": 1.02143717, "epoch": 0.44358935818427775, "flos": 21261150552960.0, "grad_norm": 2.0478718842527024, "language_loss": 0.76581621, "learning_rate": 2.458374982357057e-06, "loss": 0.79049909, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.23022461, "step": 7378, "time_per_iteration": 2.8658711910247803 }, { "auxiliary_loss_clip": 0.01441165, "auxiliary_loss_mlp": 0.01044692, "balance_loss_clip": 1.26387477, "balance_loss_mlp": 1.02166021, "epoch": 0.4436494814369457, "flos": 12502996479360.0, "grad_norm": 2.204863750738569, "language_loss": 0.70482981, "learning_rate": 2.457995878562982e-06, "loss": 0.72968835, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.23046875, "step": 7379, "time_per_iteration": 2.8294684886932373 }, { "auxiliary_loss_clip": 0.01444101, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.26488769, "balance_loss_mlp": 1.02077484, "epoch": 0.44370960468961373, "flos": 23670425658240.0, "grad_norm": 1.5596536074553278, "language_loss": 0.73983073, "learning_rate": 2.457616757401656e-06, "loss": 0.76470912, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.22973633, "step": 7380, "time_per_iteration": 2.9124960899353027 }, { "auxiliary_loss_clip": 0.01441963, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.26393056, "balance_loss_mlp": 1.01469398, "epoch": 0.4437697279422817, "flos": 32429801341440.0, "grad_norm": 1.6455631833965363, "language_loss": 0.6607011, "learning_rate": 2.457237618887458e-06, "loss": 0.68549371, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.22619629, "step": 7381, "time_per_iteration": 2.9507193565368652 }, { "auxiliary_loss_clip": 0.01455676, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.27638817, "balance_loss_mlp": 1.01985335, "epoch": 0.44382985119494966, "flos": 18121712561280.0, "grad_norm": 1.9553415652525692, "language_loss": 0.80827928, "learning_rate": 2.456858463034763e-06, "loss": 0.83325464, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.2199707, "step": 7382, "time_per_iteration": 2.9868087768554688 }, { "auxiliary_loss_clip": 0.01442893, "auxiliary_loss_mlp": 0.01043824, "balance_loss_clip": 1.26531577, "balance_loss_mlp": 1.0217104, "epoch": 0.44388997444761763, "flos": 30786459776640.0, "grad_norm": 1.9916160195569759, "language_loss": 0.66341698, "learning_rate": 2.456479289857949e-06, "loss": 0.68828416, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.22119141, "step": 7383, "time_per_iteration": 3.0478219985961914 }, { "auxiliary_loss_clip": 0.01462705, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.28014529, "balance_loss_mlp": 1.01556551, "epoch": 0.4439500977002856, "flos": 20348741036160.0, "grad_norm": 3.1575316080845184, "language_loss": 0.77333879, "learning_rate": 2.4561000993713953e-06, "loss": 0.79835016, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.2286377, "step": 7384, "time_per_iteration": 2.8994600772857666 }, { "auxiliary_loss_clip": 0.01450654, "auxiliary_loss_mlp": 0.01039903, "balance_loss_clip": 1.26915812, "balance_loss_mlp": 1.0170269, "epoch": 0.44401022095295356, "flos": 20379670496640.0, "grad_norm": 1.8417188201139187, "language_loss": 0.8189171, "learning_rate": 2.4557208915894796e-06, "loss": 0.84382266, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.22875977, "step": 7385, "time_per_iteration": 2.8501672744750977 }, { "auxiliary_loss_clip": 0.01436129, "auxiliary_loss_mlp": 0.01042262, "balance_loss_clip": 1.25789857, "balance_loss_mlp": 1.01745462, "epoch": 0.4440703442056215, "flos": 20240433688320.0, "grad_norm": 1.857966187326479, "language_loss": 0.82384014, "learning_rate": 2.455341666526582e-06, "loss": 0.84862411, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.24816895, "step": 7386, "time_per_iteration": 2.9048900604248047 }, { "auxiliary_loss_clip": 0.01472334, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.28656948, "balance_loss_mlp": 1.01786983, "epoch": 0.4441304674582895, "flos": 39509386133760.0, "grad_norm": 1.8522555422396316, "language_loss": 0.70896506, "learning_rate": 2.4549624241970832e-06, "loss": 0.73410398, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.23706055, "step": 7387, "time_per_iteration": 3.0481314659118652 }, { "auxiliary_loss_clip": 0.01446137, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.26700258, "balance_loss_mlp": 1.0192486, "epoch": 0.44419059071095746, "flos": 14837789364480.0, "grad_norm": 2.0039629691655008, "language_loss": 0.7249999, "learning_rate": 2.4545831646153628e-06, "loss": 0.74988711, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.23327637, "step": 7388, "time_per_iteration": 4.2812511920928955 }, { "auxiliary_loss_clip": 0.0146583, "auxiliary_loss_mlp": 0.0104269, "balance_loss_clip": 1.28333759, "balance_loss_mlp": 1.01944411, "epoch": 0.4442507139636254, "flos": 22648034736000.0, "grad_norm": 1.7161102493226876, "language_loss": 0.6996575, "learning_rate": 2.4542038877958044e-06, "loss": 0.72474277, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 1.82324219, "router_z_loss_mlp": 0.23242188, "step": 7389, "time_per_iteration": 2.885082721710205 }, { "auxiliary_loss_clip": 0.01450466, "auxiliary_loss_mlp": 0.01039211, "balance_loss_clip": 1.27092457, "balance_loss_mlp": 1.01718104, "epoch": 0.4443108372162934, "flos": 38305110539520.0, "grad_norm": 1.8053246020246982, "language_loss": 0.75472224, "learning_rate": 2.453824593752788e-06, "loss": 0.77961898, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.22033691, "step": 7390, "time_per_iteration": 3.0353541374206543 }, { "auxiliary_loss_clip": 0.01436769, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.25940013, "balance_loss_mlp": 1.01634991, "epoch": 0.44437096046896135, "flos": 17757988462080.0, "grad_norm": 3.4356276996605093, "language_loss": 0.83178324, "learning_rate": 2.4534452825006988e-06, "loss": 0.8565414, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22705078, "step": 7391, "time_per_iteration": 2.834217071533203 }, { "auxiliary_loss_clip": 0.01441962, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.26572704, "balance_loss_mlp": 1.01900172, "epoch": 0.4444310837216293, "flos": 13739468388480.0, "grad_norm": 1.985887724286699, "language_loss": 0.74030519, "learning_rate": 2.4530659540539185e-06, "loss": 0.76514143, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.22668457, "step": 7392, "time_per_iteration": 2.888111114501953 }, { "auxiliary_loss_clip": 0.01440431, "auxiliary_loss_mlp": 0.01040356, "balance_loss_clip": 1.26318145, "balance_loss_mlp": 1.01658583, "epoch": 0.44449120697429734, "flos": 25021267718400.0, "grad_norm": 1.5511765945606866, "language_loss": 0.80474615, "learning_rate": 2.4526866084268313e-06, "loss": 0.82955408, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.23754883, "step": 7393, "time_per_iteration": 2.9204657077789307 }, { "auxiliary_loss_clip": 0.01453645, "auxiliary_loss_mlp": 0.01036071, "balance_loss_clip": 1.27097535, "balance_loss_mlp": 1.01288509, "epoch": 0.4445513302269653, "flos": 32684584665600.0, "grad_norm": 2.093923085182415, "language_loss": 0.81647635, "learning_rate": 2.4523072456338226e-06, "loss": 0.84137356, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.23205566, "step": 7394, "time_per_iteration": 2.9567296504974365 }, { "auxiliary_loss_clip": 0.01441573, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.2650969, "balance_loss_mlp": 1.01711524, "epoch": 0.44461145347963327, "flos": 11663214145920.0, "grad_norm": 3.5223726039522676, "language_loss": 0.80970204, "learning_rate": 2.4519278656892785e-06, "loss": 0.83450389, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.21508789, "step": 7395, "time_per_iteration": 4.3942649364471436 }, { "auxiliary_loss_clip": 0.01439653, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.26204324, "balance_loss_mlp": 1.01433372, "epoch": 0.44467157673230123, "flos": 20896566802560.0, "grad_norm": 4.997689268312886, "language_loss": 0.69699371, "learning_rate": 2.451548468607584e-06, "loss": 0.72175837, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.22485352, "step": 7396, "time_per_iteration": 4.307459592819214 }, { "auxiliary_loss_clip": 0.01461293, "auxiliary_loss_mlp": 0.01042508, "balance_loss_clip": 1.28020847, "balance_loss_mlp": 1.02033532, "epoch": 0.4447316999849692, "flos": 18553901322240.0, "grad_norm": 2.3613004869760643, "language_loss": 0.81086469, "learning_rate": 2.451169054403126e-06, "loss": 0.83590269, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.22180176, "step": 7397, "time_per_iteration": 2.8469433784484863 }, { "auxiliary_loss_clip": 0.01444539, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.26732135, "balance_loss_mlp": 1.01654172, "epoch": 0.44479182323763716, "flos": 23779592657280.0, "grad_norm": 4.886182626371294, "language_loss": 0.67990667, "learning_rate": 2.450789623090293e-06, "loss": 0.70473886, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.22131348, "step": 7398, "time_per_iteration": 4.290961980819702 }, { "auxiliary_loss_clip": 0.01440936, "auxiliary_loss_mlp": 0.01045754, "balance_loss_clip": 1.26557159, "balance_loss_mlp": 1.02260327, "epoch": 0.44485194649030513, "flos": 16552219789440.0, "grad_norm": 1.6545339587584185, "language_loss": 0.70389485, "learning_rate": 2.450410174683472e-06, "loss": 0.72876173, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.23144531, "step": 7399, "time_per_iteration": 2.886300563812256 }, { "auxiliary_loss_clip": 0.01430934, "auxiliary_loss_mlp": 0.01038543, "balance_loss_clip": 1.25591338, "balance_loss_mlp": 1.01687074, "epoch": 0.4449120697429731, "flos": 22611042472320.0, "grad_norm": 2.3428234498568767, "language_loss": 0.73851383, "learning_rate": 2.4500307091970514e-06, "loss": 0.76320857, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21679688, "step": 7400, "time_per_iteration": 2.861452341079712 }, { "auxiliary_loss_clip": 0.01442094, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.26577663, "balance_loss_mlp": 1.01545823, "epoch": 0.44497219299564106, "flos": 20012824506240.0, "grad_norm": 1.765920301810085, "language_loss": 0.8591184, "learning_rate": 2.449651226645422e-06, "loss": 0.88391638, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.22241211, "step": 7401, "time_per_iteration": 2.9218664169311523 }, { "auxiliary_loss_clip": 0.01424731, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.25201511, "balance_loss_mlp": 1.01910043, "epoch": 0.445032316248309, "flos": 25605497566080.0, "grad_norm": 2.8968338758233627, "language_loss": 0.83859777, "learning_rate": 2.449271727042973e-06, "loss": 0.86323988, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.20373535, "step": 7402, "time_per_iteration": 2.987872362136841 }, { "auxiliary_loss_clip": 0.01447583, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.26891422, "balance_loss_mlp": 1.01412094, "epoch": 0.445092439500977, "flos": 21260019432960.0, "grad_norm": 1.668879653057871, "language_loss": 0.7770583, "learning_rate": 2.4488922104040947e-06, "loss": 0.80189848, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.22338867, "step": 7403, "time_per_iteration": 2.842527151107788 }, { "auxiliary_loss_clip": 0.0127959, "auxiliary_loss_mlp": 0.01039956, "balance_loss_clip": 1.17077112, "balance_loss_mlp": 1.01020169, "epoch": 0.44515256275364495, "flos": 57791419580160.0, "grad_norm": 0.755599175813721, "language_loss": 0.60129452, "learning_rate": 2.4485126767431793e-06, "loss": 0.62449002, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 0.296875, "step": 7404, "time_per_iteration": 3.335784435272217 }, { "auxiliary_loss_clip": 0.01454546, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.27246714, "balance_loss_mlp": 1.01611686, "epoch": 0.4452126860063129, "flos": 15604356332160.0, "grad_norm": 2.785212712894443, "language_loss": 0.82704085, "learning_rate": 2.4481331260746177e-06, "loss": 0.85198033, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.23278809, "step": 7405, "time_per_iteration": 2.908973455429077 }, { "auxiliary_loss_clip": 0.01440032, "auxiliary_loss_mlp": 0.01039545, "balance_loss_clip": 1.26297033, "balance_loss_mlp": 1.01712167, "epoch": 0.4452728092589809, "flos": 21627634584960.0, "grad_norm": 2.3445138758323876, "language_loss": 0.7616722, "learning_rate": 2.4477535584128036e-06, "loss": 0.78646797, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.22424316, "step": 7406, "time_per_iteration": 2.912193775177002 }, { "auxiliary_loss_clip": 0.01423333, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.25186694, "balance_loss_mlp": 1.01595974, "epoch": 0.4453329325116489, "flos": 29509556999040.0, "grad_norm": 1.994226512384518, "language_loss": 0.66030717, "learning_rate": 2.447373973772129e-06, "loss": 0.68492055, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.22058105, "step": 7407, "time_per_iteration": 2.950221300125122 }, { "auxiliary_loss_clip": 0.01456876, "auxiliary_loss_mlp": 0.01040268, "balance_loss_clip": 1.2769022, "balance_loss_mlp": 1.01810706, "epoch": 0.44539305576431687, "flos": 21371086713600.0, "grad_norm": 1.5637316022641572, "language_loss": 0.6884799, "learning_rate": 2.4469943721669887e-06, "loss": 0.71345139, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22155762, "step": 7408, "time_per_iteration": 2.88632869720459 }, { "auxiliary_loss_clip": 0.01450657, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.2705667, "balance_loss_mlp": 1.01923895, "epoch": 0.44545317901698483, "flos": 41442603004800.0, "grad_norm": 1.5293316373602237, "language_loss": 0.72653002, "learning_rate": 2.4466147536117776e-06, "loss": 0.75145578, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.22692871, "step": 7409, "time_per_iteration": 3.0382041931152344 }, { "auxiliary_loss_clip": 0.01436041, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.25567138, "balance_loss_mlp": 1.01878273, "epoch": 0.4455133022696528, "flos": 22065297966720.0, "grad_norm": 1.646120503224584, "language_loss": 0.65726554, "learning_rate": 2.4462351181208895e-06, "loss": 0.68204844, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.23486328, "step": 7410, "time_per_iteration": 2.8450193405151367 }, { "auxiliary_loss_clip": 0.01469416, "auxiliary_loss_mlp": 0.01040869, "balance_loss_clip": 1.28445172, "balance_loss_mlp": 1.01801658, "epoch": 0.44557342552232077, "flos": 23487364621440.0, "grad_norm": 3.051768397041201, "language_loss": 0.75742185, "learning_rate": 2.4458554657087217e-06, "loss": 0.7825247, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 1.85058594, "router_z_loss_mlp": 0.22875977, "step": 7411, "time_per_iteration": 2.899341106414795 }, { "auxiliary_loss_clip": 0.01436688, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.26358128, "balance_loss_mlp": 1.01832843, "epoch": 0.44563354877498873, "flos": 19143832014720.0, "grad_norm": 2.3973266305124725, "language_loss": 0.79954088, "learning_rate": 2.4454757963896695e-06, "loss": 0.82431918, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.22802734, "step": 7412, "time_per_iteration": 2.869169235229492 }, { "auxiliary_loss_clip": 0.0145249, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.26932025, "balance_loss_mlp": 1.01654601, "epoch": 0.4456936720276567, "flos": 13628129639040.0, "grad_norm": 1.8953164093632644, "language_loss": 0.81423485, "learning_rate": 2.4450961101781304e-06, "loss": 0.83914316, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.21801758, "step": 7413, "time_per_iteration": 2.9330315589904785 }, { "auxiliary_loss_clip": 0.01432085, "auxiliary_loss_mlp": 0.01037552, "balance_loss_clip": 1.25696087, "balance_loss_mlp": 1.01520038, "epoch": 0.44575379528032466, "flos": 14720116343040.0, "grad_norm": 2.780380955918839, "language_loss": 0.77257192, "learning_rate": 2.4447164070885026e-06, "loss": 0.79726827, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.22338867, "step": 7414, "time_per_iteration": 2.8842039108276367 }, { "auxiliary_loss_clip": 0.01434298, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.25765157, "balance_loss_mlp": 1.01713753, "epoch": 0.4458139185329926, "flos": 24181394895360.0, "grad_norm": 1.5280197409697842, "language_loss": 0.83941936, "learning_rate": 2.4443366871351837e-06, "loss": 0.86417156, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.23779297, "step": 7415, "time_per_iteration": 2.9058585166931152 }, { "auxiliary_loss_clip": 0.01432121, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.25377429, "balance_loss_mlp": 1.01499438, "epoch": 0.4458740417856606, "flos": 21772210279680.0, "grad_norm": 1.5036259728586137, "language_loss": 0.84695339, "learning_rate": 2.4439569503325732e-06, "loss": 0.87164915, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.22460938, "step": 7416, "time_per_iteration": 2.8872830867767334 }, { "auxiliary_loss_clip": 0.01440609, "auxiliary_loss_mlp": 0.01040204, "balance_loss_clip": 1.26038313, "balance_loss_mlp": 1.01724482, "epoch": 0.44593416503832856, "flos": 21078541964160.0, "grad_norm": 2.8492087917878095, "language_loss": 0.81606579, "learning_rate": 2.4435771966950706e-06, "loss": 0.8408739, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.22961426, "step": 7417, "time_per_iteration": 2.8885910511016846 }, { "auxiliary_loss_clip": 0.01444495, "auxiliary_loss_mlp": 0.01050134, "balance_loss_clip": 1.26522017, "balance_loss_mlp": 1.02768719, "epoch": 0.4459942882909965, "flos": 22610454289920.0, "grad_norm": 2.031991653407147, "language_loss": 0.81842172, "learning_rate": 2.443197426237077e-06, "loss": 0.84336805, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.2244873, "step": 7418, "time_per_iteration": 2.8903253078460693 }, { "auxiliary_loss_clip": 0.01440605, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.26014507, "balance_loss_mlp": 1.01896071, "epoch": 0.4460544115436645, "flos": 26516730718080.0, "grad_norm": 1.6497730536291957, "language_loss": 0.77996194, "learning_rate": 2.442817638972991e-06, "loss": 0.80477917, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22167969, "step": 7419, "time_per_iteration": 2.8701987266540527 }, { "auxiliary_loss_clip": 0.01441623, "auxiliary_loss_mlp": 0.01036361, "balance_loss_clip": 1.26449609, "balance_loss_mlp": 1.01445055, "epoch": 0.4461145347963325, "flos": 17613231788160.0, "grad_norm": 1.637108654947106, "language_loss": 0.73344535, "learning_rate": 2.4424378349172176e-06, "loss": 0.7582252, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21911621, "step": 7420, "time_per_iteration": 2.839426040649414 }, { "auxiliary_loss_clip": 0.01423441, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.24983132, "balance_loss_mlp": 1.01509881, "epoch": 0.44617465804900047, "flos": 27278727960960.0, "grad_norm": 2.0909378820074713, "language_loss": 0.75501621, "learning_rate": 2.442058014084156e-06, "loss": 0.77963287, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.23132324, "step": 7421, "time_per_iteration": 2.9565625190734863 }, { "auxiliary_loss_clip": 0.01423213, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.24992466, "balance_loss_mlp": 1.01824808, "epoch": 0.44623478130166844, "flos": 17795659397760.0, "grad_norm": 1.9253807718136415, "language_loss": 0.76716918, "learning_rate": 2.44167817648821e-06, "loss": 0.79181206, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.22827148, "step": 7422, "time_per_iteration": 2.921449661254883 }, { "auxiliary_loss_clip": 0.01443432, "auxiliary_loss_mlp": 0.01043289, "balance_loss_clip": 1.26497591, "balance_loss_mlp": 1.02054417, "epoch": 0.4462949045543364, "flos": 23013387648000.0, "grad_norm": 5.274105488387539, "language_loss": 0.66193402, "learning_rate": 2.441298322143784e-06, "loss": 0.68680131, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22741699, "step": 7423, "time_per_iteration": 4.3420021533966064 }, { "auxiliary_loss_clip": 0.014122, "auxiliary_loss_mlp": 0.01039112, "balance_loss_clip": 1.24142373, "balance_loss_mlp": 1.01639104, "epoch": 0.44635502780700437, "flos": 17828624874240.0, "grad_norm": 1.4949073873160796, "language_loss": 0.8053019, "learning_rate": 2.4409184510652807e-06, "loss": 0.82981503, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.22717285, "step": 7424, "time_per_iteration": 2.8859431743621826 }, { "auxiliary_loss_clip": 0.01415983, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.24519396, "balance_loss_mlp": 1.01582158, "epoch": 0.44641515105967233, "flos": 26699339306880.0, "grad_norm": 1.3958955526127363, "language_loss": 0.80781186, "learning_rate": 2.4405385632671063e-06, "loss": 0.83233976, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21008301, "step": 7425, "time_per_iteration": 2.982287883758545 }, { "auxiliary_loss_clip": 0.0142577, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.25116634, "balance_loss_mlp": 1.01807594, "epoch": 0.4464752743123403, "flos": 18921787943040.0, "grad_norm": 1.4697205323069449, "language_loss": 0.7802006, "learning_rate": 2.4401586587636655e-06, "loss": 0.804847, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.20788574, "step": 7426, "time_per_iteration": 2.8496246337890625 }, { "auxiliary_loss_clip": 0.01437178, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.25839484, "balance_loss_mlp": 1.01523781, "epoch": 0.44653539756500826, "flos": 29582365161600.0, "grad_norm": 1.734032213497307, "language_loss": 0.65685761, "learning_rate": 2.4397787375693634e-06, "loss": 0.68160844, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.22668457, "step": 7427, "time_per_iteration": 2.8883659839630127 }, { "auxiliary_loss_clip": 0.01424946, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.25230825, "balance_loss_mlp": 1.013942, "epoch": 0.44659552081767623, "flos": 21478534410240.0, "grad_norm": 1.8068959268247908, "language_loss": 0.76587546, "learning_rate": 2.439398799698608e-06, "loss": 0.79047978, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21533203, "step": 7428, "time_per_iteration": 2.8329596519470215 }, { "auxiliary_loss_clip": 0.01430354, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.2559855, "balance_loss_mlp": 1.01260483, "epoch": 0.4466556440703442, "flos": 17941049498880.0, "grad_norm": 1.9080483747007464, "language_loss": 0.78647584, "learning_rate": 2.439018845165806e-06, "loss": 0.81113219, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.22705078, "step": 7429, "time_per_iteration": 2.8403327465057373 }, { "auxiliary_loss_clip": 0.01449595, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.27122378, "balance_loss_mlp": 1.0175128, "epoch": 0.44671576732301216, "flos": 21117751223040.0, "grad_norm": 2.0755426463919235, "language_loss": 0.91502219, "learning_rate": 2.438638873985366e-06, "loss": 0.93991065, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21728516, "step": 7430, "time_per_iteration": 4.496643781661987 }, { "auxiliary_loss_clip": 0.01460811, "auxiliary_loss_mlp": 0.01039179, "balance_loss_clip": 1.27621233, "balance_loss_mlp": 1.01627874, "epoch": 0.4467758905756801, "flos": 23518158347520.0, "grad_norm": 1.5754898465535359, "language_loss": 0.8050459, "learning_rate": 2.4382588861716954e-06, "loss": 0.83004582, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.22924805, "step": 7431, "time_per_iteration": 4.363646030426025 }, { "auxiliary_loss_clip": 0.01444532, "auxiliary_loss_mlp": 0.01039897, "balance_loss_clip": 1.26508236, "balance_loss_mlp": 1.01824892, "epoch": 0.4468360138283481, "flos": 18743251386240.0, "grad_norm": 3.5089398880747296, "language_loss": 0.81010115, "learning_rate": 2.437878881739204e-06, "loss": 0.83494544, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.21643066, "step": 7432, "time_per_iteration": 2.879289388656616 }, { "auxiliary_loss_clip": 0.01447123, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.26615644, "balance_loss_mlp": 1.01828837, "epoch": 0.4468961370810161, "flos": 23487590845440.0, "grad_norm": 1.77111467861156, "language_loss": 0.77973545, "learning_rate": 2.437498860702301e-06, "loss": 0.80460304, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.21374512, "step": 7433, "time_per_iteration": 2.8578684329986572 }, { "auxiliary_loss_clip": 0.01423718, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.25319302, "balance_loss_mlp": 1.01821804, "epoch": 0.4469562603336841, "flos": 30085642782720.0, "grad_norm": 1.7043136022113377, "language_loss": 0.78062773, "learning_rate": 2.437118823075398e-06, "loss": 0.80524838, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20141602, "step": 7434, "time_per_iteration": 4.327753305435181 }, { "auxiliary_loss_clip": 0.01452626, "auxiliary_loss_mlp": 0.01040741, "balance_loss_clip": 1.27384901, "balance_loss_mlp": 1.01915264, "epoch": 0.44701638358635204, "flos": 22466828736000.0, "grad_norm": 1.9180184739191748, "language_loss": 0.65146065, "learning_rate": 2.436738768872905e-06, "loss": 0.67639434, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.21582031, "step": 7435, "time_per_iteration": 2.891143321990967 }, { "auxiliary_loss_clip": 0.01452013, "auxiliary_loss_mlp": 0.01045517, "balance_loss_clip": 1.27589869, "balance_loss_mlp": 1.02359462, "epoch": 0.44707650683902, "flos": 24067658171520.0, "grad_norm": 1.7525394662809408, "language_loss": 0.84327221, "learning_rate": 2.4363586981092346e-06, "loss": 0.86824751, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21899414, "step": 7436, "time_per_iteration": 2.952116012573242 }, { "auxiliary_loss_clip": 0.01476006, "auxiliary_loss_mlp": 0.0104901, "balance_loss_clip": 1.29444933, "balance_loss_mlp": 1.02558541, "epoch": 0.44713663009168797, "flos": 23776923214080.0, "grad_norm": 1.7127076417866558, "language_loss": 0.80001497, "learning_rate": 2.435978610798798e-06, "loss": 0.82526517, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.23413086, "step": 7437, "time_per_iteration": 2.9006364345550537 }, { "auxiliary_loss_clip": 0.01447198, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.27050757, "balance_loss_mlp": 1.0259645, "epoch": 0.44719675334435594, "flos": 24510117502080.0, "grad_norm": 1.7145968723486968, "language_loss": 0.72269893, "learning_rate": 2.435598506956009e-06, "loss": 0.74764085, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21044922, "step": 7438, "time_per_iteration": 2.9519965648651123 }, { "auxiliary_loss_clip": 0.01465774, "auxiliary_loss_mlp": 0.0105505, "balance_loss_clip": 1.28555, "balance_loss_mlp": 1.03259051, "epoch": 0.4472568765970239, "flos": 29791921668480.0, "grad_norm": 1.7410615138782424, "language_loss": 0.67471051, "learning_rate": 2.4352183865952808e-06, "loss": 0.69991875, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.22473145, "step": 7439, "time_per_iteration": 2.9677162170410156 }, { "auxiliary_loss_clip": 0.01464016, "auxiliary_loss_mlp": 0.01051448, "balance_loss_clip": 1.28570533, "balance_loss_mlp": 1.028512, "epoch": 0.44731699984969187, "flos": 24653471587200.0, "grad_norm": 2.338812884941249, "language_loss": 0.7453652, "learning_rate": 2.4348382497310285e-06, "loss": 0.77051985, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22924805, "step": 7440, "time_per_iteration": 2.968296766281128 }, { "auxiliary_loss_clip": 0.01450001, "auxiliary_loss_mlp": 0.01056124, "balance_loss_clip": 1.2745893, "balance_loss_mlp": 1.03432012, "epoch": 0.44737712310235983, "flos": 29466366197760.0, "grad_norm": 1.6401665789515534, "language_loss": 0.74672604, "learning_rate": 2.4344580963776655e-06, "loss": 0.77178729, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21801758, "step": 7441, "time_per_iteration": 2.9644076824188232 }, { "auxiliary_loss_clip": 0.01463826, "auxiliary_loss_mlp": 0.01052088, "balance_loss_clip": 1.28539276, "balance_loss_mlp": 1.03035581, "epoch": 0.4474372463550278, "flos": 24906807077760.0, "grad_norm": 2.5266767964216155, "language_loss": 0.75366676, "learning_rate": 2.4340779265496082e-06, "loss": 0.77882588, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.21740723, "step": 7442, "time_per_iteration": 2.9294800758361816 }, { "auxiliary_loss_clip": 0.01466714, "auxiliary_loss_mlp": 0.01053616, "balance_loss_clip": 1.28352773, "balance_loss_mlp": 1.03137124, "epoch": 0.44749736960769576, "flos": 33194377537920.0, "grad_norm": 2.5163550588924246, "language_loss": 0.74902058, "learning_rate": 2.433697740261273e-06, "loss": 0.7742238, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.22241211, "step": 7443, "time_per_iteration": 2.999361038208008 }, { "auxiliary_loss_clip": 0.01452148, "auxiliary_loss_mlp": 0.01044902, "balance_loss_clip": 1.27550721, "balance_loss_mlp": 1.02189469, "epoch": 0.4475574928603637, "flos": 21082342527360.0, "grad_norm": 1.6593195287990052, "language_loss": 0.78182542, "learning_rate": 2.4333175375270748e-06, "loss": 0.8067959, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.23022461, "step": 7444, "time_per_iteration": 2.8841001987457275 }, { "auxiliary_loss_clip": 0.0145058, "auxiliary_loss_mlp": 0.01051158, "balance_loss_clip": 1.27687073, "balance_loss_mlp": 1.02956939, "epoch": 0.4476176161130317, "flos": 21870156568320.0, "grad_norm": 3.515577080400395, "language_loss": 0.84704757, "learning_rate": 2.4329373183614333e-06, "loss": 0.87206495, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.21594238, "step": 7445, "time_per_iteration": 2.887338876724243 }, { "auxiliary_loss_clip": 0.01459187, "auxiliary_loss_mlp": 0.01043256, "balance_loss_clip": 1.27976322, "balance_loss_mlp": 1.02111936, "epoch": 0.4476777393656997, "flos": 22538912981760.0, "grad_norm": 1.8969104639756431, "language_loss": 0.65332878, "learning_rate": 2.432557082778765e-06, "loss": 0.67835319, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22143555, "step": 7446, "time_per_iteration": 2.9013113975524902 }, { "auxiliary_loss_clip": 0.01286659, "auxiliary_loss_mlp": 0.0102307, "balance_loss_clip": 1.18097556, "balance_loss_mlp": 1.00189805, "epoch": 0.4477378626183677, "flos": 49043671810560.0, "grad_norm": 0.7475661504257273, "language_loss": 0.50419843, "learning_rate": 2.4321768307934884e-06, "loss": 0.52729571, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.21191406, "step": 7447, "time_per_iteration": 3.246619701385498 }, { "auxiliary_loss_clip": 0.01287431, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.1807673, "balance_loss_mlp": 1.00853336, "epoch": 0.44779798587103564, "flos": 56573000346240.0, "grad_norm": 0.7609810048795373, "language_loss": 0.59391403, "learning_rate": 2.4317965624200235e-06, "loss": 0.61711979, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.24511719, "step": 7448, "time_per_iteration": 3.391934871673584 }, { "auxiliary_loss_clip": 0.01447031, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.27139318, "balance_loss_mlp": 1.01930952, "epoch": 0.4478581091237036, "flos": 46514669685120.0, "grad_norm": 1.6320106180525467, "language_loss": 0.59870529, "learning_rate": 2.431416277672789e-06, "loss": 0.62357903, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21032715, "step": 7449, "time_per_iteration": 3.109038829803467 }, { "auxiliary_loss_clip": 0.01459293, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.28178048, "balance_loss_mlp": 1.01509678, "epoch": 0.4479182323763716, "flos": 20824301577600.0, "grad_norm": 2.1445812559509285, "language_loss": 0.81117511, "learning_rate": 2.4310359765662065e-06, "loss": 0.83613312, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.21386719, "step": 7450, "time_per_iteration": 2.895526885986328 }, { "auxiliary_loss_clip": 0.01458907, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.28208041, "balance_loss_mlp": 1.01703238, "epoch": 0.44797835562903954, "flos": 14253966720000.0, "grad_norm": 5.982480744081226, "language_loss": 0.80821794, "learning_rate": 2.430655659114697e-06, "loss": 0.8331905, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.2130127, "step": 7451, "time_per_iteration": 2.861950397491455 }, { "auxiliary_loss_clip": 0.01286749, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.17840767, "balance_loss_mlp": 1.01225424, "epoch": 0.4480384788817075, "flos": 63563850806400.0, "grad_norm": 0.832111098264223, "language_loss": 0.62842786, "learning_rate": 2.430275325332681e-06, "loss": 0.65167916, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 1.078125, "router_z_loss_mlp": 0.26171875, "step": 7452, "time_per_iteration": 3.4893178939819336 }, { "auxiliary_loss_clip": 0.01452983, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.27742028, "balance_loss_mlp": 1.01760292, "epoch": 0.44809860213437547, "flos": 21662545587840.0, "grad_norm": 1.8349565943458381, "language_loss": 0.63530737, "learning_rate": 2.429894975234582e-06, "loss": 0.66022861, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.2154541, "step": 7453, "time_per_iteration": 2.8635528087615967 }, { "auxiliary_loss_clip": 0.01290015, "auxiliary_loss_mlp": 0.01052756, "balance_loss_clip": 1.18426311, "balance_loss_mlp": 1.0290091, "epoch": 0.44815872538704343, "flos": 69221595168000.0, "grad_norm": 0.7659197684080481, "language_loss": 0.57060099, "learning_rate": 2.4295146088348224e-06, "loss": 0.59402871, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 0.23730469, "step": 7454, "time_per_iteration": 3.209456443786621 }, { "auxiliary_loss_clip": 0.01457489, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.28057146, "balance_loss_mlp": 1.01388192, "epoch": 0.4482188486397114, "flos": 12604336128000.0, "grad_norm": 2.0588887404233205, "language_loss": 0.76192296, "learning_rate": 2.4291342261478255e-06, "loss": 0.7868399, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.20324707, "step": 7455, "time_per_iteration": 2.846205472946167 }, { "auxiliary_loss_clip": 0.01446942, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.27270889, "balance_loss_mlp": 1.0163281, "epoch": 0.44827897189237936, "flos": 34071423603840.0, "grad_norm": 2.9217376758997613, "language_loss": 0.77103794, "learning_rate": 2.428753827188016e-06, "loss": 0.79587865, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.20800781, "step": 7456, "time_per_iteration": 2.9845423698425293 }, { "auxiliary_loss_clip": 0.01432573, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.26248276, "balance_loss_mlp": 1.01537967, "epoch": 0.44833909514504733, "flos": 25156206270720.0, "grad_norm": 2.2425335206271293, "language_loss": 0.7721318, "learning_rate": 2.428373411969818e-06, "loss": 0.79682255, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.21130371, "step": 7457, "time_per_iteration": 2.99906325340271 }, { "auxiliary_loss_clip": 0.0146291, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.28563952, "balance_loss_mlp": 1.01698017, "epoch": 0.4483992183977153, "flos": 16188540935040.0, "grad_norm": 2.189848730968719, "language_loss": 0.6884315, "learning_rate": 2.4279929805076576e-06, "loss": 0.71344626, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.21606445, "step": 7458, "time_per_iteration": 2.9102818965911865 }, { "auxiliary_loss_clip": 0.01459034, "auxiliary_loss_mlp": 0.01040997, "balance_loss_clip": 1.28060472, "balance_loss_mlp": 1.01894331, "epoch": 0.44845934165038326, "flos": 17754278388480.0, "grad_norm": 3.2145301421326864, "language_loss": 0.72478652, "learning_rate": 2.427612532815961e-06, "loss": 0.74978685, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22070312, "step": 7459, "time_per_iteration": 4.28303599357605 }, { "auxiliary_loss_clip": 0.01442549, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.26732481, "balance_loss_mlp": 1.01461601, "epoch": 0.4485194649030513, "flos": 21846104317440.0, "grad_norm": 3.6388965441206076, "language_loss": 0.69917035, "learning_rate": 2.427232068909154e-06, "loss": 0.72395855, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.2166748, "step": 7460, "time_per_iteration": 2.8555076122283936 }, { "auxiliary_loss_clip": 0.01455118, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.2781688, "balance_loss_mlp": 1.01442778, "epoch": 0.44857958815571924, "flos": 20094229180800.0, "grad_norm": 2.06958834698811, "language_loss": 0.78792644, "learning_rate": 2.4268515888016635e-06, "loss": 0.81283033, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.20849609, "step": 7461, "time_per_iteration": 2.8912193775177 }, { "auxiliary_loss_clip": 0.01445945, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.27021456, "balance_loss_mlp": 1.01522112, "epoch": 0.4486397114083872, "flos": 27065099422080.0, "grad_norm": 1.6910666250632658, "language_loss": 0.69510877, "learning_rate": 2.4264710925079184e-06, "loss": 0.71993518, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21496582, "step": 7462, "time_per_iteration": 2.9360218048095703 }, { "auxiliary_loss_clip": 0.01267722, "auxiliary_loss_mlp": 0.0105431, "balance_loss_clip": 1.16260195, "balance_loss_mlp": 1.02264833, "epoch": 0.4486998346610552, "flos": 67349422811520.0, "grad_norm": 0.7564686833911273, "language_loss": 0.54421496, "learning_rate": 2.4260905800423462e-06, "loss": 0.56743526, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.31640625, "step": 7463, "time_per_iteration": 3.366792678833008 }, { "auxiliary_loss_clip": 0.01432669, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.25962377, "balance_loss_mlp": 1.0152421, "epoch": 0.44875995791372314, "flos": 27648424373760.0, "grad_norm": 2.409198490817067, "language_loss": 0.76929086, "learning_rate": 2.4257100514193775e-06, "loss": 0.79397738, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.20727539, "step": 7464, "time_per_iteration": 2.929297924041748 }, { "auxiliary_loss_clip": 0.01437866, "auxiliary_loss_mlp": 0.01037925, "balance_loss_clip": 1.26538408, "balance_loss_mlp": 1.01700401, "epoch": 0.4488200811663911, "flos": 13013739492480.0, "grad_norm": 1.8836759764309163, "language_loss": 0.75280702, "learning_rate": 2.425329506653441e-06, "loss": 0.777565, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20922852, "step": 7465, "time_per_iteration": 4.273135423660278 }, { "auxiliary_loss_clip": 0.01458881, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.27751434, "balance_loss_mlp": 1.01671088, "epoch": 0.44888020441905907, "flos": 27501314970240.0, "grad_norm": 2.482491294351102, "language_loss": 0.81031549, "learning_rate": 2.424948945758966e-06, "loss": 0.83529788, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.22668457, "step": 7466, "time_per_iteration": 4.428941488265991 }, { "auxiliary_loss_clip": 0.01447325, "auxiliary_loss_mlp": 0.01035596, "balance_loss_clip": 1.26972961, "balance_loss_mlp": 1.01457918, "epoch": 0.44894032767172704, "flos": 18268776720000.0, "grad_norm": 2.428443136768659, "language_loss": 0.8183164, "learning_rate": 2.4245683687503844e-06, "loss": 0.84314561, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21020508, "step": 7467, "time_per_iteration": 2.840721607208252 }, { "auxiliary_loss_clip": 0.01426668, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.25890791, "balance_loss_mlp": 1.0162344, "epoch": 0.449000450924395, "flos": 21589782670080.0, "grad_norm": 1.8721465214090531, "language_loss": 0.76436865, "learning_rate": 2.424187775642129e-06, "loss": 0.78899926, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20153809, "step": 7468, "time_per_iteration": 4.246603965759277 }, { "auxiliary_loss_clip": 0.01440446, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.2667017, "balance_loss_mlp": 1.01261044, "epoch": 0.44906057417706297, "flos": 17976955887360.0, "grad_norm": 1.7031083381399021, "language_loss": 0.72086513, "learning_rate": 2.4238071664486297e-06, "loss": 0.74558926, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.19348145, "step": 7469, "time_per_iteration": 2.835515022277832 }, { "auxiliary_loss_clip": 0.01447459, "auxiliary_loss_mlp": 0.01037814, "balance_loss_clip": 1.27132416, "balance_loss_mlp": 1.01731014, "epoch": 0.44912069742973093, "flos": 20056874958720.0, "grad_norm": 1.7276932145301216, "language_loss": 0.72479188, "learning_rate": 2.4234265411843203e-06, "loss": 0.7496447, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.20483398, "step": 7470, "time_per_iteration": 2.8744468688964844 }, { "auxiliary_loss_clip": 0.01437768, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.26199126, "balance_loss_mlp": 1.01497459, "epoch": 0.4491808206823989, "flos": 21043585716480.0, "grad_norm": 1.903701256449809, "language_loss": 0.77645975, "learning_rate": 2.423045899863634e-06, "loss": 0.80119514, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.20776367, "step": 7471, "time_per_iteration": 2.870345115661621 }, { "auxiliary_loss_clip": 0.01437462, "auxiliary_loss_mlp": 0.0104025, "balance_loss_clip": 1.26434135, "balance_loss_mlp": 1.01912582, "epoch": 0.44924094393506686, "flos": 22977662238720.0, "grad_norm": 1.8140971684540312, "language_loss": 0.71987677, "learning_rate": 2.4226652425010048e-06, "loss": 0.74465388, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21130371, "step": 7472, "time_per_iteration": 2.8750712871551514 }, { "auxiliary_loss_clip": 0.01257142, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.15401399, "balance_loss_mlp": 1.02191305, "epoch": 0.4493010671877349, "flos": 59262739839360.0, "grad_norm": 0.7421910229573018, "language_loss": 0.61744261, "learning_rate": 2.4222845691108676e-06, "loss": 0.64042008, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.18652344, "step": 7473, "time_per_iteration": 3.3580939769744873 }, { "auxiliary_loss_clip": 0.01436203, "auxiliary_loss_mlp": 0.01043472, "balance_loss_clip": 1.26181078, "balance_loss_mlp": 1.02174044, "epoch": 0.44936119044040285, "flos": 18014083885440.0, "grad_norm": 1.8782165883417057, "language_loss": 0.79247683, "learning_rate": 2.421903879707657e-06, "loss": 0.81727356, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21728516, "step": 7474, "time_per_iteration": 2.8627736568450928 }, { "auxiliary_loss_clip": 0.01415184, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.24726772, "balance_loss_mlp": 1.01634765, "epoch": 0.4494213136930708, "flos": 21261783980160.0, "grad_norm": 1.585421670394442, "language_loss": 0.72920746, "learning_rate": 2.4215231743058086e-06, "loss": 0.75374484, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.2220459, "step": 7475, "time_per_iteration": 2.8839316368103027 }, { "auxiliary_loss_clip": 0.01430269, "auxiliary_loss_mlp": 0.0103322, "balance_loss_clip": 1.25745916, "balance_loss_mlp": 1.01274037, "epoch": 0.4494814369457388, "flos": 27430452334080.0, "grad_norm": 2.101113807612924, "language_loss": 0.77648485, "learning_rate": 2.4211424529197594e-06, "loss": 0.80111974, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.20483398, "step": 7476, "time_per_iteration": 2.9087917804718018 }, { "auxiliary_loss_clip": 0.0146182, "auxiliary_loss_mlp": 0.01043384, "balance_loss_clip": 1.28153658, "balance_loss_mlp": 1.02195013, "epoch": 0.44954156019840674, "flos": 22863608801280.0, "grad_norm": 5.374391106002564, "language_loss": 0.72408384, "learning_rate": 2.4207617155639464e-06, "loss": 0.74913585, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21435547, "step": 7477, "time_per_iteration": 2.8622586727142334 }, { "auxiliary_loss_clip": 0.01458536, "auxiliary_loss_mlp": 0.0104638, "balance_loss_clip": 1.28026235, "balance_loss_mlp": 1.02414703, "epoch": 0.4496016834510747, "flos": 17210388919680.0, "grad_norm": 2.4223964625195307, "language_loss": 0.69714642, "learning_rate": 2.4203809622528062e-06, "loss": 0.72219563, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22229004, "step": 7478, "time_per_iteration": 2.8638124465942383 }, { "auxiliary_loss_clip": 0.01422922, "auxiliary_loss_mlp": 0.01038628, "balance_loss_clip": 1.25216126, "balance_loss_mlp": 1.01829112, "epoch": 0.4496618067037427, "flos": 18925950464640.0, "grad_norm": 1.8862380383256183, "language_loss": 0.89958709, "learning_rate": 2.420000193000779e-06, "loss": 0.92420256, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20324707, "step": 7479, "time_per_iteration": 2.8214612007141113 }, { "auxiliary_loss_clip": 0.01426496, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.25413918, "balance_loss_mlp": 1.02296293, "epoch": 0.44972192995641064, "flos": 21041232986880.0, "grad_norm": 1.8549416353174877, "language_loss": 0.76747483, "learning_rate": 2.419619407822302e-06, "loss": 0.7921927, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.2232666, "step": 7480, "time_per_iteration": 2.8730101585388184 }, { "auxiliary_loss_clip": 0.0145, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.27159882, "balance_loss_mlp": 1.0213542, "epoch": 0.4497820532090786, "flos": 20786811621120.0, "grad_norm": 3.138225724701109, "language_loss": 0.8145957, "learning_rate": 2.419238606731815e-06, "loss": 0.83952945, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.22033691, "step": 7481, "time_per_iteration": 2.9008326530456543 }, { "auxiliary_loss_clip": 0.0142317, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.2541945, "balance_loss_mlp": 1.01559234, "epoch": 0.44984217646174657, "flos": 33815735383680.0, "grad_norm": 1.8723500932952248, "language_loss": 0.69202185, "learning_rate": 2.418857789743758e-06, "loss": 0.71662158, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21203613, "step": 7482, "time_per_iteration": 2.960660696029663 }, { "auxiliary_loss_clip": 0.01449054, "auxiliary_loss_mlp": 0.01040692, "balance_loss_clip": 1.27348053, "balance_loss_mlp": 1.01957989, "epoch": 0.44990229971441453, "flos": 15525485366400.0, "grad_norm": 2.118701762017151, "language_loss": 0.86115086, "learning_rate": 2.418476956872571e-06, "loss": 0.88604832, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21118164, "step": 7483, "time_per_iteration": 2.8056747913360596 }, { "auxiliary_loss_clip": 0.01438623, "auxiliary_loss_mlp": 0.01044715, "balance_loss_clip": 1.26432467, "balance_loss_mlp": 1.02243495, "epoch": 0.4499624229670825, "flos": 29873959770240.0, "grad_norm": 1.8870778615743897, "language_loss": 0.81847215, "learning_rate": 2.4180961081326967e-06, "loss": 0.84330559, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.22277832, "step": 7484, "time_per_iteration": 2.903496026992798 }, { "auxiliary_loss_clip": 0.01448123, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.26934886, "balance_loss_mlp": 1.01989985, "epoch": 0.45002254621975046, "flos": 18522474168960.0, "grad_norm": 3.2706173741177706, "language_loss": 0.76401174, "learning_rate": 2.4177152435385754e-06, "loss": 0.78891087, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21887207, "step": 7485, "time_per_iteration": 2.8279008865356445 }, { "auxiliary_loss_clip": 0.01261981, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.15856123, "balance_loss_mlp": 1.00786352, "epoch": 0.4500826694724185, "flos": 70448882382720.0, "grad_norm": 0.8017417792601544, "language_loss": 0.58808845, "learning_rate": 2.4173343631046504e-06, "loss": 0.61097473, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.1875, "step": 7486, "time_per_iteration": 3.3784050941467285 }, { "auxiliary_loss_clip": 0.0143746, "auxiliary_loss_mlp": 0.01036493, "balance_loss_clip": 1.26482177, "balance_loss_mlp": 1.01583433, "epoch": 0.45014279272508645, "flos": 15787417368960.0, "grad_norm": 2.670169288068218, "language_loss": 0.84285939, "learning_rate": 2.4169534668453654e-06, "loss": 0.86759889, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20654297, "step": 7487, "time_per_iteration": 2.8163774013519287 }, { "auxiliary_loss_clip": 0.01422901, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.2509985, "balance_loss_mlp": 1.01776791, "epoch": 0.4502029159777544, "flos": 21809790725760.0, "grad_norm": 6.331304448059537, "language_loss": 0.78283858, "learning_rate": 2.4165725547751622e-06, "loss": 0.80745864, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21325684, "step": 7488, "time_per_iteration": 2.8894307613372803 }, { "auxiliary_loss_clip": 0.014591, "auxiliary_loss_mlp": 0.01038077, "balance_loss_clip": 1.2772063, "balance_loss_mlp": 1.01683402, "epoch": 0.4502630392304224, "flos": 28779891805440.0, "grad_norm": 2.110019595220429, "language_loss": 0.7311337, "learning_rate": 2.4161916269084858e-06, "loss": 0.75610542, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.21252441, "step": 7489, "time_per_iteration": 2.910886764526367 }, { "auxiliary_loss_clip": 0.01445539, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.27057791, "balance_loss_mlp": 1.01829934, "epoch": 0.45032316248309034, "flos": 15850724123520.0, "grad_norm": 2.7029009370559227, "language_loss": 0.70521963, "learning_rate": 2.4158106832597817e-06, "loss": 0.73008001, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.22216797, "step": 7490, "time_per_iteration": 2.837088108062744 }, { "auxiliary_loss_clip": 0.01248996, "auxiliary_loss_mlp": 0.01022275, "balance_loss_clip": 1.14706624, "balance_loss_mlp": 1.00129414, "epoch": 0.4503832857357583, "flos": 57884225944320.0, "grad_norm": 0.7209523010576697, "language_loss": 0.56685042, "learning_rate": 2.415429723843495e-06, "loss": 0.58956313, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.20996094, "step": 7491, "time_per_iteration": 3.344919443130493 }, { "auxiliary_loss_clip": 0.01424593, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.25431013, "balance_loss_mlp": 1.01293302, "epoch": 0.4504434089884263, "flos": 23888533432320.0, "grad_norm": 1.889586817734969, "language_loss": 0.80400872, "learning_rate": 2.4150487486740713e-06, "loss": 0.82859963, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.2154541, "step": 7492, "time_per_iteration": 2.886906147003174 }, { "auxiliary_loss_clip": 0.01459423, "auxiliary_loss_mlp": 0.01036819, "balance_loss_clip": 1.2780664, "balance_loss_mlp": 1.01538491, "epoch": 0.45050353224109424, "flos": 17793985340160.0, "grad_norm": 2.3645741983422393, "language_loss": 0.93161088, "learning_rate": 2.4146677577659573e-06, "loss": 0.95657325, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.2142334, "step": 7493, "time_per_iteration": 2.795440673828125 }, { "auxiliary_loss_clip": 0.01248709, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.14543366, "balance_loss_mlp": 1.00383615, "epoch": 0.4505636554937622, "flos": 65092188792960.0, "grad_norm": 0.7967103253162479, "language_loss": 0.62853253, "learning_rate": 2.4142867511336e-06, "loss": 0.65133083, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.2734375, "step": 7494, "time_per_iteration": 4.698879241943359 }, { "auxiliary_loss_clip": 0.01430094, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.25814438, "balance_loss_mlp": 1.01666856, "epoch": 0.45062377874643017, "flos": 22210190375040.0, "grad_norm": 1.454704036865865, "language_loss": 0.82430857, "learning_rate": 2.4139057287914484e-06, "loss": 0.8489908, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21447754, "step": 7495, "time_per_iteration": 2.8762097358703613 }, { "auxiliary_loss_clip": 0.01442721, "auxiliary_loss_mlp": 0.01037317, "balance_loss_clip": 1.26807511, "balance_loss_mlp": 1.0153352, "epoch": 0.45068390199909814, "flos": 37684476610560.0, "grad_norm": 2.452226608069599, "language_loss": 0.86793792, "learning_rate": 2.41352469075395e-06, "loss": 0.89273822, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.21972656, "step": 7496, "time_per_iteration": 2.9730031490325928 }, { "auxiliary_loss_clip": 0.01441125, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.26536024, "balance_loss_mlp": 1.01322746, "epoch": 0.4507440252517661, "flos": 22310806106880.0, "grad_norm": 1.9838029426468118, "language_loss": 0.76244855, "learning_rate": 2.4131436370355534e-06, "loss": 0.7872076, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.21557617, "step": 7497, "time_per_iteration": 2.8641481399536133 }, { "auxiliary_loss_clip": 0.01453662, "auxiliary_loss_mlp": 0.01042167, "balance_loss_clip": 1.27555704, "balance_loss_mlp": 1.01789641, "epoch": 0.45080414850443407, "flos": 13196212346880.0, "grad_norm": 1.8824213227830577, "language_loss": 0.75531942, "learning_rate": 2.4127625676507088e-06, "loss": 0.78027773, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.24279785, "step": 7498, "time_per_iteration": 2.8473942279815674 }, { "auxiliary_loss_clip": 0.01445746, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.26861322, "balance_loss_mlp": 1.01490676, "epoch": 0.4508642717571021, "flos": 21955045092480.0, "grad_norm": 3.4704450637633006, "language_loss": 0.71558595, "learning_rate": 2.4123814826138663e-06, "loss": 0.74040645, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.21398926, "step": 7499, "time_per_iteration": 2.9410297870635986 }, { "auxiliary_loss_clip": 0.01456927, "auxiliary_loss_mlp": 0.01036562, "balance_loss_clip": 1.27760077, "balance_loss_mlp": 1.01475835, "epoch": 0.45092439500977005, "flos": 23377835664000.0, "grad_norm": 2.185712495579001, "language_loss": 0.78303993, "learning_rate": 2.412000381939477e-06, "loss": 0.80797487, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.21813965, "step": 7500, "time_per_iteration": 4.355379104614258 }, { "auxiliary_loss_clip": 0.01444863, "auxiliary_loss_mlp": 0.01039606, "balance_loss_clip": 1.27034068, "balance_loss_mlp": 1.01843452, "epoch": 0.450984518262438, "flos": 20781608469120.0, "grad_norm": 1.7795001522087839, "language_loss": 0.63499457, "learning_rate": 2.411619265641992e-06, "loss": 0.65983927, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21154785, "step": 7501, "time_per_iteration": 4.305271148681641 }, { "auxiliary_loss_clip": 0.01449767, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.27252269, "balance_loss_mlp": 1.0168947, "epoch": 0.451044641515106, "flos": 17715023884800.0, "grad_norm": 5.33127561482403, "language_loss": 0.85737801, "learning_rate": 2.411238133735863e-06, "loss": 0.88225532, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21069336, "step": 7502, "time_per_iteration": 2.8434741497039795 }, { "auxiliary_loss_clip": 0.01435536, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.26301837, "balance_loss_mlp": 1.01215184, "epoch": 0.45110476476777395, "flos": 20604384011520.0, "grad_norm": 1.2985438445518493, "language_loss": 0.79659235, "learning_rate": 2.4108569862355418e-06, "loss": 0.82127887, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.2097168, "step": 7503, "time_per_iteration": 2.8235387802124023 }, { "auxiliary_loss_clip": 0.01435912, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.26547575, "balance_loss_mlp": 1.01547265, "epoch": 0.4511648880204419, "flos": 16042562651520.0, "grad_norm": 1.7746088312129782, "language_loss": 0.81402564, "learning_rate": 2.410475823155484e-06, "loss": 0.83874226, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.20263672, "step": 7504, "time_per_iteration": 4.22543740272522 }, { "auxiliary_loss_clip": 0.01429615, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.25814629, "balance_loss_mlp": 1.01489758, "epoch": 0.4512250112731099, "flos": 23987022658560.0, "grad_norm": 1.7609570283478744, "language_loss": 0.64074606, "learning_rate": 2.4100946445101405e-06, "loss": 0.66540539, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21435547, "step": 7505, "time_per_iteration": 2.8386025428771973 }, { "auxiliary_loss_clip": 0.01248181, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.14251101, "balance_loss_mlp": 1.01249933, "epoch": 0.45128513452577784, "flos": 71496338924160.0, "grad_norm": 0.8398333130912533, "language_loss": 0.58992809, "learning_rate": 2.409713450313968e-06, "loss": 0.61277044, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 1.0546875, "router_z_loss_mlp": 0.23535156, "step": 7506, "time_per_iteration": 3.3831698894500732 }, { "auxiliary_loss_clip": 0.0143649, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.26494968, "balance_loss_mlp": 1.01114357, "epoch": 0.4513452577784458, "flos": 22100978131200.0, "grad_norm": 1.6372141295113214, "language_loss": 0.79684627, "learning_rate": 2.40933224058142e-06, "loss": 0.82153809, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21533203, "step": 7507, "time_per_iteration": 2.891263008117676 }, { "auxiliary_loss_clip": 0.01444575, "auxiliary_loss_mlp": 0.01039665, "balance_loss_clip": 1.26988423, "balance_loss_mlp": 1.01610923, "epoch": 0.4514053810311138, "flos": 24286173148800.0, "grad_norm": 1.6563284639354048, "language_loss": 0.74030221, "learning_rate": 2.4089510153269526e-06, "loss": 0.76514459, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.23571777, "step": 7508, "time_per_iteration": 2.889578104019165 }, { "auxiliary_loss_clip": 0.01435001, "auxiliary_loss_mlp": 0.01036936, "balance_loss_clip": 1.26435709, "balance_loss_mlp": 1.01630116, "epoch": 0.45146550428378174, "flos": 17893696176000.0, "grad_norm": 2.0395573046248705, "language_loss": 0.80340934, "learning_rate": 2.4085697745650217e-06, "loss": 0.8281287, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.2064209, "step": 7509, "time_per_iteration": 2.8748972415924072 }, { "auxiliary_loss_clip": 0.01432107, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.26123118, "balance_loss_mlp": 1.01102138, "epoch": 0.4515256275364497, "flos": 24254112568320.0, "grad_norm": 1.945381163595029, "language_loss": 0.74407256, "learning_rate": 2.4081885183100837e-06, "loss": 0.76871663, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21264648, "step": 7510, "time_per_iteration": 2.8546926975250244 }, { "auxiliary_loss_clip": 0.01451483, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.27562249, "balance_loss_mlp": 1.01574802, "epoch": 0.45158575078911767, "flos": 20640697603200.0, "grad_norm": 2.6323798126588067, "language_loss": 0.77695966, "learning_rate": 2.4078072465765964e-06, "loss": 0.80184644, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21447754, "step": 7511, "time_per_iteration": 2.8518483638763428 }, { "auxiliary_loss_clip": 0.01451993, "auxiliary_loss_mlp": 0.01036825, "balance_loss_clip": 1.27621281, "balance_loss_mlp": 1.01487875, "epoch": 0.45164587404178563, "flos": 23337269061120.0, "grad_norm": 1.8363495730815926, "language_loss": 0.79831094, "learning_rate": 2.4074259593790174e-06, "loss": 0.82319915, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21948242, "step": 7512, "time_per_iteration": 2.858537197113037 }, { "auxiliary_loss_clip": 0.0146753, "auxiliary_loss_mlp": 0.01038641, "balance_loss_clip": 1.285707, "balance_loss_mlp": 1.01642025, "epoch": 0.45170599729445365, "flos": 23816630165760.0, "grad_norm": 2.2233736709960734, "language_loss": 0.88486713, "learning_rate": 2.4070446567318053e-06, "loss": 0.9099288, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.22216797, "step": 7513, "time_per_iteration": 2.869544506072998 }, { "auxiliary_loss_clip": 0.01423503, "auxiliary_loss_mlp": 0.0103671, "balance_loss_clip": 1.25698113, "balance_loss_mlp": 1.01556206, "epoch": 0.4517661205471216, "flos": 23523225765120.0, "grad_norm": 1.4950478906898914, "language_loss": 0.67995822, "learning_rate": 2.406663338649419e-06, "loss": 0.70456028, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21142578, "step": 7514, "time_per_iteration": 2.889476776123047 }, { "auxiliary_loss_clip": 0.01463059, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.28647804, "balance_loss_mlp": 1.01160216, "epoch": 0.4518262437997896, "flos": 23524492619520.0, "grad_norm": 2.3330860922871537, "language_loss": 0.70272928, "learning_rate": 2.406282005146318e-06, "loss": 0.72770143, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.2253418, "step": 7515, "time_per_iteration": 2.9836409091949463 }, { "auxiliary_loss_clip": 0.01454787, "auxiliary_loss_mlp": 0.01039733, "balance_loss_clip": 1.2758671, "balance_loss_mlp": 1.0169636, "epoch": 0.45188636705245755, "flos": 14574500017920.0, "grad_norm": 4.675468633979767, "language_loss": 0.83237004, "learning_rate": 2.405900656236963e-06, "loss": 0.85731518, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22766113, "step": 7516, "time_per_iteration": 2.825336217880249 }, { "auxiliary_loss_clip": 0.01441017, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.26922405, "balance_loss_mlp": 1.01326585, "epoch": 0.4519464903051255, "flos": 19911077654400.0, "grad_norm": 1.673857177327844, "language_loss": 0.6668725, "learning_rate": 2.4055192919358137e-06, "loss": 0.69162798, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21264648, "step": 7517, "time_per_iteration": 2.8684167861938477 }, { "auxiliary_loss_clip": 0.01439839, "auxiliary_loss_mlp": 0.01037001, "balance_loss_clip": 1.26845467, "balance_loss_mlp": 1.01610327, "epoch": 0.4520066135577935, "flos": 18853866218880.0, "grad_norm": 1.7979159188346792, "language_loss": 0.64002788, "learning_rate": 2.405137912257333e-06, "loss": 0.66479623, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.20898438, "step": 7518, "time_per_iteration": 2.830096960067749 }, { "auxiliary_loss_clip": 0.01443735, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.27062142, "balance_loss_mlp": 1.01269519, "epoch": 0.45206673681046144, "flos": 48231271860480.0, "grad_norm": 1.4703578945787348, "language_loss": 0.60327619, "learning_rate": 2.404756517215982e-06, "loss": 0.6280508, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.21008301, "step": 7519, "time_per_iteration": 3.0923566818237305 }, { "auxiliary_loss_clip": 0.01456659, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.28059196, "balance_loss_mlp": 1.01557279, "epoch": 0.4521268600631294, "flos": 23852762778240.0, "grad_norm": 1.5429550887359644, "language_loss": 0.72990692, "learning_rate": 2.404375106826223e-06, "loss": 0.75484556, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.21643066, "step": 7520, "time_per_iteration": 2.8903303146362305 }, { "auxiliary_loss_clip": 0.01441898, "auxiliary_loss_mlp": 0.01035202, "balance_loss_clip": 1.26612663, "balance_loss_mlp": 1.01395845, "epoch": 0.4521869833157974, "flos": 18852916078080.0, "grad_norm": 3.0471213762301983, "language_loss": 0.76349354, "learning_rate": 2.4039936811025194e-06, "loss": 0.78826451, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21240234, "step": 7521, "time_per_iteration": 2.8569300174713135 }, { "auxiliary_loss_clip": 0.01463324, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.28357923, "balance_loss_mlp": 1.0142467, "epoch": 0.45224710656846534, "flos": 19796526524160.0, "grad_norm": 3.1840144745983285, "language_loss": 0.68132985, "learning_rate": 2.4036122400593343e-06, "loss": 0.70631957, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.2142334, "step": 7522, "time_per_iteration": 2.8193488121032715 }, { "auxiliary_loss_clip": 0.01442578, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.27051318, "balance_loss_mlp": 1.01208329, "epoch": 0.4523072298211333, "flos": 28267655713920.0, "grad_norm": 1.5035068912827847, "language_loss": 0.61344445, "learning_rate": 2.403230783711134e-06, "loss": 0.63820624, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21508789, "step": 7523, "time_per_iteration": 2.892474412918091 }, { "auxiliary_loss_clip": 0.01454555, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.27641201, "balance_loss_mlp": 1.01358914, "epoch": 0.45236735307380127, "flos": 11188196542080.0, "grad_norm": 1.9423573896392052, "language_loss": 0.79059923, "learning_rate": 2.4028493120723813e-06, "loss": 0.81548893, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.20837402, "step": 7524, "time_per_iteration": 2.817948341369629 }, { "auxiliary_loss_clip": 0.01449666, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.27684116, "balance_loss_mlp": 1.01254487, "epoch": 0.45242747632646924, "flos": 22611585409920.0, "grad_norm": 2.8430086199352176, "language_loss": 0.64152706, "learning_rate": 2.4024678251575417e-06, "loss": 0.6663599, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21081543, "step": 7525, "time_per_iteration": 2.8133604526519775 }, { "auxiliary_loss_clip": 0.01440028, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.26871562, "balance_loss_mlp": 1.01365614, "epoch": 0.45248759957913726, "flos": 18264478464000.0, "grad_norm": 3.312750475421455, "language_loss": 0.801054, "learning_rate": 2.402086322981083e-06, "loss": 0.82580125, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21032715, "step": 7526, "time_per_iteration": 2.8109474182128906 }, { "auxiliary_loss_clip": 0.01439537, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.26854277, "balance_loss_mlp": 1.01416886, "epoch": 0.4525477228318052, "flos": 22458865651200.0, "grad_norm": 1.7779127042978262, "language_loss": 0.82333934, "learning_rate": 2.40170480555747e-06, "loss": 0.84808248, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20629883, "step": 7527, "time_per_iteration": 2.8305063247680664 }, { "auxiliary_loss_clip": 0.01444048, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.27126765, "balance_loss_mlp": 1.01372957, "epoch": 0.4526078460844732, "flos": 29657345074560.0, "grad_norm": 1.6919029761316504, "language_loss": 0.66164792, "learning_rate": 2.4013232729011706e-06, "loss": 0.68644285, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21740723, "step": 7528, "time_per_iteration": 2.9238178730010986 }, { "auxiliary_loss_clip": 0.01437383, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.26697946, "balance_loss_mlp": 1.01401556, "epoch": 0.45266796933714115, "flos": 23050198932480.0, "grad_norm": 1.5496654167426727, "language_loss": 0.76085961, "learning_rate": 2.4009417250266525e-06, "loss": 0.78558433, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21081543, "step": 7529, "time_per_iteration": 4.328989505767822 }, { "auxiliary_loss_clip": 0.01450387, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.27534938, "balance_loss_mlp": 1.01914334, "epoch": 0.4527280925898091, "flos": 14436439574400.0, "grad_norm": 2.9625870458893053, "language_loss": 0.74024177, "learning_rate": 2.400560161948384e-06, "loss": 0.76514739, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.21020508, "step": 7530, "time_per_iteration": 2.8579118251800537 }, { "auxiliary_loss_clip": 0.01446265, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 1.27284193, "balance_loss_mlp": 1.01415467, "epoch": 0.4527882158424771, "flos": 22934697661440.0, "grad_norm": 1.8724647519210782, "language_loss": 0.77437466, "learning_rate": 2.400178583680834e-06, "loss": 0.79918849, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20959473, "step": 7531, "time_per_iteration": 2.918513298034668 }, { "auxiliary_loss_clip": 0.01441096, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.27138329, "balance_loss_mlp": 1.01392722, "epoch": 0.45284833909514505, "flos": 25565926348800.0, "grad_norm": 2.5394848168783253, "language_loss": 0.67720497, "learning_rate": 2.3997969902384717e-06, "loss": 0.70196599, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21069336, "step": 7532, "time_per_iteration": 2.905796527862549 }, { "auxiliary_loss_clip": 0.01443196, "auxiliary_loss_mlp": 0.01035689, "balance_loss_clip": 1.2721622, "balance_loss_mlp": 1.01507735, "epoch": 0.452908462347813, "flos": 18159021538560.0, "grad_norm": 2.2172270115341637, "language_loss": 0.79222119, "learning_rate": 2.399415381635768e-06, "loss": 0.81701005, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.20605469, "step": 7533, "time_per_iteration": 2.824237108230591 }, { "auxiliary_loss_clip": 0.01465636, "auxiliary_loss_mlp": 0.01036118, "balance_loss_clip": 1.28271782, "balance_loss_mlp": 1.01390958, "epoch": 0.452968585600481, "flos": 19072154972160.0, "grad_norm": 2.0128232789563048, "language_loss": 0.84328318, "learning_rate": 2.3990337578871927e-06, "loss": 0.86830074, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 1.83007812, "router_z_loss_mlp": 0.2220459, "step": 7534, "time_per_iteration": 2.858508348464966 }, { "auxiliary_loss_clip": 0.01447967, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.27403069, "balance_loss_mlp": 1.01616001, "epoch": 0.45302870885314894, "flos": 22061180689920.0, "grad_norm": 3.658325286393308, "language_loss": 0.77439439, "learning_rate": 2.3986521190072176e-06, "loss": 0.79924679, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21130371, "step": 7535, "time_per_iteration": 4.349774360656738 }, { "auxiliary_loss_clip": 0.01446202, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.27328396, "balance_loss_mlp": 1.01471424, "epoch": 0.4530888321058169, "flos": 20385959523840.0, "grad_norm": 1.568198138846674, "language_loss": 0.81764126, "learning_rate": 2.3982704650103138e-06, "loss": 0.84245133, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20080566, "step": 7536, "time_per_iteration": 2.8531036376953125 }, { "auxiliary_loss_clip": 0.0146027, "auxiliary_loss_mlp": 0.01035114, "balance_loss_clip": 1.28320265, "balance_loss_mlp": 1.0144788, "epoch": 0.4531489553584849, "flos": 14838739505280.0, "grad_norm": 2.0854179894787896, "language_loss": 0.76751143, "learning_rate": 2.3978887959109544e-06, "loss": 0.79246521, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.20629883, "step": 7537, "time_per_iteration": 4.214274883270264 }, { "auxiliary_loss_clip": 0.01448719, "auxiliary_loss_mlp": 0.01030841, "balance_loss_clip": 1.2737875, "balance_loss_mlp": 1.01053929, "epoch": 0.45320907861115284, "flos": 21955090337280.0, "grad_norm": 1.8957156611338983, "language_loss": 0.76172811, "learning_rate": 2.3975071117236118e-06, "loss": 0.7865237, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.20300293, "step": 7538, "time_per_iteration": 2.8562347888946533 }, { "auxiliary_loss_clip": 0.01249209, "auxiliary_loss_mlp": 0.01020612, "balance_loss_clip": 1.14379907, "balance_loss_mlp": 0.99734223, "epoch": 0.45326920186382086, "flos": 66285849841920.0, "grad_norm": 0.8239588358585198, "language_loss": 0.62378681, "learning_rate": 2.3971254124627593e-06, "loss": 0.64648503, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.23242188, "step": 7539, "time_per_iteration": 4.736531496047974 }, { "auxiliary_loss_clip": 0.01446929, "auxiliary_loss_mlp": 0.01041127, "balance_loss_clip": 1.27439952, "balance_loss_mlp": 1.02011025, "epoch": 0.4533293251164888, "flos": 14692444508160.0, "grad_norm": 1.8537091325693893, "language_loss": 0.66714627, "learning_rate": 2.396743698142872e-06, "loss": 0.69202685, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21008301, "step": 7540, "time_per_iteration": 2.8608927726745605 }, { "auxiliary_loss_clip": 0.01479251, "auxiliary_loss_mlp": 0.01039469, "balance_loss_clip": 1.295784, "balance_loss_mlp": 1.01783252, "epoch": 0.4533894483691568, "flos": 22611359185920.0, "grad_norm": 1.8450953314400171, "language_loss": 0.85760534, "learning_rate": 2.396361968778424e-06, "loss": 0.88279259, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.21630859, "step": 7541, "time_per_iteration": 3.0485739707946777 }, { "auxiliary_loss_clip": 0.01450253, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.27531767, "balance_loss_mlp": 1.01466203, "epoch": 0.45344957162182475, "flos": 34765091919360.0, "grad_norm": 2.9142560545188307, "language_loss": 0.7736128, "learning_rate": 2.395980224383889e-06, "loss": 0.79846281, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.20092773, "step": 7542, "time_per_iteration": 2.9551656246185303 }, { "auxiliary_loss_clip": 0.01457109, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.28214061, "balance_loss_mlp": 1.01066446, "epoch": 0.4535096948744927, "flos": 23560715721600.0, "grad_norm": 1.6525995216020746, "language_loss": 0.81075853, "learning_rate": 2.395598464973746e-06, "loss": 0.83565569, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.21948242, "step": 7543, "time_per_iteration": 2.863460063934326 }, { "auxiliary_loss_clip": 0.0145673, "auxiliary_loss_mlp": 0.01040192, "balance_loss_clip": 1.28101873, "balance_loss_mlp": 1.01946163, "epoch": 0.4535698181271607, "flos": 25568731526400.0, "grad_norm": 1.9060538144553325, "language_loss": 0.77129781, "learning_rate": 2.395216690562469e-06, "loss": 0.79626697, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.20727539, "step": 7544, "time_per_iteration": 2.852433443069458 }, { "auxiliary_loss_clip": 0.01469002, "auxiliary_loss_mlp": 0.01033996, "balance_loss_clip": 1.29114306, "balance_loss_mlp": 1.01377809, "epoch": 0.45362994137982865, "flos": 24875063210880.0, "grad_norm": 1.7532497011283266, "language_loss": 0.76162183, "learning_rate": 2.3948349011645355e-06, "loss": 0.78665185, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.20227051, "step": 7545, "time_per_iteration": 2.848027467727661 }, { "auxiliary_loss_clip": 0.01462777, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.28767943, "balance_loss_mlp": 1.01788795, "epoch": 0.4536900646324966, "flos": 30818113153920.0, "grad_norm": 2.4828327428289954, "language_loss": 0.72775596, "learning_rate": 2.394453096794423e-06, "loss": 0.75276577, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.20300293, "step": 7546, "time_per_iteration": 2.893627166748047 }, { "auxiliary_loss_clip": 0.01483038, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.30031276, "balance_loss_mlp": 1.01738954, "epoch": 0.4537501878851646, "flos": 23414782682880.0, "grad_norm": 1.5697198342730723, "language_loss": 0.76530933, "learning_rate": 2.394071277466609e-06, "loss": 0.79052317, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.20947266, "step": 7547, "time_per_iteration": 2.832987070083618 }, { "auxiliary_loss_clip": 0.01456459, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.2794683, "balance_loss_mlp": 1.01511478, "epoch": 0.45381031113783254, "flos": 18158116642560.0, "grad_norm": 5.05534316149341, "language_loss": 0.70368606, "learning_rate": 2.393689443195573e-06, "loss": 0.72861695, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21520996, "step": 7548, "time_per_iteration": 2.8207101821899414 }, { "auxiliary_loss_clip": 0.01436285, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.2639271, "balance_loss_mlp": 1.01516652, "epoch": 0.4538704343905005, "flos": 25347275637120.0, "grad_norm": 2.1418994688866713, "language_loss": 0.74046648, "learning_rate": 2.393307593995794e-06, "loss": 0.7651881, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20690918, "step": 7549, "time_per_iteration": 2.854933500289917 }, { "auxiliary_loss_clip": 0.01443068, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.27014446, "balance_loss_mlp": 1.0123744, "epoch": 0.4539305576431685, "flos": 28743351989760.0, "grad_norm": 1.5991919392273304, "language_loss": 0.65820652, "learning_rate": 2.392925729881751e-06, "loss": 0.68295991, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.19897461, "step": 7550, "time_per_iteration": 2.934553861618042 }, { "auxiliary_loss_clip": 0.01439402, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.26796651, "balance_loss_mlp": 1.01399767, "epoch": 0.45399068089583644, "flos": 22502689879680.0, "grad_norm": 1.6003325085560562, "language_loss": 0.69228423, "learning_rate": 2.3925438508679263e-06, "loss": 0.71701956, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20129395, "step": 7551, "time_per_iteration": 2.920234203338623 }, { "auxiliary_loss_clip": 0.014518, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 1.27381015, "balance_loss_mlp": 1.01371336, "epoch": 0.45405080414850446, "flos": 12900364727040.0, "grad_norm": 2.413844958644645, "language_loss": 0.7994386, "learning_rate": 2.392161956968798e-06, "loss": 0.82429934, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.20556641, "step": 7552, "time_per_iteration": 3.029266834259033 }, { "auxiliary_loss_clip": 0.01238088, "auxiliary_loss_mlp": 0.01026707, "balance_loss_clip": 1.13647985, "balance_loss_mlp": 1.00725198, "epoch": 0.4541109274011724, "flos": 59792739154560.0, "grad_norm": 0.8169863969579341, "language_loss": 0.57792127, "learning_rate": 2.39178004819885e-06, "loss": 0.60056925, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.19433594, "step": 7553, "time_per_iteration": 3.3373911380767822 }, { "auxiliary_loss_clip": 0.0143803, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.26663494, "balance_loss_mlp": 1.01859927, "epoch": 0.4541710506538404, "flos": 28523389178880.0, "grad_norm": 1.3895489379159014, "language_loss": 0.77419794, "learning_rate": 2.3913981245725626e-06, "loss": 0.79895949, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.1953125, "step": 7554, "time_per_iteration": 2.911853790283203 }, { "auxiliary_loss_clip": 0.01458209, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.28070104, "balance_loss_mlp": 1.01291394, "epoch": 0.45423117390650836, "flos": 17684456382720.0, "grad_norm": 3.6058234650716776, "language_loss": 0.78107023, "learning_rate": 2.3910161861044194e-06, "loss": 0.8059988, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.21728516, "step": 7555, "time_per_iteration": 2.8864855766296387 }, { "auxiliary_loss_clip": 0.01438451, "auxiliary_loss_mlp": 0.01031226, "balance_loss_clip": 1.26466727, "balance_loss_mlp": 1.01035225, "epoch": 0.4542912971591763, "flos": 28083735025920.0, "grad_norm": 1.5842650249395722, "language_loss": 0.73078811, "learning_rate": 2.390634232808903e-06, "loss": 0.75548482, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20874023, "step": 7556, "time_per_iteration": 2.96071195602417 }, { "auxiliary_loss_clip": 0.0146072, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.28212595, "balance_loss_mlp": 1.01734281, "epoch": 0.4543514204118443, "flos": 22681316926080.0, "grad_norm": 2.45368288432121, "language_loss": 0.6471734, "learning_rate": 2.3902522647004982e-06, "loss": 0.67217505, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22106934, "step": 7557, "time_per_iteration": 2.910444974899292 }, { "auxiliary_loss_clip": 0.01245334, "auxiliary_loss_mlp": 0.0103194, "balance_loss_clip": 1.14205742, "balance_loss_mlp": 1.01134074, "epoch": 0.45441154366451225, "flos": 58246773678720.0, "grad_norm": 0.6952756895395331, "language_loss": 0.57673365, "learning_rate": 2.3898702817936875e-06, "loss": 0.59950638, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.20605469, "step": 7558, "time_per_iteration": 3.285207748413086 }, { "auxiliary_loss_clip": 0.01446834, "auxiliary_loss_mlp": 0.01034834, "balance_loss_clip": 1.26888442, "balance_loss_mlp": 1.01292372, "epoch": 0.4544716669171802, "flos": 16773132741120.0, "grad_norm": 2.5117343893454986, "language_loss": 0.58298039, "learning_rate": 2.3894882841029573e-06, "loss": 0.60779715, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21923828, "step": 7559, "time_per_iteration": 2.814863920211792 }, { "auxiliary_loss_clip": 0.01436948, "auxiliary_loss_mlp": 0.01035659, "balance_loss_clip": 1.26365411, "balance_loss_mlp": 1.01373601, "epoch": 0.4545317901698482, "flos": 15933893345280.0, "grad_norm": 1.9080769478285835, "language_loss": 0.72724855, "learning_rate": 2.389106271642792e-06, "loss": 0.75197458, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21899414, "step": 7560, "time_per_iteration": 2.8332369327545166 }, { "auxiliary_loss_clip": 0.01464225, "auxiliary_loss_mlp": 0.01035741, "balance_loss_clip": 1.28395259, "balance_loss_mlp": 1.01425958, "epoch": 0.45459191342251615, "flos": 17648776218240.0, "grad_norm": 8.98567723893478, "language_loss": 0.70624834, "learning_rate": 2.3887242444276775e-06, "loss": 0.73124802, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.21484375, "step": 7561, "time_per_iteration": 2.899129629135132 }, { "auxiliary_loss_clip": 0.01438982, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.26907551, "balance_loss_mlp": 1.01309454, "epoch": 0.4546520366751841, "flos": 16184287923840.0, "grad_norm": 2.597827250449868, "language_loss": 0.86279541, "learning_rate": 2.3883422024721015e-06, "loss": 0.88751471, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.19836426, "step": 7562, "time_per_iteration": 2.9800360202789307 }, { "auxiliary_loss_clip": 0.01432064, "auxiliary_loss_mlp": 0.01041132, "balance_loss_clip": 1.26209497, "balance_loss_mlp": 1.02044916, "epoch": 0.4547121599278521, "flos": 19759850974080.0, "grad_norm": 1.9094296854488118, "language_loss": 0.89894557, "learning_rate": 2.38796014579055e-06, "loss": 0.92367756, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20678711, "step": 7563, "time_per_iteration": 2.866267681121826 }, { "auxiliary_loss_clip": 0.01449666, "auxiliary_loss_mlp": 0.01035786, "balance_loss_clip": 1.27346337, "balance_loss_mlp": 1.01420927, "epoch": 0.45477228318052004, "flos": 19946848308480.0, "grad_norm": 8.793927179852368, "language_loss": 0.72725838, "learning_rate": 2.3875780743975097e-06, "loss": 0.75211287, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.21557617, "step": 7564, "time_per_iteration": 4.202565431594849 }, { "auxiliary_loss_clip": 0.0144334, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.26771319, "balance_loss_mlp": 1.01473296, "epoch": 0.454832406433188, "flos": 21298278551040.0, "grad_norm": 2.16034561099508, "language_loss": 0.69059789, "learning_rate": 2.3871959883074713e-06, "loss": 0.71539307, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21447754, "step": 7565, "time_per_iteration": 2.8455355167388916 }, { "auxiliary_loss_clip": 0.01436771, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.26393414, "balance_loss_mlp": 1.01282406, "epoch": 0.45489252968585603, "flos": 24509438830080.0, "grad_norm": 1.7616022733784116, "language_loss": 0.80892491, "learning_rate": 2.386813887534922e-06, "loss": 0.83363348, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21252441, "step": 7566, "time_per_iteration": 2.8623225688934326 }, { "auxiliary_loss_clip": 0.0143339, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.25822997, "balance_loss_mlp": 1.01352024, "epoch": 0.454952652938524, "flos": 17101357655040.0, "grad_norm": 1.6263523332211243, "language_loss": 0.74332327, "learning_rate": 2.3864317720943508e-06, "loss": 0.76800776, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21508789, "step": 7567, "time_per_iteration": 2.8396365642547607 }, { "auxiliary_loss_clip": 0.01445049, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.27018213, "balance_loss_mlp": 1.0153482, "epoch": 0.45501277619119196, "flos": 27640325554560.0, "grad_norm": 1.505000048665826, "language_loss": 0.81605512, "learning_rate": 2.386049642000249e-06, "loss": 0.8408711, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.2121582, "step": 7568, "time_per_iteration": 2.9185948371887207 }, { "auxiliary_loss_clip": 0.01463406, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.28281927, "balance_loss_mlp": 1.01351106, "epoch": 0.4550728994438599, "flos": 19984247775360.0, "grad_norm": 2.6853624663203743, "language_loss": 0.80904257, "learning_rate": 2.3856674972671055e-06, "loss": 0.83403903, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22753906, "step": 7569, "time_per_iteration": 2.8405261039733887 }, { "auxiliary_loss_clip": 0.01449743, "auxiliary_loss_mlp": 0.01036716, "balance_loss_clip": 1.27161968, "balance_loss_mlp": 1.01533031, "epoch": 0.4551330226965279, "flos": 26077800481920.0, "grad_norm": 1.5025613665284998, "language_loss": 0.75490654, "learning_rate": 2.385285337909412e-06, "loss": 0.77977109, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.21398926, "step": 7570, "time_per_iteration": 4.338579416275024 }, { "auxiliary_loss_clip": 0.01436434, "auxiliary_loss_mlp": 0.01042181, "balance_loss_clip": 1.26517856, "balance_loss_mlp": 1.01928043, "epoch": 0.45519314594919585, "flos": 32793977888640.0, "grad_norm": 1.8464120244707356, "language_loss": 0.75242269, "learning_rate": 2.3849031639416596e-06, "loss": 0.77720881, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.22900391, "step": 7571, "time_per_iteration": 2.9665868282318115 }, { "auxiliary_loss_clip": 0.01426166, "auxiliary_loss_mlp": 0.01031028, "balance_loss_clip": 1.2587136, "balance_loss_mlp": 1.01119101, "epoch": 0.4552532692018638, "flos": 19182226867200.0, "grad_norm": 1.447537095009797, "language_loss": 0.81807041, "learning_rate": 2.3845209753783414e-06, "loss": 0.84264237, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19836426, "step": 7572, "time_per_iteration": 4.227491140365601 }, { "auxiliary_loss_clip": 0.01462793, "auxiliary_loss_mlp": 0.0103705, "balance_loss_clip": 1.28391314, "balance_loss_mlp": 1.0146265, "epoch": 0.4553133924545318, "flos": 26037233879040.0, "grad_norm": 2.1151203355951176, "language_loss": 0.7341994, "learning_rate": 2.3841387722339486e-06, "loss": 0.75919783, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22412109, "step": 7573, "time_per_iteration": 2.8964827060699463 }, { "auxiliary_loss_clip": 0.01460676, "auxiliary_loss_mlp": 0.0104104, "balance_loss_clip": 1.28253913, "balance_loss_mlp": 1.01792538, "epoch": 0.45537351570719975, "flos": 30672994521600.0, "grad_norm": 8.922118262640486, "language_loss": 0.74788111, "learning_rate": 2.3837565545229748e-06, "loss": 0.77289832, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.23132324, "step": 7574, "time_per_iteration": 4.384619235992432 }, { "auxiliary_loss_clip": 0.01453033, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.2769835, "balance_loss_mlp": 1.01322937, "epoch": 0.4554336389598677, "flos": 24364229708160.0, "grad_norm": 1.6915863172451424, "language_loss": 0.72359967, "learning_rate": 2.383374322259915e-06, "loss": 0.74847674, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.21447754, "step": 7575, "time_per_iteration": 2.9089901447296143 }, { "auxiliary_loss_clip": 0.01439679, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.26483953, "balance_loss_mlp": 1.01542759, "epoch": 0.4554937622125357, "flos": 20567527482240.0, "grad_norm": 1.8383230564333382, "language_loss": 0.74455279, "learning_rate": 2.3829920754592617e-06, "loss": 0.76932055, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.21679688, "step": 7576, "time_per_iteration": 2.8489561080932617 }, { "auxiliary_loss_clip": 0.01434245, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.26350284, "balance_loss_mlp": 1.0195061, "epoch": 0.45555388546520365, "flos": 22831141017600.0, "grad_norm": 1.706013871154292, "language_loss": 0.6719833, "learning_rate": 2.382609814135511e-06, "loss": 0.69673479, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.21386719, "step": 7577, "time_per_iteration": 2.8618245124816895 }, { "auxiliary_loss_clip": 0.01444015, "auxiliary_loss_mlp": 0.01047905, "balance_loss_clip": 1.27028966, "balance_loss_mlp": 1.02479017, "epoch": 0.4556140087178716, "flos": 21736032422400.0, "grad_norm": 1.8003414044926744, "language_loss": 0.75002128, "learning_rate": 2.382227538303157e-06, "loss": 0.77494049, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.23120117, "step": 7578, "time_per_iteration": 2.883845329284668 }, { "auxiliary_loss_clip": 0.01434239, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.26084661, "balance_loss_mlp": 1.01828718, "epoch": 0.45567413197053963, "flos": 26005580501760.0, "grad_norm": 1.8526359869111961, "language_loss": 0.71722507, "learning_rate": 2.381845247976697e-06, "loss": 0.74196541, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.21508789, "step": 7579, "time_per_iteration": 2.889981985092163 }, { "auxiliary_loss_clip": 0.01441936, "auxiliary_loss_mlp": 0.01037955, "balance_loss_clip": 1.26905036, "balance_loss_mlp": 1.01792765, "epoch": 0.4557342552232076, "flos": 21546048931200.0, "grad_norm": 1.9089140279441101, "language_loss": 0.79688966, "learning_rate": 2.381462943170627e-06, "loss": 0.82168853, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.20031738, "step": 7580, "time_per_iteration": 2.860614776611328 }, { "auxiliary_loss_clip": 0.01439235, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.26762819, "balance_loss_mlp": 1.01771092, "epoch": 0.45579437847587556, "flos": 40015876135680.0, "grad_norm": 1.950181382572291, "language_loss": 0.68972373, "learning_rate": 2.381080623899444e-06, "loss": 0.71451092, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21777344, "step": 7581, "time_per_iteration": 3.020888566970825 }, { "auxiliary_loss_clip": 0.01435621, "auxiliary_loss_mlp": 0.01038144, "balance_loss_clip": 1.2664696, "balance_loss_mlp": 1.01742554, "epoch": 0.4558545017285435, "flos": 31150772058240.0, "grad_norm": 1.7444092383859002, "language_loss": 0.73943371, "learning_rate": 2.3806982901776455e-06, "loss": 0.76417136, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20715332, "step": 7582, "time_per_iteration": 2.9284143447875977 }, { "auxiliary_loss_clip": 0.01455487, "auxiliary_loss_mlp": 0.01047559, "balance_loss_clip": 1.27940774, "balance_loss_mlp": 1.02502894, "epoch": 0.4559146249812115, "flos": 21735760953600.0, "grad_norm": 1.6435245149878732, "language_loss": 0.73481107, "learning_rate": 2.380315942019729e-06, "loss": 0.75984144, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.2253418, "step": 7583, "time_per_iteration": 2.8274807929992676 }, { "auxiliary_loss_clip": 0.01450039, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.27267289, "balance_loss_mlp": 1.01444459, "epoch": 0.45597474823387946, "flos": 23816630165760.0, "grad_norm": 1.6987208491479704, "language_loss": 0.73012036, "learning_rate": 2.379933579440195e-06, "loss": 0.7549718, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.20666504, "step": 7584, "time_per_iteration": 2.976654529571533 }, { "auxiliary_loss_clip": 0.0145073, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.27759147, "balance_loss_mlp": 1.01192498, "epoch": 0.4560348714865474, "flos": 31918379656320.0, "grad_norm": 1.8794146153920301, "language_loss": 0.6854108, "learning_rate": 2.379551202453541e-06, "loss": 0.7102567, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.21948242, "step": 7585, "time_per_iteration": 2.9197587966918945 }, { "auxiliary_loss_clip": 0.01454096, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.27896333, "balance_loss_mlp": 1.01626277, "epoch": 0.4560949947392154, "flos": 22057515861120.0, "grad_norm": 1.6174023188146225, "language_loss": 0.7709403, "learning_rate": 2.379168811074267e-06, "loss": 0.79585272, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.2088623, "step": 7586, "time_per_iteration": 2.847783088684082 }, { "auxiliary_loss_clip": 0.01440956, "auxiliary_loss_mlp": 0.01036033, "balance_loss_clip": 1.26877499, "balance_loss_mlp": 1.01527822, "epoch": 0.45615511799188335, "flos": 24582563706240.0, "grad_norm": 1.9455488369331304, "language_loss": 0.78947526, "learning_rate": 2.3787864053168747e-06, "loss": 0.8142451, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20751953, "step": 7587, "time_per_iteration": 2.9298770427703857 }, { "auxiliary_loss_clip": 0.01474189, "auxiliary_loss_mlp": 0.01043037, "balance_loss_clip": 1.29167795, "balance_loss_mlp": 1.02179432, "epoch": 0.4562152412445513, "flos": 18339322642560.0, "grad_norm": 1.9002097418180268, "language_loss": 0.69483757, "learning_rate": 2.378403985195863e-06, "loss": 0.72000986, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.21264648, "step": 7588, "time_per_iteration": 2.8587167263031006 }, { "auxiliary_loss_clip": 0.01436177, "auxiliary_loss_mlp": 0.01040905, "balance_loss_clip": 1.26659036, "balance_loss_mlp": 1.0190661, "epoch": 0.4562753644972193, "flos": 13524392016000.0, "grad_norm": 2.0219648214081474, "language_loss": 0.7979728, "learning_rate": 2.378021550725735e-06, "loss": 0.82274365, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21838379, "step": 7589, "time_per_iteration": 2.867971420288086 }, { "auxiliary_loss_clip": 0.01444354, "auxiliary_loss_mlp": 0.01039787, "balance_loss_clip": 1.27054203, "balance_loss_mlp": 1.01700604, "epoch": 0.45633548774988725, "flos": 29650829823360.0, "grad_norm": 2.2688258761597644, "language_loss": 0.63853067, "learning_rate": 2.377639101920992e-06, "loss": 0.66337204, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.22790527, "step": 7590, "time_per_iteration": 2.909085273742676 }, { "auxiliary_loss_clip": 0.01451568, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.276214, "balance_loss_mlp": 1.01591563, "epoch": 0.4563956110025552, "flos": 22242703403520.0, "grad_norm": 2.6151672286298555, "language_loss": 0.7296077, "learning_rate": 2.377256638796135e-06, "loss": 0.75448954, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.20703125, "step": 7591, "time_per_iteration": 2.8758602142333984 }, { "auxiliary_loss_clip": 0.01444244, "auxiliary_loss_mlp": 0.01045374, "balance_loss_clip": 1.27019811, "balance_loss_mlp": 1.02316523, "epoch": 0.45645573425522323, "flos": 17100814717440.0, "grad_norm": 2.2599114657751347, "language_loss": 0.78497398, "learning_rate": 2.3768741613656695e-06, "loss": 0.80987012, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.22216797, "step": 7592, "time_per_iteration": 2.849686622619629 }, { "auxiliary_loss_clip": 0.01449198, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.2736963, "balance_loss_mlp": 1.01822352, "epoch": 0.4565158575078912, "flos": 20340144524160.0, "grad_norm": 2.277720968368235, "language_loss": 0.70817959, "learning_rate": 2.376491669644098e-06, "loss": 0.73305625, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.20239258, "step": 7593, "time_per_iteration": 2.9023921489715576 }, { "auxiliary_loss_clip": 0.01432356, "auxiliary_loss_mlp": 0.01036493, "balance_loss_clip": 1.26216722, "balance_loss_mlp": 1.01546443, "epoch": 0.45657598076055916, "flos": 23992316300160.0, "grad_norm": 1.9392908925271055, "language_loss": 0.84623253, "learning_rate": 2.3761091636459248e-06, "loss": 0.87092102, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21020508, "step": 7594, "time_per_iteration": 2.9609975814819336 }, { "auxiliary_loss_clip": 0.01258472, "auxiliary_loss_mlp": 0.01047196, "balance_loss_clip": 1.15381527, "balance_loss_mlp": 1.01686907, "epoch": 0.45663610401322713, "flos": 69393028291200.0, "grad_norm": 0.8017831952116005, "language_loss": 0.52821457, "learning_rate": 2.375726643385654e-06, "loss": 0.5512712, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.30273438, "step": 7595, "time_per_iteration": 3.4565269947052 }, { "auxiliary_loss_clip": 0.01455675, "auxiliary_loss_mlp": 0.01035446, "balance_loss_clip": 1.2759428, "balance_loss_mlp": 1.01421535, "epoch": 0.4566962272658951, "flos": 15154974547200.0, "grad_norm": 2.8190747279946824, "language_loss": 0.87717551, "learning_rate": 2.3753441088777915e-06, "loss": 0.90208673, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.21228027, "step": 7596, "time_per_iteration": 2.8314874172210693 }, { "auxiliary_loss_clip": 0.01461636, "auxiliary_loss_mlp": 0.01043433, "balance_loss_clip": 1.2860297, "balance_loss_mlp": 1.02209473, "epoch": 0.45675635051856306, "flos": 18706666325760.0, "grad_norm": 1.5216803503314862, "language_loss": 0.77822578, "learning_rate": 2.374961560136843e-06, "loss": 0.80327648, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.21350098, "step": 7597, "time_per_iteration": 2.80583119392395 }, { "auxiliary_loss_clip": 0.01451962, "auxiliary_loss_mlp": 0.01037521, "balance_loss_clip": 1.27596998, "balance_loss_mlp": 1.01539564, "epoch": 0.456816473771231, "flos": 19107473178240.0, "grad_norm": 1.662337096903005, "language_loss": 0.79174864, "learning_rate": 2.374578997177314e-06, "loss": 0.81664348, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.22131348, "step": 7598, "time_per_iteration": 2.847041368484497 }, { "auxiliary_loss_clip": 0.0143436, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.26242721, "balance_loss_mlp": 1.01212215, "epoch": 0.456876597023899, "flos": 28961957456640.0, "grad_norm": 2.5000267652049186, "language_loss": 0.72607541, "learning_rate": 2.374196420013712e-06, "loss": 0.7507503, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20996094, "step": 7599, "time_per_iteration": 4.2605321407318115 }, { "auxiliary_loss_clip": 0.01436548, "auxiliary_loss_mlp": 0.0103774, "balance_loss_clip": 1.26444328, "balance_loss_mlp": 1.01623464, "epoch": 0.45693672027656695, "flos": 23298512250240.0, "grad_norm": 2.4036527224821387, "language_loss": 0.70541847, "learning_rate": 2.373813828660544e-06, "loss": 0.73016131, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.21484375, "step": 7600, "time_per_iteration": 2.8635849952697754 }, { "auxiliary_loss_clip": 0.01454535, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.27878165, "balance_loss_mlp": 1.01670897, "epoch": 0.4569968435292349, "flos": 20567889440640.0, "grad_norm": 2.2593700131700745, "language_loss": 0.79884666, "learning_rate": 2.373431223132319e-06, "loss": 0.82377791, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21875, "step": 7601, "time_per_iteration": 2.8804054260253906 }, { "auxiliary_loss_clip": 0.01456886, "auxiliary_loss_mlp": 0.01036999, "balance_loss_clip": 1.28018165, "balance_loss_mlp": 1.01705503, "epoch": 0.4570569667819029, "flos": 41297801086080.0, "grad_norm": 2.9982826855095244, "language_loss": 0.72501063, "learning_rate": 2.3730486034435448e-06, "loss": 0.74994946, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.19946289, "step": 7602, "time_per_iteration": 3.0245790481567383 }, { "auxiliary_loss_clip": 0.01453868, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.27768707, "balance_loss_mlp": 1.01488888, "epoch": 0.45711709003457085, "flos": 26042798989440.0, "grad_norm": 2.024513082480885, "language_loss": 0.74407864, "learning_rate": 2.372665969608729e-06, "loss": 0.76898909, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.22290039, "step": 7603, "time_per_iteration": 2.8460190296173096 }, { "auxiliary_loss_clip": 0.01446665, "auxiliary_loss_mlp": 0.01037175, "balance_loss_clip": 1.27253616, "balance_loss_mlp": 1.01569343, "epoch": 0.4571772132872388, "flos": 22166728104960.0, "grad_norm": 1.7710762089831429, "language_loss": 0.83452547, "learning_rate": 2.372283321642383e-06, "loss": 0.85936385, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.21472168, "step": 7604, "time_per_iteration": 2.859025001525879 }, { "auxiliary_loss_clip": 0.01468538, "auxiliary_loss_mlp": 0.01042878, "balance_loss_clip": 1.28734159, "balance_loss_mlp": 1.02034724, "epoch": 0.45723733653990684, "flos": 23889528817920.0, "grad_norm": 1.8576968278892065, "language_loss": 0.87070775, "learning_rate": 2.371900659559016e-06, "loss": 0.89582193, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.22521973, "step": 7605, "time_per_iteration": 2.939392566680908 }, { "auxiliary_loss_clip": 0.01472591, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.2938484, "balance_loss_mlp": 1.01628411, "epoch": 0.4572974597925748, "flos": 16880173234560.0, "grad_norm": 1.945181262612315, "language_loss": 0.74755496, "learning_rate": 2.371517983373138e-06, "loss": 0.77265733, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21374512, "step": 7606, "time_per_iteration": 5.642092227935791 }, { "auxiliary_loss_clip": 0.01463893, "auxiliary_loss_mlp": 0.01038215, "balance_loss_clip": 1.28380358, "balance_loss_mlp": 1.01625633, "epoch": 0.45735758304524277, "flos": 13779311074560.0, "grad_norm": 2.7528073491231124, "language_loss": 0.81181383, "learning_rate": 2.371135293099262e-06, "loss": 0.83683491, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.21972656, "step": 7607, "time_per_iteration": 2.847872734069824 }, { "auxiliary_loss_clip": 0.01459627, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.28273177, "balance_loss_mlp": 1.01803052, "epoch": 0.45741770629791073, "flos": 21109335690240.0, "grad_norm": 2.8447925849831415, "language_loss": 0.81191951, "learning_rate": 2.3707525887518982e-06, "loss": 0.83691168, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.2154541, "step": 7608, "time_per_iteration": 4.293159008026123 }, { "auxiliary_loss_clip": 0.01443923, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.26896775, "balance_loss_mlp": 1.01329947, "epoch": 0.4574778295505787, "flos": 23123414298240.0, "grad_norm": 1.783088245684045, "language_loss": 0.69084692, "learning_rate": 2.370369870345559e-06, "loss": 0.71562934, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.21020508, "step": 7609, "time_per_iteration": 2.8407235145568848 }, { "auxiliary_loss_clip": 0.01447172, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.27084959, "balance_loss_mlp": 1.01579583, "epoch": 0.45753795280324666, "flos": 24363641525760.0, "grad_norm": 1.9463402177206985, "language_loss": 0.8190971, "learning_rate": 2.369987137894757e-06, "loss": 0.84393096, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.20422363, "step": 7610, "time_per_iteration": 2.9315028190612793 }, { "auxiliary_loss_clip": 0.01469481, "auxiliary_loss_mlp": 0.01038719, "balance_loss_clip": 1.29037023, "balance_loss_mlp": 1.01711798, "epoch": 0.4575980760559146, "flos": 16662336929280.0, "grad_norm": 2.4640997193383254, "language_loss": 0.83330953, "learning_rate": 2.3696043914140057e-06, "loss": 0.85839158, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21606445, "step": 7611, "time_per_iteration": 2.894212007522583 }, { "auxiliary_loss_clip": 0.01453259, "auxiliary_loss_mlp": 0.01038898, "balance_loss_clip": 1.27775788, "balance_loss_mlp": 1.01691604, "epoch": 0.4576581993085826, "flos": 35923190555520.0, "grad_norm": 1.7801134098653644, "language_loss": 0.74344808, "learning_rate": 2.369221630917819e-06, "loss": 0.76836962, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.2199707, "step": 7612, "time_per_iteration": 2.9535319805145264 }, { "auxiliary_loss_clip": 0.01434086, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.26087904, "balance_loss_mlp": 1.01311851, "epoch": 0.45771832256125056, "flos": 20089071273600.0, "grad_norm": 1.5740971651509195, "language_loss": 0.85576648, "learning_rate": 2.368838856420711e-06, "loss": 0.88045168, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21313477, "step": 7613, "time_per_iteration": 2.872124195098877 }, { "auxiliary_loss_clip": 0.01450439, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.27505064, "balance_loss_mlp": 1.01220095, "epoch": 0.4577784458139185, "flos": 10751664280320.0, "grad_norm": 1.9405372042217766, "language_loss": 0.7622081, "learning_rate": 2.3684560679371965e-06, "loss": 0.78705573, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.22119141, "step": 7614, "time_per_iteration": 2.8006951808929443 }, { "auxiliary_loss_clip": 0.01444563, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.27201819, "balance_loss_mlp": 1.01830029, "epoch": 0.4578385690665865, "flos": 21917057443200.0, "grad_norm": 1.5676807958794956, "language_loss": 0.75239646, "learning_rate": 2.368073265481791e-06, "loss": 0.77723312, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20788574, "step": 7615, "time_per_iteration": 2.8754522800445557 }, { "auxiliary_loss_clip": 0.01265543, "auxiliary_loss_mlp": 0.01022911, "balance_loss_clip": 1.16435456, "balance_loss_mlp": 1.00116754, "epoch": 0.45789869231925445, "flos": 64783536894720.0, "grad_norm": 0.7805034422521424, "language_loss": 0.57637572, "learning_rate": 2.3676904490690105e-06, "loss": 0.59926027, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.21777344, "step": 7616, "time_per_iteration": 3.3128390312194824 }, { "auxiliary_loss_clip": 0.01441806, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.26735723, "balance_loss_mlp": 1.0152328, "epoch": 0.4579588155719224, "flos": 16152679791360.0, "grad_norm": 1.8139678018668783, "language_loss": 0.7173878, "learning_rate": 2.3673076187133704e-06, "loss": 0.74217051, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21252441, "step": 7617, "time_per_iteration": 2.8204562664031982 }, { "auxiliary_loss_clip": 0.01442577, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.26913333, "balance_loss_mlp": 1.00994706, "epoch": 0.45801893882459044, "flos": 21404957086080.0, "grad_norm": 3.1054508614221827, "language_loss": 0.77347016, "learning_rate": 2.36692477442939e-06, "loss": 0.79821545, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.22033691, "step": 7618, "time_per_iteration": 2.878480911254883 }, { "auxiliary_loss_clip": 0.01444722, "auxiliary_loss_mlp": 0.01035965, "balance_loss_clip": 1.26887989, "balance_loss_mlp": 1.01577115, "epoch": 0.4580790620772584, "flos": 19545950966400.0, "grad_norm": 1.7298875576255415, "language_loss": 0.77835459, "learning_rate": 2.366541916231585e-06, "loss": 0.8031615, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.2019043, "step": 7619, "time_per_iteration": 2.8444936275482178 }, { "auxiliary_loss_clip": 0.01441188, "auxiliary_loss_mlp": 0.01035814, "balance_loss_clip": 1.26782274, "balance_loss_mlp": 1.01439261, "epoch": 0.45813918532992637, "flos": 16589664501120.0, "grad_norm": 1.833564788943475, "language_loss": 0.72443897, "learning_rate": 2.366159044134473e-06, "loss": 0.74920905, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21435547, "step": 7620, "time_per_iteration": 2.8428006172180176 }, { "auxiliary_loss_clip": 0.01437895, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.26625896, "balance_loss_mlp": 1.01509261, "epoch": 0.45819930858259433, "flos": 42245483564160.0, "grad_norm": 1.6898954212866257, "language_loss": 0.78622794, "learning_rate": 2.3657761581525748e-06, "loss": 0.81096339, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.20532227, "step": 7621, "time_per_iteration": 3.044480323791504 }, { "auxiliary_loss_clip": 0.01255442, "auxiliary_loss_mlp": 0.01042213, "balance_loss_clip": 1.1562438, "balance_loss_mlp": 1.0188477, "epoch": 0.4582594318352623, "flos": 63743563728000.0, "grad_norm": 0.7966720418377881, "language_loss": 0.65052855, "learning_rate": 2.3653932583004063e-06, "loss": 0.67350507, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.23339844, "step": 7622, "time_per_iteration": 3.3473803997039795 }, { "auxiliary_loss_clip": 0.01451906, "auxiliary_loss_mlp": 0.01036023, "balance_loss_clip": 1.27527559, "balance_loss_mlp": 1.01433873, "epoch": 0.45831955508793026, "flos": 26881223978880.0, "grad_norm": 1.826219463501508, "language_loss": 0.80503082, "learning_rate": 2.3650103445924903e-06, "loss": 0.82991016, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.21679688, "step": 7623, "time_per_iteration": 2.919713258743286 }, { "auxiliary_loss_clip": 0.01451138, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.27390838, "balance_loss_mlp": 1.01635361, "epoch": 0.45837967834059823, "flos": 18743160896640.0, "grad_norm": 1.855306994690235, "language_loss": 0.71358734, "learning_rate": 2.3646274170433452e-06, "loss": 0.7384733, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.21105957, "step": 7624, "time_per_iteration": 2.826080799102783 }, { "auxiliary_loss_clip": 0.01448601, "auxiliary_loss_mlp": 0.01035142, "balance_loss_clip": 1.27169895, "balance_loss_mlp": 1.01368415, "epoch": 0.4584398015932662, "flos": 21187844697600.0, "grad_norm": 2.092687039943946, "language_loss": 0.74421299, "learning_rate": 2.364244475667491e-06, "loss": 0.76905048, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.21472168, "step": 7625, "time_per_iteration": 2.877474069595337 }, { "auxiliary_loss_clip": 0.01447589, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.27317691, "balance_loss_mlp": 1.01472282, "epoch": 0.45849992484593416, "flos": 19799014988160.0, "grad_norm": 1.8338179323685642, "language_loss": 0.78851712, "learning_rate": 2.363861520479451e-06, "loss": 0.81335175, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21154785, "step": 7626, "time_per_iteration": 2.8163959980010986 }, { "auxiliary_loss_clip": 0.01463627, "auxiliary_loss_mlp": 0.01041852, "balance_loss_clip": 1.28491211, "balance_loss_mlp": 1.01961946, "epoch": 0.4585600480986021, "flos": 18232372638720.0, "grad_norm": 1.5236407240422436, "language_loss": 0.85862732, "learning_rate": 2.3634785514937445e-06, "loss": 0.88368213, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.22229004, "step": 7627, "time_per_iteration": 2.942054510116577 }, { "auxiliary_loss_clip": 0.01462333, "auxiliary_loss_mlp": 0.01038416, "balance_loss_clip": 1.2820822, "balance_loss_mlp": 1.0160284, "epoch": 0.4586201713512701, "flos": 29033905968000.0, "grad_norm": 1.541489355677873, "language_loss": 0.70170277, "learning_rate": 2.3630955687248953e-06, "loss": 0.72671026, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22387695, "step": 7628, "time_per_iteration": 2.9766438007354736 }, { "auxiliary_loss_clip": 0.01443636, "auxiliary_loss_mlp": 0.01036489, "balance_loss_clip": 1.27142024, "balance_loss_mlp": 1.01386297, "epoch": 0.45868029460393805, "flos": 23415144641280.0, "grad_norm": 2.5945032934010097, "language_loss": 0.78458142, "learning_rate": 2.3627125721874265e-06, "loss": 0.80938274, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.22619629, "step": 7629, "time_per_iteration": 2.8186886310577393 }, { "auxiliary_loss_clip": 0.01458025, "auxiliary_loss_mlp": 0.01041868, "balance_loss_clip": 1.27614975, "balance_loss_mlp": 1.01882482, "epoch": 0.458740417856606, "flos": 18230743825920.0, "grad_norm": 2.008948356658365, "language_loss": 0.80213606, "learning_rate": 2.3623295618958595e-06, "loss": 0.82713503, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.23046875, "step": 7630, "time_per_iteration": 2.838806629180908 }, { "auxiliary_loss_clip": 0.01465385, "auxiliary_loss_mlp": 0.01042753, "balance_loss_clip": 1.28524804, "balance_loss_mlp": 1.02165341, "epoch": 0.458800541109274, "flos": 34582076127360.0, "grad_norm": 1.7535591605332883, "language_loss": 0.73029542, "learning_rate": 2.3619465378647198e-06, "loss": 0.75537682, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.21105957, "step": 7631, "time_per_iteration": 2.9863345623016357 }, { "auxiliary_loss_clip": 0.01462474, "auxiliary_loss_mlp": 0.01042789, "balance_loss_clip": 1.28492987, "balance_loss_mlp": 1.01996064, "epoch": 0.458860664361942, "flos": 17720996198400.0, "grad_norm": 2.05679462207589, "language_loss": 0.72971213, "learning_rate": 2.361563500108531e-06, "loss": 0.75476474, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22839355, "step": 7632, "time_per_iteration": 2.81466007232666 }, { "auxiliary_loss_clip": 0.01474903, "auxiliary_loss_mlp": 0.01042416, "balance_loss_clip": 1.29229641, "balance_loss_mlp": 1.01833606, "epoch": 0.45892078761460997, "flos": 18450978105600.0, "grad_norm": 2.199155467915187, "language_loss": 0.70089376, "learning_rate": 2.3611804486418178e-06, "loss": 0.72606695, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 1.82617188, "router_z_loss_mlp": 0.24072266, "step": 7633, "time_per_iteration": 4.26861047744751 }, { "auxiliary_loss_clip": 0.01454475, "auxiliary_loss_mlp": 0.01039497, "balance_loss_clip": 1.27600884, "balance_loss_mlp": 1.0174675, "epoch": 0.45898091086727794, "flos": 22682810004480.0, "grad_norm": 3.8819057151676013, "language_loss": 0.81969655, "learning_rate": 2.3607973834791062e-06, "loss": 0.84463626, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22045898, "step": 7634, "time_per_iteration": 2.8431358337402344 }, { "auxiliary_loss_clip": 0.01490367, "auxiliary_loss_mlp": 0.01043056, "balance_loss_clip": 1.30512094, "balance_loss_mlp": 1.0200721, "epoch": 0.4590410341199459, "flos": 21662952791040.0, "grad_norm": 1.7840993401334628, "language_loss": 0.8269186, "learning_rate": 2.3604143046349216e-06, "loss": 0.85225284, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 1.85351562, "router_z_loss_mlp": 0.2298584, "step": 7635, "time_per_iteration": 2.9180312156677246 }, { "auxiliary_loss_clip": 0.01447949, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.27163482, "balance_loss_mlp": 1.01991916, "epoch": 0.45910115737261387, "flos": 36548258474880.0, "grad_norm": 1.4998411648664247, "language_loss": 0.65661186, "learning_rate": 2.3600312121237905e-06, "loss": 0.68152618, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.2355957, "step": 7636, "time_per_iteration": 2.9770870208740234 }, { "auxiliary_loss_clip": 0.01444012, "auxiliary_loss_mlp": 0.01040083, "balance_loss_clip": 1.27207828, "balance_loss_mlp": 1.01799321, "epoch": 0.45916128062528183, "flos": 24429029541120.0, "grad_norm": 1.6915345909796045, "language_loss": 0.8117699, "learning_rate": 2.3596481059602395e-06, "loss": 0.83661091, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.22070312, "step": 7637, "time_per_iteration": 2.910545825958252 }, { "auxiliary_loss_clip": 0.01459197, "auxiliary_loss_mlp": 0.01043461, "balance_loss_clip": 1.27988911, "balance_loss_mlp": 1.0200367, "epoch": 0.4592214038779498, "flos": 23232400318080.0, "grad_norm": 1.5564394269942672, "language_loss": 0.76046091, "learning_rate": 2.3592649861587965e-06, "loss": 0.78548753, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.23449707, "step": 7638, "time_per_iteration": 2.901210308074951 }, { "auxiliary_loss_clip": 0.01441778, "auxiliary_loss_mlp": 0.01043102, "balance_loss_clip": 1.26843882, "balance_loss_mlp": 1.0199275, "epoch": 0.45928152713061776, "flos": 19181548195200.0, "grad_norm": 2.51148534065164, "language_loss": 0.74782205, "learning_rate": 2.358881852733989e-06, "loss": 0.77267087, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.23156738, "step": 7639, "time_per_iteration": 2.7946150302886963 }, { "auxiliary_loss_clip": 0.01452829, "auxiliary_loss_mlp": 0.01039263, "balance_loss_clip": 1.27615523, "balance_loss_mlp": 1.01712584, "epoch": 0.4593416503832857, "flos": 22423728424320.0, "grad_norm": 1.6104863636814424, "language_loss": 0.69365633, "learning_rate": 2.358498705700346e-06, "loss": 0.71857727, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.22119141, "step": 7640, "time_per_iteration": 2.851330041885376 }, { "auxiliary_loss_clip": 0.01468674, "auxiliary_loss_mlp": 0.01042635, "balance_loss_clip": 1.28825688, "balance_loss_mlp": 1.0204618, "epoch": 0.4594017736359537, "flos": 18889546383360.0, "grad_norm": 1.6101901312517073, "language_loss": 0.76666886, "learning_rate": 2.3581155450723958e-06, "loss": 0.79178196, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22180176, "step": 7641, "time_per_iteration": 4.344481706619263 }, { "auxiliary_loss_clip": 0.01458644, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.27939129, "balance_loss_mlp": 1.02420616, "epoch": 0.45946189688862166, "flos": 20527413327360.0, "grad_norm": 1.6032145181305488, "language_loss": 0.74876487, "learning_rate": 2.357732370864668e-06, "loss": 0.77382517, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.23168945, "step": 7642, "time_per_iteration": 4.264801740646362 }, { "auxiliary_loss_clip": 0.01255846, "auxiliary_loss_mlp": 0.01062565, "balance_loss_clip": 1.15119696, "balance_loss_mlp": 1.03958118, "epoch": 0.4595220201412896, "flos": 61431510994560.0, "grad_norm": 0.8596463334591588, "language_loss": 0.58175772, "learning_rate": 2.357349183091694e-06, "loss": 0.60494179, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.22949219, "step": 7643, "time_per_iteration": 4.44842791557312 }, { "auxiliary_loss_clip": 0.01471033, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.28700459, "balance_loss_mlp": 1.01726007, "epoch": 0.4595821433939576, "flos": 23341295848320.0, "grad_norm": 1.578175317639982, "language_loss": 0.9331857, "learning_rate": 2.3569659817680016e-06, "loss": 0.95828933, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 1.83984375, "router_z_loss_mlp": 0.22070312, "step": 7644, "time_per_iteration": 2.870189666748047 }, { "auxiliary_loss_clip": 0.01459562, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.27972162, "balance_loss_mlp": 1.01733494, "epoch": 0.4596422666466256, "flos": 14290325556480.0, "grad_norm": 3.2271704884911823, "language_loss": 0.84058654, "learning_rate": 2.3565827669081243e-06, "loss": 0.86558139, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22595215, "step": 7645, "time_per_iteration": 2.854024887084961 }, { "auxiliary_loss_clip": 0.01261487, "auxiliary_loss_mlp": 0.01043292, "balance_loss_clip": 1.1572001, "balance_loss_mlp": 1.01754236, "epoch": 0.4597023898992936, "flos": 65758140028800.0, "grad_norm": 0.7625435082316689, "language_loss": 0.59964979, "learning_rate": 2.356199538526593e-06, "loss": 0.62269747, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.2578125, "step": 7646, "time_per_iteration": 3.236727476119995 }, { "auxiliary_loss_clip": 0.01451851, "auxiliary_loss_mlp": 0.01039168, "balance_loss_clip": 1.27397609, "balance_loss_mlp": 1.01685214, "epoch": 0.45976251315196154, "flos": 26918487711360.0, "grad_norm": 1.6360056184487939, "language_loss": 0.73559272, "learning_rate": 2.355816296637939e-06, "loss": 0.76050293, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.2232666, "step": 7647, "time_per_iteration": 2.9498050212860107 }, { "auxiliary_loss_clip": 0.01468636, "auxiliary_loss_mlp": 0.0103989, "balance_loss_clip": 1.28690827, "balance_loss_mlp": 1.01769304, "epoch": 0.4598226364046295, "flos": 26630150728320.0, "grad_norm": 1.8851471503796926, "language_loss": 0.67468166, "learning_rate": 2.3554330412566957e-06, "loss": 0.69976699, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.2220459, "step": 7648, "time_per_iteration": 2.923333168029785 }, { "auxiliary_loss_clip": 0.01453288, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.27463198, "balance_loss_mlp": 1.01661229, "epoch": 0.45988275965729747, "flos": 24398054835840.0, "grad_norm": 1.412856328751868, "language_loss": 0.7928651, "learning_rate": 2.3550497723973953e-06, "loss": 0.81779492, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.23083496, "step": 7649, "time_per_iteration": 2.889042377471924 }, { "auxiliary_loss_clip": 0.01444424, "auxiliary_loss_mlp": 0.01043287, "balance_loss_clip": 1.26792753, "balance_loss_mlp": 1.02075672, "epoch": 0.45994288290996543, "flos": 24546702562560.0, "grad_norm": 2.2982195803748966, "language_loss": 0.70390218, "learning_rate": 2.3546664900745726e-06, "loss": 0.72877932, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.22546387, "step": 7650, "time_per_iteration": 2.917297601699829 }, { "auxiliary_loss_clip": 0.01463919, "auxiliary_loss_mlp": 0.0104052, "balance_loss_clip": 1.28052545, "balance_loss_mlp": 1.01664221, "epoch": 0.4600030061626334, "flos": 14838513281280.0, "grad_norm": 1.938361188275399, "language_loss": 0.85015368, "learning_rate": 2.354283194302761e-06, "loss": 0.87519807, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 1.83398438, "router_z_loss_mlp": 0.2388916, "step": 7651, "time_per_iteration": 2.8470563888549805 }, { "auxiliary_loss_clip": 0.01449534, "auxiliary_loss_mlp": 0.01046224, "balance_loss_clip": 1.27428436, "balance_loss_mlp": 1.02335978, "epoch": 0.46006312941530136, "flos": 18122934170880.0, "grad_norm": 1.8645703272280578, "language_loss": 0.7600041, "learning_rate": 2.3538998850964948e-06, "loss": 0.7849617, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.2286377, "step": 7652, "time_per_iteration": 2.8457212448120117 }, { "auxiliary_loss_clip": 0.01457657, "auxiliary_loss_mlp": 0.01037846, "balance_loss_clip": 1.27710938, "balance_loss_mlp": 1.0151006, "epoch": 0.46012325266796933, "flos": 21985295880960.0, "grad_norm": 1.9922473450657596, "language_loss": 0.76580691, "learning_rate": 2.3535165624703097e-06, "loss": 0.79076195, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.22741699, "step": 7653, "time_per_iteration": 2.8733856678009033 }, { "auxiliary_loss_clip": 0.015058, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.31642175, "balance_loss_mlp": 1.01944971, "epoch": 0.4601833759206373, "flos": 15276448131840.0, "grad_norm": 2.0839339911837507, "language_loss": 0.66852272, "learning_rate": 2.353133226438741e-06, "loss": 0.69401425, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 1.89355469, "router_z_loss_mlp": 0.23913574, "step": 7654, "time_per_iteration": 2.805328130722046 }, { "auxiliary_loss_clip": 0.01455106, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.2777276, "balance_loss_mlp": 1.01544309, "epoch": 0.46024349917330526, "flos": 27100327138560.0, "grad_norm": 1.6160190238407148, "language_loss": 0.79954708, "learning_rate": 2.3527498770163248e-06, "loss": 0.82446516, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.21252441, "step": 7655, "time_per_iteration": 2.8690006732940674 }, { "auxiliary_loss_clip": 0.01452817, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.27859044, "balance_loss_mlp": 1.01486802, "epoch": 0.4603036224259732, "flos": 24473803910400.0, "grad_norm": 1.6723703007031339, "language_loss": 0.68301916, "learning_rate": 2.3523665142175985e-06, "loss": 0.70790851, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21264648, "step": 7656, "time_per_iteration": 2.8582396507263184 }, { "auxiliary_loss_clip": 0.01459991, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.2795285, "balance_loss_mlp": 1.01584816, "epoch": 0.4603637456786412, "flos": 28120546310400.0, "grad_norm": 1.66160198700632, "language_loss": 0.82239389, "learning_rate": 2.351983138057098e-06, "loss": 0.84739423, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.24194336, "step": 7657, "time_per_iteration": 2.882878541946411 }, { "auxiliary_loss_clip": 0.01458924, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.28060055, "balance_loss_mlp": 1.01470411, "epoch": 0.4604238689313092, "flos": 24358619352960.0, "grad_norm": 2.2811860066437126, "language_loss": 0.71773565, "learning_rate": 2.3515997485493623e-06, "loss": 0.74269593, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.22399902, "step": 7658, "time_per_iteration": 2.863060235977173 }, { "auxiliary_loss_clip": 0.01248624, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.14827847, "balance_loss_mlp": 1.00993276, "epoch": 0.4604839921839772, "flos": 53631852906240.0, "grad_norm": 0.981469992117087, "language_loss": 0.62197721, "learning_rate": 2.351216345708928e-06, "loss": 0.64481831, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.25585938, "step": 7659, "time_per_iteration": 3.4133360385894775 }, { "auxiliary_loss_clip": 0.01454666, "auxiliary_loss_mlp": 0.01038645, "balance_loss_clip": 1.27858782, "balance_loss_mlp": 1.01528013, "epoch": 0.46054411543664514, "flos": 31260843953280.0, "grad_norm": 2.1375396244492753, "language_loss": 0.69336587, "learning_rate": 2.350832929550336e-06, "loss": 0.71829891, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.23376465, "step": 7660, "time_per_iteration": 2.907419204711914 }, { "auxiliary_loss_clip": 0.01470488, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.28911471, "balance_loss_mlp": 1.01699424, "epoch": 0.4606042386893131, "flos": 24102297705600.0, "grad_norm": 1.9545580605731985, "language_loss": 0.77932233, "learning_rate": 2.3504495000881227e-06, "loss": 0.8044309, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.23388672, "step": 7661, "time_per_iteration": 2.895961284637451 }, { "auxiliary_loss_clip": 0.01463447, "auxiliary_loss_mlp": 0.01040121, "balance_loss_clip": 1.28873563, "balance_loss_mlp": 1.01769793, "epoch": 0.46066436194198107, "flos": 26589719859840.0, "grad_norm": 1.9473382129393224, "language_loss": 0.75489354, "learning_rate": 2.3500660573368305e-06, "loss": 0.77992922, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.22412109, "step": 7662, "time_per_iteration": 2.9006452560424805 }, { "auxiliary_loss_clip": 0.01499356, "auxiliary_loss_mlp": 0.01042674, "balance_loss_clip": 1.31234562, "balance_loss_mlp": 1.01988125, "epoch": 0.46072448519464904, "flos": 17782040712960.0, "grad_norm": 2.905502127402866, "language_loss": 0.81387842, "learning_rate": 2.349682601310998e-06, "loss": 0.83929873, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 1.86914062, "router_z_loss_mlp": 0.22814941, "step": 7663, "time_per_iteration": 2.80364990234375 }, { "auxiliary_loss_clip": 0.01456792, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.28175795, "balance_loss_mlp": 1.01533186, "epoch": 0.460784608447317, "flos": 15094563459840.0, "grad_norm": 2.0495288063324804, "language_loss": 0.74272954, "learning_rate": 2.3492991320251653e-06, "loss": 0.7676698, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21899414, "step": 7664, "time_per_iteration": 2.837287187576294 }, { "auxiliary_loss_clip": 0.01476722, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.29636168, "balance_loss_mlp": 1.01938045, "epoch": 0.46084473169998497, "flos": 18597499326720.0, "grad_norm": 2.3572935485616213, "language_loss": 0.73576784, "learning_rate": 2.3489156494938753e-06, "loss": 0.76096267, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.23388672, "step": 7665, "time_per_iteration": 2.806087017059326 }, { "auxiliary_loss_clip": 0.01480909, "auxiliary_loss_mlp": 0.01040955, "balance_loss_clip": 1.30047429, "balance_loss_mlp": 1.01862741, "epoch": 0.46090485495265293, "flos": 19502805409920.0, "grad_norm": 1.9041755607085937, "language_loss": 0.78880095, "learning_rate": 2.348532153731669e-06, "loss": 0.81401956, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.22338867, "step": 7666, "time_per_iteration": 2.86143159866333 }, { "auxiliary_loss_clip": 0.01468965, "auxiliary_loss_mlp": 0.01047178, "balance_loss_clip": 1.29213047, "balance_loss_mlp": 1.02426565, "epoch": 0.4609649782053209, "flos": 33377981512320.0, "grad_norm": 1.492689878875838, "language_loss": 0.74794912, "learning_rate": 2.348148644753088e-06, "loss": 0.77311051, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.22900391, "step": 7667, "time_per_iteration": 2.9449093341827393 }, { "auxiliary_loss_clip": 0.01469546, "auxiliary_loss_mlp": 0.01047107, "balance_loss_clip": 1.29039729, "balance_loss_mlp": 1.02492261, "epoch": 0.46102510145798886, "flos": 23779728391680.0, "grad_norm": 1.5472101588568568, "language_loss": 0.76667464, "learning_rate": 2.347765122572676e-06, "loss": 0.79184115, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.2220459, "step": 7668, "time_per_iteration": 4.272247791290283 }, { "auxiliary_loss_clip": 0.01460633, "auxiliary_loss_mlp": 0.01047417, "balance_loss_clip": 1.28737569, "balance_loss_mlp": 1.02582848, "epoch": 0.4610852247106568, "flos": 23305253725440.0, "grad_norm": 1.5418460270269305, "language_loss": 0.78454971, "learning_rate": 2.347381587204975e-06, "loss": 0.80963016, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.21569824, "step": 7669, "time_per_iteration": 2.9483301639556885 }, { "auxiliary_loss_clip": 0.01475282, "auxiliary_loss_mlp": 0.01046303, "balance_loss_clip": 1.29592896, "balance_loss_mlp": 1.02485776, "epoch": 0.4611453479633248, "flos": 25458433407360.0, "grad_norm": 1.9426543796752684, "language_loss": 0.83513814, "learning_rate": 2.34699803866453e-06, "loss": 0.86035395, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.21447754, "step": 7670, "time_per_iteration": 2.954928398132324 }, { "auxiliary_loss_clip": 0.01462845, "auxiliary_loss_mlp": 0.01055064, "balance_loss_clip": 1.28739405, "balance_loss_mlp": 1.03315306, "epoch": 0.4612054712159928, "flos": 21148816417920.0, "grad_norm": 1.5639319224879435, "language_loss": 0.64491916, "learning_rate": 2.3466144769658845e-06, "loss": 0.6700983, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.21911621, "step": 7671, "time_per_iteration": 2.8849852085113525 }, { "auxiliary_loss_clip": 0.01240997, "auxiliary_loss_mlp": 0.01037652, "balance_loss_clip": 1.14135361, "balance_loss_mlp": 1.01114023, "epoch": 0.4612655944686608, "flos": 69991057802880.0, "grad_norm": 0.7085719865318233, "language_loss": 0.55859709, "learning_rate": 2.346230902123583e-06, "loss": 0.58138359, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.265625, "step": 7672, "time_per_iteration": 3.4475631713867188 }, { "auxiliary_loss_clip": 0.01474484, "auxiliary_loss_mlp": 0.01052642, "balance_loss_clip": 1.29409194, "balance_loss_mlp": 1.02968216, "epoch": 0.46132571772132874, "flos": 16845986148480.0, "grad_norm": 1.9219157164943914, "language_loss": 0.73209375, "learning_rate": 2.3458473141521715e-06, "loss": 0.75736499, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.22949219, "step": 7673, "time_per_iteration": 2.793931007385254 }, { "auxiliary_loss_clip": 0.01464096, "auxiliary_loss_mlp": 0.01051446, "balance_loss_clip": 1.28754342, "balance_loss_mlp": 1.02873623, "epoch": 0.4613858409739967, "flos": 35822484334080.0, "grad_norm": 1.8349664255165943, "language_loss": 0.71349543, "learning_rate": 2.345463713066195e-06, "loss": 0.73865086, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.22705078, "step": 7674, "time_per_iteration": 2.9761016368865967 }, { "auxiliary_loss_clip": 0.01473136, "auxiliary_loss_mlp": 0.01051087, "balance_loss_clip": 1.29566932, "balance_loss_mlp": 1.02881896, "epoch": 0.4614459642266647, "flos": 35281490532480.0, "grad_norm": 2.010265208768684, "language_loss": 0.6601128, "learning_rate": 2.3450800988801996e-06, "loss": 0.68535501, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22277832, "step": 7675, "time_per_iteration": 2.985412359237671 }, { "auxiliary_loss_clip": 0.01247983, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.14666152, "balance_loss_mlp": 1.00813663, "epoch": 0.46150608747933264, "flos": 66733630076160.0, "grad_norm": 0.769923117974297, "language_loss": 0.5868119, "learning_rate": 2.3446964716087327e-06, "loss": 0.60963058, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.2578125, "step": 7676, "time_per_iteration": 4.68008828163147 }, { "auxiliary_loss_clip": 0.01245349, "auxiliary_loss_mlp": 0.01025308, "balance_loss_clip": 1.14294028, "balance_loss_mlp": 1.00213385, "epoch": 0.4615662107320006, "flos": 55855940469120.0, "grad_norm": 0.7920299197790727, "language_loss": 0.62728584, "learning_rate": 2.344312831266341e-06, "loss": 0.64999247, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.23144531, "step": 7677, "time_per_iteration": 4.620277166366577 }, { "auxiliary_loss_clip": 0.01466207, "auxiliary_loss_mlp": 0.01044139, "balance_loss_clip": 1.29058313, "balance_loss_mlp": 1.02275312, "epoch": 0.46162633398466857, "flos": 15490710097920.0, "grad_norm": 2.421068755391075, "language_loss": 0.77595103, "learning_rate": 2.3439291778675718e-06, "loss": 0.80105448, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.21398926, "step": 7678, "time_per_iteration": 4.186838150024414 }, { "auxiliary_loss_clip": 0.0148412, "auxiliary_loss_mlp": 0.01041479, "balance_loss_clip": 1.30460799, "balance_loss_mlp": 1.02086806, "epoch": 0.46168645723733653, "flos": 20021104304640.0, "grad_norm": 1.8429472415840022, "language_loss": 0.67890763, "learning_rate": 2.343545511426974e-06, "loss": 0.70416361, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.20605469, "step": 7679, "time_per_iteration": 2.9124860763549805 }, { "auxiliary_loss_clip": 0.0147857, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.30073071, "balance_loss_mlp": 1.02232838, "epoch": 0.4617465804900045, "flos": 20307043313280.0, "grad_norm": 2.11976664704184, "language_loss": 0.71017897, "learning_rate": 2.3431618319590963e-06, "loss": 0.73539162, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.20361328, "step": 7680, "time_per_iteration": 2.8527047634124756 }, { "auxiliary_loss_clip": 0.01495587, "auxiliary_loss_mlp": 0.01045407, "balance_loss_clip": 1.31292152, "balance_loss_mlp": 1.02399707, "epoch": 0.46180670374267246, "flos": 22356530616960.0, "grad_norm": 1.8260058811276356, "language_loss": 0.64802623, "learning_rate": 2.342778139478487e-06, "loss": 0.67343616, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.21386719, "step": 7681, "time_per_iteration": 2.851130962371826 }, { "auxiliary_loss_clip": 0.01463861, "auxiliary_loss_mlp": 0.01041373, "balance_loss_clip": 1.29016697, "balance_loss_mlp": 1.01999879, "epoch": 0.46186682699534043, "flos": 19903702752000.0, "grad_norm": 1.952491508205418, "language_loss": 0.67846048, "learning_rate": 2.342394433999697e-06, "loss": 0.70351285, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21386719, "step": 7682, "time_per_iteration": 2.832533597946167 }, { "auxiliary_loss_clip": 0.01494305, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.31328905, "balance_loss_mlp": 1.01758325, "epoch": 0.4619269502480084, "flos": 31515989235840.0, "grad_norm": 3.770754265522648, "language_loss": 0.75775039, "learning_rate": 2.342010715537275e-06, "loss": 0.78308105, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.21166992, "step": 7683, "time_per_iteration": 2.8976147174835205 }, { "auxiliary_loss_clip": 0.01471941, "auxiliary_loss_mlp": 0.01041308, "balance_loss_clip": 1.29552722, "balance_loss_mlp": 1.02033877, "epoch": 0.46198707350067636, "flos": 25020317577600.0, "grad_norm": 1.992006314022462, "language_loss": 0.77803153, "learning_rate": 2.3416269841057726e-06, "loss": 0.80316406, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.20959473, "step": 7684, "time_per_iteration": 2.919167995452881 }, { "auxiliary_loss_clip": 0.01502326, "auxiliary_loss_mlp": 0.01042188, "balance_loss_clip": 1.31629336, "balance_loss_mlp": 1.02130258, "epoch": 0.4620471967533444, "flos": 18300475342080.0, "grad_norm": 1.8090460354710214, "language_loss": 0.8023181, "learning_rate": 2.3412432397197412e-06, "loss": 0.82776332, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 1.86035156, "router_z_loss_mlp": 0.2088623, "step": 7685, "time_per_iteration": 2.8620898723602295 }, { "auxiliary_loss_clip": 0.01463469, "auxiliary_loss_mlp": 0.01048607, "balance_loss_clip": 1.28998268, "balance_loss_mlp": 1.02742314, "epoch": 0.46210732000601235, "flos": 33998298727680.0, "grad_norm": 1.9639234274070774, "language_loss": 0.67728496, "learning_rate": 2.340859482393731e-06, "loss": 0.70240569, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21203613, "step": 7686, "time_per_iteration": 3.042912244796753 }, { "auxiliary_loss_clip": 0.01484323, "auxiliary_loss_mlp": 0.01040539, "balance_loss_clip": 1.30349338, "balance_loss_mlp": 1.01921248, "epoch": 0.4621674432586803, "flos": 25020227088000.0, "grad_norm": 2.1253343470363855, "language_loss": 0.74022388, "learning_rate": 2.340475712142296e-06, "loss": 0.76547253, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21313477, "step": 7687, "time_per_iteration": 2.947704553604126 }, { "auxiliary_loss_clip": 0.01471661, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.29443061, "balance_loss_mlp": 1.01863992, "epoch": 0.4622275665113483, "flos": 22023147795840.0, "grad_norm": 2.3023469485964614, "language_loss": 0.76032853, "learning_rate": 2.3400919289799873e-06, "loss": 0.78544682, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.21520996, "step": 7688, "time_per_iteration": 2.927866220474243 }, { "auxiliary_loss_clip": 0.01474889, "auxiliary_loss_mlp": 0.01040146, "balance_loss_clip": 1.29726112, "balance_loss_mlp": 1.01960588, "epoch": 0.46228768976401624, "flos": 24068879781120.0, "grad_norm": 1.7932758192765126, "language_loss": 0.79931915, "learning_rate": 2.3397081329213585e-06, "loss": 0.82446957, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.20532227, "step": 7689, "time_per_iteration": 2.9059464931488037 }, { "auxiliary_loss_clip": 0.01483767, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.30191684, "balance_loss_mlp": 1.02304173, "epoch": 0.4623478130166842, "flos": 26662663756800.0, "grad_norm": 2.483227058988167, "language_loss": 0.57718992, "learning_rate": 2.339324323980964e-06, "loss": 0.60246849, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 1.81933594, "router_z_loss_mlp": 0.21044922, "step": 7690, "time_per_iteration": 2.946136951446533 }, { "auxiliary_loss_clip": 0.01479847, "auxiliary_loss_mlp": 0.01045782, "balance_loss_clip": 1.298033, "balance_loss_mlp": 1.0241102, "epoch": 0.46240793626935217, "flos": 20568160909440.0, "grad_norm": 8.433342810712363, "language_loss": 0.8327269, "learning_rate": 2.3389405021733562e-06, "loss": 0.85798323, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21679688, "step": 7691, "time_per_iteration": 2.862374782562256 }, { "auxiliary_loss_clip": 0.01473282, "auxiliary_loss_mlp": 0.01044025, "balance_loss_clip": 1.29348755, "balance_loss_mlp": 1.02348554, "epoch": 0.46246805952202014, "flos": 22466376288000.0, "grad_norm": 1.453414556777515, "language_loss": 0.7608422, "learning_rate": 2.338556667513091e-06, "loss": 0.78601539, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.20532227, "step": 7692, "time_per_iteration": 2.8665647506713867 }, { "auxiliary_loss_clip": 0.01462578, "auxiliary_loss_mlp": 0.01046903, "balance_loss_clip": 1.28318787, "balance_loss_mlp": 1.02473044, "epoch": 0.4625281827746881, "flos": 35054288553600.0, "grad_norm": 1.9563614231570285, "language_loss": 0.74958766, "learning_rate": 2.338172820014723e-06, "loss": 0.77468246, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22155762, "step": 7693, "time_per_iteration": 2.9809470176696777 }, { "auxiliary_loss_clip": 0.01465053, "auxiliary_loss_mlp": 0.01052159, "balance_loss_clip": 1.2886095, "balance_loss_mlp": 1.03028429, "epoch": 0.46258830602735607, "flos": 21078360984960.0, "grad_norm": 1.7317118223301362, "language_loss": 0.85905278, "learning_rate": 2.337788959692808e-06, "loss": 0.88422489, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.21862793, "step": 7694, "time_per_iteration": 2.84395694732666 }, { "auxiliary_loss_clip": 0.01470177, "auxiliary_loss_mlp": 0.01051757, "balance_loss_clip": 1.28907502, "balance_loss_mlp": 1.03081203, "epoch": 0.46264842928002403, "flos": 26188415314560.0, "grad_norm": 2.0574135414648507, "language_loss": 0.80418372, "learning_rate": 2.337405086561902e-06, "loss": 0.82940304, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.20947266, "step": 7695, "time_per_iteration": 2.888993740081787 }, { "auxiliary_loss_clip": 0.01459441, "auxiliary_loss_mlp": 0.01050132, "balance_loss_clip": 1.28309822, "balance_loss_mlp": 1.02978325, "epoch": 0.462708552532692, "flos": 16773042251520.0, "grad_norm": 1.7149788925478255, "language_loss": 0.72865188, "learning_rate": 2.3370212006365606e-06, "loss": 0.75374758, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.20349121, "step": 7696, "time_per_iteration": 2.8278751373291016 }, { "auxiliary_loss_clip": 0.01466675, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.28605556, "balance_loss_mlp": 1.02532649, "epoch": 0.46276867578535996, "flos": 15569038126080.0, "grad_norm": 1.557160509195556, "language_loss": 0.70277202, "learning_rate": 2.3366373019313423e-06, "loss": 0.72790134, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.20947266, "step": 7697, "time_per_iteration": 2.7915966510772705 }, { "auxiliary_loss_clip": 0.01460617, "auxiliary_loss_mlp": 0.01053314, "balance_loss_clip": 1.2820015, "balance_loss_mlp": 1.03141546, "epoch": 0.462828799038028, "flos": 22424633320320.0, "grad_norm": 2.2742491726599847, "language_loss": 0.85609663, "learning_rate": 2.3362533904608025e-06, "loss": 0.88123596, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21899414, "step": 7698, "time_per_iteration": 2.8370280265808105 }, { "auxiliary_loss_clip": 0.01453938, "auxiliary_loss_mlp": 0.01048361, "balance_loss_clip": 1.27650034, "balance_loss_mlp": 1.0272367, "epoch": 0.46288892229069595, "flos": 21079537349760.0, "grad_norm": 1.7939736543825515, "language_loss": 0.72152114, "learning_rate": 2.335869466239502e-06, "loss": 0.74654418, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.21118164, "step": 7699, "time_per_iteration": 2.8161075115203857 }, { "auxiliary_loss_clip": 0.01472144, "auxiliary_loss_mlp": 0.01046431, "balance_loss_clip": 1.2892226, "balance_loss_mlp": 1.02052689, "epoch": 0.4629490455433639, "flos": 23196177216000.0, "grad_norm": 4.930155981968964, "language_loss": 0.71936387, "learning_rate": 2.335485529281996e-06, "loss": 0.74454963, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 1.828125, "router_z_loss_mlp": 0.25915527, "step": 7700, "time_per_iteration": 2.862619161605835 }, { "auxiliary_loss_clip": 0.01451055, "auxiliary_loss_mlp": 0.01045681, "balance_loss_clip": 1.27548301, "balance_loss_mlp": 1.02348459, "epoch": 0.4630091687960319, "flos": 18842735998080.0, "grad_norm": 2.308297658293401, "language_loss": 0.73571998, "learning_rate": 2.3351015796028467e-06, "loss": 0.76068735, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.2220459, "step": 7701, "time_per_iteration": 3.010960817337036 }, { "auxiliary_loss_clip": 0.01467586, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.28491831, "balance_loss_mlp": 1.01919913, "epoch": 0.46306929204869984, "flos": 38920541316480.0, "grad_norm": 2.1524418939452863, "language_loss": 0.65981722, "learning_rate": 2.3347176172166114e-06, "loss": 0.68491107, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.22595215, "step": 7702, "time_per_iteration": 3.1165611743927 }, { "auxiliary_loss_clip": 0.01451094, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.27410543, "balance_loss_mlp": 1.01490104, "epoch": 0.4631294153013678, "flos": 19653127194240.0, "grad_norm": 2.078042993062129, "language_loss": 0.73644948, "learning_rate": 2.33433364213785e-06, "loss": 0.76133364, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.2244873, "step": 7703, "time_per_iteration": 4.363089323043823 }, { "auxiliary_loss_clip": 0.01475514, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.29197788, "balance_loss_mlp": 1.02504897, "epoch": 0.4631895385540358, "flos": 24619013032320.0, "grad_norm": 1.8023291433112174, "language_loss": 0.69816756, "learning_rate": 2.3339496543811243e-06, "loss": 0.72340679, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 1.83691406, "router_z_loss_mlp": 0.23339844, "step": 7704, "time_per_iteration": 2.8878016471862793 }, { "auxiliary_loss_clip": 0.01464058, "auxiliary_loss_mlp": 0.01041071, "balance_loss_clip": 1.28292608, "balance_loss_mlp": 1.01740789, "epoch": 0.46324966180670374, "flos": 26330547790080.0, "grad_norm": 1.8730763207769465, "language_loss": 0.81953818, "learning_rate": 2.3335656539609934e-06, "loss": 0.84458947, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.23669434, "step": 7705, "time_per_iteration": 2.951017141342163 }, { "auxiliary_loss_clip": 0.01468274, "auxiliary_loss_mlp": 0.01044843, "balance_loss_clip": 1.28707647, "balance_loss_mlp": 1.02404106, "epoch": 0.4633097850593717, "flos": 19248746002560.0, "grad_norm": 2.150755367227443, "language_loss": 0.78897738, "learning_rate": 2.3331816408920196e-06, "loss": 0.81410855, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.20800781, "step": 7706, "time_per_iteration": 2.823875665664673 }, { "auxiliary_loss_clip": 0.01440747, "auxiliary_loss_mlp": 0.01041823, "balance_loss_clip": 1.26720452, "balance_loss_mlp": 1.01999545, "epoch": 0.46336990831203967, "flos": 22793243857920.0, "grad_norm": 1.8434156853553338, "language_loss": 0.71102089, "learning_rate": 2.3327976151887654e-06, "loss": 0.73584652, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21813965, "step": 7707, "time_per_iteration": 2.8452210426330566 }, { "auxiliary_loss_clip": 0.01459048, "auxiliary_loss_mlp": 0.01045371, "balance_loss_clip": 1.27800703, "balance_loss_mlp": 1.02114761, "epoch": 0.46343003156470763, "flos": 38223117682560.0, "grad_norm": 1.7984126188762937, "language_loss": 0.61930633, "learning_rate": 2.332413576865791e-06, "loss": 0.64435053, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.2421875, "step": 7708, "time_per_iteration": 2.981297492980957 }, { "auxiliary_loss_clip": 0.01450719, "auxiliary_loss_mlp": 0.01045258, "balance_loss_clip": 1.27383637, "balance_loss_mlp": 1.02289426, "epoch": 0.4634901548173756, "flos": 31950349747200.0, "grad_norm": 1.9375090887252284, "language_loss": 0.77821952, "learning_rate": 2.3320295259376614e-06, "loss": 0.80317932, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.22375488, "step": 7709, "time_per_iteration": 2.906395435333252 }, { "auxiliary_loss_clip": 0.0146118, "auxiliary_loss_mlp": 0.0104163, "balance_loss_clip": 1.28089786, "balance_loss_mlp": 1.01849127, "epoch": 0.46355027807004356, "flos": 20091740716800.0, "grad_norm": 1.6951391427451175, "language_loss": 0.77777267, "learning_rate": 2.3316454624189385e-06, "loss": 0.80280083, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.23144531, "step": 7710, "time_per_iteration": 2.8789968490600586 }, { "auxiliary_loss_clip": 0.01465951, "auxiliary_loss_mlp": 0.01039602, "balance_loss_clip": 1.28387046, "balance_loss_mlp": 1.01622474, "epoch": 0.4636104013227116, "flos": 24072092161920.0, "grad_norm": 1.9366513295867969, "language_loss": 0.74170327, "learning_rate": 2.3312613863241865e-06, "loss": 0.76675874, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.23376465, "step": 7711, "time_per_iteration": 4.3217933177948 }, { "auxiliary_loss_clip": 0.01461473, "auxiliary_loss_mlp": 0.01052555, "balance_loss_clip": 1.28369451, "balance_loss_mlp": 1.02833116, "epoch": 0.46367052457537955, "flos": 23925118492800.0, "grad_norm": 1.3376632639661397, "language_loss": 0.72199255, "learning_rate": 2.33087729766797e-06, "loss": 0.7471329, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 1.77929688, "router_z_loss_mlp": 0.24206543, "step": 7712, "time_per_iteration": 4.355421543121338 }, { "auxiliary_loss_clip": 0.01477079, "auxiliary_loss_mlp": 0.01045923, "balance_loss_clip": 1.29253101, "balance_loss_mlp": 1.02165246, "epoch": 0.4637306478280475, "flos": 26407880432640.0, "grad_norm": 1.7933232084817345, "language_loss": 0.7410289, "learning_rate": 2.3304931964648524e-06, "loss": 0.76625896, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 1.84375, "router_z_loss_mlp": 0.24267578, "step": 7713, "time_per_iteration": 2.8958425521850586 }, { "auxiliary_loss_clip": 0.01469687, "auxiliary_loss_mlp": 0.0104328, "balance_loss_clip": 1.28695107, "balance_loss_mlp": 1.02035594, "epoch": 0.4637907710807155, "flos": 21990634767360.0, "grad_norm": 1.626554367690889, "language_loss": 0.59631681, "learning_rate": 2.3301090827294e-06, "loss": 0.62144649, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.22924805, "step": 7714, "time_per_iteration": 2.8916175365448 }, { "auxiliary_loss_clip": 0.01449198, "auxiliary_loss_mlp": 0.01043153, "balance_loss_clip": 1.27187729, "balance_loss_mlp": 1.01985908, "epoch": 0.46385089433338345, "flos": 12429962092800.0, "grad_norm": 3.0411747174715513, "language_loss": 0.7085005, "learning_rate": 2.3297249564761784e-06, "loss": 0.73342395, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.23303223, "step": 7715, "time_per_iteration": 2.838351249694824 }, { "auxiliary_loss_clip": 0.01474037, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.28855884, "balance_loss_mlp": 1.01948833, "epoch": 0.4639110175860514, "flos": 23926521081600.0, "grad_norm": 2.290578391935896, "language_loss": 0.68912673, "learning_rate": 2.3293408177197527e-06, "loss": 0.71429056, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.22839355, "step": 7716, "time_per_iteration": 2.868561029434204 }, { "auxiliary_loss_clip": 0.01468594, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.28777885, "balance_loss_mlp": 1.02101576, "epoch": 0.4639711408387194, "flos": 25311052535040.0, "grad_norm": 2.6903441097223055, "language_loss": 0.81858528, "learning_rate": 2.328956666474691e-06, "loss": 0.84371454, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.2331543, "step": 7717, "time_per_iteration": 2.886523962020874 }, { "auxiliary_loss_clip": 0.01462163, "auxiliary_loss_mlp": 0.01043251, "balance_loss_clip": 1.28116095, "balance_loss_mlp": 1.01965928, "epoch": 0.46403126409138734, "flos": 21220855418880.0, "grad_norm": 1.65208207784816, "language_loss": 0.74045211, "learning_rate": 2.3285725027555593e-06, "loss": 0.76550627, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.23596191, "step": 7718, "time_per_iteration": 2.8619933128356934 }, { "auxiliary_loss_clip": 0.01451183, "auxiliary_loss_mlp": 0.01049656, "balance_loss_clip": 1.27223945, "balance_loss_mlp": 1.02605271, "epoch": 0.4640913873440553, "flos": 35859159884160.0, "grad_norm": 1.7000422084717037, "language_loss": 0.71253431, "learning_rate": 2.3281883265769254e-06, "loss": 0.73754269, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.23608398, "step": 7719, "time_per_iteration": 2.9574482440948486 }, { "auxiliary_loss_clip": 0.01458284, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.27658141, "balance_loss_mlp": 1.02425611, "epoch": 0.46415151059672327, "flos": 19174082803200.0, "grad_norm": 2.5720929570713795, "language_loss": 0.87536484, "learning_rate": 2.327804137953357e-06, "loss": 0.90044129, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.2512207, "step": 7720, "time_per_iteration": 2.8423311710357666 }, { "auxiliary_loss_clip": 0.01290013, "auxiliary_loss_mlp": 0.01065469, "balance_loss_clip": 1.19002986, "balance_loss_mlp": 1.03876591, "epoch": 0.46421163384939124, "flos": 58943925843840.0, "grad_norm": 0.7280230534931796, "language_loss": 0.55097175, "learning_rate": 2.3274199368994226e-06, "loss": 0.57452655, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.26757812, "step": 7721, "time_per_iteration": 3.3859004974365234 }, { "auxiliary_loss_clip": 0.01454135, "auxiliary_loss_mlp": 0.01051985, "balance_loss_clip": 1.27794611, "balance_loss_mlp": 1.0275588, "epoch": 0.4642717571020592, "flos": 20167037343360.0, "grad_norm": 1.9056384243362798, "language_loss": 0.81085271, "learning_rate": 2.3270357234296918e-06, "loss": 0.8359139, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.24414062, "step": 7722, "time_per_iteration": 2.944897413253784 }, { "auxiliary_loss_clip": 0.01471307, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.28795564, "balance_loss_mlp": 1.01386809, "epoch": 0.46433188035472717, "flos": 25057083617280.0, "grad_norm": 1.6707507502593257, "language_loss": 0.78485227, "learning_rate": 2.3266514975587332e-06, "loss": 0.8099364, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 1.83496094, "router_z_loss_mlp": 0.23242188, "step": 7723, "time_per_iteration": 2.8801467418670654 }, { "auxiliary_loss_clip": 0.01448334, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.27228165, "balance_loss_mlp": 1.01637983, "epoch": 0.4643920036073952, "flos": 28087626078720.0, "grad_norm": 2.082014698211516, "language_loss": 0.69026804, "learning_rate": 2.326267259301118e-06, "loss": 0.71514833, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.23303223, "step": 7724, "time_per_iteration": 2.9307503700256348 }, { "auxiliary_loss_clip": 0.01449813, "auxiliary_loss_mlp": 0.01040912, "balance_loss_clip": 1.27219129, "balance_loss_mlp": 1.01702297, "epoch": 0.46445212686006315, "flos": 18378531901440.0, "grad_norm": 2.2347163990468566, "language_loss": 0.68297553, "learning_rate": 2.325883008671415e-06, "loss": 0.70788276, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.2388916, "step": 7725, "time_per_iteration": 2.821589469909668 }, { "auxiliary_loss_clip": 0.01435262, "auxiliary_loss_mlp": 0.01040885, "balance_loss_clip": 1.26333141, "balance_loss_mlp": 1.01618505, "epoch": 0.4645122501127311, "flos": 31733870785920.0, "grad_norm": 1.7327649895604826, "language_loss": 0.65732539, "learning_rate": 2.3254987456841955e-06, "loss": 0.68208683, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.24707031, "step": 7726, "time_per_iteration": 2.9924604892730713 }, { "auxiliary_loss_clip": 0.0145266, "auxiliary_loss_mlp": 0.01037298, "balance_loss_clip": 1.27536249, "balance_loss_mlp": 1.01461291, "epoch": 0.4645723733653991, "flos": 23779275943680.0, "grad_norm": 1.8060572466889955, "language_loss": 0.75640053, "learning_rate": 2.3251144703540307e-06, "loss": 0.78130013, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.22692871, "step": 7727, "time_per_iteration": 2.908284902572632 }, { "auxiliary_loss_clip": 0.0145495, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.27635193, "balance_loss_mlp": 1.01784754, "epoch": 0.46463249661806705, "flos": 33158380659840.0, "grad_norm": 1.815982588360759, "language_loss": 0.78907669, "learning_rate": 2.3247301826954936e-06, "loss": 0.81404227, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.23754883, "step": 7728, "time_per_iteration": 2.971658706665039 }, { "auxiliary_loss_clip": 0.01463619, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.28348386, "balance_loss_mlp": 1.01168227, "epoch": 0.464692619870735, "flos": 18305090311680.0, "grad_norm": 2.0359939928415662, "language_loss": 0.76650953, "learning_rate": 2.324345882723155e-06, "loss": 0.79149222, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.2298584, "step": 7729, "time_per_iteration": 2.895003080368042 }, { "auxiliary_loss_clip": 0.01446907, "auxiliary_loss_mlp": 0.01036048, "balance_loss_clip": 1.2701211, "balance_loss_mlp": 1.01237285, "epoch": 0.464752743123403, "flos": 22648306204800.0, "grad_norm": 1.7254804959246097, "language_loss": 0.808433, "learning_rate": 2.323961570451588e-06, "loss": 0.83326256, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.23693848, "step": 7730, "time_per_iteration": 2.846914291381836 }, { "auxiliary_loss_clip": 0.01447155, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.27066338, "balance_loss_mlp": 1.01676667, "epoch": 0.46481286637607094, "flos": 20421730177920.0, "grad_norm": 3.163985828298907, "language_loss": 0.78187859, "learning_rate": 2.3235772458953655e-06, "loss": 0.80675066, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.23266602, "step": 7731, "time_per_iteration": 2.8764257431030273 }, { "auxiliary_loss_clip": 0.01443584, "auxiliary_loss_mlp": 0.01034854, "balance_loss_clip": 1.26808953, "balance_loss_mlp": 1.01171589, "epoch": 0.4648729896287389, "flos": 34288264523520.0, "grad_norm": 6.52787957180364, "language_loss": 0.66027224, "learning_rate": 2.323192909069061e-06, "loss": 0.68505669, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.23132324, "step": 7732, "time_per_iteration": 2.935483455657959 }, { "auxiliary_loss_clip": 0.01465251, "auxiliary_loss_mlp": 0.01046922, "balance_loss_clip": 1.28238237, "balance_loss_mlp": 1.02158976, "epoch": 0.4649331128814069, "flos": 21330972558720.0, "grad_norm": 2.4757436415295992, "language_loss": 0.73775566, "learning_rate": 2.32280855998725e-06, "loss": 0.7628774, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 1.82910156, "router_z_loss_mlp": 0.25341797, "step": 7733, "time_per_iteration": 2.8409266471862793 }, { "auxiliary_loss_clip": 0.01274518, "auxiliary_loss_mlp": 0.01056719, "balance_loss_clip": 1.17386246, "balance_loss_mlp": 1.02639186, "epoch": 0.46499323613407484, "flos": 58334223173760.0, "grad_norm": 1.2316066411508926, "language_loss": 0.51962274, "learning_rate": 2.3224241986645057e-06, "loss": 0.54293513, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.30273438, "step": 7734, "time_per_iteration": 3.2833995819091797 }, { "auxiliary_loss_clip": 0.01457664, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.27870202, "balance_loss_mlp": 1.01445746, "epoch": 0.4650533593867428, "flos": 10896873402240.0, "grad_norm": 2.3686845254026605, "language_loss": 0.76397306, "learning_rate": 2.3220398251154035e-06, "loss": 0.78892839, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.23425293, "step": 7735, "time_per_iteration": 2.840210437774658 }, { "auxiliary_loss_clip": 0.01446316, "auxiliary_loss_mlp": 0.01044473, "balance_loss_clip": 1.27075672, "balance_loss_mlp": 1.02033329, "epoch": 0.46511348263941077, "flos": 19984157285760.0, "grad_norm": 7.523662309823855, "language_loss": 0.70556885, "learning_rate": 2.321655439354519e-06, "loss": 0.73047674, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.24133301, "step": 7736, "time_per_iteration": 2.9055418968200684 }, { "auxiliary_loss_clip": 0.01436736, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.26337123, "balance_loss_mlp": 1.010607, "epoch": 0.46517360589207873, "flos": 19686771342720.0, "grad_norm": 1.6987821331134665, "language_loss": 0.73073918, "learning_rate": 2.321271041396427e-06, "loss": 0.75545382, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.24121094, "step": 7737, "time_per_iteration": 2.830345630645752 }, { "auxiliary_loss_clip": 0.01464695, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.28554893, "balance_loss_mlp": 1.01330674, "epoch": 0.46523372914474675, "flos": 16881259109760.0, "grad_norm": 2.1427199503492185, "language_loss": 0.84554875, "learning_rate": 2.3208866312557065e-06, "loss": 0.87056541, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.23669434, "step": 7738, "time_per_iteration": 4.324207782745361 }, { "auxiliary_loss_clip": 0.01268355, "auxiliary_loss_mlp": 0.0104337, "balance_loss_clip": 1.1697135, "balance_loss_mlp": 1.01800191, "epoch": 0.4652938523974147, "flos": 53468699581440.0, "grad_norm": 0.7841903451665665, "language_loss": 0.57967955, "learning_rate": 2.320502208946932e-06, "loss": 0.60279679, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.25390625, "step": 7739, "time_per_iteration": 3.4321415424346924 }, { "auxiliary_loss_clip": 0.01458817, "auxiliary_loss_mlp": 0.01043657, "balance_loss_clip": 1.27982616, "balance_loss_mlp": 1.02032804, "epoch": 0.4653539756500827, "flos": 15239591602560.0, "grad_norm": 1.7685700234155608, "language_loss": 0.85815644, "learning_rate": 2.3201177744846815e-06, "loss": 0.88318115, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.2331543, "step": 7740, "time_per_iteration": 2.8683462142944336 }, { "auxiliary_loss_clip": 0.01450251, "auxiliary_loss_mlp": 0.01044715, "balance_loss_clip": 1.27567184, "balance_loss_mlp": 1.02094471, "epoch": 0.46541409890275065, "flos": 23742690883200.0, "grad_norm": 1.4867486481983532, "language_loss": 0.76132101, "learning_rate": 2.3197333278835327e-06, "loss": 0.78627068, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.23791504, "step": 7741, "time_per_iteration": 2.917785882949829 }, { "auxiliary_loss_clip": 0.01474025, "auxiliary_loss_mlp": 0.01043475, "balance_loss_clip": 1.29195905, "balance_loss_mlp": 1.0209204, "epoch": 0.4654742221554186, "flos": 20856362158080.0, "grad_norm": 1.7989287701058376, "language_loss": 0.8139236, "learning_rate": 2.319348869158064e-06, "loss": 0.83909857, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22546387, "step": 7742, "time_per_iteration": 2.9295310974121094 }, { "auxiliary_loss_clip": 0.01453904, "auxiliary_loss_mlp": 0.0104252, "balance_loss_clip": 1.27441049, "balance_loss_mlp": 1.01910782, "epoch": 0.4655343454080866, "flos": 20714727375360.0, "grad_norm": 1.715048983217858, "language_loss": 0.73633319, "learning_rate": 2.3189643983228555e-06, "loss": 0.7612974, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.23413086, "step": 7743, "time_per_iteration": 2.848858118057251 }, { "auxiliary_loss_clip": 0.01459384, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.28086567, "balance_loss_mlp": 1.01638889, "epoch": 0.46559446866075455, "flos": 18999256320000.0, "grad_norm": 2.481771428746751, "language_loss": 0.7234149, "learning_rate": 2.318579915392483e-06, "loss": 0.74841428, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.24157715, "step": 7744, "time_per_iteration": 2.8956353664398193 }, { "auxiliary_loss_clip": 0.01446884, "auxiliary_loss_mlp": 0.01037524, "balance_loss_clip": 1.27178741, "balance_loss_mlp": 1.01408696, "epoch": 0.4656545919134225, "flos": 34509810902400.0, "grad_norm": 2.591972612225464, "language_loss": 0.85855019, "learning_rate": 2.31819542038153e-06, "loss": 0.88339424, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.234375, "step": 7745, "time_per_iteration": 2.9566757678985596 }, { "auxiliary_loss_clip": 0.01439627, "auxiliary_loss_mlp": 0.01042842, "balance_loss_clip": 1.26432097, "balance_loss_mlp": 1.01987052, "epoch": 0.4657147151660905, "flos": 24319726807680.0, "grad_norm": 1.4905374709692187, "language_loss": 0.7336477, "learning_rate": 2.317810913304574e-06, "loss": 0.75847238, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.22961426, "step": 7746, "time_per_iteration": 4.342200994491577 }, { "auxiliary_loss_clip": 0.01438066, "auxiliary_loss_mlp": 0.01043489, "balance_loss_clip": 1.26488543, "balance_loss_mlp": 1.0208751, "epoch": 0.46577483841875844, "flos": 58814697415680.0, "grad_norm": 1.486868359582299, "language_loss": 0.70790726, "learning_rate": 2.3174263941761963e-06, "loss": 0.73272276, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.22607422, "step": 7747, "time_per_iteration": 4.635173320770264 }, { "auxiliary_loss_clip": 0.0145482, "auxiliary_loss_mlp": 0.01037776, "balance_loss_clip": 1.27895141, "balance_loss_mlp": 1.01535308, "epoch": 0.4658349616714264, "flos": 31334557011840.0, "grad_norm": 1.7341595062304458, "language_loss": 0.68287826, "learning_rate": 2.317041863010978e-06, "loss": 0.7078042, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.22424316, "step": 7748, "time_per_iteration": 2.9657742977142334 }, { "auxiliary_loss_clip": 0.01466496, "auxiliary_loss_mlp": 0.01038301, "balance_loss_clip": 1.28481126, "balance_loss_mlp": 1.01519787, "epoch": 0.46589508492409437, "flos": 14866954277760.0, "grad_norm": 1.9596727707578627, "language_loss": 0.65822107, "learning_rate": 2.3166573198235007e-06, "loss": 0.68326902, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.23095703, "step": 7749, "time_per_iteration": 2.80350923538208 }, { "auxiliary_loss_clip": 0.01477011, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.29642487, "balance_loss_mlp": 1.02336943, "epoch": 0.46595520817676234, "flos": 12903622352640.0, "grad_norm": 2.3367972089632674, "language_loss": 0.74379903, "learning_rate": 2.3162727646283456e-06, "loss": 0.76903701, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 0.23425293, "step": 7750, "time_per_iteration": 2.87186598777771 }, { "auxiliary_loss_clip": 0.01467396, "auxiliary_loss_mlp": 0.01035519, "balance_loss_clip": 1.28738272, "balance_loss_mlp": 1.01180792, "epoch": 0.46601533142943036, "flos": 32867102764800.0, "grad_norm": 1.8135778067664545, "language_loss": 0.74904871, "learning_rate": 2.3158881974400963e-06, "loss": 0.77407789, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.23730469, "step": 7751, "time_per_iteration": 2.968078851699829 }, { "auxiliary_loss_clip": 0.01465479, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.2827183, "balance_loss_mlp": 1.02093375, "epoch": 0.4660754546820983, "flos": 19975244060160.0, "grad_norm": 1.776971704630873, "language_loss": 0.73788834, "learning_rate": 2.3155036182733345e-06, "loss": 0.76299167, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.23925781, "step": 7752, "time_per_iteration": 2.8290586471557617 }, { "auxiliary_loss_clip": 0.01470458, "auxiliary_loss_mlp": 0.01043006, "balance_loss_clip": 1.28890622, "balance_loss_mlp": 1.01952159, "epoch": 0.4661355779347663, "flos": 26699520286080.0, "grad_norm": 2.155426009934996, "language_loss": 0.69726419, "learning_rate": 2.315119027142644e-06, "loss": 0.72239882, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.23461914, "step": 7753, "time_per_iteration": 2.922971725463867 }, { "auxiliary_loss_clip": 0.01450836, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.27548385, "balance_loss_mlp": 1.01225865, "epoch": 0.46619570118743425, "flos": 20969193985920.0, "grad_norm": 1.721509261262011, "language_loss": 0.73656654, "learning_rate": 2.3147344240626076e-06, "loss": 0.76143074, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.23339844, "step": 7754, "time_per_iteration": 2.9297406673431396 }, { "auxiliary_loss_clip": 0.01461223, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.28119779, "balance_loss_mlp": 1.01720691, "epoch": 0.4662558244401022, "flos": 24436856891520.0, "grad_norm": 1.7304423913917106, "language_loss": 0.79369122, "learning_rate": 2.3143498090478114e-06, "loss": 0.81870478, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.22924805, "step": 7755, "time_per_iteration": 2.912994146347046 }, { "auxiliary_loss_clip": 0.01437336, "auxiliary_loss_mlp": 0.01042215, "balance_loss_clip": 1.26363611, "balance_loss_mlp": 1.01966035, "epoch": 0.4663159476927702, "flos": 20605334152320.0, "grad_norm": 1.6705440285554565, "language_loss": 0.7358824, "learning_rate": 2.3139651821128382e-06, "loss": 0.76067793, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.22570801, "step": 7756, "time_per_iteration": 2.8729231357574463 }, { "auxiliary_loss_clip": 0.01442018, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.26777363, "balance_loss_mlp": 1.01737022, "epoch": 0.46637607094543815, "flos": 25671745232640.0, "grad_norm": 1.709709463394274, "language_loss": 0.78756142, "learning_rate": 2.313580543272274e-06, "loss": 0.81238884, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.23364258, "step": 7757, "time_per_iteration": 2.877902030944824 }, { "auxiliary_loss_clip": 0.01452158, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.27576101, "balance_loss_mlp": 1.0115335, "epoch": 0.4664361941981061, "flos": 24282960768000.0, "grad_norm": 2.172393648766265, "language_loss": 0.6721217, "learning_rate": 2.313195892540705e-06, "loss": 0.69697106, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21252441, "step": 7758, "time_per_iteration": 2.9181931018829346 }, { "auxiliary_loss_clip": 0.01456554, "auxiliary_loss_mlp": 0.01039626, "balance_loss_clip": 1.2802012, "balance_loss_mlp": 1.0170598, "epoch": 0.4664963174507741, "flos": 18415071717120.0, "grad_norm": 1.741751361813492, "language_loss": 0.75791776, "learning_rate": 2.3128112299327147e-06, "loss": 0.78287959, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.22570801, "step": 7759, "time_per_iteration": 2.8382821083068848 }, { "auxiliary_loss_clip": 0.01459257, "auxiliary_loss_mlp": 0.01041086, "balance_loss_clip": 1.28354692, "balance_loss_mlp": 1.01867437, "epoch": 0.46655644070344204, "flos": 22465199923200.0, "grad_norm": 1.5327456154332444, "language_loss": 0.7774961, "learning_rate": 2.312426555462893e-06, "loss": 0.80249953, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.22412109, "step": 7760, "time_per_iteration": 2.8832669258117676 }, { "auxiliary_loss_clip": 0.01444831, "auxiliary_loss_mlp": 0.01044619, "balance_loss_clip": 1.27091575, "balance_loss_mlp": 1.0222671, "epoch": 0.46661656395611, "flos": 13816348583040.0, "grad_norm": 1.7297299094069096, "language_loss": 0.74897438, "learning_rate": 2.3120418691458237e-06, "loss": 0.7738688, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.22351074, "step": 7761, "time_per_iteration": 2.8712756633758545 }, { "auxiliary_loss_clip": 0.01477935, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.29701936, "balance_loss_mlp": 1.02123713, "epoch": 0.466676687208778, "flos": 21661957405440.0, "grad_norm": 1.5493029345338603, "language_loss": 0.7944628, "learning_rate": 2.3116571709960956e-06, "loss": 0.81970096, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.24645996, "step": 7762, "time_per_iteration": 2.872647285461426 }, { "auxiliary_loss_clip": 0.01247654, "auxiliary_loss_mlp": 0.01048839, "balance_loss_clip": 1.14730597, "balance_loss_mlp": 1.01946604, "epoch": 0.46673681046144594, "flos": 68565145340160.0, "grad_norm": 0.8155508127491906, "language_loss": 0.59864938, "learning_rate": 2.311272461028297e-06, "loss": 0.62161428, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.29296875, "step": 7763, "time_per_iteration": 3.436049461364746 }, { "auxiliary_loss_clip": 0.01466608, "auxiliary_loss_mlp": 0.0104357, "balance_loss_clip": 1.28489578, "balance_loss_mlp": 1.01977539, "epoch": 0.46679693371411396, "flos": 15822735575040.0, "grad_norm": 2.1462561655029786, "language_loss": 0.7943126, "learning_rate": 2.3108877392570146e-06, "loss": 0.81941438, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.23803711, "step": 7764, "time_per_iteration": 2.810814380645752 }, { "auxiliary_loss_clip": 0.01458002, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.28324461, "balance_loss_mlp": 1.02278829, "epoch": 0.4668570569667819, "flos": 18523605288960.0, "grad_norm": 1.8929049609779096, "language_loss": 0.73463857, "learning_rate": 2.310503005696839e-06, "loss": 0.75967181, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 1.74902344, "router_z_loss_mlp": 0.22509766, "step": 7765, "time_per_iteration": 2.8739616870880127 }, { "auxiliary_loss_clip": 0.01467854, "auxiliary_loss_mlp": 0.01045325, "balance_loss_clip": 1.28898942, "balance_loss_mlp": 1.02259159, "epoch": 0.4669171802194499, "flos": 19215554302080.0, "grad_norm": 2.3548115022829648, "language_loss": 0.78712451, "learning_rate": 2.3101182603623576e-06, "loss": 0.81225622, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.22729492, "step": 7766, "time_per_iteration": 2.935147523880005 }, { "auxiliary_loss_clip": 0.01457093, "auxiliary_loss_mlp": 0.01040211, "balance_loss_clip": 1.28139865, "balance_loss_mlp": 1.01857424, "epoch": 0.46697730347211786, "flos": 12283712340480.0, "grad_norm": 4.640353268551855, "language_loss": 0.66232812, "learning_rate": 2.3097335032681607e-06, "loss": 0.68730116, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21643066, "step": 7767, "time_per_iteration": 2.8599228858947754 }, { "auxiliary_loss_clip": 0.01463348, "auxiliary_loss_mlp": 0.010463, "balance_loss_clip": 1.2857306, "balance_loss_mlp": 1.02368617, "epoch": 0.4670374267247858, "flos": 23597255537280.0, "grad_norm": 1.9821133767787407, "language_loss": 0.75540257, "learning_rate": 2.3093487344288393e-06, "loss": 0.78049898, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.22607422, "step": 7768, "time_per_iteration": 2.8706419467926025 }, { "auxiliary_loss_clip": 0.01455881, "auxiliary_loss_mlp": 0.01047182, "balance_loss_clip": 1.27774239, "balance_loss_mlp": 1.02517605, "epoch": 0.4670975499774538, "flos": 15997924016640.0, "grad_norm": 1.6213766050273777, "language_loss": 0.71301925, "learning_rate": 2.308963953858982e-06, "loss": 0.73804986, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.2199707, "step": 7769, "time_per_iteration": 2.871973752975464 }, { "auxiliary_loss_clip": 0.01466626, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.2873919, "balance_loss_mlp": 1.01782739, "epoch": 0.46715767323012175, "flos": 15386022334080.0, "grad_norm": 2.1237013647979186, "language_loss": 0.82153803, "learning_rate": 2.3085791615731803e-06, "loss": 0.84661067, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22802734, "step": 7770, "time_per_iteration": 2.841442823410034 }, { "auxiliary_loss_clip": 0.01248082, "auxiliary_loss_mlp": 0.01028948, "balance_loss_clip": 1.14804125, "balance_loss_mlp": 1.00129151, "epoch": 0.4672177964827897, "flos": 60281013484800.0, "grad_norm": 0.8648783717561113, "language_loss": 0.55666459, "learning_rate": 2.3081943575860265e-06, "loss": 0.57943487, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.27734375, "step": 7771, "time_per_iteration": 3.4392666816711426 }, { "auxiliary_loss_clip": 0.0145727, "auxiliary_loss_mlp": 0.01044023, "balance_loss_clip": 1.28215218, "balance_loss_mlp": 1.02179074, "epoch": 0.4672779197354577, "flos": 27647564722560.0, "grad_norm": 2.9354985649339875, "language_loss": 0.66069746, "learning_rate": 2.3078095419121117e-06, "loss": 0.68571043, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.22216797, "step": 7772, "time_per_iteration": 2.9237897396087646 }, { "auxiliary_loss_clip": 0.01462196, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.28638244, "balance_loss_mlp": 1.0176096, "epoch": 0.46733804298812565, "flos": 31406686502400.0, "grad_norm": 2.2021678298027845, "language_loss": 0.64753932, "learning_rate": 2.3074247145660283e-06, "loss": 0.67255431, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21679688, "step": 7773, "time_per_iteration": 4.337916612625122 }, { "auxiliary_loss_clip": 0.01456129, "auxiliary_loss_mlp": 0.01041528, "balance_loss_clip": 1.27808022, "balance_loss_mlp": 1.01892591, "epoch": 0.4673981662407936, "flos": 19510135067520.0, "grad_norm": 2.1223566643519307, "language_loss": 0.80429977, "learning_rate": 2.3070398755623685e-06, "loss": 0.82927632, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.22570801, "step": 7774, "time_per_iteration": 2.847783088684082 }, { "auxiliary_loss_clip": 0.01462986, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.28304553, "balance_loss_mlp": 1.01835561, "epoch": 0.4674582894934616, "flos": 20531530604160.0, "grad_norm": 1.7264233916950047, "language_loss": 0.79163373, "learning_rate": 2.306655024915726e-06, "loss": 0.81666493, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.21789551, "step": 7775, "time_per_iteration": 2.8886308670043945 }, { "auxiliary_loss_clip": 0.0145233, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.27652788, "balance_loss_mlp": 1.01412368, "epoch": 0.46751841274612954, "flos": 22101113865600.0, "grad_norm": 2.0266299626234274, "language_loss": 0.70380569, "learning_rate": 2.306270162640694e-06, "loss": 0.72868711, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21691895, "step": 7776, "time_per_iteration": 2.873547077178955 }, { "auxiliary_loss_clip": 0.01452846, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.27631664, "balance_loss_mlp": 1.0152185, "epoch": 0.46757853599879756, "flos": 26991522097920.0, "grad_norm": 1.8634497707474054, "language_loss": 0.74115914, "learning_rate": 2.3058852887518678e-06, "loss": 0.76605368, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.21398926, "step": 7777, "time_per_iteration": 2.9397499561309814 }, { "auxiliary_loss_clip": 0.01465381, "auxiliary_loss_mlp": 0.01039537, "balance_loss_clip": 1.28572893, "balance_loss_mlp": 1.01790094, "epoch": 0.4676386592514655, "flos": 24144719345280.0, "grad_norm": 6.61439344617977, "language_loss": 0.70751655, "learning_rate": 2.3055004032638394e-06, "loss": 0.7325657, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.21630859, "step": 7778, "time_per_iteration": 2.911848306655884 }, { "auxiliary_loss_clip": 0.01472656, "auxiliary_loss_mlp": 0.01040095, "balance_loss_clip": 1.29146683, "balance_loss_mlp": 1.01825547, "epoch": 0.4676987825041335, "flos": 25494656509440.0, "grad_norm": 2.2576013081586557, "language_loss": 0.74121583, "learning_rate": 2.305115506191206e-06, "loss": 0.76634336, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.21838379, "step": 7779, "time_per_iteration": 2.897386074066162 }, { "auxiliary_loss_clip": 0.01443896, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.26953268, "balance_loss_mlp": 1.02047634, "epoch": 0.46775890575680146, "flos": 21955497540480.0, "grad_norm": 2.434361575367322, "language_loss": 0.72948003, "learning_rate": 2.304730597548562e-06, "loss": 0.75434327, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21948242, "step": 7780, "time_per_iteration": 2.901930809020996 }, { "auxiliary_loss_clip": 0.01484651, "auxiliary_loss_mlp": 0.01042253, "balance_loss_clip": 1.29915869, "balance_loss_mlp": 1.02055717, "epoch": 0.4678190290094694, "flos": 25239285002880.0, "grad_norm": 1.8217064057666252, "language_loss": 0.7454983, "learning_rate": 2.3043456773505023e-06, "loss": 0.77076733, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 1.85253906, "router_z_loss_mlp": 0.21704102, "step": 7781, "time_per_iteration": 4.40968132019043 }, { "auxiliary_loss_clip": 0.01478606, "auxiliary_loss_mlp": 0.01041811, "balance_loss_clip": 1.29650533, "balance_loss_mlp": 1.01907814, "epoch": 0.4678791522621374, "flos": 32280610677120.0, "grad_norm": 1.686313850230819, "language_loss": 0.63127172, "learning_rate": 2.3039607456116252e-06, "loss": 0.6564759, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 1.81835938, "router_z_loss_mlp": 0.22753906, "step": 7782, "time_per_iteration": 4.437561750411987 }, { "auxiliary_loss_clip": 0.01471635, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.2900548, "balance_loss_mlp": 1.01655054, "epoch": 0.46793927551480535, "flos": 27056231441280.0, "grad_norm": 2.093456807640064, "language_loss": 0.63547397, "learning_rate": 2.3035758023465254e-06, "loss": 0.66056943, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.21362305, "step": 7783, "time_per_iteration": 4.292320966720581 }, { "auxiliary_loss_clip": 0.0148646, "auxiliary_loss_mlp": 0.01042454, "balance_loss_clip": 1.29947686, "balance_loss_mlp": 1.01976824, "epoch": 0.4679993987674733, "flos": 17466393853440.0, "grad_norm": 2.8066610323514922, "language_loss": 0.68540865, "learning_rate": 2.303190847569801e-06, "loss": 0.71069777, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 0.22680664, "step": 7784, "time_per_iteration": 2.880420207977295 }, { "auxiliary_loss_clip": 0.01461677, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.28419805, "balance_loss_mlp": 1.01837587, "epoch": 0.4680595220201413, "flos": 17173939593600.0, "grad_norm": 3.1240682353637212, "language_loss": 0.85588259, "learning_rate": 2.3028058812960497e-06, "loss": 0.88089532, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.2121582, "step": 7785, "time_per_iteration": 2.8619790077209473 }, { "auxiliary_loss_clip": 0.01464067, "auxiliary_loss_mlp": 0.01042024, "balance_loss_clip": 1.28458309, "balance_loss_mlp": 1.01906419, "epoch": 0.46811964527280925, "flos": 11334943987200.0, "grad_norm": 3.101091957656719, "language_loss": 0.78012633, "learning_rate": 2.3024209035398678e-06, "loss": 0.80518723, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22937012, "step": 7786, "time_per_iteration": 2.813344717025757 }, { "auxiliary_loss_clip": 0.01449612, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.27538788, "balance_loss_mlp": 1.01250446, "epoch": 0.4681797685254772, "flos": 24289204550400.0, "grad_norm": 2.460105322903489, "language_loss": 0.75006241, "learning_rate": 2.302035914315856e-06, "loss": 0.77489024, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.20666504, "step": 7787, "time_per_iteration": 2.8822031021118164 }, { "auxiliary_loss_clip": 0.01473976, "auxiliary_loss_mlp": 0.0104244, "balance_loss_clip": 1.29414427, "balance_loss_mlp": 1.01958752, "epoch": 0.4682398917781452, "flos": 31663053394560.0, "grad_norm": 1.636886613708989, "language_loss": 0.66125584, "learning_rate": 2.3016509136386116e-06, "loss": 0.68641996, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.22851562, "step": 7788, "time_per_iteration": 2.9383580684661865 }, { "auxiliary_loss_clip": 0.01468229, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.29083395, "balance_loss_mlp": 1.0109601, "epoch": 0.46830001503081314, "flos": 28122401347200.0, "grad_norm": 2.093833991661311, "language_loss": 0.64904666, "learning_rate": 2.3012659015227343e-06, "loss": 0.67404366, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 1.77539062, "router_z_loss_mlp": 0.20507812, "step": 7789, "time_per_iteration": 2.9214930534362793 }, { "auxiliary_loss_clip": 0.01249193, "auxiliary_loss_mlp": 0.01023541, "balance_loss_clip": 1.14703476, "balance_loss_mlp": 1.00160646, "epoch": 0.4683601382834811, "flos": 57910178476800.0, "grad_norm": 0.7419934930033778, "language_loss": 0.61923254, "learning_rate": 2.300880877982825e-06, "loss": 0.64195991, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.21972656, "step": 7790, "time_per_iteration": 3.4815359115600586 }, { "auxiliary_loss_clip": 0.01466685, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.29022551, "balance_loss_mlp": 1.01915669, "epoch": 0.46842026153614913, "flos": 21882010705920.0, "grad_norm": 1.7641774248931008, "language_loss": 0.79334438, "learning_rate": 2.3004958430334808e-06, "loss": 0.8184182, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.21533203, "step": 7791, "time_per_iteration": 2.871277093887329 }, { "auxiliary_loss_clip": 0.01460662, "auxiliary_loss_mlp": 0.01040108, "balance_loss_clip": 1.28444588, "balance_loss_mlp": 1.01828074, "epoch": 0.4684803847888171, "flos": 24911829250560.0, "grad_norm": 1.5101446446404527, "language_loss": 0.75520283, "learning_rate": 2.3001107966893052e-06, "loss": 0.78021049, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.21838379, "step": 7792, "time_per_iteration": 2.9237585067749023 }, { "auxiliary_loss_clip": 0.01453348, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.27958024, "balance_loss_mlp": 1.0175252, "epoch": 0.46854050804148506, "flos": 26263259493120.0, "grad_norm": 1.569305039042321, "language_loss": 0.68550348, "learning_rate": 2.299725738964898e-06, "loss": 0.71041799, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20568848, "step": 7793, "time_per_iteration": 2.9084737300872803 }, { "auxiliary_loss_clip": 0.01461058, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.28563738, "balance_loss_mlp": 1.01486659, "epoch": 0.468600631294153, "flos": 21589737425280.0, "grad_norm": 1.612128752491402, "language_loss": 0.74312907, "learning_rate": 2.2993406698748607e-06, "loss": 0.76810116, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.21289062, "step": 7794, "time_per_iteration": 2.8693981170654297 }, { "auxiliary_loss_clip": 0.01471599, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.29467499, "balance_loss_mlp": 1.01950848, "epoch": 0.468660754546821, "flos": 25896820705920.0, "grad_norm": 1.5543804471693339, "language_loss": 0.64732748, "learning_rate": 2.2989555894337953e-06, "loss": 0.67245722, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.21850586, "step": 7795, "time_per_iteration": 2.91186785697937 }, { "auxiliary_loss_clip": 0.01470685, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.29536247, "balance_loss_mlp": 1.01596737, "epoch": 0.46872087779948896, "flos": 35487020252160.0, "grad_norm": 2.2239675471800218, "language_loss": 0.69416094, "learning_rate": 2.298570497656304e-06, "loss": 0.71923, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.20251465, "step": 7796, "time_per_iteration": 3.0099494457244873 }, { "auxiliary_loss_clip": 0.01472959, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.29321802, "balance_loss_mlp": 1.01819062, "epoch": 0.4687810010521569, "flos": 26407744698240.0, "grad_norm": 1.7875485151369395, "language_loss": 0.71962047, "learning_rate": 2.2981853945569894e-06, "loss": 0.74474311, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.21105957, "step": 7797, "time_per_iteration": 3.064347505569458 }, { "auxiliary_loss_clip": 0.01500298, "auxiliary_loss_mlp": 0.01039021, "balance_loss_clip": 1.31849575, "balance_loss_mlp": 1.01748037, "epoch": 0.4688411243048249, "flos": 19981849800960.0, "grad_norm": 1.9146898263250078, "language_loss": 0.68150985, "learning_rate": 2.297800280150454e-06, "loss": 0.70690298, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 1.81640625, "router_z_loss_mlp": 0.21533203, "step": 7798, "time_per_iteration": 2.833261013031006 }, { "auxiliary_loss_clip": 0.01260737, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.15929401, "balance_loss_mlp": 1.00628614, "epoch": 0.46890124755749285, "flos": 64007622236160.0, "grad_norm": 0.9289949272670465, "language_loss": 0.64611316, "learning_rate": 2.2974151544513033e-06, "loss": 0.66904473, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.26171875, "step": 7799, "time_per_iteration": 3.5514254570007324 }, { "auxiliary_loss_clip": 0.01469718, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.29051244, "balance_loss_mlp": 1.0146296, "epoch": 0.4689613708101608, "flos": 23779502167680.0, "grad_norm": 1.5075276318152695, "language_loss": 0.73318988, "learning_rate": 2.2970300174741395e-06, "loss": 0.75824064, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.20727539, "step": 7800, "time_per_iteration": 2.91387939453125 }, { "auxiliary_loss_clip": 0.01481853, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.30502319, "balance_loss_mlp": 1.02116382, "epoch": 0.4690214940628288, "flos": 24798997422720.0, "grad_norm": 2.2490006203645074, "language_loss": 0.73739249, "learning_rate": 2.296644869233568e-06, "loss": 0.76261747, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.19470215, "step": 7801, "time_per_iteration": 2.9136178493499756 }, { "auxiliary_loss_clip": 0.01486086, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.30216217, "balance_loss_mlp": 1.02126169, "epoch": 0.46908161731549675, "flos": 18086394355200.0, "grad_norm": 2.5117133562382756, "language_loss": 0.64280307, "learning_rate": 2.2962597097441936e-06, "loss": 0.66809094, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 1.84179688, "router_z_loss_mlp": 0.21435547, "step": 7802, "time_per_iteration": 2.8764374256134033 }, { "auxiliary_loss_clip": 0.01491123, "auxiliary_loss_mlp": 0.01039463, "balance_loss_clip": 1.31052065, "balance_loss_mlp": 1.01876783, "epoch": 0.4691417405681647, "flos": 25714528830720.0, "grad_norm": 1.7111735555453922, "language_loss": 0.74289227, "learning_rate": 2.2958745390206206e-06, "loss": 0.76819807, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.20703125, "step": 7803, "time_per_iteration": 2.919506311416626 }, { "auxiliary_loss_clip": 0.0147496, "auxiliary_loss_mlp": 0.01044552, "balance_loss_clip": 1.29624987, "balance_loss_mlp": 1.02372646, "epoch": 0.46920186382083273, "flos": 17465941405440.0, "grad_norm": 1.6018488155008592, "language_loss": 0.77750731, "learning_rate": 2.2954893570774558e-06, "loss": 0.80270243, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.20825195, "step": 7804, "time_per_iteration": 2.8506152629852295 }, { "auxiliary_loss_clip": 0.01471433, "auxiliary_loss_mlp": 0.01039168, "balance_loss_clip": 1.29616737, "balance_loss_mlp": 1.01738858, "epoch": 0.4692619870735007, "flos": 20349102994560.0, "grad_norm": 1.6855101613326242, "language_loss": 0.77824265, "learning_rate": 2.295104163929305e-06, "loss": 0.80334866, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.21801758, "step": 7805, "time_per_iteration": 2.8661677837371826 }, { "auxiliary_loss_clip": 0.01490508, "auxiliary_loss_mlp": 0.01044446, "balance_loss_clip": 1.30569816, "balance_loss_mlp": 1.02279735, "epoch": 0.46932211032616866, "flos": 29508380634240.0, "grad_norm": 1.528944816733302, "language_loss": 0.83899462, "learning_rate": 2.2947189595907742e-06, "loss": 0.86434412, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 1.84570312, "router_z_loss_mlp": 0.21655273, "step": 7806, "time_per_iteration": 2.913583755493164 }, { "auxiliary_loss_clip": 0.01485637, "auxiliary_loss_mlp": 0.01043155, "balance_loss_clip": 1.30454922, "balance_loss_mlp": 1.02201939, "epoch": 0.4693822335788366, "flos": 36225146223360.0, "grad_norm": 2.001180856885291, "language_loss": 0.78222942, "learning_rate": 2.294333744076472e-06, "loss": 0.80751735, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21130371, "step": 7807, "time_per_iteration": 3.01485276222229 }, { "auxiliary_loss_clip": 0.01494708, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.31445813, "balance_loss_mlp": 1.01653039, "epoch": 0.4694423568315046, "flos": 20348514812160.0, "grad_norm": 2.3734718760396993, "language_loss": 0.52023339, "learning_rate": 2.2939485174010035e-06, "loss": 0.54556084, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.21520996, "step": 7808, "time_per_iteration": 4.304847002029419 }, { "auxiliary_loss_clip": 0.01259979, "auxiliary_loss_mlp": 0.01023494, "balance_loss_clip": 1.16038191, "balance_loss_mlp": 1.00022435, "epoch": 0.46950248008417256, "flos": 64353022191360.0, "grad_norm": 0.782187776037097, "language_loss": 0.57843632, "learning_rate": 2.293563279578978e-06, "loss": 0.60127103, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.23242188, "step": 7809, "time_per_iteration": 3.2456390857696533 }, { "auxiliary_loss_clip": 0.01483452, "auxiliary_loss_mlp": 0.01042584, "balance_loss_clip": 1.30134118, "balance_loss_mlp": 1.02070916, "epoch": 0.4695626033368405, "flos": 19207319748480.0, "grad_norm": 2.142945239007722, "language_loss": 0.72492272, "learning_rate": 2.2931780306250045e-06, "loss": 0.75018311, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.21875, "step": 7810, "time_per_iteration": 2.9011523723602295 }, { "auxiliary_loss_clip": 0.01478844, "auxiliary_loss_mlp": 0.01039907, "balance_loss_clip": 1.29854155, "balance_loss_mlp": 1.01838994, "epoch": 0.4696227265895085, "flos": 23012663731200.0, "grad_norm": 1.8374945381940766, "language_loss": 0.82175028, "learning_rate": 2.29279277055369e-06, "loss": 0.84693778, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.21520996, "step": 7811, "time_per_iteration": 2.8768444061279297 }, { "auxiliary_loss_clip": 0.01480574, "auxiliary_loss_mlp": 0.01039235, "balance_loss_clip": 1.3004998, "balance_loss_mlp": 1.01752722, "epoch": 0.46968284984217645, "flos": 21880653361920.0, "grad_norm": 1.6401858213468052, "language_loss": 0.81001389, "learning_rate": 2.292407499379644e-06, "loss": 0.83521199, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.21716309, "step": 7812, "time_per_iteration": 2.8815088272094727 }, { "auxiliary_loss_clip": 0.01463394, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.28846431, "balance_loss_mlp": 1.01955163, "epoch": 0.4697429730948444, "flos": 19984654978560.0, "grad_norm": 1.6410024830192649, "language_loss": 0.75020576, "learning_rate": 2.292022217117477e-06, "loss": 0.77524883, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.21362305, "step": 7813, "time_per_iteration": 2.913576126098633 }, { "auxiliary_loss_clip": 0.01470667, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.29104733, "balance_loss_mlp": 1.01453304, "epoch": 0.4698030963475124, "flos": 15163933017600.0, "grad_norm": 2.15004620440249, "language_loss": 0.85116082, "learning_rate": 2.291636923781798e-06, "loss": 0.8762337, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22094727, "step": 7814, "time_per_iteration": 2.831911325454712 }, { "auxiliary_loss_clip": 0.01450643, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.27605915, "balance_loss_mlp": 1.02440381, "epoch": 0.46986321960018035, "flos": 15157598745600.0, "grad_norm": 2.290244990236686, "language_loss": 0.82353669, "learning_rate": 2.291251619387217e-06, "loss": 0.8484984, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21130371, "step": 7815, "time_per_iteration": 2.8504645824432373 }, { "auxiliary_loss_clip": 0.01457627, "auxiliary_loss_mlp": 0.0103559, "balance_loss_clip": 1.28010654, "balance_loss_mlp": 1.01384628, "epoch": 0.4699233428528483, "flos": 23118392125440.0, "grad_norm": 2.0359092749536383, "language_loss": 0.78001314, "learning_rate": 2.2908663039483468e-06, "loss": 0.80494529, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.21740723, "step": 7816, "time_per_iteration": 4.305444002151489 }, { "auxiliary_loss_clip": 0.01253574, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.15262461, "balance_loss_mlp": 1.0054301, "epoch": 0.46998346610551633, "flos": 68138702668800.0, "grad_norm": 0.8349083415262301, "language_loss": 0.59058952, "learning_rate": 2.290480977479796e-06, "loss": 0.61341417, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.234375, "step": 7817, "time_per_iteration": 4.785445213317871 }, { "auxiliary_loss_clip": 0.01451632, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.27947927, "balance_loss_mlp": 1.01607347, "epoch": 0.4700435893581843, "flos": 24138928010880.0, "grad_norm": 1.6890186170951793, "language_loss": 0.80063552, "learning_rate": 2.2900956399961775e-06, "loss": 0.82552063, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20825195, "step": 7818, "time_per_iteration": 4.355440616607666 }, { "auxiliary_loss_clip": 0.014605, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.28285146, "balance_loss_mlp": 1.01301074, "epoch": 0.47010371261085226, "flos": 20158440831360.0, "grad_norm": 1.874728934131144, "language_loss": 0.84375691, "learning_rate": 2.289710291512104e-06, "loss": 0.86869532, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.20336914, "step": 7819, "time_per_iteration": 2.9428608417510986 }, { "auxiliary_loss_clip": 0.01466979, "auxiliary_loss_mlp": 0.01038391, "balance_loss_clip": 1.28756261, "balance_loss_mlp": 1.01608729, "epoch": 0.47016383586352023, "flos": 15130560337920.0, "grad_norm": 2.8465941419502623, "language_loss": 0.77319783, "learning_rate": 2.289324932042186e-06, "loss": 0.79825151, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 1.79492188, "router_z_loss_mlp": 0.22302246, "step": 7820, "time_per_iteration": 2.8793141841888428 }, { "auxiliary_loss_clip": 0.01456593, "auxiliary_loss_mlp": 0.01036752, "balance_loss_clip": 1.28267908, "balance_loss_mlp": 1.01651084, "epoch": 0.4702239591161882, "flos": 13560841342080.0, "grad_norm": 1.9837884291164707, "language_loss": 0.74586463, "learning_rate": 2.288939561601039e-06, "loss": 0.77079809, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20227051, "step": 7821, "time_per_iteration": 2.900174856185913 }, { "auxiliary_loss_clip": 0.01453181, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.27872109, "balance_loss_mlp": 1.01554513, "epoch": 0.47028408236885616, "flos": 24286308883200.0, "grad_norm": 1.8033370683402354, "language_loss": 0.8917166, "learning_rate": 2.2885541802032746e-06, "loss": 0.91660672, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.20288086, "step": 7822, "time_per_iteration": 2.898970603942871 }, { "auxiliary_loss_clip": 0.01471444, "auxiliary_loss_mlp": 0.01035681, "balance_loss_clip": 1.29434514, "balance_loss_mlp": 1.01512957, "epoch": 0.4703442056215241, "flos": 22867228385280.0, "grad_norm": 1.5446502409283804, "language_loss": 0.80584037, "learning_rate": 2.2881687878635055e-06, "loss": 0.83091164, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.20532227, "step": 7823, "time_per_iteration": 2.842829942703247 }, { "auxiliary_loss_clip": 0.01249799, "auxiliary_loss_mlp": 0.01043991, "balance_loss_clip": 1.14823866, "balance_loss_mlp": 1.02215183, "epoch": 0.4704043288741921, "flos": 69274604090880.0, "grad_norm": 0.7087614436710279, "language_loss": 0.56746227, "learning_rate": 2.2877833845963487e-06, "loss": 0.59040016, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.21875, "step": 7824, "time_per_iteration": 3.43638014793396 }, { "auxiliary_loss_clip": 0.01469268, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.29015899, "balance_loss_mlp": 1.01626873, "epoch": 0.47046445212686006, "flos": 18050216497920.0, "grad_norm": 1.704437578773318, "language_loss": 0.81644452, "learning_rate": 2.2873979704164157e-06, "loss": 0.84152389, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.22412109, "step": 7825, "time_per_iteration": 2.8232266902923584 }, { "auxiliary_loss_clip": 0.01468259, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.28833401, "balance_loss_mlp": 1.01859975, "epoch": 0.470524575379528, "flos": 23962020266880.0, "grad_norm": 1.6854491696583402, "language_loss": 0.67473251, "learning_rate": 2.287012545338324e-06, "loss": 0.6998229, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.22180176, "step": 7826, "time_per_iteration": 2.9147801399230957 }, { "auxiliary_loss_clip": 0.0146393, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.28512299, "balance_loss_mlp": 1.01826119, "epoch": 0.470584698632196, "flos": 18122572212480.0, "grad_norm": 1.7331574855348737, "language_loss": 0.84073484, "learning_rate": 2.2866271093766877e-06, "loss": 0.86576855, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21191406, "step": 7827, "time_per_iteration": 2.8743977546691895 }, { "auxiliary_loss_clip": 0.01253659, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.15188205, "balance_loss_mlp": 1.00680661, "epoch": 0.47064482188486395, "flos": 57277735655040.0, "grad_norm": 0.8008181991902088, "language_loss": 0.55612749, "learning_rate": 2.286241662546122e-06, "loss": 0.57898772, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.25585938, "step": 7828, "time_per_iteration": 3.284156560897827 }, { "auxiliary_loss_clip": 0.01454124, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.27779698, "balance_loss_mlp": 1.01503563, "epoch": 0.4707049451375319, "flos": 17904147724800.0, "grad_norm": 1.8946849734897582, "language_loss": 0.82127982, "learning_rate": 2.285856204861245e-06, "loss": 0.84617382, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.20239258, "step": 7829, "time_per_iteration": 2.8616549968719482 }, { "auxiliary_loss_clip": 0.01457314, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.28147697, "balance_loss_mlp": 1.01650906, "epoch": 0.47076506839019994, "flos": 25244669134080.0, "grad_norm": 1.4220725499870086, "language_loss": 0.76318157, "learning_rate": 2.2854707363366703e-06, "loss": 0.78813314, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21337891, "step": 7830, "time_per_iteration": 2.9515786170959473 }, { "auxiliary_loss_clip": 0.0146302, "auxiliary_loss_mlp": 0.01034513, "balance_loss_clip": 1.28772914, "balance_loss_mlp": 1.0125314, "epoch": 0.4708251916428679, "flos": 13487444997120.0, "grad_norm": 2.496806263472205, "language_loss": 0.79466593, "learning_rate": 2.2850852569870177e-06, "loss": 0.81964123, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.2199707, "step": 7831, "time_per_iteration": 2.8606808185577393 }, { "auxiliary_loss_clip": 0.01494509, "auxiliary_loss_mlp": 0.01038008, "balance_loss_clip": 1.30913651, "balance_loss_mlp": 1.01649046, "epoch": 0.47088531489553587, "flos": 30158405700480.0, "grad_norm": 1.674181010050731, "language_loss": 0.76316768, "learning_rate": 2.2846997668269033e-06, "loss": 0.7884928, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 1.85644531, "router_z_loss_mlp": 0.21533203, "step": 7832, "time_per_iteration": 2.920445680618286 }, { "auxiliary_loss_clip": 0.01458191, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.28283715, "balance_loss_mlp": 1.01292706, "epoch": 0.47094543814820383, "flos": 21808161912960.0, "grad_norm": 1.4769851532734934, "language_loss": 0.75408977, "learning_rate": 2.2843142658709454e-06, "loss": 0.77900922, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.20837402, "step": 7833, "time_per_iteration": 2.887096881866455 }, { "auxiliary_loss_clip": 0.01464233, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.28715324, "balance_loss_mlp": 1.01962495, "epoch": 0.4710055614008718, "flos": 23013387648000.0, "grad_norm": 1.6741331104421786, "language_loss": 0.76524794, "learning_rate": 2.283928754133762e-06, "loss": 0.79029143, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.20495605, "step": 7834, "time_per_iteration": 2.891139268875122 }, { "auxiliary_loss_clip": 0.01458826, "auxiliary_loss_mlp": 0.01037648, "balance_loss_clip": 1.2839762, "balance_loss_mlp": 1.01680994, "epoch": 0.47106568465353976, "flos": 42756724270080.0, "grad_norm": 1.5334048843906214, "language_loss": 0.67330229, "learning_rate": 2.283543231629972e-06, "loss": 0.69826698, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.20849609, "step": 7835, "time_per_iteration": 3.07818341255188 }, { "auxiliary_loss_clip": 0.01255208, "auxiliary_loss_mlp": 0.01053279, "balance_loss_clip": 1.14560843, "balance_loss_mlp": 1.02352452, "epoch": 0.4711258079062077, "flos": 68580555834240.0, "grad_norm": 0.8785818952814886, "language_loss": 0.6222533, "learning_rate": 2.283157698374194e-06, "loss": 0.64533818, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 1.09375, "router_z_loss_mlp": 0.296875, "step": 7836, "time_per_iteration": 3.3395493030548096 }, { "auxiliary_loss_clip": 0.01490665, "auxiliary_loss_mlp": 0.01037672, "balance_loss_clip": 1.30507672, "balance_loss_mlp": 1.01723909, "epoch": 0.4711859311588757, "flos": 25457347532160.0, "grad_norm": 1.9455392564938272, "language_loss": 0.70281118, "learning_rate": 2.2827721543810475e-06, "loss": 0.72809458, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 1.85546875, "router_z_loss_mlp": 0.2043457, "step": 7837, "time_per_iteration": 2.92879581451416 }, { "auxiliary_loss_clip": 0.0147429, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.29559255, "balance_loss_mlp": 1.0229311, "epoch": 0.47124605441154366, "flos": 21992127845760.0, "grad_norm": 1.8358517601885067, "language_loss": 0.66894317, "learning_rate": 2.282386599665153e-06, "loss": 0.69413471, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21936035, "step": 7838, "time_per_iteration": 2.8693721294403076 }, { "auxiliary_loss_clip": 0.01483333, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.30039167, "balance_loss_mlp": 1.01609349, "epoch": 0.4713061776642116, "flos": 25424110586880.0, "grad_norm": 2.307480758739275, "language_loss": 0.78752005, "learning_rate": 2.2820010342411304e-06, "loss": 0.8127268, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 1.83105469, "router_z_loss_mlp": 0.21240234, "step": 7839, "time_per_iteration": 3.012094497680664 }, { "auxiliary_loss_clip": 0.01451558, "auxiliary_loss_mlp": 0.01040555, "balance_loss_clip": 1.27638173, "balance_loss_mlp": 1.0194912, "epoch": 0.4713663009168796, "flos": 26553587247360.0, "grad_norm": 1.837168159703692, "language_loss": 0.73329735, "learning_rate": 2.2816154581235993e-06, "loss": 0.75821841, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.21069336, "step": 7840, "time_per_iteration": 3.0048975944519043 }, { "auxiliary_loss_clip": 0.01460464, "auxiliary_loss_mlp": 0.01038364, "balance_loss_clip": 1.28310275, "balance_loss_mlp": 1.0179913, "epoch": 0.47142642416954755, "flos": 23634202556160.0, "grad_norm": 1.973271585402389, "language_loss": 0.76117682, "learning_rate": 2.2812298713271833e-06, "loss": 0.78616512, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.20373535, "step": 7841, "time_per_iteration": 2.9037413597106934 }, { "auxiliary_loss_clip": 0.01459689, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.28170085, "balance_loss_mlp": 1.01869416, "epoch": 0.4714865474222155, "flos": 22320443249280.0, "grad_norm": 1.650317573945922, "language_loss": 0.70567644, "learning_rate": 2.280844273866501e-06, "loss": 0.73067003, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.20996094, "step": 7842, "time_per_iteration": 2.8694963455200195 }, { "auxiliary_loss_clip": 0.01458587, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.28056216, "balance_loss_mlp": 1.01599348, "epoch": 0.4715466706748835, "flos": 17831565786240.0, "grad_norm": 1.8913768813448193, "language_loss": 0.79765463, "learning_rate": 2.280458665756177e-06, "loss": 0.82261801, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21728516, "step": 7843, "time_per_iteration": 4.255522012710571 }, { "auxiliary_loss_clip": 0.01461756, "auxiliary_loss_mlp": 0.010415, "balance_loss_clip": 1.28347254, "balance_loss_mlp": 1.02066207, "epoch": 0.4716067939275515, "flos": 23670018455040.0, "grad_norm": 1.8639463398720095, "language_loss": 0.74755329, "learning_rate": 2.280073047010832e-06, "loss": 0.77258581, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.20837402, "step": 7844, "time_per_iteration": 2.8645012378692627 }, { "auxiliary_loss_clip": 0.01442409, "auxiliary_loss_mlp": 0.01041263, "balance_loss_clip": 1.26777339, "balance_loss_mlp": 1.01982892, "epoch": 0.47166691718021947, "flos": 17938877748480.0, "grad_norm": 1.793064000970921, "language_loss": 0.7941916, "learning_rate": 2.279687417645088e-06, "loss": 0.81902838, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.2142334, "step": 7845, "time_per_iteration": 2.842479705810547 }, { "auxiliary_loss_clip": 0.0144826, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.27224171, "balance_loss_mlp": 1.01464176, "epoch": 0.47172704043288743, "flos": 26625761982720.0, "grad_norm": 1.401184361210611, "language_loss": 0.73972797, "learning_rate": 2.2793017776735703e-06, "loss": 0.76456958, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21252441, "step": 7846, "time_per_iteration": 2.916719913482666 }, { "auxiliary_loss_clip": 0.01435932, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.26316524, "balance_loss_mlp": 1.01663291, "epoch": 0.4717871636855554, "flos": 27932825059200.0, "grad_norm": 1.3922680085700205, "language_loss": 0.75336832, "learning_rate": 2.2789161271109e-06, "loss": 0.77810711, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21325684, "step": 7847, "time_per_iteration": 2.918854236602783 }, { "auxiliary_loss_clip": 0.0144397, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.26886129, "balance_loss_mlp": 1.01663494, "epoch": 0.47184728693822336, "flos": 14510514591360.0, "grad_norm": 1.7183308672731796, "language_loss": 0.81597966, "learning_rate": 2.278530465971703e-06, "loss": 0.84079039, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.20471191, "step": 7848, "time_per_iteration": 2.8308908939361572 }, { "auxiliary_loss_clip": 0.01465003, "auxiliary_loss_mlp": 0.01035469, "balance_loss_clip": 1.28665221, "balance_loss_mlp": 1.01409507, "epoch": 0.47190741019089133, "flos": 17865074200320.0, "grad_norm": 6.370362858939226, "language_loss": 0.70873439, "learning_rate": 2.2781447942706032e-06, "loss": 0.73373914, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21386719, "step": 7849, "time_per_iteration": 2.848738193511963 }, { "auxiliary_loss_clip": 0.01462421, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.28122306, "balance_loss_mlp": 1.01790071, "epoch": 0.4719675334435593, "flos": 17904645417600.0, "grad_norm": 2.2195777818397717, "language_loss": 0.70986992, "learning_rate": 2.277759112022224e-06, "loss": 0.73488784, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 1.80957031, "router_z_loss_mlp": 0.21459961, "step": 7850, "time_per_iteration": 2.9214279651641846 }, { "auxiliary_loss_clip": 0.0145751, "auxiliary_loss_mlp": 0.01036851, "balance_loss_clip": 1.27901661, "balance_loss_mlp": 1.0154295, "epoch": 0.47202765669622726, "flos": 20714139192960.0, "grad_norm": 2.474970264276976, "language_loss": 0.75611514, "learning_rate": 2.2773734192411916e-06, "loss": 0.78105879, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21435547, "step": 7851, "time_per_iteration": 2.8645052909851074 }, { "auxiliary_loss_clip": 0.01449633, "auxiliary_loss_mlp": 0.01033693, "balance_loss_clip": 1.27105772, "balance_loss_mlp": 1.01204479, "epoch": 0.4720877799488952, "flos": 16368299101440.0, "grad_norm": 1.8767499447808493, "language_loss": 0.77674288, "learning_rate": 2.276987715942132e-06, "loss": 0.80157614, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21655273, "step": 7852, "time_per_iteration": 4.342456102371216 }, { "auxiliary_loss_clip": 0.01441278, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.26557589, "balance_loss_mlp": 1.01631141, "epoch": 0.4721479032015632, "flos": 20678006580480.0, "grad_norm": 1.5188824052618932, "language_loss": 0.69626069, "learning_rate": 2.2766020021396696e-06, "loss": 0.72108471, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.24816895, "step": 7853, "time_per_iteration": 4.253943681716919 }, { "auxiliary_loss_clip": 0.01250261, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.14952064, "balance_loss_mlp": 0.99757463, "epoch": 0.47220802645423116, "flos": 67786018300800.0, "grad_norm": 0.7067996931114588, "language_loss": 0.50181437, "learning_rate": 2.276216277848432e-06, "loss": 0.52458072, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.28710938, "step": 7854, "time_per_iteration": 3.4705193042755127 }, { "auxiliary_loss_clip": 0.01461464, "auxiliary_loss_mlp": 0.01034324, "balance_loss_clip": 1.28264916, "balance_loss_mlp": 1.01225877, "epoch": 0.4722681497068991, "flos": 20930799133440.0, "grad_norm": 3.1038921284818803, "language_loss": 0.64488685, "learning_rate": 2.2758305430830455e-06, "loss": 0.66984475, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.2208252, "step": 7855, "time_per_iteration": 2.8398942947387695 }, { "auxiliary_loss_clip": 0.01438504, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.26277947, "balance_loss_mlp": 1.01600635, "epoch": 0.4723282729595671, "flos": 28304738467200.0, "grad_norm": 1.859403075528418, "language_loss": 0.76770616, "learning_rate": 2.2754447978581376e-06, "loss": 0.79246485, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21362305, "step": 7856, "time_per_iteration": 2.909841537475586 }, { "auxiliary_loss_clip": 0.01450925, "auxiliary_loss_mlp": 0.01036533, "balance_loss_clip": 1.27640462, "balance_loss_mlp": 1.01476526, "epoch": 0.4723883962122351, "flos": 27136278771840.0, "grad_norm": 2.0442679916783657, "language_loss": 0.75421679, "learning_rate": 2.2750590421883347e-06, "loss": 0.77909136, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21765137, "step": 7857, "time_per_iteration": 2.897757053375244 }, { "auxiliary_loss_clip": 0.0143945, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.26613188, "balance_loss_mlp": 1.0159502, "epoch": 0.47244851946490307, "flos": 31548954712320.0, "grad_norm": 1.4972727236661267, "language_loss": 0.65591621, "learning_rate": 2.2746732760882655e-06, "loss": 0.68067849, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.20837402, "step": 7858, "time_per_iteration": 2.978564977645874 }, { "auxiliary_loss_clip": 0.01429052, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.25767684, "balance_loss_mlp": 1.01753247, "epoch": 0.47250864271757104, "flos": 20896295333760.0, "grad_norm": 1.905955344899447, "language_loss": 0.71314061, "learning_rate": 2.2742874995725575e-06, "loss": 0.73781902, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21252441, "step": 7859, "time_per_iteration": 2.855790138244629 }, { "auxiliary_loss_clip": 0.01458988, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.27835131, "balance_loss_mlp": 1.01610231, "epoch": 0.472568765970239, "flos": 20531847317760.0, "grad_norm": 2.0228191281621912, "language_loss": 0.62672973, "learning_rate": 2.2739017126558413e-06, "loss": 0.65169418, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 1.80566406, "router_z_loss_mlp": 0.21350098, "step": 7860, "time_per_iteration": 2.855999231338501 }, { "auxiliary_loss_clip": 0.01439564, "auxiliary_loss_mlp": 0.01040181, "balance_loss_clip": 1.26314974, "balance_loss_mlp": 1.01881838, "epoch": 0.47262888922290697, "flos": 35817778874880.0, "grad_norm": 5.595022856215944, "language_loss": 0.72678924, "learning_rate": 2.2735159153527445e-06, "loss": 0.75158674, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.21362305, "step": 7861, "time_per_iteration": 2.961784839630127 }, { "auxiliary_loss_clip": 0.01445223, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.26948655, "balance_loss_mlp": 1.01391768, "epoch": 0.47268901247557493, "flos": 20677463642880.0, "grad_norm": 1.916917282784984, "language_loss": 0.86495149, "learning_rate": 2.273130107677896e-06, "loss": 0.88976711, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.22424316, "step": 7862, "time_per_iteration": 2.8530490398406982 }, { "auxiliary_loss_clip": 0.01449058, "auxiliary_loss_mlp": 0.01040726, "balance_loss_clip": 1.27061749, "balance_loss_mlp": 1.01988828, "epoch": 0.4727491357282429, "flos": 19582762250880.0, "grad_norm": 1.7889064881853776, "language_loss": 0.85212266, "learning_rate": 2.272744289645927e-06, "loss": 0.87702048, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.20837402, "step": 7863, "time_per_iteration": 2.837104558944702 }, { "auxiliary_loss_clip": 0.01441237, "auxiliary_loss_mlp": 0.01038207, "balance_loss_clip": 1.26618516, "balance_loss_mlp": 1.01753569, "epoch": 0.47280925898091086, "flos": 18224771512320.0, "grad_norm": 1.8548898131851954, "language_loss": 0.67245579, "learning_rate": 2.272358461271467e-06, "loss": 0.69725025, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.20666504, "step": 7864, "time_per_iteration": 2.834808588027954 }, { "auxiliary_loss_clip": 0.01439444, "auxiliary_loss_mlp": 0.01037238, "balance_loss_clip": 1.26503253, "balance_loss_mlp": 1.01616144, "epoch": 0.4728693822335788, "flos": 17830796624640.0, "grad_norm": 3.537489847010876, "language_loss": 0.66144288, "learning_rate": 2.271972622569147e-06, "loss": 0.68620968, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21069336, "step": 7865, "time_per_iteration": 2.8784751892089844 }, { "auxiliary_loss_clip": 0.01437637, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.26583445, "balance_loss_mlp": 1.01920998, "epoch": 0.4729295054862468, "flos": 20604836459520.0, "grad_norm": 1.752535118665879, "language_loss": 0.74988967, "learning_rate": 2.2715867735535976e-06, "loss": 0.77467614, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21801758, "step": 7866, "time_per_iteration": 2.846238374710083 }, { "auxiliary_loss_clip": 0.01452682, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 1.27481747, "balance_loss_mlp": 1.01346982, "epoch": 0.47298962873891476, "flos": 23378604825600.0, "grad_norm": 2.7315376622189955, "language_loss": 0.83674914, "learning_rate": 2.271200914239451e-06, "loss": 0.86162329, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.21264648, "step": 7867, "time_per_iteration": 2.874845266342163 }, { "auxiliary_loss_clip": 0.01435391, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.26254082, "balance_loss_mlp": 1.01248884, "epoch": 0.4730497519915827, "flos": 22061814117120.0, "grad_norm": 1.643375970538988, "language_loss": 0.80238312, "learning_rate": 2.2708150446413385e-06, "loss": 0.82707769, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.21582031, "step": 7868, "time_per_iteration": 2.8520069122314453 }, { "auxiliary_loss_clip": 0.01450689, "auxiliary_loss_mlp": 0.01042197, "balance_loss_clip": 1.27029562, "balance_loss_mlp": 1.02016735, "epoch": 0.4731098752442507, "flos": 21079718328960.0, "grad_norm": 1.795968225482409, "language_loss": 0.75799346, "learning_rate": 2.2704291647738915e-06, "loss": 0.78292227, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22045898, "step": 7869, "time_per_iteration": 2.8515944480895996 }, { "auxiliary_loss_clip": 0.01453628, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.27673209, "balance_loss_mlp": 1.02133918, "epoch": 0.4731699984969187, "flos": 22539048716160.0, "grad_norm": 1.7660068388069483, "language_loss": 0.74567491, "learning_rate": 2.2700432746517443e-06, "loss": 0.77065092, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.22619629, "step": 7870, "time_per_iteration": 2.8965253829956055 }, { "auxiliary_loss_clip": 0.01486543, "auxiliary_loss_mlp": 0.01047802, "balance_loss_clip": 1.3041575, "balance_loss_mlp": 1.02612948, "epoch": 0.4732301217495867, "flos": 24908435890560.0, "grad_norm": 2.031840180327355, "language_loss": 0.82517678, "learning_rate": 2.2696573742895292e-06, "loss": 0.85052025, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 1.82226562, "router_z_loss_mlp": 0.21679688, "step": 7871, "time_per_iteration": 2.8965330123901367 }, { "auxiliary_loss_clip": 0.01453391, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.27744746, "balance_loss_mlp": 1.02011847, "epoch": 0.47329024500225464, "flos": 22794827425920.0, "grad_norm": 1.5388626835197496, "language_loss": 0.76893228, "learning_rate": 2.269271463701879e-06, "loss": 0.79388237, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21520996, "step": 7872, "time_per_iteration": 2.876718521118164 }, { "auxiliary_loss_clip": 0.01464899, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.28724647, "balance_loss_mlp": 1.0210638, "epoch": 0.4733503682549226, "flos": 38711392012800.0, "grad_norm": 1.9210858742729469, "language_loss": 0.68885744, "learning_rate": 2.268885542903428e-06, "loss": 0.71393776, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22070312, "step": 7873, "time_per_iteration": 3.011988401412964 }, { "auxiliary_loss_clip": 0.01438428, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.26427913, "balance_loss_mlp": 1.01669395, "epoch": 0.47341049150759057, "flos": 22977300280320.0, "grad_norm": 2.008489158926519, "language_loss": 0.73071843, "learning_rate": 2.26849961190881e-06, "loss": 0.75548267, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.2130127, "step": 7874, "time_per_iteration": 2.8827805519104004 }, { "auxiliary_loss_clip": 0.01454186, "auxiliary_loss_mlp": 0.01040357, "balance_loss_clip": 1.27540898, "balance_loss_mlp": 1.01837516, "epoch": 0.47347061476025853, "flos": 14546692448640.0, "grad_norm": 2.187710296549013, "language_loss": 0.65941012, "learning_rate": 2.26811367073266e-06, "loss": 0.68435556, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 1.78710938, "router_z_loss_mlp": 0.21972656, "step": 7875, "time_per_iteration": 2.8408305644989014 }, { "auxiliary_loss_clip": 0.01461833, "auxiliary_loss_mlp": 0.0104557, "balance_loss_clip": 1.28527653, "balance_loss_mlp": 1.02444649, "epoch": 0.4735307380129265, "flos": 30275400049920.0, "grad_norm": 2.2797913752014085, "language_loss": 0.81907171, "learning_rate": 2.2677277193896125e-06, "loss": 0.84414572, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 1.76757812, "router_z_loss_mlp": 0.21118164, "step": 7876, "time_per_iteration": 2.9677083492279053 }, { "auxiliary_loss_clip": 0.0144617, "auxiliary_loss_mlp": 0.01042673, "balance_loss_clip": 1.2691648, "balance_loss_mlp": 1.02020216, "epoch": 0.47359086126559446, "flos": 19400515620480.0, "grad_norm": 1.8039945578345826, "language_loss": 0.7937628, "learning_rate": 2.267341757894304e-06, "loss": 0.81865126, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.22473145, "step": 7877, "time_per_iteration": 2.8970017433166504 }, { "auxiliary_loss_clip": 0.01428498, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.25511026, "balance_loss_mlp": 1.01936996, "epoch": 0.47365098451826243, "flos": 21948077393280.0, "grad_norm": 1.8742675027664282, "language_loss": 0.72131073, "learning_rate": 2.2669557862613685e-06, "loss": 0.74600422, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.21459961, "step": 7878, "time_per_iteration": 4.311206102371216 }, { "auxiliary_loss_clip": 0.01443194, "auxiliary_loss_mlp": 0.010381, "balance_loss_clip": 1.27007389, "balance_loss_mlp": 1.01602221, "epoch": 0.4737111077709304, "flos": 25855168227840.0, "grad_norm": 1.6640024102260982, "language_loss": 0.75815666, "learning_rate": 2.2665698045054425e-06, "loss": 0.78296959, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.22058105, "step": 7879, "time_per_iteration": 2.892313003540039 }, { "auxiliary_loss_clip": 0.01240752, "auxiliary_loss_mlp": 0.01020961, "balance_loss_clip": 1.14393878, "balance_loss_mlp": 1.00007522, "epoch": 0.47377123102359836, "flos": 67789710391680.0, "grad_norm": 0.7446899127067215, "language_loss": 0.61290729, "learning_rate": 2.266183812641164e-06, "loss": 0.63552445, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.20898438, "step": 7880, "time_per_iteration": 3.355663776397705 }, { "auxiliary_loss_clip": 0.01431736, "auxiliary_loss_mlp": 0.0103941, "balance_loss_clip": 1.25914729, "balance_loss_mlp": 1.01722515, "epoch": 0.4738313542762663, "flos": 24326558772480.0, "grad_norm": 1.7297080364130981, "language_loss": 0.69108129, "learning_rate": 2.2657978106831675e-06, "loss": 0.71579278, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22192383, "step": 7881, "time_per_iteration": 2.8771960735321045 }, { "auxiliary_loss_clip": 0.01428056, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.25631428, "balance_loss_mlp": 1.01589537, "epoch": 0.4738914775289343, "flos": 20715360802560.0, "grad_norm": 2.262164047963832, "language_loss": 0.78132993, "learning_rate": 2.265411798646092e-06, "loss": 0.80598098, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21130371, "step": 7882, "time_per_iteration": 2.875046968460083 }, { "auxiliary_loss_clip": 0.01445618, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.26925242, "balance_loss_mlp": 1.01612782, "epoch": 0.4739516007816023, "flos": 25457030818560.0, "grad_norm": 1.5454473865535945, "language_loss": 0.77410281, "learning_rate": 2.2650257765445747e-06, "loss": 0.79893601, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.21582031, "step": 7883, "time_per_iteration": 2.9306907653808594 }, { "auxiliary_loss_clip": 0.01441907, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.26768684, "balance_loss_mlp": 1.0135442, "epoch": 0.4740117240342703, "flos": 19983840572160.0, "grad_norm": 1.7032897891962955, "language_loss": 0.72521055, "learning_rate": 2.2646397443932525e-06, "loss": 0.74997365, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20837402, "step": 7884, "time_per_iteration": 2.8358917236328125 }, { "auxiliary_loss_clip": 0.01463034, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.28226089, "balance_loss_mlp": 1.01565742, "epoch": 0.47407184728693824, "flos": 15667482107520.0, "grad_norm": 2.099785369363229, "language_loss": 0.82768476, "learning_rate": 2.2642537022067655e-06, "loss": 0.85269129, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 1.80664062, "router_z_loss_mlp": 0.21960449, "step": 7885, "time_per_iteration": 2.8035881519317627 }, { "auxiliary_loss_clip": 0.01463686, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.28825617, "balance_loss_mlp": 1.01820803, "epoch": 0.4741319705396062, "flos": 18598177998720.0, "grad_norm": 2.0142737391927517, "language_loss": 0.74412048, "learning_rate": 2.263867649999751e-06, "loss": 0.76914823, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.20898438, "step": 7886, "time_per_iteration": 2.8094804286956787 }, { "auxiliary_loss_clip": 0.01468862, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.2857281, "balance_loss_mlp": 1.01524425, "epoch": 0.47419209379227417, "flos": 13268884775040.0, "grad_norm": 1.796130149320026, "language_loss": 0.74930668, "learning_rate": 2.263481587786849e-06, "loss": 0.77438056, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 1.83300781, "router_z_loss_mlp": 0.23266602, "step": 7887, "time_per_iteration": 5.634202003479004 }, { "auxiliary_loss_clip": 0.01441568, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.2678895, "balance_loss_mlp": 1.01627946, "epoch": 0.47425221704494214, "flos": 20052712437120.0, "grad_norm": 1.7511428226217989, "language_loss": 0.77547526, "learning_rate": 2.2630955155826993e-06, "loss": 0.80026567, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21203613, "step": 7888, "time_per_iteration": 4.364242792129517 }, { "auxiliary_loss_clip": 0.01446215, "auxiliary_loss_mlp": 0.01036469, "balance_loss_clip": 1.27005291, "balance_loss_mlp": 1.01514304, "epoch": 0.4743123402976101, "flos": 27283388175360.0, "grad_norm": 1.6691236659237736, "language_loss": 0.73672771, "learning_rate": 2.2627094334019406e-06, "loss": 0.76155454, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.21325684, "step": 7889, "time_per_iteration": 2.898346424102783 }, { "auxiliary_loss_clip": 0.01230387, "auxiliary_loss_mlp": 0.01021554, "balance_loss_clip": 1.1312865, "balance_loss_mlp": 0.99961996, "epoch": 0.47437246355027807, "flos": 55420041634560.0, "grad_norm": 0.7246164590249049, "language_loss": 0.56165278, "learning_rate": 2.262323341259214e-06, "loss": 0.58417213, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.21972656, "step": 7890, "time_per_iteration": 3.407794713973999 }, { "auxiliary_loss_clip": 0.01457518, "auxiliary_loss_mlp": 0.01036735, "balance_loss_clip": 1.27838063, "balance_loss_mlp": 1.01503849, "epoch": 0.47443258680294603, "flos": 23889031125120.0, "grad_norm": 1.9573746809417971, "language_loss": 0.66129559, "learning_rate": 2.2619372391691605e-06, "loss": 0.68623817, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21691895, "step": 7891, "time_per_iteration": 2.88854718208313 }, { "auxiliary_loss_clip": 0.01468378, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.2857573, "balance_loss_mlp": 1.01909852, "epoch": 0.474492710055614, "flos": 21986924693760.0, "grad_norm": 2.456255908336512, "language_loss": 0.71390045, "learning_rate": 2.26155112714642e-06, "loss": 0.73899007, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 1.82519531, "router_z_loss_mlp": 0.21484375, "step": 7892, "time_per_iteration": 2.926445484161377 }, { "auxiliary_loss_clip": 0.01231542, "auxiliary_loss_mlp": 0.01019317, "balance_loss_clip": 1.13426185, "balance_loss_mlp": 0.99900389, "epoch": 0.47455283330828196, "flos": 62588225024640.0, "grad_norm": 0.8102391583503529, "language_loss": 0.58629745, "learning_rate": 2.2611650052056355e-06, "loss": 0.60880601, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 0.97265625, "router_z_loss_mlp": 0.203125, "step": 7893, "time_per_iteration": 3.3877816200256348 }, { "auxiliary_loss_clip": 0.01452198, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.27670634, "balance_loss_mlp": 1.01404572, "epoch": 0.47461295656094993, "flos": 12101148996480.0, "grad_norm": 2.0851220742836096, "language_loss": 0.78656888, "learning_rate": 2.2607788733614463e-06, "loss": 0.81143558, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20422363, "step": 7894, "time_per_iteration": 2.8438351154327393 }, { "auxiliary_loss_clip": 0.01453912, "auxiliary_loss_mlp": 0.01036181, "balance_loss_clip": 1.27813804, "balance_loss_mlp": 1.01529539, "epoch": 0.4746730798136179, "flos": 20893806869760.0, "grad_norm": 2.224788415487396, "language_loss": 0.75619018, "learning_rate": 2.260392731628497e-06, "loss": 0.78109109, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20874023, "step": 7895, "time_per_iteration": 2.8740763664245605 }, { "auxiliary_loss_clip": 0.01458239, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.28126752, "balance_loss_mlp": 1.01762259, "epoch": 0.4747332030662859, "flos": 19984428754560.0, "grad_norm": 1.9145926299658675, "language_loss": 0.83680999, "learning_rate": 2.260006580021429e-06, "loss": 0.8617754, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.20690918, "step": 7896, "time_per_iteration": 2.8380517959594727 }, { "auxiliary_loss_clip": 0.01466762, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.29035354, "balance_loss_mlp": 1.01648378, "epoch": 0.4747933263189539, "flos": 16042743630720.0, "grad_norm": 1.8297255476216812, "language_loss": 0.76886177, "learning_rate": 2.259620418554886e-06, "loss": 0.79390121, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.20678711, "step": 7897, "time_per_iteration": 2.825946092605591 }, { "auxiliary_loss_clip": 0.01472292, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.29081774, "balance_loss_mlp": 1.01726341, "epoch": 0.47485344957162184, "flos": 13963277007360.0, "grad_norm": 1.875892332269221, "language_loss": 0.65271294, "learning_rate": 2.25923424724351e-06, "loss": 0.67782331, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 1.81347656, "router_z_loss_mlp": 0.21496582, "step": 7898, "time_per_iteration": 2.8085544109344482 }, { "auxiliary_loss_clip": 0.01450629, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.27292216, "balance_loss_mlp": 1.02156878, "epoch": 0.4749135728242898, "flos": 20458812931200.0, "grad_norm": 2.1585640874822087, "language_loss": 0.71862209, "learning_rate": 2.258848066101946e-06, "loss": 0.74355745, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.21337891, "step": 7899, "time_per_iteration": 2.9206736087799072 }, { "auxiliary_loss_clip": 0.0145496, "auxiliary_loss_mlp": 0.01042325, "balance_loss_clip": 1.27894425, "balance_loss_mlp": 1.02126098, "epoch": 0.4749736960769578, "flos": 28961821722240.0, "grad_norm": 1.8069131329959525, "language_loss": 0.69728923, "learning_rate": 2.258461875144837e-06, "loss": 0.72226208, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21069336, "step": 7900, "time_per_iteration": 3.010093927383423 }, { "auxiliary_loss_clip": 0.01456563, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.28090489, "balance_loss_mlp": 1.01812029, "epoch": 0.47503381932962574, "flos": 31950621216000.0, "grad_norm": 2.0233452839536494, "language_loss": 0.71084154, "learning_rate": 2.2580756743868273e-06, "loss": 0.73579955, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21118164, "step": 7901, "time_per_iteration": 2.923919439315796 }, { "auxiliary_loss_clip": 0.01443869, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.26960754, "balance_loss_mlp": 1.02358305, "epoch": 0.4750939425822937, "flos": 22137382212480.0, "grad_norm": 1.6989644654061156, "language_loss": 0.74870539, "learning_rate": 2.2576894638425636e-06, "loss": 0.77358824, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.20861816, "step": 7902, "time_per_iteration": 2.839940309524536 }, { "auxiliary_loss_clip": 0.0144367, "auxiliary_loss_mlp": 0.01040427, "balance_loss_clip": 1.27042079, "balance_loss_mlp": 1.02006602, "epoch": 0.47515406583496167, "flos": 20859846007680.0, "grad_norm": 1.876129177612612, "language_loss": 0.69224894, "learning_rate": 2.257303243526688e-06, "loss": 0.71708989, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.20349121, "step": 7903, "time_per_iteration": 2.845796585083008 }, { "auxiliary_loss_clip": 0.01438191, "auxiliary_loss_mlp": 0.01038333, "balance_loss_clip": 1.26548529, "balance_loss_mlp": 1.01904511, "epoch": 0.47521418908762963, "flos": 17532370051200.0, "grad_norm": 1.970702509167785, "language_loss": 0.72999454, "learning_rate": 2.256917013453848e-06, "loss": 0.75475979, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.19274902, "step": 7904, "time_per_iteration": 2.810915946960449 }, { "auxiliary_loss_clip": 0.01449906, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.27752423, "balance_loss_mlp": 1.01840472, "epoch": 0.4752743123402976, "flos": 20569427763840.0, "grad_norm": 1.5635333258125323, "language_loss": 0.86566007, "learning_rate": 2.25653077363869e-06, "loss": 0.89054108, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.19787598, "step": 7905, "time_per_iteration": 2.8209826946258545 }, { "auxiliary_loss_clip": 0.01426798, "auxiliary_loss_mlp": 0.01038568, "balance_loss_clip": 1.25825298, "balance_loss_mlp": 1.017802, "epoch": 0.47533443559296557, "flos": 26372109778560.0, "grad_norm": 1.5686102149324286, "language_loss": 0.83001417, "learning_rate": 2.2561445240958583e-06, "loss": 0.85466784, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20776367, "step": 7906, "time_per_iteration": 2.9096903800964355 }, { "auxiliary_loss_clip": 0.01230183, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.13349199, "balance_loss_mlp": 1.00776279, "epoch": 0.47539455884563353, "flos": 65981405710080.0, "grad_norm": 0.6690045123722127, "language_loss": 0.59081483, "learning_rate": 2.255758264840002e-06, "loss": 0.61342603, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.23144531, "step": 7907, "time_per_iteration": 3.474402666091919 }, { "auxiliary_loss_clip": 0.01438576, "auxiliary_loss_mlp": 0.01044248, "balance_loss_clip": 1.26537943, "balance_loss_mlp": 1.02310061, "epoch": 0.4754546820983015, "flos": 17246973980160.0, "grad_norm": 1.9371858244488456, "language_loss": 0.81604058, "learning_rate": 2.255371995885765e-06, "loss": 0.84086883, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21130371, "step": 7908, "time_per_iteration": 2.837373733520508 }, { "auxiliary_loss_clip": 0.01468486, "auxiliary_loss_mlp": 0.01044411, "balance_loss_clip": 1.29094028, "balance_loss_mlp": 1.02331138, "epoch": 0.47551480535096946, "flos": 19834695152640.0, "grad_norm": 1.751726343034224, "language_loss": 0.75060117, "learning_rate": 2.254985717247797e-06, "loss": 0.77573013, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.21105957, "step": 7909, "time_per_iteration": 2.8907501697540283 }, { "auxiliary_loss_clip": 0.01450541, "auxiliary_loss_mlp": 0.01039799, "balance_loss_clip": 1.27460253, "balance_loss_mlp": 1.01952159, "epoch": 0.4755749286036375, "flos": 22174148252160.0, "grad_norm": 1.9918412478014957, "language_loss": 0.7582044, "learning_rate": 2.2545994289407457e-06, "loss": 0.78310776, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.20275879, "step": 7910, "time_per_iteration": 2.9048752784729004 }, { "auxiliary_loss_clip": 0.01443253, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.2695545, "balance_loss_mlp": 1.01864696, "epoch": 0.47563505185630545, "flos": 21657070967040.0, "grad_norm": 3.6949576837529277, "language_loss": 0.79551566, "learning_rate": 2.2542131309792577e-06, "loss": 0.82033265, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.19799805, "step": 7911, "time_per_iteration": 2.9060709476470947 }, { "auxiliary_loss_clip": 0.01472042, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.2922883, "balance_loss_mlp": 1.02579832, "epoch": 0.4756951751089734, "flos": 20637847180800.0, "grad_norm": 1.5968336654195847, "language_loss": 0.76442933, "learning_rate": 2.253826823377983e-06, "loss": 0.78961587, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.20812988, "step": 7912, "time_per_iteration": 2.8346428871154785 }, { "auxiliary_loss_clip": 0.01455963, "auxiliary_loss_mlp": 0.01044631, "balance_loss_clip": 1.27979958, "balance_loss_mlp": 1.02407932, "epoch": 0.4757552983616414, "flos": 25859873687040.0, "grad_norm": 1.4753427927154625, "language_loss": 0.75012153, "learning_rate": 2.253440506151569e-06, "loss": 0.77512747, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.20556641, "step": 7913, "time_per_iteration": 4.36723256111145 }, { "auxiliary_loss_clip": 0.01452514, "auxiliary_loss_mlp": 0.01040511, "balance_loss_clip": 1.27838588, "balance_loss_mlp": 1.02061498, "epoch": 0.47581542161430934, "flos": 18232055925120.0, "grad_norm": 2.3119915412960705, "language_loss": 0.73447669, "learning_rate": 2.253054179314666e-06, "loss": 0.75940692, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.19909668, "step": 7914, "time_per_iteration": 2.8622288703918457 }, { "auxiliary_loss_clip": 0.01453504, "auxiliary_loss_mlp": 0.01044605, "balance_loss_clip": 1.27739096, "balance_loss_mlp": 1.02340949, "epoch": 0.4758755448669773, "flos": 21589692180480.0, "grad_norm": 2.169745988239635, "language_loss": 0.65088075, "learning_rate": 2.2526678428819227e-06, "loss": 0.67586184, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.21191406, "step": 7915, "time_per_iteration": 2.850184202194214 }, { "auxiliary_loss_clip": 0.01430904, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.2609638, "balance_loss_mlp": 1.01778531, "epoch": 0.47593566811964527, "flos": 15239410623360.0, "grad_norm": 1.9366611695726872, "language_loss": 0.77618688, "learning_rate": 2.2522814968679896e-06, "loss": 0.80087644, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20263672, "step": 7916, "time_per_iteration": 2.835258960723877 }, { "auxiliary_loss_clip": 0.01433478, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.26300013, "balance_loss_mlp": 1.01854348, "epoch": 0.47599579137231324, "flos": 21553152364800.0, "grad_norm": 1.7387117118691906, "language_loss": 0.64461434, "learning_rate": 2.2518951412875173e-06, "loss": 0.6693362, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20178223, "step": 7917, "time_per_iteration": 2.837278366088867 }, { "auxiliary_loss_clip": 0.01226785, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.13236141, "balance_loss_mlp": 1.01349092, "epoch": 0.4760559146249812, "flos": 64583753264640.0, "grad_norm": 0.8420057263394259, "language_loss": 0.65787745, "learning_rate": 2.2515087761551557e-06, "loss": 0.68053198, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.25195312, "step": 7918, "time_per_iteration": 3.3946964740753174 }, { "auxiliary_loss_clip": 0.014478, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.27267015, "balance_loss_mlp": 1.01800728, "epoch": 0.47611603787764917, "flos": 22243291585920.0, "grad_norm": 1.5341470990763908, "language_loss": 0.69664377, "learning_rate": 2.2511224014855563e-06, "loss": 0.72151798, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.21606445, "step": 7919, "time_per_iteration": 2.862159490585327 }, { "auxiliary_loss_clip": 0.01457729, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.28084755, "balance_loss_mlp": 1.01581955, "epoch": 0.47617616113031713, "flos": 22789262315520.0, "grad_norm": 2.5850042652875778, "language_loss": 0.75568521, "learning_rate": 2.2507360172933694e-06, "loss": 0.78063381, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21313477, "step": 7920, "time_per_iteration": 2.9414336681365967 }, { "auxiliary_loss_clip": 0.01469862, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.28970337, "balance_loss_mlp": 1.01684403, "epoch": 0.4762362843829851, "flos": 24144402631680.0, "grad_norm": 1.5001670311001594, "language_loss": 0.78286862, "learning_rate": 2.2503496235932487e-06, "loss": 0.80796206, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 1.80078125, "router_z_loss_mlp": 0.22668457, "step": 7921, "time_per_iteration": 2.8941493034362793 }, { "auxiliary_loss_clip": 0.01447172, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.26932287, "balance_loss_mlp": 1.0164032, "epoch": 0.47629640763565306, "flos": 22461354115200.0, "grad_norm": 1.5246718678344429, "language_loss": 0.78848642, "learning_rate": 2.249963220399845e-06, "loss": 0.8133328, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.21057129, "step": 7922, "time_per_iteration": 4.306033372879028 }, { "auxiliary_loss_clip": 0.01459002, "auxiliary_loss_mlp": 0.01034372, "balance_loss_clip": 1.27929962, "balance_loss_mlp": 1.01287889, "epoch": 0.4763565308883211, "flos": 11188648990080.0, "grad_norm": 1.6914768447248065, "language_loss": 0.73693967, "learning_rate": 2.2495768077278104e-06, "loss": 0.76187336, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.21496582, "step": 7923, "time_per_iteration": 4.273958206176758 }, { "auxiliary_loss_clip": 0.01455436, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.27922797, "balance_loss_mlp": 1.0149883, "epoch": 0.47641665414098905, "flos": 22392210781440.0, "grad_norm": 1.7814305703928432, "language_loss": 0.8314504, "learning_rate": 2.2491903855917992e-06, "loss": 0.85636753, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.21289062, "step": 7924, "time_per_iteration": 4.3541576862335205 }, { "auxiliary_loss_clip": 0.01471961, "auxiliary_loss_mlp": 0.01040658, "balance_loss_clip": 1.29211628, "balance_loss_mlp": 1.01868773, "epoch": 0.476476777393657, "flos": 25057038372480.0, "grad_norm": 1.8756625482568685, "language_loss": 0.81860507, "learning_rate": 2.2488039540064626e-06, "loss": 0.84373128, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.21960449, "step": 7925, "time_per_iteration": 2.9104220867156982 }, { "auxiliary_loss_clip": 0.01447694, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.27109289, "balance_loss_mlp": 1.0151757, "epoch": 0.476536900646325, "flos": 27280899711360.0, "grad_norm": 1.7793373890005506, "language_loss": 0.73208791, "learning_rate": 2.2484175129864558e-06, "loss": 0.75692129, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.20471191, "step": 7926, "time_per_iteration": 2.9189493656158447 }, { "auxiliary_loss_clip": 0.0147246, "auxiliary_loss_mlp": 0.01042099, "balance_loss_clip": 1.29299951, "balance_loss_mlp": 1.01985502, "epoch": 0.47659702389899294, "flos": 25312319389440.0, "grad_norm": 2.257584878145908, "language_loss": 0.69917536, "learning_rate": 2.248031062546432e-06, "loss": 0.72432101, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.22253418, "step": 7927, "time_per_iteration": 2.8907370567321777 }, { "auxiliary_loss_clip": 0.01450685, "auxiliary_loss_mlp": 0.0103753, "balance_loss_clip": 1.27743602, "balance_loss_mlp": 1.01728868, "epoch": 0.4766571471516609, "flos": 26003499240960.0, "grad_norm": 1.8358093499029962, "language_loss": 0.6835041, "learning_rate": 2.247644602701045e-06, "loss": 0.70838624, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20239258, "step": 7928, "time_per_iteration": 2.9191734790802 }, { "auxiliary_loss_clip": 0.01449735, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.27239752, "balance_loss_mlp": 1.01872373, "epoch": 0.4767172704043289, "flos": 16040255166720.0, "grad_norm": 2.3106231676446054, "language_loss": 0.79905987, "learning_rate": 2.2472581334649496e-06, "loss": 0.82397795, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.23327637, "step": 7929, "time_per_iteration": 2.8347578048706055 }, { "auxiliary_loss_clip": 0.01444249, "auxiliary_loss_mlp": 0.01041076, "balance_loss_clip": 1.27113712, "balance_loss_mlp": 1.01951075, "epoch": 0.47677739365699684, "flos": 39248947209600.0, "grad_norm": 1.818756884187019, "language_loss": 0.67673957, "learning_rate": 2.2468716548528016e-06, "loss": 0.7015928, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21569824, "step": 7930, "time_per_iteration": 3.027539014816284 }, { "auxiliary_loss_clip": 0.01437974, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.2644248, "balance_loss_mlp": 1.01952112, "epoch": 0.4768375169096648, "flos": 24728587234560.0, "grad_norm": 1.9001083619453858, "language_loss": 0.80587333, "learning_rate": 2.2464851668792555e-06, "loss": 0.8306576, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20935059, "step": 7931, "time_per_iteration": 2.970187187194824 }, { "auxiliary_loss_clip": 0.01455683, "auxiliary_loss_mlp": 0.01040805, "balance_loss_clip": 1.27845311, "balance_loss_mlp": 1.02038443, "epoch": 0.47689764016233277, "flos": 22538912981760.0, "grad_norm": 1.9819938362330696, "language_loss": 0.77207291, "learning_rate": 2.2460986695589678e-06, "loss": 0.79703772, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.20422363, "step": 7932, "time_per_iteration": 2.842827558517456 }, { "auxiliary_loss_clip": 0.01437083, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.26441312, "balance_loss_mlp": 1.01616406, "epoch": 0.47695776341500074, "flos": 15128569566720.0, "grad_norm": 1.783631545389019, "language_loss": 0.81141335, "learning_rate": 2.245712162906593e-06, "loss": 0.83615851, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21264648, "step": 7933, "time_per_iteration": 3.0048182010650635 }, { "auxiliary_loss_clip": 0.0145493, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.27382267, "balance_loss_mlp": 1.01640463, "epoch": 0.4770178866676687, "flos": 14685748277760.0, "grad_norm": 1.7891748806098162, "language_loss": 0.74494922, "learning_rate": 2.2453256469367888e-06, "loss": 0.76989508, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.23254395, "step": 7934, "time_per_iteration": 2.881333112716675 }, { "auxiliary_loss_clip": 0.01457388, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.27836823, "balance_loss_mlp": 1.01628268, "epoch": 0.47707800992033667, "flos": 22575950490240.0, "grad_norm": 2.0148548665650843, "language_loss": 0.80967647, "learning_rate": 2.244939121664211e-06, "loss": 0.83462739, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.21411133, "step": 7935, "time_per_iteration": 2.934932231903076 }, { "auxiliary_loss_clip": 0.01474209, "auxiliary_loss_mlp": 0.01040096, "balance_loss_clip": 1.29099, "balance_loss_mlp": 1.0171361, "epoch": 0.4771381331730047, "flos": 30930085330560.0, "grad_norm": 1.895183297126345, "language_loss": 0.71792078, "learning_rate": 2.2445525871035177e-06, "loss": 0.74306381, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 1.83203125, "router_z_loss_mlp": 0.22937012, "step": 7936, "time_per_iteration": 3.001569986343384 }, { "auxiliary_loss_clip": 0.01456553, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.27553093, "balance_loss_mlp": 1.01547837, "epoch": 0.47719825642567265, "flos": 25749847036800.0, "grad_norm": 2.396897277723704, "language_loss": 0.68382281, "learning_rate": 2.2441660432693656e-06, "loss": 0.70875323, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 1.81054688, "router_z_loss_mlp": 0.21008301, "step": 7937, "time_per_iteration": 2.8742074966430664 }, { "auxiliary_loss_clip": 0.01234936, "auxiliary_loss_mlp": 0.01071162, "balance_loss_clip": 1.14122164, "balance_loss_mlp": 1.05018103, "epoch": 0.4772583796783406, "flos": 66388049141760.0, "grad_norm": 0.7194995213311575, "language_loss": 0.5652529, "learning_rate": 2.2437794901764128e-06, "loss": 0.58831382, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20996094, "step": 7938, "time_per_iteration": 3.533442735671997 }, { "auxiliary_loss_clip": 0.01441096, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.26679194, "balance_loss_mlp": 1.01601017, "epoch": 0.4773185029310086, "flos": 22060366283520.0, "grad_norm": 1.8075248938717292, "language_loss": 0.89444637, "learning_rate": 2.243392927839317e-06, "loss": 0.91923392, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.21643066, "step": 7939, "time_per_iteration": 2.840416193008423 }, { "auxiliary_loss_clip": 0.01450332, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.27415764, "balance_loss_mlp": 1.01402736, "epoch": 0.47737862618367655, "flos": 16736638170240.0, "grad_norm": 2.796284533793052, "language_loss": 0.78930116, "learning_rate": 2.2430063562727367e-06, "loss": 0.81416303, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.21826172, "step": 7940, "time_per_iteration": 2.9140732288360596 }, { "auxiliary_loss_clip": 0.01436832, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.26485133, "balance_loss_mlp": 1.01330817, "epoch": 0.4774387494363445, "flos": 19618849618560.0, "grad_norm": 1.667016976341965, "language_loss": 0.85843933, "learning_rate": 2.2426197754913322e-06, "loss": 0.88315183, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21105957, "step": 7941, "time_per_iteration": 2.8956892490386963 }, { "auxiliary_loss_clip": 0.01460285, "auxiliary_loss_mlp": 0.01043984, "balance_loss_clip": 1.28053987, "balance_loss_mlp": 1.02038026, "epoch": 0.4774988726890125, "flos": 16662563153280.0, "grad_norm": 3.6040550853451783, "language_loss": 0.76558268, "learning_rate": 2.24223318550976e-06, "loss": 0.79062539, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 1.796875, "router_z_loss_mlp": 0.23583984, "step": 7942, "time_per_iteration": 2.8286314010620117 }, { "auxiliary_loss_clip": 0.01457229, "auxiliary_loss_mlp": 0.01040316, "balance_loss_clip": 1.28000283, "balance_loss_mlp": 1.02007437, "epoch": 0.47755899594168044, "flos": 20495307502080.0, "grad_norm": 1.651941950532469, "language_loss": 0.64448071, "learning_rate": 2.241846586342682e-06, "loss": 0.66945618, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.20239258, "step": 7943, "time_per_iteration": 2.8506343364715576 }, { "auxiliary_loss_clip": 0.01477128, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.2948966, "balance_loss_mlp": 1.01763988, "epoch": 0.4776191191943484, "flos": 21662771811840.0, "grad_norm": 1.8410164717280832, "language_loss": 0.74352181, "learning_rate": 2.2414599780047577e-06, "loss": 0.76868814, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.21862793, "step": 7944, "time_per_iteration": 2.862076997756958 }, { "auxiliary_loss_clip": 0.01448337, "auxiliary_loss_mlp": 0.01041772, "balance_loss_clip": 1.27118826, "balance_loss_mlp": 1.02006435, "epoch": 0.4776792424470164, "flos": 18779383998720.0, "grad_norm": 2.010855858882736, "language_loss": 0.69227648, "learning_rate": 2.2410733605106456e-06, "loss": 0.71717763, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.21716309, "step": 7945, "time_per_iteration": 2.8539230823516846 }, { "auxiliary_loss_clip": 0.01446507, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.26834524, "balance_loss_mlp": 1.01606417, "epoch": 0.47773936569968434, "flos": 29727800507520.0, "grad_norm": 1.8045056132554278, "language_loss": 0.76118743, "learning_rate": 2.240686733875009e-06, "loss": 0.78602302, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.21008301, "step": 7946, "time_per_iteration": 2.9083118438720703 }, { "auxiliary_loss_clip": 0.01449317, "auxiliary_loss_mlp": 0.01042397, "balance_loss_clip": 1.27118063, "balance_loss_mlp": 1.02073646, "epoch": 0.4777994889523523, "flos": 24802571761920.0, "grad_norm": 1.8017225958195104, "language_loss": 0.80142123, "learning_rate": 2.240300098112506e-06, "loss": 0.82633829, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.21643066, "step": 7947, "time_per_iteration": 2.895531177520752 }, { "auxiliary_loss_clip": 0.0144208, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.26919341, "balance_loss_mlp": 1.01466036, "epoch": 0.47785961220502027, "flos": 17867110216320.0, "grad_norm": 2.1117316091413256, "language_loss": 0.74433041, "learning_rate": 2.2399134532377998e-06, "loss": 0.76911843, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.2208252, "step": 7948, "time_per_iteration": 4.248170614242554 }, { "auxiliary_loss_clip": 0.01453769, "auxiliary_loss_mlp": 0.01037888, "balance_loss_clip": 1.27648318, "balance_loss_mlp": 1.01616859, "epoch": 0.4779197354576883, "flos": 20276204342400.0, "grad_norm": 1.5064424941895973, "language_loss": 0.79096341, "learning_rate": 2.2395267992655514e-06, "loss": 0.81587994, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.21716309, "step": 7949, "time_per_iteration": 2.837682008743286 }, { "auxiliary_loss_clip": 0.01440064, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.26607645, "balance_loss_mlp": 1.0196619, "epoch": 0.47797985871035625, "flos": 17065632245760.0, "grad_norm": 2.278572798472134, "language_loss": 0.74984539, "learning_rate": 2.2391401362104227e-06, "loss": 0.77466154, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.21887207, "step": 7950, "time_per_iteration": 2.870100498199463 }, { "auxiliary_loss_clip": 0.01444432, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.26934147, "balance_loss_mlp": 1.02218902, "epoch": 0.4780399819630242, "flos": 31371006337920.0, "grad_norm": 1.6198737308840891, "language_loss": 0.75231057, "learning_rate": 2.2387534640870756e-06, "loss": 0.77719939, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.22229004, "step": 7951, "time_per_iteration": 2.9302005767822266 }, { "auxiliary_loss_clip": 0.01453093, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.27359128, "balance_loss_mlp": 1.01524758, "epoch": 0.4781001052156922, "flos": 24910290927360.0, "grad_norm": 1.919109942092921, "language_loss": 0.8090899, "learning_rate": 2.238366782910174e-06, "loss": 0.83399296, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 1.79394531, "router_z_loss_mlp": 0.21984863, "step": 7952, "time_per_iteration": 2.887119770050049 }, { "auxiliary_loss_clip": 0.01467143, "auxiliary_loss_mlp": 0.01040345, "balance_loss_clip": 1.28840065, "balance_loss_mlp": 1.01873231, "epoch": 0.47816022846836015, "flos": 18706983039360.0, "grad_norm": 1.7156258351692608, "language_loss": 0.79525322, "learning_rate": 2.23798009269438e-06, "loss": 0.82032812, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.21606445, "step": 7953, "time_per_iteration": 2.8589110374450684 }, { "auxiliary_loss_clip": 0.01464405, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.28268421, "balance_loss_mlp": 1.01912224, "epoch": 0.4782203517210281, "flos": 11983340240640.0, "grad_norm": 2.767307700880212, "language_loss": 0.84688824, "learning_rate": 2.2375933934543566e-06, "loss": 0.87193513, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 1.81542969, "router_z_loss_mlp": 0.21154785, "step": 7954, "time_per_iteration": 2.786742925643921 }, { "auxiliary_loss_clip": 0.01441048, "auxiliary_loss_mlp": 0.01039343, "balance_loss_clip": 1.2660948, "balance_loss_mlp": 1.01708639, "epoch": 0.4782804749736961, "flos": 20823215702400.0, "grad_norm": 1.6622117051563683, "language_loss": 0.70623767, "learning_rate": 2.237206685204768e-06, "loss": 0.73104155, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.22265625, "step": 7955, "time_per_iteration": 2.880290985107422 }, { "auxiliary_loss_clip": 0.0146181, "auxiliary_loss_mlp": 0.01043032, "balance_loss_clip": 1.28369248, "balance_loss_mlp": 1.02183652, "epoch": 0.47834059822636404, "flos": 23850410048640.0, "grad_norm": 1.6727667272234268, "language_loss": 0.82222223, "learning_rate": 2.2368199679602787e-06, "loss": 0.84727061, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.21203613, "step": 7956, "time_per_iteration": 2.927103281021118 }, { "auxiliary_loss_clip": 0.0144786, "auxiliary_loss_mlp": 0.0103951, "balance_loss_clip": 1.27360499, "balance_loss_mlp": 1.01751566, "epoch": 0.478400721479032, "flos": 22642922073600.0, "grad_norm": 1.8429657620329316, "language_loss": 0.85649341, "learning_rate": 2.2364332417355516e-06, "loss": 0.88136709, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21984863, "step": 7957, "time_per_iteration": 4.283450603485107 }, { "auxiliary_loss_clip": 0.01443054, "auxiliary_loss_mlp": 0.01039922, "balance_loss_clip": 1.26830482, "balance_loss_mlp": 1.01932263, "epoch": 0.4784608447317, "flos": 19364518742400.0, "grad_norm": 1.7889230765108997, "language_loss": 0.80462468, "learning_rate": 2.2360465065452527e-06, "loss": 0.82945442, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.20593262, "step": 7958, "time_per_iteration": 4.287928581237793 }, { "auxiliary_loss_clip": 0.0145318, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.2766856, "balance_loss_mlp": 1.02347922, "epoch": 0.47852096798436794, "flos": 24030846887040.0, "grad_norm": 11.208309163051378, "language_loss": 0.84226263, "learning_rate": 2.235659762404047e-06, "loss": 0.86724669, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.2175293, "step": 7959, "time_per_iteration": 4.277459144592285 }, { "auxiliary_loss_clip": 0.01439043, "auxiliary_loss_mlp": 0.0103548, "balance_loss_clip": 1.26930642, "balance_loss_mlp": 1.01577437, "epoch": 0.4785810912370359, "flos": 25677491322240.0, "grad_norm": 2.917568435913246, "language_loss": 0.73711342, "learning_rate": 2.235273009326599e-06, "loss": 0.76185858, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.19714355, "step": 7960, "time_per_iteration": 2.893531322479248 }, { "auxiliary_loss_clip": 0.01448327, "auxiliary_loss_mlp": 0.01039817, "balance_loss_clip": 1.27592111, "balance_loss_mlp": 1.01901448, "epoch": 0.47864121448970387, "flos": 21441587391360.0, "grad_norm": 1.6984373747146715, "language_loss": 0.77541584, "learning_rate": 2.2348862473275745e-06, "loss": 0.80029726, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20788574, "step": 7961, "time_per_iteration": 2.814842462539673 }, { "auxiliary_loss_clip": 0.01446014, "auxiliary_loss_mlp": 0.01037752, "balance_loss_clip": 1.2703042, "balance_loss_mlp": 1.01718807, "epoch": 0.47870133774237184, "flos": 16152544056960.0, "grad_norm": 1.8826411917429637, "language_loss": 0.78348994, "learning_rate": 2.2344994764216405e-06, "loss": 0.80832767, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.20556641, "step": 7962, "time_per_iteration": 2.8784334659576416 }, { "auxiliary_loss_clip": 0.01454218, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.27687478, "balance_loss_mlp": 1.01837635, "epoch": 0.47876146099503986, "flos": 26918125752960.0, "grad_norm": 2.1610304750723026, "language_loss": 0.65879983, "learning_rate": 2.2341126966234635e-06, "loss": 0.68373072, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.20495605, "step": 7963, "time_per_iteration": 2.944908380508423 }, { "auxiliary_loss_clip": 0.0145986, "auxiliary_loss_mlp": 0.01041042, "balance_loss_clip": 1.28285408, "balance_loss_mlp": 1.01963246, "epoch": 0.4788215842477078, "flos": 45348200760960.0, "grad_norm": 1.9829093081648996, "language_loss": 0.78575063, "learning_rate": 2.2337259079477083e-06, "loss": 0.81075966, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.21398926, "step": 7964, "time_per_iteration": 3.041947364807129 }, { "auxiliary_loss_clip": 0.01465915, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.28495955, "balance_loss_mlp": 1.01917756, "epoch": 0.4788817075003758, "flos": 22247454107520.0, "grad_norm": 1.825409086212718, "language_loss": 0.76801038, "learning_rate": 2.233339110409044e-06, "loss": 0.79308629, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 1.80859375, "router_z_loss_mlp": 0.22509766, "step": 7965, "time_per_iteration": 2.902519464492798 }, { "auxiliary_loss_clip": 0.01454327, "auxiliary_loss_mlp": 0.01039608, "balance_loss_clip": 1.27725518, "balance_loss_mlp": 1.01764953, "epoch": 0.47894183075304375, "flos": 16479637850880.0, "grad_norm": 1.571762812802044, "language_loss": 0.75843501, "learning_rate": 2.232952304022137e-06, "loss": 0.78337443, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21972656, "step": 7966, "time_per_iteration": 2.827443838119507 }, { "auxiliary_loss_clip": 0.01459112, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 1.28224015, "balance_loss_mlp": 1.01657212, "epoch": 0.4790019540057117, "flos": 24294090988800.0, "grad_norm": 1.707138716418712, "language_loss": 0.73791134, "learning_rate": 2.232565488801655e-06, "loss": 0.7628864, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.21838379, "step": 7967, "time_per_iteration": 3.0499303340911865 }, { "auxiliary_loss_clip": 0.0144345, "auxiliary_loss_mlp": 0.01040986, "balance_loss_clip": 1.27343225, "balance_loss_mlp": 1.01915908, "epoch": 0.4790620772583797, "flos": 25677536567040.0, "grad_norm": 1.827366769135906, "language_loss": 0.803002, "learning_rate": 2.232178664762267e-06, "loss": 0.82784629, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.21826172, "step": 7968, "time_per_iteration": 2.9276950359344482 }, { "auxiliary_loss_clip": 0.01233186, "auxiliary_loss_mlp": 0.01049447, "balance_loss_clip": 1.13905478, "balance_loss_mlp": 1.0280844, "epoch": 0.47912220051104765, "flos": 69463908910080.0, "grad_norm": 0.7638946134702306, "language_loss": 0.62269735, "learning_rate": 2.2317918319186408e-06, "loss": 0.64552367, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 0.21386719, "step": 7969, "time_per_iteration": 3.5951247215270996 }, { "auxiliary_loss_clip": 0.01444823, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.27207911, "balance_loss_mlp": 1.0215739, "epoch": 0.4791823237637156, "flos": 24179177900160.0, "grad_norm": 1.417795177806257, "language_loss": 0.78075999, "learning_rate": 2.2314049902854446e-06, "loss": 0.8056457, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.22167969, "step": 7970, "time_per_iteration": 2.953460693359375 }, { "auxiliary_loss_clip": 0.01446467, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.27104092, "balance_loss_mlp": 1.01418185, "epoch": 0.4792424470163836, "flos": 24761552711040.0, "grad_norm": 1.8780800365876704, "language_loss": 0.71320045, "learning_rate": 2.231018139877349e-06, "loss": 0.73801446, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20751953, "step": 7971, "time_per_iteration": 2.8848326206207275 }, { "auxiliary_loss_clip": 0.0144755, "auxiliary_loss_mlp": 0.01037135, "balance_loss_clip": 1.27413845, "balance_loss_mlp": 1.01542699, "epoch": 0.47930257026905154, "flos": 23268216216960.0, "grad_norm": 1.493220119932062, "language_loss": 0.80697685, "learning_rate": 2.230631280709021e-06, "loss": 0.83182371, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21716309, "step": 7972, "time_per_iteration": 2.8771238327026367 }, { "auxiliary_loss_clip": 0.01462834, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.28429544, "balance_loss_mlp": 1.01025009, "epoch": 0.4793626935217195, "flos": 14072172537600.0, "grad_norm": 2.0718755723732185, "language_loss": 0.70957828, "learning_rate": 2.2302444127951327e-06, "loss": 0.73452026, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.21118164, "step": 7973, "time_per_iteration": 2.8069820404052734 }, { "auxiliary_loss_clip": 0.01437785, "auxiliary_loss_mlp": 0.01044017, "balance_loss_clip": 1.26695621, "balance_loss_mlp": 1.02155757, "epoch": 0.4794228167743875, "flos": 21808885829760.0, "grad_norm": 2.114884285014367, "language_loss": 0.7959547, "learning_rate": 2.2298575361503523e-06, "loss": 0.82077277, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22436523, "step": 7974, "time_per_iteration": 2.8390238285064697 }, { "auxiliary_loss_clip": 0.01226348, "auxiliary_loss_mlp": 0.01024539, "balance_loss_clip": 1.13285351, "balance_loss_mlp": 1.00756347, "epoch": 0.47948294002705544, "flos": 67000177048320.0, "grad_norm": 0.75831902757407, "language_loss": 0.5411014, "learning_rate": 2.2294706507893517e-06, "loss": 0.56361026, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.16992188, "step": 7975, "time_per_iteration": 3.3998100757598877 }, { "auxiliary_loss_clip": 0.01474574, "auxiliary_loss_mlp": 0.01036599, "balance_loss_clip": 1.29262376, "balance_loss_mlp": 1.01553464, "epoch": 0.47954306327972346, "flos": 12430188316800.0, "grad_norm": 2.1778087503371535, "language_loss": 0.90651679, "learning_rate": 2.2290837567268008e-06, "loss": 0.93162853, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 1.8203125, "router_z_loss_mlp": 0.21069336, "step": 7976, "time_per_iteration": 2.8015477657318115 }, { "auxiliary_loss_clip": 0.01469731, "auxiliary_loss_mlp": 0.01039762, "balance_loss_clip": 1.28772855, "balance_loss_mlp": 1.01689792, "epoch": 0.4796031865323914, "flos": 18369211472640.0, "grad_norm": 2.3958547476113803, "language_loss": 0.7490648, "learning_rate": 2.2286968539773713e-06, "loss": 0.77415973, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 1.81738281, "router_z_loss_mlp": 0.22888184, "step": 7977, "time_per_iteration": 2.851747512817383 }, { "auxiliary_loss_clip": 0.01445232, "auxiliary_loss_mlp": 0.0103578, "balance_loss_clip": 1.27186, "balance_loss_mlp": 1.01534748, "epoch": 0.4796633097850594, "flos": 21845063687040.0, "grad_norm": 1.713877891740713, "language_loss": 0.78898185, "learning_rate": 2.228309942555734e-06, "loss": 0.81379199, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.2043457, "step": 7978, "time_per_iteration": 2.894744873046875 }, { "auxiliary_loss_clip": 0.01463166, "auxiliary_loss_mlp": 0.01038293, "balance_loss_clip": 1.2849896, "balance_loss_mlp": 1.01656127, "epoch": 0.47972343303772735, "flos": 23446933752960.0, "grad_norm": 1.6682053843263283, "language_loss": 0.89809173, "learning_rate": 2.22792302247656e-06, "loss": 0.92310631, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.21728516, "step": 7979, "time_per_iteration": 2.885093927383423 }, { "auxiliary_loss_clip": 0.01465277, "auxiliary_loss_mlp": 0.01041016, "balance_loss_clip": 1.285537, "balance_loss_mlp": 1.01904571, "epoch": 0.4797835562903953, "flos": 24910517151360.0, "grad_norm": 1.5204224289895303, "language_loss": 0.77336419, "learning_rate": 2.227536093754523e-06, "loss": 0.7984271, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 1.79980469, "router_z_loss_mlp": 0.21972656, "step": 7980, "time_per_iteration": 2.9058990478515625 }, { "auxiliary_loss_clip": 0.01472954, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.29174662, "balance_loss_mlp": 1.0196507, "epoch": 0.4798436795430633, "flos": 35056415059200.0, "grad_norm": 1.759323580275435, "language_loss": 0.72995806, "learning_rate": 2.227149156404295e-06, "loss": 0.75510764, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 0.22351074, "step": 7981, "time_per_iteration": 2.9926531314849854 }, { "auxiliary_loss_clip": 0.0145242, "auxiliary_loss_mlp": 0.01038568, "balance_loss_clip": 1.27875364, "balance_loss_mlp": 1.0179565, "epoch": 0.47990380279573125, "flos": 20598728411520.0, "grad_norm": 1.923699508825224, "language_loss": 0.70587063, "learning_rate": 2.2267622104405473e-06, "loss": 0.73078048, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.20617676, "step": 7982, "time_per_iteration": 2.895615577697754 }, { "auxiliary_loss_clip": 0.01435514, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.2658844, "balance_loss_mlp": 1.01508439, "epoch": 0.4799639260483992, "flos": 26370571455360.0, "grad_norm": 1.6099331029287876, "language_loss": 0.71994317, "learning_rate": 2.2263752558779544e-06, "loss": 0.74465036, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20117188, "step": 7983, "time_per_iteration": 4.352653503417969 }, { "auxiliary_loss_clip": 0.01230113, "auxiliary_loss_mlp": 0.01051785, "balance_loss_clip": 1.1367898, "balance_loss_mlp": 1.03604925, "epoch": 0.4800240493010672, "flos": 71010598302720.0, "grad_norm": 0.8321816553551329, "language_loss": 0.59458911, "learning_rate": 2.2259882927311883e-06, "loss": 0.6174081, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.15722656, "step": 7984, "time_per_iteration": 3.3027122020721436 }, { "auxiliary_loss_clip": 0.01437008, "auxiliary_loss_mlp": 0.01041783, "balance_loss_clip": 1.26417375, "balance_loss_mlp": 1.02014637, "epoch": 0.48008417255373514, "flos": 17094027997440.0, "grad_norm": 1.5909223358865534, "language_loss": 0.67351359, "learning_rate": 2.2256013210149247e-06, "loss": 0.69830149, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.21630859, "step": 7985, "time_per_iteration": 2.854088306427002 }, { "auxiliary_loss_clip": 0.01461293, "auxiliary_loss_mlp": 0.01043303, "balance_loss_clip": 1.28187227, "balance_loss_mlp": 1.02091599, "epoch": 0.4801442958064031, "flos": 15421702498560.0, "grad_norm": 1.813739014653087, "language_loss": 0.71286267, "learning_rate": 2.225214340743835e-06, "loss": 0.7379086, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.22387695, "step": 7986, "time_per_iteration": 2.9303128719329834 }, { "auxiliary_loss_clip": 0.01458847, "auxiliary_loss_mlp": 0.01039261, "balance_loss_clip": 1.27966392, "balance_loss_mlp": 1.01774335, "epoch": 0.4802044190590711, "flos": 11481193739520.0, "grad_norm": 1.9943469183220444, "language_loss": 0.80038142, "learning_rate": 2.2248273519325956e-06, "loss": 0.82536244, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.21508789, "step": 7987, "time_per_iteration": 2.8450844287872314 }, { "auxiliary_loss_clip": 0.01447119, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.2713213, "balance_loss_mlp": 1.02040792, "epoch": 0.48026454231173904, "flos": 20958561457920.0, "grad_norm": 1.9799738798781703, "language_loss": 0.76637465, "learning_rate": 2.2244403545958812e-06, "loss": 0.79125243, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20251465, "step": 7988, "time_per_iteration": 2.8418564796447754 }, { "auxiliary_loss_clip": 0.01449707, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.27320051, "balance_loss_mlp": 1.01756454, "epoch": 0.48032466556440706, "flos": 20457591321600.0, "grad_norm": 2.1556683668002603, "language_loss": 0.79941022, "learning_rate": 2.224053348748365e-06, "loss": 0.82429403, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.21118164, "step": 7989, "time_per_iteration": 2.850249767303467 }, { "auxiliary_loss_clip": 0.01469762, "auxiliary_loss_mlp": 0.01039781, "balance_loss_clip": 1.28891683, "balance_loss_mlp": 1.01853776, "epoch": 0.480384788817075, "flos": 37134298114560.0, "grad_norm": 1.7690563828819914, "language_loss": 0.74414825, "learning_rate": 2.223666334404724e-06, "loss": 0.76924366, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.21240234, "step": 7990, "time_per_iteration": 3.0854408740997314 }, { "auxiliary_loss_clip": 0.01231484, "auxiliary_loss_mlp": 0.01023448, "balance_loss_clip": 1.1378274, "balance_loss_mlp": 1.00132227, "epoch": 0.480444912069743, "flos": 69582876048000.0, "grad_norm": 0.7696504984305313, "language_loss": 0.59093881, "learning_rate": 2.223279311579633e-06, "loss": 0.6134882, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.22167969, "step": 7991, "time_per_iteration": 3.4493937492370605 }, { "auxiliary_loss_clip": 0.01440855, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.26621222, "balance_loss_mlp": 1.0137291, "epoch": 0.48050503532241096, "flos": 29833438412160.0, "grad_norm": 2.4762385066957813, "language_loss": 0.67651904, "learning_rate": 2.222892280287768e-06, "loss": 0.70127338, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.20849609, "step": 7992, "time_per_iteration": 4.352416038513184 }, { "auxiliary_loss_clip": 0.01461452, "auxiliary_loss_mlp": 0.01044538, "balance_loss_clip": 1.28223157, "balance_loss_mlp": 1.02368808, "epoch": 0.4805651585750789, "flos": 23958355438080.0, "grad_norm": 1.8820432129740612, "language_loss": 0.76919591, "learning_rate": 2.2225052405438056e-06, "loss": 0.79425585, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.20861816, "step": 7993, "time_per_iteration": 4.377902030944824 }, { "auxiliary_loss_clip": 0.01439263, "auxiliary_loss_mlp": 0.0104072, "balance_loss_clip": 1.26773238, "balance_loss_mlp": 1.0201447, "epoch": 0.4806252818277469, "flos": 25676224467840.0, "grad_norm": 1.9353126819328703, "language_loss": 0.78956151, "learning_rate": 2.222118192362422e-06, "loss": 0.81436127, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.20556641, "step": 7994, "time_per_iteration": 2.9501264095306396 }, { "auxiliary_loss_clip": 0.01440736, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.26568949, "balance_loss_mlp": 1.01641679, "epoch": 0.48068540508041485, "flos": 13159853510400.0, "grad_norm": 1.8356234087053929, "language_loss": 0.80145812, "learning_rate": 2.2217311357582946e-06, "loss": 0.82623887, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.20910645, "step": 7995, "time_per_iteration": 2.850233316421509 }, { "auxiliary_loss_clip": 0.01441428, "auxiliary_loss_mlp": 0.01044477, "balance_loss_clip": 1.2687633, "balance_loss_mlp": 1.02247095, "epoch": 0.4807455283330828, "flos": 21186125395200.0, "grad_norm": 1.9971995138691423, "language_loss": 0.83392429, "learning_rate": 2.2213440707461e-06, "loss": 0.85878336, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22009277, "step": 7996, "time_per_iteration": 2.8977081775665283 }, { "auxiliary_loss_clip": 0.01432743, "auxiliary_loss_mlp": 0.01040707, "balance_loss_clip": 1.26266611, "balance_loss_mlp": 1.01860571, "epoch": 0.4808056515857508, "flos": 12283531361280.0, "grad_norm": 16.175787720793505, "language_loss": 0.81336981, "learning_rate": 2.220956997340516e-06, "loss": 0.83810425, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22094727, "step": 7997, "time_per_iteration": 2.873546838760376 }, { "auxiliary_loss_clip": 0.01452646, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.27685905, "balance_loss_mlp": 1.01999462, "epoch": 0.48086577483841875, "flos": 24836577868800.0, "grad_norm": 1.685274737077479, "language_loss": 0.73144317, "learning_rate": 2.220569915556221e-06, "loss": 0.75639242, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.22265625, "step": 7998, "time_per_iteration": 2.944410562515259 }, { "auxiliary_loss_clip": 0.01431101, "auxiliary_loss_mlp": 0.01043078, "balance_loss_clip": 1.25838649, "balance_loss_mlp": 1.0202378, "epoch": 0.4809258980910867, "flos": 24476880556800.0, "grad_norm": 1.7481417099559415, "language_loss": 0.71442926, "learning_rate": 2.220182825407892e-06, "loss": 0.73917103, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.2286377, "step": 7999, "time_per_iteration": 3.049927234649658 }, { "auxiliary_loss_clip": 0.01454771, "auxiliary_loss_mlp": 0.01046498, "balance_loss_clip": 1.27593374, "balance_loss_mlp": 1.02358603, "epoch": 0.4809860213437547, "flos": 21226058570880.0, "grad_norm": 1.5224006786718538, "language_loss": 0.72325689, "learning_rate": 2.2197957269102083e-06, "loss": 0.74826956, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 1.79199219, "router_z_loss_mlp": 0.22912598, "step": 8000, "time_per_iteration": 2.9187307357788086 }, { "auxiliary_loss_clip": 0.01452473, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.27656174, "balance_loss_mlp": 1.01813626, "epoch": 0.48104614459642264, "flos": 37645810289280.0, "grad_norm": 1.4690997171397149, "language_loss": 0.75139773, "learning_rate": 2.2194086200778485e-06, "loss": 0.77632916, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.22546387, "step": 8001, "time_per_iteration": 3.0310871601104736 }, { "auxiliary_loss_clip": 0.01455787, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.27804601, "balance_loss_mlp": 1.02248371, "epoch": 0.48110626784909066, "flos": 18415388430720.0, "grad_norm": 1.6703034474858074, "language_loss": 0.82015562, "learning_rate": 2.219021504925493e-06, "loss": 0.84515512, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.21691895, "step": 8002, "time_per_iteration": 2.88570237159729 }, { "auxiliary_loss_clip": 0.0145258, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.27460814, "balance_loss_mlp": 1.01823831, "epoch": 0.48116639110175863, "flos": 28451712136320.0, "grad_norm": 1.7372421682889956, "language_loss": 0.72100884, "learning_rate": 2.218634381467819e-06, "loss": 0.74594748, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.23046875, "step": 8003, "time_per_iteration": 2.928603172302246 }, { "auxiliary_loss_clip": 0.0142792, "auxiliary_loss_mlp": 0.01039762, "balance_loss_clip": 1.25713599, "balance_loss_mlp": 1.01792276, "epoch": 0.4812265143544266, "flos": 21735082281600.0, "grad_norm": 1.6178341630289752, "language_loss": 0.82884675, "learning_rate": 2.218247249719507e-06, "loss": 0.85352361, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21826172, "step": 8004, "time_per_iteration": 2.9434335231781006 }, { "auxiliary_loss_clip": 0.01476651, "auxiliary_loss_mlp": 0.01042934, "balance_loss_clip": 1.29080272, "balance_loss_mlp": 1.02029634, "epoch": 0.48128663760709456, "flos": 13232480693760.0, "grad_norm": 1.9785219840643586, "language_loss": 0.78148961, "learning_rate": 2.217860109695239e-06, "loss": 0.80668539, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 1.85839844, "router_z_loss_mlp": 0.2265625, "step": 8005, "time_per_iteration": 2.853102445602417 }, { "auxiliary_loss_clip": 0.01456384, "auxiliary_loss_mlp": 0.01039028, "balance_loss_clip": 1.27821374, "balance_loss_mlp": 1.01799941, "epoch": 0.4813467608597625, "flos": 24254112568320.0, "grad_norm": 1.761310330873654, "language_loss": 0.71587288, "learning_rate": 2.217472961409692e-06, "loss": 0.74082708, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21044922, "step": 8006, "time_per_iteration": 2.9403984546661377 }, { "auxiliary_loss_clip": 0.01452385, "auxiliary_loss_mlp": 0.01038208, "balance_loss_clip": 1.27412009, "balance_loss_mlp": 1.01665509, "epoch": 0.4814068841124305, "flos": 27489279853440.0, "grad_norm": 1.9187621192169098, "language_loss": 0.71263802, "learning_rate": 2.2170858048775495e-06, "loss": 0.73754394, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.2154541, "step": 8007, "time_per_iteration": 2.895547389984131 }, { "auxiliary_loss_clip": 0.0146015, "auxiliary_loss_mlp": 0.01037177, "balance_loss_clip": 1.28154206, "balance_loss_mlp": 1.01549256, "epoch": 0.48146700736509845, "flos": 19582264558080.0, "grad_norm": 1.6640914707682575, "language_loss": 0.72706449, "learning_rate": 2.2166986401134914e-06, "loss": 0.7520377, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.21679688, "step": 8008, "time_per_iteration": 2.9506442546844482 }, { "auxiliary_loss_clip": 0.01457586, "auxiliary_loss_mlp": 0.01042161, "balance_loss_clip": 1.2792033, "balance_loss_mlp": 1.01932037, "epoch": 0.4815271306177664, "flos": 20636580326400.0, "grad_norm": 1.8103298055598507, "language_loss": 0.61591613, "learning_rate": 2.216311467132199e-06, "loss": 0.64091349, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 1.78320312, "router_z_loss_mlp": 0.22851562, "step": 8009, "time_per_iteration": 2.937575578689575 }, { "auxiliary_loss_clip": 0.01231813, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.13711929, "balance_loss_mlp": 1.003703, "epoch": 0.4815872538704344, "flos": 67720702792320.0, "grad_norm": 0.8805211965501725, "language_loss": 0.61471403, "learning_rate": 2.2159242859483547e-06, "loss": 0.63729799, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.22851562, "step": 8010, "time_per_iteration": 3.417440891265869 }, { "auxiliary_loss_clip": 0.01446432, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.27107453, "balance_loss_mlp": 1.01566339, "epoch": 0.48164737712310235, "flos": 22831141017600.0, "grad_norm": 1.6237337707487096, "language_loss": 0.74107701, "learning_rate": 2.215537096576639e-06, "loss": 0.76593781, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.23986816, "step": 8011, "time_per_iteration": 2.9076569080352783 }, { "auxiliary_loss_clip": 0.01440414, "auxiliary_loss_mlp": 0.01038125, "balance_loss_clip": 1.26746809, "balance_loss_mlp": 1.01658404, "epoch": 0.4817075003757703, "flos": 23744726899200.0, "grad_norm": 1.773482835948781, "language_loss": 0.80109239, "learning_rate": 2.2151498990317354e-06, "loss": 0.82587779, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.21533203, "step": 8012, "time_per_iteration": 2.9070706367492676 }, { "auxiliary_loss_clip": 0.01455092, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.27848315, "balance_loss_mlp": 1.0162226, "epoch": 0.4817676236284383, "flos": 28194349858560.0, "grad_norm": 1.718084716160457, "language_loss": 0.74468762, "learning_rate": 2.214762693328326e-06, "loss": 0.7696265, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.22595215, "step": 8013, "time_per_iteration": 2.9592089653015137 }, { "auxiliary_loss_clip": 0.01452941, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.27726054, "balance_loss_mlp": 1.01774359, "epoch": 0.48182774688110624, "flos": 17100452759040.0, "grad_norm": 1.8581532077794292, "language_loss": 0.90977663, "learning_rate": 2.214375479481094e-06, "loss": 0.93470323, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21984863, "step": 8014, "time_per_iteration": 2.866194248199463 }, { "auxiliary_loss_clip": 0.01458032, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.27855456, "balance_loss_mlp": 1.02150929, "epoch": 0.4818878701337742, "flos": 12575487928320.0, "grad_norm": 2.0689594427714146, "language_loss": 0.7545805, "learning_rate": 2.213988257504722e-06, "loss": 0.77960587, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 1.79589844, "router_z_loss_mlp": 0.23010254, "step": 8015, "time_per_iteration": 2.970508098602295 }, { "auxiliary_loss_clip": 0.0147471, "auxiliary_loss_mlp": 0.01041855, "balance_loss_clip": 1.2908293, "balance_loss_mlp": 1.0193249, "epoch": 0.48194799338644223, "flos": 24619329745920.0, "grad_norm": 2.5640191400150147, "language_loss": 0.80913389, "learning_rate": 2.213601027413894e-06, "loss": 0.8342995, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 1.83789062, "router_z_loss_mlp": 0.2253418, "step": 8016, "time_per_iteration": 2.927116870880127 }, { "auxiliary_loss_clip": 0.01436485, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.26443505, "balance_loss_mlp": 1.01681173, "epoch": 0.4820081166391102, "flos": 21114946045440.0, "grad_norm": 1.8637497119065356, "language_loss": 0.78400362, "learning_rate": 2.2132137892232933e-06, "loss": 0.80875075, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21398926, "step": 8017, "time_per_iteration": 4.364032983779907 }, { "auxiliary_loss_clip": 0.01441151, "auxiliary_loss_mlp": 0.01038007, "balance_loss_clip": 1.26871276, "balance_loss_mlp": 1.01575017, "epoch": 0.48206823989177816, "flos": 25275417615360.0, "grad_norm": 1.8641081324352222, "language_loss": 0.80884123, "learning_rate": 2.2128265429476043e-06, "loss": 0.83363283, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.22253418, "step": 8018, "time_per_iteration": 2.912993907928467 }, { "auxiliary_loss_clip": 0.01466144, "auxiliary_loss_mlp": 0.01043754, "balance_loss_clip": 1.28629899, "balance_loss_mlp": 1.02351272, "epoch": 0.4821283631444461, "flos": 24655914806400.0, "grad_norm": 1.8991504010230011, "language_loss": 0.76844895, "learning_rate": 2.2124392886015124e-06, "loss": 0.79354793, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 1.79785156, "router_z_loss_mlp": 0.20227051, "step": 8019, "time_per_iteration": 2.9922773838043213 }, { "auxiliary_loss_clip": 0.01451761, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.27387071, "balance_loss_mlp": 1.02099097, "epoch": 0.4821884863971141, "flos": 23962834673280.0, "grad_norm": 1.651683129202949, "language_loss": 0.79810244, "learning_rate": 2.212052026199701e-06, "loss": 0.82304347, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.21337891, "step": 8020, "time_per_iteration": 3.019916296005249 }, { "auxiliary_loss_clip": 0.0142945, "auxiliary_loss_mlp": 0.01045968, "balance_loss_clip": 1.25740147, "balance_loss_mlp": 1.02478433, "epoch": 0.48224860964978206, "flos": 17168329238400.0, "grad_norm": 10.718134904980115, "language_loss": 0.70137709, "learning_rate": 2.211664755756855e-06, "loss": 0.72613126, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21191406, "step": 8021, "time_per_iteration": 2.939615249633789 }, { "auxiliary_loss_clip": 0.01474348, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.29206014, "balance_loss_mlp": 1.02315307, "epoch": 0.48230873290245, "flos": 23086195810560.0, "grad_norm": 1.864012865195003, "language_loss": 0.64054352, "learning_rate": 2.2112774772876603e-06, "loss": 0.66574156, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 1.82421875, "router_z_loss_mlp": 0.22290039, "step": 8022, "time_per_iteration": 2.8991527557373047 }, { "auxiliary_loss_clip": 0.01436965, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.26431179, "balance_loss_mlp": 1.01695263, "epoch": 0.482368856155118, "flos": 19362708950400.0, "grad_norm": 2.8418351332735634, "language_loss": 0.67140746, "learning_rate": 2.2108901908068028e-06, "loss": 0.6961503, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.20373535, "step": 8023, "time_per_iteration": 2.8752830028533936 }, { "auxiliary_loss_clip": 0.01453627, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.27809834, "balance_loss_mlp": 1.01846886, "epoch": 0.48242897940778595, "flos": 20087261481600.0, "grad_norm": 1.8058208886102474, "language_loss": 0.77542478, "learning_rate": 2.2105028963289683e-06, "loss": 0.80036139, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21557617, "step": 8024, "time_per_iteration": 2.9372446537017822 }, { "auxiliary_loss_clip": 0.01450113, "auxiliary_loss_mlp": 0.01044104, "balance_loss_clip": 1.27362776, "balance_loss_mlp": 1.02051187, "epoch": 0.4824891026604539, "flos": 23414375479680.0, "grad_norm": 1.5713528217868877, "language_loss": 0.75878501, "learning_rate": 2.2101155938688423e-06, "loss": 0.78372711, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.23596191, "step": 8025, "time_per_iteration": 2.895030975341797 }, { "auxiliary_loss_clip": 0.01437201, "auxiliary_loss_mlp": 0.01042406, "balance_loss_clip": 1.26308405, "balance_loss_mlp": 1.02103186, "epoch": 0.4825492259131219, "flos": 20376458115840.0, "grad_norm": 1.9107915944120655, "language_loss": 0.71817243, "learning_rate": 2.209728283441112e-06, "loss": 0.74296856, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21374512, "step": 8026, "time_per_iteration": 2.8466641902923584 }, { "auxiliary_loss_clip": 0.01453411, "auxiliary_loss_mlp": 0.01047567, "balance_loss_clip": 1.27448559, "balance_loss_mlp": 1.02571642, "epoch": 0.48260934916578985, "flos": 14327046351360.0, "grad_norm": 2.287695772785659, "language_loss": 0.76144081, "learning_rate": 2.209340965060465e-06, "loss": 0.78645062, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.21838379, "step": 8027, "time_per_iteration": 5.129932165145874 }, { "auxiliary_loss_clip": 0.01446108, "auxiliary_loss_mlp": 0.01044907, "balance_loss_clip": 1.26997089, "balance_loss_mlp": 1.02305627, "epoch": 0.4826694724184578, "flos": 22129690596480.0, "grad_norm": 1.6297460318866361, "language_loss": 0.6856913, "learning_rate": 2.2089536387415868e-06, "loss": 0.71060145, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.21862793, "step": 8028, "time_per_iteration": 4.345627069473267 }, { "auxiliary_loss_clip": 0.01445677, "auxiliary_loss_mlp": 0.01038641, "balance_loss_clip": 1.27115953, "balance_loss_mlp": 1.01670611, "epoch": 0.48272959567112583, "flos": 16190034013440.0, "grad_norm": 1.694295402777243, "language_loss": 0.74138522, "learning_rate": 2.2085663044991655e-06, "loss": 0.76622844, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.21936035, "step": 8029, "time_per_iteration": 2.8974924087524414 }, { "auxiliary_loss_clip": 0.01451874, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.2736274, "balance_loss_mlp": 1.01528645, "epoch": 0.4827897189237938, "flos": 23189571475200.0, "grad_norm": 1.9546767708510602, "language_loss": 0.84901345, "learning_rate": 2.2081789623478896e-06, "loss": 0.8739239, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.2388916, "step": 8030, "time_per_iteration": 2.9009616374969482 }, { "auxiliary_loss_clip": 0.01437047, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.2634902, "balance_loss_mlp": 1.01754391, "epoch": 0.48284984217646176, "flos": 21662545587840.0, "grad_norm": 2.0742542092825196, "language_loss": 0.74431676, "learning_rate": 2.2077916123024466e-06, "loss": 0.76908308, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.22045898, "step": 8031, "time_per_iteration": 2.8582918643951416 }, { "auxiliary_loss_clip": 0.01473567, "auxiliary_loss_mlp": 0.01042698, "balance_loss_clip": 1.29095435, "balance_loss_mlp": 1.02019143, "epoch": 0.48290996542912973, "flos": 31479947112960.0, "grad_norm": 1.5975229955690644, "language_loss": 0.72770518, "learning_rate": 2.2074042543775245e-06, "loss": 0.75286782, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 1.82714844, "router_z_loss_mlp": 0.22521973, "step": 8032, "time_per_iteration": 2.93294095993042 }, { "auxiliary_loss_clip": 0.01440138, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.26621902, "balance_loss_mlp": 1.01105213, "epoch": 0.4829700886817977, "flos": 24472582300800.0, "grad_norm": 1.38540646951421, "language_loss": 0.75057137, "learning_rate": 2.2070168885878126e-06, "loss": 0.77530259, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21936035, "step": 8033, "time_per_iteration": 2.960073471069336 }, { "auxiliary_loss_clip": 0.01468895, "auxiliary_loss_mlp": 0.01043461, "balance_loss_clip": 1.28767824, "balance_loss_mlp": 1.02180076, "epoch": 0.48303021193446566, "flos": 25713035752320.0, "grad_norm": 3.0041930592867563, "language_loss": 0.84033597, "learning_rate": 2.2066295149479996e-06, "loss": 0.86545956, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 1.81445312, "router_z_loss_mlp": 0.2166748, "step": 8034, "time_per_iteration": 2.8878417015075684 }, { "auxiliary_loss_clip": 0.01429851, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.25885224, "balance_loss_mlp": 1.01777136, "epoch": 0.4830903351871336, "flos": 20095134076800.0, "grad_norm": 1.9106655716120213, "language_loss": 0.80411536, "learning_rate": 2.2062421334727744e-06, "loss": 0.82881176, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.22021484, "step": 8035, "time_per_iteration": 2.891605854034424 }, { "auxiliary_loss_clip": 0.01437059, "auxiliary_loss_mlp": 0.01043221, "balance_loss_clip": 1.26182795, "balance_loss_mlp": 1.01885474, "epoch": 0.4831504584398016, "flos": 39465335681280.0, "grad_norm": 2.438277291074345, "language_loss": 0.70340228, "learning_rate": 2.2058547441768267e-06, "loss": 0.72820503, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.24365234, "step": 8036, "time_per_iteration": 2.9818954467773438 }, { "auxiliary_loss_clip": 0.01447352, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.27193117, "balance_loss_mlp": 1.02248573, "epoch": 0.48321058169246955, "flos": 20014996256640.0, "grad_norm": 1.9817730381958845, "language_loss": 0.73786223, "learning_rate": 2.205467347074847e-06, "loss": 0.76276982, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.20910645, "step": 8037, "time_per_iteration": 2.845090866088867 }, { "auxiliary_loss_clip": 0.01467061, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.28468919, "balance_loss_mlp": 1.01781321, "epoch": 0.4832707049451375, "flos": 20751402925440.0, "grad_norm": 2.0588494341109813, "language_loss": 0.70228326, "learning_rate": 2.205079942181525e-06, "loss": 0.72735345, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 1.82128906, "router_z_loss_mlp": 0.22143555, "step": 8038, "time_per_iteration": 2.9087917804718018 }, { "auxiliary_loss_clip": 0.01437536, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.26288116, "balance_loss_mlp": 1.01967251, "epoch": 0.4833308281978055, "flos": 33158471149440.0, "grad_norm": 1.5105880071092512, "language_loss": 0.79417861, "learning_rate": 2.20469252951155e-06, "loss": 0.81898779, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.23706055, "step": 8039, "time_per_iteration": 2.995439052581787 }, { "auxiliary_loss_clip": 0.01435852, "auxiliary_loss_mlp": 0.01041466, "balance_loss_clip": 1.26187956, "balance_loss_mlp": 1.02005601, "epoch": 0.48339095145047345, "flos": 19108649543040.0, "grad_norm": 1.639877376062746, "language_loss": 0.78355777, "learning_rate": 2.2043051090796143e-06, "loss": 0.80833101, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.21435547, "step": 8040, "time_per_iteration": 3.003995895385742 }, { "auxiliary_loss_clip": 0.01450697, "auxiliary_loss_mlp": 0.01040162, "balance_loss_clip": 1.27450788, "balance_loss_mlp": 1.01733363, "epoch": 0.4834510747031414, "flos": 34472773393920.0, "grad_norm": 1.4730316763011786, "language_loss": 0.76194346, "learning_rate": 2.203917680900409e-06, "loss": 0.786852, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.22839355, "step": 8041, "time_per_iteration": 2.9654502868652344 }, { "auxiliary_loss_clip": 0.0142614, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.25677872, "balance_loss_mlp": 1.01799703, "epoch": 0.48351119795580944, "flos": 27392238460800.0, "grad_norm": 2.232451265957086, "language_loss": 0.6772902, "learning_rate": 2.203530244988624e-06, "loss": 0.70195055, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21911621, "step": 8042, "time_per_iteration": 2.9001424312591553 }, { "auxiliary_loss_clip": 0.01259886, "auxiliary_loss_mlp": 0.01047734, "balance_loss_clip": 1.15897596, "balance_loss_mlp": 1.01912332, "epoch": 0.4835713212084774, "flos": 67173465208320.0, "grad_norm": 0.701833726374452, "language_loss": 0.58668327, "learning_rate": 2.2031428013589517e-06, "loss": 0.60975945, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.28515625, "step": 8043, "time_per_iteration": 3.4702389240264893 }, { "auxiliary_loss_clip": 0.01448629, "auxiliary_loss_mlp": 0.01044365, "balance_loss_clip": 1.27165091, "balance_loss_mlp": 1.02190566, "epoch": 0.48363144446114537, "flos": 17976412949760.0, "grad_norm": 3.1943441738863907, "language_loss": 0.73546666, "learning_rate": 2.2027553500260847e-06, "loss": 0.7603966, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 1.77050781, "router_z_loss_mlp": 0.2244873, "step": 8044, "time_per_iteration": 2.842984199523926 }, { "auxiliary_loss_clip": 0.01439386, "auxiliary_loss_mlp": 0.01037541, "balance_loss_clip": 1.26619899, "balance_loss_mlp": 1.01372313, "epoch": 0.48369156771381333, "flos": 20602890933120.0, "grad_norm": 1.5918803391866734, "language_loss": 0.76196301, "learning_rate": 2.202367891004714e-06, "loss": 0.78673232, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.23803711, "step": 8045, "time_per_iteration": 2.8632731437683105 }, { "auxiliary_loss_clip": 0.01448876, "auxiliary_loss_mlp": 0.01039652, "balance_loss_clip": 1.27292132, "balance_loss_mlp": 1.01753855, "epoch": 0.4837516909664813, "flos": 22685479447680.0, "grad_norm": 1.5220760521722976, "language_loss": 0.69723201, "learning_rate": 2.201980424309533e-06, "loss": 0.72211725, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.22106934, "step": 8046, "time_per_iteration": 2.8912031650543213 }, { "auxiliary_loss_clip": 0.01437976, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.26261163, "balance_loss_mlp": 1.01619804, "epoch": 0.48381181421914926, "flos": 25529160309120.0, "grad_norm": 1.751074824553134, "language_loss": 0.82639217, "learning_rate": 2.2015929499552337e-06, "loss": 0.85117066, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.23706055, "step": 8047, "time_per_iteration": 2.875175952911377 }, { "auxiliary_loss_clip": 0.01427702, "auxiliary_loss_mlp": 0.01037186, "balance_loss_clip": 1.25598717, "balance_loss_mlp": 1.01528692, "epoch": 0.4838719374718172, "flos": 24218296669440.0, "grad_norm": 1.8071422837358269, "language_loss": 0.80980414, "learning_rate": 2.2012054679565092e-06, "loss": 0.83445305, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21899414, "step": 8048, "time_per_iteration": 2.8864898681640625 }, { "auxiliary_loss_clip": 0.01459169, "auxiliary_loss_mlp": 0.0104431, "balance_loss_clip": 1.27935255, "balance_loss_mlp": 1.02088594, "epoch": 0.4839320607244852, "flos": 26735924367360.0, "grad_norm": 2.4998326903685397, "language_loss": 0.8234061, "learning_rate": 2.200817978328054e-06, "loss": 0.84844089, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.23425293, "step": 8049, "time_per_iteration": 2.925243377685547 }, { "auxiliary_loss_clip": 0.01438797, "auxiliary_loss_mlp": 0.010353, "balance_loss_clip": 1.26761174, "balance_loss_mlp": 1.01443839, "epoch": 0.48399218397715316, "flos": 20458586707200.0, "grad_norm": 1.8061718237111914, "language_loss": 0.73694795, "learning_rate": 2.2004304810845602e-06, "loss": 0.76168889, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20861816, "step": 8050, "time_per_iteration": 2.9233829975128174 }, { "auxiliary_loss_clip": 0.0124702, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.1496799, "balance_loss_mlp": 1.01739621, "epoch": 0.4840523072298211, "flos": 67210140758400.0, "grad_norm": 0.7091427012831094, "language_loss": 0.56415063, "learning_rate": 2.200042976240723e-06, "loss": 0.58709234, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.296875, "step": 8051, "time_per_iteration": 3.397515058517456 }, { "auxiliary_loss_clip": 0.01448871, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.272434, "balance_loss_mlp": 1.01644003, "epoch": 0.4841124304824891, "flos": 22419701637120.0, "grad_norm": 4.034535896796491, "language_loss": 0.76096451, "learning_rate": 2.199655463811236e-06, "loss": 0.78584731, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.2298584, "step": 8052, "time_per_iteration": 4.300498008728027 }, { "auxiliary_loss_clip": 0.01449626, "auxiliary_loss_mlp": 0.01037336, "balance_loss_clip": 1.27585959, "balance_loss_mlp": 1.014853, "epoch": 0.48417255373515705, "flos": 13851666789120.0, "grad_norm": 4.270077672584764, "language_loss": 0.67061222, "learning_rate": 2.1992679438107936e-06, "loss": 0.6954819, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.22497559, "step": 8053, "time_per_iteration": 2.833307981491089 }, { "auxiliary_loss_clip": 0.01439411, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.26779151, "balance_loss_mlp": 1.01767755, "epoch": 0.484232676987825, "flos": 31662193743360.0, "grad_norm": 2.61514655344533, "language_loss": 0.70877516, "learning_rate": 2.198880416254091e-06, "loss": 0.73357219, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.22607422, "step": 8054, "time_per_iteration": 2.925290822982788 }, { "auxiliary_loss_clip": 0.01438566, "auxiliary_loss_mlp": 0.01034911, "balance_loss_clip": 1.26523471, "balance_loss_mlp": 1.01347721, "epoch": 0.48429280024049304, "flos": 24105464841600.0, "grad_norm": 2.3256861410830854, "language_loss": 0.70055044, "learning_rate": 2.1984928811558233e-06, "loss": 0.72528523, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.21435547, "step": 8055, "time_per_iteration": 2.8612911701202393 }, { "auxiliary_loss_clip": 0.01460316, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.28302574, "balance_loss_mlp": 1.01813757, "epoch": 0.484352923493161, "flos": 17539021036800.0, "grad_norm": 2.6151571569005796, "language_loss": 0.63861001, "learning_rate": 2.198105338530685e-06, "loss": 0.66363382, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.23913574, "step": 8056, "time_per_iteration": 2.8068222999572754 }, { "auxiliary_loss_clip": 0.01441721, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.26638496, "balance_loss_mlp": 1.01711941, "epoch": 0.48441304674582897, "flos": 29178255438720.0, "grad_norm": 1.6464511326427886, "language_loss": 0.6823284, "learning_rate": 2.1977177883933726e-06, "loss": 0.70715487, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.23803711, "step": 8057, "time_per_iteration": 2.9068243503570557 }, { "auxiliary_loss_clip": 0.01425192, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.25356436, "balance_loss_mlp": 1.01505506, "epoch": 0.48447316999849693, "flos": 15894819820800.0, "grad_norm": 1.7583110916200952, "language_loss": 0.82154047, "learning_rate": 2.1973302307585827e-06, "loss": 0.84617937, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.23632812, "step": 8058, "time_per_iteration": 2.8482937812805176 }, { "auxiliary_loss_clip": 0.01456847, "auxiliary_loss_mlp": 0.01045728, "balance_loss_clip": 1.27584779, "balance_loss_mlp": 1.02263761, "epoch": 0.4845332932511649, "flos": 24390046506240.0, "grad_norm": 1.6365331806822792, "language_loss": 0.80985039, "learning_rate": 2.1969426656410097e-06, "loss": 0.83487618, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 1.80761719, "router_z_loss_mlp": 0.23095703, "step": 8059, "time_per_iteration": 2.8700923919677734 }, { "auxiliary_loss_clip": 0.01466689, "auxiliary_loss_mlp": 0.01047048, "balance_loss_clip": 1.2863214, "balance_loss_mlp": 1.02352798, "epoch": 0.48459341650383286, "flos": 37128823493760.0, "grad_norm": 2.2015483155723805, "language_loss": 0.66673303, "learning_rate": 2.196555093055352e-06, "loss": 0.69187033, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.23522949, "step": 8060, "time_per_iteration": 3.029085874557495 }, { "auxiliary_loss_clip": 0.01452095, "auxiliary_loss_mlp": 0.01043536, "balance_loss_clip": 1.276999, "balance_loss_mlp": 1.02089787, "epoch": 0.48465353975650083, "flos": 22977209790720.0, "grad_norm": 1.9607669267781869, "language_loss": 0.68190807, "learning_rate": 2.1961675130163046e-06, "loss": 0.70686436, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.22619629, "step": 8061, "time_per_iteration": 2.8707222938537598 }, { "auxiliary_loss_clip": 0.01457512, "auxiliary_loss_mlp": 0.01053537, "balance_loss_clip": 1.28130221, "balance_loss_mlp": 1.03027976, "epoch": 0.4847136630091688, "flos": 17715431088000.0, "grad_norm": 1.7687183212398327, "language_loss": 0.83119237, "learning_rate": 2.1957799255385653e-06, "loss": 0.85630298, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.23242188, "step": 8062, "time_per_iteration": 4.433736801147461 }, { "auxiliary_loss_clip": 0.01438039, "auxiliary_loss_mlp": 0.01044369, "balance_loss_clip": 1.26651371, "balance_loss_mlp": 1.02229166, "epoch": 0.48477378626183676, "flos": 22028486682240.0, "grad_norm": 1.5396134741962573, "language_loss": 0.74999231, "learning_rate": 2.1953923306368325e-06, "loss": 0.77481639, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.2208252, "step": 8063, "time_per_iteration": 5.73506760597229 }, { "auxiliary_loss_clip": 0.01444597, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.26917815, "balance_loss_mlp": 1.02211738, "epoch": 0.4848339095145047, "flos": 27974070334080.0, "grad_norm": 1.868383136602539, "language_loss": 0.79385269, "learning_rate": 2.1950047283258023e-06, "loss": 0.81873882, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.21911621, "step": 8064, "time_per_iteration": 2.885828971862793 }, { "auxiliary_loss_clip": 0.01436847, "auxiliary_loss_mlp": 0.01042506, "balance_loss_clip": 1.2658658, "balance_loss_mlp": 1.02065492, "epoch": 0.4848940327671727, "flos": 21698587710720.0, "grad_norm": 3.1285754680966034, "language_loss": 0.79690337, "learning_rate": 2.194617118620173e-06, "loss": 0.82169694, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21850586, "step": 8065, "time_per_iteration": 2.880333662033081 }, { "auxiliary_loss_clip": 0.01419716, "auxiliary_loss_mlp": 0.01044171, "balance_loss_clip": 1.25403714, "balance_loss_mlp": 1.02210593, "epoch": 0.48495415601984065, "flos": 20641376275200.0, "grad_norm": 2.8071562682373865, "language_loss": 0.76571661, "learning_rate": 2.194229501534644e-06, "loss": 0.7903555, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.22058105, "step": 8066, "time_per_iteration": 2.8433468341827393 }, { "auxiliary_loss_clip": 0.01443366, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.27160716, "balance_loss_mlp": 1.01988912, "epoch": 0.4850142792725086, "flos": 25638734511360.0, "grad_norm": 1.3686760721919282, "language_loss": 0.72368866, "learning_rate": 2.193841877083912e-06, "loss": 0.74854809, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.22692871, "step": 8067, "time_per_iteration": 2.891291618347168 }, { "auxiliary_loss_clip": 0.01446076, "auxiliary_loss_mlp": 0.01045048, "balance_loss_clip": 1.27179384, "balance_loss_mlp": 1.02349544, "epoch": 0.4850744025251766, "flos": 13779899256960.0, "grad_norm": 3.8965206340136644, "language_loss": 0.80188477, "learning_rate": 2.1934542452826767e-06, "loss": 0.82679605, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21557617, "step": 8068, "time_per_iteration": 2.813518524169922 }, { "auxiliary_loss_clip": 0.01442591, "auxiliary_loss_mlp": 0.01044814, "balance_loss_clip": 1.27020168, "balance_loss_mlp": 1.02285576, "epoch": 0.4851345257778446, "flos": 20269643846400.0, "grad_norm": 1.4686560825483552, "language_loss": 0.84847629, "learning_rate": 2.193066606145638e-06, "loss": 0.87335038, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21960449, "step": 8069, "time_per_iteration": 2.8704187870025635 }, { "auxiliary_loss_clip": 0.01438317, "auxiliary_loss_mlp": 0.0104364, "balance_loss_clip": 1.26679838, "balance_loss_mlp": 1.02228904, "epoch": 0.48519464903051257, "flos": 27101186789760.0, "grad_norm": 1.6504494422275644, "language_loss": 0.78307521, "learning_rate": 2.192678959687493e-06, "loss": 0.80789477, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21362305, "step": 8070, "time_per_iteration": 2.8899312019348145 }, { "auxiliary_loss_clip": 0.01435661, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.26295114, "balance_loss_mlp": 1.01773977, "epoch": 0.48525477228318054, "flos": 17135680475520.0, "grad_norm": 2.252941075151758, "language_loss": 0.79155713, "learning_rate": 2.192291305922943e-06, "loss": 0.8163141, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.22290039, "step": 8071, "time_per_iteration": 2.826347827911377 }, { "auxiliary_loss_clip": 0.01445201, "auxiliary_loss_mlp": 0.01044551, "balance_loss_clip": 1.26935768, "balance_loss_mlp": 1.02105451, "epoch": 0.4853148955358485, "flos": 28191092232960.0, "grad_norm": 2.006240430079787, "language_loss": 0.7280674, "learning_rate": 2.1919036448666873e-06, "loss": 0.75296485, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.23510742, "step": 8072, "time_per_iteration": 2.919768810272217 }, { "auxiliary_loss_clip": 0.01457559, "auxiliary_loss_mlp": 0.01043555, "balance_loss_clip": 1.28111303, "balance_loss_mlp": 1.02227569, "epoch": 0.48537501878851647, "flos": 17502164507520.0, "grad_norm": 1.9385614401495233, "language_loss": 0.88705039, "learning_rate": 2.1915159765334262e-06, "loss": 0.91206157, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.21276855, "step": 8073, "time_per_iteration": 2.8192501068115234 }, { "auxiliary_loss_clip": 0.01429108, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.26142657, "balance_loss_mlp": 1.01591778, "epoch": 0.48543514204118443, "flos": 28596559299840.0, "grad_norm": 1.910572128136991, "language_loss": 0.61617911, "learning_rate": 2.19112830093786e-06, "loss": 0.64084423, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21459961, "step": 8074, "time_per_iteration": 2.9331893920898438 }, { "auxiliary_loss_clip": 0.01437058, "auxiliary_loss_mlp": 0.01044356, "balance_loss_clip": 1.26278901, "balance_loss_mlp": 1.02184951, "epoch": 0.4854952652938524, "flos": 20969872657920.0, "grad_norm": 1.84367120633218, "language_loss": 0.73501194, "learning_rate": 2.19074061809469e-06, "loss": 0.75982606, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.22521973, "step": 8075, "time_per_iteration": 2.8330326080322266 }, { "auxiliary_loss_clip": 0.01424549, "auxiliary_loss_mlp": 0.01039199, "balance_loss_clip": 1.25682068, "balance_loss_mlp": 1.01806331, "epoch": 0.48555538854652036, "flos": 66550958259840.0, "grad_norm": 1.6331113398150123, "language_loss": 0.82280684, "learning_rate": 2.1903529280186163e-06, "loss": 0.8474443, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21142578, "step": 8076, "time_per_iteration": 3.2422988414764404 }, { "auxiliary_loss_clip": 0.01442424, "auxiliary_loss_mlp": 0.01040012, "balance_loss_clip": 1.26951718, "balance_loss_mlp": 1.01769567, "epoch": 0.4856155117991883, "flos": 15933576631680.0, "grad_norm": 2.092076568716854, "language_loss": 0.8722049, "learning_rate": 2.1899652307243407e-06, "loss": 0.89702922, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.22338867, "step": 8077, "time_per_iteration": 2.8467464447021484 }, { "auxiliary_loss_clip": 0.0122409, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.1306777, "balance_loss_mlp": 1.01858711, "epoch": 0.4856756350518563, "flos": 71077796110080.0, "grad_norm": 0.9129896384808979, "language_loss": 0.58601314, "learning_rate": 2.189577526226564e-06, "loss": 0.60867643, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.23632812, "step": 8078, "time_per_iteration": 3.3495171070098877 }, { "auxiliary_loss_clip": 0.01452484, "auxiliary_loss_mlp": 0.01040098, "balance_loss_clip": 1.27526987, "balance_loss_mlp": 1.0188663, "epoch": 0.48573575830452426, "flos": 29837781912960.0, "grad_norm": 1.7341041488366238, "language_loss": 0.72863466, "learning_rate": 2.1891898145399884e-06, "loss": 0.75356042, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 1.76953125, "router_z_loss_mlp": 0.21228027, "step": 8079, "time_per_iteration": 2.950404405593872 }, { "auxiliary_loss_clip": 0.01449127, "auxiliary_loss_mlp": 0.01039992, "balance_loss_clip": 1.27311623, "balance_loss_mlp": 1.0195353, "epoch": 0.4857958815571922, "flos": 17648459504640.0, "grad_norm": 2.109474449002977, "language_loss": 0.80138928, "learning_rate": 2.1888020956793172e-06, "loss": 0.82628047, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.20458984, "step": 8080, "time_per_iteration": 2.833333969116211 }, { "auxiliary_loss_clip": 0.01446088, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.27214074, "balance_loss_mlp": 1.01736593, "epoch": 0.4858560048098602, "flos": 21115308003840.0, "grad_norm": 2.14373920976841, "language_loss": 0.85284358, "learning_rate": 2.188414369659251e-06, "loss": 0.87769103, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.2130127, "step": 8081, "time_per_iteration": 2.86248517036438 }, { "auxiliary_loss_clip": 0.01426787, "auxiliary_loss_mlp": 0.0103981, "balance_loss_clip": 1.25496471, "balance_loss_mlp": 1.01801848, "epoch": 0.4859161280625282, "flos": 22100978131200.0, "grad_norm": 1.7302285115805585, "language_loss": 0.84254527, "learning_rate": 2.1880266364944924e-06, "loss": 0.86721122, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.21789551, "step": 8082, "time_per_iteration": 2.8618814945220947 }, { "auxiliary_loss_clip": 0.0143099, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.26266766, "balance_loss_mlp": 1.01608443, "epoch": 0.4859762513151962, "flos": 17502345486720.0, "grad_norm": 2.1290044950941787, "language_loss": 0.88055992, "learning_rate": 2.187638896199746e-06, "loss": 0.90524375, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.21313477, "step": 8083, "time_per_iteration": 2.835001230239868 }, { "auxiliary_loss_clip": 0.01420226, "auxiliary_loss_mlp": 0.01042531, "balance_loss_clip": 1.25270808, "balance_loss_mlp": 1.02088261, "epoch": 0.48603637456786414, "flos": 18013269479040.0, "grad_norm": 2.348167939239756, "language_loss": 0.82290447, "learning_rate": 2.1872511487897126e-06, "loss": 0.84753203, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21643066, "step": 8084, "time_per_iteration": 2.8705813884735107 }, { "auxiliary_loss_clip": 0.01443965, "auxiliary_loss_mlp": 0.01040847, "balance_loss_clip": 1.2703619, "balance_loss_mlp": 1.02043831, "epoch": 0.4860964978205321, "flos": 22502237431680.0, "grad_norm": 4.114739424010158, "language_loss": 0.68081319, "learning_rate": 2.186863394279098e-06, "loss": 0.7056613, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20422363, "step": 8085, "time_per_iteration": 2.8588743209838867 }, { "auxiliary_loss_clip": 0.01432467, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.26148438, "balance_loss_mlp": 1.01460814, "epoch": 0.48615662107320007, "flos": 23384215180800.0, "grad_norm": 1.503049634588074, "language_loss": 0.77912819, "learning_rate": 2.1864756326826046e-06, "loss": 0.80381733, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21838379, "step": 8086, "time_per_iteration": 2.9791693687438965 }, { "auxiliary_loss_clip": 0.01437015, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.26506555, "balance_loss_mlp": 1.01401544, "epoch": 0.48621674432586803, "flos": 34431075671040.0, "grad_norm": 2.0802476933038774, "language_loss": 0.71267456, "learning_rate": 2.1860878640149355e-06, "loss": 0.73739707, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21228027, "step": 8087, "time_per_iteration": 4.35870099067688 }, { "auxiliary_loss_clip": 0.01465144, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.28564024, "balance_loss_mlp": 1.01894617, "epoch": 0.486276867578536, "flos": 33120031052160.0, "grad_norm": 2.4138602395549613, "language_loss": 0.73884028, "learning_rate": 2.1857000882907974e-06, "loss": 0.76390129, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 1.79296875, "router_z_loss_mlp": 0.22021484, "step": 8088, "time_per_iteration": 2.9477765560150146 }, { "auxiliary_loss_clip": 0.01431126, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.26064634, "balance_loss_mlp": 1.01751423, "epoch": 0.48633699083120396, "flos": 21480706160640.0, "grad_norm": 1.6455066506297114, "language_loss": 0.76423609, "learning_rate": 2.185312305524892e-06, "loss": 0.78893507, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.21264648, "step": 8089, "time_per_iteration": 2.850593328475952 }, { "auxiliary_loss_clip": 0.01436726, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.26246011, "balance_loss_mlp": 1.01601076, "epoch": 0.48639711408387193, "flos": 20094002956800.0, "grad_norm": 1.6936691947522515, "language_loss": 0.84970927, "learning_rate": 2.184924515731926e-06, "loss": 0.87446427, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.22753906, "step": 8090, "time_per_iteration": 2.8234190940856934 }, { "auxiliary_loss_clip": 0.0141734, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.25151014, "balance_loss_mlp": 1.01332104, "epoch": 0.4864572373365399, "flos": 20789073861120.0, "grad_norm": 5.207403759525097, "language_loss": 0.76844466, "learning_rate": 2.1845367189266045e-06, "loss": 0.79296011, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20898438, "step": 8091, "time_per_iteration": 2.850848913192749 }, { "auxiliary_loss_clip": 0.01422565, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.25205886, "balance_loss_mlp": 1.01429236, "epoch": 0.48651736058920786, "flos": 26034971639040.0, "grad_norm": 1.5126120665784986, "language_loss": 0.80743599, "learning_rate": 2.184148915123631e-06, "loss": 0.83201689, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21228027, "step": 8092, "time_per_iteration": 2.899534225463867 }, { "auxiliary_loss_clip": 0.01442825, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.26957977, "balance_loss_mlp": 1.01407754, "epoch": 0.4865774838418758, "flos": 20495352746880.0, "grad_norm": 1.6600810946338407, "language_loss": 0.72852021, "learning_rate": 2.1837611043377126e-06, "loss": 0.75330609, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21704102, "step": 8093, "time_per_iteration": 2.8183488845825195 }, { "auxiliary_loss_clip": 0.01428029, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.25924492, "balance_loss_mlp": 1.01412654, "epoch": 0.4866376070945438, "flos": 23557548585600.0, "grad_norm": 1.7828753424000217, "language_loss": 0.69015515, "learning_rate": 2.1833732865835545e-06, "loss": 0.71478975, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.2130127, "step": 8094, "time_per_iteration": 2.8831067085266113 }, { "auxiliary_loss_clip": 0.01453888, "auxiliary_loss_mlp": 0.01040509, "balance_loss_clip": 1.27743804, "balance_loss_mlp": 1.01976633, "epoch": 0.4866977303472118, "flos": 16699238703360.0, "grad_norm": 2.2106419077448254, "language_loss": 0.67152172, "learning_rate": 2.1829854618758636e-06, "loss": 0.69646567, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.20727539, "step": 8095, "time_per_iteration": 2.96636700630188 }, { "auxiliary_loss_clip": 0.01454434, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 1.28058422, "balance_loss_mlp": 1.0159117, "epoch": 0.4867578535998798, "flos": 17905324089600.0, "grad_norm": 1.8844350650391613, "language_loss": 0.78903711, "learning_rate": 2.182597630229345e-06, "loss": 0.81396043, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.21972656, "step": 8096, "time_per_iteration": 2.8480963706970215 }, { "auxiliary_loss_clip": 0.0143104, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 1.26022768, "balance_loss_mlp": 1.01574922, "epoch": 0.48681797685254774, "flos": 22647989491200.0, "grad_norm": 1.9197187802021536, "language_loss": 0.68015182, "learning_rate": 2.1822097916587067e-06, "loss": 0.70484108, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.22119141, "step": 8097, "time_per_iteration": 4.334709405899048 }, { "auxiliary_loss_clip": 0.01435481, "auxiliary_loss_mlp": 0.01037197, "balance_loss_clip": 1.26474857, "balance_loss_mlp": 1.01662147, "epoch": 0.4868781001052157, "flos": 20895707151360.0, "grad_norm": 1.7183655076429076, "language_loss": 0.72248709, "learning_rate": 2.1818219461786543e-06, "loss": 0.74721384, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20568848, "step": 8098, "time_per_iteration": 5.6493775844573975 }, { "auxiliary_loss_clip": 0.01461568, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.28320515, "balance_loss_mlp": 1.01329708, "epoch": 0.48693822335788367, "flos": 41990066812800.0, "grad_norm": 2.0338062527418006, "language_loss": 0.6685816, "learning_rate": 2.1814340938038956e-06, "loss": 0.69355935, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22900391, "step": 8099, "time_per_iteration": 3.1179182529449463 }, { "auxiliary_loss_clip": 0.01433614, "auxiliary_loss_mlp": 0.010415, "balance_loss_clip": 1.26063704, "balance_loss_mlp": 1.02001882, "epoch": 0.48699834661055164, "flos": 24253931589120.0, "grad_norm": 1.9550738723194627, "language_loss": 0.67769742, "learning_rate": 2.181046234549138e-06, "loss": 0.70244861, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21484375, "step": 8100, "time_per_iteration": 2.862433671951294 }, { "auxiliary_loss_clip": 0.01435036, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.26583946, "balance_loss_mlp": 1.01692581, "epoch": 0.4870584698632196, "flos": 25935532272000.0, "grad_norm": 1.4413725823341306, "language_loss": 0.76973641, "learning_rate": 2.180658368429088e-06, "loss": 0.79446363, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20739746, "step": 8101, "time_per_iteration": 2.93546462059021 }, { "auxiliary_loss_clip": 0.01221442, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.12868667, "balance_loss_mlp": 1.0131768, "epoch": 0.48711859311588757, "flos": 70243081194240.0, "grad_norm": 0.6843800230410032, "language_loss": 0.52421474, "learning_rate": 2.1802704954584565e-06, "loss": 0.54676402, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.203125, "step": 8102, "time_per_iteration": 3.5113306045532227 }, { "auxiliary_loss_clip": 0.01434911, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.262766, "balance_loss_mlp": 1.01881194, "epoch": 0.48717871636855553, "flos": 12348964621440.0, "grad_norm": 2.3693104330231414, "language_loss": 0.74097693, "learning_rate": 2.1798826156519484e-06, "loss": 0.76571929, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.20507812, "step": 8103, "time_per_iteration": 2.8283851146698 }, { "auxiliary_loss_clip": 0.01442917, "auxiliary_loss_mlp": 0.0103982, "balance_loss_clip": 1.26881528, "balance_loss_mlp": 1.01771879, "epoch": 0.4872388396212235, "flos": 23487590845440.0, "grad_norm": 1.6214064846903644, "language_loss": 0.63837671, "learning_rate": 2.1794947290242737e-06, "loss": 0.66320407, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.22094727, "step": 8104, "time_per_iteration": 2.890270233154297 }, { "auxiliary_loss_clip": 0.01432873, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.26150072, "balance_loss_mlp": 1.01524627, "epoch": 0.48729896287389146, "flos": 31439199530880.0, "grad_norm": 1.698156738404256, "language_loss": 0.69928646, "learning_rate": 2.1791068355901413e-06, "loss": 0.72398448, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.21691895, "step": 8105, "time_per_iteration": 2.9396603107452393 }, { "auxiliary_loss_clip": 0.01430982, "auxiliary_loss_mlp": 0.01031046, "balance_loss_clip": 1.26085925, "balance_loss_mlp": 1.01007748, "epoch": 0.4873590861265594, "flos": 19066092168960.0, "grad_norm": 1.7735045158162335, "language_loss": 0.74410719, "learning_rate": 2.178718935364259e-06, "loss": 0.76872742, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20947266, "step": 8106, "time_per_iteration": 2.8169360160827637 }, { "auxiliary_loss_clip": 0.01449256, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.27394724, "balance_loss_mlp": 1.01666343, "epoch": 0.4874192093792274, "flos": 24357804946560.0, "grad_norm": 1.7976885004382848, "language_loss": 0.77436775, "learning_rate": 2.1783310283613373e-06, "loss": 0.79925132, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.22436523, "step": 8107, "time_per_iteration": 2.8699493408203125 }, { "auxiliary_loss_clip": 0.0142832, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 1.26163971, "balance_loss_mlp": 1.01282859, "epoch": 0.4874793326318954, "flos": 23123052339840.0, "grad_norm": 1.5656285412688782, "language_loss": 0.75906229, "learning_rate": 2.1779431145960853e-06, "loss": 0.78367889, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20495605, "step": 8108, "time_per_iteration": 2.9119393825531006 }, { "auxiliary_loss_clip": 0.0143535, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.26842618, "balance_loss_mlp": 1.01777434, "epoch": 0.4875394558845634, "flos": 19035524666880.0, "grad_norm": 1.937278042420797, "language_loss": 0.74616641, "learning_rate": 2.177555194083212e-06, "loss": 0.77089274, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19494629, "step": 8109, "time_per_iteration": 2.874976396560669 }, { "auxiliary_loss_clip": 0.01432604, "auxiliary_loss_mlp": 0.01036088, "balance_loss_clip": 1.26474524, "balance_loss_mlp": 1.01470184, "epoch": 0.48759957913723134, "flos": 21443442428160.0, "grad_norm": 2.2552804293361945, "language_loss": 0.785236, "learning_rate": 2.177167266837428e-06, "loss": 0.80992293, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21398926, "step": 8110, "time_per_iteration": 2.8316781520843506 }, { "auxiliary_loss_clip": 0.01438444, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.26768303, "balance_loss_mlp": 1.02046227, "epoch": 0.4876597023898993, "flos": 17757581258880.0, "grad_norm": 2.3937053373797017, "language_loss": 0.73893827, "learning_rate": 2.176779332873444e-06, "loss": 0.76373804, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21081543, "step": 8111, "time_per_iteration": 2.7993099689483643 }, { "auxiliary_loss_clip": 0.01439993, "auxiliary_loss_mlp": 0.01040548, "balance_loss_clip": 1.27199674, "balance_loss_mlp": 1.01934087, "epoch": 0.4877198256425673, "flos": 17028549492480.0, "grad_norm": 2.7563350312291073, "language_loss": 0.76499283, "learning_rate": 2.17639139220597e-06, "loss": 0.78979826, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21203613, "step": 8112, "time_per_iteration": 2.845808982849121 }, { "auxiliary_loss_clip": 0.01461038, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.28296244, "balance_loss_mlp": 1.01889873, "epoch": 0.48777994889523524, "flos": 22394654000640.0, "grad_norm": 2.24628151459118, "language_loss": 0.7608242, "learning_rate": 2.1760034448497166e-06, "loss": 0.78584421, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.22070312, "step": 8113, "time_per_iteration": 2.877472162246704 }, { "auxiliary_loss_clip": 0.01231193, "auxiliary_loss_mlp": 0.01069275, "balance_loss_clip": 1.13772368, "balance_loss_mlp": 1.04428899, "epoch": 0.4878400721479032, "flos": 61271977253760.0, "grad_norm": 0.7965883454101493, "language_loss": 0.48907876, "learning_rate": 2.1756154908193943e-06, "loss": 0.51208341, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.25, "step": 8114, "time_per_iteration": 3.2386727333068848 }, { "auxiliary_loss_clip": 0.01444637, "auxiliary_loss_mlp": 0.01044874, "balance_loss_clip": 1.27163064, "balance_loss_mlp": 1.02270126, "epoch": 0.48790019540057117, "flos": 24547697948160.0, "grad_norm": 1.3834542741610156, "language_loss": 0.7778511, "learning_rate": 2.1752275301297155e-06, "loss": 0.80274624, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22167969, "step": 8115, "time_per_iteration": 2.9347805976867676 }, { "auxiliary_loss_clip": 0.01456443, "auxiliary_loss_mlp": 0.01044654, "balance_loss_clip": 1.2812618, "balance_loss_mlp": 1.02319646, "epoch": 0.48796031865323913, "flos": 21843615853440.0, "grad_norm": 2.74312883504099, "language_loss": 0.72508246, "learning_rate": 2.1748395627953915e-06, "loss": 0.7500934, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.21447754, "step": 8116, "time_per_iteration": 2.860118865966797 }, { "auxiliary_loss_clip": 0.01420356, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.25303376, "balance_loss_mlp": 1.01261675, "epoch": 0.4880204419059071, "flos": 18598313733120.0, "grad_norm": 2.4313775613708217, "language_loss": 0.63809431, "learning_rate": 2.1744515888311335e-06, "loss": 0.66263437, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.21032715, "step": 8117, "time_per_iteration": 2.9099063873291016 }, { "auxiliary_loss_clip": 0.01436877, "auxiliary_loss_mlp": 0.01034342, "balance_loss_clip": 1.26570833, "balance_loss_mlp": 1.01370645, "epoch": 0.48808056515857506, "flos": 19181593440000.0, "grad_norm": 1.7193465027978267, "language_loss": 0.80135679, "learning_rate": 2.1740636082516533e-06, "loss": 0.826069, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.20629883, "step": 8118, "time_per_iteration": 2.860924243927002 }, { "auxiliary_loss_clip": 0.01444277, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 1.27078712, "balance_loss_mlp": 1.0197041, "epoch": 0.48814068841124303, "flos": 20130090324480.0, "grad_norm": 2.1526762588888553, "language_loss": 0.64086854, "learning_rate": 2.1736756210716645e-06, "loss": 0.66571599, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20776367, "step": 8119, "time_per_iteration": 2.827331304550171 }, { "auxiliary_loss_clip": 0.01450205, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.27791524, "balance_loss_mlp": 1.01580572, "epoch": 0.488200811663911, "flos": 22975716712320.0, "grad_norm": 1.8976194225287937, "language_loss": 0.72929507, "learning_rate": 2.173287627305878e-06, "loss": 0.7541647, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20959473, "step": 8120, "time_per_iteration": 2.966766357421875 }, { "auxiliary_loss_clip": 0.01461384, "auxiliary_loss_mlp": 0.01039339, "balance_loss_clip": 1.28731966, "balance_loss_mlp": 1.01784539, "epoch": 0.48826093491657896, "flos": 33923499793920.0, "grad_norm": 1.8560281707501027, "language_loss": 0.64333105, "learning_rate": 2.1728996269690075e-06, "loss": 0.6683383, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.21496582, "step": 8121, "time_per_iteration": 2.953099012374878 }, { "auxiliary_loss_clip": 0.01460596, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.28407276, "balance_loss_mlp": 1.01895738, "epoch": 0.488321058169247, "flos": 23079228111360.0, "grad_norm": 1.953756880873776, "language_loss": 0.84111011, "learning_rate": 2.1725116200757664e-06, "loss": 0.86612618, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.22033691, "step": 8122, "time_per_iteration": 4.326294898986816 }, { "auxiliary_loss_clip": 0.01469179, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.29410779, "balance_loss_mlp": 1.02308607, "epoch": 0.48838118142191494, "flos": 19327074030720.0, "grad_norm": 1.8255772912046255, "language_loss": 0.86064613, "learning_rate": 2.172123606640866e-06, "loss": 0.88578844, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21960449, "step": 8123, "time_per_iteration": 2.850022315979004 }, { "auxiliary_loss_clip": 0.01464022, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.28754985, "balance_loss_mlp": 1.02516723, "epoch": 0.4884413046745829, "flos": 25421576878080.0, "grad_norm": 1.503435372386699, "language_loss": 0.86007309, "learning_rate": 2.1717355866790227e-06, "loss": 0.88517296, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 1.76269531, "router_z_loss_mlp": 0.20800781, "step": 8124, "time_per_iteration": 2.896627187728882 }, { "auxiliary_loss_clip": 0.01455055, "auxiliary_loss_mlp": 0.01042745, "balance_loss_clip": 1.28271019, "balance_loss_mlp": 1.02137101, "epoch": 0.4885014279272509, "flos": 21000168691200.0, "grad_norm": 2.0485485972877493, "language_loss": 0.80284905, "learning_rate": 2.171347560204948e-06, "loss": 0.82782698, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21362305, "step": 8125, "time_per_iteration": 2.864440441131592 }, { "auxiliary_loss_clip": 0.01458041, "auxiliary_loss_mlp": 0.01045961, "balance_loss_clip": 1.28514218, "balance_loss_mlp": 1.02472961, "epoch": 0.48856155117991884, "flos": 13779446808960.0, "grad_norm": 2.276154741886507, "language_loss": 0.7310468, "learning_rate": 2.170959527233356e-06, "loss": 0.75608683, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21228027, "step": 8126, "time_per_iteration": 2.8201115131378174 }, { "auxiliary_loss_clip": 0.01475823, "auxiliary_loss_mlp": 0.01034141, "balance_loss_clip": 1.29790938, "balance_loss_mlp": 1.01481688, "epoch": 0.4886216744325868, "flos": 32100445307520.0, "grad_norm": 1.683463162399402, "language_loss": 0.6891672, "learning_rate": 2.1705714877789633e-06, "loss": 0.7142669, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.1932373, "step": 8127, "time_per_iteration": 2.917506217956543 }, { "auxiliary_loss_clip": 0.01466066, "auxiliary_loss_mlp": 0.01044397, "balance_loss_clip": 1.2884568, "balance_loss_mlp": 1.02338052, "epoch": 0.48868179768525477, "flos": 19619618780160.0, "grad_norm": 1.584792701769592, "language_loss": 0.77209163, "learning_rate": 2.170183441856481e-06, "loss": 0.79719627, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.21020508, "step": 8128, "time_per_iteration": 2.851788282394409 }, { "auxiliary_loss_clip": 0.01467662, "auxiliary_loss_mlp": 0.01041751, "balance_loss_clip": 1.29159367, "balance_loss_mlp": 1.02119923, "epoch": 0.48874192093792274, "flos": 21296514003840.0, "grad_norm": 1.5418513861336363, "language_loss": 0.76918757, "learning_rate": 2.1697953894806265e-06, "loss": 0.79428172, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.20556641, "step": 8129, "time_per_iteration": 2.8409652709960938 }, { "auxiliary_loss_clip": 0.01477301, "auxiliary_loss_mlp": 0.01038552, "balance_loss_clip": 1.29986167, "balance_loss_mlp": 1.01817966, "epoch": 0.4888020441905907, "flos": 14181520515840.0, "grad_norm": 3.222392148337765, "language_loss": 0.65420693, "learning_rate": 2.169407330666114e-06, "loss": 0.6793654, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.20373535, "step": 8130, "time_per_iteration": 2.8144986629486084 }, { "auxiliary_loss_clip": 0.01455241, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.2827338, "balance_loss_mlp": 1.01100659, "epoch": 0.48886216744325867, "flos": 24108631977600.0, "grad_norm": 1.7308774045871818, "language_loss": 0.72655904, "learning_rate": 2.169019265427658e-06, "loss": 0.75143808, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21655273, "step": 8131, "time_per_iteration": 2.9621613025665283 }, { "auxiliary_loss_clip": 0.01479564, "auxiliary_loss_mlp": 0.01049076, "balance_loss_clip": 1.3015635, "balance_loss_mlp": 1.02585387, "epoch": 0.48892229069592663, "flos": 38444483082240.0, "grad_norm": 1.4740493672193002, "language_loss": 0.70379847, "learning_rate": 2.1686311937799745e-06, "loss": 0.72908491, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.23205566, "step": 8132, "time_per_iteration": 4.395914077758789 }, { "auxiliary_loss_clip": 0.01455157, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.28432226, "balance_loss_mlp": 1.01404691, "epoch": 0.4889824139485946, "flos": 23854210611840.0, "grad_norm": 1.3977969574562459, "language_loss": 0.70919615, "learning_rate": 2.1682431157377797e-06, "loss": 0.73410398, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21582031, "step": 8133, "time_per_iteration": 5.6911046504974365 }, { "auxiliary_loss_clip": 0.01449834, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.27768683, "balance_loss_mlp": 1.01711464, "epoch": 0.48904253720126256, "flos": 24436540177920.0, "grad_norm": 2.27269287371248, "language_loss": 0.72038043, "learning_rate": 2.1678550313157883e-06, "loss": 0.74526346, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.21350098, "step": 8134, "time_per_iteration": 2.951812744140625 }, { "auxiliary_loss_clip": 0.01481212, "auxiliary_loss_mlp": 0.01036215, "balance_loss_clip": 1.30251372, "balance_loss_mlp": 1.01541305, "epoch": 0.4891026604539306, "flos": 24181394895360.0, "grad_norm": 2.8850364595340405, "language_loss": 0.81488162, "learning_rate": 2.167466940528718e-06, "loss": 0.84005582, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.20800781, "step": 8135, "time_per_iteration": 2.8771300315856934 }, { "auxiliary_loss_clip": 0.01459283, "auxiliary_loss_mlp": 0.01037196, "balance_loss_clip": 1.28797078, "balance_loss_mlp": 1.01793146, "epoch": 0.48916278370659855, "flos": 21481022874240.0, "grad_norm": 1.6370145905378466, "language_loss": 0.75193465, "learning_rate": 2.1670788433912843e-06, "loss": 0.77689946, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.19274902, "step": 8136, "time_per_iteration": 2.9013381004333496 }, { "auxiliary_loss_clip": 0.01465509, "auxiliary_loss_mlp": 0.01037904, "balance_loss_clip": 1.29330587, "balance_loss_mlp": 1.01612484, "epoch": 0.4892229069592665, "flos": 22319945556480.0, "grad_norm": 1.4321808075690805, "language_loss": 0.74365371, "learning_rate": 2.166690739918204e-06, "loss": 0.76868784, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21777344, "step": 8137, "time_per_iteration": 2.9524285793304443 }, { "auxiliary_loss_clip": 0.01464244, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.28843021, "balance_loss_mlp": 1.01855516, "epoch": 0.4892830302119345, "flos": 12794545843200.0, "grad_norm": 2.628099369172474, "language_loss": 0.7667135, "learning_rate": 2.1663026301241944e-06, "loss": 0.79174793, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.20666504, "step": 8138, "time_per_iteration": 2.8581910133361816 }, { "auxiliary_loss_clip": 0.01460348, "auxiliary_loss_mlp": 0.01040332, "balance_loss_clip": 1.28933573, "balance_loss_mlp": 1.01891029, "epoch": 0.48934315346460244, "flos": 20823713395200.0, "grad_norm": 1.659086764356683, "language_loss": 0.74862987, "learning_rate": 2.165914514023972e-06, "loss": 0.7736367, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.2142334, "step": 8139, "time_per_iteration": 2.993156909942627 }, { "auxiliary_loss_clip": 0.01478292, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.30192947, "balance_loss_mlp": 1.01716518, "epoch": 0.4894032767172704, "flos": 19765008881280.0, "grad_norm": 5.4852455055334755, "language_loss": 0.62823761, "learning_rate": 2.165526391632255e-06, "loss": 0.6534065, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.21435547, "step": 8140, "time_per_iteration": 2.9858992099761963 }, { "auxiliary_loss_clip": 0.01474058, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.29551053, "balance_loss_mlp": 1.0162555, "epoch": 0.4894633999699384, "flos": 17827222285440.0, "grad_norm": 2.1255302553734805, "language_loss": 0.82898653, "learning_rate": 2.1651382629637608e-06, "loss": 0.85411596, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.22607422, "step": 8141, "time_per_iteration": 2.9424965381622314 }, { "auxiliary_loss_clip": 0.01487861, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.3091433, "balance_loss_mlp": 1.01553917, "epoch": 0.48952352322260634, "flos": 25534951643520.0, "grad_norm": 1.5093269307163109, "language_loss": 0.7287147, "learning_rate": 2.1647501280332066e-06, "loss": 0.75397265, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 1.78808594, "router_z_loss_mlp": 0.22387695, "step": 8142, "time_per_iteration": 2.935468912124634 }, { "auxiliary_loss_clip": 0.01461832, "auxiliary_loss_mlp": 0.01040792, "balance_loss_clip": 1.28672612, "balance_loss_mlp": 1.01852345, "epoch": 0.4895836464752743, "flos": 29066373751680.0, "grad_norm": 1.7155066064434383, "language_loss": 0.68184024, "learning_rate": 2.1643619868553105e-06, "loss": 0.7068665, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.22265625, "step": 8143, "time_per_iteration": 2.8996427059173584 }, { "auxiliary_loss_clip": 0.01459036, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.28743935, "balance_loss_mlp": 1.01301634, "epoch": 0.48964376972794227, "flos": 33559956673920.0, "grad_norm": 2.189256093724405, "language_loss": 0.75473082, "learning_rate": 2.163973839444793e-06, "loss": 0.77965927, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20776367, "step": 8144, "time_per_iteration": 2.95009183883667 }, { "auxiliary_loss_clip": 0.01455361, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.28204155, "balance_loss_mlp": 1.0144366, "epoch": 0.48970389298061023, "flos": 22064076357120.0, "grad_norm": 1.7575856458095978, "language_loss": 0.7643199, "learning_rate": 2.1635856858163695e-06, "loss": 0.7892257, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.2076416, "step": 8145, "time_per_iteration": 2.8702006340026855 }, { "auxiliary_loss_clip": 0.01460258, "auxiliary_loss_mlp": 0.01038954, "balance_loss_clip": 1.284863, "balance_loss_mlp": 1.01696014, "epoch": 0.4897640162332782, "flos": 20093957712000.0, "grad_norm": 1.8360620187059318, "language_loss": 0.81097001, "learning_rate": 2.163197525984761e-06, "loss": 0.83596212, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.2199707, "step": 8146, "time_per_iteration": 2.8272719383239746 }, { "auxiliary_loss_clip": 0.01441596, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.27227354, "balance_loss_mlp": 1.01403487, "epoch": 0.48982413948594616, "flos": 23816946879360.0, "grad_norm": 1.542716675832851, "language_loss": 0.75360078, "learning_rate": 2.162809359964687e-06, "loss": 0.77837598, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.21875, "step": 8147, "time_per_iteration": 2.919450044631958 }, { "auxiliary_loss_clip": 0.01455195, "auxiliary_loss_mlp": 0.01035177, "balance_loss_clip": 1.2815001, "balance_loss_mlp": 1.01363635, "epoch": 0.4898842627386142, "flos": 17648640483840.0, "grad_norm": 2.8309275122872224, "language_loss": 0.83926558, "learning_rate": 2.162421187770864e-06, "loss": 0.8641693, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.21557617, "step": 8148, "time_per_iteration": 2.8466272354125977 }, { "auxiliary_loss_clip": 0.01445528, "auxiliary_loss_mlp": 0.01035358, "balance_loss_clip": 1.27528977, "balance_loss_mlp": 1.01481867, "epoch": 0.48994438599128215, "flos": 16626611520000.0, "grad_norm": 12.122978510065916, "language_loss": 0.75237668, "learning_rate": 2.162033009418015e-06, "loss": 0.77718556, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2052002, "step": 8149, "time_per_iteration": 2.83486008644104 }, { "auxiliary_loss_clip": 0.0147117, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.29217565, "balance_loss_mlp": 1.01384759, "epoch": 0.4900045092439501, "flos": 26626078696320.0, "grad_norm": 2.552995464617393, "language_loss": 0.76374245, "learning_rate": 2.1616448249208567e-06, "loss": 0.78881252, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.21984863, "step": 8150, "time_per_iteration": 2.8890230655670166 }, { "auxiliary_loss_clip": 0.01464281, "auxiliary_loss_mlp": 0.01046877, "balance_loss_clip": 1.28795791, "balance_loss_mlp": 1.02364302, "epoch": 0.4900646324966181, "flos": 19911801571200.0, "grad_norm": 2.033007191501246, "language_loss": 0.73038125, "learning_rate": 2.1612566342941106e-06, "loss": 0.75549287, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 1.76074219, "router_z_loss_mlp": 0.23254395, "step": 8151, "time_per_iteration": 2.851952314376831 }, { "auxiliary_loss_clip": 0.01241399, "auxiliary_loss_mlp": 0.0104854, "balance_loss_clip": 1.13987446, "balance_loss_mlp": 1.02126503, "epoch": 0.49012475574928605, "flos": 59216970084480.0, "grad_norm": 0.9046548355209094, "language_loss": 0.54407847, "learning_rate": 2.1608684375524977e-06, "loss": 0.56697786, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.2734375, "step": 8152, "time_per_iteration": 3.439268112182617 }, { "auxiliary_loss_clip": 0.01464514, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.28738987, "balance_loss_mlp": 1.01392221, "epoch": 0.490184879001954, "flos": 45276071270400.0, "grad_norm": 1.6442995348875897, "language_loss": 0.61540163, "learning_rate": 2.1604802347107364e-06, "loss": 0.64039516, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.20910645, "step": 8153, "time_per_iteration": 3.4006574153900146 }, { "auxiliary_loss_clip": 0.0144816, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.27516651, "balance_loss_mlp": 1.01557767, "epoch": 0.490245002254622, "flos": 28013958264960.0, "grad_norm": 1.9849519706201668, "language_loss": 0.77525604, "learning_rate": 2.160092025783549e-06, "loss": 0.80012035, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.22680664, "step": 8154, "time_per_iteration": 2.9402153491973877 }, { "auxiliary_loss_clip": 0.01231899, "auxiliary_loss_mlp": 0.01044798, "balance_loss_clip": 1.13343596, "balance_loss_mlp": 1.02419841, "epoch": 0.49030512550728994, "flos": 58983044613120.0, "grad_norm": 0.9948281143524212, "language_loss": 0.67037505, "learning_rate": 2.1597038107856564e-06, "loss": 0.693142, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.20605469, "step": 8155, "time_per_iteration": 3.38663649559021 }, { "auxiliary_loss_clip": 0.01449062, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.27595496, "balance_loss_mlp": 1.0129056, "epoch": 0.4903652487599579, "flos": 19801277228160.0, "grad_norm": 3.475005000818123, "language_loss": 0.77528703, "learning_rate": 2.1593155897317784e-06, "loss": 0.80010623, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.19934082, "step": 8156, "time_per_iteration": 2.8406879901885986 }, { "auxiliary_loss_clip": 0.01456824, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.28246582, "balance_loss_mlp": 1.0178746, "epoch": 0.49042537201262587, "flos": 21772029300480.0, "grad_norm": 2.5103102431106157, "language_loss": 0.8548401, "learning_rate": 2.1589273626366377e-06, "loss": 0.87980562, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.21850586, "step": 8157, "time_per_iteration": 4.253950595855713 }, { "auxiliary_loss_clip": 0.01442471, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.27009249, "balance_loss_mlp": 1.0145998, "epoch": 0.49048549526529384, "flos": 18962354545920.0, "grad_norm": 1.7609584925004516, "language_loss": 0.80418414, "learning_rate": 2.158539129514956e-06, "loss": 0.82897651, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.22167969, "step": 8158, "time_per_iteration": 2.83685302734375 }, { "auxiliary_loss_clip": 0.01466034, "auxiliary_loss_mlp": 0.01037979, "balance_loss_clip": 1.28857625, "balance_loss_mlp": 1.01572216, "epoch": 0.4905456185179618, "flos": 26917628060160.0, "grad_norm": 1.5874804959135074, "language_loss": 0.70052195, "learning_rate": 2.158150890381454e-06, "loss": 0.7255621, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 1.7734375, "router_z_loss_mlp": 0.22265625, "step": 8159, "time_per_iteration": 2.905890464782715 }, { "auxiliary_loss_clip": 0.01445737, "auxiliary_loss_mlp": 0.01037514, "balance_loss_clip": 1.27380991, "balance_loss_mlp": 1.0152812, "epoch": 0.49060574177062977, "flos": 20422001646720.0, "grad_norm": 2.1881449593909523, "language_loss": 0.74049795, "learning_rate": 2.157762645250854e-06, "loss": 0.76533043, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.22241211, "step": 8160, "time_per_iteration": 2.862586498260498 }, { "auxiliary_loss_clip": 0.01471136, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.29272616, "balance_loss_mlp": 1.016047, "epoch": 0.4906658650232978, "flos": 17502662200320.0, "grad_norm": 1.8794782787650801, "language_loss": 0.72672105, "learning_rate": 2.1573743941378796e-06, "loss": 0.75180721, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.2142334, "step": 8161, "time_per_iteration": 2.8249099254608154 }, { "auxiliary_loss_clip": 0.01446996, "auxiliary_loss_mlp": 0.01038897, "balance_loss_clip": 1.27550733, "balance_loss_mlp": 1.01671243, "epoch": 0.49072598827596575, "flos": 26625581003520.0, "grad_norm": 2.585470909483985, "language_loss": 0.69815242, "learning_rate": 2.1569861370572517e-06, "loss": 0.72301137, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22192383, "step": 8162, "time_per_iteration": 2.9160163402557373 }, { "auxiliary_loss_clip": 0.01469684, "auxiliary_loss_mlp": 0.01040246, "balance_loss_clip": 1.29148722, "balance_loss_mlp": 1.01709509, "epoch": 0.4907861115286337, "flos": 20422318360320.0, "grad_norm": 3.6308608738458843, "language_loss": 0.64277595, "learning_rate": 2.1565978740236944e-06, "loss": 0.66787523, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 1.78222656, "router_z_loss_mlp": 0.23144531, "step": 8163, "time_per_iteration": 2.8451972007751465 }, { "auxiliary_loss_clip": 0.01431555, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.26256776, "balance_loss_mlp": 1.01933336, "epoch": 0.4908462347813017, "flos": 14072398761600.0, "grad_norm": 2.203978744145157, "language_loss": 0.78309196, "learning_rate": 2.1562096050519293e-06, "loss": 0.80782223, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.22131348, "step": 8164, "time_per_iteration": 2.8346669673919678 }, { "auxiliary_loss_clip": 0.01457079, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.28153038, "balance_loss_mlp": 1.01881373, "epoch": 0.49090635803396965, "flos": 18744608730240.0, "grad_norm": 1.679217193289505, "language_loss": 0.77769208, "learning_rate": 2.1558213301566806e-06, "loss": 0.80266482, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.21374512, "step": 8165, "time_per_iteration": 2.830990791320801 }, { "auxiliary_loss_clip": 0.01447307, "auxiliary_loss_mlp": 0.01038688, "balance_loss_clip": 1.2758925, "balance_loss_mlp": 1.01640797, "epoch": 0.4909664812866376, "flos": 20568251399040.0, "grad_norm": 1.8180130909832208, "language_loss": 0.78941929, "learning_rate": 2.1554330493526716e-06, "loss": 0.81427926, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.22265625, "step": 8166, "time_per_iteration": 2.905067205429077 }, { "auxiliary_loss_clip": 0.01228036, "auxiliary_loss_mlp": 0.01042821, "balance_loss_clip": 1.12819755, "balance_loss_mlp": 1.01669014, "epoch": 0.4910266045393056, "flos": 54715034856960.0, "grad_norm": 0.8167167566084262, "language_loss": 0.54197699, "learning_rate": 2.1550447626546253e-06, "loss": 0.56468558, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.26171875, "step": 8167, "time_per_iteration": 4.743985176086426 }, { "auxiliary_loss_clip": 0.01454062, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.28172266, "balance_loss_mlp": 1.02059627, "epoch": 0.49108672779197354, "flos": 16253340768000.0, "grad_norm": 1.7772006224515646, "language_loss": 0.86733973, "learning_rate": 2.1546564700772665e-06, "loss": 0.89231479, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.22875977, "step": 8168, "time_per_iteration": 4.239855527877808 }, { "auxiliary_loss_clip": 0.01444552, "auxiliary_loss_mlp": 0.01042783, "balance_loss_clip": 1.2745707, "balance_loss_mlp": 1.02076554, "epoch": 0.4911468510446415, "flos": 19834106970240.0, "grad_norm": 1.6227190617428697, "language_loss": 0.74108881, "learning_rate": 2.1542681716353193e-06, "loss": 0.76596212, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.2199707, "step": 8169, "time_per_iteration": 4.255047559738159 }, { "auxiliary_loss_clip": 0.0143134, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.26108551, "balance_loss_mlp": 1.0091083, "epoch": 0.4912069742973095, "flos": 21221941294080.0, "grad_norm": 1.5511620500651622, "language_loss": 0.7831502, "learning_rate": 2.1538798673435068e-06, "loss": 0.80775988, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20532227, "step": 8170, "time_per_iteration": 2.8297572135925293 }, { "auxiliary_loss_clip": 0.01465566, "auxiliary_loss_mlp": 0.01040535, "balance_loss_clip": 1.28978884, "balance_loss_mlp": 1.01953065, "epoch": 0.49126709754997744, "flos": 19546674883200.0, "grad_norm": 4.172380421184691, "language_loss": 0.7663871, "learning_rate": 2.1534915572165545e-06, "loss": 0.79144812, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.21008301, "step": 8171, "time_per_iteration": 2.8364923000335693 }, { "auxiliary_loss_clip": 0.01474133, "auxiliary_loss_mlp": 0.01040566, "balance_loss_clip": 1.29683673, "balance_loss_mlp": 1.02032471, "epoch": 0.4913272208026454, "flos": 12247263014400.0, "grad_norm": 2.1226654462227885, "language_loss": 0.82425833, "learning_rate": 2.1531032412691875e-06, "loss": 0.84940541, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.20251465, "step": 8172, "time_per_iteration": 2.8341197967529297 }, { "auxiliary_loss_clip": 0.01222334, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.12450707, "balance_loss_mlp": 1.00496137, "epoch": 0.49138734405531337, "flos": 65495031661440.0, "grad_norm": 0.6919032315673139, "language_loss": 0.53306067, "learning_rate": 2.1527149195161295e-06, "loss": 0.55557108, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.23730469, "step": 8173, "time_per_iteration": 3.4278345108032227 }, { "auxiliary_loss_clip": 0.01464264, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.28873575, "balance_loss_mlp": 1.01792574, "epoch": 0.4914474673079814, "flos": 18447403766400.0, "grad_norm": 2.0262613850166713, "language_loss": 0.63419211, "learning_rate": 2.152326591972107e-06, "loss": 0.65923065, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.21643066, "step": 8174, "time_per_iteration": 2.8558428287506104 }, { "auxiliary_loss_clip": 0.01450988, "auxiliary_loss_mlp": 0.01040424, "balance_loss_clip": 1.27851462, "balance_loss_mlp": 1.01851356, "epoch": 0.49150759056064935, "flos": 21693927496320.0, "grad_norm": 1.9548159979775948, "language_loss": 0.70370489, "learning_rate": 2.1519382586518445e-06, "loss": 0.72861904, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21923828, "step": 8175, "time_per_iteration": 2.8537890911102295 }, { "auxiliary_loss_clip": 0.01451768, "auxiliary_loss_mlp": 0.01035392, "balance_loss_clip": 1.27958012, "balance_loss_mlp": 1.01520991, "epoch": 0.4915677138133173, "flos": 22392572739840.0, "grad_norm": 1.7542418646735947, "language_loss": 0.75831801, "learning_rate": 2.151549919570068e-06, "loss": 0.78318959, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.2019043, "step": 8176, "time_per_iteration": 2.894666910171509 }, { "auxiliary_loss_clip": 0.0146646, "auxiliary_loss_mlp": 0.01040553, "balance_loss_clip": 1.29082322, "balance_loss_mlp": 1.01966739, "epoch": 0.4916278370659853, "flos": 18411316398720.0, "grad_norm": 2.1143225874530023, "language_loss": 0.703098, "learning_rate": 2.1511615747415036e-06, "loss": 0.72816813, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 1.75976562, "router_z_loss_mlp": 0.2088623, "step": 8177, "time_per_iteration": 2.853842258453369 }, { "auxiliary_loss_clip": 0.01219642, "auxiliary_loss_mlp": 0.01040518, "balance_loss_clip": 1.12159252, "balance_loss_mlp": 1.01629436, "epoch": 0.49168796031865325, "flos": 66641746590720.0, "grad_norm": 0.6981189701045334, "language_loss": 0.46187276, "learning_rate": 2.150773224180877e-06, "loss": 0.48447436, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 0.98046875, "router_z_loss_mlp": 0.2421875, "step": 8178, "time_per_iteration": 3.402052164077759 }, { "auxiliary_loss_clip": 0.01481933, "auxiliary_loss_mlp": 0.01045911, "balance_loss_clip": 1.30299723, "balance_loss_mlp": 1.02495408, "epoch": 0.4917480835713212, "flos": 20969058251520.0, "grad_norm": 2.1886455170928154, "language_loss": 0.66053569, "learning_rate": 2.1503848679029147e-06, "loss": 0.68581414, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.20947266, "step": 8179, "time_per_iteration": 3.0126535892486572 }, { "auxiliary_loss_clip": 0.01462469, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.28367376, "balance_loss_mlp": 1.01648724, "epoch": 0.4918082068239892, "flos": 15779861487360.0, "grad_norm": 3.4135977935366606, "language_loss": 0.70682353, "learning_rate": 2.149996505922343e-06, "loss": 0.73183882, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.22570801, "step": 8180, "time_per_iteration": 2.8449788093566895 }, { "auxiliary_loss_clip": 0.01438055, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.26807857, "balance_loss_mlp": 1.01757669, "epoch": 0.49186833007665715, "flos": 24614669531520.0, "grad_norm": 1.6844214782023053, "language_loss": 0.84909666, "learning_rate": 2.1496081382538895e-06, "loss": 0.87387532, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.22241211, "step": 8181, "time_per_iteration": 2.9278435707092285 }, { "auxiliary_loss_clip": 0.01441335, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.27279067, "balance_loss_mlp": 1.01458967, "epoch": 0.4919284533293251, "flos": 22100616172800.0, "grad_norm": 1.9867022878885205, "language_loss": 0.73939145, "learning_rate": 2.1492197649122793e-06, "loss": 0.76416039, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.2097168, "step": 8182, "time_per_iteration": 2.8432302474975586 }, { "auxiliary_loss_clip": 0.01451735, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.27907848, "balance_loss_mlp": 1.02057958, "epoch": 0.4919885765819931, "flos": 23378559580800.0, "grad_norm": 2.3780365824590284, "language_loss": 0.73283869, "learning_rate": 2.1488313859122412e-06, "loss": 0.7577768, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21496582, "step": 8183, "time_per_iteration": 2.879854917526245 }, { "auxiliary_loss_clip": 0.01482479, "auxiliary_loss_mlp": 0.01037611, "balance_loss_clip": 1.3038528, "balance_loss_mlp": 1.01707101, "epoch": 0.49204869983466104, "flos": 21370272307200.0, "grad_norm": 1.8502872252415148, "language_loss": 0.78039777, "learning_rate": 2.1484430012685015e-06, "loss": 0.80559874, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.20556641, "step": 8184, "time_per_iteration": 2.8740339279174805 }, { "auxiliary_loss_clip": 0.0144757, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.2765063, "balance_loss_mlp": 1.01653171, "epoch": 0.492108823087329, "flos": 21152707470720.0, "grad_norm": 2.6839361956008974, "language_loss": 0.72237831, "learning_rate": 2.148054610995789e-06, "loss": 0.74722451, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.2052002, "step": 8185, "time_per_iteration": 2.8489127159118652 }, { "auxiliary_loss_clip": 0.01462393, "auxiliary_loss_mlp": 0.01043367, "balance_loss_clip": 1.2872901, "balance_loss_mlp": 1.02212358, "epoch": 0.49216894633999697, "flos": 25126860378240.0, "grad_norm": 1.8710079259843284, "language_loss": 0.75696129, "learning_rate": 2.147666215108831e-06, "loss": 0.7820189, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.21252441, "step": 8186, "time_per_iteration": 2.869615077972412 }, { "auxiliary_loss_clip": 0.01459307, "auxiliary_loss_mlp": 0.01037189, "balance_loss_clip": 1.2859478, "balance_loss_mlp": 1.01625562, "epoch": 0.49222906959266494, "flos": 22648170470400.0, "grad_norm": 2.0941665194130503, "language_loss": 0.68979287, "learning_rate": 2.1472778136223545e-06, "loss": 0.7147578, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20935059, "step": 8187, "time_per_iteration": 2.830219268798828 }, { "auxiliary_loss_clip": 0.01451611, "auxiliary_loss_mlp": 0.01041486, "balance_loss_clip": 1.27915454, "balance_loss_mlp": 1.02039814, "epoch": 0.49228919284533296, "flos": 20419739406720.0, "grad_norm": 1.7217662692687186, "language_loss": 0.67430449, "learning_rate": 2.1468894065510894e-06, "loss": 0.69923544, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.21081543, "step": 8188, "time_per_iteration": 2.8512818813323975 }, { "auxiliary_loss_clip": 0.01441359, "auxiliary_loss_mlp": 0.01038184, "balance_loss_clip": 1.26866388, "balance_loss_mlp": 1.01642799, "epoch": 0.4923493160980009, "flos": 27132885411840.0, "grad_norm": 1.9811784906404113, "language_loss": 0.7592231, "learning_rate": 2.1465009939097623e-06, "loss": 0.78401852, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.2175293, "step": 8189, "time_per_iteration": 2.908710479736328 }, { "auxiliary_loss_clip": 0.014504, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.27940679, "balance_loss_mlp": 1.01458931, "epoch": 0.4924094393506689, "flos": 35750400088320.0, "grad_norm": 1.5636197797100626, "language_loss": 0.64946198, "learning_rate": 2.146112575713104e-06, "loss": 0.6743201, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.20812988, "step": 8190, "time_per_iteration": 3.0177488327026367 }, { "auxiliary_loss_clip": 0.01456166, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.28404009, "balance_loss_mlp": 1.01512599, "epoch": 0.49246956260333685, "flos": 20422182625920.0, "grad_norm": 1.9583403535608692, "language_loss": 0.72594404, "learning_rate": 2.1457241519758413e-06, "loss": 0.75086546, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20849609, "step": 8191, "time_per_iteration": 2.850749969482422 }, { "auxiliary_loss_clip": 0.01453985, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.28094435, "balance_loss_mlp": 1.01808763, "epoch": 0.4925296858560048, "flos": 38989684650240.0, "grad_norm": 1.6686216306710615, "language_loss": 0.7268399, "learning_rate": 2.1453357227127043e-06, "loss": 0.75177109, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.21044922, "step": 8192, "time_per_iteration": 4.420319080352783 }, { "auxiliary_loss_clip": 0.01230015, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.13098443, "balance_loss_mlp": 1.01658332, "epoch": 0.4925898091086728, "flos": 64312202102400.0, "grad_norm": 0.7241045434073838, "language_loss": 0.52232754, "learning_rate": 2.1449472879384224e-06, "loss": 0.54505479, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.26171875, "step": 8193, "time_per_iteration": 3.437406539916992 }, { "auxiliary_loss_clip": 0.01445633, "auxiliary_loss_mlp": 0.01040815, "balance_loss_clip": 1.27592158, "balance_loss_mlp": 1.01902318, "epoch": 0.49264993236134075, "flos": 23045945921280.0, "grad_norm": 1.7365441093438982, "language_loss": 0.77583838, "learning_rate": 2.1445588476677246e-06, "loss": 0.80070287, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21789551, "step": 8194, "time_per_iteration": 2.8996403217315674 }, { "auxiliary_loss_clip": 0.01447805, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.27611732, "balance_loss_mlp": 1.01299524, "epoch": 0.4927100556140087, "flos": 24729130172160.0, "grad_norm": 1.991708363561768, "language_loss": 0.71441102, "learning_rate": 2.144170401915341e-06, "loss": 0.73921943, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20056152, "step": 8195, "time_per_iteration": 2.855769157409668 }, { "auxiliary_loss_clip": 0.01447798, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.2742517, "balance_loss_mlp": 1.01286554, "epoch": 0.4927701788666767, "flos": 23513905336320.0, "grad_norm": 2.1378896912552623, "language_loss": 0.81784123, "learning_rate": 2.143781950696001e-06, "loss": 0.84265399, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.20617676, "step": 8196, "time_per_iteration": 2.915947914123535 }, { "auxiliary_loss_clip": 0.01464516, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.28770912, "balance_loss_mlp": 1.01368105, "epoch": 0.49283030211934464, "flos": 22938950672640.0, "grad_norm": 1.92325499743484, "language_loss": 0.71602762, "learning_rate": 2.1433934940244356e-06, "loss": 0.74102467, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21520996, "step": 8197, "time_per_iteration": 2.9748408794403076 }, { "auxiliary_loss_clip": 0.01439091, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.26834989, "balance_loss_mlp": 1.01488876, "epoch": 0.4928904253720126, "flos": 16881711557760.0, "grad_norm": 2.0572289461413114, "language_loss": 0.86017811, "learning_rate": 2.143005031915374e-06, "loss": 0.88492942, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21154785, "step": 8198, "time_per_iteration": 2.8543999195098877 }, { "auxiliary_loss_clip": 0.01467948, "auxiliary_loss_mlp": 0.01039683, "balance_loss_clip": 1.29027474, "balance_loss_mlp": 1.0183084, "epoch": 0.4929505486246806, "flos": 14874781628160.0, "grad_norm": 1.8676249686351547, "language_loss": 0.76783425, "learning_rate": 2.1426165643835467e-06, "loss": 0.79291052, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 1.77441406, "router_z_loss_mlp": 0.21386719, "step": 8199, "time_per_iteration": 2.878070116043091 }, { "auxiliary_loss_clip": 0.01454815, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.27903676, "balance_loss_mlp": 1.01534605, "epoch": 0.49301067187734854, "flos": 23852989002240.0, "grad_norm": 1.872316379202134, "language_loss": 0.60379112, "learning_rate": 2.1422280914436864e-06, "loss": 0.62871027, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.21740723, "step": 8200, "time_per_iteration": 2.8904125690460205 }, { "auxiliary_loss_clip": 0.01428716, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.26236987, "balance_loss_mlp": 1.01929808, "epoch": 0.49307079513001656, "flos": 22501287290880.0, "grad_norm": 1.5489372804232813, "language_loss": 0.79561245, "learning_rate": 2.1418396131105213e-06, "loss": 0.82030165, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20922852, "step": 8201, "time_per_iteration": 2.920135021209717 }, { "auxiliary_loss_clip": 0.01473095, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.2928232, "balance_loss_mlp": 1.01708508, "epoch": 0.4931309183826845, "flos": 15933078938880.0, "grad_norm": 1.9891450096530265, "language_loss": 0.68311822, "learning_rate": 2.141451129398785e-06, "loss": 0.70823509, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 1.80273438, "router_z_loss_mlp": 0.21520996, "step": 8202, "time_per_iteration": 4.269949436187744 }, { "auxiliary_loss_clip": 0.01441389, "auxiliary_loss_mlp": 0.0103527, "balance_loss_clip": 1.26808083, "balance_loss_mlp": 1.01370549, "epoch": 0.4931910416353525, "flos": 27320742397440.0, "grad_norm": 1.9923308273112112, "language_loss": 0.76327473, "learning_rate": 2.1410626403232076e-06, "loss": 0.78804135, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21569824, "step": 8203, "time_per_iteration": 5.720124006271362 }, { "auxiliary_loss_clip": 0.01456064, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.28020239, "balance_loss_mlp": 1.01694953, "epoch": 0.49325116488802045, "flos": 20813940518400.0, "grad_norm": 1.9742443046693592, "language_loss": 0.81781983, "learning_rate": 2.1406741458985197e-06, "loss": 0.84277201, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.2220459, "step": 8204, "time_per_iteration": 2.9217777252197266 }, { "auxiliary_loss_clip": 0.01441062, "auxiliary_loss_mlp": 0.01040396, "balance_loss_clip": 1.2687397, "balance_loss_mlp": 1.01902175, "epoch": 0.4933112881406884, "flos": 19875714203520.0, "grad_norm": 4.330394288741532, "language_loss": 0.6778729, "learning_rate": 2.140285646139455e-06, "loss": 0.7026875, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21362305, "step": 8205, "time_per_iteration": 2.8415639400482178 }, { "auxiliary_loss_clip": 0.01460482, "auxiliary_loss_mlp": 0.01042365, "balance_loss_clip": 1.28115368, "balance_loss_mlp": 1.01934564, "epoch": 0.4933714113933564, "flos": 21836105216640.0, "grad_norm": 1.968802761724132, "language_loss": 0.67040551, "learning_rate": 2.139897141060744e-06, "loss": 0.69543397, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.23010254, "step": 8206, "time_per_iteration": 2.8661131858825684 }, { "auxiliary_loss_clip": 0.01459445, "auxiliary_loss_mlp": 0.01036052, "balance_loss_clip": 1.2834723, "balance_loss_mlp": 1.01532114, "epoch": 0.49343153464602435, "flos": 27901262171520.0, "grad_norm": 2.1651463115234137, "language_loss": 0.77686775, "learning_rate": 2.1395086306771196e-06, "loss": 0.80182278, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.20715332, "step": 8207, "time_per_iteration": 2.8913066387176514 }, { "auxiliary_loss_clip": 0.01442315, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.26901376, "balance_loss_mlp": 1.01463687, "epoch": 0.4934916578986923, "flos": 24691821194880.0, "grad_norm": 2.189598542845242, "language_loss": 0.61461997, "learning_rate": 2.1391201150033147e-06, "loss": 0.63941574, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.22607422, "step": 8208, "time_per_iteration": 2.8725881576538086 }, { "auxiliary_loss_clip": 0.01446477, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.27137899, "balance_loss_mlp": 1.01620591, "epoch": 0.4935517811513603, "flos": 23415823313280.0, "grad_norm": 1.6282770113750196, "language_loss": 0.79234123, "learning_rate": 2.1387315940540598e-06, "loss": 0.81719315, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.22497559, "step": 8209, "time_per_iteration": 2.850533962249756 }, { "auxiliary_loss_clip": 0.01436662, "auxiliary_loss_mlp": 0.0103948, "balance_loss_clip": 1.26424003, "balance_loss_mlp": 1.01710474, "epoch": 0.49361190440402825, "flos": 21954502154880.0, "grad_norm": 1.82626551194719, "language_loss": 0.79803318, "learning_rate": 2.138343067844089e-06, "loss": 0.82279462, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.22375488, "step": 8210, "time_per_iteration": 2.8450286388397217 }, { "auxiliary_loss_clip": 0.01456005, "auxiliary_loss_mlp": 0.01038848, "balance_loss_clip": 1.27743852, "balance_loss_mlp": 1.01665151, "epoch": 0.4936720276566962, "flos": 25125684013440.0, "grad_norm": 1.6947884002719207, "language_loss": 0.81567526, "learning_rate": 2.1379545363881363e-06, "loss": 0.84062374, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.22192383, "step": 8211, "time_per_iteration": 2.8857808113098145 }, { "auxiliary_loss_clip": 0.01451204, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.27582943, "balance_loss_mlp": 1.01271188, "epoch": 0.4937321509093642, "flos": 26369983272960.0, "grad_norm": 2.7305029919752286, "language_loss": 0.93177354, "learning_rate": 2.137565999700933e-06, "loss": 0.95663577, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.22302246, "step": 8212, "time_per_iteration": 2.910034418106079 }, { "auxiliary_loss_clip": 0.01445364, "auxiliary_loss_mlp": 0.01036514, "balance_loss_clip": 1.27105355, "balance_loss_mlp": 1.0159384, "epoch": 0.49379227416203214, "flos": 22970423070720.0, "grad_norm": 1.749143286021176, "language_loss": 0.6567601, "learning_rate": 2.1371774577972138e-06, "loss": 0.68157887, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.20568848, "step": 8213, "time_per_iteration": 2.886085271835327 }, { "auxiliary_loss_clip": 0.01440133, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.26711988, "balance_loss_mlp": 1.01524508, "epoch": 0.49385239741470016, "flos": 32501885587200.0, "grad_norm": 1.899885506491966, "language_loss": 0.76730549, "learning_rate": 2.136788910691711e-06, "loss": 0.79206985, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.21057129, "step": 8214, "time_per_iteration": 2.935310125350952 }, { "auxiliary_loss_clip": 0.01448748, "auxiliary_loss_mlp": 0.0104081, "balance_loss_clip": 1.27502108, "balance_loss_mlp": 1.01875675, "epoch": 0.4939125206673681, "flos": 22503278062080.0, "grad_norm": 2.002967409918675, "language_loss": 0.84835982, "learning_rate": 2.1364003583991594e-06, "loss": 0.87325537, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.22070312, "step": 8215, "time_per_iteration": 2.858553886413574 }, { "auxiliary_loss_clip": 0.01419376, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.25308371, "balance_loss_mlp": 1.01429796, "epoch": 0.4939726439200361, "flos": 31188985931520.0, "grad_norm": 1.513193717763, "language_loss": 0.83842307, "learning_rate": 2.136011800934292e-06, "loss": 0.86297929, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.21936035, "step": 8216, "time_per_iteration": 3.0113375186920166 }, { "auxiliary_loss_clip": 0.01435548, "auxiliary_loss_mlp": 0.01042512, "balance_loss_clip": 1.26501751, "balance_loss_mlp": 1.01925397, "epoch": 0.49403276717270406, "flos": 22684393572480.0, "grad_norm": 1.4390454668871435, "language_loss": 0.75405836, "learning_rate": 2.1356232383118442e-06, "loss": 0.77883893, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.2322998, "step": 8217, "time_per_iteration": 2.9483742713928223 }, { "auxiliary_loss_clip": 0.01445431, "auxiliary_loss_mlp": 0.01038973, "balance_loss_clip": 1.27547812, "balance_loss_mlp": 1.01662123, "epoch": 0.494092890425372, "flos": 20750905232640.0, "grad_norm": 1.6052079283455736, "language_loss": 0.79387152, "learning_rate": 2.1352346705465494e-06, "loss": 0.81871557, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22363281, "step": 8218, "time_per_iteration": 2.91963529586792 }, { "auxiliary_loss_clip": 0.01436509, "auxiliary_loss_mlp": 0.01041804, "balance_loss_clip": 1.26696587, "balance_loss_mlp": 1.02052486, "epoch": 0.49415301367804, "flos": 18378124698240.0, "grad_norm": 2.4575384050721443, "language_loss": 0.7698158, "learning_rate": 2.134846097653142e-06, "loss": 0.79459882, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21276855, "step": 8219, "time_per_iteration": 2.8443048000335693 }, { "auxiliary_loss_clip": 0.01449304, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.27658546, "balance_loss_mlp": 1.01671481, "epoch": 0.49421313693070795, "flos": 17539699708800.0, "grad_norm": 1.6612431482513117, "language_loss": 0.63228023, "learning_rate": 2.134457519646357e-06, "loss": 0.65716422, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.22375488, "step": 8220, "time_per_iteration": 2.8849024772644043 }, { "auxiliary_loss_clip": 0.01446179, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.27391446, "balance_loss_mlp": 1.01545227, "epoch": 0.4942732601833759, "flos": 20821541644800.0, "grad_norm": 1.724176438696826, "language_loss": 0.73322552, "learning_rate": 2.1340689365409296e-06, "loss": 0.7580539, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.21203613, "step": 8221, "time_per_iteration": 2.879103422164917 }, { "auxiliary_loss_clip": 0.01441856, "auxiliary_loss_mlp": 0.01044353, "balance_loss_clip": 1.27271998, "balance_loss_mlp": 1.02347994, "epoch": 0.4943333834360439, "flos": 15057209237760.0, "grad_norm": 2.2122399059870372, "language_loss": 0.80377233, "learning_rate": 2.133680348351595e-06, "loss": 0.82863444, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.2088623, "step": 8222, "time_per_iteration": 2.9110918045043945 }, { "auxiliary_loss_clip": 0.01459338, "auxiliary_loss_mlp": 0.01043601, "balance_loss_clip": 1.28657675, "balance_loss_mlp": 1.02110648, "epoch": 0.49439350668871185, "flos": 16078921488000.0, "grad_norm": 2.269909493117277, "language_loss": 0.73425663, "learning_rate": 2.133291755093088e-06, "loss": 0.75928605, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22497559, "step": 8223, "time_per_iteration": 2.773883104324341 }, { "auxiliary_loss_clip": 0.01459208, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.28335881, "balance_loss_mlp": 1.02331066, "epoch": 0.4944536299413798, "flos": 20888151269760.0, "grad_norm": 1.7888037147271982, "language_loss": 0.75995475, "learning_rate": 2.132903156780144e-06, "loss": 0.78499544, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.21520996, "step": 8224, "time_per_iteration": 2.856503963470459 }, { "auxiliary_loss_clip": 0.01455379, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.28197038, "balance_loss_mlp": 1.0214653, "epoch": 0.4945137531940478, "flos": 26618613304320.0, "grad_norm": 6.956542818011109, "language_loss": 0.64806193, "learning_rate": 2.1325145534274997e-06, "loss": 0.67304778, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21740723, "step": 8225, "time_per_iteration": 2.896453380584717 }, { "auxiliary_loss_clip": 0.01459352, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.28432965, "balance_loss_mlp": 1.0180583, "epoch": 0.49457387644671574, "flos": 23998786306560.0, "grad_norm": 1.8964438747703498, "language_loss": 0.77447039, "learning_rate": 2.1321259450498893e-06, "loss": 0.79945201, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.20751953, "step": 8226, "time_per_iteration": 2.8432672023773193 }, { "auxiliary_loss_clip": 0.01449, "auxiliary_loss_mlp": 0.01040559, "balance_loss_clip": 1.27474332, "balance_loss_mlp": 1.01795745, "epoch": 0.49463399969938376, "flos": 26987133352320.0, "grad_norm": 1.6948962462172361, "language_loss": 0.71732152, "learning_rate": 2.131737331662051e-06, "loss": 0.74221718, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.22595215, "step": 8227, "time_per_iteration": 4.3562023639678955 }, { "auxiliary_loss_clip": 0.01470781, "auxiliary_loss_mlp": 0.01041908, "balance_loss_clip": 1.29303312, "balance_loss_mlp": 1.02123666, "epoch": 0.49469412295205173, "flos": 29694065869440.0, "grad_norm": 1.7121214570588066, "language_loss": 0.72287858, "learning_rate": 2.131348713278718e-06, "loss": 0.74800551, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 1.77734375, "router_z_loss_mlp": 0.20678711, "step": 8228, "time_per_iteration": 2.9402685165405273 }, { "auxiliary_loss_clip": 0.01436251, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.26724005, "balance_loss_mlp": 1.01475883, "epoch": 0.4947542462047197, "flos": 24141868922880.0, "grad_norm": 1.439214482718858, "language_loss": 0.84433842, "learning_rate": 2.1309600899146304e-06, "loss": 0.86906397, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21533203, "step": 8229, "time_per_iteration": 2.858778238296509 }, { "auxiliary_loss_clip": 0.01456885, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.28232551, "balance_loss_mlp": 1.01530468, "epoch": 0.49481436945738766, "flos": 20053979291520.0, "grad_norm": 1.7897235968712297, "language_loss": 0.75671721, "learning_rate": 2.1305714615845227e-06, "loss": 0.78165066, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.21166992, "step": 8230, "time_per_iteration": 2.8566603660583496 }, { "auxiliary_loss_clip": 0.01456711, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.28332138, "balance_loss_mlp": 1.01268864, "epoch": 0.4948744927100556, "flos": 15678069390720.0, "grad_norm": 3.2115603950343643, "language_loss": 0.80933547, "learning_rate": 2.1301828283031314e-06, "loss": 0.83422792, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.19848633, "step": 8231, "time_per_iteration": 2.845608949661255 }, { "auxiliary_loss_clip": 0.01232457, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.13180113, "balance_loss_mlp": 1.02227461, "epoch": 0.4949346159627236, "flos": 68905495860480.0, "grad_norm": 0.8370235975750995, "language_loss": 0.60239446, "learning_rate": 2.1297941900851944e-06, "loss": 0.62518781, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.24511719, "step": 8232, "time_per_iteration": 3.5098164081573486 }, { "auxiliary_loss_clip": 0.01465735, "auxiliary_loss_mlp": 0.0103662, "balance_loss_clip": 1.28680158, "balance_loss_mlp": 1.01397038, "epoch": 0.49499473921539155, "flos": 24800988193920.0, "grad_norm": 1.8740165854746917, "language_loss": 0.69920343, "learning_rate": 2.1294055469454496e-06, "loss": 0.72422701, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 1.7890625, "router_z_loss_mlp": 0.2265625, "step": 8233, "time_per_iteration": 3.0036284923553467 }, { "auxiliary_loss_clip": 0.01432608, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.26357293, "balance_loss_mlp": 1.01430571, "epoch": 0.4950548624680595, "flos": 32720717278080.0, "grad_norm": 2.412614600127574, "language_loss": 0.68116248, "learning_rate": 2.129016898898633e-06, "loss": 0.70585108, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21936035, "step": 8234, "time_per_iteration": 2.9526586532592773 }, { "auxiliary_loss_clip": 0.01232414, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.13321161, "balance_loss_mlp": 1.00248647, "epoch": 0.4951149857207275, "flos": 50110791857280.0, "grad_norm": 0.7934942557027935, "language_loss": 0.58026052, "learning_rate": 2.128628245959482e-06, "loss": 0.60286129, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.25195312, "step": 8235, "time_per_iteration": 3.2508633136749268 }, { "auxiliary_loss_clip": 0.01456037, "auxiliary_loss_mlp": 0.01039209, "balance_loss_clip": 1.28156972, "balance_loss_mlp": 1.01820433, "epoch": 0.49517510897339545, "flos": 22246956414720.0, "grad_norm": 1.7113596090754408, "language_loss": 0.78627646, "learning_rate": 2.1282395881427355e-06, "loss": 0.81122887, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21008301, "step": 8236, "time_per_iteration": 2.8552067279815674 }, { "auxiliary_loss_clip": 0.01436008, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.26663017, "balance_loss_mlp": 1.01470041, "epoch": 0.4952352322260634, "flos": 25384991817600.0, "grad_norm": 2.298913212659366, "language_loss": 0.73657691, "learning_rate": 2.1278509254631315e-06, "loss": 0.76129591, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.21191406, "step": 8237, "time_per_iteration": 4.280331134796143 }, { "auxiliary_loss_clip": 0.01457836, "auxiliary_loss_mlp": 0.01039912, "balance_loss_clip": 1.28733587, "balance_loss_mlp": 1.01841879, "epoch": 0.4952953554787314, "flos": 24619601214720.0, "grad_norm": 1.6505000943191281, "language_loss": 0.76647663, "learning_rate": 2.127462257935406e-06, "loss": 0.7914542, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21496582, "step": 8238, "time_per_iteration": 4.320410251617432 }, { "auxiliary_loss_clip": 0.01453883, "auxiliary_loss_mlp": 0.01040762, "balance_loss_clip": 1.2792784, "balance_loss_mlp": 1.01905406, "epoch": 0.49535547873139935, "flos": 17320189345920.0, "grad_norm": 3.786254977699487, "language_loss": 0.75076181, "learning_rate": 2.1270735855743008e-06, "loss": 0.77570832, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21716309, "step": 8239, "time_per_iteration": 4.280998706817627 }, { "auxiliary_loss_clip": 0.014474, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.27315426, "balance_loss_mlp": 1.01463294, "epoch": 0.4954156019840673, "flos": 20749955091840.0, "grad_norm": 2.471581904142579, "language_loss": 0.79762912, "learning_rate": 2.126684908394552e-06, "loss": 0.82247323, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.22363281, "step": 8240, "time_per_iteration": 2.8616950511932373 }, { "auxiliary_loss_clip": 0.01440133, "auxiliary_loss_mlp": 0.01037914, "balance_loss_clip": 1.2710464, "balance_loss_mlp": 1.01667082, "epoch": 0.49547572523673533, "flos": 12827692298880.0, "grad_norm": 2.0043229767452444, "language_loss": 0.86245334, "learning_rate": 2.126296226410898e-06, "loss": 0.88723385, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21228027, "step": 8241, "time_per_iteration": 2.797053813934326 }, { "auxiliary_loss_clip": 0.01435311, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.26796055, "balance_loss_mlp": 1.01708472, "epoch": 0.4955358484894033, "flos": 15605668431360.0, "grad_norm": 2.2401795036012992, "language_loss": 0.78386188, "learning_rate": 2.1259075396380794e-06, "loss": 0.80858833, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20251465, "step": 8242, "time_per_iteration": 2.8272273540496826 }, { "auxiliary_loss_clip": 0.0144507, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.27438462, "balance_loss_mlp": 1.01553273, "epoch": 0.49559597174207126, "flos": 26474399568000.0, "grad_norm": 2.6910323777114367, "language_loss": 0.67757714, "learning_rate": 2.125518848090833e-06, "loss": 0.70240051, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21740723, "step": 8243, "time_per_iteration": 2.8541581630706787 }, { "auxiliary_loss_clip": 0.01446235, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.27527487, "balance_loss_mlp": 1.01801074, "epoch": 0.4956560949947392, "flos": 23158461035520.0, "grad_norm": 2.6783266863319204, "language_loss": 0.69245607, "learning_rate": 2.125130151783901e-06, "loss": 0.71730644, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.20788574, "step": 8244, "time_per_iteration": 2.871535062789917 }, { "auxiliary_loss_clip": 0.01444475, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.27150297, "balance_loss_mlp": 1.01747715, "epoch": 0.4957162182474072, "flos": 20782241896320.0, "grad_norm": 2.306718478188363, "language_loss": 0.75846064, "learning_rate": 2.12474145073202e-06, "loss": 0.78329867, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.21862793, "step": 8245, "time_per_iteration": 2.980043888092041 }, { "auxiliary_loss_clip": 0.01438106, "auxiliary_loss_mlp": 0.01042818, "balance_loss_clip": 1.27001309, "balance_loss_mlp": 1.02058589, "epoch": 0.49577634150007516, "flos": 18743477610240.0, "grad_norm": 2.0239142419883063, "language_loss": 0.82741129, "learning_rate": 2.1243527449499306e-06, "loss": 0.85222054, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.22241211, "step": 8246, "time_per_iteration": 2.9433393478393555 }, { "auxiliary_loss_clip": 0.01456667, "auxiliary_loss_mlp": 0.0103972, "balance_loss_clip": 1.28215802, "balance_loss_mlp": 1.01841772, "epoch": 0.4958364647527431, "flos": 25564478515200.0, "grad_norm": 1.7436618700432225, "language_loss": 0.85021126, "learning_rate": 2.1239640344523733e-06, "loss": 0.87517512, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.2130127, "step": 8247, "time_per_iteration": 3.0296268463134766 }, { "auxiliary_loss_clip": 0.01458263, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.28421545, "balance_loss_mlp": 1.01915538, "epoch": 0.4958965880054111, "flos": 24435906750720.0, "grad_norm": 2.1832910518651265, "language_loss": 0.83970106, "learning_rate": 2.123575319254087e-06, "loss": 0.86467814, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20300293, "step": 8248, "time_per_iteration": 2.8678877353668213 }, { "auxiliary_loss_clip": 0.01450137, "auxiliary_loss_mlp": 0.01038513, "balance_loss_clip": 1.27632558, "balance_loss_mlp": 1.01599491, "epoch": 0.49595671125807905, "flos": 25094573573760.0, "grad_norm": 1.870974259656413, "language_loss": 0.7437439, "learning_rate": 2.123186599369812e-06, "loss": 0.76863045, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.2253418, "step": 8249, "time_per_iteration": 2.88061261177063 }, { "auxiliary_loss_clip": 0.01464396, "auxiliary_loss_mlp": 0.0104353, "balance_loss_clip": 1.28953266, "balance_loss_mlp": 1.02164292, "epoch": 0.496016834510747, "flos": 16444636358400.0, "grad_norm": 1.7453559720161278, "language_loss": 0.76261747, "learning_rate": 2.122797874814289e-06, "loss": 0.78769672, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21862793, "step": 8250, "time_per_iteration": 2.827531099319458 }, { "auxiliary_loss_clip": 0.01474998, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.29934514, "balance_loss_mlp": 1.01960695, "epoch": 0.496076957763415, "flos": 23447657669760.0, "grad_norm": 2.041604718086462, "language_loss": 0.70400172, "learning_rate": 2.1224091456022585e-06, "loss": 0.72915912, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.21118164, "step": 8251, "time_per_iteration": 2.856873035430908 }, { "auxiliary_loss_clip": 0.01449576, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.27898145, "balance_loss_mlp": 1.01193869, "epoch": 0.49613708101608295, "flos": 16918206128640.0, "grad_norm": 1.7902495305275334, "language_loss": 0.80099756, "learning_rate": 2.122020411748461e-06, "loss": 0.8258127, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19995117, "step": 8252, "time_per_iteration": 2.821040153503418 }, { "auxiliary_loss_clip": 0.0144719, "auxiliary_loss_mlp": 0.01033769, "balance_loss_clip": 1.27888441, "balance_loss_mlp": 1.01239514, "epoch": 0.4961972042687509, "flos": 16626837744000.0, "grad_norm": 1.7331384377033097, "language_loss": 0.82229441, "learning_rate": 2.1216316732676363e-06, "loss": 0.84710401, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21386719, "step": 8253, "time_per_iteration": 2.8722009658813477 }, { "auxiliary_loss_clip": 0.01441682, "auxiliary_loss_mlp": 0.01036278, "balance_loss_clip": 1.27177858, "balance_loss_mlp": 1.01583314, "epoch": 0.49625732752141893, "flos": 28970372989440.0, "grad_norm": 1.6056117500591456, "language_loss": 0.67625052, "learning_rate": 2.1212429301745275e-06, "loss": 0.70103008, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.2043457, "step": 8254, "time_per_iteration": 2.951997995376587 }, { "auxiliary_loss_clip": 0.01454627, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.28163397, "balance_loss_mlp": 1.01631784, "epoch": 0.4963174507740869, "flos": 23122735626240.0, "grad_norm": 2.47012597429492, "language_loss": 0.74861062, "learning_rate": 2.1208541824838743e-06, "loss": 0.77352309, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20288086, "step": 8255, "time_per_iteration": 2.8855559825897217 }, { "auxiliary_loss_clip": 0.01433998, "auxiliary_loss_mlp": 0.01040401, "balance_loss_clip": 1.2641151, "balance_loss_mlp": 1.01895475, "epoch": 0.49637757402675486, "flos": 13925832295680.0, "grad_norm": 1.7130947821587992, "language_loss": 0.82077241, "learning_rate": 2.1204654302104183e-06, "loss": 0.84551638, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21435547, "step": 8256, "time_per_iteration": 2.819978952407837 }, { "auxiliary_loss_clip": 0.01435752, "auxiliary_loss_mlp": 0.01034292, "balance_loss_clip": 1.2685504, "balance_loss_mlp": 1.01465797, "epoch": 0.49643769727942283, "flos": 22318995415680.0, "grad_norm": 1.8008453746631123, "language_loss": 0.81647611, "learning_rate": 2.120076673368901e-06, "loss": 0.84117651, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19628906, "step": 8257, "time_per_iteration": 2.8695476055145264 }, { "auxiliary_loss_clip": 0.01476536, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.29646826, "balance_loss_mlp": 1.01595938, "epoch": 0.4964978205320908, "flos": 19509682619520.0, "grad_norm": 1.8755350827063946, "language_loss": 0.66822338, "learning_rate": 2.1196879119740647e-06, "loss": 0.69335926, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 1.80175781, "router_z_loss_mlp": 0.2109375, "step": 8258, "time_per_iteration": 2.864656448364258 }, { "auxiliary_loss_clip": 0.01427335, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.26166916, "balance_loss_mlp": 1.01373577, "epoch": 0.49655794378475876, "flos": 23445893122560.0, "grad_norm": 1.4900845845911228, "language_loss": 0.78127182, "learning_rate": 2.1192991460406502e-06, "loss": 0.8058908, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20825195, "step": 8259, "time_per_iteration": 2.9319028854370117 }, { "auxiliary_loss_clip": 0.01444747, "auxiliary_loss_mlp": 0.01036353, "balance_loss_clip": 1.27619994, "balance_loss_mlp": 1.01663613, "epoch": 0.4966180670374267, "flos": 26841743251200.0, "grad_norm": 1.3942112317785031, "language_loss": 0.79225117, "learning_rate": 2.1189103755834e-06, "loss": 0.81706214, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19714355, "step": 8260, "time_per_iteration": 2.873436450958252 }, { "auxiliary_loss_clip": 0.01447524, "auxiliary_loss_mlp": 0.01033112, "balance_loss_clip": 1.27538347, "balance_loss_mlp": 1.01239347, "epoch": 0.4966781902900947, "flos": 22017220727040.0, "grad_norm": 4.192326549138425, "language_loss": 0.77176088, "learning_rate": 2.1185216006170573e-06, "loss": 0.79656726, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20703125, "step": 8261, "time_per_iteration": 2.8344788551330566 }, { "auxiliary_loss_clip": 0.01422316, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.25636733, "balance_loss_mlp": 1.01667953, "epoch": 0.49673831354276266, "flos": 26224457437440.0, "grad_norm": 1.9080778504008375, "language_loss": 0.90132582, "learning_rate": 2.1181328211563627e-06, "loss": 0.92591095, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19506836, "step": 8262, "time_per_iteration": 4.348931312561035 }, { "auxiliary_loss_clip": 0.01426966, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.2619108, "balance_loss_mlp": 1.01273108, "epoch": 0.4967984367954306, "flos": 23191878960000.0, "grad_norm": 1.5031936255966605, "language_loss": 0.74680853, "learning_rate": 2.11774403721606e-06, "loss": 0.77140856, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.203125, "step": 8263, "time_per_iteration": 2.8238422870635986 }, { "auxiliary_loss_clip": 0.01456706, "auxiliary_loss_mlp": 0.01036012, "balance_loss_clip": 1.28397548, "balance_loss_mlp": 1.01456594, "epoch": 0.4968585600480986, "flos": 19290850928640.0, "grad_norm": 2.099511989918052, "language_loss": 0.70824254, "learning_rate": 2.1173552488108923e-06, "loss": 0.73316967, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21447754, "step": 8264, "time_per_iteration": 2.853356122970581 }, { "auxiliary_loss_clip": 0.01451987, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.27845979, "balance_loss_mlp": 1.01362562, "epoch": 0.49691868330076655, "flos": 22538958226560.0, "grad_norm": 1.3277344961355184, "language_loss": 0.6518262, "learning_rate": 2.1169664559556007e-06, "loss": 0.67668021, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.19787598, "step": 8265, "time_per_iteration": 3.0164215564727783 }, { "auxiliary_loss_clip": 0.01255557, "auxiliary_loss_mlp": 0.01050967, "balance_loss_clip": 1.15312171, "balance_loss_mlp": 1.02903259, "epoch": 0.4969788065534345, "flos": 66610907619840.0, "grad_norm": 0.8393662098591155, "language_loss": 0.53520167, "learning_rate": 2.1165776586649304e-06, "loss": 0.558267, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 1.0234375, "router_z_loss_mlp": 0.21972656, "step": 8266, "time_per_iteration": 3.4696285724639893 }, { "auxiliary_loss_clip": 0.01419455, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.25695491, "balance_loss_mlp": 1.01554549, "epoch": 0.49703892980610254, "flos": 24069739432320.0, "grad_norm": 1.6257641741883453, "language_loss": 0.80075932, "learning_rate": 2.1161888569536223e-06, "loss": 0.82532275, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.21337891, "step": 8267, "time_per_iteration": 2.9869699478149414 }, { "auxiliary_loss_clip": 0.01441554, "auxiliary_loss_mlp": 0.01037291, "balance_loss_clip": 1.27074361, "balance_loss_mlp": 1.01572561, "epoch": 0.4970990530587705, "flos": 29136467226240.0, "grad_norm": 2.45937698245482, "language_loss": 0.76137316, "learning_rate": 2.1158000508364223e-06, "loss": 0.78616166, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21569824, "step": 8268, "time_per_iteration": 2.99566912651062 }, { "auxiliary_loss_clip": 0.01435114, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.26569617, "balance_loss_mlp": 1.01467836, "epoch": 0.49715917631143847, "flos": 46042683482880.0, "grad_norm": 1.8401971237031238, "language_loss": 0.68623108, "learning_rate": 2.115411240328073e-06, "loss": 0.71093833, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20935059, "step": 8269, "time_per_iteration": 3.1610946655273438 }, { "auxiliary_loss_clip": 0.0141601, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.25186265, "balance_loss_mlp": 1.01035118, "epoch": 0.49721929956410643, "flos": 20200319533440.0, "grad_norm": 2.3343443519380607, "language_loss": 0.86134446, "learning_rate": 2.1150224254433167e-06, "loss": 0.88581491, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20666504, "step": 8270, "time_per_iteration": 2.8996567726135254 }, { "auxiliary_loss_clip": 0.01446137, "auxiliary_loss_mlp": 0.01036859, "balance_loss_clip": 1.27500355, "balance_loss_mlp": 1.01708269, "epoch": 0.4972794228167744, "flos": 21663721952640.0, "grad_norm": 1.675690900753053, "language_loss": 0.7135663, "learning_rate": 2.114633606196899e-06, "loss": 0.73839629, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19787598, "step": 8271, "time_per_iteration": 2.926745891571045 }, { "auxiliary_loss_clip": 0.01440679, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.27150142, "balance_loss_mlp": 1.01607323, "epoch": 0.49733954606944236, "flos": 24290380915200.0, "grad_norm": 1.375068049414241, "language_loss": 0.7872116, "learning_rate": 2.1142447826035635e-06, "loss": 0.8119784, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.19934082, "step": 8272, "time_per_iteration": 4.388345241546631 }, { "auxiliary_loss_clip": 0.01435361, "auxiliary_loss_mlp": 0.01038919, "balance_loss_clip": 1.26566136, "balance_loss_mlp": 1.01859379, "epoch": 0.4973996693221103, "flos": 37866768485760.0, "grad_norm": 1.9544743999724805, "language_loss": 0.6745559, "learning_rate": 2.1138559546780544e-06, "loss": 0.69929862, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.203125, "step": 8273, "time_per_iteration": 4.518434286117554 }, { "auxiliary_loss_clip": 0.01444345, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.27587342, "balance_loss_mlp": 1.01529646, "epoch": 0.4974597925747783, "flos": 21371448672000.0, "grad_norm": 1.7316856689701627, "language_loss": 0.79017603, "learning_rate": 2.1134671224351163e-06, "loss": 0.81496489, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19250488, "step": 8274, "time_per_iteration": 3.0537285804748535 }, { "auxiliary_loss_clip": 0.0144434, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.27313995, "balance_loss_mlp": 1.01686776, "epoch": 0.49751991582744626, "flos": 30750553388160.0, "grad_norm": 1.783096844158929, "language_loss": 0.76313674, "learning_rate": 2.113078285889493e-06, "loss": 0.78795213, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20336914, "step": 8275, "time_per_iteration": 2.997716188430786 }, { "auxiliary_loss_clip": 0.0143992, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.26839519, "balance_loss_mlp": 1.01742351, "epoch": 0.4975800390801142, "flos": 14108621863680.0, "grad_norm": 3.7241392144646808, "language_loss": 0.84945071, "learning_rate": 2.1126894450559303e-06, "loss": 0.8742336, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20947266, "step": 8276, "time_per_iteration": 2.881439208984375 }, { "auxiliary_loss_clip": 0.01411302, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.24922705, "balance_loss_mlp": 1.01431727, "epoch": 0.4976401623327822, "flos": 24218025200640.0, "grad_norm": 2.6650933845193134, "language_loss": 0.70704055, "learning_rate": 2.112300599949172e-06, "loss": 0.73149103, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19421387, "step": 8277, "time_per_iteration": 2.848639726638794 }, { "auxiliary_loss_clip": 0.01421986, "auxiliary_loss_mlp": 0.0103462, "balance_loss_clip": 1.25673127, "balance_loss_mlp": 1.0144738, "epoch": 0.49770028558545015, "flos": 21145242078720.0, "grad_norm": 1.8007828876973913, "language_loss": 0.82945389, "learning_rate": 2.111911750583964e-06, "loss": 0.85401994, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20141602, "step": 8278, "time_per_iteration": 2.899932861328125 }, { "auxiliary_loss_clip": 0.01442089, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.27081096, "balance_loss_mlp": 1.0185008, "epoch": 0.4977604088381181, "flos": 16772680293120.0, "grad_norm": 2.6748067334441394, "language_loss": 0.68616974, "learning_rate": 2.111522896975052e-06, "loss": 0.71097928, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.20349121, "step": 8279, "time_per_iteration": 2.8194363117218018 }, { "auxiliary_loss_clip": 0.01425069, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.25653839, "balance_loss_mlp": 1.01403987, "epoch": 0.49782053209078614, "flos": 15711396825600.0, "grad_norm": 2.0199987144363147, "language_loss": 0.71132362, "learning_rate": 2.1111340391371794e-06, "loss": 0.73593497, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.22033691, "step": 8280, "time_per_iteration": 2.858469247817993 }, { "auxiliary_loss_clip": 0.01429725, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.26182532, "balance_loss_mlp": 1.01289487, "epoch": 0.4978806553434541, "flos": 24764357888640.0, "grad_norm": 1.6703510521572618, "language_loss": 0.65404719, "learning_rate": 2.1107451770850936e-06, "loss": 0.67868614, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21289062, "step": 8281, "time_per_iteration": 2.8670592308044434 }, { "auxiliary_loss_clip": 0.01442373, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.2708981, "balance_loss_mlp": 1.01267481, "epoch": 0.49794077859612207, "flos": 13123630408320.0, "grad_norm": 1.9080773403449358, "language_loss": 0.74080968, "learning_rate": 2.1103563108335387e-06, "loss": 0.76557618, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21618652, "step": 8282, "time_per_iteration": 2.8007819652557373 }, { "auxiliary_loss_clip": 0.01428167, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.26124001, "balance_loss_mlp": 1.01644051, "epoch": 0.49800090184879003, "flos": 27536135483520.0, "grad_norm": 1.60909637287516, "language_loss": 0.7381171, "learning_rate": 2.109967440397263e-06, "loss": 0.7627579, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19470215, "step": 8283, "time_per_iteration": 2.9009766578674316 }, { "auxiliary_loss_clip": 0.01416575, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.2510376, "balance_loss_mlp": 1.01490188, "epoch": 0.498061025101458, "flos": 19802091634560.0, "grad_norm": 1.5658581064192356, "language_loss": 0.7998445, "learning_rate": 2.1095785657910095e-06, "loss": 0.82436824, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20910645, "step": 8284, "time_per_iteration": 2.826444387435913 }, { "auxiliary_loss_clip": 0.01445208, "auxiliary_loss_mlp": 0.01039723, "balance_loss_clip": 1.27169049, "balance_loss_mlp": 1.01856315, "epoch": 0.49812114835412596, "flos": 29904798741120.0, "grad_norm": 1.6118923491231278, "language_loss": 0.74326092, "learning_rate": 2.109189687029526e-06, "loss": 0.76811022, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.21154785, "step": 8285, "time_per_iteration": 2.9164702892303467 }, { "auxiliary_loss_clip": 0.01439567, "auxiliary_loss_mlp": 0.01037054, "balance_loss_clip": 1.27078593, "balance_loss_mlp": 1.01503563, "epoch": 0.49818127160679393, "flos": 23156967957120.0, "grad_norm": 3.144371952258391, "language_loss": 0.74601632, "learning_rate": 2.1088008041275598e-06, "loss": 0.77078259, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.22033691, "step": 8286, "time_per_iteration": 2.8582475185394287 }, { "auxiliary_loss_clip": 0.0143773, "auxiliary_loss_mlp": 0.0104109, "balance_loss_clip": 1.26827812, "balance_loss_mlp": 1.01973963, "epoch": 0.4982413948594619, "flos": 21662771811840.0, "grad_norm": 1.6482529627279956, "language_loss": 0.86190414, "learning_rate": 2.1084119170998545e-06, "loss": 0.88669229, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21337891, "step": 8287, "time_per_iteration": 2.893589496612549 }, { "auxiliary_loss_clip": 0.01441928, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.27151847, "balance_loss_mlp": 1.01568198, "epoch": 0.49830151811212986, "flos": 32499216144000.0, "grad_norm": 1.9828190599963893, "language_loss": 0.72941405, "learning_rate": 2.108023025961159e-06, "loss": 0.75420225, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.2121582, "step": 8288, "time_per_iteration": 3.0443592071533203 }, { "auxiliary_loss_clip": 0.01443297, "auxiliary_loss_mlp": 0.01043825, "balance_loss_clip": 1.27109027, "balance_loss_mlp": 1.02138948, "epoch": 0.4983616413647978, "flos": 18149384396160.0, "grad_norm": 5.924420055047721, "language_loss": 0.82203668, "learning_rate": 2.10763413072622e-06, "loss": 0.84690785, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.22424316, "step": 8289, "time_per_iteration": 2.8993639945983887 }, { "auxiliary_loss_clip": 0.01427974, "auxiliary_loss_mlp": 0.01038902, "balance_loss_clip": 1.25956798, "balance_loss_mlp": 1.01740849, "epoch": 0.4984217646174658, "flos": 19728242841600.0, "grad_norm": 2.472707562387102, "language_loss": 0.74925655, "learning_rate": 2.107245231409784e-06, "loss": 0.7739253, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21496582, "step": 8290, "time_per_iteration": 2.829845428466797 }, { "auxiliary_loss_clip": 0.01448934, "auxiliary_loss_mlp": 0.01035063, "balance_loss_clip": 1.27721047, "balance_loss_mlp": 1.0117929, "epoch": 0.49848188787013376, "flos": 24947373680640.0, "grad_norm": 1.5034216334358055, "language_loss": 0.84753066, "learning_rate": 2.106856328026598e-06, "loss": 0.8723706, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.23291016, "step": 8291, "time_per_iteration": 2.8804898262023926 }, { "auxiliary_loss_clip": 0.01450105, "auxiliary_loss_mlp": 0.01041039, "balance_loss_clip": 1.27495492, "balance_loss_mlp": 1.01909232, "epoch": 0.4985420111228017, "flos": 22392572739840.0, "grad_norm": 1.7168282525449434, "language_loss": 0.67791671, "learning_rate": 2.106467420591409e-06, "loss": 0.70282817, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.21948242, "step": 8292, "time_per_iteration": 2.908088445663452 }, { "auxiliary_loss_clip": 0.01426437, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.25881827, "balance_loss_mlp": 1.01805258, "epoch": 0.4986021343754697, "flos": 16225035505920.0, "grad_norm": 1.6382674749987316, "language_loss": 0.67546499, "learning_rate": 2.106078509118965e-06, "loss": 0.70013011, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.22021484, "step": 8293, "time_per_iteration": 2.895575761795044 }, { "auxiliary_loss_clip": 0.01449054, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.27714503, "balance_loss_mlp": 1.01451683, "epoch": 0.4986622576281377, "flos": 23413877786880.0, "grad_norm": 2.9146647437491295, "language_loss": 0.83533001, "learning_rate": 2.1056895936240133e-06, "loss": 0.86016655, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20068359, "step": 8294, "time_per_iteration": 2.8390579223632812 }, { "auxiliary_loss_clip": 0.01444253, "auxiliary_loss_mlp": 0.01040601, "balance_loss_clip": 1.27491736, "balance_loss_mlp": 1.0187974, "epoch": 0.49872238088080567, "flos": 19984112040960.0, "grad_norm": 1.849717484645562, "language_loss": 0.74261612, "learning_rate": 2.1053006741213016e-06, "loss": 0.76746464, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21801758, "step": 8295, "time_per_iteration": 2.8709442615509033 }, { "auxiliary_loss_clip": 0.01428613, "auxiliary_loss_mlp": 0.01037078, "balance_loss_clip": 1.26084995, "balance_loss_mlp": 1.0164665, "epoch": 0.49878250413347364, "flos": 22903225263360.0, "grad_norm": 1.835415707141508, "language_loss": 0.68591869, "learning_rate": 2.1049117506255775e-06, "loss": 0.71057564, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20617676, "step": 8296, "time_per_iteration": 4.269610404968262 }, { "auxiliary_loss_clip": 0.01437504, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.26491022, "balance_loss_mlp": 1.01560402, "epoch": 0.4988426273861416, "flos": 32611233565440.0, "grad_norm": 1.7879994703510005, "language_loss": 0.65337014, "learning_rate": 2.1045228231515895e-06, "loss": 0.67811543, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.2142334, "step": 8297, "time_per_iteration": 2.914956569671631 }, { "auxiliary_loss_clip": 0.01422897, "auxiliary_loss_mlp": 0.0103922, "balance_loss_clip": 1.25621223, "balance_loss_mlp": 1.01866806, "epoch": 0.49890275063880957, "flos": 20933513821440.0, "grad_norm": 3.1681861252421295, "language_loss": 0.70977032, "learning_rate": 2.1041338917140857e-06, "loss": 0.73439151, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20544434, "step": 8298, "time_per_iteration": 2.8613665103912354 }, { "auxiliary_loss_clip": 0.01421609, "auxiliary_loss_mlp": 0.01038332, "balance_loss_clip": 1.25400269, "balance_loss_mlp": 1.01756573, "epoch": 0.49896287389147753, "flos": 18633631939200.0, "grad_norm": 2.106564995235704, "language_loss": 0.85694665, "learning_rate": 2.103744956327814e-06, "loss": 0.88154602, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20776367, "step": 8299, "time_per_iteration": 2.827418327331543 }, { "auxiliary_loss_clip": 0.01439466, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.26702237, "balance_loss_mlp": 1.01820505, "epoch": 0.4990229971441455, "flos": 24837211296000.0, "grad_norm": 2.184379209444536, "language_loss": 0.70049632, "learning_rate": 2.1033560170075234e-06, "loss": 0.72529334, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.22045898, "step": 8300, "time_per_iteration": 2.910083055496216 }, { "auxiliary_loss_clip": 0.01249317, "auxiliary_loss_mlp": 0.01044677, "balance_loss_clip": 1.14495099, "balance_loss_mlp": 1.01434982, "epoch": 0.49908312039681346, "flos": 71417286979200.0, "grad_norm": 0.760877337354368, "language_loss": 0.51170915, "learning_rate": 2.1029670737679623e-06, "loss": 0.53464913, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.30273438, "step": 8301, "time_per_iteration": 3.450291156768799 }, { "auxiliary_loss_clip": 0.01418382, "auxiliary_loss_mlp": 0.0104422, "balance_loss_clip": 1.25189281, "balance_loss_mlp": 1.02269137, "epoch": 0.4991432436494814, "flos": 19838179002240.0, "grad_norm": 2.054574732192301, "language_loss": 0.84988672, "learning_rate": 2.102578126623879e-06, "loss": 0.87451279, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.2154541, "step": 8302, "time_per_iteration": 2.8369388580322266 }, { "auxiliary_loss_clip": 0.01418105, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 1.25159717, "balance_loss_mlp": 1.01691246, "epoch": 0.4992033669021494, "flos": 15130198379520.0, "grad_norm": 1.8683815165032045, "language_loss": 0.6985985, "learning_rate": 2.102189175590024e-06, "loss": 0.7231679, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21936035, "step": 8303, "time_per_iteration": 2.843846082687378 }, { "auxiliary_loss_clip": 0.01445396, "auxiliary_loss_mlp": 0.0104289, "balance_loss_clip": 1.27267957, "balance_loss_mlp": 1.02159953, "epoch": 0.49926349015481736, "flos": 31219327209600.0, "grad_norm": 2.1933706513510582, "language_loss": 0.73185921, "learning_rate": 2.101800220681144e-06, "loss": 0.756742, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21313477, "step": 8304, "time_per_iteration": 2.958021640777588 }, { "auxiliary_loss_clip": 0.01433485, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.26330113, "balance_loss_mlp": 1.01714551, "epoch": 0.4993236134074853, "flos": 24911060088960.0, "grad_norm": 2.1325587094168337, "language_loss": 0.8188889, "learning_rate": 2.10141126191199e-06, "loss": 0.84359777, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20263672, "step": 8305, "time_per_iteration": 2.8711745738983154 }, { "auxiliary_loss_clip": 0.01241112, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.13984132, "balance_loss_mlp": 0.99868959, "epoch": 0.4993837366601533, "flos": 70449108606720.0, "grad_norm": 0.7122494379314129, "language_loss": 0.56986904, "learning_rate": 2.1010222992973107e-06, "loss": 0.59256458, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.296875, "step": 8306, "time_per_iteration": 4.8658154010772705 }, { "auxiliary_loss_clip": 0.01431439, "auxiliary_loss_mlp": 0.01037779, "balance_loss_clip": 1.26179361, "balance_loss_mlp": 1.0157249, "epoch": 0.4994438599128213, "flos": 15969754488960.0, "grad_norm": 6.551209928916245, "language_loss": 0.83687961, "learning_rate": 2.1006333328518556e-06, "loss": 0.86157179, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.2208252, "step": 8307, "time_per_iteration": 2.926408052444458 }, { "auxiliary_loss_clip": 0.01437395, "auxiliary_loss_mlp": 0.01040855, "balance_loss_clip": 1.2679379, "balance_loss_mlp": 1.01809788, "epoch": 0.4995039831654893, "flos": 27939883248000.0, "grad_norm": 2.327357043498748, "language_loss": 0.622298, "learning_rate": 2.1002443625903748e-06, "loss": 0.64708054, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.22753906, "step": 8308, "time_per_iteration": 4.344104528427124 }, { "auxiliary_loss_clip": 0.01413202, "auxiliary_loss_mlp": 0.01035104, "balance_loss_clip": 1.2470181, "balance_loss_mlp": 1.01262093, "epoch": 0.49956410641815724, "flos": 24214903309440.0, "grad_norm": 1.6647158352699243, "language_loss": 0.75140208, "learning_rate": 2.0998553885276168e-06, "loss": 0.77588511, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.22473145, "step": 8309, "time_per_iteration": 4.280550479888916 }, { "auxiliary_loss_clip": 0.01431563, "auxiliary_loss_mlp": 0.01042151, "balance_loss_clip": 1.26086056, "balance_loss_mlp": 1.02082479, "epoch": 0.4996242296708252, "flos": 16188721914240.0, "grad_norm": 2.091737009061471, "language_loss": 0.80294436, "learning_rate": 2.0994664106783335e-06, "loss": 0.82768148, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.21313477, "step": 8310, "time_per_iteration": 2.8051228523254395 }, { "auxiliary_loss_clip": 0.01439917, "auxiliary_loss_mlp": 0.01041415, "balance_loss_clip": 1.26498699, "balance_loss_mlp": 1.01942039, "epoch": 0.49968435292349317, "flos": 16882344984960.0, "grad_norm": 1.5368714109456652, "language_loss": 0.71747857, "learning_rate": 2.0990774290572735e-06, "loss": 0.74229193, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.2199707, "step": 8311, "time_per_iteration": 2.846593141555786 }, { "auxiliary_loss_clip": 0.01440471, "auxiliary_loss_mlp": 0.01039297, "balance_loss_clip": 1.27085924, "balance_loss_mlp": 1.01788723, "epoch": 0.49974447617616113, "flos": 14947770769920.0, "grad_norm": 1.707881682019575, "language_loss": 0.77848321, "learning_rate": 2.098688443679187e-06, "loss": 0.80328089, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.2142334, "step": 8312, "time_per_iteration": 2.8453209400177 }, { "auxiliary_loss_clip": 0.01444967, "auxiliary_loss_mlp": 0.01040168, "balance_loss_clip": 1.27275681, "balance_loss_mlp": 1.01888871, "epoch": 0.4998045994288291, "flos": 26662844736000.0, "grad_norm": 2.4392611082195286, "language_loss": 0.85239315, "learning_rate": 2.0982994545588256e-06, "loss": 0.87724447, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21289062, "step": 8313, "time_per_iteration": 2.8862712383270264 }, { "auxiliary_loss_clip": 0.01425537, "auxiliary_loss_mlp": 0.01037826, "balance_loss_clip": 1.2542429, "balance_loss_mlp": 1.01536691, "epoch": 0.49986472268149706, "flos": 20962045307520.0, "grad_norm": 1.8988266726809413, "language_loss": 0.8160485, "learning_rate": 2.097910461710939e-06, "loss": 0.84068215, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.22460938, "step": 8314, "time_per_iteration": 2.8773045539855957 }, { "auxiliary_loss_clip": 0.01434306, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.26247096, "balance_loss_mlp": 1.01937854, "epoch": 0.49992484593416503, "flos": 22794148753920.0, "grad_norm": 1.7812668206997266, "language_loss": 0.80024195, "learning_rate": 2.0975214651502773e-06, "loss": 0.82501197, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.23327637, "step": 8315, "time_per_iteration": 2.828500986099243 }, { "auxiliary_loss_clip": 0.01433071, "auxiliary_loss_mlp": 0.01040746, "balance_loss_clip": 1.26345956, "balance_loss_mlp": 1.01915729, "epoch": 0.499984969186833, "flos": 46802282751360.0, "grad_norm": 1.674495860334052, "language_loss": 0.75012171, "learning_rate": 2.0971324648915926e-06, "loss": 0.77485991, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.21582031, "step": 8316, "time_per_iteration": 3.0383615493774414 }, { "auxiliary_loss_clip": 0.01426462, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.25919676, "balance_loss_mlp": 1.01234543, "epoch": 0.500045092439501, "flos": 25568007609600.0, "grad_norm": 1.5517483564985073, "language_loss": 0.81896949, "learning_rate": 2.0967434609496343e-06, "loss": 0.84357786, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.22033691, "step": 8317, "time_per_iteration": 2.9274802207946777 }, { "auxiliary_loss_clip": 0.0143092, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.26013112, "balance_loss_mlp": 1.01424813, "epoch": 0.5001052156921689, "flos": 20714636885760.0, "grad_norm": 1.766743987489358, "language_loss": 0.83924854, "learning_rate": 2.0963544533391548e-06, "loss": 0.86391866, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.21826172, "step": 8318, "time_per_iteration": 2.9248757362365723 }, { "auxiliary_loss_clip": 0.01430774, "auxiliary_loss_mlp": 0.010353, "balance_loss_clip": 1.25996411, "balance_loss_mlp": 1.01348436, "epoch": 0.500165338944837, "flos": 21260879084160.0, "grad_norm": 1.7711534398998237, "language_loss": 0.8239904, "learning_rate": 2.0959654420749045e-06, "loss": 0.84865117, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21813965, "step": 8319, "time_per_iteration": 2.865321159362793 }, { "auxiliary_loss_clip": 0.0142736, "auxiliary_loss_mlp": 0.01035886, "balance_loss_clip": 1.25908077, "balance_loss_mlp": 1.01459491, "epoch": 0.5002254621975049, "flos": 27865717741440.0, "grad_norm": 1.6054604667511614, "language_loss": 0.72079885, "learning_rate": 2.095576427171635e-06, "loss": 0.7454313, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21289062, "step": 8320, "time_per_iteration": 2.8846378326416016 }, { "auxiliary_loss_clip": 0.01456975, "auxiliary_loss_mlp": 0.01043032, "balance_loss_clip": 1.27836633, "balance_loss_mlp": 1.02014375, "epoch": 0.5002855854501729, "flos": 15558315108480.0, "grad_norm": 3.503262700148327, "language_loss": 0.78053772, "learning_rate": 2.0951874086440978e-06, "loss": 0.80553782, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22888184, "step": 8321, "time_per_iteration": 2.80202054977417 }, { "auxiliary_loss_clip": 0.01435109, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.26499844, "balance_loss_mlp": 1.02047932, "epoch": 0.5003457087028408, "flos": 16116049486080.0, "grad_norm": 1.8071262296644288, "language_loss": 0.83707947, "learning_rate": 2.0947983865070455e-06, "loss": 0.86185962, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22436523, "step": 8322, "time_per_iteration": 2.8136980533599854 }, { "auxiliary_loss_clip": 0.0143442, "auxiliary_loss_mlp": 0.01037869, "balance_loss_clip": 1.26309729, "balance_loss_mlp": 1.01582706, "epoch": 0.5004058319555088, "flos": 22720661919360.0, "grad_norm": 4.262263903814169, "language_loss": 0.74692637, "learning_rate": 2.094409360775228e-06, "loss": 0.77164924, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22045898, "step": 8323, "time_per_iteration": 2.8452413082122803 }, { "auxiliary_loss_clip": 0.01432632, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.26146114, "balance_loss_mlp": 1.01397276, "epoch": 0.5004659552081767, "flos": 30130236172800.0, "grad_norm": 1.6950805850476818, "language_loss": 0.70290524, "learning_rate": 2.0940203314633977e-06, "loss": 0.72759277, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22155762, "step": 8324, "time_per_iteration": 2.9169812202453613 }, { "auxiliary_loss_clip": 0.0144191, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.27093947, "balance_loss_mlp": 1.01782775, "epoch": 0.5005260784608447, "flos": 18633948652800.0, "grad_norm": 2.041774308139965, "language_loss": 0.72617072, "learning_rate": 2.0936312985863077e-06, "loss": 0.75099748, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.22937012, "step": 8325, "time_per_iteration": 2.817307710647583 }, { "auxiliary_loss_clip": 0.01434499, "auxiliary_loss_mlp": 0.01043584, "balance_loss_clip": 1.26249003, "balance_loss_mlp": 1.02056515, "epoch": 0.5005862017135126, "flos": 24869860058880.0, "grad_norm": 1.897849344792366, "language_loss": 0.7361055, "learning_rate": 2.093242262158709e-06, "loss": 0.76088637, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.22998047, "step": 8326, "time_per_iteration": 2.9285545349121094 }, { "auxiliary_loss_clip": 0.01420082, "auxiliary_loss_mlp": 0.01038718, "balance_loss_clip": 1.25105786, "balance_loss_mlp": 1.01678324, "epoch": 0.5006463249661807, "flos": 18743703834240.0, "grad_norm": 1.917761741261309, "language_loss": 0.78234458, "learning_rate": 2.0928532221953544e-06, "loss": 0.80693257, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21923828, "step": 8327, "time_per_iteration": 2.8496129512786865 }, { "auxiliary_loss_clip": 0.01439684, "auxiliary_loss_mlp": 0.01044486, "balance_loss_clip": 1.26718092, "balance_loss_mlp": 1.02182436, "epoch": 0.5007064482188487, "flos": 13049872104960.0, "grad_norm": 2.436914338529368, "language_loss": 0.88671136, "learning_rate": 2.092464178710997e-06, "loss": 0.91155303, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.2265625, "step": 8328, "time_per_iteration": 2.8428597450256348 }, { "auxiliary_loss_clip": 0.01438773, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.26363051, "balance_loss_mlp": 1.01454365, "epoch": 0.5007665714715166, "flos": 21298730999040.0, "grad_norm": 7.359012599218469, "language_loss": 0.75390691, "learning_rate": 2.092075131720388e-06, "loss": 0.77866167, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.22155762, "step": 8329, "time_per_iteration": 2.881842851638794 }, { "auxiliary_loss_clip": 0.01421733, "auxiliary_loss_mlp": 0.01038836, "balance_loss_clip": 1.25461721, "balance_loss_mlp": 1.01756871, "epoch": 0.5008266947241846, "flos": 29767145500800.0, "grad_norm": 1.8049434047077915, "language_loss": 0.80163586, "learning_rate": 2.091686081238281e-06, "loss": 0.82624161, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21264648, "step": 8330, "time_per_iteration": 2.9144208431243896 }, { "auxiliary_loss_clip": 0.01232396, "auxiliary_loss_mlp": 0.01040897, "balance_loss_clip": 1.13772273, "balance_loss_mlp": 1.01209593, "epoch": 0.5008868179768525, "flos": 63583695290880.0, "grad_norm": 0.7328523067955023, "language_loss": 0.56205106, "learning_rate": 2.0912970272794282e-06, "loss": 0.58478397, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.28710938, "step": 8331, "time_per_iteration": 4.544417142868042 }, { "auxiliary_loss_clip": 0.01419724, "auxiliary_loss_mlp": 0.01035248, "balance_loss_clip": 1.25297618, "balance_loss_mlp": 1.01390958, "epoch": 0.5009469412295205, "flos": 27385677964800.0, "grad_norm": 1.898428778583623, "language_loss": 0.65698028, "learning_rate": 2.0909079698585833e-06, "loss": 0.68153, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.21337891, "step": 8332, "time_per_iteration": 2.89968204498291 }, { "auxiliary_loss_clip": 0.01427431, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.25928521, "balance_loss_mlp": 1.01371098, "epoch": 0.5010070644821885, "flos": 27389704752000.0, "grad_norm": 1.8582003345615024, "language_loss": 0.75873542, "learning_rate": 2.0905189089904993e-06, "loss": 0.78336513, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.21838379, "step": 8333, "time_per_iteration": 2.9078688621520996 }, { "auxiliary_loss_clip": 0.01440911, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.2676717, "balance_loss_mlp": 1.01746917, "epoch": 0.5010671877348565, "flos": 20671174615680.0, "grad_norm": 2.239478943239947, "language_loss": 0.81441379, "learning_rate": 2.090129844689929e-06, "loss": 0.8392058, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.20837402, "step": 8334, "time_per_iteration": 2.8509902954101562 }, { "auxiliary_loss_clip": 0.01232857, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.1400075, "balance_loss_mlp": 1.00572705, "epoch": 0.5011273109875244, "flos": 59159092705920.0, "grad_norm": 0.9451909104062545, "language_loss": 0.62703294, "learning_rate": 2.089740776971626e-06, "loss": 0.64968771, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.26953125, "step": 8335, "time_per_iteration": 3.262941360473633 }, { "auxiliary_loss_clip": 0.01418709, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.25121832, "balance_loss_mlp": 1.01554966, "epoch": 0.5011874342401924, "flos": 25346868433920.0, "grad_norm": 1.5039593143502603, "language_loss": 0.80683959, "learning_rate": 2.0893517058503435e-06, "loss": 0.83139074, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20861816, "step": 8336, "time_per_iteration": 2.8914999961853027 }, { "auxiliary_loss_clip": 0.0142828, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.25797904, "balance_loss_mlp": 1.01483059, "epoch": 0.5012475574928603, "flos": 20239574037120.0, "grad_norm": 1.7934573804457412, "language_loss": 0.80807811, "learning_rate": 2.088962631340836e-06, "loss": 0.83273131, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.2220459, "step": 8337, "time_per_iteration": 2.831033945083618 }, { "auxiliary_loss_clip": 0.01453022, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.27635765, "balance_loss_mlp": 1.01429844, "epoch": 0.5013076807455283, "flos": 22720164226560.0, "grad_norm": 2.027587609567849, "language_loss": 0.79567397, "learning_rate": 2.0885735534578555e-06, "loss": 0.82056665, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 1.765625, "router_z_loss_mlp": 0.21948242, "step": 8338, "time_per_iteration": 2.8648276329040527 }, { "auxiliary_loss_clip": 0.01439613, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.26722431, "balance_loss_mlp": 1.01505804, "epoch": 0.5013678039981962, "flos": 24256284318720.0, "grad_norm": 1.7437196791305, "language_loss": 0.85392749, "learning_rate": 2.0881844722161583e-06, "loss": 0.87869364, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.21948242, "step": 8339, "time_per_iteration": 2.9259607791900635 }, { "auxiliary_loss_clip": 0.01432415, "auxiliary_loss_mlp": 0.01039256, "balance_loss_clip": 1.26185846, "balance_loss_mlp": 1.01697576, "epoch": 0.5014279272508643, "flos": 26187736642560.0, "grad_norm": 2.1778429943543873, "language_loss": 0.71739352, "learning_rate": 2.0877953876304962e-06, "loss": 0.74211025, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.22265625, "step": 8340, "time_per_iteration": 2.8991920948028564 }, { "auxiliary_loss_clip": 0.01441785, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.26655209, "balance_loss_mlp": 1.01758814, "epoch": 0.5014880505035323, "flos": 21439460885760.0, "grad_norm": 3.9456704509452165, "language_loss": 0.79154027, "learning_rate": 2.0874062997156245e-06, "loss": 0.81636047, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 1.75292969, "router_z_loss_mlp": 0.22644043, "step": 8341, "time_per_iteration": 4.2436957359313965 }, { "auxiliary_loss_clip": 0.01441016, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.26589894, "balance_loss_mlp": 1.01200628, "epoch": 0.5015481737562002, "flos": 15777146799360.0, "grad_norm": 2.873417764662513, "language_loss": 0.90044016, "learning_rate": 2.0870172084862975e-06, "loss": 0.9251858, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.2154541, "step": 8342, "time_per_iteration": 2.833301305770874 }, { "auxiliary_loss_clip": 0.01436643, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.26528335, "balance_loss_mlp": 1.01554072, "epoch": 0.5016082970088682, "flos": 26841336048000.0, "grad_norm": 1.8610268312154412, "language_loss": 0.77346671, "learning_rate": 2.0866281139572682e-06, "loss": 0.79821193, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22338867, "step": 8343, "time_per_iteration": 4.2628302574157715 }, { "auxiliary_loss_clip": 0.01420901, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.25267267, "balance_loss_mlp": 1.01278234, "epoch": 0.5016684202615361, "flos": 21480570426240.0, "grad_norm": 1.9044178418421396, "language_loss": 0.68900883, "learning_rate": 2.086239016143293e-06, "loss": 0.71356308, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21740723, "step": 8344, "time_per_iteration": 4.295549154281616 }, { "auxiliary_loss_clip": 0.01436443, "auxiliary_loss_mlp": 0.01041306, "balance_loss_clip": 1.26479948, "balance_loss_mlp": 1.02014661, "epoch": 0.5017285435142042, "flos": 26257151445120.0, "grad_norm": 1.882164795888667, "language_loss": 0.76184022, "learning_rate": 2.0858499150591258e-06, "loss": 0.78661776, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.21154785, "step": 8345, "time_per_iteration": 2.8923771381378174 }, { "auxiliary_loss_clip": 0.01428523, "auxiliary_loss_mlp": 0.01039034, "balance_loss_clip": 1.25782871, "balance_loss_mlp": 1.01553822, "epoch": 0.5017886667668721, "flos": 20787399803520.0, "grad_norm": 2.205529519438255, "language_loss": 0.79466939, "learning_rate": 2.0854608107195203e-06, "loss": 0.81934494, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.23510742, "step": 8346, "time_per_iteration": 2.849806547164917 }, { "auxiliary_loss_clip": 0.01428157, "auxiliary_loss_mlp": 0.01042725, "balance_loss_clip": 1.25794113, "balance_loss_mlp": 1.02036142, "epoch": 0.5018487900195401, "flos": 20165860978560.0, "grad_norm": 1.5567607335147207, "language_loss": 0.70103025, "learning_rate": 2.0850717031392333e-06, "loss": 0.72573912, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.22351074, "step": 8347, "time_per_iteration": 2.8379967212677 }, { "auxiliary_loss_clip": 0.01430398, "auxiliary_loss_mlp": 0.01039218, "balance_loss_clip": 1.25721264, "balance_loss_mlp": 1.01733065, "epoch": 0.501908913272208, "flos": 18159881189760.0, "grad_norm": 2.0254496702502913, "language_loss": 0.72951692, "learning_rate": 2.0846825923330174e-06, "loss": 0.75421309, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21887207, "step": 8348, "time_per_iteration": 2.908296585083008 }, { "auxiliary_loss_clip": 0.0142332, "auxiliary_loss_mlp": 0.01041447, "balance_loss_clip": 1.25820565, "balance_loss_mlp": 1.01967943, "epoch": 0.501969036524876, "flos": 23122645136640.0, "grad_norm": 1.5572992414891362, "language_loss": 0.75204551, "learning_rate": 2.0842934783156303e-06, "loss": 0.77669322, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.21765137, "step": 8349, "time_per_iteration": 2.933507204055786 }, { "auxiliary_loss_clip": 0.01432036, "auxiliary_loss_mlp": 0.01043921, "balance_loss_clip": 1.25758529, "balance_loss_mlp": 1.02087808, "epoch": 0.5020291597775439, "flos": 11370352682880.0, "grad_norm": 2.4303316981335676, "language_loss": 0.6523363, "learning_rate": 2.0839043611018266e-06, "loss": 0.67709589, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 1.74414062, "router_z_loss_mlp": 0.23059082, "step": 8350, "time_per_iteration": 2.9370200634002686 }, { "auxiliary_loss_clip": 0.0124238, "auxiliary_loss_mlp": 0.01039403, "balance_loss_clip": 1.14899004, "balance_loss_mlp": 1.01403487, "epoch": 0.5020892830302119, "flos": 64041085405440.0, "grad_norm": 0.8333635180892747, "language_loss": 0.59762996, "learning_rate": 2.0835152407063597e-06, "loss": 0.62044781, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.25390625, "step": 8351, "time_per_iteration": 3.484189510345459 }, { "auxiliary_loss_clip": 0.01435046, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.26181102, "balance_loss_mlp": 1.02018726, "epoch": 0.5021494062828799, "flos": 23743460044800.0, "grad_norm": 1.7810241418359718, "language_loss": 0.75985932, "learning_rate": 2.0831261171439873e-06, "loss": 0.78462833, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21655273, "step": 8352, "time_per_iteration": 2.967494487762451 }, { "auxiliary_loss_clip": 0.01442221, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.26980519, "balance_loss_mlp": 1.02139437, "epoch": 0.5022095295355479, "flos": 21585891617280.0, "grad_norm": 2.7827969205692487, "language_loss": 0.72362196, "learning_rate": 2.082736990429464e-06, "loss": 0.74849045, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.23242188, "step": 8353, "time_per_iteration": 2.883181095123291 }, { "auxiliary_loss_clip": 0.01444916, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.27311563, "balance_loss_mlp": 1.02310359, "epoch": 0.5022696527882159, "flos": 21407083591680.0, "grad_norm": 7.2227188232481385, "language_loss": 0.75211072, "learning_rate": 2.0823478605775455e-06, "loss": 0.77701682, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.22595215, "step": 8354, "time_per_iteration": 2.8617396354675293 }, { "auxiliary_loss_clip": 0.01429783, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.25905633, "balance_loss_mlp": 1.02172089, "epoch": 0.5023297760408838, "flos": 27171732712320.0, "grad_norm": 1.5741106832654135, "language_loss": 0.73152691, "learning_rate": 2.0819587276029884e-06, "loss": 0.75626487, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22302246, "step": 8355, "time_per_iteration": 2.8915348052978516 }, { "auxiliary_loss_clip": 0.01443721, "auxiliary_loss_mlp": 0.01046109, "balance_loss_clip": 1.26813173, "balance_loss_mlp": 1.0241034, "epoch": 0.5023898992935518, "flos": 26225045619840.0, "grad_norm": 1.5418048718225648, "language_loss": 0.81866723, "learning_rate": 2.081569591520548e-06, "loss": 0.84356552, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.22009277, "step": 8356, "time_per_iteration": 2.902352809906006 }, { "auxiliary_loss_clip": 0.01458082, "auxiliary_loss_mlp": 0.01053163, "balance_loss_clip": 1.27779138, "balance_loss_mlp": 1.03072739, "epoch": 0.5024500225462197, "flos": 13447330842240.0, "grad_norm": 2.101459375771465, "language_loss": 0.77415884, "learning_rate": 2.0811804523449803e-06, "loss": 0.79927129, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 1.80371094, "router_z_loss_mlp": 0.22424316, "step": 8357, "time_per_iteration": 2.825868844985962 }, { "auxiliary_loss_clip": 0.01433006, "auxiliary_loss_mlp": 0.01050121, "balance_loss_clip": 1.26100099, "balance_loss_mlp": 1.02773321, "epoch": 0.5025101457988878, "flos": 21589465956480.0, "grad_norm": 1.9134179683542836, "language_loss": 0.77015704, "learning_rate": 2.0807913100910417e-06, "loss": 0.79498827, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.22387695, "step": 8358, "time_per_iteration": 2.8802406787872314 }, { "auxiliary_loss_clip": 0.01428985, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.25771677, "balance_loss_mlp": 1.02813959, "epoch": 0.5025702690515557, "flos": 24656005296000.0, "grad_norm": 4.333566527318841, "language_loss": 0.73652041, "learning_rate": 2.0804021647734887e-06, "loss": 0.76131284, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.22119141, "step": 8359, "time_per_iteration": 2.885776996612549 }, { "auxiliary_loss_clip": 0.01421508, "auxiliary_loss_mlp": 0.01048189, "balance_loss_clip": 1.25361586, "balance_loss_mlp": 1.02464533, "epoch": 0.5026303923042237, "flos": 22100118480000.0, "grad_norm": 1.5868764172211098, "language_loss": 0.77943122, "learning_rate": 2.080013016407077e-06, "loss": 0.80412817, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.23547363, "step": 8360, "time_per_iteration": 2.896709442138672 }, { "auxiliary_loss_clip": 0.01422607, "auxiliary_loss_mlp": 0.0104435, "balance_loss_clip": 1.25362515, "balance_loss_mlp": 1.02217746, "epoch": 0.5026905155568916, "flos": 23708006104320.0, "grad_norm": 2.01962387853291, "language_loss": 0.7792362, "learning_rate": 2.0796238650065645e-06, "loss": 0.80390573, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.22180176, "step": 8361, "time_per_iteration": 2.910471200942993 }, { "auxiliary_loss_clip": 0.01444824, "auxiliary_loss_mlp": 0.01049626, "balance_loss_clip": 1.27007318, "balance_loss_mlp": 1.02686882, "epoch": 0.5027506388095596, "flos": 25823514850560.0, "grad_norm": 2.734174263147762, "language_loss": 0.85745144, "learning_rate": 2.0792347105867065e-06, "loss": 0.88239592, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.22753906, "step": 8362, "time_per_iteration": 2.9095380306243896 }, { "auxiliary_loss_clip": 0.01445046, "auxiliary_loss_mlp": 0.01042284, "balance_loss_clip": 1.27067614, "balance_loss_mlp": 1.02040887, "epoch": 0.5028107620622275, "flos": 27537990520320.0, "grad_norm": 1.6873138741466056, "language_loss": 0.79124451, "learning_rate": 2.0788455531622605e-06, "loss": 0.81611782, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21875, "step": 8363, "time_per_iteration": 2.899487257003784 }, { "auxiliary_loss_clip": 0.01416399, "auxiliary_loss_mlp": 0.01043382, "balance_loss_clip": 1.25100303, "balance_loss_mlp": 1.02056527, "epoch": 0.5028708853148955, "flos": 24544757036160.0, "grad_norm": 3.7990575610916943, "language_loss": 0.7623589, "learning_rate": 2.0784563927479838e-06, "loss": 0.78695667, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.22802734, "step": 8364, "time_per_iteration": 2.8654627799987793 }, { "auxiliary_loss_clip": 0.01421345, "auxiliary_loss_mlp": 0.01042622, "balance_loss_clip": 1.25234365, "balance_loss_mlp": 1.02053297, "epoch": 0.5029310085675635, "flos": 20823351436800.0, "grad_norm": 1.686707570182686, "language_loss": 0.70426142, "learning_rate": 2.0780672293586317e-06, "loss": 0.72890115, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.22094727, "step": 8365, "time_per_iteration": 2.8564751148223877 }, { "auxiliary_loss_clip": 0.014395, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.2637136, "balance_loss_mlp": 1.02196527, "epoch": 0.5029911318202315, "flos": 22351689423360.0, "grad_norm": 1.688128288618669, "language_loss": 0.73786002, "learning_rate": 2.0776780630089635e-06, "loss": 0.76270205, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.22729492, "step": 8366, "time_per_iteration": 4.360262155532837 }, { "auxiliary_loss_clip": 0.01426791, "auxiliary_loss_mlp": 0.01042081, "balance_loss_clip": 1.25742722, "balance_loss_mlp": 1.02061117, "epoch": 0.5030512550728995, "flos": 24363370056960.0, "grad_norm": 1.4687611303897505, "language_loss": 0.78595769, "learning_rate": 2.077288893713735e-06, "loss": 0.81064647, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.21459961, "step": 8367, "time_per_iteration": 2.8941681385040283 }, { "auxiliary_loss_clip": 0.01420454, "auxiliary_loss_mlp": 0.01042896, "balance_loss_clip": 1.25170803, "balance_loss_mlp": 1.02140272, "epoch": 0.5031113783255674, "flos": 18269093433600.0, "grad_norm": 1.7787403888400009, "language_loss": 0.71134865, "learning_rate": 2.0768997214877035e-06, "loss": 0.73598206, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21484375, "step": 8368, "time_per_iteration": 2.8883469104766846 }, { "auxiliary_loss_clip": 0.01236679, "auxiliary_loss_mlp": 0.01040436, "balance_loss_clip": 1.14295673, "balance_loss_mlp": 1.01182604, "epoch": 0.5031715015782354, "flos": 57279319223040.0, "grad_norm": 0.9125837126252, "language_loss": 0.63345504, "learning_rate": 2.0765105463456274e-06, "loss": 0.65622616, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.28515625, "step": 8369, "time_per_iteration": 3.3405072689056396 }, { "auxiliary_loss_clip": 0.01423889, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.25612283, "balance_loss_mlp": 1.01937985, "epoch": 0.5032316248309033, "flos": 27538940661120.0, "grad_norm": 2.1570513528981303, "language_loss": 0.60673159, "learning_rate": 2.076121368302263e-06, "loss": 0.6313805, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21630859, "step": 8370, "time_per_iteration": 2.966266393661499 }, { "auxiliary_loss_clip": 0.01431099, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 1.2576834, "balance_loss_mlp": 1.02038372, "epoch": 0.5032917480835714, "flos": 34509132230400.0, "grad_norm": 1.6131214806494696, "language_loss": 0.69009441, "learning_rate": 2.0757321873723695e-06, "loss": 0.71483308, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.22375488, "step": 8371, "time_per_iteration": 2.98302960395813 }, { "auxiliary_loss_clip": 0.01430982, "auxiliary_loss_mlp": 0.01044839, "balance_loss_clip": 1.26059437, "balance_loss_mlp": 1.02108026, "epoch": 0.5033518713362393, "flos": 33669892834560.0, "grad_norm": 1.6924884608175534, "language_loss": 0.68739903, "learning_rate": 2.0753430035707042e-06, "loss": 0.71215725, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.2376709, "step": 8372, "time_per_iteration": 2.9893484115600586 }, { "auxiliary_loss_clip": 0.01439924, "auxiliary_loss_mlp": 0.01038915, "balance_loss_clip": 1.26675034, "balance_loss_mlp": 1.01575279, "epoch": 0.5034119945889073, "flos": 28197607484160.0, "grad_norm": 1.8090425379098323, "language_loss": 0.67605072, "learning_rate": 2.0749538169120235e-06, "loss": 0.70083916, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.23156738, "step": 8373, "time_per_iteration": 2.9488015174865723 }, { "auxiliary_loss_clip": 0.01428046, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.25696802, "balance_loss_mlp": 1.01639235, "epoch": 0.5034721178415752, "flos": 21368010067200.0, "grad_norm": 1.7364474658196598, "language_loss": 0.75180936, "learning_rate": 2.0745646274110872e-06, "loss": 0.77647543, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.22155762, "step": 8374, "time_per_iteration": 2.8580431938171387 }, { "auxiliary_loss_clip": 0.01441762, "auxiliary_loss_mlp": 0.010413, "balance_loss_clip": 1.26911676, "balance_loss_mlp": 1.02021158, "epoch": 0.5035322410942432, "flos": 22685162734080.0, "grad_norm": 2.9473415558959317, "language_loss": 0.69214183, "learning_rate": 2.0741754350826525e-06, "loss": 0.71697247, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21081543, "step": 8375, "time_per_iteration": 2.862151622772217 }, { "auxiliary_loss_clip": 0.01451138, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.27640116, "balance_loss_mlp": 1.0144192, "epoch": 0.5035923643469111, "flos": 19838269491840.0, "grad_norm": 1.7768975137579637, "language_loss": 0.79972851, "learning_rate": 2.0737862399414777e-06, "loss": 0.82460022, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21606445, "step": 8376, "time_per_iteration": 2.8581931591033936 }, { "auxiliary_loss_clip": 0.01447988, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.27215457, "balance_loss_mlp": 1.01532292, "epoch": 0.5036524875995791, "flos": 30525794628480.0, "grad_norm": 1.872353408818428, "language_loss": 0.60490125, "learning_rate": 2.0733970420023213e-06, "loss": 0.62976134, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.22692871, "step": 8377, "time_per_iteration": 4.293928146362305 }, { "auxiliary_loss_clip": 0.01432364, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.26219702, "balance_loss_mlp": 1.01482129, "epoch": 0.5037126108522471, "flos": 14728758099840.0, "grad_norm": 1.873481486683625, "language_loss": 0.77001822, "learning_rate": 2.0730078412799425e-06, "loss": 0.79471749, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.22741699, "step": 8378, "time_per_iteration": 4.251403570175171 }, { "auxiliary_loss_clip": 0.01449701, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.27809834, "balance_loss_mlp": 1.01519895, "epoch": 0.5037727341049151, "flos": 25307659175040.0, "grad_norm": 1.7644916259846293, "language_loss": 0.75476587, "learning_rate": 2.0726186377890985e-06, "loss": 0.77962542, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.21044922, "step": 8379, "time_per_iteration": 4.293873071670532 }, { "auxiliary_loss_clip": 0.01434946, "auxiliary_loss_mlp": 0.01043343, "balance_loss_clip": 1.26542413, "balance_loss_mlp": 1.02136087, "epoch": 0.5038328573575831, "flos": 28551739685760.0, "grad_norm": 2.0331784077447237, "language_loss": 0.67542553, "learning_rate": 2.072229431544548e-06, "loss": 0.70020843, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21984863, "step": 8380, "time_per_iteration": 2.9331934452056885 }, { "auxiliary_loss_clip": 0.01423501, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.25548077, "balance_loss_mlp": 1.01760578, "epoch": 0.503892980610251, "flos": 31662419967360.0, "grad_norm": 1.8404363113023556, "language_loss": 0.64109039, "learning_rate": 2.071840222561051e-06, "loss": 0.66572213, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.22070312, "step": 8381, "time_per_iteration": 2.939826488494873 }, { "auxiliary_loss_clip": 0.01429939, "auxiliary_loss_mlp": 0.01043949, "balance_loss_clip": 1.25893307, "balance_loss_mlp": 1.02114439, "epoch": 0.503953103862919, "flos": 27100779586560.0, "grad_norm": 1.6933261866109408, "language_loss": 0.67945534, "learning_rate": 2.071451010853365e-06, "loss": 0.70419419, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22827148, "step": 8382, "time_per_iteration": 2.893967628479004 }, { "auxiliary_loss_clip": 0.01453832, "auxiliary_loss_mlp": 0.01040615, "balance_loss_clip": 1.27519011, "balance_loss_mlp": 1.01816785, "epoch": 0.5040132271155869, "flos": 15641439085440.0, "grad_norm": 1.8218705496728391, "language_loss": 0.6279825, "learning_rate": 2.0710617964362506e-06, "loss": 0.65292698, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 1.78515625, "router_z_loss_mlp": 0.22460938, "step": 8383, "time_per_iteration": 2.839463233947754 }, { "auxiliary_loss_clip": 0.01422096, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.25420523, "balance_loss_mlp": 1.01715982, "epoch": 0.504073350368255, "flos": 13598150319360.0, "grad_norm": 3.9612910837046393, "language_loss": 0.68009269, "learning_rate": 2.070672579324465e-06, "loss": 0.70471072, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.22546387, "step": 8384, "time_per_iteration": 2.8835644721984863 }, { "auxiliary_loss_clip": 0.01432984, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.26229918, "balance_loss_mlp": 1.01645112, "epoch": 0.5041334736209229, "flos": 29069721866880.0, "grad_norm": 1.674905433489787, "language_loss": 0.72212869, "learning_rate": 2.0702833595327674e-06, "loss": 0.74683452, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.21154785, "step": 8385, "time_per_iteration": 2.9000062942504883 }, { "auxiliary_loss_clip": 0.01428234, "auxiliary_loss_mlp": 0.01032515, "balance_loss_clip": 1.26082766, "balance_loss_mlp": 1.0108664, "epoch": 0.5041935968735909, "flos": 24619148766720.0, "grad_norm": 1.8089377201330277, "language_loss": 0.83810043, "learning_rate": 2.069894137075919e-06, "loss": 0.86270797, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21643066, "step": 8386, "time_per_iteration": 2.860795021057129 }, { "auxiliary_loss_clip": 0.01431738, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.26082897, "balance_loss_mlp": 1.01541853, "epoch": 0.5042537201262588, "flos": 26298215740800.0, "grad_norm": 1.6444205664665725, "language_loss": 0.67157227, "learning_rate": 2.0695049119686766e-06, "loss": 0.69626355, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21984863, "step": 8387, "time_per_iteration": 2.9011127948760986 }, { "auxiliary_loss_clip": 0.0142495, "auxiliary_loss_mlp": 0.0103753, "balance_loss_clip": 1.2574259, "balance_loss_mlp": 1.01694298, "epoch": 0.5043138433789268, "flos": 22027355562240.0, "grad_norm": 1.4914332464492825, "language_loss": 0.80788159, "learning_rate": 2.0691156842258016e-06, "loss": 0.83250642, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20605469, "step": 8388, "time_per_iteration": 2.917672872543335 }, { "auxiliary_loss_clip": 0.01423214, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.25372434, "balance_loss_mlp": 1.01403618, "epoch": 0.5043739666315947, "flos": 28778805930240.0, "grad_norm": 2.375197001613976, "language_loss": 0.71044517, "learning_rate": 2.0687264538620537e-06, "loss": 0.73502654, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20910645, "step": 8389, "time_per_iteration": 2.924994707107544 }, { "auxiliary_loss_clip": 0.01452734, "auxiliary_loss_mlp": 0.01043091, "balance_loss_clip": 1.2789737, "balance_loss_mlp": 1.02103686, "epoch": 0.5044340898842627, "flos": 27610572458880.0, "grad_norm": 1.6621339312405221, "language_loss": 0.70131707, "learning_rate": 2.068337220892191e-06, "loss": 0.72627532, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.22058105, "step": 8390, "time_per_iteration": 2.9571189880371094 }, { "auxiliary_loss_clip": 0.01217353, "auxiliary_loss_mlp": 0.01034245, "balance_loss_clip": 1.12460828, "balance_loss_mlp": 1.00468147, "epoch": 0.5044942131369307, "flos": 67483954160640.0, "grad_norm": 0.8589424854474446, "language_loss": 0.53022027, "learning_rate": 2.067947985330974e-06, "loss": 0.55273628, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.29492188, "step": 8391, "time_per_iteration": 3.1589062213897705 }, { "auxiliary_loss_clip": 0.01213285, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.12073684, "balance_loss_mlp": 1.00441837, "epoch": 0.5045543363895987, "flos": 58655073185280.0, "grad_norm": 0.8996938084945462, "language_loss": 0.60749388, "learning_rate": 2.0675587471931628e-06, "loss": 0.63001043, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.33984375, "step": 8392, "time_per_iteration": 3.2038748264312744 }, { "auxiliary_loss_clip": 0.01424458, "auxiliary_loss_mlp": 0.01039468, "balance_loss_clip": 1.25749922, "balance_loss_mlp": 1.01827288, "epoch": 0.5046144596422667, "flos": 22536334028160.0, "grad_norm": 1.4971834698392428, "language_loss": 0.84951389, "learning_rate": 2.067169506493517e-06, "loss": 0.87415314, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.21191406, "step": 8393, "time_per_iteration": 2.880765676498413 }, { "auxiliary_loss_clip": 0.01431506, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 1.26138091, "balance_loss_mlp": 1.01803994, "epoch": 0.5046745828949346, "flos": 27465770540160.0, "grad_norm": 1.864628479015273, "language_loss": 0.51377147, "learning_rate": 2.0667802632467974e-06, "loss": 0.53848159, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.21484375, "step": 8394, "time_per_iteration": 2.918851852416992 }, { "auxiliary_loss_clip": 0.01445159, "auxiliary_loss_mlp": 0.01040112, "balance_loss_clip": 1.27287519, "balance_loss_mlp": 1.01869035, "epoch": 0.5047347061476026, "flos": 17283378061440.0, "grad_norm": 1.5682013731798858, "language_loss": 0.76002765, "learning_rate": 2.0663910174677627e-06, "loss": 0.78488034, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21435547, "step": 8395, "time_per_iteration": 2.892678737640381 }, { "auxiliary_loss_clip": 0.01446665, "auxiliary_loss_mlp": 0.01039173, "balance_loss_clip": 1.2756542, "balance_loss_mlp": 1.01834667, "epoch": 0.5047948294002705, "flos": 16656952798080.0, "grad_norm": 2.0710543108812027, "language_loss": 0.69213825, "learning_rate": 2.0660017691711737e-06, "loss": 0.71699667, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.20837402, "step": 8396, "time_per_iteration": 2.8517794609069824 }, { "auxiliary_loss_clip": 0.0144869, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.27853036, "balance_loss_mlp": 1.01834106, "epoch": 0.5048549526529386, "flos": 26875839847680.0, "grad_norm": 2.4025512674138128, "language_loss": 0.79031914, "learning_rate": 2.065612518371792e-06, "loss": 0.81519997, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21057129, "step": 8397, "time_per_iteration": 2.906883955001831 }, { "auxiliary_loss_clip": 0.01424657, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.25700569, "balance_loss_mlp": 1.01691651, "epoch": 0.5049150759056065, "flos": 21843615853440.0, "grad_norm": 1.5323956235716287, "language_loss": 0.6670686, "learning_rate": 2.065223265084376e-06, "loss": 0.69169199, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20776367, "step": 8398, "time_per_iteration": 2.876971483230591 }, { "auxiliary_loss_clip": 0.01430321, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.25995624, "balance_loss_mlp": 1.01364183, "epoch": 0.5049751991582745, "flos": 21694877637120.0, "grad_norm": 1.640642452848556, "language_loss": 0.72115183, "learning_rate": 2.064834009323688e-06, "loss": 0.7458235, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.23205566, "step": 8399, "time_per_iteration": 2.856163263320923 }, { "auxiliary_loss_clip": 0.01445142, "auxiliary_loss_mlp": 0.01044064, "balance_loss_clip": 1.27089179, "balance_loss_mlp": 1.0222491, "epoch": 0.5050353224109424, "flos": 21368869718400.0, "grad_norm": 2.177903217927153, "language_loss": 0.82704031, "learning_rate": 2.0644447511044878e-06, "loss": 0.85193241, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.21789551, "step": 8400, "time_per_iteration": 2.836794376373291 }, { "auxiliary_loss_clip": 0.01430908, "auxiliary_loss_mlp": 0.01040565, "balance_loss_clip": 1.26082551, "balance_loss_mlp": 1.01842833, "epoch": 0.5050954456636104, "flos": 22830281366400.0, "grad_norm": 2.0007364024903995, "language_loss": 0.79622334, "learning_rate": 2.0640554904415362e-06, "loss": 0.82093805, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.22119141, "step": 8401, "time_per_iteration": 4.242810487747192 }, { "auxiliary_loss_clip": 0.01451447, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.27609134, "balance_loss_mlp": 1.01676357, "epoch": 0.5051555689162783, "flos": 30461085285120.0, "grad_norm": 1.9758844169241188, "language_loss": 0.70358086, "learning_rate": 2.063666227349593e-06, "loss": 0.72848034, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.2175293, "step": 8402, "time_per_iteration": 2.9414234161376953 }, { "auxiliary_loss_clip": 0.0142672, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.25642586, "balance_loss_mlp": 1.01304078, "epoch": 0.5052156921689464, "flos": 21297826103040.0, "grad_norm": 1.6316024423516056, "language_loss": 0.7007392, "learning_rate": 2.063276961843422e-06, "loss": 0.72535092, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2142334, "step": 8403, "time_per_iteration": 2.9457948207855225 }, { "auxiliary_loss_clip": 0.01427959, "auxiliary_loss_mlp": 0.0103989, "balance_loss_clip": 1.25934362, "balance_loss_mlp": 1.01837289, "epoch": 0.5052758154216143, "flos": 25091858885760.0, "grad_norm": 1.360456053684624, "language_loss": 0.86400396, "learning_rate": 2.062887693937781e-06, "loss": 0.88868248, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21508789, "step": 8404, "time_per_iteration": 2.8990824222564697 }, { "auxiliary_loss_clip": 0.01434073, "auxiliary_loss_mlp": 0.01042366, "balance_loss_clip": 1.26511669, "balance_loss_mlp": 1.02132559, "epoch": 0.5053359386742823, "flos": 20894983234560.0, "grad_norm": 1.5863972537086344, "language_loss": 0.76160002, "learning_rate": 2.0624984236474322e-06, "loss": 0.78636444, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21032715, "step": 8405, "time_per_iteration": 2.8412139415740967 }, { "auxiliary_loss_clip": 0.01439347, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.26580203, "balance_loss_mlp": 1.01267087, "epoch": 0.5053960619269503, "flos": 37757556241920.0, "grad_norm": 1.6429847174801282, "language_loss": 0.73644161, "learning_rate": 2.0621091509871378e-06, "loss": 0.76118064, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.21887207, "step": 8406, "time_per_iteration": 2.9892184734344482 }, { "auxiliary_loss_clip": 0.01414694, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.24901152, "balance_loss_mlp": 1.01471615, "epoch": 0.5054561851796182, "flos": 23524356885120.0, "grad_norm": 1.7404957402067074, "language_loss": 0.76938939, "learning_rate": 2.0617198759716568e-06, "loss": 0.79389322, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.2097168, "step": 8407, "time_per_iteration": 2.9271011352539062 }, { "auxiliary_loss_clip": 0.01434682, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.26178205, "balance_loss_mlp": 1.0132463, "epoch": 0.5055163084322862, "flos": 30422690432640.0, "grad_norm": 1.6296972485519716, "language_loss": 0.64397192, "learning_rate": 2.0613305986157535e-06, "loss": 0.66865736, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.20617676, "step": 8408, "time_per_iteration": 2.937140464782715 }, { "auxiliary_loss_clip": 0.01434018, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.26410151, "balance_loss_mlp": 1.01751912, "epoch": 0.5055764316849541, "flos": 20267517340800.0, "grad_norm": 2.010447403194848, "language_loss": 0.64273238, "learning_rate": 2.0609413189341865e-06, "loss": 0.6674673, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.21948242, "step": 8409, "time_per_iteration": 2.874007225036621 }, { "auxiliary_loss_clip": 0.01423959, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.25606585, "balance_loss_mlp": 1.01878381, "epoch": 0.5056365549376222, "flos": 26082867899520.0, "grad_norm": 1.3038123217300452, "language_loss": 0.70965564, "learning_rate": 2.0605520369417193e-06, "loss": 0.73428798, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20507812, "step": 8410, "time_per_iteration": 2.8882715702056885 }, { "auxiliary_loss_clip": 0.01436225, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.26555562, "balance_loss_mlp": 1.02271914, "epoch": 0.5056966781902901, "flos": 19287864771840.0, "grad_norm": 1.7536641631581105, "language_loss": 0.79769498, "learning_rate": 2.060162752653113e-06, "loss": 0.82249802, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21362305, "step": 8411, "time_per_iteration": 2.82243013381958 }, { "auxiliary_loss_clip": 0.01440756, "auxiliary_loss_mlp": 0.01044519, "balance_loss_clip": 1.26840961, "balance_loss_mlp": 1.02259612, "epoch": 0.5057568014429581, "flos": 21332918085120.0, "grad_norm": 2.8024781377972143, "language_loss": 0.82198322, "learning_rate": 2.0597734660831285e-06, "loss": 0.84683597, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.21899414, "step": 8412, "time_per_iteration": 4.352912187576294 }, { "auxiliary_loss_clip": 0.01430433, "auxiliary_loss_mlp": 0.01042969, "balance_loss_clip": 1.26105785, "balance_loss_mlp": 1.0218091, "epoch": 0.505816924695626, "flos": 17502526465920.0, "grad_norm": 1.8306227199991982, "language_loss": 0.81850177, "learning_rate": 2.0593841772465283e-06, "loss": 0.84323573, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21154785, "step": 8413, "time_per_iteration": 4.262189865112305 }, { "auxiliary_loss_clip": 0.01434139, "auxiliary_loss_mlp": 0.01043944, "balance_loss_clip": 1.26196742, "balance_loss_mlp": 1.02116299, "epoch": 0.505877047948294, "flos": 21151938309120.0, "grad_norm": 1.9612729048308963, "language_loss": 0.81214094, "learning_rate": 2.0589948861580737e-06, "loss": 0.83692169, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.22790527, "step": 8414, "time_per_iteration": 4.2377495765686035 }, { "auxiliary_loss_clip": 0.01437115, "auxiliary_loss_mlp": 0.01043772, "balance_loss_clip": 1.26679611, "balance_loss_mlp": 1.02182508, "epoch": 0.5059371712009619, "flos": 36362120791680.0, "grad_norm": 2.0150183168087357, "language_loss": 0.62961209, "learning_rate": 2.058605592832528e-06, "loss": 0.65442097, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21948242, "step": 8415, "time_per_iteration": 2.972032070159912 }, { "auxiliary_loss_clip": 0.01438642, "auxiliary_loss_mlp": 0.0104251, "balance_loss_clip": 1.26687741, "balance_loss_mlp": 1.02079034, "epoch": 0.50599729445363, "flos": 22683352942080.0, "grad_norm": 1.5166072449560248, "language_loss": 0.82559305, "learning_rate": 2.0582162972846515e-06, "loss": 0.85040462, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21716309, "step": 8416, "time_per_iteration": 2.865138053894043 }, { "auxiliary_loss_clip": 0.01430945, "auxiliary_loss_mlp": 0.01042171, "balance_loss_clip": 1.2641269, "balance_loss_mlp": 1.02055824, "epoch": 0.5060574177062979, "flos": 22758242365440.0, "grad_norm": 1.732460657027843, "language_loss": 0.79905635, "learning_rate": 2.0578269995292078e-06, "loss": 0.82378751, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.21606445, "step": 8417, "time_per_iteration": 2.905461072921753 }, { "auxiliary_loss_clip": 0.01415068, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.2491802, "balance_loss_mlp": 1.02263641, "epoch": 0.5061175409589659, "flos": 21663269504640.0, "grad_norm": 1.8617179321424882, "language_loss": 0.63071805, "learning_rate": 2.0574376995809588e-06, "loss": 0.65531206, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21691895, "step": 8418, "time_per_iteration": 2.9169647693634033 }, { "auxiliary_loss_clip": 0.01445495, "auxiliary_loss_mlp": 0.01043975, "balance_loss_clip": 1.27124977, "balance_loss_mlp": 1.02216029, "epoch": 0.5061776642116339, "flos": 21626186751360.0, "grad_norm": 1.8345767274019003, "language_loss": 0.78280735, "learning_rate": 2.0570483974546653e-06, "loss": 0.80770206, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21801758, "step": 8419, "time_per_iteration": 2.885929584503174 }, { "auxiliary_loss_clip": 0.0145107, "auxiliary_loss_mlp": 0.01044201, "balance_loss_clip": 1.27652955, "balance_loss_mlp": 1.0221715, "epoch": 0.5062377874643018, "flos": 24437128360320.0, "grad_norm": 1.9432323093172976, "language_loss": 0.78022265, "learning_rate": 2.0566590931650917e-06, "loss": 0.80517542, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.22033691, "step": 8420, "time_per_iteration": 2.917098045349121 }, { "auxiliary_loss_clip": 0.01449041, "auxiliary_loss_mlp": 0.01039597, "balance_loss_clip": 1.27623057, "balance_loss_mlp": 1.01874733, "epoch": 0.5062979107169698, "flos": 22533574095360.0, "grad_norm": 1.9445207407282616, "language_loss": 0.78593767, "learning_rate": 2.056269786726999e-06, "loss": 0.81082404, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.20861816, "step": 8421, "time_per_iteration": 2.8774590492248535 }, { "auxiliary_loss_clip": 0.01437301, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.266541, "balance_loss_mlp": 1.01505232, "epoch": 0.5063580339696377, "flos": 24582789930240.0, "grad_norm": 1.5494780858198773, "language_loss": 0.67388105, "learning_rate": 2.0558804781551512e-06, "loss": 0.69860363, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19897461, "step": 8422, "time_per_iteration": 2.868140935897827 }, { "auxiliary_loss_clip": 0.01437583, "auxiliary_loss_mlp": 0.01034871, "balance_loss_clip": 1.26879644, "balance_loss_mlp": 1.01416397, "epoch": 0.5064181572223058, "flos": 22604979669120.0, "grad_norm": 1.6061202140833888, "language_loss": 0.8217721, "learning_rate": 2.05549116746431e-06, "loss": 0.84649658, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20690918, "step": 8423, "time_per_iteration": 2.9397659301757812 }, { "auxiliary_loss_clip": 0.01436, "auxiliary_loss_mlp": 0.01039413, "balance_loss_clip": 1.26518965, "balance_loss_mlp": 1.01703691, "epoch": 0.5064782804749737, "flos": 26006123439360.0, "grad_norm": 1.8868588364196885, "language_loss": 0.75892246, "learning_rate": 2.055101854669237e-06, "loss": 0.78367656, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22387695, "step": 8424, "time_per_iteration": 2.8965418338775635 }, { "auxiliary_loss_clip": 0.01429659, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.26327729, "balance_loss_mlp": 1.01455271, "epoch": 0.5065384037276417, "flos": 28565946552960.0, "grad_norm": 2.063314636927337, "language_loss": 0.71662986, "learning_rate": 2.0547125397846975e-06, "loss": 0.74127233, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20031738, "step": 8425, "time_per_iteration": 2.8993122577667236 }, { "auxiliary_loss_clip": 0.01436385, "auxiliary_loss_mlp": 0.01037269, "balance_loss_clip": 1.26627862, "balance_loss_mlp": 1.01718211, "epoch": 0.5065985269803096, "flos": 22976259649920.0, "grad_norm": 1.8888708848470737, "language_loss": 0.79546428, "learning_rate": 2.0543232228254524e-06, "loss": 0.8202008, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.20092773, "step": 8426, "time_per_iteration": 2.885683298110962 }, { "auxiliary_loss_clip": 0.01439051, "auxiliary_loss_mlp": 0.01035264, "balance_loss_clip": 1.26913214, "balance_loss_mlp": 1.01448607, "epoch": 0.5066586502329776, "flos": 21616504364160.0, "grad_norm": 2.1367352476721098, "language_loss": 0.78653181, "learning_rate": 2.053933903806265e-06, "loss": 0.81127489, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.2076416, "step": 8427, "time_per_iteration": 2.8570566177368164 }, { "auxiliary_loss_clip": 0.01427024, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.25858641, "balance_loss_mlp": 1.0127331, "epoch": 0.5067187734856455, "flos": 20349691176960.0, "grad_norm": 1.9628054624046263, "language_loss": 0.72443831, "learning_rate": 2.0535445827418997e-06, "loss": 0.74903893, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20300293, "step": 8428, "time_per_iteration": 2.866793394088745 }, { "auxiliary_loss_clip": 0.01429714, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.26129866, "balance_loss_mlp": 1.01969039, "epoch": 0.5067788967383136, "flos": 28853061926400.0, "grad_norm": 1.8357902240640165, "language_loss": 0.83934456, "learning_rate": 2.0531552596471168e-06, "loss": 0.86403525, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19665527, "step": 8429, "time_per_iteration": 2.8790953159332275 }, { "auxiliary_loss_clip": 0.01460446, "auxiliary_loss_mlp": 0.01040278, "balance_loss_clip": 1.28460729, "balance_loss_mlp": 1.01913011, "epoch": 0.5068390199909815, "flos": 32462314369920.0, "grad_norm": 1.7208891345236699, "language_loss": 0.73860759, "learning_rate": 2.052765934536682e-06, "loss": 0.76361477, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21154785, "step": 8430, "time_per_iteration": 2.929250955581665 }, { "auxiliary_loss_clip": 0.01432024, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 1.26291287, "balance_loss_mlp": 1.016482, "epoch": 0.5068991432436495, "flos": 23156379774720.0, "grad_norm": 2.0686053569944005, "language_loss": 0.77607858, "learning_rate": 2.0523766074253575e-06, "loss": 0.80076796, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20446777, "step": 8431, "time_per_iteration": 2.8330466747283936 }, { "auxiliary_loss_clip": 0.01426981, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.2587738, "balance_loss_mlp": 1.01780772, "epoch": 0.5069592664963174, "flos": 19945129006080.0, "grad_norm": 1.4482861068722297, "language_loss": 0.73027706, "learning_rate": 2.0519872783279074e-06, "loss": 0.75493163, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20666504, "step": 8432, "time_per_iteration": 2.852435827255249 }, { "auxiliary_loss_clip": 0.01221189, "auxiliary_loss_mlp": 0.01040828, "balance_loss_clip": 1.1277914, "balance_loss_mlp": 1.01393437, "epoch": 0.5070193897489854, "flos": 65822894616960.0, "grad_norm": 0.7653311351175733, "language_loss": 0.63771522, "learning_rate": 2.0515979472590945e-06, "loss": 0.66033536, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.26953125, "step": 8433, "time_per_iteration": 3.3860888481140137 }, { "auxiliary_loss_clip": 0.01438387, "auxiliary_loss_mlp": 0.01038699, "balance_loss_clip": 1.26827097, "balance_loss_mlp": 1.01811135, "epoch": 0.5070795130016534, "flos": 17284373447040.0, "grad_norm": 1.7378845091205517, "language_loss": 0.77607685, "learning_rate": 2.051208614233681e-06, "loss": 0.80084777, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20593262, "step": 8434, "time_per_iteration": 2.8372671604156494 }, { "auxiliary_loss_clip": 0.01451932, "auxiliary_loss_mlp": 0.01043663, "balance_loss_clip": 1.27796197, "balance_loss_mlp": 1.0227294, "epoch": 0.5071396362543213, "flos": 21079989797760.0, "grad_norm": 1.9051270118005597, "language_loss": 0.71920085, "learning_rate": 2.0508192792664326e-06, "loss": 0.74415678, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.20935059, "step": 8435, "time_per_iteration": 2.8685712814331055 }, { "auxiliary_loss_clip": 0.01439336, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.26728106, "balance_loss_mlp": 1.01666522, "epoch": 0.5071997595069894, "flos": 23154162779520.0, "grad_norm": 2.29015857664624, "language_loss": 0.72786754, "learning_rate": 2.050429942372112e-06, "loss": 0.75264674, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.21923828, "step": 8436, "time_per_iteration": 4.290988445281982 }, { "auxiliary_loss_clip": 0.01439128, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.26943064, "balance_loss_mlp": 1.01244605, "epoch": 0.5072598827596573, "flos": 22757382714240.0, "grad_norm": 1.583456893096289, "language_loss": 0.84407461, "learning_rate": 2.050040603565483e-06, "loss": 0.86880958, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21948242, "step": 8437, "time_per_iteration": 2.816401481628418 }, { "auxiliary_loss_clip": 0.01427753, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.25975585, "balance_loss_mlp": 1.01726496, "epoch": 0.5073200060123253, "flos": 22576855386240.0, "grad_norm": 1.4303919173805857, "language_loss": 0.8158921, "learning_rate": 2.049651262861309e-06, "loss": 0.840545, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20263672, "step": 8438, "time_per_iteration": 2.8777084350585938 }, { "auxiliary_loss_clip": 0.01436973, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.26480055, "balance_loss_mlp": 1.0133934, "epoch": 0.5073801292649932, "flos": 25815868479360.0, "grad_norm": 1.7607600728688417, "language_loss": 0.80178136, "learning_rate": 2.0492619202743543e-06, "loss": 0.82650161, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.2166748, "step": 8439, "time_per_iteration": 2.858907461166382 }, { "auxiliary_loss_clip": 0.01418149, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.25240088, "balance_loss_mlp": 1.01291025, "epoch": 0.5074402525176612, "flos": 25384765593600.0, "grad_norm": 1.5998205616015522, "language_loss": 0.72110605, "learning_rate": 2.048872575819383e-06, "loss": 0.74561858, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20202637, "step": 8440, "time_per_iteration": 2.9026904106140137 }, { "auxiliary_loss_clip": 0.01430866, "auxiliary_loss_mlp": 0.01037097, "balance_loss_clip": 1.26155519, "balance_loss_mlp": 1.01594889, "epoch": 0.5075003757703291, "flos": 26074723835520.0, "grad_norm": 1.7041340309360287, "language_loss": 0.71429974, "learning_rate": 2.048483229511158e-06, "loss": 0.73897934, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.21154785, "step": 8441, "time_per_iteration": 2.887484550476074 }, { "auxiliary_loss_clip": 0.01442662, "auxiliary_loss_mlp": 0.01041246, "balance_loss_clip": 1.26866114, "balance_loss_mlp": 1.01977658, "epoch": 0.5075604990229972, "flos": 21845516135040.0, "grad_norm": 1.7267076387361733, "language_loss": 0.64661813, "learning_rate": 2.0480938813644445e-06, "loss": 0.67145723, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21484375, "step": 8442, "time_per_iteration": 2.8651015758514404 }, { "auxiliary_loss_clip": 0.0141437, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.24971962, "balance_loss_mlp": 1.0102694, "epoch": 0.5076206222756651, "flos": 31991459287680.0, "grad_norm": 1.6497126657473655, "language_loss": 0.71864057, "learning_rate": 2.047704531394006e-06, "loss": 0.7430914, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.2043457, "step": 8443, "time_per_iteration": 2.965240478515625 }, { "auxiliary_loss_clip": 0.01448525, "auxiliary_loss_mlp": 0.01039972, "balance_loss_clip": 1.27626359, "balance_loss_mlp": 1.01858568, "epoch": 0.5076807455283331, "flos": 36918000132480.0, "grad_norm": 1.3541423082557058, "language_loss": 0.62481171, "learning_rate": 2.047315179614607e-06, "loss": 0.64969659, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21374512, "step": 8444, "time_per_iteration": 3.0283420085906982 }, { "auxiliary_loss_clip": 0.01431626, "auxiliary_loss_mlp": 0.01033994, "balance_loss_clip": 1.26292276, "balance_loss_mlp": 1.01278698, "epoch": 0.507740868781001, "flos": 29874593197440.0, "grad_norm": 2.6411471704673612, "language_loss": 0.64609236, "learning_rate": 2.046925826041012e-06, "loss": 0.67074859, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.2121582, "step": 8445, "time_per_iteration": 2.9207613468170166 }, { "auxiliary_loss_clip": 0.01216104, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.12189209, "balance_loss_mlp": 1.00354099, "epoch": 0.507800992033669, "flos": 61945194919680.0, "grad_norm": 0.8654408244143234, "language_loss": 0.62085617, "learning_rate": 2.0465364706879845e-06, "loss": 0.64331579, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 0.26367188, "step": 8446, "time_per_iteration": 3.4733080863952637 }, { "auxiliary_loss_clip": 0.014274, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.25941229, "balance_loss_mlp": 1.01484013, "epoch": 0.507861115286337, "flos": 20709252754560.0, "grad_norm": 1.5720390053304352, "language_loss": 0.81486833, "learning_rate": 2.04614711357029e-06, "loss": 0.83950096, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21008301, "step": 8447, "time_per_iteration": 4.339062452316284 }, { "auxiliary_loss_clip": 0.01418025, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.25277555, "balance_loss_mlp": 1.01272357, "epoch": 0.507921238539005, "flos": 30859358428800.0, "grad_norm": 1.522616806236274, "language_loss": 0.71560478, "learning_rate": 2.0457577547026916e-06, "loss": 0.74011636, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20410156, "step": 8448, "time_per_iteration": 4.964825391769409 }, { "auxiliary_loss_clip": 0.01425137, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.25868201, "balance_loss_mlp": 1.0130204, "epoch": 0.507981361791673, "flos": 35713905517440.0, "grad_norm": 1.4941117690384542, "language_loss": 0.72284615, "learning_rate": 2.045368394099955e-06, "loss": 0.74743307, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20532227, "step": 8449, "time_per_iteration": 4.393014669418335 }, { "auxiliary_loss_clip": 0.01420406, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.25414467, "balance_loss_mlp": 1.01688552, "epoch": 0.5080414850443409, "flos": 27172682853120.0, "grad_norm": 1.7344679430012517, "language_loss": 0.73895395, "learning_rate": 2.044979031776844e-06, "loss": 0.7635355, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20874023, "step": 8450, "time_per_iteration": 2.9104833602905273 }, { "auxiliary_loss_clip": 0.0142532, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.25542557, "balance_loss_mlp": 1.0142777, "epoch": 0.5081016082970089, "flos": 27095531189760.0, "grad_norm": 1.682311419242521, "language_loss": 0.77404702, "learning_rate": 2.0445896677481234e-06, "loss": 0.79865694, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21386719, "step": 8451, "time_per_iteration": 2.882288932800293 }, { "auxiliary_loss_clip": 0.01429199, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.25828826, "balance_loss_mlp": 1.01462281, "epoch": 0.5081617315496768, "flos": 22866730692480.0, "grad_norm": 1.9421302278091368, "language_loss": 0.86212534, "learning_rate": 2.044200302028559e-06, "loss": 0.88677466, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21118164, "step": 8452, "time_per_iteration": 2.833099126815796 }, { "auxiliary_loss_clip": 0.01432897, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.2606442, "balance_loss_mlp": 1.01395726, "epoch": 0.5082218548023448, "flos": 16288523239680.0, "grad_norm": 2.7281836908078336, "language_loss": 0.79008687, "learning_rate": 2.0438109346329143e-06, "loss": 0.81477797, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.22253418, "step": 8453, "time_per_iteration": 2.839404821395874 }, { "auxiliary_loss_clip": 0.01418628, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.25290036, "balance_loss_mlp": 1.0133692, "epoch": 0.5082819780550127, "flos": 24471089222400.0, "grad_norm": 2.17531218197675, "language_loss": 0.77645528, "learning_rate": 2.0434215655759544e-06, "loss": 0.80099279, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.2175293, "step": 8454, "time_per_iteration": 2.8677916526794434 }, { "auxiliary_loss_clip": 0.01429336, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.25981522, "balance_loss_mlp": 1.0144515, "epoch": 0.5083421013076808, "flos": 23413470583680.0, "grad_norm": 2.4041914716097925, "language_loss": 0.89991367, "learning_rate": 2.0430321948724446e-06, "loss": 0.92456883, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21740723, "step": 8455, "time_per_iteration": 2.873537302017212 }, { "auxiliary_loss_clip": 0.01435059, "auxiliary_loss_mlp": 0.0103809, "balance_loss_clip": 1.26044369, "balance_loss_mlp": 1.01539254, "epoch": 0.5084022245603487, "flos": 23881746712320.0, "grad_norm": 1.7866825756749463, "language_loss": 0.63025296, "learning_rate": 2.042642822537149e-06, "loss": 0.65498441, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.22692871, "step": 8456, "time_per_iteration": 2.8587663173675537 }, { "auxiliary_loss_clip": 0.01217567, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.1222471, "balance_loss_mlp": 1.00522864, "epoch": 0.5084623478130167, "flos": 62901292930560.0, "grad_norm": 0.8306785563096524, "language_loss": 0.62488312, "learning_rate": 2.0422534485848343e-06, "loss": 0.64734852, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.23730469, "step": 8457, "time_per_iteration": 3.244344711303711 }, { "auxiliary_loss_clip": 0.0144601, "auxiliary_loss_mlp": 0.01035408, "balance_loss_clip": 1.27390051, "balance_loss_mlp": 1.01489186, "epoch": 0.5085224710656846, "flos": 22356304392960.0, "grad_norm": 2.3328748839939553, "language_loss": 0.68403846, "learning_rate": 2.0418640730302644e-06, "loss": 0.70885265, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.2052002, "step": 8458, "time_per_iteration": 2.9490063190460205 }, { "auxiliary_loss_clip": 0.01432609, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.26065946, "balance_loss_mlp": 1.01539552, "epoch": 0.5085825943183526, "flos": 26077031320320.0, "grad_norm": 1.8405414946572345, "language_loss": 0.78292191, "learning_rate": 2.0414746958882043e-06, "loss": 0.80761492, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.2130127, "step": 8459, "time_per_iteration": 2.911602735519409 }, { "auxiliary_loss_clip": 0.01446789, "auxiliary_loss_mlp": 0.01039546, "balance_loss_clip": 1.27260363, "balance_loss_mlp": 1.01748037, "epoch": 0.5086427175710206, "flos": 17429673058560.0, "grad_norm": 2.7233004018424007, "language_loss": 0.81393164, "learning_rate": 2.0410853171734196e-06, "loss": 0.83879501, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.22058105, "step": 8460, "time_per_iteration": 2.85504150390625 }, { "auxiliary_loss_clip": 0.01443177, "auxiliary_loss_mlp": 0.01034761, "balance_loss_clip": 1.27052593, "balance_loss_mlp": 1.01363707, "epoch": 0.5087028408236886, "flos": 20641602499200.0, "grad_norm": 1.8157133034936361, "language_loss": 0.69996375, "learning_rate": 2.0406959369006754e-06, "loss": 0.72474313, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21118164, "step": 8461, "time_per_iteration": 2.847578287124634 }, { "auxiliary_loss_clip": 0.01415334, "auxiliary_loss_mlp": 0.01037439, "balance_loss_clip": 1.24991202, "balance_loss_mlp": 1.01488471, "epoch": 0.5087629640763566, "flos": 25604954628480.0, "grad_norm": 1.7134584303536429, "language_loss": 0.7685191, "learning_rate": 2.0403065550847375e-06, "loss": 0.79304683, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.22558594, "step": 8462, "time_per_iteration": 2.894981861114502 }, { "auxiliary_loss_clip": 0.01427752, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.25948405, "balance_loss_mlp": 1.01225042, "epoch": 0.5088230873290245, "flos": 13269563447040.0, "grad_norm": 2.338163313496933, "language_loss": 0.83358115, "learning_rate": 2.0399171717403706e-06, "loss": 0.85819429, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21313477, "step": 8463, "time_per_iteration": 2.8044345378875732 }, { "auxiliary_loss_clip": 0.01433299, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.26376116, "balance_loss_mlp": 1.01255465, "epoch": 0.5088832105816925, "flos": 20051988520320.0, "grad_norm": 1.7699699636978816, "language_loss": 0.76720876, "learning_rate": 2.039527786882341e-06, "loss": 0.79188377, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21655273, "step": 8464, "time_per_iteration": 2.8320083618164062 }, { "auxiliary_loss_clip": 0.01211248, "auxiliary_loss_mlp": 0.01019708, "balance_loss_clip": 1.11585331, "balance_loss_mlp": 0.99834615, "epoch": 0.5089433338343604, "flos": 67457702897280.0, "grad_norm": 0.6823945633790366, "language_loss": 0.59381807, "learning_rate": 2.0391384005254133e-06, "loss": 0.61612767, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.21386719, "step": 8465, "time_per_iteration": 3.509812593460083 }, { "auxiliary_loss_clip": 0.01427481, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.25800645, "balance_loss_mlp": 1.01762342, "epoch": 0.5090034570870284, "flos": 22720435695360.0, "grad_norm": 2.0060204939731205, "language_loss": 0.80920583, "learning_rate": 2.038749012684354e-06, "loss": 0.83387327, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21630859, "step": 8466, "time_per_iteration": 2.8685302734375 }, { "auxiliary_loss_clip": 0.0141788, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.25118411, "balance_loss_mlp": 1.01312602, "epoch": 0.5090635803396963, "flos": 20454695654400.0, "grad_norm": 1.648144738558884, "language_loss": 0.79286939, "learning_rate": 2.0383596233739286e-06, "loss": 0.81739056, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.21105957, "step": 8467, "time_per_iteration": 2.8418898582458496 }, { "auxiliary_loss_clip": 0.01412471, "auxiliary_loss_mlp": 0.01037236, "balance_loss_clip": 1.24932921, "balance_loss_mlp": 1.01607597, "epoch": 0.5091237035923644, "flos": 23779366433280.0, "grad_norm": 2.5223961911644124, "language_loss": 0.75128925, "learning_rate": 2.0379702326089013e-06, "loss": 0.77578634, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.21142578, "step": 8468, "time_per_iteration": 2.878491163253784 }, { "auxiliary_loss_clip": 0.01426646, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.25845945, "balance_loss_mlp": 1.01653337, "epoch": 0.5091838268450323, "flos": 18335793548160.0, "grad_norm": 1.8354463492754354, "language_loss": 0.79107428, "learning_rate": 2.03758084040404e-06, "loss": 0.81570554, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19946289, "step": 8469, "time_per_iteration": 2.8228046894073486 }, { "auxiliary_loss_clip": 0.01440354, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.27181542, "balance_loss_mlp": 1.02102494, "epoch": 0.5092439500977003, "flos": 29069043194880.0, "grad_norm": 1.4134218104524228, "language_loss": 0.70336318, "learning_rate": 2.037191446774109e-06, "loss": 0.72819531, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.21838379, "step": 8470, "time_per_iteration": 2.8960089683532715 }, { "auxiliary_loss_clip": 0.0143329, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.26209855, "balance_loss_mlp": 1.02068615, "epoch": 0.5093040733503682, "flos": 13562017706880.0, "grad_norm": 1.9841287556265208, "language_loss": 0.7426551, "learning_rate": 2.0368020517338745e-06, "loss": 0.76741827, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.2232666, "step": 8471, "time_per_iteration": 4.278416156768799 }, { "auxiliary_loss_clip": 0.01215659, "auxiliary_loss_mlp": 0.01021386, "balance_loss_clip": 1.11803198, "balance_loss_mlp": 0.99983293, "epoch": 0.5093641966030362, "flos": 68939547212160.0, "grad_norm": 0.7532947150471525, "language_loss": 0.58198535, "learning_rate": 2.036412655298103e-06, "loss": 0.60435581, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.21582031, "step": 8472, "time_per_iteration": 3.354746103286743 }, { "auxiliary_loss_clip": 0.0143113, "auxiliary_loss_mlp": 0.01040663, "balance_loss_clip": 1.260566, "balance_loss_mlp": 1.01993227, "epoch": 0.5094243198557042, "flos": 21591185258880.0, "grad_norm": 1.739409784669117, "language_loss": 0.69643807, "learning_rate": 2.03602325748156e-06, "loss": 0.721156, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20727539, "step": 8473, "time_per_iteration": 2.8897671699523926 }, { "auxiliary_loss_clip": 0.01425257, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.25702477, "balance_loss_mlp": 1.01597404, "epoch": 0.5094844431083722, "flos": 28852609478400.0, "grad_norm": 2.0330218032089653, "language_loss": 0.8581996, "learning_rate": 2.0356338582990105e-06, "loss": 0.88281631, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20446777, "step": 8474, "time_per_iteration": 2.881608486175537 }, { "auxiliary_loss_clip": 0.01441517, "auxiliary_loss_mlp": 0.01037902, "balance_loss_clip": 1.26939368, "balance_loss_mlp": 1.01608646, "epoch": 0.5095445663610402, "flos": 14984401075200.0, "grad_norm": 2.010098043535404, "language_loss": 0.65807259, "learning_rate": 2.035244457765222e-06, "loss": 0.68286681, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21813965, "step": 8475, "time_per_iteration": 2.827157735824585 }, { "auxiliary_loss_clip": 0.01460767, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.28499329, "balance_loss_mlp": 1.02044606, "epoch": 0.5096046896137081, "flos": 20786811621120.0, "grad_norm": 6.817158506031396, "language_loss": 0.82803774, "learning_rate": 2.0348550558949605e-06, "loss": 0.8530823, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.23278809, "step": 8476, "time_per_iteration": 2.861250400543213 }, { "auxiliary_loss_clip": 0.01447887, "auxiliary_loss_mlp": 0.01042663, "balance_loss_clip": 1.27405226, "balance_loss_mlp": 1.01919031, "epoch": 0.5096648128663761, "flos": 23195408054400.0, "grad_norm": 2.2836268480174553, "language_loss": 0.81102234, "learning_rate": 2.0344656527029917e-06, "loss": 0.83592784, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.23461914, "step": 8477, "time_per_iteration": 2.887803316116333 }, { "auxiliary_loss_clip": 0.01442376, "auxiliary_loss_mlp": 0.01036556, "balance_loss_clip": 1.27089906, "balance_loss_mlp": 1.01403737, "epoch": 0.509724936119044, "flos": 22319538353280.0, "grad_norm": 1.8553245167643873, "language_loss": 0.62620336, "learning_rate": 2.034076248204082e-06, "loss": 0.65099269, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.22521973, "step": 8478, "time_per_iteration": 3.0758187770843506 }, { "auxiliary_loss_clip": 0.01423336, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.25495028, "balance_loss_mlp": 1.01662636, "epoch": 0.509785059371712, "flos": 26298396720000.0, "grad_norm": 2.979207101871582, "language_loss": 0.67171574, "learning_rate": 2.0336868424129968e-06, "loss": 0.69633263, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21728516, "step": 8479, "time_per_iteration": 2.8875436782836914 }, { "auxiliary_loss_clip": 0.01425821, "auxiliary_loss_mlp": 0.01033283, "balance_loss_clip": 1.25873423, "balance_loss_mlp": 1.012815, "epoch": 0.50984518262438, "flos": 22974449857920.0, "grad_norm": 1.820786164358667, "language_loss": 0.70222384, "learning_rate": 2.0332974353445037e-06, "loss": 0.72681487, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20471191, "step": 8480, "time_per_iteration": 2.8651437759399414 }, { "auxiliary_loss_clip": 0.01433517, "auxiliary_loss_mlp": 0.01034009, "balance_loss_clip": 1.26117277, "balance_loss_mlp": 1.01207411, "epoch": 0.509905305877048, "flos": 26224502682240.0, "grad_norm": 1.9760478129768126, "language_loss": 0.79987741, "learning_rate": 2.0329080270133688e-06, "loss": 0.82455271, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.21948242, "step": 8481, "time_per_iteration": 2.8679933547973633 }, { "auxiliary_loss_clip": 0.01418363, "auxiliary_loss_mlp": 0.01040737, "balance_loss_clip": 1.25112557, "balance_loss_mlp": 1.01907635, "epoch": 0.5099654291297159, "flos": 20349872156160.0, "grad_norm": 1.5151467579629598, "language_loss": 0.8407771, "learning_rate": 2.0325186174343578e-06, "loss": 0.86536813, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.2166748, "step": 8482, "time_per_iteration": 4.229772567749023 }, { "auxiliary_loss_clip": 0.01444211, "auxiliary_loss_mlp": 0.0103713, "balance_loss_clip": 1.2695421, "balance_loss_mlp": 1.015517, "epoch": 0.5100255523823839, "flos": 29065514100480.0, "grad_norm": 1.9258168624672556, "language_loss": 0.86123598, "learning_rate": 2.032129206622238e-06, "loss": 0.88604939, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.21618652, "step": 8483, "time_per_iteration": 4.432432174682617 }, { "auxiliary_loss_clip": 0.01428916, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.25836635, "balance_loss_mlp": 1.01372647, "epoch": 0.5100856756350518, "flos": 22466195308800.0, "grad_norm": 2.636592216805329, "language_loss": 0.83782536, "learning_rate": 2.031739794591775e-06, "loss": 0.8624568, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20507812, "step": 8484, "time_per_iteration": 4.2665276527404785 }, { "auxiliary_loss_clip": 0.01423155, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.25322938, "balance_loss_mlp": 1.01166415, "epoch": 0.5101457988877198, "flos": 19180190851200.0, "grad_norm": 1.9135268971999295, "language_loss": 0.82423472, "learning_rate": 2.031350381357736e-06, "loss": 0.84879559, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21264648, "step": 8485, "time_per_iteration": 2.8266758918762207 }, { "auxiliary_loss_clip": 0.01419282, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.25323403, "balance_loss_mlp": 1.01675034, "epoch": 0.5102059221403878, "flos": 14874555404160.0, "grad_norm": 1.8139421965510567, "language_loss": 0.74664581, "learning_rate": 2.0309609669348874e-06, "loss": 0.77123201, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.22583008, "step": 8486, "time_per_iteration": 2.828829765319824 }, { "auxiliary_loss_clip": 0.0144207, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.26833689, "balance_loss_mlp": 1.01868415, "epoch": 0.5102660453930558, "flos": 22970061112320.0, "grad_norm": 1.5455227727815348, "language_loss": 0.7068947, "learning_rate": 2.0305715513379953e-06, "loss": 0.73172468, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.22253418, "step": 8487, "time_per_iteration": 3.0205676555633545 }, { "auxiliary_loss_clip": 0.01421009, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.25363541, "balance_loss_mlp": 1.01493394, "epoch": 0.5103261686457238, "flos": 23159818379520.0, "grad_norm": 2.05230218229551, "language_loss": 0.7390396, "learning_rate": 2.030182134581827e-06, "loss": 0.76361763, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21862793, "step": 8488, "time_per_iteration": 2.8650991916656494 }, { "auxiliary_loss_clip": 0.0145235, "auxiliary_loss_mlp": 0.01039204, "balance_loss_clip": 1.27871299, "balance_loss_mlp": 1.01706648, "epoch": 0.5103862918983917, "flos": 14327317820160.0, "grad_norm": 1.8176735050634334, "language_loss": 0.70638013, "learning_rate": 2.0297927166811503e-06, "loss": 0.7312957, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.22131348, "step": 8489, "time_per_iteration": 2.858232259750366 }, { "auxiliary_loss_clip": 0.01427147, "auxiliary_loss_mlp": 0.01039652, "balance_loss_clip": 1.25692677, "balance_loss_mlp": 1.0179913, "epoch": 0.5104464151510597, "flos": 25859556973440.0, "grad_norm": 3.520186770964302, "language_loss": 0.73314935, "learning_rate": 2.0294032976507297e-06, "loss": 0.75781733, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2166748, "step": 8490, "time_per_iteration": 2.8718101978302 }, { "auxiliary_loss_clip": 0.0141633, "auxiliary_loss_mlp": 0.01038373, "balance_loss_clip": 1.25035453, "balance_loss_mlp": 1.0168674, "epoch": 0.5105065384037276, "flos": 21662907546240.0, "grad_norm": 1.5323945782095576, "language_loss": 0.81554991, "learning_rate": 2.0290138775053337e-06, "loss": 0.84009689, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21508789, "step": 8491, "time_per_iteration": 2.8677096366882324 }, { "auxiliary_loss_clip": 0.01406408, "auxiliary_loss_mlp": 0.01037752, "balance_loss_clip": 1.24294424, "balance_loss_mlp": 1.01727128, "epoch": 0.5105666616563956, "flos": 22501649249280.0, "grad_norm": 4.1781945342388305, "language_loss": 0.80294174, "learning_rate": 2.028624456259728e-06, "loss": 0.82738328, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20471191, "step": 8492, "time_per_iteration": 2.848491668701172 }, { "auxiliary_loss_clip": 0.01439867, "auxiliary_loss_mlp": 0.01042878, "balance_loss_clip": 1.26657033, "balance_loss_mlp": 1.02059722, "epoch": 0.5106267849090635, "flos": 22466014329600.0, "grad_norm": 1.7787385288085535, "language_loss": 0.78721428, "learning_rate": 2.0282350339286804e-06, "loss": 0.8120417, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.22277832, "step": 8493, "time_per_iteration": 2.86641788482666 }, { "auxiliary_loss_clip": 0.01431561, "auxiliary_loss_mlp": 0.01040713, "balance_loss_clip": 1.26132786, "balance_loss_mlp": 1.01797986, "epoch": 0.5106869081617316, "flos": 23556779424000.0, "grad_norm": 1.8551252394940954, "language_loss": 0.84677356, "learning_rate": 2.0278456105269574e-06, "loss": 0.87149632, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.22717285, "step": 8494, "time_per_iteration": 2.907911777496338 }, { "auxiliary_loss_clip": 0.0143045, "auxiliary_loss_mlp": 0.01035945, "balance_loss_clip": 1.26022434, "balance_loss_mlp": 1.01519036, "epoch": 0.5107470314143995, "flos": 26803167419520.0, "grad_norm": 2.091037092639621, "language_loss": 0.79817587, "learning_rate": 2.027456186069326e-06, "loss": 0.8228398, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.2076416, "step": 8495, "time_per_iteration": 2.881962299346924 }, { "auxiliary_loss_clip": 0.01431582, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.26166749, "balance_loss_mlp": 1.01814771, "epoch": 0.5108071546670675, "flos": 25750842422400.0, "grad_norm": 2.0340424922799083, "language_loss": 0.7879591, "learning_rate": 2.0270667605705535e-06, "loss": 0.81266659, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21008301, "step": 8496, "time_per_iteration": 2.966218948364258 }, { "auxiliary_loss_clip": 0.01419311, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.25336528, "balance_loss_mlp": 1.0156405, "epoch": 0.5108672779197354, "flos": 18706756815360.0, "grad_norm": 2.0640895084225144, "language_loss": 0.7925334, "learning_rate": 2.0266773340454066e-06, "loss": 0.81710023, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21740723, "step": 8497, "time_per_iteration": 2.835843324661255 }, { "auxiliary_loss_clip": 0.01418484, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.25110984, "balance_loss_mlp": 1.01478744, "epoch": 0.5109274011724034, "flos": 26699520286080.0, "grad_norm": 2.00093596672389, "language_loss": 0.82445586, "learning_rate": 2.0262879065086525e-06, "loss": 0.8489995, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.2109375, "step": 8498, "time_per_iteration": 2.914102792739868 }, { "auxiliary_loss_clip": 0.01414831, "auxiliary_loss_mlp": 0.01042214, "balance_loss_clip": 1.24974549, "balance_loss_mlp": 1.01871812, "epoch": 0.5109875244250714, "flos": 22794329733120.0, "grad_norm": 2.13561477992394, "language_loss": 0.72403753, "learning_rate": 2.0258984779750584e-06, "loss": 0.74860799, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.23498535, "step": 8499, "time_per_iteration": 2.887773036956787 }, { "auxiliary_loss_clip": 0.01443465, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.27218497, "balance_loss_mlp": 1.01409519, "epoch": 0.5110476476777394, "flos": 35601616627200.0, "grad_norm": 1.5207032223026902, "language_loss": 0.72883856, "learning_rate": 2.0255090484593914e-06, "loss": 0.75364399, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.22973633, "step": 8500, "time_per_iteration": 3.006488084793091 }, { "auxiliary_loss_clip": 0.0144705, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.27033281, "balance_loss_mlp": 1.01592755, "epoch": 0.5111077709304074, "flos": 19290036522240.0, "grad_norm": 3.931245653276883, "language_loss": 0.64368749, "learning_rate": 2.0251196179764183e-06, "loss": 0.66855174, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.23461914, "step": 8501, "time_per_iteration": 2.8542048931121826 }, { "auxiliary_loss_clip": 0.01432117, "auxiliary_loss_mlp": 0.01038246, "balance_loss_clip": 1.25890028, "balance_loss_mlp": 1.01578712, "epoch": 0.5111678941830753, "flos": 20678051825280.0, "grad_norm": 1.6880578447338213, "language_loss": 0.884152, "learning_rate": 2.024730186540907e-06, "loss": 0.90885562, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.2244873, "step": 8502, "time_per_iteration": 2.8260998725891113 }, { "auxiliary_loss_clip": 0.01416189, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.24792182, "balance_loss_mlp": 1.01300144, "epoch": 0.5112280174357433, "flos": 26299437350400.0, "grad_norm": 1.4873440754669713, "language_loss": 0.83154225, "learning_rate": 2.0243407541676253e-06, "loss": 0.85606116, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.22717285, "step": 8503, "time_per_iteration": 2.927774429321289 }, { "auxiliary_loss_clip": 0.01231329, "auxiliary_loss_mlp": 0.01046073, "balance_loss_clip": 1.13070178, "balance_loss_mlp": 1.01498365, "epoch": 0.5112881406884112, "flos": 59499787201920.0, "grad_norm": 0.8581867575325997, "language_loss": 0.63910949, "learning_rate": 2.023951320871339e-06, "loss": 0.66188359, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.31054688, "step": 8504, "time_per_iteration": 3.3933780193328857 }, { "auxiliary_loss_clip": 0.01419903, "auxiliary_loss_mlp": 0.01032978, "balance_loss_clip": 1.25241613, "balance_loss_mlp": 1.01036453, "epoch": 0.5113482639410792, "flos": 26480055168000.0, "grad_norm": 1.8758401628845973, "language_loss": 0.8468399, "learning_rate": 2.023561886666816e-06, "loss": 0.87136877, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.22619629, "step": 8505, "time_per_iteration": 2.9149420261383057 }, { "auxiliary_loss_clip": 0.01417343, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.25114751, "balance_loss_mlp": 1.01468241, "epoch": 0.5114083871937471, "flos": 29907241960320.0, "grad_norm": 2.297806455726351, "language_loss": 0.76209646, "learning_rate": 2.0231724515688246e-06, "loss": 0.78664398, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.22717285, "step": 8506, "time_per_iteration": 4.3589324951171875 }, { "auxiliary_loss_clip": 0.01414429, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.24666297, "balance_loss_mlp": 1.01489627, "epoch": 0.5114685104464152, "flos": 24324794225280.0, "grad_norm": 1.697927304266879, "language_loss": 0.58565152, "learning_rate": 2.022783015592131e-06, "loss": 0.61018538, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.24060059, "step": 8507, "time_per_iteration": 2.856609582901001 }, { "auxiliary_loss_clip": 0.0142919, "auxiliary_loss_mlp": 0.01038311, "balance_loss_clip": 1.2608496, "balance_loss_mlp": 1.01543486, "epoch": 0.5115286336990831, "flos": 17028097044480.0, "grad_norm": 2.606928141595282, "language_loss": 0.86179835, "learning_rate": 2.022393578751503e-06, "loss": 0.88647342, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.2286377, "step": 8508, "time_per_iteration": 2.848177671432495 }, { "auxiliary_loss_clip": 0.01419511, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.25178432, "balance_loss_mlp": 1.01400578, "epoch": 0.5115887569517511, "flos": 23669837475840.0, "grad_norm": 2.4207397662807906, "language_loss": 0.73308265, "learning_rate": 2.022004141061709e-06, "loss": 0.75764459, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.22680664, "step": 8509, "time_per_iteration": 2.861140489578247 }, { "auxiliary_loss_clip": 0.01410964, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.24672747, "balance_loss_mlp": 1.01463473, "epoch": 0.511648880204419, "flos": 16115778017280.0, "grad_norm": 2.8864417737694428, "language_loss": 0.76715553, "learning_rate": 2.0216147025375153e-06, "loss": 0.7916292, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2175293, "step": 8510, "time_per_iteration": 2.7844481468200684 }, { "auxiliary_loss_clip": 0.01428313, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.26259041, "balance_loss_mlp": 1.01517737, "epoch": 0.511709003457087, "flos": 32647773381120.0, "grad_norm": 1.943786921543668, "language_loss": 0.71331018, "learning_rate": 2.0212252631936907e-06, "loss": 0.73797035, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.22558594, "step": 8511, "time_per_iteration": 2.962165355682373 }, { "auxiliary_loss_clip": 0.01428651, "auxiliary_loss_mlp": 0.01036546, "balance_loss_clip": 1.26280046, "balance_loss_mlp": 1.01423001, "epoch": 0.511769126709755, "flos": 21772029300480.0, "grad_norm": 2.166333767808703, "language_loss": 0.67615151, "learning_rate": 2.020835823045001e-06, "loss": 0.7008034, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.2232666, "step": 8512, "time_per_iteration": 2.8737900257110596 }, { "auxiliary_loss_clip": 0.01424425, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.25503671, "balance_loss_mlp": 1.01679635, "epoch": 0.511829249962423, "flos": 23926023388800.0, "grad_norm": 1.891302863438915, "language_loss": 0.68263471, "learning_rate": 2.0204463821062146e-06, "loss": 0.70727527, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.22839355, "step": 8513, "time_per_iteration": 2.8625845909118652 }, { "auxiliary_loss_clip": 0.01419399, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.25372791, "balance_loss_mlp": 1.01656151, "epoch": 0.511889373215091, "flos": 23736085142400.0, "grad_norm": 2.733267677875469, "language_loss": 0.69553816, "learning_rate": 2.0200569403921e-06, "loss": 0.72012645, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.22839355, "step": 8514, "time_per_iteration": 2.926398754119873 }, { "auxiliary_loss_clip": 0.01422022, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.25409102, "balance_loss_mlp": 1.02101302, "epoch": 0.5119494964677589, "flos": 28123351488000.0, "grad_norm": 2.146318832323308, "language_loss": 0.67199099, "learning_rate": 2.019667497917424e-06, "loss": 0.6966399, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21850586, "step": 8515, "time_per_iteration": 2.9258053302764893 }, { "auxiliary_loss_clip": 0.01414138, "auxiliary_loss_mlp": 0.01042001, "balance_loss_clip": 1.24893093, "balance_loss_mlp": 1.01948214, "epoch": 0.5120096197204269, "flos": 24984094475520.0, "grad_norm": 1.884097709251792, "language_loss": 0.76684868, "learning_rate": 2.019278054696955e-06, "loss": 0.79141009, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.22473145, "step": 8516, "time_per_iteration": 2.8912696838378906 }, { "auxiliary_loss_clip": 0.01424731, "auxiliary_loss_mlp": 0.01041492, "balance_loss_clip": 1.25930572, "balance_loss_mlp": 1.01893711, "epoch": 0.5120697429730948, "flos": 17977091621760.0, "grad_norm": 1.9087127385973055, "language_loss": 0.78812855, "learning_rate": 2.0188886107454595e-06, "loss": 0.81279075, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.22546387, "step": 8517, "time_per_iteration": 4.209630250930786 }, { "auxiliary_loss_clip": 0.01442469, "auxiliary_loss_mlp": 0.01042325, "balance_loss_clip": 1.26922452, "balance_loss_mlp": 1.01935351, "epoch": 0.5121298662257628, "flos": 23302131834240.0, "grad_norm": 1.7765579553688344, "language_loss": 0.74590963, "learning_rate": 2.0184991660777063e-06, "loss": 0.77075756, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.22973633, "step": 8518, "time_per_iteration": 4.281785726547241 }, { "auxiliary_loss_clip": 0.01433619, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.26399124, "balance_loss_mlp": 1.01831722, "epoch": 0.5121899894784308, "flos": 17319917877120.0, "grad_norm": 2.1151943022259494, "language_loss": 0.79394674, "learning_rate": 2.0181097207084625e-06, "loss": 0.81869429, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.22790527, "step": 8519, "time_per_iteration": 4.263570785522461 }, { "auxiliary_loss_clip": 0.01422138, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.25635147, "balance_loss_mlp": 1.01904893, "epoch": 0.5122501127310988, "flos": 24939229616640.0, "grad_norm": 2.5199871791494357, "language_loss": 0.79431629, "learning_rate": 2.017720274652497e-06, "loss": 0.81895936, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.23144531, "step": 8520, "time_per_iteration": 2.945737361907959 }, { "auxiliary_loss_clip": 0.01441449, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.26828754, "balance_loss_mlp": 1.02046704, "epoch": 0.5123102359837667, "flos": 18452199715200.0, "grad_norm": 2.5635350502908736, "language_loss": 0.82248294, "learning_rate": 2.0173308279245765e-06, "loss": 0.84733301, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.23095703, "step": 8521, "time_per_iteration": 2.8297808170318604 }, { "auxiliary_loss_clip": 0.0143158, "auxiliary_loss_mlp": 0.0104057, "balance_loss_clip": 1.26162863, "balance_loss_mlp": 1.01783645, "epoch": 0.5123703592364347, "flos": 26695357764480.0, "grad_norm": 1.7901024270291064, "language_loss": 0.69225901, "learning_rate": 2.0169413805394692e-06, "loss": 0.71698046, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22729492, "step": 8522, "time_per_iteration": 2.899533748626709 }, { "auxiliary_loss_clip": 0.01436355, "auxiliary_loss_mlp": 0.01043377, "balance_loss_clip": 1.26286149, "balance_loss_mlp": 1.02036965, "epoch": 0.5124304824891026, "flos": 28815752949120.0, "grad_norm": 1.6960102091478508, "language_loss": 0.62392962, "learning_rate": 2.0165519325119433e-06, "loss": 0.64872694, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.23010254, "step": 8523, "time_per_iteration": 2.9079203605651855 }, { "auxiliary_loss_clip": 0.01430859, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.26285815, "balance_loss_mlp": 1.01813412, "epoch": 0.5124906057417706, "flos": 21771712586880.0, "grad_norm": 1.9594480063448338, "language_loss": 0.7864846, "learning_rate": 2.0161624838567656e-06, "loss": 0.81118774, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21313477, "step": 8524, "time_per_iteration": 2.825228214263916 }, { "auxiliary_loss_clip": 0.01430325, "auxiliary_loss_mlp": 0.01037112, "balance_loss_clip": 1.26297855, "balance_loss_mlp": 1.01538038, "epoch": 0.5125507289944387, "flos": 18889953586560.0, "grad_norm": 2.0672172255614636, "language_loss": 0.75938892, "learning_rate": 2.015773034588706e-06, "loss": 0.78406322, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.21728516, "step": 8525, "time_per_iteration": 2.830467700958252 }, { "auxiliary_loss_clip": 0.01441143, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.2696327, "balance_loss_mlp": 1.01507747, "epoch": 0.5126108522471066, "flos": 35641685537280.0, "grad_norm": 1.631232313952308, "language_loss": 0.75207621, "learning_rate": 2.015383584722531e-06, "loss": 0.77687454, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.23620605, "step": 8526, "time_per_iteration": 2.95717716217041 }, { "auxiliary_loss_clip": 0.01442986, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.27256632, "balance_loss_mlp": 1.01506054, "epoch": 0.5126709754997746, "flos": 20199867085440.0, "grad_norm": 1.697826780247972, "language_loss": 0.66669095, "learning_rate": 2.0149941342730088e-06, "loss": 0.69150245, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.23083496, "step": 8527, "time_per_iteration": 2.8749608993530273 }, { "auxiliary_loss_clip": 0.01410235, "auxiliary_loss_mlp": 0.01036324, "balance_loss_clip": 1.24942636, "balance_loss_mlp": 1.01441288, "epoch": 0.5127310987524425, "flos": 18597861285120.0, "grad_norm": 1.7530964747077653, "language_loss": 0.75079405, "learning_rate": 2.014604683254908e-06, "loss": 0.77525961, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.21899414, "step": 8528, "time_per_iteration": 2.8794853687286377 }, { "auxiliary_loss_clip": 0.0142414, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 1.25706732, "balance_loss_mlp": 1.0164938, "epoch": 0.5127912220051105, "flos": 22464656985600.0, "grad_norm": 2.527393381917928, "language_loss": 0.83930188, "learning_rate": 2.014215231682995e-06, "loss": 0.86394471, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.2364502, "step": 8529, "time_per_iteration": 2.870337724685669 }, { "auxiliary_loss_clip": 0.01422706, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.25569916, "balance_loss_mlp": 1.01589036, "epoch": 0.5128513452577784, "flos": 19101681843840.0, "grad_norm": 1.9715142998212867, "language_loss": 0.74938774, "learning_rate": 2.01382577957204e-06, "loss": 0.77399611, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.22229004, "step": 8530, "time_per_iteration": 2.8797667026519775 }, { "auxiliary_loss_clip": 0.01241285, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.13450408, "balance_loss_mlp": 1.01168871, "epoch": 0.5129114685104464, "flos": 67926838677120.0, "grad_norm": 0.7437028506774569, "language_loss": 0.60715401, "learning_rate": 2.0134363269368095e-06, "loss": 0.62993741, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.25390625, "step": 8531, "time_per_iteration": 3.459826946258545 }, { "auxiliary_loss_clip": 0.01447501, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.27500677, "balance_loss_mlp": 1.01584864, "epoch": 0.5129715917631144, "flos": 20458903420800.0, "grad_norm": 2.105932784327798, "language_loss": 0.77859092, "learning_rate": 2.0130468737920725e-06, "loss": 0.80345523, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.2310791, "step": 8532, "time_per_iteration": 2.86252498626709 }, { "auxiliary_loss_clip": 0.01437454, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.26752067, "balance_loss_mlp": 1.01149213, "epoch": 0.5130317150157824, "flos": 35129630424960.0, "grad_norm": 1.9813395929652537, "language_loss": 0.67956024, "learning_rate": 2.012657420152597e-06, "loss": 0.70428216, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.23254395, "step": 8533, "time_per_iteration": 2.993361473083496 }, { "auxiliary_loss_clip": 0.01432487, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.26181054, "balance_loss_mlp": 1.01355898, "epoch": 0.5130918382684503, "flos": 19801005759360.0, "grad_norm": 2.7855914618519395, "language_loss": 0.82244754, "learning_rate": 2.01226796603315e-06, "loss": 0.84713674, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22851562, "step": 8534, "time_per_iteration": 2.835568428039551 }, { "auxiliary_loss_clip": 0.01427645, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.25854647, "balance_loss_mlp": 1.01438189, "epoch": 0.5131519615211183, "flos": 26334303108480.0, "grad_norm": 1.4340188796656528, "language_loss": 0.64745963, "learning_rate": 2.0118785114485017e-06, "loss": 0.67211556, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.23547363, "step": 8535, "time_per_iteration": 2.93871808052063 }, { "auxiliary_loss_clip": 0.01430542, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.26302767, "balance_loss_mlp": 1.01335359, "epoch": 0.5132120847737862, "flos": 19181276726400.0, "grad_norm": 1.744597277076536, "language_loss": 0.7013641, "learning_rate": 2.011489056413418e-06, "loss": 0.72603762, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.23461914, "step": 8536, "time_per_iteration": 2.8297345638275146 }, { "auxiliary_loss_clip": 0.01432798, "auxiliary_loss_mlp": 0.0103815, "balance_loss_clip": 1.26017594, "balance_loss_mlp": 1.01509452, "epoch": 0.5132722080264542, "flos": 20240162219520.0, "grad_norm": 1.9744856201709355, "language_loss": 0.71536362, "learning_rate": 2.011099600942669e-06, "loss": 0.74007308, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.23034668, "step": 8537, "time_per_iteration": 2.844421148300171 }, { "auxiliary_loss_clip": 0.01430054, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.2583859, "balance_loss_mlp": 1.01256561, "epoch": 0.5133323312791223, "flos": 16477466100480.0, "grad_norm": 1.8927638436740795, "language_loss": 0.80935514, "learning_rate": 2.0107101450510214e-06, "loss": 0.83401436, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.23303223, "step": 8538, "time_per_iteration": 2.85497784614563 }, { "auxiliary_loss_clip": 0.01425718, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.25717807, "balance_loss_mlp": 1.01400161, "epoch": 0.5133924545317902, "flos": 26079338805120.0, "grad_norm": 1.863756969893624, "language_loss": 0.79242653, "learning_rate": 2.0103206887532437e-06, "loss": 0.81704223, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.21850586, "step": 8539, "time_per_iteration": 2.9098622798919678 }, { "auxiliary_loss_clip": 0.01430138, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.26125407, "balance_loss_mlp": 1.01541781, "epoch": 0.5134525777844582, "flos": 29142575274240.0, "grad_norm": 1.628102056320393, "language_loss": 0.76677316, "learning_rate": 2.009931232064105e-06, "loss": 0.79146481, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.23608398, "step": 8540, "time_per_iteration": 2.9349610805511475 }, { "auxiliary_loss_clip": 0.01450458, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.27618027, "balance_loss_mlp": 1.01701427, "epoch": 0.5135127010371261, "flos": 17463090983040.0, "grad_norm": 1.7502535060232345, "language_loss": 0.75944614, "learning_rate": 2.0095417749983724e-06, "loss": 0.78436321, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.24243164, "step": 8541, "time_per_iteration": 4.290792226791382 }, { "auxiliary_loss_clip": 0.01435181, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.26491868, "balance_loss_mlp": 1.01669991, "epoch": 0.5135728242897941, "flos": 21955090337280.0, "grad_norm": 1.6437737208063912, "language_loss": 0.71258759, "learning_rate": 2.0091523175708162e-06, "loss": 0.73732948, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.22314453, "step": 8542, "time_per_iteration": 2.845881700515747 }, { "auxiliary_loss_clip": 0.01423058, "auxiliary_loss_mlp": 0.01037439, "balance_loss_clip": 1.25334382, "balance_loss_mlp": 1.01306033, "epoch": 0.513632947542462, "flos": 22685207978880.0, "grad_norm": 2.1632320383704773, "language_loss": 0.80495483, "learning_rate": 2.0087628597962023e-06, "loss": 0.8295598, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.24365234, "step": 8543, "time_per_iteration": 2.8472506999969482 }, { "auxiliary_loss_clip": 0.01424077, "auxiliary_loss_mlp": 0.01039495, "balance_loss_clip": 1.2574259, "balance_loss_mlp": 1.01485455, "epoch": 0.51369307079513, "flos": 29468221234560.0, "grad_norm": 3.314611651084389, "language_loss": 0.68673265, "learning_rate": 2.008373401689299e-06, "loss": 0.71136832, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.24645996, "step": 8544, "time_per_iteration": 2.897733211517334 }, { "auxiliary_loss_clip": 0.01438581, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.26556849, "balance_loss_mlp": 1.0156436, "epoch": 0.513753194047798, "flos": 18998622892800.0, "grad_norm": 4.447531597314397, "language_loss": 0.73228896, "learning_rate": 2.0079839432648765e-06, "loss": 0.75705302, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.22192383, "step": 8545, "time_per_iteration": 2.84002423286438 }, { "auxiliary_loss_clip": 0.01431548, "auxiliary_loss_mlp": 0.01046137, "balance_loss_clip": 1.25982928, "balance_loss_mlp": 1.02237821, "epoch": 0.513813317300466, "flos": 17830841869440.0, "grad_norm": 3.3879449544081646, "language_loss": 0.82699788, "learning_rate": 2.0075944845377016e-06, "loss": 0.85177469, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.23779297, "step": 8546, "time_per_iteration": 2.8078904151916504 }, { "auxiliary_loss_clip": 0.01437686, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.26638234, "balance_loss_mlp": 1.01553237, "epoch": 0.5138734405531339, "flos": 24071865937920.0, "grad_norm": 1.670799900315332, "language_loss": 0.74233508, "learning_rate": 2.007205025522544e-06, "loss": 0.76709533, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.22790527, "step": 8547, "time_per_iteration": 2.8861279487609863 }, { "auxiliary_loss_clip": 0.01423114, "auxiliary_loss_mlp": 0.0103982, "balance_loss_clip": 1.25443709, "balance_loss_mlp": 1.01757586, "epoch": 0.5139335638058019, "flos": 26106603436800.0, "grad_norm": 1.5665632817531792, "language_loss": 0.73918307, "learning_rate": 2.0068155662341702e-06, "loss": 0.76381242, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.22253418, "step": 8548, "time_per_iteration": 2.886186122894287 }, { "auxiliary_loss_clip": 0.01433551, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.2636354, "balance_loss_mlp": 1.01658678, "epoch": 0.5139936870584698, "flos": 18926538647040.0, "grad_norm": 3.0572230282780057, "language_loss": 0.83179772, "learning_rate": 2.0064261066873495e-06, "loss": 0.85653394, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.23498535, "step": 8549, "time_per_iteration": 2.811655282974243 }, { "auxiliary_loss_clip": 0.01417894, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.25119901, "balance_loss_mlp": 1.01580656, "epoch": 0.5140538103111378, "flos": 16152679791360.0, "grad_norm": 2.1407275121904794, "language_loss": 0.72996813, "learning_rate": 2.0060366468968504e-06, "loss": 0.75452483, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.21984863, "step": 8550, "time_per_iteration": 2.850909471511841 }, { "auxiliary_loss_clip": 0.01439972, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.26656544, "balance_loss_mlp": 1.01265669, "epoch": 0.5141139335638057, "flos": 22430605633920.0, "grad_norm": 1.5284276993503516, "language_loss": 0.75932497, "learning_rate": 2.0056471868774408e-06, "loss": 0.78407919, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.22766113, "step": 8551, "time_per_iteration": 4.37551474571228 }, { "auxiliary_loss_clip": 0.01421046, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.25617242, "balance_loss_mlp": 1.01531076, "epoch": 0.5141740568164738, "flos": 27101141544960.0, "grad_norm": 1.7124714405554866, "language_loss": 0.69616604, "learning_rate": 2.0052577266438897e-06, "loss": 0.72074318, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.21350098, "step": 8552, "time_per_iteration": 2.902970314025879 }, { "auxiliary_loss_clip": 0.01439824, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.26765919, "balance_loss_mlp": 1.01421094, "epoch": 0.5142341800691418, "flos": 24984139720320.0, "grad_norm": 4.595494278346746, "language_loss": 0.7552011, "learning_rate": 2.004868266210965e-06, "loss": 0.77996445, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.22314453, "step": 8553, "time_per_iteration": 4.314698696136475 }, { "auxiliary_loss_clip": 0.01424721, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 1.25685859, "balance_loss_mlp": 1.01350522, "epoch": 0.5142943033218097, "flos": 20714320172160.0, "grad_norm": 1.5983430933554927, "language_loss": 0.68087047, "learning_rate": 2.004478805593435e-06, "loss": 0.70546424, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.21142578, "step": 8554, "time_per_iteration": 4.346302032470703 }, { "auxiliary_loss_clip": 0.01441748, "auxiliary_loss_mlp": 0.01038366, "balance_loss_clip": 1.26670253, "balance_loss_mlp": 1.01500082, "epoch": 0.5143544265744777, "flos": 22934652416640.0, "grad_norm": 1.7253439549209237, "language_loss": 0.74359274, "learning_rate": 2.004089344806068e-06, "loss": 0.76839387, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.23364258, "step": 8555, "time_per_iteration": 2.8465735912323 }, { "auxiliary_loss_clip": 0.01438673, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.26834881, "balance_loss_mlp": 1.01516485, "epoch": 0.5144145498271456, "flos": 15929052151680.0, "grad_norm": 2.345357059345884, "language_loss": 0.76137418, "learning_rate": 2.003699883863633e-06, "loss": 0.78613937, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.22668457, "step": 8556, "time_per_iteration": 2.8128280639648438 }, { "auxiliary_loss_clip": 0.01414731, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.24845719, "balance_loss_mlp": 1.01507556, "epoch": 0.5144746730798136, "flos": 19690345681920.0, "grad_norm": 2.2969614961983713, "language_loss": 0.87041146, "learning_rate": 2.003310422780898e-06, "loss": 0.89492995, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.22058105, "step": 8557, "time_per_iteration": 2.8290698528289795 }, { "auxiliary_loss_clip": 0.0141025, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.24595571, "balance_loss_mlp": 1.01371241, "epoch": 0.5145347963324816, "flos": 23925208982400.0, "grad_norm": 1.5433023268916481, "language_loss": 0.89690745, "learning_rate": 2.0029209615726307e-06, "loss": 0.92135423, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20727539, "step": 8558, "time_per_iteration": 2.899775743484497 }, { "auxiliary_loss_clip": 0.01411073, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.24776888, "balance_loss_mlp": 1.01429367, "epoch": 0.5145949195851496, "flos": 18269183923200.0, "grad_norm": 1.8422062224290896, "language_loss": 0.66216087, "learning_rate": 2.002531500253602e-06, "loss": 0.68664616, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.23156738, "step": 8559, "time_per_iteration": 2.8489816188812256 }, { "auxiliary_loss_clip": 0.01419818, "auxiliary_loss_mlp": 0.01034688, "balance_loss_clip": 1.253057, "balance_loss_mlp": 1.01334965, "epoch": 0.5146550428378175, "flos": 26224547927040.0, "grad_norm": 1.6802778885134806, "language_loss": 0.63894749, "learning_rate": 2.002142038838577e-06, "loss": 0.6634925, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21337891, "step": 8560, "time_per_iteration": 2.873159408569336 }, { "auxiliary_loss_clip": 0.01427496, "auxiliary_loss_mlp": 0.01033966, "balance_loss_clip": 1.25955343, "balance_loss_mlp": 1.0123055, "epoch": 0.5147151660904855, "flos": 22684348327680.0, "grad_norm": 1.5622693371043959, "language_loss": 0.70944893, "learning_rate": 2.0017525773423265e-06, "loss": 0.73406351, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.2166748, "step": 8561, "time_per_iteration": 2.881477117538452 }, { "auxiliary_loss_clip": 0.01426005, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.25761962, "balance_loss_mlp": 1.01750636, "epoch": 0.5147752893431534, "flos": 24983053845120.0, "grad_norm": 1.5017496340486944, "language_loss": 0.67342985, "learning_rate": 2.0013631157796177e-06, "loss": 0.69807684, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21179199, "step": 8562, "time_per_iteration": 2.8703243732452393 }, { "auxiliary_loss_clip": 0.01439901, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.26994205, "balance_loss_mlp": 1.01621723, "epoch": 0.5148354125958214, "flos": 22754215578240.0, "grad_norm": 3.7265979325736773, "language_loss": 0.78623223, "learning_rate": 2.0009736541652188e-06, "loss": 0.81100816, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.21472168, "step": 8563, "time_per_iteration": 2.912950277328491 }, { "auxiliary_loss_clip": 0.01441795, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.26799798, "balance_loss_mlp": 1.01659048, "epoch": 0.5148955358484893, "flos": 23077282584960.0, "grad_norm": 1.9758184765681857, "language_loss": 0.84045267, "learning_rate": 2.0005841925139e-06, "loss": 0.86527401, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.23754883, "step": 8564, "time_per_iteration": 2.8751637935638428 }, { "auxiliary_loss_clip": 0.01458272, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.28665781, "balance_loss_mlp": 1.01348543, "epoch": 0.5149556591011574, "flos": 20349962645760.0, "grad_norm": 1.8730441111013136, "language_loss": 0.74011374, "learning_rate": 2.0001947308404283e-06, "loss": 0.76504707, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21557617, "step": 8565, "time_per_iteration": 2.876016616821289 }, { "auxiliary_loss_clip": 0.01452457, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.2778337, "balance_loss_mlp": 1.01245058, "epoch": 0.5150157823538254, "flos": 22648532428800.0, "grad_norm": 2.00728745813461, "language_loss": 0.69277853, "learning_rate": 1.9998052691595715e-06, "loss": 0.71764696, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.21923828, "step": 8566, "time_per_iteration": 2.884556293487549 }, { "auxiliary_loss_clip": 0.0145258, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.27557302, "balance_loss_mlp": 1.01485455, "epoch": 0.5150759056064933, "flos": 26078750622720.0, "grad_norm": 1.8283237830297245, "language_loss": 0.78864336, "learning_rate": 1.9994158074861005e-06, "loss": 0.81353283, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.21520996, "step": 8567, "time_per_iteration": 2.874546527862549 }, { "auxiliary_loss_clip": 0.01447375, "auxiliary_loss_mlp": 0.01043144, "balance_loss_clip": 1.27551389, "balance_loss_mlp": 1.02114987, "epoch": 0.5151360288591613, "flos": 25962299210880.0, "grad_norm": 2.7175653700919846, "language_loss": 0.79828227, "learning_rate": 1.9990263458347806e-06, "loss": 0.82318741, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.2199707, "step": 8568, "time_per_iteration": 2.9142093658447266 }, { "auxiliary_loss_clip": 0.0141924, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.25256538, "balance_loss_mlp": 1.0118711, "epoch": 0.5151961521118292, "flos": 18514375349760.0, "grad_norm": 2.0849067280410742, "language_loss": 0.91622198, "learning_rate": 1.9986368842203825e-06, "loss": 0.94075572, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.22265625, "step": 8569, "time_per_iteration": 2.8379158973693848 }, { "auxiliary_loss_clip": 0.01446977, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.2746067, "balance_loss_mlp": 1.01818562, "epoch": 0.5152562753644973, "flos": 22243065361920.0, "grad_norm": 1.6608159036229495, "language_loss": 0.77047825, "learning_rate": 1.998247422657674e-06, "loss": 0.79534197, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.21203613, "step": 8570, "time_per_iteration": 2.9448909759521484 }, { "auxiliary_loss_clip": 0.01438893, "auxiliary_loss_mlp": 0.0104721, "balance_loss_clip": 1.26856351, "balance_loss_mlp": 1.02357078, "epoch": 0.5153163986171652, "flos": 38450953088640.0, "grad_norm": 1.5395901724754373, "language_loss": 0.74449921, "learning_rate": 1.9978579611614227e-06, "loss": 0.76936024, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.23657227, "step": 8571, "time_per_iteration": 2.9909887313842773 }, { "auxiliary_loss_clip": 0.0123378, "auxiliary_loss_mlp": 0.01043643, "balance_loss_clip": 1.12884378, "balance_loss_mlp": 1.01503265, "epoch": 0.5153765218698332, "flos": 66415585242240.0, "grad_norm": 0.7768200589214824, "language_loss": 0.52912205, "learning_rate": 1.9974684997463984e-06, "loss": 0.55189627, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.28515625, "step": 8572, "time_per_iteration": 3.441209316253662 }, { "auxiliary_loss_clip": 0.01425195, "auxiliary_loss_mlp": 0.01037776, "balance_loss_clip": 1.26077175, "balance_loss_mlp": 1.01587725, "epoch": 0.5154366451225011, "flos": 24035326122240.0, "grad_norm": 1.9383061055277306, "language_loss": 0.7759099, "learning_rate": 1.9970790384273687e-06, "loss": 0.80053955, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.21899414, "step": 8573, "time_per_iteration": 2.9549922943115234 }, { "auxiliary_loss_clip": 0.01427681, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.26064324, "balance_loss_mlp": 1.01437783, "epoch": 0.5154967683751691, "flos": 23477139296640.0, "grad_norm": 1.8522111306105504, "language_loss": 0.78557229, "learning_rate": 1.996689577219102e-06, "loss": 0.8102212, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.22814941, "step": 8574, "time_per_iteration": 2.8661134243011475 }, { "auxiliary_loss_clip": 0.01410022, "auxiliary_loss_mlp": 0.01038461, "balance_loss_clip": 1.24393344, "balance_loss_mlp": 1.01738441, "epoch": 0.515556891627837, "flos": 23816222962560.0, "grad_norm": 1.8156620061907192, "language_loss": 0.85733879, "learning_rate": 1.996300116136367e-06, "loss": 0.88182366, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21081543, "step": 8575, "time_per_iteration": 2.890826940536499 }, { "auxiliary_loss_clip": 0.01432172, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.26232529, "balance_loss_mlp": 1.01033092, "epoch": 0.515617014880505, "flos": 19838043267840.0, "grad_norm": 1.6311877269754578, "language_loss": 0.77425218, "learning_rate": 1.995910655193932e-06, "loss": 0.79889286, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21569824, "step": 8576, "time_per_iteration": 4.354767799377441 }, { "auxiliary_loss_clip": 0.01458829, "auxiliary_loss_mlp": 0.01038065, "balance_loss_clip": 1.28054905, "balance_loss_mlp": 1.01516545, "epoch": 0.515677138133173, "flos": 14253876230400.0, "grad_norm": 2.6929223152262747, "language_loss": 0.77100194, "learning_rate": 1.9955211944065654e-06, "loss": 0.79597086, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.22888184, "step": 8577, "time_per_iteration": 2.8082566261291504 }, { "auxiliary_loss_clip": 0.01437301, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.26562965, "balance_loss_mlp": 1.01387382, "epoch": 0.515737261385841, "flos": 28300485456000.0, "grad_norm": 1.648461017084631, "language_loss": 0.81525743, "learning_rate": 1.9951317337890353e-06, "loss": 0.84001166, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.24230957, "step": 8578, "time_per_iteration": 2.939502477645874 }, { "auxiliary_loss_clip": 0.0141783, "auxiliary_loss_mlp": 0.01035249, "balance_loss_clip": 1.24995995, "balance_loss_mlp": 1.01261127, "epoch": 0.515797384638509, "flos": 27903524411520.0, "grad_norm": 1.701713119170849, "language_loss": 0.76608086, "learning_rate": 1.9947422733561105e-06, "loss": 0.79061157, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.22644043, "step": 8579, "time_per_iteration": 2.930098056793213 }, { "auxiliary_loss_clip": 0.01426267, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.25619125, "balance_loss_mlp": 1.01394713, "epoch": 0.5158575078911769, "flos": 23050606135680.0, "grad_norm": 1.6834395028611489, "language_loss": 0.80090529, "learning_rate": 1.994352813122559e-06, "loss": 0.8255257, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.21850586, "step": 8580, "time_per_iteration": 2.8765671253204346 }, { "auxiliary_loss_clip": 0.01440472, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.26780593, "balance_loss_mlp": 1.01370835, "epoch": 0.5159176311438449, "flos": 12649110497280.0, "grad_norm": 2.102008924324362, "language_loss": 0.73541576, "learning_rate": 1.99396335310315e-06, "loss": 0.76019132, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.23376465, "step": 8581, "time_per_iteration": 2.8574702739715576 }, { "auxiliary_loss_clip": 0.01421442, "auxiliary_loss_mlp": 0.01037288, "balance_loss_clip": 1.25513172, "balance_loss_mlp": 1.01546085, "epoch": 0.5159777543965128, "flos": 15566142458880.0, "grad_norm": 2.404181812462474, "language_loss": 0.7515825, "learning_rate": 1.9935738933126508e-06, "loss": 0.77616978, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21813965, "step": 8582, "time_per_iteration": 2.814892292022705 }, { "auxiliary_loss_clip": 0.01425901, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.2589438, "balance_loss_mlp": 1.01245379, "epoch": 0.5160378776491809, "flos": 23232038359680.0, "grad_norm": 1.9164216890935626, "language_loss": 0.669595, "learning_rate": 1.99318443376583e-06, "loss": 0.69419497, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.21643066, "step": 8583, "time_per_iteration": 2.8361470699310303 }, { "auxiliary_loss_clip": 0.01437522, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.26871347, "balance_loss_mlp": 1.01928449, "epoch": 0.5160980009018488, "flos": 21954637889280.0, "grad_norm": 1.4106971058169335, "language_loss": 0.76505542, "learning_rate": 1.9927949744774568e-06, "loss": 0.78985053, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.22717285, "step": 8584, "time_per_iteration": 2.8420839309692383 }, { "auxiliary_loss_clip": 0.01438041, "auxiliary_loss_mlp": 0.01042835, "balance_loss_clip": 1.26649523, "balance_loss_mlp": 1.01999414, "epoch": 0.5161581241545168, "flos": 22794374977920.0, "grad_norm": 2.134842685029386, "language_loss": 0.79876035, "learning_rate": 1.9924055154622983e-06, "loss": 0.82356906, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22851562, "step": 8585, "time_per_iteration": 2.868936061859131 }, { "auxiliary_loss_clip": 0.01409995, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.24757004, "balance_loss_mlp": 1.01240647, "epoch": 0.5162182474071847, "flos": 19682970779520.0, "grad_norm": 3.7350805582910276, "language_loss": 0.80722094, "learning_rate": 1.9920160567351238e-06, "loss": 0.83165586, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.21081543, "step": 8586, "time_per_iteration": 4.289619445800781 }, { "auxiliary_loss_clip": 0.01418492, "auxiliary_loss_mlp": 0.01039658, "balance_loss_clip": 1.25177002, "balance_loss_mlp": 1.01722252, "epoch": 0.5162783706598527, "flos": 20055427125120.0, "grad_norm": 2.174855892957231, "language_loss": 0.73329449, "learning_rate": 1.991626598310701e-06, "loss": 0.75787604, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.2244873, "step": 8587, "time_per_iteration": 2.860093116760254 }, { "auxiliary_loss_clip": 0.01247973, "auxiliary_loss_mlp": 0.01020779, "balance_loss_clip": 1.14338362, "balance_loss_mlp": 0.996746, "epoch": 0.5163384939125206, "flos": 69991419761280.0, "grad_norm": 0.7292709598892758, "language_loss": 0.57904816, "learning_rate": 1.9912371402037984e-06, "loss": 0.60173559, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.24023438, "step": 8588, "time_per_iteration": 6.288766384124756 }, { "auxiliary_loss_clip": 0.01428929, "auxiliary_loss_mlp": 0.01037983, "balance_loss_clip": 1.25837266, "balance_loss_mlp": 1.01479697, "epoch": 0.5163986171651886, "flos": 17425012844160.0, "grad_norm": 2.09518075568316, "language_loss": 0.76002419, "learning_rate": 1.990847682429185e-06, "loss": 0.78469324, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.23181152, "step": 8589, "time_per_iteration": 2.8317511081695557 }, { "auxiliary_loss_clip": 0.01432846, "auxiliary_loss_mlp": 0.0103696, "balance_loss_clip": 1.26329958, "balance_loss_mlp": 1.01584768, "epoch": 0.5164587404178566, "flos": 21332103678720.0, "grad_norm": 1.6475224654886933, "language_loss": 0.67909479, "learning_rate": 1.990458225001627e-06, "loss": 0.70379281, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.21105957, "step": 8590, "time_per_iteration": 2.8623545169830322 }, { "auxiliary_loss_clip": 0.01244416, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.14062238, "balance_loss_mlp": 1.00330377, "epoch": 0.5165188636705246, "flos": 68087186824320.0, "grad_norm": 0.785103453897445, "language_loss": 0.55863392, "learning_rate": 1.990068767935895e-06, "loss": 0.58137429, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 1.0390625, "router_z_loss_mlp": 0.26367188, "step": 8591, "time_per_iteration": 3.2376537322998047 }, { "auxiliary_loss_clip": 0.0139777, "auxiliary_loss_mlp": 0.010379, "balance_loss_clip": 1.23838603, "balance_loss_mlp": 1.0156436, "epoch": 0.5165789869231926, "flos": 19393728900480.0, "grad_norm": 1.5857166825992317, "language_loss": 0.82411504, "learning_rate": 1.9896793112467566e-06, "loss": 0.84847176, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.22265625, "step": 8592, "time_per_iteration": 2.900273323059082 }, { "auxiliary_loss_clip": 0.01415103, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.25099492, "balance_loss_mlp": 1.00844419, "epoch": 0.5166391101758605, "flos": 20970144126720.0, "grad_norm": 2.9296953171523588, "language_loss": 0.83971125, "learning_rate": 1.989289854948979e-06, "loss": 0.86416638, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.21984863, "step": 8593, "time_per_iteration": 2.9012959003448486 }, { "auxiliary_loss_clip": 0.01414455, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.24807239, "balance_loss_mlp": 1.0155592, "epoch": 0.5166992334285285, "flos": 29474826975360.0, "grad_norm": 2.117285919252505, "language_loss": 0.70229453, "learning_rate": 1.9889003990573314e-06, "loss": 0.72682029, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.22570801, "step": 8594, "time_per_iteration": 2.924818277359009 }, { "auxiliary_loss_clip": 0.01417343, "auxiliary_loss_mlp": 0.01036891, "balance_loss_clip": 1.25171018, "balance_loss_mlp": 1.01462281, "epoch": 0.5167593566811964, "flos": 20314372970880.0, "grad_norm": 1.440637379396733, "language_loss": 0.78186822, "learning_rate": 1.988510943586582e-06, "loss": 0.80641067, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.22253418, "step": 8595, "time_per_iteration": 2.895172357559204 }, { "auxiliary_loss_clip": 0.01416238, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.25012255, "balance_loss_mlp": 1.01400399, "epoch": 0.5168194799338645, "flos": 14619591100800.0, "grad_norm": 1.6963020197701275, "language_loss": 0.65837669, "learning_rate": 1.9881214885514986e-06, "loss": 0.68289393, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21484375, "step": 8596, "time_per_iteration": 2.834728240966797 }, { "auxiliary_loss_clip": 0.01427272, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.26088214, "balance_loss_mlp": 1.01647401, "epoch": 0.5168796031865324, "flos": 25017512400000.0, "grad_norm": 1.6060162711811252, "language_loss": 0.76429975, "learning_rate": 1.9877320339668492e-06, "loss": 0.78896546, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.22827148, "step": 8597, "time_per_iteration": 2.9289236068725586 }, { "auxiliary_loss_clip": 0.014271, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.25964975, "balance_loss_mlp": 1.01493001, "epoch": 0.5169397264392004, "flos": 26950457802240.0, "grad_norm": 1.7877374412665, "language_loss": 0.82113898, "learning_rate": 1.987342579847403e-06, "loss": 0.8457824, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.2232666, "step": 8598, "time_per_iteration": 2.9432528018951416 }, { "auxiliary_loss_clip": 0.01412665, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.246418, "balance_loss_mlp": 1.0125227, "epoch": 0.5169998496918683, "flos": 25417957294080.0, "grad_norm": 1.664915774289835, "language_loss": 0.76155764, "learning_rate": 1.9869531262079273e-06, "loss": 0.78604364, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.23413086, "step": 8599, "time_per_iteration": 2.8695733547210693 }, { "auxiliary_loss_clip": 0.01418366, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.25428271, "balance_loss_mlp": 1.01289809, "epoch": 0.5170599729445363, "flos": 24691459236480.0, "grad_norm": 2.15065901035362, "language_loss": 0.73668087, "learning_rate": 1.9865636730631904e-06, "loss": 0.76120991, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.21618652, "step": 8600, "time_per_iteration": 2.9076287746429443 }, { "auxiliary_loss_clip": 0.01420353, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.25509191, "balance_loss_mlp": 1.01604986, "epoch": 0.5171200961972042, "flos": 21003833520000.0, "grad_norm": 1.578070606654716, "language_loss": 0.7529161, "learning_rate": 1.9861742204279602e-06, "loss": 0.77750278, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.22265625, "step": 8601, "time_per_iteration": 2.833125114440918 }, { "auxiliary_loss_clip": 0.01419744, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.25246656, "balance_loss_mlp": 1.01714647, "epoch": 0.5171802194498722, "flos": 22755437187840.0, "grad_norm": 3.1491846710680726, "language_loss": 0.85497224, "learning_rate": 1.9857847683170045e-06, "loss": 0.87957191, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.23071289, "step": 8602, "time_per_iteration": 2.8346335887908936 }, { "auxiliary_loss_clip": 0.01417785, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.25068688, "balance_loss_mlp": 1.01458073, "epoch": 0.5172403427025402, "flos": 28186567752960.0, "grad_norm": 1.738717168138201, "language_loss": 0.75564915, "learning_rate": 1.9853953167450926e-06, "loss": 0.78019857, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.22570801, "step": 8603, "time_per_iteration": 2.8779842853546143 }, { "auxiliary_loss_clip": 0.01433109, "auxiliary_loss_mlp": 0.01041485, "balance_loss_clip": 1.26472139, "balance_loss_mlp": 1.01994419, "epoch": 0.5173004659552082, "flos": 20347066978560.0, "grad_norm": 2.5264268562428174, "language_loss": 0.74053353, "learning_rate": 1.9850058657269915e-06, "loss": 0.76527953, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21533203, "step": 8604, "time_per_iteration": 2.8286824226379395 }, { "auxiliary_loss_clip": 0.01444193, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 1.26743686, "balance_loss_mlp": 1.01108241, "epoch": 0.5173605892078762, "flos": 19072878888960.0, "grad_norm": 2.363241814024563, "language_loss": 0.86125481, "learning_rate": 1.984616415277469e-06, "loss": 0.88604689, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 1.76855469, "router_z_loss_mlp": 0.23937988, "step": 8605, "time_per_iteration": 2.855822801589966 }, { "auxiliary_loss_clip": 0.01416442, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.24975896, "balance_loss_mlp": 1.01285267, "epoch": 0.5174207124605441, "flos": 28005180773760.0, "grad_norm": 1.517408712855388, "language_loss": 0.65413946, "learning_rate": 1.984226965411294e-06, "loss": 0.67865592, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.22351074, "step": 8606, "time_per_iteration": 3.0367088317871094 }, { "auxiliary_loss_clip": 0.01426361, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.2607795, "balance_loss_mlp": 1.01500952, "epoch": 0.5174808357132121, "flos": 19505158139520.0, "grad_norm": 1.4474617254817248, "language_loss": 0.78195989, "learning_rate": 1.983837516143234e-06, "loss": 0.80659676, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.2232666, "step": 8607, "time_per_iteration": 2.9227051734924316 }, { "auxiliary_loss_clip": 0.01423007, "auxiliary_loss_mlp": 0.01045163, "balance_loss_clip": 1.25592279, "balance_loss_mlp": 1.02165449, "epoch": 0.51754095896588, "flos": 22794510712320.0, "grad_norm": 1.600782028660385, "language_loss": 0.72311389, "learning_rate": 1.983448067488057e-06, "loss": 0.74779564, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.23522949, "step": 8608, "time_per_iteration": 3.0031864643096924 }, { "auxiliary_loss_clip": 0.01434514, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.26273942, "balance_loss_mlp": 1.01460147, "epoch": 0.5176010822185481, "flos": 22678149790080.0, "grad_norm": 1.9216289840562966, "language_loss": 0.87841308, "learning_rate": 1.983058619460531e-06, "loss": 0.90313977, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.23583984, "step": 8609, "time_per_iteration": 2.889958143234253 }, { "auxiliary_loss_clip": 0.01416238, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.24808836, "balance_loss_mlp": 1.01487529, "epoch": 0.517661205471216, "flos": 23961703553280.0, "grad_norm": 1.6186988628422483, "language_loss": 0.74295551, "learning_rate": 1.9826691720754237e-06, "loss": 0.76749045, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.22375488, "step": 8610, "time_per_iteration": 2.87986421585083 }, { "auxiliary_loss_clip": 0.01434876, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.26209044, "balance_loss_mlp": 1.01294446, "epoch": 0.517721328723884, "flos": 15604446821760.0, "grad_norm": 2.9072636268469267, "language_loss": 0.68749034, "learning_rate": 1.9822797253475034e-06, "loss": 0.71219039, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22192383, "step": 8611, "time_per_iteration": 4.315730571746826 }, { "auxiliary_loss_clip": 0.01431761, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.26297545, "balance_loss_mlp": 1.01489592, "epoch": 0.5177814519765519, "flos": 20970144126720.0, "grad_norm": 2.1567603449227857, "language_loss": 0.78425252, "learning_rate": 1.9818902792915373e-06, "loss": 0.80895329, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.23425293, "step": 8612, "time_per_iteration": 2.8778679370880127 }, { "auxiliary_loss_clip": 0.01424279, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.25432992, "balance_loss_mlp": 1.01483023, "epoch": 0.5178415752292199, "flos": 17976865397760.0, "grad_norm": 1.935708719116835, "language_loss": 0.82669294, "learning_rate": 1.981500833922294e-06, "loss": 0.85131156, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.22753906, "step": 8613, "time_per_iteration": 2.8252224922180176 }, { "auxiliary_loss_clip": 0.01437811, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.26670516, "balance_loss_mlp": 1.0154022, "epoch": 0.5179016984818878, "flos": 17830389421440.0, "grad_norm": 2.302460056280268, "language_loss": 0.67783988, "learning_rate": 1.981111389254541e-06, "loss": 0.70260751, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.2355957, "step": 8614, "time_per_iteration": 2.7872633934020996 }, { "auxiliary_loss_clip": 0.0143034, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.26096296, "balance_loss_mlp": 1.0132376, "epoch": 0.5179618217345558, "flos": 17829077322240.0, "grad_norm": 1.9150861113586388, "language_loss": 0.87210906, "learning_rate": 1.9807219453030453e-06, "loss": 0.89677346, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.2286377, "step": 8615, "time_per_iteration": 2.8271243572235107 }, { "auxiliary_loss_clip": 0.01423481, "auxiliary_loss_mlp": 0.01043619, "balance_loss_clip": 1.25796497, "balance_loss_mlp": 1.02044463, "epoch": 0.5180219449872238, "flos": 22530678428160.0, "grad_norm": 1.8367271041958324, "language_loss": 0.81343091, "learning_rate": 1.9803325020825763e-06, "loss": 0.83810198, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.23181152, "step": 8616, "time_per_iteration": 2.945751905441284 }, { "auxiliary_loss_clip": 0.01445834, "auxiliary_loss_mlp": 0.01043678, "balance_loss_clip": 1.27351022, "balance_loss_mlp": 1.01932359, "epoch": 0.5180820682398918, "flos": 23926159123200.0, "grad_norm": 1.9464631489514959, "language_loss": 0.76093554, "learning_rate": 1.9799430596079e-06, "loss": 0.78583062, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.24353027, "step": 8617, "time_per_iteration": 2.906691789627075 }, { "auxiliary_loss_clip": 0.01439866, "auxiliary_loss_mlp": 0.01046758, "balance_loss_clip": 1.26937747, "balance_loss_mlp": 1.02432275, "epoch": 0.5181421914925598, "flos": 16987982889600.0, "grad_norm": 2.308076907681896, "language_loss": 0.70751309, "learning_rate": 1.979553617893785e-06, "loss": 0.73237932, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.22436523, "step": 8618, "time_per_iteration": 2.8437342643737793 }, { "auxiliary_loss_clip": 0.0124703, "auxiliary_loss_mlp": 0.01047363, "balance_loss_clip": 1.14251626, "balance_loss_mlp": 1.02313924, "epoch": 0.5182023147452277, "flos": 66091839563520.0, "grad_norm": 0.952369521090946, "language_loss": 0.67315549, "learning_rate": 1.979164176954999e-06, "loss": 0.6960994, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 0.2421875, "step": 8619, "time_per_iteration": 3.311448097229004 }, { "auxiliary_loss_clip": 0.01408471, "auxiliary_loss_mlp": 0.01036041, "balance_loss_clip": 1.24539638, "balance_loss_mlp": 1.01409423, "epoch": 0.5182624379978957, "flos": 18197099677440.0, "grad_norm": 2.0673089570892196, "language_loss": 0.81179458, "learning_rate": 1.97877473680631e-06, "loss": 0.8362397, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.21923828, "step": 8620, "time_per_iteration": 2.8482272624969482 }, { "auxiliary_loss_clip": 0.01415135, "auxiliary_loss_mlp": 0.01040156, "balance_loss_clip": 1.2509892, "balance_loss_mlp": 1.01779246, "epoch": 0.5183225612505636, "flos": 14034546846720.0, "grad_norm": 2.044219244362442, "language_loss": 0.82607317, "learning_rate": 1.9783852974624846e-06, "loss": 0.85062611, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.22351074, "step": 8621, "time_per_iteration": 4.255959510803223 }, { "auxiliary_loss_clip": 0.01423246, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.25564349, "balance_loss_mlp": 1.01688886, "epoch": 0.5183826845032317, "flos": 23670335168640.0, "grad_norm": 2.3985188505698276, "language_loss": 0.65926021, "learning_rate": 1.9779958589382905e-06, "loss": 0.68388855, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.22705078, "step": 8622, "time_per_iteration": 2.8502089977264404 }, { "auxiliary_loss_clip": 0.01445751, "auxiliary_loss_mlp": 0.01041261, "balance_loss_clip": 1.27125084, "balance_loss_mlp": 1.01750267, "epoch": 0.5184428077558996, "flos": 15897217795200.0, "grad_norm": 3.6521865258497037, "language_loss": 0.62200838, "learning_rate": 1.977606421248497e-06, "loss": 0.64687854, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.23754883, "step": 8623, "time_per_iteration": 4.256067752838135 }, { "auxiliary_loss_clip": 0.01412287, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.24631131, "balance_loss_mlp": 1.0129807, "epoch": 0.5185029310085676, "flos": 21040192356480.0, "grad_norm": 1.6288499219975408, "language_loss": 0.76599979, "learning_rate": 1.9772169844078685e-06, "loss": 0.79048008, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.22741699, "step": 8624, "time_per_iteration": 4.234911918640137 }, { "auxiliary_loss_clip": 0.01430086, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.26001859, "balance_loss_mlp": 1.01442814, "epoch": 0.5185630542612355, "flos": 26554130184960.0, "grad_norm": 1.7715748114051129, "language_loss": 0.72559273, "learning_rate": 1.9768275484311756e-06, "loss": 0.75026286, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22497559, "step": 8625, "time_per_iteration": 2.8537485599517822 }, { "auxiliary_loss_clip": 0.0141565, "auxiliary_loss_mlp": 0.01036178, "balance_loss_clip": 1.24917889, "balance_loss_mlp": 1.01430273, "epoch": 0.5186231775139035, "flos": 20678323294080.0, "grad_norm": 2.070492977160189, "language_loss": 0.68390667, "learning_rate": 1.976438113333184e-06, "loss": 0.70842499, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21875, "step": 8626, "time_per_iteration": 2.8241803646087646 }, { "auxiliary_loss_clip": 0.01416645, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.24934769, "balance_loss_mlp": 1.01215959, "epoch": 0.5186833007665714, "flos": 20894983234560.0, "grad_norm": 2.459023087576182, "language_loss": 0.71284986, "learning_rate": 1.9760486791286612e-06, "loss": 0.7373625, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.2244873, "step": 8627, "time_per_iteration": 2.8440866470336914 }, { "auxiliary_loss_clip": 0.01435089, "auxiliary_loss_mlp": 0.01038738, "balance_loss_clip": 1.26317787, "balance_loss_mlp": 1.01529002, "epoch": 0.5187434240192395, "flos": 20896747781760.0, "grad_norm": 1.964780179890976, "language_loss": 0.73704928, "learning_rate": 1.9756592458323753e-06, "loss": 0.76178753, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.234375, "step": 8628, "time_per_iteration": 2.8834712505340576 }, { "auxiliary_loss_clip": 0.01418171, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.25150752, "balance_loss_mlp": 1.01451254, "epoch": 0.5188035472719074, "flos": 19868113077120.0, "grad_norm": 1.6678636127626478, "language_loss": 0.78088641, "learning_rate": 1.9752698134590927e-06, "loss": 0.80543423, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.22094727, "step": 8629, "time_per_iteration": 2.859098196029663 }, { "auxiliary_loss_clip": 0.01445015, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.27343953, "balance_loss_mlp": 1.01459479, "epoch": 0.5188636705245754, "flos": 21147187605120.0, "grad_norm": 2.578989712161607, "language_loss": 0.75542837, "learning_rate": 1.9748803820235815e-06, "loss": 0.7802636, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.23913574, "step": 8630, "time_per_iteration": 2.8534443378448486 }, { "auxiliary_loss_clip": 0.01432731, "auxiliary_loss_mlp": 0.01038675, "balance_loss_clip": 1.26197779, "balance_loss_mlp": 1.01590621, "epoch": 0.5189237937772434, "flos": 22430153185920.0, "grad_norm": 1.8998323648804376, "language_loss": 0.8172704, "learning_rate": 1.9744909515406093e-06, "loss": 0.84198451, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.2277832, "step": 8631, "time_per_iteration": 2.8692946434020996 }, { "auxiliary_loss_clip": 0.01436539, "auxiliary_loss_mlp": 0.01035824, "balance_loss_clip": 1.26600838, "balance_loss_mlp": 1.01319778, "epoch": 0.5189839170299113, "flos": 25457483266560.0, "grad_norm": 1.4494253262754708, "language_loss": 0.7549181, "learning_rate": 1.974101522024942e-06, "loss": 0.77964175, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.22644043, "step": 8632, "time_per_iteration": 2.8876359462738037 }, { "auxiliary_loss_clip": 0.01408685, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.24499559, "balance_loss_mlp": 1.01228237, "epoch": 0.5190440402825793, "flos": 18596820654720.0, "grad_norm": 1.860212879089833, "language_loss": 0.79257828, "learning_rate": 1.9737120934913477e-06, "loss": 0.81701255, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.22473145, "step": 8633, "time_per_iteration": 2.863926887512207 }, { "auxiliary_loss_clip": 0.01423013, "auxiliary_loss_mlp": 0.01040334, "balance_loss_clip": 1.25457001, "balance_loss_mlp": 1.01652837, "epoch": 0.5191041635352472, "flos": 21918776745600.0, "grad_norm": 1.801417209724989, "language_loss": 0.8129167, "learning_rate": 1.9733226659545936e-06, "loss": 0.83755016, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.23828125, "step": 8634, "time_per_iteration": 2.8558287620544434 }, { "auxiliary_loss_clip": 0.01421677, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.25525999, "balance_loss_mlp": 1.01504183, "epoch": 0.5191642867879153, "flos": 27539709822720.0, "grad_norm": 1.586951144675779, "language_loss": 0.69693935, "learning_rate": 1.9729332394294467e-06, "loss": 0.72153318, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.22668457, "step": 8635, "time_per_iteration": 2.917783737182617 }, { "auxiliary_loss_clip": 0.0143877, "auxiliary_loss_mlp": 0.01037891, "balance_loss_clip": 1.26708198, "balance_loss_mlp": 1.01545572, "epoch": 0.5192244100405832, "flos": 15714518716800.0, "grad_norm": 1.6239807803248498, "language_loss": 0.78356314, "learning_rate": 1.9725438139306742e-06, "loss": 0.8083297, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.22436523, "step": 8636, "time_per_iteration": 2.8527562618255615 }, { "auxiliary_loss_clip": 0.01441578, "auxiliary_loss_mlp": 0.01035752, "balance_loss_clip": 1.27007627, "balance_loss_mlp": 1.0130899, "epoch": 0.5192845332932512, "flos": 12064563936000.0, "grad_norm": 3.1978475476058317, "language_loss": 0.72664297, "learning_rate": 1.9721543894730425e-06, "loss": 0.75141627, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.22668457, "step": 8637, "time_per_iteration": 2.8194668292999268 }, { "auxiliary_loss_clip": 0.01410884, "auxiliary_loss_mlp": 0.0104082, "balance_loss_clip": 1.2463001, "balance_loss_mlp": 1.01817024, "epoch": 0.5193446565459191, "flos": 18962535525120.0, "grad_norm": 3.575424544539419, "language_loss": 0.77004516, "learning_rate": 1.9717649660713194e-06, "loss": 0.79456216, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.22631836, "step": 8638, "time_per_iteration": 2.8709611892700195 }, { "auxiliary_loss_clip": 0.01428671, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.26160908, "balance_loss_mlp": 1.0115087, "epoch": 0.5194047797985871, "flos": 20384375955840.0, "grad_norm": 1.8240228900880884, "language_loss": 0.75420022, "learning_rate": 1.971375543740272e-06, "loss": 0.77882242, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.22033691, "step": 8639, "time_per_iteration": 2.837958574295044 }, { "auxiliary_loss_clip": 0.01420217, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.25331783, "balance_loss_mlp": 1.01157498, "epoch": 0.519464903051255, "flos": 24363732015360.0, "grad_norm": 1.677727294163514, "language_loss": 0.78503776, "learning_rate": 1.9709861224946665e-06, "loss": 0.80957991, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.22399902, "step": 8640, "time_per_iteration": 2.8946542739868164 }, { "auxiliary_loss_clip": 0.01417036, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.25294018, "balance_loss_mlp": 1.01542592, "epoch": 0.519525026303923, "flos": 14069322115200.0, "grad_norm": 2.0968399967940075, "language_loss": 0.66649711, "learning_rate": 1.97059670234927e-06, "loss": 0.69104868, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.22705078, "step": 8641, "time_per_iteration": 2.8317363262176514 }, { "auxiliary_loss_clip": 0.014226, "auxiliary_loss_mlp": 0.01032508, "balance_loss_clip": 1.25675392, "balance_loss_mlp": 1.01019216, "epoch": 0.519585149556591, "flos": 28846501430400.0, "grad_norm": 1.9547504086393888, "language_loss": 0.7705906, "learning_rate": 1.97020728331885e-06, "loss": 0.7951417, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.22314453, "step": 8642, "time_per_iteration": 2.8858158588409424 }, { "auxiliary_loss_clip": 0.01422671, "auxiliary_loss_mlp": 0.01033622, "balance_loss_clip": 1.25625217, "balance_loss_mlp": 1.01124609, "epoch": 0.519645272809259, "flos": 25383589228800.0, "grad_norm": 1.4763516223620523, "language_loss": 0.83560228, "learning_rate": 1.9698178654181726e-06, "loss": 0.86016518, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.22375488, "step": 8643, "time_per_iteration": 2.8954999446868896 }, { "auxiliary_loss_clip": 0.01435004, "auxiliary_loss_mlp": 0.01046838, "balance_loss_clip": 1.26325178, "balance_loss_mlp": 1.02411699, "epoch": 0.519705396061927, "flos": 25383408249600.0, "grad_norm": 1.5476529847380043, "language_loss": 0.70939302, "learning_rate": 1.969428448662004e-06, "loss": 0.73421139, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.22729492, "step": 8644, "time_per_iteration": 2.8646645545959473 }, { "auxiliary_loss_clip": 0.01432045, "auxiliary_loss_mlp": 0.01037812, "balance_loss_clip": 1.26384974, "balance_loss_mlp": 1.01538873, "epoch": 0.5197655193145949, "flos": 28488116217600.0, "grad_norm": 1.6385286287669225, "language_loss": 0.80688095, "learning_rate": 1.9690390330651133e-06, "loss": 0.83157951, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.22412109, "step": 8645, "time_per_iteration": 2.9118285179138184 }, { "auxiliary_loss_clip": 0.01421876, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.25324273, "balance_loss_mlp": 1.01462984, "epoch": 0.5198256425672629, "flos": 20018208637440.0, "grad_norm": 1.863719455787869, "language_loss": 0.78057897, "learning_rate": 1.968649618642264e-06, "loss": 0.80517733, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.23327637, "step": 8646, "time_per_iteration": 4.3217527866363525 }, { "auxiliary_loss_clip": 0.01429046, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.26178384, "balance_loss_mlp": 1.01806235, "epoch": 0.5198857658199308, "flos": 19838450471040.0, "grad_norm": 2.5204431944277177, "language_loss": 0.66676521, "learning_rate": 1.9682602054082252e-06, "loss": 0.69145405, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21765137, "step": 8647, "time_per_iteration": 2.8281073570251465 }, { "auxiliary_loss_clip": 0.01433981, "auxiliary_loss_mlp": 0.01039718, "balance_loss_clip": 1.26215827, "balance_loss_mlp": 1.01674628, "epoch": 0.5199458890725989, "flos": 24472537056000.0, "grad_norm": 9.904541994808268, "language_loss": 0.72061908, "learning_rate": 1.967870793377763e-06, "loss": 0.74535608, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.22949219, "step": 8648, "time_per_iteration": 2.98569393157959 }, { "auxiliary_loss_clip": 0.0143423, "auxiliary_loss_mlp": 0.01039422, "balance_loss_clip": 1.26611662, "balance_loss_mlp": 1.01573563, "epoch": 0.5200060123252668, "flos": 23415370865280.0, "grad_norm": 1.7001647291819395, "language_loss": 0.65769076, "learning_rate": 1.967481382565642e-06, "loss": 0.68242729, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.23681641, "step": 8649, "time_per_iteration": 3.0430822372436523 }, { "auxiliary_loss_clip": 0.01442535, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.26918328, "balance_loss_mlp": 1.0167551, "epoch": 0.5200661355779348, "flos": 17210117450880.0, "grad_norm": 1.7989133742411625, "language_loss": 0.70948184, "learning_rate": 1.9670919729866315e-06, "loss": 0.73429573, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.22094727, "step": 8650, "time_per_iteration": 2.862105369567871 }, { "auxiliary_loss_clip": 0.01415776, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.24878037, "balance_loss_mlp": 1.01784539, "epoch": 0.5201262588306027, "flos": 18524238716160.0, "grad_norm": 1.7413234974732394, "language_loss": 0.77832502, "learning_rate": 1.966702564655496e-06, "loss": 0.80288053, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.21936035, "step": 8651, "time_per_iteration": 2.8867571353912354 }, { "auxiliary_loss_clip": 0.01437268, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.26740038, "balance_loss_mlp": 1.02087331, "epoch": 0.5201863820832707, "flos": 18627252422400.0, "grad_norm": 2.1986053933455736, "language_loss": 0.79657221, "learning_rate": 1.966313157587003e-06, "loss": 0.82138264, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22912598, "step": 8652, "time_per_iteration": 2.8454201221466064 }, { "auxiliary_loss_clip": 0.01433579, "auxiliary_loss_mlp": 0.01044564, "balance_loss_clip": 1.26633573, "balance_loss_mlp": 1.02017426, "epoch": 0.5202465053359386, "flos": 22867499854080.0, "grad_norm": 2.0918641959083653, "language_loss": 0.70918107, "learning_rate": 1.9659237517959187e-06, "loss": 0.73396254, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.24389648, "step": 8653, "time_per_iteration": 2.8421266078948975 }, { "auxiliary_loss_clip": 0.01447871, "auxiliary_loss_mlp": 0.0105035, "balance_loss_clip": 1.27563, "balance_loss_mlp": 1.02680588, "epoch": 0.5203066285886067, "flos": 21991675397760.0, "grad_norm": 1.759529301277713, "language_loss": 0.79305232, "learning_rate": 1.965534347297008e-06, "loss": 0.81803453, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.23547363, "step": 8654, "time_per_iteration": 2.8366036415100098 }, { "auxiliary_loss_clip": 0.01452576, "auxiliary_loss_mlp": 0.01043365, "balance_loss_clip": 1.27764833, "balance_loss_mlp": 1.02147818, "epoch": 0.5203667518412746, "flos": 20243827048320.0, "grad_norm": 1.8684350863058066, "language_loss": 0.84529775, "learning_rate": 1.9651449441050393e-06, "loss": 0.8702572, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.21875, "step": 8655, "time_per_iteration": 2.8998117446899414 }, { "auxiliary_loss_clip": 0.01438089, "auxiliary_loss_mlp": 0.01043024, "balance_loss_clip": 1.27163196, "balance_loss_mlp": 1.02038598, "epoch": 0.5204268750939426, "flos": 15713840044800.0, "grad_norm": 2.12059196589611, "language_loss": 0.67958069, "learning_rate": 1.9647555422347777e-06, "loss": 0.70439178, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.22619629, "step": 8656, "time_per_iteration": 2.7986974716186523 }, { "auxiliary_loss_clip": 0.01432692, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.26348639, "balance_loss_mlp": 1.01529169, "epoch": 0.5204869983466105, "flos": 27459752981760.0, "grad_norm": 2.039336498804898, "language_loss": 0.74506706, "learning_rate": 1.9643661417009893e-06, "loss": 0.76976019, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21313477, "step": 8657, "time_per_iteration": 4.2787744998931885 }, { "auxiliary_loss_clip": 0.01424716, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.25719333, "balance_loss_mlp": 1.0192405, "epoch": 0.5205471215992785, "flos": 20605334152320.0, "grad_norm": 1.792366109831257, "language_loss": 0.72415549, "learning_rate": 1.9639767425184408e-06, "loss": 0.74882013, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.22521973, "step": 8658, "time_per_iteration": 4.214099407196045 }, { "auxiliary_loss_clip": 0.01430739, "auxiliary_loss_mlp": 0.01042657, "balance_loss_clip": 1.26281798, "balance_loss_mlp": 1.02009034, "epoch": 0.5206072448519465, "flos": 22138196618880.0, "grad_norm": 1.6613150579769567, "language_loss": 0.8424266, "learning_rate": 1.963587344701897e-06, "loss": 0.86716056, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.22583008, "step": 8659, "time_per_iteration": 4.2712812423706055 }, { "auxiliary_loss_clip": 0.0145311, "auxiliary_loss_mlp": 0.01047788, "balance_loss_clip": 1.27744961, "balance_loss_mlp": 1.02402949, "epoch": 0.5206673681046144, "flos": 18338960684160.0, "grad_norm": 2.080892480883796, "language_loss": 0.76077008, "learning_rate": 1.9631979482661253e-06, "loss": 0.785779, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.23730469, "step": 8660, "time_per_iteration": 2.8500616550445557 }, { "auxiliary_loss_clip": 0.0142996, "auxiliary_loss_mlp": 0.01041582, "balance_loss_clip": 1.26054549, "balance_loss_mlp": 1.01948023, "epoch": 0.5207274913572825, "flos": 20239935995520.0, "grad_norm": 1.6867721932715662, "language_loss": 0.78390443, "learning_rate": 1.9628085532258906e-06, "loss": 0.8086198, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.22094727, "step": 8661, "time_per_iteration": 2.909979820251465 }, { "auxiliary_loss_clip": 0.01440276, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.26966846, "balance_loss_mlp": 1.01562905, "epoch": 0.5207876146099504, "flos": 22137110743680.0, "grad_norm": 1.7071137755458996, "language_loss": 0.71369624, "learning_rate": 1.9624191595959603e-06, "loss": 0.73847538, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.22021484, "step": 8662, "time_per_iteration": 2.9941656589508057 }, { "auxiliary_loss_clip": 0.01417676, "auxiliary_loss_mlp": 0.01037693, "balance_loss_clip": 1.25249195, "balance_loss_mlp": 1.01522231, "epoch": 0.5208477378626184, "flos": 23889574062720.0, "grad_norm": 3.403398030035798, "language_loss": 0.70378315, "learning_rate": 1.962029767391098e-06, "loss": 0.72833681, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.22460938, "step": 8663, "time_per_iteration": 2.894280195236206 }, { "auxiliary_loss_clip": 0.01434022, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.26446474, "balance_loss_mlp": 1.01696706, "epoch": 0.5209078611152863, "flos": 20971818184320.0, "grad_norm": 1.5472933057078044, "language_loss": 0.77328295, "learning_rate": 1.961640376626072e-06, "loss": 0.79803735, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.24462891, "step": 8664, "time_per_iteration": 2.903635025024414 }, { "auxiliary_loss_clip": 0.01426397, "auxiliary_loss_mlp": 0.01038282, "balance_loss_clip": 1.2572099, "balance_loss_mlp": 1.01592994, "epoch": 0.5209679843679543, "flos": 20677463642880.0, "grad_norm": 2.9267120390402632, "language_loss": 0.76969332, "learning_rate": 1.961250987315646e-06, "loss": 0.79434013, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.22338867, "step": 8665, "time_per_iteration": 2.8916714191436768 }, { "auxiliary_loss_clip": 0.01421547, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.25514448, "balance_loss_mlp": 1.01307476, "epoch": 0.5210281076206222, "flos": 20236225921920.0, "grad_norm": 2.7931245763094723, "language_loss": 0.72632968, "learning_rate": 1.960861599474586e-06, "loss": 0.75089449, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.21875, "step": 8666, "time_per_iteration": 2.9067909717559814 }, { "auxiliary_loss_clip": 0.01466503, "auxiliary_loss_mlp": 0.01040436, "balance_loss_clip": 1.28542233, "balance_loss_mlp": 1.01659381, "epoch": 0.5210882308732903, "flos": 16078378550400.0, "grad_norm": 2.3153056865306687, "language_loss": 0.69982439, "learning_rate": 1.9604722131176592e-06, "loss": 0.72489381, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 1.81152344, "router_z_loss_mlp": 0.23828125, "step": 8667, "time_per_iteration": 2.884077548980713 }, { "auxiliary_loss_clip": 0.01424611, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.25958991, "balance_loss_mlp": 1.01700747, "epoch": 0.5211483541259582, "flos": 24835627728000.0, "grad_norm": 1.3752063366566738, "language_loss": 0.81719434, "learning_rate": 1.960082828259629e-06, "loss": 0.84184504, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.23449707, "step": 8668, "time_per_iteration": 2.9702322483062744 }, { "auxiliary_loss_clip": 0.01442199, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.27307177, "balance_loss_mlp": 1.01457322, "epoch": 0.5212084773786262, "flos": 20378946579840.0, "grad_norm": 1.8142536826367668, "language_loss": 0.64705729, "learning_rate": 1.9596934449152623e-06, "loss": 0.67184377, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21862793, "step": 8669, "time_per_iteration": 2.9243452548980713 }, { "auxiliary_loss_clip": 0.01430553, "auxiliary_loss_mlp": 0.01038132, "balance_loss_clip": 1.26163936, "balance_loss_mlp": 1.01542306, "epoch": 0.5212686006312941, "flos": 23154977185920.0, "grad_norm": 1.6652448427165083, "language_loss": 0.67478335, "learning_rate": 1.959304063099325e-06, "loss": 0.69947028, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.22705078, "step": 8670, "time_per_iteration": 2.929241180419922 }, { "auxiliary_loss_clip": 0.01414909, "auxiliary_loss_mlp": 0.01039645, "balance_loss_clip": 1.24942601, "balance_loss_mlp": 1.01735246, "epoch": 0.5213287238839621, "flos": 27783408170880.0, "grad_norm": 2.345522623299387, "language_loss": 0.77196723, "learning_rate": 1.9589146828265806e-06, "loss": 0.79651278, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.22302246, "step": 8671, "time_per_iteration": 2.915121078491211 }, { "auxiliary_loss_clip": 0.01444757, "auxiliary_loss_mlp": 0.01043277, "balance_loss_clip": 1.27282357, "balance_loss_mlp": 1.02046049, "epoch": 0.5213888471366301, "flos": 19947119777280.0, "grad_norm": 2.760209779772705, "language_loss": 0.78877187, "learning_rate": 1.958525304111796e-06, "loss": 0.81365216, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.22814941, "step": 8672, "time_per_iteration": 2.89900279045105 }, { "auxiliary_loss_clip": 0.01421881, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.25332808, "balance_loss_mlp": 1.01412463, "epoch": 0.521448970389298, "flos": 16991511984000.0, "grad_norm": 3.3563980653976655, "language_loss": 0.72366297, "learning_rate": 1.958135926969736e-06, "loss": 0.74823797, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21496582, "step": 8673, "time_per_iteration": 2.880209445953369 }, { "auxiliary_loss_clip": 0.01417687, "auxiliary_loss_mlp": 0.01037757, "balance_loss_clip": 1.25010395, "balance_loss_mlp": 1.01467824, "epoch": 0.5215090936419661, "flos": 18999075340800.0, "grad_norm": 2.0217226975625873, "language_loss": 0.75902009, "learning_rate": 1.957746551415166e-06, "loss": 0.78357452, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.23071289, "step": 8674, "time_per_iteration": 2.9045138359069824 }, { "auxiliary_loss_clip": 0.01432173, "auxiliary_loss_mlp": 0.01040681, "balance_loss_clip": 1.25972342, "balance_loss_mlp": 1.01741111, "epoch": 0.521569216894634, "flos": 16151955874560.0, "grad_norm": 2.0760677618110353, "language_loss": 0.86774719, "learning_rate": 1.9573571774628506e-06, "loss": 0.89247572, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.23266602, "step": 8675, "time_per_iteration": 2.8877787590026855 }, { "auxiliary_loss_clip": 0.01223728, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.12271976, "balance_loss_mlp": 1.00813091, "epoch": 0.521629340147302, "flos": 57605191407360.0, "grad_norm": 0.8740351772811429, "language_loss": 0.63229823, "learning_rate": 1.9569678051275556e-06, "loss": 0.65494102, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 1.0078125, "router_z_loss_mlp": 0.32421875, "step": 8676, "time_per_iteration": 3.3835835456848145 }, { "auxiliary_loss_clip": 0.01420001, "auxiliary_loss_mlp": 0.01034998, "balance_loss_clip": 1.2513268, "balance_loss_mlp": 1.01326656, "epoch": 0.5216894633999699, "flos": 26808099102720.0, "grad_norm": 5.725600368200996, "language_loss": 0.68822861, "learning_rate": 1.956578434424046e-06, "loss": 0.71277857, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21740723, "step": 8677, "time_per_iteration": 2.9613449573516846 }, { "auxiliary_loss_clip": 0.01425452, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.25908566, "balance_loss_mlp": 1.01493728, "epoch": 0.5217495866526379, "flos": 26369711804160.0, "grad_norm": 1.7720293663194315, "language_loss": 0.65988523, "learning_rate": 1.956189065367086e-06, "loss": 0.68451685, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.22753906, "step": 8678, "time_per_iteration": 2.9780349731445312 }, { "auxiliary_loss_clip": 0.01444895, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.26994061, "balance_loss_mlp": 1.01268649, "epoch": 0.5218097099053058, "flos": 23593816932480.0, "grad_norm": 2.308526178679417, "language_loss": 0.68997848, "learning_rate": 1.9557996979714414e-06, "loss": 0.71478212, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 1.75, "router_z_loss_mlp": 0.22790527, "step": 8679, "time_per_iteration": 2.940371513366699 }, { "auxiliary_loss_clip": 0.01439722, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.26997495, "balance_loss_mlp": 1.01192021, "epoch": 0.5218698331579739, "flos": 18086484844800.0, "grad_norm": 1.7367010236964207, "language_loss": 0.67812192, "learning_rate": 1.9554103322518764e-06, "loss": 0.70284998, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21179199, "step": 8680, "time_per_iteration": 2.8905317783355713 }, { "auxiliary_loss_clip": 0.01447112, "auxiliary_loss_mlp": 0.01037265, "balance_loss_clip": 1.27513623, "balance_loss_mlp": 1.01442468, "epoch": 0.5219299564106418, "flos": 19290443725440.0, "grad_norm": 6.50166766622628, "language_loss": 0.83848196, "learning_rate": 1.955020968223156e-06, "loss": 0.86332577, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.22851562, "step": 8681, "time_per_iteration": 4.266220808029175 }, { "auxiliary_loss_clip": 0.0142651, "auxiliary_loss_mlp": 0.01035088, "balance_loss_clip": 1.2582171, "balance_loss_mlp": 1.01229548, "epoch": 0.5219900796633098, "flos": 26662618512000.0, "grad_norm": 1.8259938476678474, "language_loss": 0.78665322, "learning_rate": 1.9546316059000454e-06, "loss": 0.81126916, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.2277832, "step": 8682, "time_per_iteration": 2.900270700454712 }, { "auxiliary_loss_clip": 0.01434119, "auxiliary_loss_mlp": 0.01043814, "balance_loss_clip": 1.2653203, "balance_loss_mlp": 1.02180791, "epoch": 0.5220502029159777, "flos": 34325799724800.0, "grad_norm": 1.5875297924098681, "language_loss": 0.69477952, "learning_rate": 1.9542422452973082e-06, "loss": 0.71955884, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.22009277, "step": 8683, "time_per_iteration": 2.9405548572540283 }, { "auxiliary_loss_clip": 0.01432578, "auxiliary_loss_mlp": 0.01042676, "balance_loss_clip": 1.26116014, "balance_loss_mlp": 1.01962066, "epoch": 0.5221103261686457, "flos": 22164737333760.0, "grad_norm": 1.5950549985588451, "language_loss": 0.77018666, "learning_rate": 1.9538528864297104e-06, "loss": 0.79493916, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.23034668, "step": 8684, "time_per_iteration": 2.8775296211242676 }, { "auxiliary_loss_clip": 0.01426038, "auxiliary_loss_mlp": 0.01035391, "balance_loss_clip": 1.26007867, "balance_loss_mlp": 1.01225197, "epoch": 0.5221704494213137, "flos": 19217771297280.0, "grad_norm": 2.112818763278496, "language_loss": 0.76604962, "learning_rate": 1.9534635293120153e-06, "loss": 0.79066384, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.23144531, "step": 8685, "time_per_iteration": 2.8297641277313232 }, { "auxiliary_loss_clip": 0.01449657, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.27654588, "balance_loss_mlp": 1.02239251, "epoch": 0.5222305726739817, "flos": 19363387622400.0, "grad_norm": 3.3173716185161273, "language_loss": 0.81562221, "learning_rate": 1.9530741739589876e-06, "loss": 0.84057426, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.23156738, "step": 8686, "time_per_iteration": 2.8636879920959473 }, { "auxiliary_loss_clip": 0.01407366, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.24435592, "balance_loss_mlp": 1.01357925, "epoch": 0.5222906959266497, "flos": 27825286872960.0, "grad_norm": 4.932936987156935, "language_loss": 0.70630151, "learning_rate": 1.9526848203853927e-06, "loss": 0.73073906, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.22814941, "step": 8687, "time_per_iteration": 2.9503347873687744 }, { "auxiliary_loss_clip": 0.0141015, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.2457974, "balance_loss_mlp": 1.01217043, "epoch": 0.5223508191793176, "flos": 12720425581440.0, "grad_norm": 6.175388522840969, "language_loss": 0.8317554, "learning_rate": 1.9522954686059936e-06, "loss": 0.85619545, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.21691895, "step": 8688, "time_per_iteration": 2.868380546569824 }, { "auxiliary_loss_clip": 0.01429925, "auxiliary_loss_mlp": 0.01037935, "balance_loss_clip": 1.2614677, "balance_loss_mlp": 1.01471269, "epoch": 0.5224109424319856, "flos": 15640534189440.0, "grad_norm": 2.3766813696010387, "language_loss": 0.75216746, "learning_rate": 1.9519061186355558e-06, "loss": 0.77684605, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.2322998, "step": 8689, "time_per_iteration": 2.984994649887085 }, { "auxiliary_loss_clip": 0.01417621, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.25201488, "balance_loss_mlp": 1.01451802, "epoch": 0.5224710656846535, "flos": 15750696574080.0, "grad_norm": 1.9933809226295882, "language_loss": 0.834378, "learning_rate": 1.9515167704888417e-06, "loss": 0.85892802, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.2286377, "step": 8690, "time_per_iteration": 2.8842451572418213 }, { "auxiliary_loss_clip": 0.01434467, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.265028, "balance_loss_mlp": 1.01830435, "epoch": 0.5225311889373215, "flos": 26042075072640.0, "grad_norm": 1.9108560111027004, "language_loss": 0.79847389, "learning_rate": 1.9511274241806173e-06, "loss": 0.82323706, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.23547363, "step": 8691, "time_per_iteration": 4.313936471939087 }, { "auxiliary_loss_clip": 0.01432275, "auxiliary_loss_mlp": 0.01041697, "balance_loss_clip": 1.26037824, "balance_loss_mlp": 1.01766479, "epoch": 0.5225913121899894, "flos": 18378441411840.0, "grad_norm": 2.1609467624933654, "language_loss": 0.77627993, "learning_rate": 1.950738079725646e-06, "loss": 0.80101967, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.24047852, "step": 8692, "time_per_iteration": 2.913616180419922 }, { "auxiliary_loss_clip": 0.01423836, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.25937176, "balance_loss_mlp": 1.02353907, "epoch": 0.5226514354426575, "flos": 29284210056960.0, "grad_norm": 1.7983586855325433, "language_loss": 0.72784805, "learning_rate": 1.950348737138691e-06, "loss": 0.75254387, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.22216797, "step": 8693, "time_per_iteration": 2.9010279178619385 }, { "auxiliary_loss_clip": 0.01449637, "auxiliary_loss_mlp": 0.01040552, "balance_loss_clip": 1.27425587, "balance_loss_mlp": 1.01790261, "epoch": 0.5227115586953254, "flos": 22862658660480.0, "grad_norm": 6.2782600656544805, "language_loss": 0.83053547, "learning_rate": 1.949959396434517e-06, "loss": 0.85543734, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.22644043, "step": 8694, "time_per_iteration": 4.290493726730347 }, { "auxiliary_loss_clip": 0.01222424, "auxiliary_loss_mlp": 0.01039937, "balance_loss_clip": 1.12347078, "balance_loss_mlp": 1.00713086, "epoch": 0.5227716819479934, "flos": 57501227560320.0, "grad_norm": 0.7544515112290998, "language_loss": 0.55717349, "learning_rate": 1.949570057627888e-06, "loss": 0.57979715, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 0.98828125, "router_z_loss_mlp": 0.328125, "step": 8695, "time_per_iteration": 3.400378704071045 }, { "auxiliary_loss_clip": 0.0143567, "auxiliary_loss_mlp": 0.01040241, "balance_loss_clip": 1.26560605, "balance_loss_mlp": 1.01710248, "epoch": 0.5228318052006613, "flos": 13816167603840.0, "grad_norm": 1.7340808031825012, "language_loss": 0.73508185, "learning_rate": 1.9491807207335672e-06, "loss": 0.75984097, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.23132324, "step": 8696, "time_per_iteration": 2.8728489875793457 }, { "auxiliary_loss_clip": 0.01437081, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.26580834, "balance_loss_mlp": 1.01348257, "epoch": 0.5228919284533293, "flos": 15604582556160.0, "grad_norm": 1.7726519697564127, "language_loss": 0.72185171, "learning_rate": 1.948791385766319e-06, "loss": 0.74658298, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22595215, "step": 8697, "time_per_iteration": 2.8408749103546143 }, { "auxiliary_loss_clip": 0.01434089, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.26625419, "balance_loss_mlp": 1.01300073, "epoch": 0.5229520517059973, "flos": 22501558759680.0, "grad_norm": 2.2834703196666615, "language_loss": 0.81493723, "learning_rate": 1.948402052740906e-06, "loss": 0.83963341, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.22521973, "step": 8698, "time_per_iteration": 2.873993158340454 }, { "auxiliary_loss_clip": 0.01434345, "auxiliary_loss_mlp": 0.01040326, "balance_loss_clip": 1.26549673, "balance_loss_mlp": 1.01841533, "epoch": 0.5230121749586653, "flos": 22101023376000.0, "grad_norm": 1.8067306665909637, "language_loss": 0.75037909, "learning_rate": 1.948012721672093e-06, "loss": 0.77512574, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21887207, "step": 8699, "time_per_iteration": 2.8649888038635254 }, { "auxiliary_loss_clip": 0.01453318, "auxiliary_loss_mlp": 0.01037801, "balance_loss_clip": 1.27685189, "balance_loss_mlp": 1.01521087, "epoch": 0.5230722982113333, "flos": 22137246478080.0, "grad_norm": 2.0809222700888097, "language_loss": 0.73586994, "learning_rate": 1.947623392574642e-06, "loss": 0.76078111, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.22595215, "step": 8700, "time_per_iteration": 2.887451171875 }, { "auxiliary_loss_clip": 0.0144177, "auxiliary_loss_mlp": 0.01042734, "balance_loss_clip": 1.27022421, "balance_loss_mlp": 1.02022767, "epoch": 0.5231324214640012, "flos": 25020046108800.0, "grad_norm": 1.7259179290422546, "language_loss": 0.68856555, "learning_rate": 1.947234065463318e-06, "loss": 0.71341062, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.22521973, "step": 8701, "time_per_iteration": 2.933171272277832 }, { "auxiliary_loss_clip": 0.01434882, "auxiliary_loss_mlp": 0.01040738, "balance_loss_clip": 1.26613402, "balance_loss_mlp": 1.01861238, "epoch": 0.5231925447166692, "flos": 25751883052800.0, "grad_norm": 1.7644656755120014, "language_loss": 0.67170274, "learning_rate": 1.9468447403528826e-06, "loss": 0.69645888, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.22119141, "step": 8702, "time_per_iteration": 2.9349236488342285 }, { "auxiliary_loss_clip": 0.01430658, "auxiliary_loss_mlp": 0.01040672, "balance_loss_clip": 1.26138568, "balance_loss_mlp": 1.01854634, "epoch": 0.5232526679693371, "flos": 21443849631360.0, "grad_norm": 1.9392648119999083, "language_loss": 0.76971948, "learning_rate": 1.946455417258101e-06, "loss": 0.79443282, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.22119141, "step": 8703, "time_per_iteration": 2.8604087829589844 }, { "auxiliary_loss_clip": 0.01454605, "auxiliary_loss_mlp": 0.01046378, "balance_loss_clip": 1.27638268, "balance_loss_mlp": 1.0228703, "epoch": 0.5233127912220051, "flos": 35311877055360.0, "grad_norm": 1.9924875203816799, "language_loss": 0.77497941, "learning_rate": 1.9460660961937348e-06, "loss": 0.79998928, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 1.78417969, "router_z_loss_mlp": 0.23486328, "step": 8704, "time_per_iteration": 2.9390411376953125 }, { "auxiliary_loss_clip": 0.01435246, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.26677632, "balance_loss_mlp": 1.02134562, "epoch": 0.523372914474673, "flos": 17058981260160.0, "grad_norm": 1.8890442938585859, "language_loss": 0.79056221, "learning_rate": 1.9456767771745474e-06, "loss": 0.81534159, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21325684, "step": 8705, "time_per_iteration": 2.806535005569458 }, { "auxiliary_loss_clip": 0.01453124, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.27757168, "balance_loss_mlp": 1.0138967, "epoch": 0.5234330377273411, "flos": 18415614654720.0, "grad_norm": 2.5282118937483786, "language_loss": 0.70484716, "learning_rate": 1.9452874602153027e-06, "loss": 0.72973716, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21972656, "step": 8706, "time_per_iteration": 2.806443452835083 }, { "auxiliary_loss_clip": 0.01217362, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.11899328, "balance_loss_mlp": 1.00303519, "epoch": 0.523493160980009, "flos": 65884635786240.0, "grad_norm": 0.6779726978998978, "language_loss": 0.5254972, "learning_rate": 1.9448981453307623e-06, "loss": 0.54796052, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.25976562, "step": 8707, "time_per_iteration": 3.429171085357666 }, { "auxiliary_loss_clip": 0.01428115, "auxiliary_loss_mlp": 0.01043258, "balance_loss_clip": 1.25944483, "balance_loss_mlp": 1.02078664, "epoch": 0.523553284232677, "flos": 21882191685120.0, "grad_norm": 1.7064573689241458, "language_loss": 0.75664192, "learning_rate": 1.9445088325356904e-06, "loss": 0.78135568, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.22460938, "step": 8708, "time_per_iteration": 2.8814473152160645 }, { "auxiliary_loss_clip": 0.01428207, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.26110649, "balance_loss_mlp": 1.01070631, "epoch": 0.5236134074853449, "flos": 20857448033280.0, "grad_norm": 1.5748890033673513, "language_loss": 0.78026497, "learning_rate": 1.944119521844849e-06, "loss": 0.80487776, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.22351074, "step": 8709, "time_per_iteration": 2.902432918548584 }, { "auxiliary_loss_clip": 0.01459918, "auxiliary_loss_mlp": 0.0104169, "balance_loss_clip": 1.28259575, "balance_loss_mlp": 1.01820636, "epoch": 0.5236735307380129, "flos": 25531422549120.0, "grad_norm": 2.3042632962770564, "language_loss": 0.8423053, "learning_rate": 1.9437302132730003e-06, "loss": 0.86732137, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.23486328, "step": 8710, "time_per_iteration": 2.930004835128784 }, { "auxiliary_loss_clip": 0.01435601, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.26771474, "balance_loss_mlp": 1.01530004, "epoch": 0.523733653990681, "flos": 23592685812480.0, "grad_norm": 1.9653450678461135, "language_loss": 0.70342129, "learning_rate": 1.943340906834908e-06, "loss": 0.72814035, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21008301, "step": 8711, "time_per_iteration": 2.896667718887329 }, { "auxiliary_loss_clip": 0.01422858, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.25382257, "balance_loss_mlp": 1.01555634, "epoch": 0.5237937772433489, "flos": 21116258144640.0, "grad_norm": 2.035276995213851, "language_loss": 0.83864206, "learning_rate": 1.9429516025453345e-06, "loss": 0.86325443, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.22814941, "step": 8712, "time_per_iteration": 2.8728442192077637 }, { "auxiliary_loss_clip": 0.01438232, "auxiliary_loss_mlp": 0.01039007, "balance_loss_clip": 1.26591825, "balance_loss_mlp": 1.01470041, "epoch": 0.5238539004960169, "flos": 19182498336000.0, "grad_norm": 1.7975799082431119, "language_loss": 0.69896102, "learning_rate": 1.9425623004190415e-06, "loss": 0.72373331, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.24328613, "step": 8713, "time_per_iteration": 2.881561040878296 }, { "auxiliary_loss_clip": 0.01447462, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.27170944, "balance_loss_mlp": 1.01534224, "epoch": 0.5239140237486848, "flos": 17895505968000.0, "grad_norm": 2.6169529860918317, "language_loss": 0.77833641, "learning_rate": 1.9421730004707925e-06, "loss": 0.80319929, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.23474121, "step": 8714, "time_per_iteration": 2.847336530685425 }, { "auxiliary_loss_clip": 0.0144675, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.27299476, "balance_loss_mlp": 1.01242304, "epoch": 0.5239741470013528, "flos": 17939194462080.0, "grad_norm": 2.1585717139085734, "language_loss": 0.76831394, "learning_rate": 1.9417837027153483e-06, "loss": 0.79313314, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.22741699, "step": 8715, "time_per_iteration": 2.8359851837158203 }, { "auxiliary_loss_clip": 0.01428034, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.25964761, "balance_loss_mlp": 1.01921773, "epoch": 0.5240342702540207, "flos": 31005834405120.0, "grad_norm": 1.6712454989971324, "language_loss": 0.72139132, "learning_rate": 1.9413944071674723e-06, "loss": 0.74608755, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.22387695, "step": 8716, "time_per_iteration": 4.357141971588135 }, { "auxiliary_loss_clip": 0.01430411, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.2607578, "balance_loss_mlp": 1.01502144, "epoch": 0.5240943935066887, "flos": 25014752467200.0, "grad_norm": 1.8441746235884853, "language_loss": 0.87364447, "learning_rate": 1.941005113841926e-06, "loss": 0.89831293, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.2142334, "step": 8717, "time_per_iteration": 2.837306022644043 }, { "auxiliary_loss_clip": 0.0142424, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 1.25421929, "balance_loss_mlp": 1.01553953, "epoch": 0.5241545167593566, "flos": 23669475517440.0, "grad_norm": 1.8849469284130125, "language_loss": 0.62596458, "learning_rate": 1.9406158227534723e-06, "loss": 0.65058279, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22033691, "step": 8718, "time_per_iteration": 2.876553773880005 }, { "auxiliary_loss_clip": 0.01438166, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.26457632, "balance_loss_mlp": 1.01268435, "epoch": 0.5242146400120247, "flos": 23409850999680.0, "grad_norm": 1.6520630066717683, "language_loss": 0.72703975, "learning_rate": 1.940226533916872e-06, "loss": 0.75178397, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.2355957, "step": 8719, "time_per_iteration": 2.86631178855896 }, { "auxiliary_loss_clip": 0.01417236, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.25006795, "balance_loss_mlp": 1.01465917, "epoch": 0.5242747632646926, "flos": 17758078951680.0, "grad_norm": 1.8652464661622177, "language_loss": 0.74097371, "learning_rate": 1.9398372473468877e-06, "loss": 0.76550621, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21337891, "step": 8720, "time_per_iteration": 2.834582567214966 }, { "auxiliary_loss_clip": 0.01423701, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.2541374, "balance_loss_mlp": 1.01424992, "epoch": 0.5243348865173606, "flos": 32610328669440.0, "grad_norm": 1.5769831748074792, "language_loss": 0.70833611, "learning_rate": 1.939447963058281e-06, "loss": 0.73294079, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.2253418, "step": 8721, "time_per_iteration": 2.9149248600006104 }, { "auxiliary_loss_clip": 0.01435317, "auxiliary_loss_mlp": 0.01040393, "balance_loss_clip": 1.26562846, "balance_loss_mlp": 1.01757693, "epoch": 0.5243950097700285, "flos": 25495154202240.0, "grad_norm": 1.6776504379994015, "language_loss": 0.86821866, "learning_rate": 1.939058681065813e-06, "loss": 0.89297581, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.22814941, "step": 8722, "time_per_iteration": 2.8608219623565674 }, { "auxiliary_loss_clip": 0.01427736, "auxiliary_loss_mlp": 0.01035398, "balance_loss_clip": 1.25919724, "balance_loss_mlp": 1.01351118, "epoch": 0.5244551330226965, "flos": 15276855335040.0, "grad_norm": 1.7466848447137104, "language_loss": 0.80657172, "learning_rate": 1.938669401384247e-06, "loss": 0.83120304, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21899414, "step": 8723, "time_per_iteration": 2.806652784347534 }, { "auxiliary_loss_clip": 0.01446459, "auxiliary_loss_mlp": 0.01040372, "balance_loss_clip": 1.27372658, "balance_loss_mlp": 1.01674426, "epoch": 0.5245152562753645, "flos": 22247363617920.0, "grad_norm": 2.46185143559545, "language_loss": 0.76091677, "learning_rate": 1.9382801240283426e-06, "loss": 0.78578502, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.23608398, "step": 8724, "time_per_iteration": 2.872649908065796 }, { "auxiliary_loss_clip": 0.01451956, "auxiliary_loss_mlp": 0.01043497, "balance_loss_clip": 1.27494025, "balance_loss_mlp": 1.02053738, "epoch": 0.5245753795280325, "flos": 29439011076480.0, "grad_norm": 1.6002125436526868, "language_loss": 0.70973098, "learning_rate": 1.9378908490128625e-06, "loss": 0.73468548, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 1.77148438, "router_z_loss_mlp": 0.22961426, "step": 8725, "time_per_iteration": 2.9218711853027344 }, { "auxiliary_loss_clip": 0.01217781, "auxiliary_loss_mlp": 0.01037005, "balance_loss_clip": 1.11896992, "balance_loss_mlp": 1.01678681, "epoch": 0.5246355027807005, "flos": 58864149982080.0, "grad_norm": 0.7778012333367323, "language_loss": 0.55684251, "learning_rate": 1.937501576352568e-06, "loss": 0.57939041, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.20214844, "step": 8726, "time_per_iteration": 4.762006998062134 }, { "auxiliary_loss_clip": 0.01212489, "auxiliary_loss_mlp": 0.01032273, "balance_loss_clip": 1.11545527, "balance_loss_mlp": 1.01224566, "epoch": 0.5246956260333684, "flos": 64557936466560.0, "grad_norm": 0.7993136744378349, "language_loss": 0.58404535, "learning_rate": 1.937112306062219e-06, "loss": 0.606493, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.20019531, "step": 8727, "time_per_iteration": 3.2745563983917236 }, { "auxiliary_loss_clip": 0.01446457, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.27191257, "balance_loss_mlp": 1.01882958, "epoch": 0.5247557492860364, "flos": 24543942629760.0, "grad_norm": 1.2919400210879883, "language_loss": 0.7135483, "learning_rate": 1.9367230381565786e-06, "loss": 0.73842406, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.22290039, "step": 8728, "time_per_iteration": 2.909552574157715 }, { "auxiliary_loss_clip": 0.01432062, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.26190412, "balance_loss_mlp": 1.01471281, "epoch": 0.5248158725387043, "flos": 18815154652800.0, "grad_norm": 1.5487725503254566, "language_loss": 0.69698286, "learning_rate": 1.9363337726504062e-06, "loss": 0.72165847, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20751953, "step": 8729, "time_per_iteration": 5.7708892822265625 }, { "auxiliary_loss_clip": 0.01446805, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.27386665, "balance_loss_mlp": 1.01894784, "epoch": 0.5248759957913723, "flos": 20964669505920.0, "grad_norm": 2.2254103429191034, "language_loss": 0.84482479, "learning_rate": 1.935944509558464e-06, "loss": 0.86969477, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21240234, "step": 8730, "time_per_iteration": 2.853999137878418 }, { "auxiliary_loss_clip": 0.0144817, "auxiliary_loss_mlp": 0.01048331, "balance_loss_clip": 1.27709258, "balance_loss_mlp": 1.02648008, "epoch": 0.5249361190440403, "flos": 18670171754880.0, "grad_norm": 2.4184364167087757, "language_loss": 0.7992419, "learning_rate": 1.9355552488955125e-06, "loss": 0.82420695, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21838379, "step": 8731, "time_per_iteration": 2.8147811889648438 }, { "auxiliary_loss_clip": 0.01421719, "auxiliary_loss_mlp": 0.01040384, "balance_loss_clip": 1.25641322, "balance_loss_mlp": 1.01791334, "epoch": 0.5249962422967083, "flos": 24874067825280.0, "grad_norm": 1.6181430199173799, "language_loss": 0.8396827, "learning_rate": 1.935165990676312e-06, "loss": 0.86430371, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.22497559, "step": 8732, "time_per_iteration": 2.922715187072754 }, { "auxiliary_loss_clip": 0.01425023, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.25572157, "balance_loss_mlp": 1.02023268, "epoch": 0.5250563655493762, "flos": 15269661411840.0, "grad_norm": 1.5594107793685648, "language_loss": 0.78315556, "learning_rate": 1.9347767349156237e-06, "loss": 0.80781436, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20629883, "step": 8733, "time_per_iteration": 2.8406968116760254 }, { "auxiliary_loss_clip": 0.01450289, "auxiliary_loss_mlp": 0.01043126, "balance_loss_clip": 1.27645874, "balance_loss_mlp": 1.02052355, "epoch": 0.5251164888020442, "flos": 18634536835200.0, "grad_norm": 2.9286396254012064, "language_loss": 0.82172465, "learning_rate": 1.934387481628208e-06, "loss": 0.84665883, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.22607422, "step": 8734, "time_per_iteration": 2.8771281242370605 }, { "auxiliary_loss_clip": 0.01429193, "auxiliary_loss_mlp": 0.01045502, "balance_loss_clip": 1.26059663, "balance_loss_mlp": 1.02354407, "epoch": 0.5251766120547121, "flos": 29721420990720.0, "grad_norm": 1.409983861843141, "language_loss": 0.77015269, "learning_rate": 1.933998230828826e-06, "loss": 0.79489958, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21948242, "step": 8735, "time_per_iteration": 2.880671977996826 }, { "auxiliary_loss_clip": 0.01423538, "auxiliary_loss_mlp": 0.01042758, "balance_loss_clip": 1.25480807, "balance_loss_mlp": 1.02217078, "epoch": 0.5252367353073801, "flos": 23450643826560.0, "grad_norm": 1.5086434854480595, "language_loss": 0.81007409, "learning_rate": 1.9336089825322376e-06, "loss": 0.83473706, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20593262, "step": 8736, "time_per_iteration": 2.8694167137145996 }, { "auxiliary_loss_clip": 0.01435369, "auxiliary_loss_mlp": 0.01045502, "balance_loss_clip": 1.26536691, "balance_loss_mlp": 1.0228281, "epoch": 0.5252968585600482, "flos": 30823044837120.0, "grad_norm": 2.133741611460623, "language_loss": 0.70779854, "learning_rate": 1.9332197367532033e-06, "loss": 0.73260725, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22680664, "step": 8737, "time_per_iteration": 2.9072306156158447 }, { "auxiliary_loss_clip": 0.0143236, "auxiliary_loss_mlp": 0.01048382, "balance_loss_clip": 1.26104331, "balance_loss_mlp": 1.02634025, "epoch": 0.5253569818127161, "flos": 20637213753600.0, "grad_norm": 1.435211591103227, "language_loss": 0.7800473, "learning_rate": 1.9328304935064833e-06, "loss": 0.80485475, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.22045898, "step": 8738, "time_per_iteration": 2.8810434341430664 }, { "auxiliary_loss_clip": 0.01218097, "auxiliary_loss_mlp": 0.01048648, "balance_loss_clip": 1.12017787, "balance_loss_mlp": 1.02747655, "epoch": 0.5254171050653841, "flos": 63458891573760.0, "grad_norm": 0.7440063838295472, "language_loss": 0.54520476, "learning_rate": 1.932441252806837e-06, "loss": 0.56787223, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.21191406, "step": 8739, "time_per_iteration": 3.3355932235717773 }, { "auxiliary_loss_clip": 0.01430198, "auxiliary_loss_mlp": 0.01044716, "balance_loss_clip": 1.26075459, "balance_loss_mlp": 1.02423549, "epoch": 0.525477228318052, "flos": 34682194166400.0, "grad_norm": 1.803616612090727, "language_loss": 0.85113323, "learning_rate": 1.9320520146690263e-06, "loss": 0.87588239, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20507812, "step": 8740, "time_per_iteration": 2.955010175704956 }, { "auxiliary_loss_clip": 0.0143058, "auxiliary_loss_mlp": 0.01047038, "balance_loss_clip": 1.26212168, "balance_loss_mlp": 1.0262835, "epoch": 0.52553735157072, "flos": 17939737399680.0, "grad_norm": 2.4212645953831746, "language_loss": 0.70107806, "learning_rate": 1.9316627791078093e-06, "loss": 0.72585422, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20739746, "step": 8741, "time_per_iteration": 2.8294286727905273 }, { "auxiliary_loss_clip": 0.0144541, "auxiliary_loss_mlp": 0.01043809, "balance_loss_clip": 1.27176952, "balance_loss_mlp": 1.02081335, "epoch": 0.5255974748233879, "flos": 9947788335360.0, "grad_norm": 1.9695357766915123, "language_loss": 0.67461324, "learning_rate": 1.931273546137947e-06, "loss": 0.69950545, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.23010254, "step": 8742, "time_per_iteration": 2.8184654712677 }, { "auxiliary_loss_clip": 0.01442982, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.26770842, "balance_loss_mlp": 1.02165806, "epoch": 0.5256575980760559, "flos": 16875920223360.0, "grad_norm": 2.572580999628291, "language_loss": 0.63674122, "learning_rate": 1.9308843157741983e-06, "loss": 0.66161066, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 1.75195312, "router_z_loss_mlp": 0.22314453, "step": 8743, "time_per_iteration": 2.7902088165283203 }, { "auxiliary_loss_clip": 0.01212374, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.11811805, "balance_loss_mlp": 1.02396762, "epoch": 0.5257177213287239, "flos": 62420185261440.0, "grad_norm": 0.7757922865130171, "language_loss": 0.54161704, "learning_rate": 1.930495088031323e-06, "loss": 0.56415212, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 0.171875, "step": 8744, "time_per_iteration": 3.4824509620666504 }, { "auxiliary_loss_clip": 0.01454064, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.27740383, "balance_loss_mlp": 1.02070761, "epoch": 0.5257778445813919, "flos": 20786585397120.0, "grad_norm": 2.2136442035060653, "language_loss": 0.77154922, "learning_rate": 1.9301058629240814e-06, "loss": 0.79653049, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.23364258, "step": 8745, "time_per_iteration": 2.8471662998199463 }, { "auxiliary_loss_clip": 0.01427904, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.25963616, "balance_loss_mlp": 1.01652181, "epoch": 0.5258379678340598, "flos": 17027237393280.0, "grad_norm": 2.2274427574889883, "language_loss": 0.82166576, "learning_rate": 1.9297166404672324e-06, "loss": 0.84633851, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.22851562, "step": 8746, "time_per_iteration": 2.855973720550537 }, { "auxiliary_loss_clip": 0.01414344, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.24842334, "balance_loss_mlp": 1.01551747, "epoch": 0.5258980910867278, "flos": 21078496719360.0, "grad_norm": 1.7717707560122293, "language_loss": 0.76101941, "learning_rate": 1.9293274206755353e-06, "loss": 0.78555906, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.2409668, "step": 8747, "time_per_iteration": 2.8706488609313965 }, { "auxiliary_loss_clip": 0.01401209, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.23642349, "balance_loss_mlp": 1.01112247, "epoch": 0.5259582143393957, "flos": 18013540947840.0, "grad_norm": 3.7811661382953514, "language_loss": 0.83530742, "learning_rate": 1.9289382035637505e-06, "loss": 0.8596645, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.23400879, "step": 8748, "time_per_iteration": 2.8194899559020996 }, { "auxiliary_loss_clip": 0.01439, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.26621819, "balance_loss_mlp": 1.01504374, "epoch": 0.5260183375920637, "flos": 22794103509120.0, "grad_norm": 2.1245999698110194, "language_loss": 0.8193332, "learning_rate": 1.9285489891466345e-06, "loss": 0.8441084, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.23486328, "step": 8749, "time_per_iteration": 2.9239392280578613 }, { "auxiliary_loss_clip": 0.0142506, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.25713837, "balance_loss_mlp": 1.01964331, "epoch": 0.5260784608447318, "flos": 27063696833280.0, "grad_norm": 1.9457856628049557, "language_loss": 0.73520625, "learning_rate": 1.9281597774389487e-06, "loss": 0.75989425, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.2409668, "step": 8750, "time_per_iteration": 2.9139065742492676 }, { "auxiliary_loss_clip": 0.01432798, "auxiliary_loss_mlp": 0.01038152, "balance_loss_clip": 1.26173449, "balance_loss_mlp": 1.0158838, "epoch": 0.5261385840973997, "flos": 20671265105280.0, "grad_norm": 1.6334525633414536, "language_loss": 0.76932061, "learning_rate": 1.9277705684554517e-06, "loss": 0.79403007, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.22265625, "step": 8751, "time_per_iteration": 4.25470757484436 }, { "auxiliary_loss_clip": 0.01422546, "auxiliary_loss_mlp": 0.01038138, "balance_loss_clip": 1.25422668, "balance_loss_mlp": 1.01541638, "epoch": 0.5261987073500677, "flos": 23633026191360.0, "grad_norm": 1.4350382239197972, "language_loss": 0.76807415, "learning_rate": 1.927381362210902e-06, "loss": 0.79268092, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.22705078, "step": 8752, "time_per_iteration": 2.890977144241333 }, { "auxiliary_loss_clip": 0.0144297, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.26909208, "balance_loss_mlp": 1.01564837, "epoch": 0.5262588306027356, "flos": 27647700456960.0, "grad_norm": 1.4526081537495705, "language_loss": 0.68140531, "learning_rate": 1.926992158720058e-06, "loss": 0.70622104, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.22973633, "step": 8753, "time_per_iteration": 2.9059669971466064 }, { "auxiliary_loss_clip": 0.01432158, "auxiliary_loss_mlp": 0.01039995, "balance_loss_clip": 1.26445985, "balance_loss_mlp": 1.01736856, "epoch": 0.5263189538554036, "flos": 21769043143680.0, "grad_norm": 1.4680646437046334, "language_loss": 0.84383678, "learning_rate": 1.9266029579976785e-06, "loss": 0.86855829, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.22619629, "step": 8754, "time_per_iteration": 2.8734164237976074 }, { "auxiliary_loss_clip": 0.01437231, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.26441133, "balance_loss_mlp": 1.01537478, "epoch": 0.5263790771080715, "flos": 14282769674880.0, "grad_norm": 2.2041005046268083, "language_loss": 0.88337982, "learning_rate": 1.926213760058522e-06, "loss": 0.90813202, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.22619629, "step": 8755, "time_per_iteration": 2.8080687522888184 }, { "auxiliary_loss_clip": 0.01212621, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.11549759, "balance_loss_mlp": 1.00608623, "epoch": 0.5264392003607395, "flos": 65838820786560.0, "grad_norm": 0.722105104684321, "language_loss": 0.58907813, "learning_rate": 1.9258245649173477e-06, "loss": 0.61149311, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.22753906, "step": 8756, "time_per_iteration": 3.434338331222534 }, { "auxiliary_loss_clip": 0.01452085, "auxiliary_loss_mlp": 0.01040832, "balance_loss_clip": 1.27560067, "balance_loss_mlp": 1.01803958, "epoch": 0.5264993236134075, "flos": 21042183127680.0, "grad_norm": 1.615195511140322, "language_loss": 0.71164739, "learning_rate": 1.925435372588913e-06, "loss": 0.73657662, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.22790527, "step": 8757, "time_per_iteration": 2.837550401687622 }, { "auxiliary_loss_clip": 0.01433274, "auxiliary_loss_mlp": 0.01038605, "balance_loss_clip": 1.26276076, "balance_loss_mlp": 1.01593089, "epoch": 0.5265594468660755, "flos": 16626973478400.0, "grad_norm": 1.6331562858148623, "language_loss": 0.88430816, "learning_rate": 1.9250461830879768e-06, "loss": 0.90902698, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.22668457, "step": 8758, "time_per_iteration": 2.8085427284240723 }, { "auxiliary_loss_clip": 0.0143722, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.26342654, "balance_loss_mlp": 1.01468825, "epoch": 0.5266195701187434, "flos": 24144809834880.0, "grad_norm": 1.4375647532350224, "language_loss": 0.7682575, "learning_rate": 1.9246569964292965e-06, "loss": 0.79299951, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.22290039, "step": 8759, "time_per_iteration": 2.880958080291748 }, { "auxiliary_loss_clip": 0.01414367, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.24756849, "balance_loss_mlp": 1.01395798, "epoch": 0.5266796933714114, "flos": 15850633633920.0, "grad_norm": 2.1361868588963246, "language_loss": 0.72964072, "learning_rate": 1.9242678126276307e-06, "loss": 0.7541523, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.22851562, "step": 8760, "time_per_iteration": 2.828507423400879 }, { "auxiliary_loss_clip": 0.0144135, "auxiliary_loss_mlp": 0.01041912, "balance_loss_clip": 1.26634061, "balance_loss_mlp": 1.01932228, "epoch": 0.5267398166240793, "flos": 20959104395520.0, "grad_norm": 2.3683782942269658, "language_loss": 0.76521891, "learning_rate": 1.923878631697736e-06, "loss": 0.79005152, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.22595215, "step": 8761, "time_per_iteration": 2.8233888149261475 }, { "auxiliary_loss_clip": 0.01436695, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.26292765, "balance_loss_mlp": 1.01776433, "epoch": 0.5267999398767473, "flos": 21006276739200.0, "grad_norm": 1.88874835516363, "language_loss": 0.71764654, "learning_rate": 1.923489453654373e-06, "loss": 0.74241507, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.22375488, "step": 8762, "time_per_iteration": 4.186644554138184 }, { "auxiliary_loss_clip": 0.01209704, "auxiliary_loss_mlp": 0.01023211, "balance_loss_clip": 1.11212337, "balance_loss_mlp": 1.00060928, "epoch": 0.5268600631294152, "flos": 66878160526080.0, "grad_norm": 0.9398142954319888, "language_loss": 0.6556437, "learning_rate": 1.9231002785122963e-06, "loss": 0.67797279, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.22558594, "step": 8763, "time_per_iteration": 3.234480619430542 }, { "auxiliary_loss_clip": 0.01440548, "auxiliary_loss_mlp": 0.01036921, "balance_loss_clip": 1.26825118, "balance_loss_mlp": 1.01492715, "epoch": 0.5269201863820833, "flos": 17174482531200.0, "grad_norm": 1.8350991444776976, "language_loss": 0.71577322, "learning_rate": 1.922711106286265e-06, "loss": 0.7405479, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21972656, "step": 8764, "time_per_iteration": 5.644627809524536 }, { "auxiliary_loss_clip": 0.0143643, "auxiliary_loss_mlp": 0.01039476, "balance_loss_clip": 1.26360941, "balance_loss_mlp": 1.01659966, "epoch": 0.5269803096347513, "flos": 20532480744960.0, "grad_norm": 1.6227103207247704, "language_loss": 0.75380886, "learning_rate": 1.9223219369910368e-06, "loss": 0.77856791, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.22875977, "step": 8765, "time_per_iteration": 2.8590245246887207 }, { "auxiliary_loss_clip": 0.01430878, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.25701737, "balance_loss_mlp": 1.01097989, "epoch": 0.5270404328874192, "flos": 27241328494080.0, "grad_norm": 1.8664767557464925, "language_loss": 0.86115289, "learning_rate": 1.9219327706413677e-06, "loss": 0.88579994, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.2286377, "step": 8766, "time_per_iteration": 2.935728073120117 }, { "auxiliary_loss_clip": 0.01447417, "auxiliary_loss_mlp": 0.01041251, "balance_loss_clip": 1.27331936, "balance_loss_mlp": 1.01864862, "epoch": 0.5271005561400872, "flos": 23120382896640.0, "grad_norm": 2.271140646213792, "language_loss": 0.79440689, "learning_rate": 1.921543607252017e-06, "loss": 0.81929362, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.22607422, "step": 8767, "time_per_iteration": 2.894381284713745 }, { "auxiliary_loss_clip": 0.01448878, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.27418518, "balance_loss_mlp": 1.01576436, "epoch": 0.5271606793927551, "flos": 22574955104640.0, "grad_norm": 1.7247085110664362, "language_loss": 0.74474561, "learning_rate": 1.9211544468377394e-06, "loss": 0.76961923, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.22705078, "step": 8768, "time_per_iteration": 2.87921142578125 }, { "auxiliary_loss_clip": 0.01429389, "auxiliary_loss_mlp": 0.01039914, "balance_loss_clip": 1.25984907, "balance_loss_mlp": 1.01875472, "epoch": 0.5272208026454231, "flos": 18771873361920.0, "grad_norm": 1.815893473334963, "language_loss": 0.74980247, "learning_rate": 1.9207652894132933e-06, "loss": 0.77449548, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21154785, "step": 8769, "time_per_iteration": 2.8532793521881104 }, { "auxiliary_loss_clip": 0.01440246, "auxiliary_loss_mlp": 0.0104074, "balance_loss_clip": 1.2685138, "balance_loss_mlp": 1.01906753, "epoch": 0.5272809258980911, "flos": 20421775422720.0, "grad_norm": 1.7174099069385786, "language_loss": 0.74666184, "learning_rate": 1.920376134993436e-06, "loss": 0.77147162, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.21655273, "step": 8770, "time_per_iteration": 2.867903232574463 }, { "auxiliary_loss_clip": 0.01435546, "auxiliary_loss_mlp": 0.01038181, "balance_loss_clip": 1.26545715, "balance_loss_mlp": 1.01516175, "epoch": 0.5273410491507591, "flos": 28268424875520.0, "grad_norm": 1.6866715715570193, "language_loss": 0.68991208, "learning_rate": 1.9199869835929224e-06, "loss": 0.71464932, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.23034668, "step": 8771, "time_per_iteration": 2.9144198894500732 }, { "auxiliary_loss_clip": 0.01431217, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.26163673, "balance_loss_mlp": 1.01724529, "epoch": 0.527401172403427, "flos": 22465245168000.0, "grad_norm": 1.8694496159156737, "language_loss": 0.77158803, "learning_rate": 1.9195978352265115e-06, "loss": 0.79629529, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.22265625, "step": 8772, "time_per_iteration": 2.8367183208465576 }, { "auxiliary_loss_clip": 0.01437275, "auxiliary_loss_mlp": 0.01040457, "balance_loss_clip": 1.2645061, "balance_loss_mlp": 1.01699638, "epoch": 0.527461295656095, "flos": 21040509070080.0, "grad_norm": 36.85186210536831, "language_loss": 0.66610205, "learning_rate": 1.9192086899089585e-06, "loss": 0.69087934, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.23449707, "step": 8773, "time_per_iteration": 2.896742343902588 }, { "auxiliary_loss_clip": 0.01428821, "auxiliary_loss_mlp": 0.01037466, "balance_loss_clip": 1.25743854, "balance_loss_mlp": 1.0164969, "epoch": 0.5275214189087629, "flos": 26332945764480.0, "grad_norm": 1.4870884558201882, "language_loss": 0.86000347, "learning_rate": 1.91881954765502e-06, "loss": 0.88466632, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.20959473, "step": 8774, "time_per_iteration": 2.856257915496826 }, { "auxiliary_loss_clip": 0.01421994, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.25186896, "balance_loss_mlp": 1.01433444, "epoch": 0.5275815421614309, "flos": 20056467755520.0, "grad_norm": 2.130582152378828, "language_loss": 0.80706322, "learning_rate": 1.9184304084794523e-06, "loss": 0.83164573, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21923828, "step": 8775, "time_per_iteration": 2.8674657344818115 }, { "auxiliary_loss_clip": 0.01425198, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.25802803, "balance_loss_mlp": 1.01848507, "epoch": 0.5276416654140988, "flos": 21441677880960.0, "grad_norm": 1.7505815149034518, "language_loss": 0.84283531, "learning_rate": 1.918041272397012e-06, "loss": 0.86749727, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.22509766, "step": 8776, "time_per_iteration": 2.8411974906921387 }, { "auxiliary_loss_clip": 0.01439011, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.26621938, "balance_loss_mlp": 1.01203382, "epoch": 0.5277017886667669, "flos": 17173894348800.0, "grad_norm": 1.5801377867231254, "language_loss": 0.68097913, "learning_rate": 1.9176521394224547e-06, "loss": 0.70570725, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.21765137, "step": 8777, "time_per_iteration": 2.9549124240875244 }, { "auxiliary_loss_clip": 0.01421248, "auxiliary_loss_mlp": 0.01037985, "balance_loss_clip": 1.25229561, "balance_loss_mlp": 1.01626456, "epoch": 0.5277619119194349, "flos": 20457636566400.0, "grad_norm": 1.4146098540259497, "language_loss": 0.82898527, "learning_rate": 1.9172630095705358e-06, "loss": 0.85357761, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21728516, "step": 8778, "time_per_iteration": 2.911442518234253 }, { "auxiliary_loss_clip": 0.01439432, "auxiliary_loss_mlp": 0.01038439, "balance_loss_clip": 1.26654327, "balance_loss_mlp": 1.0162183, "epoch": 0.5278220351721028, "flos": 24071730203520.0, "grad_norm": 2.443825284861556, "language_loss": 0.80469275, "learning_rate": 1.916873882856013e-06, "loss": 0.82947147, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.22229004, "step": 8779, "time_per_iteration": 2.8582377433776855 }, { "auxiliary_loss_clip": 0.01427617, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.25877118, "balance_loss_mlp": 1.01420784, "epoch": 0.5278821584247708, "flos": 24653200118400.0, "grad_norm": 2.5326297173412886, "language_loss": 0.77327615, "learning_rate": 1.9164847592936406e-06, "loss": 0.79790211, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20776367, "step": 8780, "time_per_iteration": 2.8794655799865723 }, { "auxiliary_loss_clip": 0.01445579, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.26999724, "balance_loss_mlp": 1.01587784, "epoch": 0.5279422816774387, "flos": 35421858460800.0, "grad_norm": 1.6491312557827194, "language_loss": 0.70479405, "learning_rate": 1.916095638898174e-06, "loss": 0.72963035, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.22180176, "step": 8781, "time_per_iteration": 2.953895330429077 }, { "auxiliary_loss_clip": 0.01414393, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.24719918, "balance_loss_mlp": 1.01593077, "epoch": 0.5280024049301068, "flos": 22977526504320.0, "grad_norm": 1.5998695060798207, "language_loss": 0.72995842, "learning_rate": 1.9157065216843696e-06, "loss": 0.75446594, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20422363, "step": 8782, "time_per_iteration": 2.8601887226104736 }, { "auxiliary_loss_clip": 0.01414124, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.24758112, "balance_loss_mlp": 1.01265836, "epoch": 0.5280625281827747, "flos": 21517743669120.0, "grad_norm": 2.4411170503542787, "language_loss": 0.69207215, "learning_rate": 1.915317407666982e-06, "loss": 0.71655142, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21154785, "step": 8783, "time_per_iteration": 2.849311113357544 }, { "auxiliary_loss_clip": 0.01468234, "auxiliary_loss_mlp": 0.01043568, "balance_loss_clip": 1.28824484, "balance_loss_mlp": 1.02050138, "epoch": 0.5281226514354427, "flos": 31219598678400.0, "grad_norm": 1.6110857811698185, "language_loss": 0.70075589, "learning_rate": 1.9149282968607674e-06, "loss": 0.72587383, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 1.79882812, "router_z_loss_mlp": 0.23059082, "step": 8784, "time_per_iteration": 2.928645133972168 }, { "auxiliary_loss_clip": 0.01456178, "auxiliary_loss_mlp": 0.01034682, "balance_loss_clip": 1.27779603, "balance_loss_mlp": 1.01339149, "epoch": 0.5281827746881106, "flos": 25087470140160.0, "grad_norm": 2.106533408456001, "language_loss": 0.76252258, "learning_rate": 1.91453918928048e-06, "loss": 0.78743118, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 1.78613281, "router_z_loss_mlp": 0.21289062, "step": 8785, "time_per_iteration": 2.913539409637451 }, { "auxiliary_loss_clip": 0.01431852, "auxiliary_loss_mlp": 0.01037426, "balance_loss_clip": 1.26232719, "balance_loss_mlp": 1.01420379, "epoch": 0.5282428979407786, "flos": 20640969072000.0, "grad_norm": 1.6262516711294497, "language_loss": 0.83996463, "learning_rate": 1.9141500849408745e-06, "loss": 0.8646574, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.2322998, "step": 8786, "time_per_iteration": 4.225078821182251 }, { "auxiliary_loss_clip": 0.01413344, "auxiliary_loss_mlp": 0.01039985, "balance_loss_clip": 1.24718666, "balance_loss_mlp": 1.01966023, "epoch": 0.5283030211934465, "flos": 22429655493120.0, "grad_norm": 2.508715494208792, "language_loss": 0.83901787, "learning_rate": 1.9137609838567076e-06, "loss": 0.86355114, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20324707, "step": 8787, "time_per_iteration": 2.909749746322632 }, { "auxiliary_loss_clip": 0.01424772, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.25661111, "balance_loss_mlp": 1.01114082, "epoch": 0.5283631444461145, "flos": 23623977231360.0, "grad_norm": 2.0730698438651776, "language_loss": 0.84151733, "learning_rate": 1.9133718860427316e-06, "loss": 0.86607414, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19750977, "step": 8788, "time_per_iteration": 2.842941999435425 }, { "auxiliary_loss_clip": 0.01424965, "auxiliary_loss_mlp": 0.01042655, "balance_loss_clip": 1.25789547, "balance_loss_mlp": 1.02131701, "epoch": 0.5284232676987825, "flos": 32684946624000.0, "grad_norm": 3.48355726910227, "language_loss": 0.75692397, "learning_rate": 1.9129827915137027e-06, "loss": 0.78160024, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.21337891, "step": 8789, "time_per_iteration": 2.9299349784851074 }, { "auxiliary_loss_clip": 0.01435875, "auxiliary_loss_mlp": 0.01036682, "balance_loss_clip": 1.26517451, "balance_loss_mlp": 1.01599884, "epoch": 0.5284833909514505, "flos": 26772237959040.0, "grad_norm": 1.6230245830002972, "language_loss": 0.71058965, "learning_rate": 1.9125937002843754e-06, "loss": 0.7353152, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20678711, "step": 8790, "time_per_iteration": 2.949162483215332 }, { "auxiliary_loss_clip": 0.01429687, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 1.26212442, "balance_loss_mlp": 1.01143479, "epoch": 0.5285435142041185, "flos": 22100797152000.0, "grad_norm": 1.641425080174639, "language_loss": 0.79404342, "learning_rate": 1.9122046123695036e-06, "loss": 0.81864619, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19165039, "step": 8791, "time_per_iteration": 2.832998275756836 }, { "auxiliary_loss_clip": 0.01427417, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.26080084, "balance_loss_mlp": 1.009884, "epoch": 0.5286036374567864, "flos": 20385009383040.0, "grad_norm": 2.176251538812177, "language_loss": 0.67526037, "learning_rate": 1.9118155277838423e-06, "loss": 0.6998415, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20812988, "step": 8792, "time_per_iteration": 2.835094451904297 }, { "auxiliary_loss_clip": 0.01423296, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.25520039, "balance_loss_mlp": 1.01627338, "epoch": 0.5286637607094544, "flos": 24362691384960.0, "grad_norm": 1.8915353409360078, "language_loss": 0.80345666, "learning_rate": 1.9114264465421443e-06, "loss": 0.82805634, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20397949, "step": 8793, "time_per_iteration": 2.8584837913513184 }, { "auxiliary_loss_clip": 0.01429303, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.26048946, "balance_loss_mlp": 1.0203526, "epoch": 0.5287238839621223, "flos": 17279125050240.0, "grad_norm": 2.0118201908760502, "language_loss": 0.85937744, "learning_rate": 1.9110373686591645e-06, "loss": 0.88408691, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21289062, "step": 8794, "time_per_iteration": 2.802241802215576 }, { "auxiliary_loss_clip": 0.01448583, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.27307916, "balance_loss_mlp": 1.0177381, "epoch": 0.5287840072147904, "flos": 17575922810880.0, "grad_norm": 2.5349009397731286, "language_loss": 0.69198364, "learning_rate": 1.9106482941496564e-06, "loss": 0.71685159, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 1.75585938, "router_z_loss_mlp": 0.20495605, "step": 8795, "time_per_iteration": 2.8242945671081543 }, { "auxiliary_loss_clip": 0.01434934, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.26356149, "balance_loss_mlp": 1.01594949, "epoch": 0.5288441304674583, "flos": 18561411959040.0, "grad_norm": 1.8190963174994497, "language_loss": 0.81624967, "learning_rate": 1.910259223028374e-06, "loss": 0.84096819, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.20983887, "step": 8796, "time_per_iteration": 2.8238120079040527 }, { "auxiliary_loss_clip": 0.014329, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.26315522, "balance_loss_mlp": 1.01460898, "epoch": 0.5289042537201263, "flos": 20824482556800.0, "grad_norm": 2.1511222148454823, "language_loss": 0.70159656, "learning_rate": 1.909870155310071e-06, "loss": 0.7262845, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21289062, "step": 8797, "time_per_iteration": 4.285072565078735 }, { "auxiliary_loss_clip": 0.01419048, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.25426602, "balance_loss_mlp": 1.01644981, "epoch": 0.5289643769727942, "flos": 15741919082880.0, "grad_norm": 1.5283411544731922, "language_loss": 0.82553327, "learning_rate": 1.9094810910095005e-06, "loss": 0.85009408, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20581055, "step": 8798, "time_per_iteration": 4.2175164222717285 }, { "auxiliary_loss_clip": 0.01449727, "auxiliary_loss_mlp": 0.01038619, "balance_loss_clip": 1.27510905, "balance_loss_mlp": 1.01637471, "epoch": 0.5290245002254622, "flos": 19546901107200.0, "grad_norm": 2.3822751606281987, "language_loss": 0.71225846, "learning_rate": 1.9090920301414166e-06, "loss": 0.73714197, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.22229004, "step": 8799, "time_per_iteration": 4.270482540130615 }, { "auxiliary_loss_clip": 0.01413544, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.24977696, "balance_loss_mlp": 1.0207665, "epoch": 0.5290846234781301, "flos": 15823595226240.0, "grad_norm": 2.260616545833147, "language_loss": 0.70517373, "learning_rate": 1.9087029727205716e-06, "loss": 0.72972137, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20458984, "step": 8800, "time_per_iteration": 2.812929630279541 }, { "auxiliary_loss_clip": 0.01206812, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.11499953, "balance_loss_mlp": 1.01201975, "epoch": 0.5291447467307981, "flos": 70086560872320.0, "grad_norm": 0.9692953755789581, "language_loss": 0.57007611, "learning_rate": 1.9083139187617193e-06, "loss": 0.59242272, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.15820312, "step": 8801, "time_per_iteration": 3.243163824081421 }, { "auxiliary_loss_clip": 0.01431042, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.26005459, "balance_loss_mlp": 1.01737082, "epoch": 0.529204869983466, "flos": 28375374879360.0, "grad_norm": 1.5427075698018011, "language_loss": 0.64754599, "learning_rate": 1.9079248682796123e-06, "loss": 0.6722368, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20666504, "step": 8802, "time_per_iteration": 2.920804500579834 }, { "auxiliary_loss_clip": 0.01430239, "auxiliary_loss_mlp": 0.01041722, "balance_loss_clip": 1.26325619, "balance_loss_mlp": 1.02102709, "epoch": 0.5292649932361341, "flos": 33770056118400.0, "grad_norm": 1.704249799922217, "language_loss": 0.6958949, "learning_rate": 1.907535821289003e-06, "loss": 0.72061449, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20690918, "step": 8803, "time_per_iteration": 2.9439170360565186 }, { "auxiliary_loss_clip": 0.0142439, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.25820971, "balance_loss_mlp": 1.01847434, "epoch": 0.5293251164888021, "flos": 20457048384000.0, "grad_norm": 1.9177814334077956, "language_loss": 0.766922, "learning_rate": 1.9071467778046458e-06, "loss": 0.79155171, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20117188, "step": 8804, "time_per_iteration": 2.9329490661621094 }, { "auxiliary_loss_clip": 0.01207574, "auxiliary_loss_mlp": 0.01023298, "balance_loss_clip": 1.1150403, "balance_loss_mlp": 1.0055598, "epoch": 0.52938523974147, "flos": 66580294872960.0, "grad_norm": 0.7588912456657476, "language_loss": 0.53032529, "learning_rate": 1.906757737841291e-06, "loss": 0.552634, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.17773438, "step": 8805, "time_per_iteration": 3.391428232192993 }, { "auxiliary_loss_clip": 0.01208627, "auxiliary_loss_mlp": 0.01018533, "balance_loss_clip": 1.11675096, "balance_loss_mlp": 1.00270164, "epoch": 0.529445362994138, "flos": 67183283329920.0, "grad_norm": 0.742572667260767, "language_loss": 0.63882744, "learning_rate": 1.906368701413693e-06, "loss": 0.66109896, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.15820312, "step": 8806, "time_per_iteration": 3.2838244438171387 }, { "auxiliary_loss_clip": 0.01443915, "auxiliary_loss_mlp": 0.01040062, "balance_loss_clip": 1.26749098, "balance_loss_mlp": 1.01921272, "epoch": 0.5295054862468059, "flos": 17758395665280.0, "grad_norm": 1.665566081116359, "language_loss": 0.72987866, "learning_rate": 1.9059796685366026e-06, "loss": 0.75471848, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 1.76464844, "router_z_loss_mlp": 0.20861816, "step": 8807, "time_per_iteration": 2.891240358352661 }, { "auxiliary_loss_clip": 0.01426941, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.2594502, "balance_loss_mlp": 1.02157211, "epoch": 0.529565609499474, "flos": 11403951586560.0, "grad_norm": 2.176363969644659, "language_loss": 0.71134853, "learning_rate": 1.9055906392247723e-06, "loss": 0.73602957, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19592285, "step": 8808, "time_per_iteration": 2.8540472984313965 }, { "auxiliary_loss_clip": 0.01431724, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.26265073, "balance_loss_mlp": 1.01947427, "epoch": 0.5296257327521419, "flos": 17203828423680.0, "grad_norm": 1.7727718572722577, "language_loss": 0.87823373, "learning_rate": 1.9052016134929554e-06, "loss": 0.90293992, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19396973, "step": 8809, "time_per_iteration": 2.844578266143799 }, { "auxiliary_loss_clip": 0.0146029, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.28154755, "balance_loss_mlp": 1.01554775, "epoch": 0.5296858560048099, "flos": 39977617017600.0, "grad_norm": 1.6554312032956429, "language_loss": 0.64825422, "learning_rate": 1.9048125913559016e-06, "loss": 0.67323446, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 1.79003906, "router_z_loss_mlp": 0.22167969, "step": 8810, "time_per_iteration": 2.96635365486145 }, { "auxiliary_loss_clip": 0.01420454, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.25365555, "balance_loss_mlp": 1.01677465, "epoch": 0.5297459792574778, "flos": 20971682449920.0, "grad_norm": 2.73870256573814, "language_loss": 0.68571365, "learning_rate": 1.9044235728283646e-06, "loss": 0.71028244, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1965332, "step": 8811, "time_per_iteration": 2.8747780323028564 }, { "auxiliary_loss_clip": 0.01201562, "auxiliary_loss_mlp": 0.01017434, "balance_loss_clip": 1.1095103, "balance_loss_mlp": 1.00141239, "epoch": 0.5298061025101458, "flos": 66552967013760.0, "grad_norm": 0.6626694853394004, "language_loss": 0.53414667, "learning_rate": 1.9040345579250953e-06, "loss": 0.55633664, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.16015625, "step": 8812, "time_per_iteration": 3.4563448429107666 }, { "auxiliary_loss_clip": 0.01201976, "auxiliary_loss_mlp": 0.01012123, "balance_loss_clip": 1.10981917, "balance_loss_mlp": 0.99762672, "epoch": 0.5298662257628137, "flos": 67694795504640.0, "grad_norm": 0.7274847592762559, "language_loss": 0.56322312, "learning_rate": 1.9036455466608453e-06, "loss": 0.5853641, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.14453125, "step": 8813, "time_per_iteration": 3.3455958366394043 }, { "auxiliary_loss_clip": 0.01423777, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.258394, "balance_loss_mlp": 1.01588273, "epoch": 0.5299263490154817, "flos": 19655932371840.0, "grad_norm": 1.7962616070249795, "language_loss": 0.82430863, "learning_rate": 1.9032565390503657e-06, "loss": 0.84890985, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20483398, "step": 8814, "time_per_iteration": 2.830716609954834 }, { "auxiliary_loss_clip": 0.01458145, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.28235793, "balance_loss_mlp": 1.01853633, "epoch": 0.5299864722681497, "flos": 22065297966720.0, "grad_norm": 1.5774213463902094, "language_loss": 0.85730016, "learning_rate": 1.9028675351084076e-06, "loss": 0.8822751, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.20812988, "step": 8815, "time_per_iteration": 2.849721908569336 }, { "auxiliary_loss_clip": 0.01423864, "auxiliary_loss_mlp": 0.01033723, "balance_loss_clip": 1.2584933, "balance_loss_mlp": 1.01414895, "epoch": 0.5300465955208177, "flos": 21774065316480.0, "grad_norm": 4.137982845746858, "language_loss": 0.67364442, "learning_rate": 1.9024785348497225e-06, "loss": 0.69822031, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19555664, "step": 8816, "time_per_iteration": 2.843815565109253 }, { "auxiliary_loss_clip": 0.0143019, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.26112032, "balance_loss_mlp": 1.02061462, "epoch": 0.5301067187734857, "flos": 43011869552640.0, "grad_norm": 1.8719714488835064, "language_loss": 0.73104727, "learning_rate": 1.9020895382890611e-06, "loss": 0.75575477, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.19946289, "step": 8817, "time_per_iteration": 3.0196549892425537 }, { "auxiliary_loss_clip": 0.01430428, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.25958014, "balance_loss_mlp": 1.01481485, "epoch": 0.5301668420261536, "flos": 20562957757440.0, "grad_norm": 1.6892600829755202, "language_loss": 0.6592443, "learning_rate": 1.9017005454411743e-06, "loss": 0.68390989, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.21350098, "step": 8818, "time_per_iteration": 2.873847723007202 }, { "auxiliary_loss_clip": 0.01434893, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.26367927, "balance_loss_mlp": 1.01630354, "epoch": 0.5302269652788216, "flos": 17493794219520.0, "grad_norm": 3.8001029537097337, "language_loss": 0.76059425, "learning_rate": 1.9013115563208126e-06, "loss": 0.78531659, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21032715, "step": 8819, "time_per_iteration": 2.8446319103240967 }, { "auxiliary_loss_clip": 0.01453134, "auxiliary_loss_mlp": 0.01043094, "balance_loss_clip": 1.27845478, "balance_loss_mlp": 1.02266157, "epoch": 0.5302870885314895, "flos": 14582236878720.0, "grad_norm": 1.9730044568934135, "language_loss": 0.82571322, "learning_rate": 1.9009225709427267e-06, "loss": 0.85067546, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 1.74804688, "router_z_loss_mlp": 0.2043457, "step": 8820, "time_per_iteration": 2.9303741455078125 }, { "auxiliary_loss_clip": 0.01428979, "auxiliary_loss_mlp": 0.01036017, "balance_loss_clip": 1.25876093, "balance_loss_mlp": 1.0163238, "epoch": 0.5303472117841576, "flos": 23447793404160.0, "grad_norm": 1.7059309158588933, "language_loss": 0.72701776, "learning_rate": 1.9005335893216667e-06, "loss": 0.75166774, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.19689941, "step": 8821, "time_per_iteration": 4.348541498184204 }, { "auxiliary_loss_clip": 0.01425363, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.25757766, "balance_loss_mlp": 1.02154827, "epoch": 0.5304073350368255, "flos": 22718761637760.0, "grad_norm": 1.5097278569951182, "language_loss": 0.74559736, "learning_rate": 1.9001446114723824e-06, "loss": 0.77025306, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.18676758, "step": 8822, "time_per_iteration": 2.8640997409820557 }, { "auxiliary_loss_clip": 0.01429934, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.25927961, "balance_loss_mlp": 1.01645803, "epoch": 0.5304674582894935, "flos": 27940199961600.0, "grad_norm": 1.8496115534577406, "language_loss": 0.68094337, "learning_rate": 1.8997556374096257e-06, "loss": 0.70562017, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.21289062, "step": 8823, "time_per_iteration": 2.8662829399108887 }, { "auxiliary_loss_clip": 0.01443168, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.27090812, "balance_loss_mlp": 1.01392686, "epoch": 0.5305275815421614, "flos": 21260290901760.0, "grad_norm": 1.6996458295104548, "language_loss": 0.69972324, "learning_rate": 1.8993666671481444e-06, "loss": 0.72450298, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20861816, "step": 8824, "time_per_iteration": 2.831275224685669 }, { "auxiliary_loss_clip": 0.01418817, "auxiliary_loss_mlp": 0.01038077, "balance_loss_clip": 1.25349808, "balance_loss_mlp": 1.01843095, "epoch": 0.5305877047948294, "flos": 17612055423360.0, "grad_norm": 2.8203133687300044, "language_loss": 0.77010268, "learning_rate": 1.898977700702689e-06, "loss": 0.79467165, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1965332, "step": 8825, "time_per_iteration": 2.8152706623077393 }, { "auxiliary_loss_clip": 0.01413988, "auxiliary_loss_mlp": 0.01040204, "balance_loss_clip": 1.24860632, "balance_loss_mlp": 1.01919973, "epoch": 0.5306478280474973, "flos": 15203956682880.0, "grad_norm": 1.8188845261006807, "language_loss": 0.86510086, "learning_rate": 1.8985887380880103e-06, "loss": 0.88964272, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20996094, "step": 8826, "time_per_iteration": 2.8048508167266846 }, { "auxiliary_loss_clip": 0.01423737, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.25773478, "balance_loss_mlp": 1.01405358, "epoch": 0.5307079513001653, "flos": 15349663497600.0, "grad_norm": 1.487386710268766, "language_loss": 0.64777899, "learning_rate": 1.8981997793188558e-06, "loss": 0.67235887, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.2019043, "step": 8827, "time_per_iteration": 2.8064804077148438 }, { "auxiliary_loss_clip": 0.01431408, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.2617321, "balance_loss_mlp": 1.01652908, "epoch": 0.5307680745528333, "flos": 43560147767040.0, "grad_norm": 1.5615440184545537, "language_loss": 0.60281485, "learning_rate": 1.8978108244099762e-06, "loss": 0.62749797, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20361328, "step": 8828, "time_per_iteration": 3.031874179840088 }, { "auxiliary_loss_clip": 0.01451571, "auxiliary_loss_mlp": 0.01040044, "balance_loss_clip": 1.27876413, "balance_loss_mlp": 1.01914668, "epoch": 0.5308281978055013, "flos": 20058865729920.0, "grad_norm": 2.1331856062856334, "language_loss": 0.82011205, "learning_rate": 1.8974218733761208e-06, "loss": 0.84502816, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.20910645, "step": 8829, "time_per_iteration": 2.8473265171051025 }, { "auxiliary_loss_clip": 0.0141749, "auxiliary_loss_mlp": 0.01031561, "balance_loss_clip": 1.25106442, "balance_loss_mlp": 1.01136696, "epoch": 0.5308883210581693, "flos": 20713596255360.0, "grad_norm": 1.4228037860535858, "language_loss": 0.78610235, "learning_rate": 1.8970329262320375e-06, "loss": 0.81059289, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.2019043, "step": 8830, "time_per_iteration": 2.798396348953247 }, { "auxiliary_loss_clip": 0.01427324, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.25803602, "balance_loss_mlp": 1.01364875, "epoch": 0.5309484443108372, "flos": 14363540922240.0, "grad_norm": 2.077584187931709, "language_loss": 0.81346893, "learning_rate": 1.8966439829924768e-06, "loss": 0.83807033, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19177246, "step": 8831, "time_per_iteration": 2.792092800140381 }, { "auxiliary_loss_clip": 0.01420556, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.25305462, "balance_loss_mlp": 1.01466584, "epoch": 0.5310085675635052, "flos": 20019927939840.0, "grad_norm": 1.839931995524094, "language_loss": 0.74610317, "learning_rate": 1.896255043672186e-06, "loss": 0.77065384, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19848633, "step": 8832, "time_per_iteration": 4.2181665897369385 }, { "auxiliary_loss_clip": 0.01443681, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.27119756, "balance_loss_mlp": 1.01904392, "epoch": 0.5310686908161731, "flos": 22137427457280.0, "grad_norm": 1.901835911119162, "language_loss": 0.76472789, "learning_rate": 1.8958661082859143e-06, "loss": 0.78955543, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.20031738, "step": 8833, "time_per_iteration": 4.290328025817871 }, { "auxiliary_loss_clip": 0.01439676, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.26763177, "balance_loss_mlp": 1.01788187, "epoch": 0.5311288140688412, "flos": 24728722968960.0, "grad_norm": 1.738875731957632, "language_loss": 0.73832065, "learning_rate": 1.8954771768484103e-06, "loss": 0.76309067, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.19421387, "step": 8834, "time_per_iteration": 4.296040058135986 }, { "auxiliary_loss_clip": 0.01455111, "auxiliary_loss_mlp": 0.01040422, "balance_loss_clip": 1.27741671, "balance_loss_mlp": 1.02034688, "epoch": 0.5311889373215091, "flos": 24108405753600.0, "grad_norm": 2.1236175092062726, "language_loss": 0.78863287, "learning_rate": 1.8950882493744226e-06, "loss": 0.81358826, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 1.77636719, "router_z_loss_mlp": 0.20068359, "step": 8835, "time_per_iteration": 2.8598556518554688 }, { "auxiliary_loss_clip": 0.01422377, "auxiliary_loss_mlp": 0.01033049, "balance_loss_clip": 1.25341153, "balance_loss_mlp": 1.01101923, "epoch": 0.5312490605741771, "flos": 22026903114240.0, "grad_norm": 1.7726651596003606, "language_loss": 0.72693956, "learning_rate": 1.8946993258786985e-06, "loss": 0.75149381, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.22058105, "step": 8836, "time_per_iteration": 2.855607032775879 }, { "auxiliary_loss_clip": 0.01426411, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.25476658, "balance_loss_mlp": 1.01539063, "epoch": 0.531309183826845, "flos": 19399882193280.0, "grad_norm": 1.6702781930271118, "language_loss": 0.81195903, "learning_rate": 1.894310406375987e-06, "loss": 0.83659542, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.21838379, "step": 8837, "time_per_iteration": 2.811922311782837 }, { "auxiliary_loss_clip": 0.01416523, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.25048339, "balance_loss_mlp": 1.01445031, "epoch": 0.531369307079513, "flos": 20198781210240.0, "grad_norm": 3.5069943109706596, "language_loss": 0.86963886, "learning_rate": 1.893921490881035e-06, "loss": 0.89415681, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20825195, "step": 8838, "time_per_iteration": 2.8138225078582764 }, { "auxiliary_loss_clip": 0.01415656, "auxiliary_loss_mlp": 0.0103314, "balance_loss_clip": 1.24957848, "balance_loss_mlp": 1.01251662, "epoch": 0.5314294303321809, "flos": 18889455893760.0, "grad_norm": 1.699227251714988, "language_loss": 0.73668069, "learning_rate": 1.8935325794085906e-06, "loss": 0.7611686, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20617676, "step": 8839, "time_per_iteration": 2.959918737411499 }, { "auxiliary_loss_clip": 0.01425869, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.2552731, "balance_loss_mlp": 1.01712167, "epoch": 0.531489553584849, "flos": 23050560890880.0, "grad_norm": 5.841162525210617, "language_loss": 0.77264905, "learning_rate": 1.8931436719734023e-06, "loss": 0.79728258, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20373535, "step": 8840, "time_per_iteration": 2.9403014183044434 }, { "auxiliary_loss_clip": 0.01429243, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.25798023, "balance_loss_mlp": 1.01591277, "epoch": 0.5315496768375169, "flos": 19799557925760.0, "grad_norm": 3.597842799302848, "language_loss": 0.77347648, "learning_rate": 1.892754768590216e-06, "loss": 0.79815555, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.22729492, "step": 8841, "time_per_iteration": 2.863783121109009 }, { "auxiliary_loss_clip": 0.0121127, "auxiliary_loss_mlp": 0.01023734, "balance_loss_clip": 1.11670947, "balance_loss_mlp": 1.00933349, "epoch": 0.5316098000901849, "flos": 71056820505600.0, "grad_norm": 0.6960591058827164, "language_loss": 0.56823635, "learning_rate": 1.8923658692737793e-06, "loss": 0.59058642, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.14355469, "step": 8842, "time_per_iteration": 3.492741346359253 }, { "auxiliary_loss_clip": 0.01440013, "auxiliary_loss_mlp": 0.01034819, "balance_loss_clip": 1.26711655, "balance_loss_mlp": 1.01455331, "epoch": 0.5316699233428529, "flos": 16444229155200.0, "grad_norm": 1.6812562413717513, "language_loss": 0.74289036, "learning_rate": 1.8919769740388407e-06, "loss": 0.76763868, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.20263672, "step": 8843, "time_per_iteration": 2.8766441345214844 }, { "auxiliary_loss_clip": 0.01210103, "auxiliary_loss_mlp": 0.01017871, "balance_loss_clip": 1.11604214, "balance_loss_mlp": 1.00213552, "epoch": 0.5317300465955208, "flos": 67455983594880.0, "grad_norm": 0.8830084509329251, "language_loss": 0.61160439, "learning_rate": 1.891588082900145e-06, "loss": 0.63388413, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.15722656, "step": 8844, "time_per_iteration": 3.3341541290283203 }, { "auxiliary_loss_clip": 0.01213624, "auxiliary_loss_mlp": 0.01017718, "balance_loss_clip": 1.11753523, "balance_loss_mlp": 1.00245881, "epoch": 0.5317901698481888, "flos": 59532933657600.0, "grad_norm": 0.8439931980558784, "language_loss": 0.62209809, "learning_rate": 1.8911991958724411e-06, "loss": 0.6444115, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.15234375, "step": 8845, "time_per_iteration": 3.3040082454681396 }, { "auxiliary_loss_clip": 0.01427752, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.25956023, "balance_loss_mlp": 1.01327562, "epoch": 0.5318502931008567, "flos": 19136954805120.0, "grad_norm": 2.226302747796165, "language_loss": 0.76895672, "learning_rate": 1.890810312970474e-06, "loss": 0.79358351, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21643066, "step": 8846, "time_per_iteration": 2.8570191860198975 }, { "auxiliary_loss_clip": 0.01431652, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.26033282, "balance_loss_mlp": 1.01730597, "epoch": 0.5319104163535248, "flos": 24691775950080.0, "grad_norm": 1.6989911170044525, "language_loss": 0.76413083, "learning_rate": 1.8904214342089903e-06, "loss": 0.78881574, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.19543457, "step": 8847, "time_per_iteration": 2.879509687423706 }, { "auxiliary_loss_clip": 0.01415444, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.24900901, "balance_loss_mlp": 1.0189147, "epoch": 0.5319705396061927, "flos": 19393547921280.0, "grad_norm": 1.5646435578738025, "language_loss": 0.88190579, "learning_rate": 1.8900325596027378e-06, "loss": 0.90644711, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19775391, "step": 8848, "time_per_iteration": 2.8821699619293213 }, { "auxiliary_loss_clip": 0.01426462, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.25627804, "balance_loss_mlp": 1.01851416, "epoch": 0.5320306628588607, "flos": 18268324272000.0, "grad_norm": 2.598598248842916, "language_loss": 0.75547963, "learning_rate": 1.8896436891664609e-06, "loss": 0.78014135, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.21203613, "step": 8849, "time_per_iteration": 2.8607053756713867 }, { "auxiliary_loss_clip": 0.01439759, "auxiliary_loss_mlp": 0.01038551, "balance_loss_clip": 1.26564872, "balance_loss_mlp": 1.01723695, "epoch": 0.5320907861115286, "flos": 23742781372800.0, "grad_norm": 2.863602753926276, "language_loss": 0.8051492, "learning_rate": 1.8892548229149066e-06, "loss": 0.82993233, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21276855, "step": 8850, "time_per_iteration": 2.8167285919189453 }, { "auxiliary_loss_clip": 0.01417076, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.24883342, "balance_loss_mlp": 1.01587033, "epoch": 0.5321509093641966, "flos": 34508408313600.0, "grad_norm": 1.3634999708125297, "language_loss": 0.55440974, "learning_rate": 1.888865960862821e-06, "loss": 0.57894796, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20874023, "step": 8851, "time_per_iteration": 2.9646615982055664 }, { "auxiliary_loss_clip": 0.01437525, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.26626492, "balance_loss_mlp": 1.02094579, "epoch": 0.5322110326168645, "flos": 20020470877440.0, "grad_norm": 2.0753918674280922, "language_loss": 0.69481426, "learning_rate": 1.8884771030249484e-06, "loss": 0.71959448, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.19555664, "step": 8852, "time_per_iteration": 2.798671007156372 }, { "auxiliary_loss_clip": 0.01208746, "auxiliary_loss_mlp": 0.01026063, "balance_loss_clip": 1.1162287, "balance_loss_mlp": 1.00956464, "epoch": 0.5322711558695326, "flos": 64661266886400.0, "grad_norm": 0.8044294190472469, "language_loss": 0.62994313, "learning_rate": 1.8880882494160357e-06, "loss": 0.65229124, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.16503906, "step": 8853, "time_per_iteration": 3.3102242946624756 }, { "auxiliary_loss_clip": 0.01443286, "auxiliary_loss_mlp": 0.01044268, "balance_loss_clip": 1.26803732, "balance_loss_mlp": 1.02444315, "epoch": 0.5323312791222005, "flos": 14947544545920.0, "grad_norm": 2.4999240924995507, "language_loss": 0.80914903, "learning_rate": 1.8876994000508278e-06, "loss": 0.83402455, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.19824219, "step": 8854, "time_per_iteration": 2.7819952964782715 }, { "auxiliary_loss_clip": 0.01413758, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.24979234, "balance_loss_mlp": 1.01551235, "epoch": 0.5323914023748685, "flos": 23451322498560.0, "grad_norm": 1.8176025394958988, "language_loss": 0.74519753, "learning_rate": 1.8873105549440698e-06, "loss": 0.76967919, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18884277, "step": 8855, "time_per_iteration": 2.835422992706299 }, { "auxiliary_loss_clip": 0.01422524, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.25471759, "balance_loss_mlp": 1.01562965, "epoch": 0.5324515256275365, "flos": 26297265600000.0, "grad_norm": 2.1642255286287897, "language_loss": 0.66275465, "learning_rate": 1.886921714110507e-06, "loss": 0.68732679, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19055176, "step": 8856, "time_per_iteration": 4.3179240226745605 }, { "auxiliary_loss_clip": 0.01433811, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.26088095, "balance_loss_mlp": 1.01590073, "epoch": 0.5325116488802044, "flos": 26882400343680.0, "grad_norm": 2.381173755041629, "language_loss": 0.78076309, "learning_rate": 1.8865328775648842e-06, "loss": 0.80546331, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.203125, "step": 8857, "time_per_iteration": 2.854440689086914 }, { "auxiliary_loss_clip": 0.01430457, "auxiliary_loss_mlp": 0.01040316, "balance_loss_clip": 1.26102209, "balance_loss_mlp": 1.01971674, "epoch": 0.5325717721328724, "flos": 25895689585920.0, "grad_norm": 17.638073917407514, "language_loss": 0.71705532, "learning_rate": 1.8861440453219456e-06, "loss": 0.741763, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20605469, "step": 8858, "time_per_iteration": 2.845877170562744 }, { "auxiliary_loss_clip": 0.01429206, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.25950623, "balance_loss_mlp": 1.02069104, "epoch": 0.5326318953855403, "flos": 21809474012160.0, "grad_norm": 1.8458563322804848, "language_loss": 0.70355803, "learning_rate": 1.8857552173964367e-06, "loss": 0.72827005, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21313477, "step": 8859, "time_per_iteration": 2.9480140209198 }, { "auxiliary_loss_clip": 0.01416178, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.25220227, "balance_loss_mlp": 1.01671731, "epoch": 0.5326920186382084, "flos": 20932654170240.0, "grad_norm": 1.5625653568535898, "language_loss": 0.70179343, "learning_rate": 1.8853663938031013e-06, "loss": 0.72631812, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19567871, "step": 8860, "time_per_iteration": 2.8792800903320312 }, { "auxiliary_loss_clip": 0.01419947, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.2535336, "balance_loss_mlp": 1.014539, "epoch": 0.5327521418908763, "flos": 21443216204160.0, "grad_norm": 2.045119622674687, "language_loss": 0.7853806, "learning_rate": 1.884977574556683e-06, "loss": 0.80992156, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19604492, "step": 8861, "time_per_iteration": 2.85494327545166 }, { "auxiliary_loss_clip": 0.01427386, "auxiliary_loss_mlp": 0.01041047, "balance_loss_clip": 1.25754976, "balance_loss_mlp": 1.01945782, "epoch": 0.5328122651435443, "flos": 21769721815680.0, "grad_norm": 2.1045457623875357, "language_loss": 0.86404967, "learning_rate": 1.8845887596719279e-06, "loss": 0.88873404, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21594238, "step": 8862, "time_per_iteration": 3.0078816413879395 }, { "auxiliary_loss_clip": 0.01431251, "auxiliary_loss_mlp": 0.01034622, "balance_loss_clip": 1.25851583, "balance_loss_mlp": 1.01185298, "epoch": 0.5328723883962122, "flos": 18305678494080.0, "grad_norm": 1.9433975953215303, "language_loss": 0.62562531, "learning_rate": 1.8841999491635778e-06, "loss": 0.65028405, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.2277832, "step": 8863, "time_per_iteration": 2.8746144771575928 }, { "auxiliary_loss_clip": 0.01422817, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.25823522, "balance_loss_mlp": 1.01678491, "epoch": 0.5329325116488802, "flos": 25385987203200.0, "grad_norm": 2.859434726164187, "language_loss": 0.74938512, "learning_rate": 1.883811143046377e-06, "loss": 0.7739867, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20556641, "step": 8864, "time_per_iteration": 2.8922362327575684 }, { "auxiliary_loss_clip": 0.01414509, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.24856043, "balance_loss_mlp": 1.01541877, "epoch": 0.5329926349015481, "flos": 25602782878080.0, "grad_norm": 1.989030663091117, "language_loss": 0.65468109, "learning_rate": 1.8834223413350702e-06, "loss": 0.67918611, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20581055, "step": 8865, "time_per_iteration": 2.9081478118896484 }, { "auxiliary_loss_clip": 0.01433147, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.26530325, "balance_loss_mlp": 1.01253295, "epoch": 0.5330527581542162, "flos": 22898927007360.0, "grad_norm": 2.0235807801460894, "language_loss": 0.79424822, "learning_rate": 1.8830335440443989e-06, "loss": 0.81890774, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20251465, "step": 8866, "time_per_iteration": 2.818025827407837 }, { "auxiliary_loss_clip": 0.01430357, "auxiliary_loss_mlp": 0.01041711, "balance_loss_clip": 1.26228976, "balance_loss_mlp": 1.02143335, "epoch": 0.5331128814068841, "flos": 16033377957120.0, "grad_norm": 2.0347730340952555, "language_loss": 0.74608225, "learning_rate": 1.882644751189108e-06, "loss": 0.77080286, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20275879, "step": 8867, "time_per_iteration": 4.230633497238159 }, { "auxiliary_loss_clip": 0.01425008, "auxiliary_loss_mlp": 0.01038589, "balance_loss_clip": 1.25628424, "balance_loss_mlp": 1.01747656, "epoch": 0.5331730046595521, "flos": 39358295187840.0, "grad_norm": 1.5583660696227977, "language_loss": 0.72863257, "learning_rate": 1.88225596278394e-06, "loss": 0.75326854, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21130371, "step": 8868, "time_per_iteration": 4.420438051223755 }, { "auxiliary_loss_clip": 0.01428111, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.26008701, "balance_loss_mlp": 1.01929402, "epoch": 0.5332331279122201, "flos": 24034964163840.0, "grad_norm": 1.987840621428596, "language_loss": 0.7903254, "learning_rate": 1.881867178843637e-06, "loss": 0.8150034, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20397949, "step": 8869, "time_per_iteration": 4.410567998886108 }, { "auxiliary_loss_clip": 0.01447211, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.27390599, "balance_loss_mlp": 1.01927328, "epoch": 0.533293251164888, "flos": 17138304673920.0, "grad_norm": 2.2665875144807694, "language_loss": 0.76660019, "learning_rate": 1.8814783993829434e-06, "loss": 0.79147089, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20568848, "step": 8870, "time_per_iteration": 2.8205859661102295 }, { "auxiliary_loss_clip": 0.01448273, "auxiliary_loss_mlp": 0.01044275, "balance_loss_clip": 1.27521312, "balance_loss_mlp": 1.02185202, "epoch": 0.533353374417556, "flos": 22136251092480.0, "grad_norm": 2.1505184398936334, "language_loss": 0.75913543, "learning_rate": 1.8810896244165997e-06, "loss": 0.7840609, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.22412109, "step": 8871, "time_per_iteration": 2.8773412704467773 }, { "auxiliary_loss_clip": 0.01432255, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.26278901, "balance_loss_mlp": 1.02095342, "epoch": 0.533413497670224, "flos": 15018995364480.0, "grad_norm": 1.972671176364538, "language_loss": 0.72483617, "learning_rate": 1.8807008539593498e-06, "loss": 0.7495687, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20031738, "step": 8872, "time_per_iteration": 2.8744421005249023 }, { "auxiliary_loss_clip": 0.01421765, "auxiliary_loss_mlp": 0.01037698, "balance_loss_clip": 1.25539088, "balance_loss_mlp": 1.01755166, "epoch": 0.533473620922892, "flos": 19619483045760.0, "grad_norm": 2.4611469090000857, "language_loss": 0.6572938, "learning_rate": 1.880312088025936e-06, "loss": 0.68188846, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20153809, "step": 8873, "time_per_iteration": 2.8315553665161133 }, { "auxiliary_loss_clip": 0.01428491, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.26123977, "balance_loss_mlp": 1.01958621, "epoch": 0.5335337441755599, "flos": 14290687514880.0, "grad_norm": 3.3821941473010892, "language_loss": 0.80577004, "learning_rate": 1.879923326631099e-06, "loss": 0.83045709, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20617676, "step": 8874, "time_per_iteration": 2.814331531524658 }, { "auxiliary_loss_clip": 0.01418703, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.25098395, "balance_loss_mlp": 1.01162922, "epoch": 0.5335938674282279, "flos": 20824889760000.0, "grad_norm": 2.224870553059437, "language_loss": 0.70400536, "learning_rate": 1.879534569789582e-06, "loss": 0.72851008, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20129395, "step": 8875, "time_per_iteration": 2.853872060775757 }, { "auxiliary_loss_clip": 0.01216674, "auxiliary_loss_mlp": 0.01021291, "balance_loss_clip": 1.12600315, "balance_loss_mlp": 1.00584173, "epoch": 0.5336539906808958, "flos": 71432308252800.0, "grad_norm": 0.7296616415787806, "language_loss": 0.59737259, "learning_rate": 1.879145817516126e-06, "loss": 0.61975223, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.15429688, "step": 8876, "time_per_iteration": 3.4561338424682617 }, { "auxiliary_loss_clip": 0.01425471, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.25599205, "balance_loss_mlp": 1.01663399, "epoch": 0.5337141139335638, "flos": 20161562722560.0, "grad_norm": 1.8792087772960298, "language_loss": 0.75667512, "learning_rate": 1.8787570698254727e-06, "loss": 0.78129292, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19677734, "step": 8877, "time_per_iteration": 2.8564629554748535 }, { "auxiliary_loss_clip": 0.01218936, "auxiliary_loss_mlp": 0.01020744, "balance_loss_clip": 1.12477255, "balance_loss_mlp": 1.00224292, "epoch": 0.5337742371862317, "flos": 67758237993600.0, "grad_norm": 0.76282272658024, "language_loss": 0.57271379, "learning_rate": 1.8783683267323629e-06, "loss": 0.5951106, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.18457031, "step": 8878, "time_per_iteration": 3.249645471572876 }, { "auxiliary_loss_clip": 0.0144643, "auxiliary_loss_mlp": 0.01041913, "balance_loss_clip": 1.27197826, "balance_loss_mlp": 1.02071738, "epoch": 0.5338343604388998, "flos": 25019593660800.0, "grad_norm": 1.4895106366175577, "language_loss": 0.73510408, "learning_rate": 1.8779795882515395e-06, "loss": 0.75998747, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21179199, "step": 8879, "time_per_iteration": 2.903784990310669 }, { "auxiliary_loss_clip": 0.01433637, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.26190472, "balance_loss_mlp": 1.01231027, "epoch": 0.5338944836915677, "flos": 17609747938560.0, "grad_norm": 2.172844974558253, "language_loss": 0.84581095, "learning_rate": 1.8775908543977416e-06, "loss": 0.87047255, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.20227051, "step": 8880, "time_per_iteration": 2.948892593383789 }, { "auxiliary_loss_clip": 0.01423185, "auxiliary_loss_mlp": 0.01038183, "balance_loss_clip": 1.25766861, "balance_loss_mlp": 1.01837051, "epoch": 0.5339546069442357, "flos": 21733679692800.0, "grad_norm": 1.4286569998474872, "language_loss": 0.80452615, "learning_rate": 1.8772021251857107e-06, "loss": 0.82913983, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19812012, "step": 8881, "time_per_iteration": 2.8531334400177 }, { "auxiliary_loss_clip": 0.01214078, "auxiliary_loss_mlp": 0.01025706, "balance_loss_clip": 1.12211788, "balance_loss_mlp": 1.00787187, "epoch": 0.5340147301969036, "flos": 69750961056000.0, "grad_norm": 0.8005766389682626, "language_loss": 0.59291178, "learning_rate": 1.8768134006301882e-06, "loss": 0.61530966, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.17871094, "step": 8882, "time_per_iteration": 3.250101327896118 }, { "auxiliary_loss_clip": 0.01211607, "auxiliary_loss_mlp": 0.01034921, "balance_loss_clip": 1.12010467, "balance_loss_mlp": 1.01594281, "epoch": 0.5340748534495716, "flos": 63905223974400.0, "grad_norm": 0.8638521507057143, "language_loss": 0.63813633, "learning_rate": 1.876424680745913e-06, "loss": 0.66060162, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.18945312, "step": 8883, "time_per_iteration": 3.1181132793426514 }, { "auxiliary_loss_clip": 0.01435152, "auxiliary_loss_mlp": 0.01035606, "balance_loss_clip": 1.26364994, "balance_loss_mlp": 1.01487541, "epoch": 0.5341349767022396, "flos": 28706043012480.0, "grad_norm": 2.150999768283258, "language_loss": 0.83269572, "learning_rate": 1.8760359655476272e-06, "loss": 0.85740334, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20739746, "step": 8884, "time_per_iteration": 2.91094708442688 }, { "auxiliary_loss_clip": 0.01413099, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.24977911, "balance_loss_mlp": 1.01809096, "epoch": 0.5341950999549075, "flos": 16298341361280.0, "grad_norm": 1.498129726939141, "language_loss": 0.72541142, "learning_rate": 1.8756472550500695e-06, "loss": 0.74992824, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20483398, "step": 8885, "time_per_iteration": 2.819249391555786 }, { "auxiliary_loss_clip": 0.01440691, "auxiliary_loss_mlp": 0.01033238, "balance_loss_clip": 1.26716471, "balance_loss_mlp": 1.01266229, "epoch": 0.5342552232075756, "flos": 14363721901440.0, "grad_norm": 2.369993488732426, "language_loss": 0.80112314, "learning_rate": 1.87525854926798e-06, "loss": 0.82586247, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20581055, "step": 8886, "time_per_iteration": 2.796842098236084 }, { "auxiliary_loss_clip": 0.01432241, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.26209617, "balance_loss_mlp": 1.01459169, "epoch": 0.5343153464602435, "flos": 30309677625600.0, "grad_norm": 1.872123671431763, "language_loss": 0.75612843, "learning_rate": 1.8748698482160996e-06, "loss": 0.78082073, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22399902, "step": 8887, "time_per_iteration": 2.9201488494873047 }, { "auxiliary_loss_clip": 0.01435002, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.26601338, "balance_loss_mlp": 1.01171565, "epoch": 0.5343754697129115, "flos": 15604718290560.0, "grad_norm": 3.181768667518862, "language_loss": 0.70266426, "learning_rate": 1.8744811519091663e-06, "loss": 0.72733206, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20056152, "step": 8888, "time_per_iteration": 2.8565337657928467 }, { "auxiliary_loss_clip": 0.01450655, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.2735393, "balance_loss_mlp": 1.01425719, "epoch": 0.5344355929655794, "flos": 16918341863040.0, "grad_norm": 2.1693031550369715, "language_loss": 0.78246927, "learning_rate": 1.8740924603619208e-06, "loss": 0.80732775, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 1.77246094, "router_z_loss_mlp": 0.20947266, "step": 8889, "time_per_iteration": 2.819161891937256 }, { "auxiliary_loss_clip": 0.01425152, "auxiliary_loss_mlp": 0.01039454, "balance_loss_clip": 1.25769567, "balance_loss_mlp": 1.01731682, "epoch": 0.5344957162182474, "flos": 16806369686400.0, "grad_norm": 2.1061710133666436, "language_loss": 0.70230043, "learning_rate": 1.873703773589102e-06, "loss": 0.72694647, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.22119141, "step": 8890, "time_per_iteration": 2.811339855194092 }, { "auxiliary_loss_clip": 0.0143266, "auxiliary_loss_mlp": 0.01041484, "balance_loss_clip": 1.26003778, "balance_loss_mlp": 1.01909649, "epoch": 0.5345558394709153, "flos": 12711105152640.0, "grad_norm": 2.6252363475755027, "language_loss": 0.77643985, "learning_rate": 1.8733150916054483e-06, "loss": 0.80118132, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.22387695, "step": 8891, "time_per_iteration": 4.270822048187256 }, { "auxiliary_loss_clip": 0.01413488, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.24818623, "balance_loss_mlp": 1.0131377, "epoch": 0.5346159627235834, "flos": 22465109433600.0, "grad_norm": 1.4909887070357695, "language_loss": 0.75160748, "learning_rate": 1.872926414425699e-06, "loss": 0.77608168, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20788574, "step": 8892, "time_per_iteration": 2.8724372386932373 }, { "auxiliary_loss_clip": 0.01427236, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.25812674, "balance_loss_mlp": 1.01186299, "epoch": 0.5346760859762513, "flos": 22425085768320.0, "grad_norm": 1.5883401918041742, "language_loss": 0.88294351, "learning_rate": 1.8725377420645932e-06, "loss": 0.90753436, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19958496, "step": 8893, "time_per_iteration": 2.856987237930298 }, { "auxiliary_loss_clip": 0.01417241, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.2507956, "balance_loss_mlp": 1.01225805, "epoch": 0.5347362092289193, "flos": 22825304438400.0, "grad_norm": 2.32890627896578, "language_loss": 0.74270296, "learning_rate": 1.872149074536869e-06, "loss": 0.76720834, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.21032715, "step": 8894, "time_per_iteration": 2.8510994911193848 }, { "auxiliary_loss_clip": 0.01401141, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.23750377, "balance_loss_mlp": 1.01173329, "epoch": 0.5347963324815872, "flos": 23229278426880.0, "grad_norm": 1.7408587739287371, "language_loss": 0.75281864, "learning_rate": 1.8717604118572648e-06, "loss": 0.77716553, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.21813965, "step": 8895, "time_per_iteration": 2.851470470428467 }, { "auxiliary_loss_clip": 0.01422652, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.25380456, "balance_loss_mlp": 1.01547456, "epoch": 0.5348564557342552, "flos": 22611540165120.0, "grad_norm": 1.7074404696461465, "language_loss": 0.77273154, "learning_rate": 1.8713717540405178e-06, "loss": 0.79733682, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.22399902, "step": 8896, "time_per_iteration": 2.8344829082489014 }, { "auxiliary_loss_clip": 0.01405159, "auxiliary_loss_mlp": 0.01038912, "balance_loss_clip": 1.24056518, "balance_loss_mlp": 1.01662016, "epoch": 0.5349165789869232, "flos": 18010916749440.0, "grad_norm": 1.777899018249959, "language_loss": 0.79224575, "learning_rate": 1.8709831011013676e-06, "loss": 0.81668651, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.22290039, "step": 8897, "time_per_iteration": 2.8258559703826904 }, { "auxiliary_loss_clip": 0.01424236, "auxiliary_loss_mlp": 0.0103605, "balance_loss_clip": 1.25462902, "balance_loss_mlp": 1.01382971, "epoch": 0.5349767022395912, "flos": 17167198118400.0, "grad_norm": 2.843313671400572, "language_loss": 0.76714981, "learning_rate": 1.8705944530545509e-06, "loss": 0.79175258, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.22229004, "step": 8898, "time_per_iteration": 2.815018653869629 }, { "auxiliary_loss_clip": 0.01214043, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 1.12038422, "balance_loss_mlp": 1.02425671, "epoch": 0.5350368254922592, "flos": 71027022165120.0, "grad_norm": 0.8569755672076108, "language_loss": 0.5804143, "learning_rate": 1.8702058099148052e-06, "loss": 0.60298902, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.19140625, "step": 8899, "time_per_iteration": 3.5368707180023193 }, { "auxiliary_loss_clip": 0.01408653, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.24284542, "balance_loss_mlp": 1.01503062, "epoch": 0.5350969487449271, "flos": 27429366458880.0, "grad_norm": 1.7773252354404405, "language_loss": 0.70429337, "learning_rate": 1.869817171696868e-06, "loss": 0.72873008, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.1998291, "step": 8900, "time_per_iteration": 2.955561637878418 }, { "auxiliary_loss_clip": 0.01424052, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.25405955, "balance_loss_mlp": 1.01358247, "epoch": 0.5351570719975951, "flos": 19325083259520.0, "grad_norm": 2.895172247888875, "language_loss": 0.71986711, "learning_rate": 1.8694285384154777e-06, "loss": 0.74445748, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21398926, "step": 8901, "time_per_iteration": 2.830043077468872 }, { "auxiliary_loss_clip": 0.01425633, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.25625873, "balance_loss_mlp": 1.01345825, "epoch": 0.535217195250263, "flos": 19838088512640.0, "grad_norm": 2.457606105317198, "language_loss": 0.78345859, "learning_rate": 1.8690399100853699e-06, "loss": 0.80806679, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21728516, "step": 8902, "time_per_iteration": 4.377692461013794 }, { "auxiliary_loss_clip": 0.01403984, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.24120939, "balance_loss_mlp": 1.01093316, "epoch": 0.535277318502931, "flos": 22138151374080.0, "grad_norm": 1.5191504119178216, "language_loss": 0.70813847, "learning_rate": 1.868651286721281e-06, "loss": 0.7324999, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.21228027, "step": 8903, "time_per_iteration": 4.269007205963135 }, { "auxiliary_loss_clip": 0.01424259, "auxiliary_loss_mlp": 0.01036782, "balance_loss_clip": 1.25321674, "balance_loss_mlp": 1.01609945, "epoch": 0.5353374417555989, "flos": 25056721658880.0, "grad_norm": 1.647286037561347, "language_loss": 0.73141348, "learning_rate": 1.86826266833795e-06, "loss": 0.75602388, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.20678711, "step": 8904, "time_per_iteration": 2.9894731044769287 }, { "auxiliary_loss_clip": 0.01424979, "auxiliary_loss_mlp": 0.01037299, "balance_loss_clip": 1.256109, "balance_loss_mlp": 1.0155077, "epoch": 0.535397565008267, "flos": 19397574708480.0, "grad_norm": 2.338463669712278, "language_loss": 0.74180728, "learning_rate": 1.8678740549501103e-06, "loss": 0.76643002, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21777344, "step": 8905, "time_per_iteration": 4.360944986343384 }, { "auxiliary_loss_clip": 0.0140337, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.2405597, "balance_loss_mlp": 1.01767778, "epoch": 0.5354576882609349, "flos": 21481339587840.0, "grad_norm": 1.5089512103997396, "language_loss": 0.84615541, "learning_rate": 1.8674854465725005e-06, "loss": 0.87057728, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.21130371, "step": 8906, "time_per_iteration": 2.882399797439575 }, { "auxiliary_loss_clip": 0.01425731, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.25497806, "balance_loss_mlp": 1.01490927, "epoch": 0.5355178115136029, "flos": 20787354558720.0, "grad_norm": 1.792465442918349, "language_loss": 0.74657035, "learning_rate": 1.8670968432198563e-06, "loss": 0.77119011, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21362305, "step": 8907, "time_per_iteration": 2.856821060180664 }, { "auxiliary_loss_clip": 0.01411553, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.24542594, "balance_loss_mlp": 1.01850796, "epoch": 0.5355779347662708, "flos": 23524583109120.0, "grad_norm": 2.1198747698168754, "language_loss": 0.76898015, "learning_rate": 1.866708244906912e-06, "loss": 0.79349816, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21740723, "step": 8908, "time_per_iteration": 2.84199595451355 }, { "auxiliary_loss_clip": 0.01425684, "auxiliary_loss_mlp": 0.01039724, "balance_loss_clip": 1.25550222, "balance_loss_mlp": 1.01870739, "epoch": 0.5356380580189388, "flos": 20312925137280.0, "grad_norm": 2.060152889227769, "language_loss": 0.74808395, "learning_rate": 1.8663196516484055e-06, "loss": 0.7727381, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21020508, "step": 8909, "time_per_iteration": 2.8210549354553223 }, { "auxiliary_loss_clip": 0.01412617, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.24724424, "balance_loss_mlp": 1.02019572, "epoch": 0.5356981812716068, "flos": 21371720140800.0, "grad_norm": 2.057554132336899, "language_loss": 0.84555817, "learning_rate": 1.8659310634590702e-06, "loss": 0.87009716, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.2109375, "step": 8910, "time_per_iteration": 2.8127779960632324 }, { "auxiliary_loss_clip": 0.01413378, "auxiliary_loss_mlp": 0.01037871, "balance_loss_clip": 1.24592292, "balance_loss_mlp": 1.01634145, "epoch": 0.5357583045242748, "flos": 23121333037440.0, "grad_norm": 1.691613980109304, "language_loss": 0.82861853, "learning_rate": 1.8655424803536427e-06, "loss": 0.85313106, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21533203, "step": 8911, "time_per_iteration": 2.8943979740142822 }, { "auxiliary_loss_clip": 0.01415985, "auxiliary_loss_mlp": 0.01035144, "balance_loss_clip": 1.25052273, "balance_loss_mlp": 1.01514053, "epoch": 0.5358184277769428, "flos": 21151531105920.0, "grad_norm": 2.0116558569311462, "language_loss": 0.69844258, "learning_rate": 1.8651539023468585e-06, "loss": 0.72295386, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20007324, "step": 8912, "time_per_iteration": 2.816542148590088 }, { "auxiliary_loss_clip": 0.01413638, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.24820507, "balance_loss_mlp": 1.02145362, "epoch": 0.5358785510296107, "flos": 16289156666880.0, "grad_norm": 2.2878478264434143, "language_loss": 0.72789335, "learning_rate": 1.8647653294534509e-06, "loss": 0.75244915, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20507812, "step": 8913, "time_per_iteration": 2.8243134021759033 }, { "auxiliary_loss_clip": 0.0142617, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.25298274, "balance_loss_mlp": 1.01951456, "epoch": 0.5359386742822787, "flos": 16984137081600.0, "grad_norm": 1.7288964849335533, "language_loss": 0.72728276, "learning_rate": 1.864376761688156e-06, "loss": 0.75194919, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.20959473, "step": 8914, "time_per_iteration": 2.8418385982513428 }, { "auxiliary_loss_clip": 0.01437058, "auxiliary_loss_mlp": 0.01041481, "balance_loss_clip": 1.26330781, "balance_loss_mlp": 1.01926076, "epoch": 0.5359987975349466, "flos": 20822491785600.0, "grad_norm": 2.0284683327384423, "language_loss": 0.7085917, "learning_rate": 1.8639881990657079e-06, "loss": 0.73337704, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.22241211, "step": 8915, "time_per_iteration": 2.9107766151428223 }, { "auxiliary_loss_clip": 0.01406072, "auxiliary_loss_mlp": 0.01042036, "balance_loss_clip": 1.23985565, "balance_loss_mlp": 1.02099526, "epoch": 0.5360589207876146, "flos": 22210009395840.0, "grad_norm": 1.5821632383443327, "language_loss": 0.75997221, "learning_rate": 1.8635996416008408e-06, "loss": 0.78445327, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21032715, "step": 8916, "time_per_iteration": 2.869527578353882 }, { "auxiliary_loss_clip": 0.01422654, "auxiliary_loss_mlp": 0.01039465, "balance_loss_clip": 1.25346208, "balance_loss_mlp": 1.01852, "epoch": 0.5361190440402825, "flos": 31406822236800.0, "grad_norm": 3.0032879639462844, "language_loss": 0.72666186, "learning_rate": 1.863211089308289e-06, "loss": 0.75128305, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20922852, "step": 8917, "time_per_iteration": 2.8968751430511475 }, { "auxiliary_loss_clip": 0.01426998, "auxiliary_loss_mlp": 0.0104555, "balance_loss_clip": 1.25818098, "balance_loss_mlp": 1.02462864, "epoch": 0.5361791672929506, "flos": 16078242816000.0, "grad_norm": 2.1520764269799346, "language_loss": 0.72786689, "learning_rate": 1.8628225422027865e-06, "loss": 0.75259233, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20910645, "step": 8918, "time_per_iteration": 2.847893238067627 }, { "auxiliary_loss_clip": 0.01411477, "auxiliary_loss_mlp": 0.01043463, "balance_loss_clip": 1.2440505, "balance_loss_mlp": 1.02335215, "epoch": 0.5362392905456185, "flos": 20750724253440.0, "grad_norm": 1.585030033720012, "language_loss": 0.7550866, "learning_rate": 1.862434000299067e-06, "loss": 0.77963603, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.2010498, "step": 8919, "time_per_iteration": 2.85774302482605 }, { "auxiliary_loss_clip": 0.01422848, "auxiliary_loss_mlp": 0.01040129, "balance_loss_clip": 1.25204635, "balance_loss_mlp": 1.02029216, "epoch": 0.5362994137982865, "flos": 17347001529600.0, "grad_norm": 3.1736842466194273, "language_loss": 0.72428405, "learning_rate": 1.862045463611864e-06, "loss": 0.74891376, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.19836426, "step": 8920, "time_per_iteration": 2.7780590057373047 }, { "auxiliary_loss_clip": 0.014188, "auxiliary_loss_mlp": 0.01037023, "balance_loss_clip": 1.25118518, "balance_loss_mlp": 1.01665044, "epoch": 0.5363595370509544, "flos": 42829260963840.0, "grad_norm": 5.795808833494504, "language_loss": 0.69395804, "learning_rate": 1.8616569321559105e-06, "loss": 0.71851629, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20361328, "step": 8921, "time_per_iteration": 3.032231330871582 }, { "auxiliary_loss_clip": 0.01429536, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.26095545, "balance_loss_mlp": 1.01832032, "epoch": 0.5364196603036224, "flos": 19181321971200.0, "grad_norm": 2.286362525925914, "language_loss": 0.81849205, "learning_rate": 1.86126840594594e-06, "loss": 0.84316397, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19335938, "step": 8922, "time_per_iteration": 2.8201193809509277 }, { "auxiliary_loss_clip": 0.01427598, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.25805807, "balance_loss_mlp": 1.0148747, "epoch": 0.5364797835562904, "flos": 17940008868480.0, "grad_norm": 1.999784984703286, "language_loss": 0.77737546, "learning_rate": 1.860879884996686e-06, "loss": 0.80199629, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19616699, "step": 8923, "time_per_iteration": 2.8236656188964844 }, { "auxiliary_loss_clip": 0.01432919, "auxiliary_loss_mlp": 0.0103748, "balance_loss_clip": 1.26205516, "balance_loss_mlp": 1.01730967, "epoch": 0.5365399068089584, "flos": 30240534291840.0, "grad_norm": 1.453904520558473, "language_loss": 0.71153915, "learning_rate": 1.8604913693228804e-06, "loss": 0.73624313, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20166016, "step": 8924, "time_per_iteration": 2.884310007095337 }, { "auxiliary_loss_clip": 0.01438358, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.26547384, "balance_loss_mlp": 1.02032387, "epoch": 0.5366000300616264, "flos": 24900337071360.0, "grad_norm": 11.781586507407393, "language_loss": 0.87968248, "learning_rate": 1.8601028589392558e-06, "loss": 0.90447098, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.20166016, "step": 8925, "time_per_iteration": 2.8831605911254883 }, { "auxiliary_loss_clip": 0.01433546, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.26087844, "balance_loss_mlp": 1.01431119, "epoch": 0.5366601533142943, "flos": 29839003522560.0, "grad_norm": 1.8893982466035257, "language_loss": 0.78774905, "learning_rate": 1.8597143538605455e-06, "loss": 0.81242841, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20068359, "step": 8926, "time_per_iteration": 2.941638231277466 }, { "auxiliary_loss_clip": 0.01410222, "auxiliary_loss_mlp": 0.01037558, "balance_loss_clip": 1.24570656, "balance_loss_mlp": 1.01663685, "epoch": 0.5367202765669623, "flos": 27210941971200.0, "grad_norm": 1.4958331309021753, "language_loss": 0.67480373, "learning_rate": 1.85932585410148e-06, "loss": 0.69928151, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20910645, "step": 8927, "time_per_iteration": 4.358684062957764 }, { "auxiliary_loss_clip": 0.01423281, "auxiliary_loss_mlp": 0.01033341, "balance_loss_clip": 1.2519815, "balance_loss_mlp": 1.01384997, "epoch": 0.5367803998196302, "flos": 20239574037120.0, "grad_norm": 2.310734486582534, "language_loss": 0.74209642, "learning_rate": 1.8589373596767929e-06, "loss": 0.76666266, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.19494629, "step": 8928, "time_per_iteration": 2.867027997970581 }, { "auxiliary_loss_clip": 0.01417262, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.24895072, "balance_loss_mlp": 1.01375747, "epoch": 0.5368405230722982, "flos": 32165742833280.0, "grad_norm": 1.861919820381051, "language_loss": 0.63687021, "learning_rate": 1.8585488706012154e-06, "loss": 0.66137671, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19616699, "step": 8929, "time_per_iteration": 2.9286742210388184 }, { "auxiliary_loss_clip": 0.01430094, "auxiliary_loss_mlp": 0.01039698, "balance_loss_clip": 1.2599982, "balance_loss_mlp": 1.01877642, "epoch": 0.5369006463249661, "flos": 26258554033920.0, "grad_norm": 2.0123728486869172, "language_loss": 0.66967595, "learning_rate": 1.8581603868894781e-06, "loss": 0.69437385, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20935059, "step": 8930, "time_per_iteration": 2.9005870819091797 }, { "auxiliary_loss_clip": 0.01398483, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.23422778, "balance_loss_mlp": 1.0122683, "epoch": 0.5369607695776342, "flos": 26221878483840.0, "grad_norm": 1.5428557619981387, "language_loss": 0.67624468, "learning_rate": 1.8577719085563136e-06, "loss": 0.70054507, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19287109, "step": 8931, "time_per_iteration": 2.8487045764923096 }, { "auxiliary_loss_clip": 0.01425965, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.25845075, "balance_loss_mlp": 1.0157094, "epoch": 0.5370208928303021, "flos": 25019729395200.0, "grad_norm": 1.949682724500395, "language_loss": 0.76562423, "learning_rate": 1.8573834356164525e-06, "loss": 0.79025757, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.2166748, "step": 8932, "time_per_iteration": 2.8382511138916016 }, { "auxiliary_loss_clip": 0.01414882, "auxiliary_loss_mlp": 0.01035198, "balance_loss_clip": 1.24960828, "balance_loss_mlp": 1.01518297, "epoch": 0.5370810160829701, "flos": 31803783281280.0, "grad_norm": 1.8737736474575197, "language_loss": 0.66652882, "learning_rate": 1.8569949680846261e-06, "loss": 0.69102961, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20019531, "step": 8933, "time_per_iteration": 2.9049322605133057 }, { "auxiliary_loss_clip": 0.01413058, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 1.24813747, "balance_loss_mlp": 1.01881802, "epoch": 0.537141139335638, "flos": 23853305715840.0, "grad_norm": 1.618397850895475, "language_loss": 0.83735406, "learning_rate": 1.856606505975565e-06, "loss": 0.86188602, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.21325684, "step": 8934, "time_per_iteration": 2.9780325889587402 }, { "auxiliary_loss_clip": 0.01410029, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.2437849, "balance_loss_mlp": 1.00974488, "epoch": 0.537201262588306, "flos": 18516366120960.0, "grad_norm": 1.795865389196859, "language_loss": 0.80171472, "learning_rate": 1.856218049303999e-06, "loss": 0.82612073, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20837402, "step": 8935, "time_per_iteration": 2.8895907402038574 }, { "auxiliary_loss_clip": 0.01415633, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.24640203, "balance_loss_mlp": 1.01511872, "epoch": 0.537261385840974, "flos": 25673102576640.0, "grad_norm": 1.705570607818306, "language_loss": 0.846807, "learning_rate": 1.855829598084659e-06, "loss": 0.87132251, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20812988, "step": 8936, "time_per_iteration": 2.8970675468444824 }, { "auxiliary_loss_clip": 0.01423974, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.25647259, "balance_loss_mlp": 1.01742029, "epoch": 0.537321509093642, "flos": 40749975319680.0, "grad_norm": 1.293126378967101, "language_loss": 0.73221517, "learning_rate": 1.8554411523322754e-06, "loss": 0.75684249, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21325684, "step": 8937, "time_per_iteration": 2.9818408489227295 }, { "auxiliary_loss_clip": 0.01426059, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.25478601, "balance_loss_mlp": 1.00875759, "epoch": 0.53738163234631, "flos": 17247426428160.0, "grad_norm": 3.129650767987743, "language_loss": 0.83071494, "learning_rate": 1.8550527120615778e-06, "loss": 0.8552742, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21118164, "step": 8938, "time_per_iteration": 5.655934810638428 }, { "auxiliary_loss_clip": 0.01445762, "auxiliary_loss_mlp": 0.01035072, "balance_loss_clip": 1.26995993, "balance_loss_mlp": 1.01440144, "epoch": 0.5374417555989779, "flos": 12829321111680.0, "grad_norm": 2.6890863820618676, "language_loss": 0.81703407, "learning_rate": 1.8546642772872957e-06, "loss": 0.84184235, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.20678711, "step": 8939, "time_per_iteration": 2.802462577819824 }, { "auxiliary_loss_clip": 0.01208704, "auxiliary_loss_mlp": 0.01027587, "balance_loss_clip": 1.11386037, "balance_loss_mlp": 1.00631988, "epoch": 0.5375018788516459, "flos": 67286251791360.0, "grad_norm": 0.7062389003169989, "language_loss": 0.52476394, "learning_rate": 1.8542758480241589e-06, "loss": 0.54712683, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.21289062, "step": 8940, "time_per_iteration": 4.801281690597534 }, { "auxiliary_loss_clip": 0.01414582, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.2497499, "balance_loss_mlp": 1.0113008, "epoch": 0.5375620021043138, "flos": 18123341374080.0, "grad_norm": 1.9000843037793789, "language_loss": 0.72590655, "learning_rate": 1.8538874242868965e-06, "loss": 0.75036913, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20373535, "step": 8941, "time_per_iteration": 2.855745792388916 }, { "auxiliary_loss_clip": 0.01413088, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.24863303, "balance_loss_mlp": 1.01615119, "epoch": 0.5376221253569818, "flos": 23159908869120.0, "grad_norm": 1.6490166189503295, "language_loss": 0.79996347, "learning_rate": 1.853499006090237e-06, "loss": 0.82445896, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20324707, "step": 8942, "time_per_iteration": 2.8407585620880127 }, { "auxiliary_loss_clip": 0.01449928, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.2771982, "balance_loss_mlp": 1.01651049, "epoch": 0.5376822486096497, "flos": 29984619847680.0, "grad_norm": 1.6611610232794216, "language_loss": 0.71256363, "learning_rate": 1.853110593448911e-06, "loss": 0.73743284, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.20483398, "step": 8943, "time_per_iteration": 2.949086904525757 }, { "auxiliary_loss_clip": 0.01206402, "auxiliary_loss_mlp": 0.01039201, "balance_loss_clip": 1.11367047, "balance_loss_mlp": 1.0199368, "epoch": 0.5377423718623178, "flos": 54198726733440.0, "grad_norm": 0.813589041006005, "language_loss": 0.59702611, "learning_rate": 1.852722186377645e-06, "loss": 0.61948216, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.19238281, "step": 8944, "time_per_iteration": 3.2829582691192627 }, { "auxiliary_loss_clip": 0.01458174, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.28006017, "balance_loss_mlp": 1.01425898, "epoch": 0.5378024951149857, "flos": 23267175586560.0, "grad_norm": 2.04047266928568, "language_loss": 0.7845487, "learning_rate": 1.852333784891169e-06, "loss": 0.8094871, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 1.78125, "router_z_loss_mlp": 0.21411133, "step": 8945, "time_per_iteration": 2.841061592102051 }, { "auxiliary_loss_clip": 0.01428168, "auxiliary_loss_mlp": 0.01033226, "balance_loss_clip": 1.2582407, "balance_loss_mlp": 1.01306748, "epoch": 0.5378626183676537, "flos": 24034602205440.0, "grad_norm": 1.8248802552683994, "language_loss": 0.69540739, "learning_rate": 1.8519453890042112e-06, "loss": 0.72002137, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20166016, "step": 8946, "time_per_iteration": 2.8461451530456543 }, { "auxiliary_loss_clip": 0.01411906, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.24858689, "balance_loss_mlp": 1.01274657, "epoch": 0.5379227416203216, "flos": 27173090056320.0, "grad_norm": 1.9986825820327534, "language_loss": 0.78047812, "learning_rate": 1.851556998731498e-06, "loss": 0.80492848, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20385742, "step": 8947, "time_per_iteration": 2.9473037719726562 }, { "auxiliary_loss_clip": 0.01419878, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.25226617, "balance_loss_mlp": 1.01791048, "epoch": 0.5379828648729896, "flos": 24692499866880.0, "grad_norm": 3.877371744059826, "language_loss": 0.60491252, "learning_rate": 1.8511686140877592e-06, "loss": 0.62949252, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20202637, "step": 8948, "time_per_iteration": 2.901362895965576 }, { "auxiliary_loss_clip": 0.01443299, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.27236784, "balance_loss_mlp": 1.01309884, "epoch": 0.5380429881256577, "flos": 22532126261760.0, "grad_norm": 1.7163230636559528, "language_loss": 0.79929006, "learning_rate": 1.8507802350877205e-06, "loss": 0.82404852, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19433594, "step": 8949, "time_per_iteration": 2.8166987895965576 }, { "auxiliary_loss_clip": 0.01417082, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.2518183, "balance_loss_mlp": 1.01400888, "epoch": 0.5381031113783256, "flos": 26990843425920.0, "grad_norm": 1.6434753119784284, "language_loss": 0.78666568, "learning_rate": 1.850391861746111e-06, "loss": 0.81119293, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.21618652, "step": 8950, "time_per_iteration": 2.880636215209961 }, { "auxiliary_loss_clip": 0.0141686, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.25192738, "balance_loss_mlp": 1.01431024, "epoch": 0.5381632346309936, "flos": 24764855581440.0, "grad_norm": 1.4953268545907838, "language_loss": 0.73843145, "learning_rate": 1.8500034940776573e-06, "loss": 0.76295233, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20910645, "step": 8951, "time_per_iteration": 2.878356456756592 }, { "auxiliary_loss_clip": 0.01444955, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.27137482, "balance_loss_mlp": 1.01268339, "epoch": 0.5382233578836615, "flos": 15568540433280.0, "grad_norm": 1.6480328691742299, "language_loss": 0.77006304, "learning_rate": 1.849615132097085e-06, "loss": 0.79484612, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.20678711, "step": 8952, "time_per_iteration": 2.804182529449463 }, { "auxiliary_loss_clip": 0.01415179, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.24869061, "balance_loss_mlp": 1.01270556, "epoch": 0.5382834811363295, "flos": 25095885672960.0, "grad_norm": 1.4198517555916905, "language_loss": 0.80000925, "learning_rate": 1.8492267758191228e-06, "loss": 0.82449698, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20874023, "step": 8953, "time_per_iteration": 2.85758113861084 }, { "auxiliary_loss_clip": 0.01420201, "auxiliary_loss_mlp": 0.0103765, "balance_loss_clip": 1.25490987, "balance_loss_mlp": 1.01550102, "epoch": 0.5383436043889974, "flos": 13305786549120.0, "grad_norm": 1.9091832665307673, "language_loss": 0.81318176, "learning_rate": 1.8488384252584964e-06, "loss": 0.83776027, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.22143555, "step": 8954, "time_per_iteration": 2.813615083694458 }, { "auxiliary_loss_clip": 0.01427551, "auxiliary_loss_mlp": 0.01032985, "balance_loss_clip": 1.2594018, "balance_loss_mlp": 1.01221848, "epoch": 0.5384037276416654, "flos": 23049746484480.0, "grad_norm": 2.6216317461378007, "language_loss": 0.77287388, "learning_rate": 1.8484500804299318e-06, "loss": 0.79747921, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20776367, "step": 8955, "time_per_iteration": 2.872096538543701 }, { "auxiliary_loss_clip": 0.01429392, "auxiliary_loss_mlp": 0.01033523, "balance_loss_clip": 1.26105499, "balance_loss_mlp": 1.01283967, "epoch": 0.5384638508943334, "flos": 20640833337600.0, "grad_norm": 1.5407344513667265, "language_loss": 0.78748143, "learning_rate": 1.8480617413481557e-06, "loss": 0.8121106, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20690918, "step": 8956, "time_per_iteration": 2.8361237049102783 }, { "auxiliary_loss_clip": 0.01208313, "auxiliary_loss_mlp": 0.01043622, "balance_loss_clip": 1.11444044, "balance_loss_mlp": 1.02655113, "epoch": 0.5385239741470014, "flos": 66765826391040.0, "grad_norm": 0.8642590157164473, "language_loss": 0.63478488, "learning_rate": 1.8476734080278932e-06, "loss": 0.65730423, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.17089844, "step": 8957, "time_per_iteration": 3.274256467819214 }, { "auxiliary_loss_clip": 0.01207879, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.11468422, "balance_loss_mlp": 1.0134306, "epoch": 0.5385840973996693, "flos": 64749277301760.0, "grad_norm": 0.7370190205337258, "language_loss": 0.51688123, "learning_rate": 1.8472850804838705e-06, "loss": 0.53928316, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.18847656, "step": 8958, "time_per_iteration": 3.33748459815979 }, { "auxiliary_loss_clip": 0.01448755, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.27758956, "balance_loss_mlp": 1.01807928, "epoch": 0.5386442206523373, "flos": 26153232842880.0, "grad_norm": 1.6910147276020522, "language_loss": 0.77894878, "learning_rate": 1.8468967587308128e-06, "loss": 0.80382836, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.21118164, "step": 8959, "time_per_iteration": 2.889307737350464 }, { "auxiliary_loss_clip": 0.01438348, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.26826692, "balance_loss_mlp": 1.01724863, "epoch": 0.5387043439050052, "flos": 18258913353600.0, "grad_norm": 2.325495932986291, "language_loss": 0.8492732, "learning_rate": 1.8465084427834455e-06, "loss": 0.87403792, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20874023, "step": 8960, "time_per_iteration": 2.8324408531188965 }, { "auxiliary_loss_clip": 0.01423746, "auxiliary_loss_mlp": 0.01032598, "balance_loss_clip": 1.25631177, "balance_loss_mlp": 1.01214194, "epoch": 0.5387644671576732, "flos": 29800156222080.0, "grad_norm": 1.4609898878660013, "language_loss": 0.78895223, "learning_rate": 1.8461201326564933e-06, "loss": 0.81351572, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20458984, "step": 8961, "time_per_iteration": 2.9115750789642334 }, { "auxiliary_loss_clip": 0.01427444, "auxiliary_loss_mlp": 0.01043902, "balance_loss_clip": 1.25821018, "balance_loss_mlp": 1.02276587, "epoch": 0.5388245904103413, "flos": 22382528394240.0, "grad_norm": 2.3369410929456165, "language_loss": 0.84721607, "learning_rate": 1.845731828364681e-06, "loss": 0.87192953, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21118164, "step": 8962, "time_per_iteration": 4.32567286491394 }, { "auxiliary_loss_clip": 0.01204558, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.11267567, "balance_loss_mlp": 1.0141207, "epoch": 0.5388847136630092, "flos": 69838111820160.0, "grad_norm": 0.7326750547948051, "language_loss": 0.54170394, "learning_rate": 1.8453435299227333e-06, "loss": 0.5640862, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.1953125, "step": 8963, "time_per_iteration": 3.3045709133148193 }, { "auxiliary_loss_clip": 0.01206057, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 1.11266994, "balance_loss_mlp": 1.00623786, "epoch": 0.5389448369156772, "flos": 69856390719360.0, "grad_norm": 0.8081750044033027, "language_loss": 0.63489872, "learning_rate": 1.8449552373453744e-06, "loss": 0.6572029, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.18164062, "step": 8964, "time_per_iteration": 3.3794405460357666 }, { "auxiliary_loss_clip": 0.01443927, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.2700367, "balance_loss_mlp": 1.01518869, "epoch": 0.5390049601683451, "flos": 31734911416320.0, "grad_norm": 1.6803053996999882, "language_loss": 0.70667315, "learning_rate": 1.8445669506473287e-06, "loss": 0.73147565, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21154785, "step": 8965, "time_per_iteration": 2.9054064750671387 }, { "auxiliary_loss_clip": 0.0144338, "auxiliary_loss_mlp": 0.01039576, "balance_loss_clip": 1.27119017, "balance_loss_mlp": 1.01777291, "epoch": 0.5390650834210131, "flos": 18122481722880.0, "grad_norm": 1.9533646333418373, "language_loss": 0.82758373, "learning_rate": 1.8441786698433192e-06, "loss": 0.8524133, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21801758, "step": 8966, "time_per_iteration": 2.78814959526062 }, { "auxiliary_loss_clip": 0.01428164, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.26118183, "balance_loss_mlp": 1.0128212, "epoch": 0.539125206673681, "flos": 17424605640960.0, "grad_norm": 2.1499490910245713, "language_loss": 0.73314631, "learning_rate": 1.8437903949480706e-06, "loss": 0.75776005, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20385742, "step": 8967, "time_per_iteration": 2.830320358276367 }, { "auxiliary_loss_clip": 0.01421334, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.25373745, "balance_loss_mlp": 1.0112052, "epoch": 0.539185329926349, "flos": 22208742541440.0, "grad_norm": 1.7773521615617205, "language_loss": 0.82324696, "learning_rate": 1.8434021259763065e-06, "loss": 0.8477785, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20617676, "step": 8968, "time_per_iteration": 2.8684980869293213 }, { "auxiliary_loss_clip": 0.01426907, "auxiliary_loss_mlp": 0.01031918, "balance_loss_clip": 1.25685215, "balance_loss_mlp": 1.01123476, "epoch": 0.539245453179017, "flos": 21444437813760.0, "grad_norm": 1.949239452855266, "language_loss": 0.74837804, "learning_rate": 1.8430138629427484e-06, "loss": 0.77296633, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.20690918, "step": 8969, "time_per_iteration": 2.853201389312744 }, { "auxiliary_loss_clip": 0.01437077, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.26357996, "balance_loss_mlp": 1.01280379, "epoch": 0.539305576431685, "flos": 20743258861440.0, "grad_norm": 2.8336195709217082, "language_loss": 0.83057809, "learning_rate": 1.8426256058621205e-06, "loss": 0.85528624, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.20947266, "step": 8970, "time_per_iteration": 2.82312273979187 }, { "auxiliary_loss_clip": 0.01423289, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.25801837, "balance_loss_mlp": 1.01464176, "epoch": 0.5393656996843529, "flos": 30932890508160.0, "grad_norm": 1.402408770740932, "language_loss": 0.75941885, "learning_rate": 1.842237354749146e-06, "loss": 0.78399974, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20166016, "step": 8971, "time_per_iteration": 2.885859727859497 }, { "auxiliary_loss_clip": 0.01197938, "auxiliary_loss_mlp": 0.01011492, "balance_loss_clip": 1.10799575, "balance_loss_mlp": 0.99594766, "epoch": 0.5394258229370209, "flos": 50341323968640.0, "grad_norm": 0.8834873485070505, "language_loss": 0.60382694, "learning_rate": 1.8418491096185465e-06, "loss": 0.62592131, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.15527344, "step": 8972, "time_per_iteration": 3.390813112258911 }, { "auxiliary_loss_clip": 0.01423888, "auxiliary_loss_mlp": 0.0103399, "balance_loss_clip": 1.25396931, "balance_loss_mlp": 1.01379597, "epoch": 0.5394859461896888, "flos": 25423160446080.0, "grad_norm": 1.586718592731762, "language_loss": 0.79044104, "learning_rate": 1.841460870485045e-06, "loss": 0.81501985, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.2019043, "step": 8973, "time_per_iteration": 5.8584747314453125 }, { "auxiliary_loss_clip": 0.01450065, "auxiliary_loss_mlp": 0.01037732, "balance_loss_clip": 1.2721504, "balance_loss_mlp": 1.01609588, "epoch": 0.5395460694423568, "flos": 25488050768640.0, "grad_norm": 1.83022667258761, "language_loss": 0.74905324, "learning_rate": 1.8410726373633623e-06, "loss": 0.77393126, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 1.77832031, "router_z_loss_mlp": 0.21630859, "step": 8974, "time_per_iteration": 2.9077672958374023 }, { "auxiliary_loss_clip": 0.0119964, "auxiliary_loss_mlp": 0.01018476, "balance_loss_clip": 1.10754347, "balance_loss_mlp": 0.99816316, "epoch": 0.5396061926950249, "flos": 53277132522240.0, "grad_norm": 0.7410595503179584, "language_loss": 0.51195085, "learning_rate": 1.8406844102682215e-06, "loss": 0.534132, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.203125, "step": 8975, "time_per_iteration": 4.790737628936768 }, { "auxiliary_loss_clip": 0.01424118, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 1.255795, "balance_loss_mlp": 1.01739967, "epoch": 0.5396663159476928, "flos": 26736557794560.0, "grad_norm": 1.877260784147271, "language_loss": 0.72972363, "learning_rate": 1.840296189214344e-06, "loss": 0.75434113, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20227051, "step": 8976, "time_per_iteration": 3.0006206035614014 }, { "auxiliary_loss_clip": 0.014348, "auxiliary_loss_mlp": 0.01041555, "balance_loss_clip": 1.2667383, "balance_loss_mlp": 1.02099133, "epoch": 0.5397264392003608, "flos": 23262515372160.0, "grad_norm": 2.555205938058129, "language_loss": 0.71158767, "learning_rate": 1.8399079742164509e-06, "loss": 0.73635125, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20556641, "step": 8977, "time_per_iteration": 2.883582353591919 }, { "auxiliary_loss_clip": 0.01437383, "auxiliary_loss_mlp": 0.01041695, "balance_loss_clip": 1.26557648, "balance_loss_mlp": 1.01855695, "epoch": 0.5397865624530287, "flos": 18302782826880.0, "grad_norm": 1.7828817385215254, "language_loss": 0.73909175, "learning_rate": 1.8395197652892636e-06, "loss": 0.76388258, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.23156738, "step": 8978, "time_per_iteration": 2.8186049461364746 }, { "auxiliary_loss_clip": 0.01439972, "auxiliary_loss_mlp": 0.01040002, "balance_loss_clip": 1.26562917, "balance_loss_mlp": 1.01823449, "epoch": 0.5398466857056967, "flos": 15304074721920.0, "grad_norm": 2.8524206509386967, "language_loss": 0.74713862, "learning_rate": 1.8391315624475028e-06, "loss": 0.77193838, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.21777344, "step": 8979, "time_per_iteration": 2.847637176513672 }, { "auxiliary_loss_clip": 0.01442909, "auxiliary_loss_mlp": 0.01045469, "balance_loss_clip": 1.26891875, "balance_loss_mlp": 1.0235827, "epoch": 0.5399068089583646, "flos": 17830706135040.0, "grad_norm": 1.953831979204437, "language_loss": 0.77401197, "learning_rate": 1.8387433657058892e-06, "loss": 0.79889572, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21875, "step": 8980, "time_per_iteration": 2.814945697784424 }, { "auxiliary_loss_clip": 0.01423362, "auxiliary_loss_mlp": 0.01039551, "balance_loss_clip": 1.25379133, "balance_loss_mlp": 1.01828384, "epoch": 0.5399669322110326, "flos": 27393233846400.0, "grad_norm": 1.8524360861150808, "language_loss": 0.82941604, "learning_rate": 1.8383551750791431e-06, "loss": 0.85404515, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21276855, "step": 8981, "time_per_iteration": 2.8697774410247803 }, { "auxiliary_loss_clip": 0.01426541, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.2543447, "balance_loss_mlp": 1.02242494, "epoch": 0.5400270554637006, "flos": 20458858176000.0, "grad_norm": 1.831212148143244, "language_loss": 0.67514265, "learning_rate": 1.8379669905819857e-06, "loss": 0.69984555, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21313477, "step": 8982, "time_per_iteration": 2.825181484222412 }, { "auxiliary_loss_clip": 0.01428235, "auxiliary_loss_mlp": 0.0104487, "balance_loss_clip": 1.26042676, "balance_loss_mlp": 1.02553427, "epoch": 0.5400871787163686, "flos": 21699628341120.0, "grad_norm": 1.669578840398646, "language_loss": 0.83441806, "learning_rate": 1.8375788122291358e-06, "loss": 0.8591491, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19335938, "step": 8983, "time_per_iteration": 2.8300886154174805 }, { "auxiliary_loss_clip": 0.01418362, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.25051641, "balance_loss_mlp": 1.01806259, "epoch": 0.5401473019690365, "flos": 19213156327680.0, "grad_norm": 1.7999974974715864, "language_loss": 0.72459704, "learning_rate": 1.8371906400353138e-06, "loss": 0.74918097, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21960449, "step": 8984, "time_per_iteration": 2.788011312484741 }, { "auxiliary_loss_clip": 0.01456014, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.28040481, "balance_loss_mlp": 1.01730871, "epoch": 0.5402074252217045, "flos": 20636082633600.0, "grad_norm": 1.9355153499945539, "language_loss": 0.81204146, "learning_rate": 1.8368024740152386e-06, "loss": 0.83698487, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.21008301, "step": 8985, "time_per_iteration": 2.8138253688812256 }, { "auxiliary_loss_clip": 0.01398136, "auxiliary_loss_mlp": 0.01030877, "balance_loss_clip": 1.23723912, "balance_loss_mlp": 1.01047993, "epoch": 0.5402675484743724, "flos": 24984411189120.0, "grad_norm": 1.5089977535448962, "language_loss": 0.79450333, "learning_rate": 1.83641431418363e-06, "loss": 0.81879342, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.20410156, "step": 8986, "time_per_iteration": 2.8650524616241455 }, { "auxiliary_loss_clip": 0.01421681, "auxiliary_loss_mlp": 0.01034027, "balance_loss_clip": 1.25376916, "balance_loss_mlp": 1.01314163, "epoch": 0.5403276717270404, "flos": 19467215735040.0, "grad_norm": 1.6536001458292293, "language_loss": 0.77683431, "learning_rate": 1.8360261605552075e-06, "loss": 0.80139136, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20874023, "step": 8987, "time_per_iteration": 2.857736349105835 }, { "auxiliary_loss_clip": 0.01430842, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.26120782, "balance_loss_mlp": 1.01454306, "epoch": 0.5403877949797083, "flos": 18451294819200.0, "grad_norm": 1.8466210808749368, "language_loss": 0.72386605, "learning_rate": 1.8356380131446887e-06, "loss": 0.74852508, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20532227, "step": 8988, "time_per_iteration": 2.8420827388763428 }, { "auxiliary_loss_clip": 0.01433871, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.26267183, "balance_loss_mlp": 1.01371634, "epoch": 0.5404479182323764, "flos": 28304466998400.0, "grad_norm": 4.045882411279951, "language_loss": 0.69125223, "learning_rate": 1.8352498719667934e-06, "loss": 0.71597379, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.24572754, "step": 8989, "time_per_iteration": 2.9180662631988525 }, { "auxiliary_loss_clip": 0.01426703, "auxiliary_loss_mlp": 0.01039369, "balance_loss_clip": 1.25628281, "balance_loss_mlp": 1.01818538, "epoch": 0.5405080414850444, "flos": 23377654684800.0, "grad_norm": 1.4565178976040045, "language_loss": 0.78692031, "learning_rate": 1.8348617370362399e-06, "loss": 0.81158102, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21179199, "step": 8990, "time_per_iteration": 2.93687105178833 }, { "auxiliary_loss_clip": 0.01421698, "auxiliary_loss_mlp": 0.01036105, "balance_loss_clip": 1.25359011, "balance_loss_mlp": 1.01579165, "epoch": 0.5405681647377123, "flos": 21116484368640.0, "grad_norm": 1.616427644708359, "language_loss": 0.69514138, "learning_rate": 1.834473608367745e-06, "loss": 0.71971941, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.203125, "step": 8991, "time_per_iteration": 2.887840509414673 }, { "auxiliary_loss_clip": 0.01428886, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.25942206, "balance_loss_mlp": 1.01238632, "epoch": 0.5406282879903803, "flos": 20458948665600.0, "grad_norm": 1.732940196385924, "language_loss": 0.76927686, "learning_rate": 1.8340854859760277e-06, "loss": 0.79389441, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20471191, "step": 8992, "time_per_iteration": 2.828887939453125 }, { "auxiliary_loss_clip": 0.01432085, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.26124644, "balance_loss_mlp": 1.01545382, "epoch": 0.5406884112430482, "flos": 14217472149120.0, "grad_norm": 2.7720614914376, "language_loss": 0.77117658, "learning_rate": 1.8336973698758056e-06, "loss": 0.7958495, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19750977, "step": 8993, "time_per_iteration": 2.8124849796295166 }, { "auxiliary_loss_clip": 0.01423831, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.25739157, "balance_loss_mlp": 1.01232445, "epoch": 0.5407485344957162, "flos": 23885366296320.0, "grad_norm": 1.751813055868749, "language_loss": 0.71057951, "learning_rate": 1.8333092600817959e-06, "loss": 0.73514938, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20837402, "step": 8994, "time_per_iteration": 2.8828647136688232 }, { "auxiliary_loss_clip": 0.01436553, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.2648443, "balance_loss_mlp": 1.01426136, "epoch": 0.5408086577483842, "flos": 23158777749120.0, "grad_norm": 1.76609591056567, "language_loss": 0.75739348, "learning_rate": 1.8329211566087157e-06, "loss": 0.78211707, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.2154541, "step": 8995, "time_per_iteration": 2.842376947402954 }, { "auxiliary_loss_clip": 0.01407771, "auxiliary_loss_mlp": 0.01033297, "balance_loss_clip": 1.24318898, "balance_loss_mlp": 1.01362777, "epoch": 0.5408687810010522, "flos": 18780424629120.0, "grad_norm": 2.3533047585742985, "language_loss": 0.73768067, "learning_rate": 1.832533059471282e-06, "loss": 0.76209134, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19665527, "step": 8996, "time_per_iteration": 2.85122013092041 }, { "auxiliary_loss_clip": 0.01410859, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.24557853, "balance_loss_mlp": 1.02006817, "epoch": 0.5409289042537201, "flos": 13889563948800.0, "grad_norm": 1.7570971174379282, "language_loss": 0.74015641, "learning_rate": 1.8321449686842115e-06, "loss": 0.76467144, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20568848, "step": 8997, "time_per_iteration": 4.35371208190918 }, { "auxiliary_loss_clip": 0.01436945, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.26725101, "balance_loss_mlp": 1.01487756, "epoch": 0.5409890275063881, "flos": 14473522327680.0, "grad_norm": 2.4362652965827882, "language_loss": 0.72732371, "learning_rate": 1.8317568842622207e-06, "loss": 0.75205064, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20874023, "step": 8998, "time_per_iteration": 2.919760227203369 }, { "auxiliary_loss_clip": 0.01432738, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.26422215, "balance_loss_mlp": 1.01375604, "epoch": 0.541049150759056, "flos": 48993676306560.0, "grad_norm": 1.6167789804925377, "language_loss": 0.71070206, "learning_rate": 1.8313688062200256e-06, "loss": 0.73537195, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20483398, "step": 8999, "time_per_iteration": 3.110267162322998 }, { "auxiliary_loss_clip": 0.01429224, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.26166725, "balance_loss_mlp": 1.013098, "epoch": 0.541109274011724, "flos": 18155990136960.0, "grad_norm": 4.268038388493625, "language_loss": 0.81602931, "learning_rate": 1.8309807345723422e-06, "loss": 0.84065425, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20178223, "step": 9000, "time_per_iteration": 2.7846786975860596 }, { "auxiliary_loss_clip": 0.01419266, "auxiliary_loss_mlp": 0.01034585, "balance_loss_clip": 1.25172198, "balance_loss_mlp": 1.0130918, "epoch": 0.541169397264392, "flos": 20532435500160.0, "grad_norm": 1.9084544103321934, "language_loss": 0.73852444, "learning_rate": 1.8305926693338863e-06, "loss": 0.76306295, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.21484375, "step": 9001, "time_per_iteration": 2.8318424224853516 }, { "auxiliary_loss_clip": 0.01437943, "auxiliary_loss_mlp": 0.01034081, "balance_loss_clip": 1.26570177, "balance_loss_mlp": 1.0121944, "epoch": 0.54122952051706, "flos": 20052712437120.0, "grad_norm": 2.1629965730297145, "language_loss": 0.86094469, "learning_rate": 1.8302046105193734e-06, "loss": 0.885665, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.21887207, "step": 9002, "time_per_iteration": 2.814337730407715 }, { "auxiliary_loss_clip": 0.01430596, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.26375592, "balance_loss_mlp": 1.01492071, "epoch": 0.541289643769728, "flos": 19071521544960.0, "grad_norm": 2.0588551274814075, "language_loss": 0.78756249, "learning_rate": 1.8298165581435183e-06, "loss": 0.81221741, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19970703, "step": 9003, "time_per_iteration": 2.792806386947632 }, { "auxiliary_loss_clip": 0.01432448, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.26378584, "balance_loss_mlp": 1.01343918, "epoch": 0.5413497670223959, "flos": 22392391760640.0, "grad_norm": 1.8002831305398725, "language_loss": 0.70410299, "learning_rate": 1.8294285122210372e-06, "loss": 0.72877705, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21520996, "step": 9004, "time_per_iteration": 2.8294837474823 }, { "auxiliary_loss_clip": 0.01209547, "auxiliary_loss_mlp": 0.01061536, "balance_loss_clip": 1.1134572, "balance_loss_mlp": 1.03502393, "epoch": 0.5414098902750639, "flos": 70063413517440.0, "grad_norm": 1.3888896692995731, "language_loss": 0.59272075, "learning_rate": 1.8290404727666434e-06, "loss": 0.61543155, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.265625, "step": 9005, "time_per_iteration": 3.472856044769287 }, { "auxiliary_loss_clip": 0.01431084, "auxiliary_loss_mlp": 0.01038095, "balance_loss_clip": 1.26063335, "balance_loss_mlp": 1.01927209, "epoch": 0.5414700135277318, "flos": 21809021564160.0, "grad_norm": 1.8128510358825087, "language_loss": 0.79211777, "learning_rate": 1.8286524397950517e-06, "loss": 0.81680954, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.18835449, "step": 9006, "time_per_iteration": 2.8310296535491943 }, { "auxiliary_loss_clip": 0.01419508, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.25318682, "balance_loss_mlp": 1.01765656, "epoch": 0.5415301367803999, "flos": 16916215357440.0, "grad_norm": 1.7781748116201377, "language_loss": 0.83971131, "learning_rate": 1.8282644133209777e-06, "loss": 0.86427069, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1875, "step": 9007, "time_per_iteration": 2.841433525085449 }, { "auxiliary_loss_clip": 0.01423298, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.25536466, "balance_loss_mlp": 1.01570702, "epoch": 0.5415902600330678, "flos": 25715569461120.0, "grad_norm": 1.8859863271913577, "language_loss": 0.67859268, "learning_rate": 1.8278763933591334e-06, "loss": 0.70319492, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.2121582, "step": 9008, "time_per_iteration": 4.418330907821655 }, { "auxiliary_loss_clip": 0.0144922, "auxiliary_loss_mlp": 0.0103635, "balance_loss_clip": 1.27506316, "balance_loss_mlp": 1.01508272, "epoch": 0.5416503832857358, "flos": 19217454583680.0, "grad_norm": 2.5197503073992724, "language_loss": 0.75140703, "learning_rate": 1.827488379924234e-06, "loss": 0.77626276, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.21276855, "step": 9009, "time_per_iteration": 2.8470699787139893 }, { "auxiliary_loss_clip": 0.01435415, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.26409507, "balance_loss_mlp": 1.01823699, "epoch": 0.5417105065384037, "flos": 12721330477440.0, "grad_norm": 4.406774708242277, "language_loss": 0.88438904, "learning_rate": 1.8271003730309923e-06, "loss": 0.90912282, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.19726562, "step": 9010, "time_per_iteration": 4.251540184020996 }, { "auxiliary_loss_clip": 0.01433421, "auxiliary_loss_mlp": 0.01038981, "balance_loss_clip": 1.2644906, "balance_loss_mlp": 1.0188942, "epoch": 0.5417706297910717, "flos": 30348027233280.0, "grad_norm": 8.245887491059138, "language_loss": 0.66197205, "learning_rate": 1.826712372694122e-06, "loss": 0.68669599, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20092773, "step": 9011, "time_per_iteration": 2.906947612762451 }, { "auxiliary_loss_clip": 0.01426366, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.25915635, "balance_loss_mlp": 1.01671791, "epoch": 0.5418307530437396, "flos": 29032367644800.0, "grad_norm": 2.6719770667710914, "language_loss": 0.79918057, "learning_rate": 1.8263243789283362e-06, "loss": 0.82380223, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19091797, "step": 9012, "time_per_iteration": 2.8591902256011963 }, { "auxiliary_loss_clip": 0.01432107, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.26306772, "balance_loss_mlp": 1.01566195, "epoch": 0.5418908762964076, "flos": 16882254495360.0, "grad_norm": 1.8957806315297538, "language_loss": 0.75777745, "learning_rate": 1.8259363917483466e-06, "loss": 0.78245187, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.19665527, "step": 9013, "time_per_iteration": 2.822934627532959 }, { "auxiliary_loss_clip": 0.01450231, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.27640676, "balance_loss_mlp": 1.01864243, "epoch": 0.5419509995490756, "flos": 18958146779520.0, "grad_norm": 5.507497368292329, "language_loss": 0.73131073, "learning_rate": 1.8255484111688667e-06, "loss": 0.756208, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20849609, "step": 9014, "time_per_iteration": 2.8279306888580322 }, { "auxiliary_loss_clip": 0.01428734, "auxiliary_loss_mlp": 0.01038564, "balance_loss_clip": 1.26131296, "balance_loss_mlp": 1.01777422, "epoch": 0.5420111228017436, "flos": 18086892048000.0, "grad_norm": 1.5963577225886043, "language_loss": 0.8141551, "learning_rate": 1.8251604372046085e-06, "loss": 0.83882809, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20776367, "step": 9015, "time_per_iteration": 2.8383967876434326 }, { "auxiliary_loss_clip": 0.01438045, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.26566541, "balance_loss_mlp": 1.01846838, "epoch": 0.5420712460544116, "flos": 19070933362560.0, "grad_norm": 2.366538450955225, "language_loss": 0.82370704, "learning_rate": 1.8247724698702843e-06, "loss": 0.84848601, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.21386719, "step": 9016, "time_per_iteration": 2.8019955158233643 }, { "auxiliary_loss_clip": 0.01425881, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.25867844, "balance_loss_mlp": 1.01534641, "epoch": 0.5421313693070795, "flos": 18196285271040.0, "grad_norm": 1.6387790798944495, "language_loss": 0.8200525, "learning_rate": 1.8243845091806053e-06, "loss": 0.84466946, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20471191, "step": 9017, "time_per_iteration": 2.8908965587615967 }, { "auxiliary_loss_clip": 0.01421537, "auxiliary_loss_mlp": 0.01041489, "balance_loss_clip": 1.25875854, "balance_loss_mlp": 1.01991248, "epoch": 0.5421914925597475, "flos": 13013739492480.0, "grad_norm": 1.5897500874077168, "language_loss": 0.78323913, "learning_rate": 1.8239965551502837e-06, "loss": 0.80786943, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.21569824, "step": 9018, "time_per_iteration": 2.8278541564941406 }, { "auxiliary_loss_clip": 0.01438623, "auxiliary_loss_mlp": 0.01038998, "balance_loss_clip": 1.26482177, "balance_loss_mlp": 1.01805294, "epoch": 0.5422516158124154, "flos": 46775832526080.0, "grad_norm": 1.4565404085814075, "language_loss": 0.66593969, "learning_rate": 1.8236086077940303e-06, "loss": 0.69071591, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.20959473, "step": 9019, "time_per_iteration": 3.06899094581604 }, { "auxiliary_loss_clip": 0.01409133, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.2464391, "balance_loss_mlp": 1.0140028, "epoch": 0.5423117390650835, "flos": 31771044028800.0, "grad_norm": 2.5363525183448616, "language_loss": 0.70842516, "learning_rate": 1.8232206671265555e-06, "loss": 0.73285586, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19934082, "step": 9020, "time_per_iteration": 2.9403109550476074 }, { "auxiliary_loss_clip": 0.01407744, "auxiliary_loss_mlp": 0.01036702, "balance_loss_clip": 1.24431634, "balance_loss_mlp": 1.01587653, "epoch": 0.5423718623177514, "flos": 27214244841600.0, "grad_norm": 1.4534641032114617, "language_loss": 0.80309725, "learning_rate": 1.8228327331625717e-06, "loss": 0.82754171, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20825195, "step": 9021, "time_per_iteration": 2.9365930557250977 }, { "auxiliary_loss_clip": 0.01413097, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.24717414, "balance_loss_mlp": 1.01699829, "epoch": 0.5424319855704194, "flos": 23555965017600.0, "grad_norm": 1.7257260235664884, "language_loss": 0.79799879, "learning_rate": 1.822444805916788e-06, "loss": 0.82251257, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21276855, "step": 9022, "time_per_iteration": 2.8879082202911377 }, { "auxiliary_loss_clip": 0.01411936, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.24600077, "balance_loss_mlp": 1.01629162, "epoch": 0.5424921088230873, "flos": 26627074081920.0, "grad_norm": 4.198052277733731, "language_loss": 0.83055651, "learning_rate": 1.822056885403915e-06, "loss": 0.85504746, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20874023, "step": 9023, "time_per_iteration": 2.87813401222229 }, { "auxiliary_loss_clip": 0.01417061, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.24888682, "balance_loss_mlp": 1.01733136, "epoch": 0.5425522320757553, "flos": 23597346026880.0, "grad_norm": 1.6137985307750142, "language_loss": 0.72168696, "learning_rate": 1.8216689716386627e-06, "loss": 0.7462334, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20263672, "step": 9024, "time_per_iteration": 2.8306806087493896 }, { "auxiliary_loss_clip": 0.01419108, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.24974191, "balance_loss_mlp": 1.0170629, "epoch": 0.5426123553284232, "flos": 30604665594240.0, "grad_norm": 2.3272578719603487, "language_loss": 0.65865695, "learning_rate": 1.8212810646357405e-06, "loss": 0.68322521, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20666504, "step": 9025, "time_per_iteration": 2.8984808921813965 }, { "auxiliary_loss_clip": 0.0143682, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.26440167, "balance_loss_mlp": 1.01128614, "epoch": 0.5426724785810912, "flos": 12502770255360.0, "grad_norm": 6.175099596573325, "language_loss": 0.74893928, "learning_rate": 1.8208931644098591e-06, "loss": 0.77362466, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.2043457, "step": 9026, "time_per_iteration": 2.808222770690918 }, { "auxiliary_loss_clip": 0.01416621, "auxiliary_loss_mlp": 0.01041276, "balance_loss_clip": 1.24784195, "balance_loss_mlp": 1.01809001, "epoch": 0.5427326018337592, "flos": 26074950059520.0, "grad_norm": 2.055632565277127, "language_loss": 0.79767925, "learning_rate": 1.8205052709757265e-06, "loss": 0.82225823, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.23205566, "step": 9027, "time_per_iteration": 2.8749659061431885 }, { "auxiliary_loss_clip": 0.01211062, "auxiliary_loss_mlp": 0.01061495, "balance_loss_clip": 1.11849284, "balance_loss_mlp": 1.0319314, "epoch": 0.5427927250864272, "flos": 66016135733760.0, "grad_norm": 0.7525474905907152, "language_loss": 0.56618929, "learning_rate": 1.8201173843480515e-06, "loss": 0.58891487, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.29492188, "step": 9028, "time_per_iteration": 3.394814968109131 }, { "auxiliary_loss_clip": 0.01412644, "auxiliary_loss_mlp": 0.0103631, "balance_loss_clip": 1.24426734, "balance_loss_mlp": 1.01391065, "epoch": 0.5428528483390952, "flos": 19985152671360.0, "grad_norm": 2.0523283784278177, "language_loss": 0.78797269, "learning_rate": 1.8197295045415442e-06, "loss": 0.81246221, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.22375488, "step": 9029, "time_per_iteration": 2.8460285663604736 }, { "auxiliary_loss_clip": 0.01404711, "auxiliary_loss_mlp": 0.01044773, "balance_loss_clip": 1.23879433, "balance_loss_mlp": 1.02233744, "epoch": 0.5429129715917631, "flos": 21841579837440.0, "grad_norm": 1.51280404985086, "language_loss": 0.83646327, "learning_rate": 1.8193416315709112e-06, "loss": 0.8609581, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.2244873, "step": 9030, "time_per_iteration": 2.8728456497192383 }, { "auxiliary_loss_clip": 0.01405991, "auxiliary_loss_mlp": 0.01038092, "balance_loss_clip": 1.2396394, "balance_loss_mlp": 1.01609778, "epoch": 0.5429730948444311, "flos": 27794312167680.0, "grad_norm": 1.8285167872649473, "language_loss": 0.75795197, "learning_rate": 1.8189537654508623e-06, "loss": 0.78239286, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.2199707, "step": 9031, "time_per_iteration": 4.294595241546631 }, { "auxiliary_loss_clip": 0.0140066, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.23880172, "balance_loss_mlp": 1.01604629, "epoch": 0.543033218097099, "flos": 26771649776640.0, "grad_norm": 1.7441965143108005, "language_loss": 0.85997349, "learning_rate": 1.8185659061961045e-06, "loss": 0.88435078, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.21032715, "step": 9032, "time_per_iteration": 2.914761543273926 }, { "auxiliary_loss_clip": 0.01429691, "auxiliary_loss_mlp": 0.01039304, "balance_loss_clip": 1.25801158, "balance_loss_mlp": 1.01678538, "epoch": 0.5430933413497671, "flos": 22685705671680.0, "grad_norm": 1.5768800694538117, "language_loss": 0.74842083, "learning_rate": 1.8181780538213457e-06, "loss": 0.77311075, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.22521973, "step": 9033, "time_per_iteration": 3.0117685794830322 }, { "auxiliary_loss_clip": 0.01403666, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.23680711, "balance_loss_mlp": 1.01589453, "epoch": 0.543153464602435, "flos": 24618062891520.0, "grad_norm": 2.122467325821096, "language_loss": 0.76687783, "learning_rate": 1.8177902083412935e-06, "loss": 0.79129469, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.22106934, "step": 9034, "time_per_iteration": 2.890709161758423 }, { "auxiliary_loss_clip": 0.01401356, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.23519075, "balance_loss_mlp": 1.01570058, "epoch": 0.543213587855103, "flos": 19034981729280.0, "grad_norm": 2.4874599758451885, "language_loss": 0.8496722, "learning_rate": 1.817402369770655e-06, "loss": 0.87405401, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.21118164, "step": 9035, "time_per_iteration": 2.8266024589538574 }, { "auxiliary_loss_clip": 0.01209474, "auxiliary_loss_mlp": 0.01045244, "balance_loss_clip": 1.11577749, "balance_loss_mlp": 1.01777864, "epoch": 0.5432737111077709, "flos": 65716080347520.0, "grad_norm": 0.7282683398210658, "language_loss": 0.55919766, "learning_rate": 1.8170145381241364e-06, "loss": 0.58174479, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.27539062, "step": 9036, "time_per_iteration": 3.3816280364990234 }, { "auxiliary_loss_clip": 0.01422908, "auxiliary_loss_mlp": 0.01035533, "balance_loss_clip": 1.25287366, "balance_loss_mlp": 1.01347959, "epoch": 0.5433338343604389, "flos": 22102018761600.0, "grad_norm": 1.6830019413057093, "language_loss": 0.75868732, "learning_rate": 1.8166267134164451e-06, "loss": 0.78327173, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.22045898, "step": 9037, "time_per_iteration": 2.881230115890503 }, { "auxiliary_loss_clip": 0.01403423, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.23785639, "balance_loss_mlp": 1.01731205, "epoch": 0.5433939576131068, "flos": 34684546896000.0, "grad_norm": 1.5409802959120982, "language_loss": 0.67750365, "learning_rate": 1.8162388956622875e-06, "loss": 0.70192921, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.21838379, "step": 9038, "time_per_iteration": 2.971674919128418 }, { "auxiliary_loss_clip": 0.01414227, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.24565077, "balance_loss_mlp": 1.01414275, "epoch": 0.5434540808657748, "flos": 20313106116480.0, "grad_norm": 1.7672357724434005, "language_loss": 0.78821027, "learning_rate": 1.8158510848763692e-06, "loss": 0.81270581, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21166992, "step": 9039, "time_per_iteration": 2.8569648265838623 }, { "auxiliary_loss_clip": 0.01414498, "auxiliary_loss_mlp": 0.01039335, "balance_loss_clip": 1.24673426, "balance_loss_mlp": 1.01720965, "epoch": 0.5435142041184428, "flos": 23123278563840.0, "grad_norm": 2.0231369873598672, "language_loss": 0.7731545, "learning_rate": 1.8154632810733962e-06, "loss": 0.79769284, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.22131348, "step": 9040, "time_per_iteration": 2.9937820434570312 }, { "auxiliary_loss_clip": 0.01209577, "auxiliary_loss_mlp": 0.01036018, "balance_loss_clip": 1.11732531, "balance_loss_mlp": 1.01274812, "epoch": 0.5435743273711108, "flos": 64043166666240.0, "grad_norm": 0.6701205428892485, "language_loss": 0.52555817, "learning_rate": 1.815075484268074e-06, "loss": 0.5480141, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.23242188, "step": 9041, "time_per_iteration": 3.4729487895965576 }, { "auxiliary_loss_clip": 0.01414007, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.24550104, "balance_loss_mlp": 1.018538, "epoch": 0.5436344506237788, "flos": 25129756045440.0, "grad_norm": 1.6063892960454471, "language_loss": 0.76365924, "learning_rate": 1.8146876944751078e-06, "loss": 0.78819722, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.21264648, "step": 9042, "time_per_iteration": 3.0355684757232666 }, { "auxiliary_loss_clip": 0.01407105, "auxiliary_loss_mlp": 0.01040486, "balance_loss_clip": 1.24202943, "balance_loss_mlp": 1.01905227, "epoch": 0.5436945738764467, "flos": 19582355047680.0, "grad_norm": 2.1101972892227043, "language_loss": 0.68079531, "learning_rate": 1.8142999117092033e-06, "loss": 0.70527124, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.21435547, "step": 9043, "time_per_iteration": 5.672695159912109 }, { "auxiliary_loss_clip": 0.01404139, "auxiliary_loss_mlp": 0.01037612, "balance_loss_clip": 1.24001193, "balance_loss_mlp": 1.01712012, "epoch": 0.5437546971291147, "flos": 21152481246720.0, "grad_norm": 1.7199596387440441, "language_loss": 0.85026282, "learning_rate": 1.8139121359850644e-06, "loss": 0.8746804, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20483398, "step": 9044, "time_per_iteration": 2.846230983734131 }, { "auxiliary_loss_clip": 0.01431916, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.25846481, "balance_loss_mlp": 1.01384735, "epoch": 0.5438148203817826, "flos": 25129484576640.0, "grad_norm": 1.5958204975438293, "language_loss": 0.62387735, "learning_rate": 1.8135243673173956e-06, "loss": 0.64854443, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.20947266, "step": 9045, "time_per_iteration": 4.251859426498413 }, { "auxiliary_loss_clip": 0.01414492, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.2475419, "balance_loss_mlp": 1.01659346, "epoch": 0.5438749436344507, "flos": 23013297158400.0, "grad_norm": 1.9126515091823184, "language_loss": 0.71258557, "learning_rate": 1.8131366057209023e-06, "loss": 0.73710531, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20898438, "step": 9046, "time_per_iteration": 2.8860623836517334 }, { "auxiliary_loss_clip": 0.0140048, "auxiliary_loss_mlp": 0.01034639, "balance_loss_clip": 1.23645735, "balance_loss_mlp": 1.01420701, "epoch": 0.5439350668871186, "flos": 15495460801920.0, "grad_norm": 1.487736516374117, "language_loss": 0.78131527, "learning_rate": 1.8127488512102868e-06, "loss": 0.80566645, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2043457, "step": 9047, "time_per_iteration": 2.8618526458740234 }, { "auxiliary_loss_clip": 0.01417569, "auxiliary_loss_mlp": 0.01040007, "balance_loss_clip": 1.24991131, "balance_loss_mlp": 1.01758361, "epoch": 0.5439951901397866, "flos": 17247335938560.0, "grad_norm": 1.708040071749754, "language_loss": 0.73748779, "learning_rate": 1.8123611038002547e-06, "loss": 0.76206356, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.22436523, "step": 9048, "time_per_iteration": 2.814488410949707 }, { "auxiliary_loss_clip": 0.01407521, "auxiliary_loss_mlp": 0.01042717, "balance_loss_clip": 1.2419703, "balance_loss_mlp": 1.02019811, "epoch": 0.5440553133924545, "flos": 18670216999680.0, "grad_norm": 2.5192054874523233, "language_loss": 0.94065231, "learning_rate": 1.8119733635055076e-06, "loss": 0.96515465, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.2253418, "step": 9049, "time_per_iteration": 2.8397445678710938 }, { "auxiliary_loss_clip": 0.01405238, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.24053621, "balance_loss_mlp": 1.01852608, "epoch": 0.5441154366451225, "flos": 27133609328640.0, "grad_norm": 2.093694159076858, "language_loss": 0.7476896, "learning_rate": 1.8115856303407492e-06, "loss": 0.77212858, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20141602, "step": 9050, "time_per_iteration": 2.8776893615722656 }, { "auxiliary_loss_clip": 0.0142162, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.25305295, "balance_loss_mlp": 1.0146656, "epoch": 0.5441755598977904, "flos": 26004766095360.0, "grad_norm": 1.9916390275369067, "language_loss": 0.68226546, "learning_rate": 1.8111979043206832e-06, "loss": 0.70683527, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20690918, "step": 9051, "time_per_iteration": 2.8801233768463135 }, { "auxiliary_loss_clip": 0.01408842, "auxiliary_loss_mlp": 0.01039932, "balance_loss_clip": 1.24265301, "balance_loss_mlp": 1.0185101, "epoch": 0.5442356831504584, "flos": 32392899567360.0, "grad_norm": 1.6767264047926993, "language_loss": 0.68214607, "learning_rate": 1.810810185460011e-06, "loss": 0.70663381, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21435547, "step": 9052, "time_per_iteration": 2.936046600341797 }, { "auxiliary_loss_clip": 0.01413848, "auxiliary_loss_mlp": 0.01044674, "balance_loss_clip": 1.24552798, "balance_loss_mlp": 1.02269197, "epoch": 0.5442958064031264, "flos": 24173341320960.0, "grad_norm": 1.8095970766775444, "language_loss": 0.9393208, "learning_rate": 1.810422473773436e-06, "loss": 0.96390605, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21972656, "step": 9053, "time_per_iteration": 2.8618719577789307 }, { "auxiliary_loss_clip": 0.01421561, "auxiliary_loss_mlp": 0.01044905, "balance_loss_clip": 1.2518661, "balance_loss_mlp": 1.02310205, "epoch": 0.5443559296557944, "flos": 18772416299520.0, "grad_norm": 2.1869103240094145, "language_loss": 0.84693682, "learning_rate": 1.8100347692756595e-06, "loss": 0.87160152, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21789551, "step": 9054, "time_per_iteration": 2.808042049407959 }, { "auxiliary_loss_clip": 0.01418084, "auxiliary_loss_mlp": 0.01044048, "balance_loss_clip": 1.25048006, "balance_loss_mlp": 1.02273321, "epoch": 0.5444160529084624, "flos": 22641383750400.0, "grad_norm": 2.5098837502472695, "language_loss": 0.69735873, "learning_rate": 1.8096470719813836e-06, "loss": 0.72198009, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21337891, "step": 9055, "time_per_iteration": 2.8379151821136475 }, { "auxiliary_loss_clip": 0.01218372, "auxiliary_loss_mlp": 0.01046246, "balance_loss_clip": 1.12301707, "balance_loss_mlp": 1.01668262, "epoch": 0.5444761761611303, "flos": 69704802080640.0, "grad_norm": 0.7929515856145496, "language_loss": 0.57766998, "learning_rate": 1.80925938190531e-06, "loss": 0.60031617, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.29492188, "step": 9056, "time_per_iteration": 3.351738214492798 }, { "auxiliary_loss_clip": 0.01408997, "auxiliary_loss_mlp": 0.01041998, "balance_loss_clip": 1.23948872, "balance_loss_mlp": 1.01970553, "epoch": 0.5445362994137983, "flos": 14285393873280.0, "grad_norm": 2.210096255297203, "language_loss": 0.70902705, "learning_rate": 1.8088716990621395e-06, "loss": 0.73353696, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.22277832, "step": 9057, "time_per_iteration": 2.8208749294281006 }, { "auxiliary_loss_clip": 0.01398128, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.23238349, "balance_loss_mlp": 1.01689887, "epoch": 0.5445964226664662, "flos": 28997999579520.0, "grad_norm": 1.7962518849649793, "language_loss": 0.75496304, "learning_rate": 1.8084840234665738e-06, "loss": 0.77932978, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21643066, "step": 9058, "time_per_iteration": 2.9139692783355713 }, { "auxiliary_loss_clip": 0.01215417, "auxiliary_loss_mlp": 0.01046622, "balance_loss_clip": 1.12200165, "balance_loss_mlp": 1.01991892, "epoch": 0.5446565459191343, "flos": 68654513099520.0, "grad_norm": 0.815636861953567, "language_loss": 0.62775707, "learning_rate": 1.808096355133312e-06, "loss": 0.65037751, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.26757812, "step": 9059, "time_per_iteration": 3.379582166671753 }, { "auxiliary_loss_clip": 0.01393995, "auxiliary_loss_mlp": 0.01043516, "balance_loss_clip": 1.23090005, "balance_loss_mlp": 1.02143884, "epoch": 0.5447166691718022, "flos": 16225035505920.0, "grad_norm": 1.7890475447033054, "language_loss": 0.80328143, "learning_rate": 1.8077086940770572e-06, "loss": 0.82765657, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.22094727, "step": 9060, "time_per_iteration": 2.8314931392669678 }, { "auxiliary_loss_clip": 0.01404928, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.23935616, "balance_loss_mlp": 1.01745987, "epoch": 0.5447767924244702, "flos": 25860190400640.0, "grad_norm": 1.598708239119304, "language_loss": 0.80958223, "learning_rate": 1.8073210403125072e-06, "loss": 0.83402121, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.21520996, "step": 9061, "time_per_iteration": 2.8762452602386475 }, { "auxiliary_loss_clip": 0.01402366, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.23723841, "balance_loss_mlp": 1.01356292, "epoch": 0.5448369156771381, "flos": 19685911691520.0, "grad_norm": 1.7327631731673026, "language_loss": 0.875193, "learning_rate": 1.8069333938543627e-06, "loss": 0.89956671, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.21447754, "step": 9062, "time_per_iteration": 2.8251326084136963 }, { "auxiliary_loss_clip": 0.01421981, "auxiliary_loss_mlp": 0.01032635, "balance_loss_clip": 1.24990225, "balance_loss_mlp": 1.01154697, "epoch": 0.5448970389298061, "flos": 19291303376640.0, "grad_norm": 2.060732136789699, "language_loss": 0.8362726, "learning_rate": 1.8065457547173233e-06, "loss": 0.86081874, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.2109375, "step": 9063, "time_per_iteration": 2.812537908554077 }, { "auxiliary_loss_clip": 0.01413823, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.2453723, "balance_loss_mlp": 1.0136106, "epoch": 0.544957162182474, "flos": 21000349670400.0, "grad_norm": 1.5095463910732438, "language_loss": 0.63933098, "learning_rate": 1.8061581229160878e-06, "loss": 0.66381735, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.2121582, "step": 9064, "time_per_iteration": 2.8358242511749268 }, { "auxiliary_loss_clip": 0.01416059, "auxiliary_loss_mlp": 0.01036953, "balance_loss_clip": 1.24573803, "balance_loss_mlp": 1.01541162, "epoch": 0.545017285435142, "flos": 25385580000000.0, "grad_norm": 1.6284050072801928, "language_loss": 0.80852187, "learning_rate": 1.8057704984653566e-06, "loss": 0.83305192, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.21557617, "step": 9065, "time_per_iteration": 2.9099161624908447 }, { "auxiliary_loss_clip": 0.01402708, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.2377243, "balance_loss_mlp": 1.01696527, "epoch": 0.54507740868781, "flos": 19143922504320.0, "grad_norm": 1.9522535492158117, "language_loss": 0.79123825, "learning_rate": 1.805382881379827e-06, "loss": 0.81563103, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19616699, "step": 9066, "time_per_iteration": 4.251797914505005 }, { "auxiliary_loss_clip": 0.01420348, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.24934518, "balance_loss_mlp": 1.01408184, "epoch": 0.545137531940478, "flos": 26260906763520.0, "grad_norm": 1.8309276368620941, "language_loss": 0.76377088, "learning_rate": 1.8049952716741975e-06, "loss": 0.78832781, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21264648, "step": 9067, "time_per_iteration": 2.8982961177825928 }, { "auxiliary_loss_clip": 0.01427648, "auxiliary_loss_mlp": 0.0103629, "balance_loss_clip": 1.25444913, "balance_loss_mlp": 1.01397431, "epoch": 0.545197655193146, "flos": 37568432401920.0, "grad_norm": 1.8702867626071644, "language_loss": 0.63930261, "learning_rate": 1.8046076693631682e-06, "loss": 0.66394198, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.2232666, "step": 9068, "time_per_iteration": 2.991487979888916 }, { "auxiliary_loss_clip": 0.01401014, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.23740208, "balance_loss_mlp": 1.01337528, "epoch": 0.5452577784458139, "flos": 26042391786240.0, "grad_norm": 2.0900795366328855, "language_loss": 0.72821689, "learning_rate": 1.8042200744614343e-06, "loss": 0.75257176, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.2109375, "step": 9069, "time_per_iteration": 2.900423526763916 }, { "auxiliary_loss_clip": 0.01408573, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.24385619, "balance_loss_mlp": 1.01436067, "epoch": 0.5453179016984819, "flos": 17647871322240.0, "grad_norm": 2.3405339600953323, "language_loss": 0.74767733, "learning_rate": 1.8038324869836957e-06, "loss": 0.77210212, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19543457, "step": 9070, "time_per_iteration": 2.8094053268432617 }, { "auxiliary_loss_clip": 0.01414261, "auxiliary_loss_mlp": 0.01030778, "balance_loss_clip": 1.24819767, "balance_loss_mlp": 1.01040506, "epoch": 0.5453780249511498, "flos": 23225749332480.0, "grad_norm": 1.7593845303393616, "language_loss": 0.61334419, "learning_rate": 1.8034449069446489e-06, "loss": 0.63779461, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20361328, "step": 9071, "time_per_iteration": 2.8430984020233154 }, { "auxiliary_loss_clip": 0.0121071, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.11938488, "balance_loss_mlp": 1.00983202, "epoch": 0.5454381482038179, "flos": 68731664762880.0, "grad_norm": 0.7022042239494368, "language_loss": 0.57212991, "learning_rate": 1.80305733435899e-06, "loss": 0.59451652, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.18164062, "step": 9072, "time_per_iteration": 3.483304977416992 }, { "auxiliary_loss_clip": 0.01397164, "auxiliary_loss_mlp": 0.01038445, "balance_loss_clip": 1.2338469, "balance_loss_mlp": 1.01720178, "epoch": 0.5454982714564858, "flos": 13268296592640.0, "grad_norm": 2.081102575821753, "language_loss": 0.71088165, "learning_rate": 1.8026697692414174e-06, "loss": 0.73523772, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.21252441, "step": 9073, "time_per_iteration": 2.8298094272613525 }, { "auxiliary_loss_clip": 0.01401523, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.23925257, "balance_loss_mlp": 1.01154768, "epoch": 0.5455583947091538, "flos": 21846059072640.0, "grad_norm": 1.6937981585483044, "language_loss": 0.72343028, "learning_rate": 1.802282211606627e-06, "loss": 0.74776357, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20275879, "step": 9074, "time_per_iteration": 2.8695781230926514 }, { "auxiliary_loss_clip": 0.01413754, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.24655557, "balance_loss_mlp": 1.01460028, "epoch": 0.5456185179618217, "flos": 17825276759040.0, "grad_norm": 1.9294554315583292, "language_loss": 0.69292313, "learning_rate": 1.8018946614693148e-06, "loss": 0.7174052, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.1986084, "step": 9075, "time_per_iteration": 2.906996488571167 }, { "auxiliary_loss_clip": 0.01412335, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.24792719, "balance_loss_mlp": 1.01470113, "epoch": 0.5456786412144897, "flos": 21079311125760.0, "grad_norm": 1.7518203280597282, "language_loss": 0.8161, "learning_rate": 1.8015071188441768e-06, "loss": 0.84056938, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19909668, "step": 9076, "time_per_iteration": 2.8891210556030273 }, { "auxiliary_loss_clip": 0.0142443, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 1.2571981, "balance_loss_mlp": 1.01422584, "epoch": 0.5457387644671576, "flos": 23305434704640.0, "grad_norm": 2.121942328085589, "language_loss": 0.81235451, "learning_rate": 1.8011195837459089e-06, "loss": 0.83692944, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.18835449, "step": 9077, "time_per_iteration": 2.827404260635376 }, { "auxiliary_loss_clip": 0.0141918, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.25148618, "balance_loss_mlp": 1.01443219, "epoch": 0.5457988877198257, "flos": 21627182136960.0, "grad_norm": 1.9758339813509054, "language_loss": 0.68987453, "learning_rate": 1.8007320561892064e-06, "loss": 0.71440846, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19775391, "step": 9078, "time_per_iteration": 5.748022556304932 }, { "auxiliary_loss_clip": 0.01427681, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.25812495, "balance_loss_mlp": 1.01267743, "epoch": 0.5458590109724936, "flos": 23771312858880.0, "grad_norm": 1.7364010211421763, "language_loss": 0.81566846, "learning_rate": 1.800344536188764e-06, "loss": 0.84027505, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20300293, "step": 9079, "time_per_iteration": 2.8544726371765137 }, { "auxiliary_loss_clip": 0.01437032, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.26376939, "balance_loss_mlp": 1.01611495, "epoch": 0.5459191342251616, "flos": 24434594651520.0, "grad_norm": 1.627042112276473, "language_loss": 0.7660802, "learning_rate": 1.799957023759277e-06, "loss": 0.79081893, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20703125, "step": 9080, "time_per_iteration": 4.230427980422974 }, { "auxiliary_loss_clip": 0.01421601, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.25229955, "balance_loss_mlp": 1.01698351, "epoch": 0.5459792574778296, "flos": 23633478639360.0, "grad_norm": 2.5115755437337826, "language_loss": 0.84320474, "learning_rate": 1.7995695189154392e-06, "loss": 0.86779666, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20617676, "step": 9081, "time_per_iteration": 2.84775972366333 }, { "auxiliary_loss_clip": 0.01432165, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.2618475, "balance_loss_mlp": 1.01448989, "epoch": 0.5460393807304975, "flos": 19144917889920.0, "grad_norm": 1.817337853522643, "language_loss": 0.703619, "learning_rate": 1.7991820216719461e-06, "loss": 0.72829562, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21032715, "step": 9082, "time_per_iteration": 2.8508224487304688 }, { "auxiliary_loss_clip": 0.01413143, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.24883616, "balance_loss_mlp": 1.01304412, "epoch": 0.5460995039831655, "flos": 35932149025920.0, "grad_norm": 3.945495440577287, "language_loss": 0.67164081, "learning_rate": 1.7987945320434906e-06, "loss": 0.69610751, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20495605, "step": 9083, "time_per_iteration": 2.961620569229126 }, { "auxiliary_loss_clip": 0.01409304, "auxiliary_loss_mlp": 0.01032927, "balance_loss_clip": 1.24584067, "balance_loss_mlp": 1.01319766, "epoch": 0.5461596272358334, "flos": 26770111453440.0, "grad_norm": 1.758398148162635, "language_loss": 0.80121392, "learning_rate": 1.798407050044766e-06, "loss": 0.82563621, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19714355, "step": 9084, "time_per_iteration": 2.8601982593536377 }, { "auxiliary_loss_clip": 0.01432713, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.26348233, "balance_loss_mlp": 1.01948214, "epoch": 0.5462197504885015, "flos": 20896340578560.0, "grad_norm": 1.744454373484502, "language_loss": 0.76453453, "learning_rate": 1.7980195756904675e-06, "loss": 0.78925508, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19873047, "step": 9085, "time_per_iteration": 2.8377902507781982 }, { "auxiliary_loss_clip": 0.01422779, "auxiliary_loss_mlp": 0.01037196, "balance_loss_clip": 1.25283909, "balance_loss_mlp": 1.01788425, "epoch": 0.5462798737411694, "flos": 25814149176960.0, "grad_norm": 1.7539367059886104, "language_loss": 0.75391912, "learning_rate": 1.7976321089952857e-06, "loss": 0.77851892, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.19299316, "step": 9086, "time_per_iteration": 2.8716304302215576 }, { "auxiliary_loss_clip": 0.01425355, "auxiliary_loss_mlp": 0.01034201, "balance_loss_clip": 1.25871301, "balance_loss_mlp": 1.01382792, "epoch": 0.5463399969938374, "flos": 25785889159680.0, "grad_norm": 1.5799684673524834, "language_loss": 0.77690279, "learning_rate": 1.7972446499739155e-06, "loss": 0.80149835, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20373535, "step": 9087, "time_per_iteration": 2.9198362827301025 }, { "auxiliary_loss_clip": 0.01425479, "auxiliary_loss_mlp": 0.0103961, "balance_loss_clip": 1.25619125, "balance_loss_mlp": 1.0185219, "epoch": 0.5464001202465053, "flos": 18852508874880.0, "grad_norm": 1.7114051372167765, "language_loss": 0.77953506, "learning_rate": 1.7968571986410484e-06, "loss": 0.80418599, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.21105957, "step": 9088, "time_per_iteration": 2.902883529663086 }, { "auxiliary_loss_clip": 0.01228308, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.12959051, "balance_loss_mlp": 1.02449036, "epoch": 0.5464602434991733, "flos": 69081724932480.0, "grad_norm": 0.7572139893915811, "language_loss": 0.57790196, "learning_rate": 1.7964697550113758e-06, "loss": 0.60064739, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.21777344, "step": 9089, "time_per_iteration": 3.3946759700775146 }, { "auxiliary_loss_clip": 0.01437776, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.26642001, "balance_loss_mlp": 1.0173614, "epoch": 0.5465203667518412, "flos": 27570639283200.0, "grad_norm": 1.6720913180745938, "language_loss": 0.77665651, "learning_rate": 1.7960823190995918e-06, "loss": 0.80140185, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.19421387, "step": 9090, "time_per_iteration": 2.909102201461792 }, { "auxiliary_loss_clip": 0.01437836, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.26648235, "balance_loss_mlp": 1.02219546, "epoch": 0.5465804900045093, "flos": 21218819402880.0, "grad_norm": 1.9608294179543067, "language_loss": 0.74279493, "learning_rate": 1.7956948909203855e-06, "loss": 0.7676053, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.21020508, "step": 9091, "time_per_iteration": 2.885824680328369 }, { "auxiliary_loss_clip": 0.01424677, "auxiliary_loss_mlp": 0.010446, "balance_loss_clip": 1.25683045, "balance_loss_mlp": 1.02445364, "epoch": 0.5466406132571772, "flos": 22498346378880.0, "grad_norm": 1.7729110363318752, "language_loss": 0.78730679, "learning_rate": 1.7953074704884498e-06, "loss": 0.8119995, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20153809, "step": 9092, "time_per_iteration": 2.843961238861084 }, { "auxiliary_loss_clip": 0.01433869, "auxiliary_loss_mlp": 0.01043794, "balance_loss_clip": 1.26273465, "balance_loss_mlp": 1.0235641, "epoch": 0.5467007365098452, "flos": 17684773096320.0, "grad_norm": 3.648892480836694, "language_loss": 0.76190746, "learning_rate": 1.794920057818476e-06, "loss": 0.78668404, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20227051, "step": 9093, "time_per_iteration": 2.8119425773620605 }, { "auxiliary_loss_clip": 0.01427372, "auxiliary_loss_mlp": 0.01043056, "balance_loss_clip": 1.25674045, "balance_loss_mlp": 1.02035809, "epoch": 0.5467608597625132, "flos": 15705515001600.0, "grad_norm": 2.064180699333355, "language_loss": 0.69942135, "learning_rate": 1.7945326529251533e-06, "loss": 0.72412562, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.22705078, "step": 9094, "time_per_iteration": 2.8016257286071777 }, { "auxiliary_loss_clip": 0.01434192, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.26584005, "balance_loss_mlp": 1.02062654, "epoch": 0.5468209830151811, "flos": 24322577230080.0, "grad_norm": 3.5633570993508648, "language_loss": 0.68961048, "learning_rate": 1.7941452558231731e-06, "loss": 0.71435988, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20117188, "step": 9095, "time_per_iteration": 2.8455612659454346 }, { "auxiliary_loss_clip": 0.01441145, "auxiliary_loss_mlp": 0.01044109, "balance_loss_clip": 1.27251935, "balance_loss_mlp": 1.02439237, "epoch": 0.5468811062678491, "flos": 29177576766720.0, "grad_norm": 1.6151145072257393, "language_loss": 0.67951763, "learning_rate": 1.7937578665272256e-06, "loss": 0.70437014, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19702148, "step": 9096, "time_per_iteration": 2.893735647201538 }, { "auxiliary_loss_clip": 0.01231394, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.12970591, "balance_loss_mlp": 1.02213001, "epoch": 0.546941229520517, "flos": 67895031582720.0, "grad_norm": 0.7502373525587244, "language_loss": 0.57557315, "learning_rate": 1.7933704850520007e-06, "loss": 0.5983125, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.20410156, "step": 9097, "time_per_iteration": 3.475468397140503 }, { "auxiliary_loss_clip": 0.01225701, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.12753582, "balance_loss_mlp": 1.01118755, "epoch": 0.5470013527731851, "flos": 58295556852480.0, "grad_norm": 0.9287890373437211, "language_loss": 0.64855701, "learning_rate": 1.7929831114121868e-06, "loss": 0.67114425, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 0.98046875, "router_z_loss_mlp": 0.21875, "step": 9098, "time_per_iteration": 3.2841601371765137 }, { "auxiliary_loss_clip": 0.01434948, "auxiliary_loss_mlp": 0.01044402, "balance_loss_clip": 1.2634964, "balance_loss_mlp": 1.02255058, "epoch": 0.547061476025853, "flos": 22976123915520.0, "grad_norm": 1.814869540371251, "language_loss": 0.74431217, "learning_rate": 1.7925957456224753e-06, "loss": 0.76910567, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.21838379, "step": 9099, "time_per_iteration": 2.8698720932006836 }, { "auxiliary_loss_clip": 0.01420325, "auxiliary_loss_mlp": 0.01035265, "balance_loss_clip": 1.25331104, "balance_loss_mlp": 1.0164541, "epoch": 0.547121599278521, "flos": 29979507185280.0, "grad_norm": 1.879240180359159, "language_loss": 0.74631906, "learning_rate": 1.7922083876975537e-06, "loss": 0.77087498, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18811035, "step": 9100, "time_per_iteration": 2.9458322525024414 }, { "auxiliary_loss_clip": 0.01418656, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.25176024, "balance_loss_mlp": 1.0145874, "epoch": 0.5471817225311889, "flos": 36548348964480.0, "grad_norm": 2.2143472904602706, "language_loss": 0.69005001, "learning_rate": 1.7918210376521102e-06, "loss": 0.71460104, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.21862793, "step": 9101, "time_per_iteration": 4.3752782344818115 }, { "auxiliary_loss_clip": 0.01417056, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.24988127, "balance_loss_mlp": 1.01309276, "epoch": 0.5472418457838569, "flos": 25786160628480.0, "grad_norm": 1.6930406978861114, "language_loss": 0.78603065, "learning_rate": 1.7914336955008343e-06, "loss": 0.81051975, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.18762207, "step": 9102, "time_per_iteration": 2.9035396575927734 }, { "auxiliary_loss_clip": 0.01408768, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.24585247, "balance_loss_mlp": 1.0148437, "epoch": 0.5473019690365248, "flos": 27898185525120.0, "grad_norm": 1.415233596086656, "language_loss": 0.72948146, "learning_rate": 1.791046361258413e-06, "loss": 0.75392479, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20727539, "step": 9103, "time_per_iteration": 2.898728847503662 }, { "auxiliary_loss_clip": 0.0142368, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.25664306, "balance_loss_mlp": 1.01296484, "epoch": 0.5473620922891929, "flos": 57651757585920.0, "grad_norm": 1.2672445067178304, "language_loss": 0.65769935, "learning_rate": 1.7906590349395356e-06, "loss": 0.68227983, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21411133, "step": 9104, "time_per_iteration": 3.1887693405151367 }, { "auxiliary_loss_clip": 0.01414155, "auxiliary_loss_mlp": 0.01036373, "balance_loss_clip": 1.24547553, "balance_loss_mlp": 1.01515365, "epoch": 0.5474222155418608, "flos": 19363387622400.0, "grad_norm": 1.757954234944769, "language_loss": 0.82896346, "learning_rate": 1.790271716558888e-06, "loss": 0.85346872, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.2121582, "step": 9105, "time_per_iteration": 2.82035493850708 }, { "auxiliary_loss_clip": 0.01412953, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.24735045, "balance_loss_mlp": 1.01351643, "epoch": 0.5474823387945288, "flos": 25131701571840.0, "grad_norm": 1.5114408110847612, "language_loss": 0.81053388, "learning_rate": 1.7898844061311575e-06, "loss": 0.83499074, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19213867, "step": 9106, "time_per_iteration": 2.890048027038574 }, { "auxiliary_loss_clip": 0.01429468, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.26041079, "balance_loss_mlp": 1.01400006, "epoch": 0.5475424620471967, "flos": 18013178989440.0, "grad_norm": 2.350112029112099, "language_loss": 0.7035799, "learning_rate": 1.7894971036710322e-06, "loss": 0.72821575, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20117188, "step": 9107, "time_per_iteration": 2.792612075805664 }, { "auxiliary_loss_clip": 0.01425682, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.25583148, "balance_loss_mlp": 1.01036358, "epoch": 0.5476025852998647, "flos": 22319402618880.0, "grad_norm": 1.7959530258903016, "language_loss": 0.6453619, "learning_rate": 1.789109809193197e-06, "loss": 0.66992933, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20690918, "step": 9108, "time_per_iteration": 2.8813259601593018 }, { "auxiliary_loss_clip": 0.01414844, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.24770164, "balance_loss_mlp": 1.01202011, "epoch": 0.5476627085525327, "flos": 20130090324480.0, "grad_norm": 1.783881515514454, "language_loss": 0.76026726, "learning_rate": 1.7887225227123396e-06, "loss": 0.78473318, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.1973877, "step": 9109, "time_per_iteration": 2.8597776889801025 }, { "auxiliary_loss_clip": 0.01416666, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.25138867, "balance_loss_mlp": 1.01686621, "epoch": 0.5477228318052006, "flos": 17721584380800.0, "grad_norm": 10.429240478908586, "language_loss": 0.78085279, "learning_rate": 1.7883352442431457e-06, "loss": 0.80540133, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.21325684, "step": 9110, "time_per_iteration": 2.8172390460968018 }, { "auxiliary_loss_clip": 0.01403497, "auxiliary_loss_mlp": 0.0103124, "balance_loss_clip": 1.2403003, "balance_loss_mlp": 1.01247668, "epoch": 0.5477829550578687, "flos": 25860009421440.0, "grad_norm": 1.5026533163110216, "language_loss": 0.72233093, "learning_rate": 1.7879479738002993e-06, "loss": 0.74667835, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.1875, "step": 9111, "time_per_iteration": 2.8923873901367188 }, { "auxiliary_loss_clip": 0.01410487, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.24341941, "balance_loss_mlp": 1.01373363, "epoch": 0.5478430783105366, "flos": 23050108442880.0, "grad_norm": 1.680443442609996, "language_loss": 0.71696734, "learning_rate": 1.7875607113984876e-06, "loss": 0.74141729, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.2076416, "step": 9112, "time_per_iteration": 2.838939905166626 }, { "auxiliary_loss_clip": 0.0142364, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.25425351, "balance_loss_mlp": 1.01693916, "epoch": 0.5479032015632046, "flos": 16079916873600.0, "grad_norm": 2.0893631248831617, "language_loss": 0.89657182, "learning_rate": 1.7871734570523953e-06, "loss": 0.92118084, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.203125, "step": 9113, "time_per_iteration": 4.217290878295898 }, { "auxiliary_loss_clip": 0.01416381, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.24831152, "balance_loss_mlp": 1.01331198, "epoch": 0.5479633248158725, "flos": 24289068816000.0, "grad_norm": 1.4790353052585457, "language_loss": 0.73941541, "learning_rate": 1.7867862107767067e-06, "loss": 0.7639184, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20593262, "step": 9114, "time_per_iteration": 5.6567277908325195 }, { "auxiliary_loss_clip": 0.01414523, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 1.24769163, "balance_loss_mlp": 1.01514208, "epoch": 0.5480234480685405, "flos": 26369078376960.0, "grad_norm": 1.6329720493439075, "language_loss": 0.72853744, "learning_rate": 1.7863989725861066e-06, "loss": 0.7530216, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18737793, "step": 9115, "time_per_iteration": 2.9427483081817627 }, { "auxiliary_loss_clip": 0.01429097, "auxiliary_loss_mlp": 0.01031601, "balance_loss_clip": 1.25761294, "balance_loss_mlp": 1.01131153, "epoch": 0.5480835713212084, "flos": 22065388456320.0, "grad_norm": 1.9015215179642648, "language_loss": 0.73013687, "learning_rate": 1.7860117424952781e-06, "loss": 0.75474387, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20288086, "step": 9116, "time_per_iteration": 2.8350021839141846 }, { "auxiliary_loss_clip": 0.01416386, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.24936485, "balance_loss_mlp": 1.01812971, "epoch": 0.5481436945738765, "flos": 25311866941440.0, "grad_norm": 1.9106291823324282, "language_loss": 0.77558577, "learning_rate": 1.7856245205189063e-06, "loss": 0.80014318, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.2121582, "step": 9117, "time_per_iteration": 2.995396852493286 }, { "auxiliary_loss_clip": 0.01398214, "auxiliary_loss_mlp": 0.01034531, "balance_loss_clip": 1.23513973, "balance_loss_mlp": 1.01300216, "epoch": 0.5482038178265444, "flos": 33593193619200.0, "grad_norm": 1.633547070010084, "language_loss": 0.63301086, "learning_rate": 1.785237306671674e-06, "loss": 0.65733826, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.21533203, "step": 9118, "time_per_iteration": 2.971527576446533 }, { "auxiliary_loss_clip": 0.01439007, "auxiliary_loss_mlp": 0.01035951, "balance_loss_clip": 1.26688576, "balance_loss_mlp": 1.01499391, "epoch": 0.5482639410792124, "flos": 19035569911680.0, "grad_norm": 1.7211827462865907, "language_loss": 0.7939564, "learning_rate": 1.7848501009682646e-06, "loss": 0.81870598, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.20947266, "step": 9119, "time_per_iteration": 2.8280746936798096 }, { "auxiliary_loss_clip": 0.01414366, "auxiliary_loss_mlp": 0.01034254, "balance_loss_clip": 1.24995995, "balance_loss_mlp": 1.01494169, "epoch": 0.5483240643318803, "flos": 25421124430080.0, "grad_norm": 1.7098565852536771, "language_loss": 0.83084321, "learning_rate": 1.7844629034233604e-06, "loss": 0.85532945, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1932373, "step": 9120, "time_per_iteration": 2.885385513305664 }, { "auxiliary_loss_clip": 0.01414931, "auxiliary_loss_mlp": 0.01033089, "balance_loss_clip": 1.24749422, "balance_loss_mlp": 1.01277542, "epoch": 0.5483841875845483, "flos": 21475910211840.0, "grad_norm": 2.6630875586881873, "language_loss": 0.80689973, "learning_rate": 1.7840757140516455e-06, "loss": 0.83137995, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20324707, "step": 9121, "time_per_iteration": 2.863783836364746 }, { "auxiliary_loss_clip": 0.01422288, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.25311947, "balance_loss_mlp": 1.01824164, "epoch": 0.5484443108372163, "flos": 24756259069440.0, "grad_norm": 1.686944193010063, "language_loss": 0.62179297, "learning_rate": 1.7836885328678008e-06, "loss": 0.64640033, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20202637, "step": 9122, "time_per_iteration": 2.8834924697875977 }, { "auxiliary_loss_clip": 0.01422471, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.25755072, "balance_loss_mlp": 1.02033067, "epoch": 0.5485044340898843, "flos": 25386575385600.0, "grad_norm": 1.7167641995892524, "language_loss": 0.72045803, "learning_rate": 1.7833013598865084e-06, "loss": 0.74508435, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19836426, "step": 9123, "time_per_iteration": 2.896503448486328 }, { "auxiliary_loss_clip": 0.01425183, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.25906563, "balance_loss_mlp": 1.01754475, "epoch": 0.5485645573425523, "flos": 12648884273280.0, "grad_norm": 1.9086068921915762, "language_loss": 0.83992553, "learning_rate": 1.7829141951224505e-06, "loss": 0.86454606, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1932373, "step": 9124, "time_per_iteration": 2.889718770980835 }, { "auxiliary_loss_clip": 0.01407737, "auxiliary_loss_mlp": 0.01033807, "balance_loss_clip": 1.24439597, "balance_loss_mlp": 1.0132314, "epoch": 0.5486246805952202, "flos": 28341052058880.0, "grad_norm": 2.035248399097569, "language_loss": 0.80967081, "learning_rate": 1.7825270385903075e-06, "loss": 0.83408618, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20581055, "step": 9125, "time_per_iteration": 2.9590001106262207 }, { "auxiliary_loss_clip": 0.01418822, "auxiliary_loss_mlp": 0.01036112, "balance_loss_clip": 1.24951625, "balance_loss_mlp": 1.01607335, "epoch": 0.5486848038478882, "flos": 16808677171200.0, "grad_norm": 1.8648248448535556, "language_loss": 0.74926645, "learning_rate": 1.7821398903047617e-06, "loss": 0.77381581, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20019531, "step": 9126, "time_per_iteration": 2.8152525424957275 }, { "auxiliary_loss_clip": 0.01420097, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.24980366, "balance_loss_mlp": 1.01529527, "epoch": 0.5487449271005561, "flos": 17244576005760.0, "grad_norm": 2.367891701545487, "language_loss": 0.68752348, "learning_rate": 1.7817527502804928e-06, "loss": 0.71208894, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.21166992, "step": 9127, "time_per_iteration": 2.8367934226989746 }, { "auxiliary_loss_clip": 0.01413214, "auxiliary_loss_mlp": 0.01036266, "balance_loss_clip": 1.24756217, "balance_loss_mlp": 1.01538038, "epoch": 0.5488050503532241, "flos": 17347996915200.0, "grad_norm": 1.8270200155276102, "language_loss": 0.83728671, "learning_rate": 1.781365618532181e-06, "loss": 0.86178154, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20874023, "step": 9128, "time_per_iteration": 2.8737471103668213 }, { "auxiliary_loss_clip": 0.01412442, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.24706411, "balance_loss_mlp": 1.01405191, "epoch": 0.548865173605892, "flos": 17248286079360.0, "grad_norm": 1.8777837852238617, "language_loss": 0.75239825, "learning_rate": 1.7809784950745078e-06, "loss": 0.7768662, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20288086, "step": 9129, "time_per_iteration": 2.8646817207336426 }, { "auxiliary_loss_clip": 0.01439414, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.26910925, "balance_loss_mlp": 1.01350307, "epoch": 0.5489252968585601, "flos": 17465534202240.0, "grad_norm": 2.3424393028962958, "language_loss": 0.64374471, "learning_rate": 1.7805913799221511e-06, "loss": 0.66847759, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.20373535, "step": 9130, "time_per_iteration": 2.8226993083953857 }, { "auxiliary_loss_clip": 0.01430506, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.26136446, "balance_loss_mlp": 1.01856661, "epoch": 0.548985420111228, "flos": 26334665066880.0, "grad_norm": 1.9106604047108067, "language_loss": 0.63760149, "learning_rate": 1.7802042730897915e-06, "loss": 0.66229761, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20544434, "step": 9131, "time_per_iteration": 2.8641152381896973 }, { "auxiliary_loss_clip": 0.01430934, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.26319265, "balance_loss_mlp": 1.01565766, "epoch": 0.549045543363896, "flos": 18701960866560.0, "grad_norm": 2.1191353162236535, "language_loss": 0.76219618, "learning_rate": 1.7798171745921084e-06, "loss": 0.78686619, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20410156, "step": 9132, "time_per_iteration": 2.8237524032592773 }, { "auxiliary_loss_clip": 0.01419595, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.25114346, "balance_loss_mlp": 1.0139842, "epoch": 0.5491056666165639, "flos": 24728089541760.0, "grad_norm": 2.166564023913624, "language_loss": 0.82134151, "learning_rate": 1.7794300844437795e-06, "loss": 0.84587872, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20129395, "step": 9133, "time_per_iteration": 2.8673338890075684 }, { "auxiliary_loss_clip": 0.01414931, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.24899983, "balance_loss_mlp": 1.01943469, "epoch": 0.5491657898692319, "flos": 21586117841280.0, "grad_norm": 1.6498032937855793, "language_loss": 0.70865196, "learning_rate": 1.7790430026594841e-06, "loss": 0.7332027, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20703125, "step": 9134, "time_per_iteration": 2.8482511043548584 }, { "auxiliary_loss_clip": 0.01434241, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.26419389, "balance_loss_mlp": 1.01498771, "epoch": 0.5492259131219, "flos": 50493844765440.0, "grad_norm": 2.1405621517916695, "language_loss": 0.61300993, "learning_rate": 1.7786559292539004e-06, "loss": 0.63769424, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.19189453, "step": 9135, "time_per_iteration": 3.0730113983154297 }, { "auxiliary_loss_clip": 0.01421072, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.25238109, "balance_loss_mlp": 1.01272702, "epoch": 0.5492860363745679, "flos": 25130208493440.0, "grad_norm": 1.8718978544378502, "language_loss": 0.73753494, "learning_rate": 1.7782688642417058e-06, "loss": 0.76208329, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21032715, "step": 9136, "time_per_iteration": 4.296404123306274 }, { "auxiliary_loss_clip": 0.0143881, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.26569057, "balance_loss_mlp": 1.01810431, "epoch": 0.5493461596272359, "flos": 22642922073600.0, "grad_norm": 2.0516947540541826, "language_loss": 0.69200778, "learning_rate": 1.7778818076375781e-06, "loss": 0.71678388, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20703125, "step": 9137, "time_per_iteration": 2.8520023822784424 }, { "auxiliary_loss_clip": 0.01236231, "auxiliary_loss_mlp": 0.01036186, "balance_loss_clip": 1.13758874, "balance_loss_mlp": 1.01777995, "epoch": 0.5494062828799038, "flos": 66179243813760.0, "grad_norm": 0.7565129713164069, "language_loss": 0.65355521, "learning_rate": 1.7774947594561947e-06, "loss": 0.67627937, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.18359375, "step": 9138, "time_per_iteration": 3.4025259017944336 }, { "auxiliary_loss_clip": 0.01429049, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.25995636, "balance_loss_mlp": 1.01461124, "epoch": 0.5494664061325718, "flos": 21115896186240.0, "grad_norm": 1.7503214316382862, "language_loss": 0.75811285, "learning_rate": 1.7771077197122321e-06, "loss": 0.78276235, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21289062, "step": 9139, "time_per_iteration": 2.8350234031677246 }, { "auxiliary_loss_clip": 0.0140333, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.23902702, "balance_loss_mlp": 1.01374853, "epoch": 0.5495265293852397, "flos": 14400759409920.0, "grad_norm": 1.9398652840921373, "language_loss": 0.72375572, "learning_rate": 1.7767206884203672e-06, "loss": 0.7481277, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20129395, "step": 9140, "time_per_iteration": 2.839172840118408 }, { "auxiliary_loss_clip": 0.01406413, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.2410804, "balance_loss_mlp": 1.0087539, "epoch": 0.5495866526379077, "flos": 25558144243200.0, "grad_norm": 1.8307502302713003, "language_loss": 0.77158189, "learning_rate": 1.7763336655952762e-06, "loss": 0.79593611, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20239258, "step": 9141, "time_per_iteration": 2.8996851444244385 }, { "auxiliary_loss_clip": 0.01405942, "auxiliary_loss_mlp": 0.01035662, "balance_loss_clip": 1.2451148, "balance_loss_mlp": 1.01581347, "epoch": 0.5496467758905756, "flos": 21325271713920.0, "grad_norm": 2.2492073298881605, "language_loss": 0.75823903, "learning_rate": 1.7759466512516346e-06, "loss": 0.782655, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19848633, "step": 9142, "time_per_iteration": 2.897500991821289 }, { "auxiliary_loss_clip": 0.01429356, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.26061821, "balance_loss_mlp": 1.0151875, "epoch": 0.5497068991432437, "flos": 22242296200320.0, "grad_norm": 2.2799499677690456, "language_loss": 0.77517581, "learning_rate": 1.7755596454041192e-06, "loss": 0.79982847, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20703125, "step": 9143, "time_per_iteration": 2.837388753890991 }, { "auxiliary_loss_clip": 0.01423348, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.25788271, "balance_loss_mlp": 1.01751399, "epoch": 0.5497670223959116, "flos": 18488377572480.0, "grad_norm": 2.4489796133825164, "language_loss": 0.81579709, "learning_rate": 1.7751726480674044e-06, "loss": 0.84041506, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20935059, "step": 9144, "time_per_iteration": 2.8427326679229736 }, { "auxiliary_loss_clip": 0.01430124, "auxiliary_loss_mlp": 0.01031306, "balance_loss_clip": 1.26306677, "balance_loss_mlp": 1.01108837, "epoch": 0.5498271456485796, "flos": 29216967004800.0, "grad_norm": 1.7541524300642592, "language_loss": 0.71561837, "learning_rate": 1.7747856592561645e-06, "loss": 0.74023271, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20202637, "step": 9145, "time_per_iteration": 2.88031005859375 }, { "auxiliary_loss_clip": 0.014272, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.26067781, "balance_loss_mlp": 1.01764894, "epoch": 0.5498872689012475, "flos": 34837809592320.0, "grad_norm": 1.4768600797483837, "language_loss": 0.70829028, "learning_rate": 1.774398678985076e-06, "loss": 0.73293436, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19555664, "step": 9146, "time_per_iteration": 2.9555160999298096 }, { "auxiliary_loss_clip": 0.01402181, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.24050188, "balance_loss_mlp": 1.0164609, "epoch": 0.5499473921539155, "flos": 25933089052800.0, "grad_norm": 1.7018950182593435, "language_loss": 0.65263987, "learning_rate": 1.7740117072688113e-06, "loss": 0.6770215, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19543457, "step": 9147, "time_per_iteration": 2.8988664150238037 }, { "auxiliary_loss_clip": 0.01414235, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.25014699, "balance_loss_mlp": 1.01894021, "epoch": 0.5500075154065835, "flos": 22283993923200.0, "grad_norm": 1.940340215326659, "language_loss": 0.81452781, "learning_rate": 1.7736247441220458e-06, "loss": 0.83905256, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19299316, "step": 9148, "time_per_iteration": 4.334068775177002 }, { "auxiliary_loss_clip": 0.01429982, "auxiliary_loss_mlp": 0.01039674, "balance_loss_clip": 1.263448, "balance_loss_mlp": 1.01927733, "epoch": 0.5500676386592515, "flos": 28049050247040.0, "grad_norm": 1.7599777543695485, "language_loss": 0.80094439, "learning_rate": 1.773237789559453e-06, "loss": 0.82564098, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20385742, "step": 9149, "time_per_iteration": 4.414827108383179 }, { "auxiliary_loss_clip": 0.01415041, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.24881113, "balance_loss_mlp": 1.01627505, "epoch": 0.5501277619119195, "flos": 23925344716800.0, "grad_norm": 2.149773307209368, "language_loss": 0.73098147, "learning_rate": 1.7728508435957052e-06, "loss": 0.75548935, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19445801, "step": 9150, "time_per_iteration": 2.8473238945007324 }, { "auxiliary_loss_clip": 0.01429757, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.25908589, "balance_loss_mlp": 1.01758683, "epoch": 0.5501878851645874, "flos": 20933740045440.0, "grad_norm": 1.9085180619360453, "language_loss": 0.755422, "learning_rate": 1.772463906245477e-06, "loss": 0.78010571, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.21020508, "step": 9151, "time_per_iteration": 2.802121877670288 }, { "auxiliary_loss_clip": 0.01420162, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.2547307, "balance_loss_mlp": 1.01637161, "epoch": 0.5502480084172554, "flos": 20674386996480.0, "grad_norm": 1.82344951389052, "language_loss": 0.76832521, "learning_rate": 1.7720769775234394e-06, "loss": 0.7928828, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19238281, "step": 9152, "time_per_iteration": 2.8398048877716064 }, { "auxiliary_loss_clip": 0.01406736, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.24425817, "balance_loss_mlp": 1.01656878, "epoch": 0.5503081316699233, "flos": 26443786821120.0, "grad_norm": 2.0160805450962243, "language_loss": 0.83587551, "learning_rate": 1.7716900574442662e-06, "loss": 0.86030328, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19458008, "step": 9153, "time_per_iteration": 2.8943891525268555 }, { "auxiliary_loss_clip": 0.01409242, "auxiliary_loss_mlp": 0.01038569, "balance_loss_clip": 1.24608862, "balance_loss_mlp": 1.01853037, "epoch": 0.5503682549225913, "flos": 30641657857920.0, "grad_norm": 2.280596672040626, "language_loss": 0.75065356, "learning_rate": 1.7713031460226294e-06, "loss": 0.7751317, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20043945, "step": 9154, "time_per_iteration": 2.923618793487549 }, { "auxiliary_loss_clip": 0.01431765, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.26033449, "balance_loss_mlp": 1.01829457, "epoch": 0.5504283781752592, "flos": 22575769511040.0, "grad_norm": 1.6391757145653894, "language_loss": 0.73739552, "learning_rate": 1.770916243273199e-06, "loss": 0.76210105, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20495605, "step": 9155, "time_per_iteration": 2.8508124351501465 }, { "auxiliary_loss_clip": 0.01233949, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.13531542, "balance_loss_mlp": 1.01283979, "epoch": 0.5504885014279273, "flos": 67928359017600.0, "grad_norm": 0.7614755764141152, "language_loss": 0.55338782, "learning_rate": 1.7705293492106483e-06, "loss": 0.57604074, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.18457031, "step": 9156, "time_per_iteration": 3.4969427585601807 }, { "auxiliary_loss_clip": 0.0142621, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.25915956, "balance_loss_mlp": 1.01705027, "epoch": 0.5505486246805952, "flos": 22458820406400.0, "grad_norm": 1.627020947520391, "language_loss": 0.83168387, "learning_rate": 1.7701424638496475e-06, "loss": 0.85631943, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20275879, "step": 9157, "time_per_iteration": 2.848139762878418 }, { "auxiliary_loss_clip": 0.01432007, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.26043046, "balance_loss_mlp": 1.01559293, "epoch": 0.5506087479332632, "flos": 26918261487360.0, "grad_norm": 2.1040743097786896, "language_loss": 0.76336122, "learning_rate": 1.7697555872048677e-06, "loss": 0.78804946, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.2121582, "step": 9158, "time_per_iteration": 2.8947582244873047 }, { "auxiliary_loss_clip": 0.01410817, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.24903095, "balance_loss_mlp": 1.01742947, "epoch": 0.5506688711859311, "flos": 22940669975040.0, "grad_norm": 1.7124340911438078, "language_loss": 0.70670128, "learning_rate": 1.769368719290979e-06, "loss": 0.73118162, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19799805, "step": 9159, "time_per_iteration": 2.9524168968200684 }, { "auxiliary_loss_clip": 0.01413875, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.24713969, "balance_loss_mlp": 1.02150428, "epoch": 0.5507289944385991, "flos": 29618362039680.0, "grad_norm": 1.9258338984711891, "language_loss": 0.69732106, "learning_rate": 1.7689818601226516e-06, "loss": 0.72187144, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19665527, "step": 9160, "time_per_iteration": 3.120114803314209 }, { "auxiliary_loss_clip": 0.01414006, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.24879217, "balance_loss_mlp": 1.01357424, "epoch": 0.5507891176912671, "flos": 15341112230400.0, "grad_norm": 1.8790133588437596, "language_loss": 0.72555482, "learning_rate": 1.7685950097145552e-06, "loss": 0.75002617, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19567871, "step": 9161, "time_per_iteration": 2.8633625507354736 }, { "auxiliary_loss_clip": 0.01426746, "auxiliary_loss_mlp": 0.01047972, "balance_loss_clip": 1.26083541, "balance_loss_mlp": 1.02713454, "epoch": 0.5508492409439351, "flos": 26589538880640.0, "grad_norm": 1.6565803047354553, "language_loss": 0.70162684, "learning_rate": 1.768208168081359e-06, "loss": 0.72637403, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20837402, "step": 9162, "time_per_iteration": 2.8852133750915527 }, { "auxiliary_loss_clip": 0.0141087, "auxiliary_loss_mlp": 0.01039848, "balance_loss_clip": 1.24679542, "balance_loss_mlp": 1.01847434, "epoch": 0.5509093641966031, "flos": 25453365989760.0, "grad_norm": 1.70336890007208, "language_loss": 0.86793834, "learning_rate": 1.767821335237733e-06, "loss": 0.8924455, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.21374512, "step": 9163, "time_per_iteration": 2.8761379718780518 }, { "auxiliary_loss_clip": 0.01413455, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.25058794, "balance_loss_mlp": 1.01474321, "epoch": 0.550969487449271, "flos": 18708023669760.0, "grad_norm": 1.66229913442318, "language_loss": 0.81547582, "learning_rate": 1.7674345111983441e-06, "loss": 0.83995068, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19299316, "step": 9164, "time_per_iteration": 2.8426811695098877 }, { "auxiliary_loss_clip": 0.01427339, "auxiliary_loss_mlp": 0.01039884, "balance_loss_clip": 1.25873387, "balance_loss_mlp": 1.01966584, "epoch": 0.551029610701939, "flos": 22718399679360.0, "grad_norm": 2.033234516712243, "language_loss": 0.74490082, "learning_rate": 1.767047695977863e-06, "loss": 0.76957309, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20214844, "step": 9165, "time_per_iteration": 2.8632712364196777 }, { "auxiliary_loss_clip": 0.01403905, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.24078774, "balance_loss_mlp": 1.01517546, "epoch": 0.5510897339546069, "flos": 12427699852800.0, "grad_norm": 1.9588861113560423, "language_loss": 0.80525649, "learning_rate": 1.7666608895909563e-06, "loss": 0.82963955, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19226074, "step": 9166, "time_per_iteration": 2.806156635284424 }, { "auxiliary_loss_clip": 0.01423109, "auxiliary_loss_mlp": 0.01030747, "balance_loss_clip": 1.25453532, "balance_loss_mlp": 1.01091015, "epoch": 0.5511498572072749, "flos": 18779836446720.0, "grad_norm": 2.306423290274891, "language_loss": 0.77205682, "learning_rate": 1.7662740920522913e-06, "loss": 0.79659545, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19836426, "step": 9167, "time_per_iteration": 2.9085423946380615 }, { "auxiliary_loss_clip": 0.01408328, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.24267852, "balance_loss_mlp": 1.01315069, "epoch": 0.5512099804599428, "flos": 19582807495680.0, "grad_norm": 1.9337746296782128, "language_loss": 0.81831932, "learning_rate": 1.7658873033765374e-06, "loss": 0.84274071, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20654297, "step": 9168, "time_per_iteration": 2.9388225078582764 }, { "auxiliary_loss_clip": 0.01436254, "auxiliary_loss_mlp": 0.01040509, "balance_loss_clip": 1.26672506, "balance_loss_mlp": 1.01930153, "epoch": 0.5512701037126109, "flos": 26255613121920.0, "grad_norm": 1.629054889370561, "language_loss": 0.69553053, "learning_rate": 1.7655005235783591e-06, "loss": 0.72029817, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.2121582, "step": 9169, "time_per_iteration": 2.9172489643096924 }, { "auxiliary_loss_clip": 0.01402142, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.2414552, "balance_loss_mlp": 1.0139904, "epoch": 0.5513302269652788, "flos": 21955633274880.0, "grad_norm": 2.722099259582532, "language_loss": 0.85652047, "learning_rate": 1.7651137526724251e-06, "loss": 0.88087296, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19104004, "step": 9170, "time_per_iteration": 2.880378246307373 }, { "auxiliary_loss_clip": 0.01226713, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.12763071, "balance_loss_mlp": 1.0156945, "epoch": 0.5513903502179468, "flos": 68265089953920.0, "grad_norm": 0.7909924409821402, "language_loss": 0.60044181, "learning_rate": 1.7647269906734017e-06, "loss": 0.62305367, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.1875, "step": 9171, "time_per_iteration": 4.806863784790039 }, { "auxiliary_loss_clip": 0.01418167, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.25302601, "balance_loss_mlp": 1.01944017, "epoch": 0.5514504734706147, "flos": 18743522855040.0, "grad_norm": 1.6256597504623487, "language_loss": 0.7148512, "learning_rate": 1.7643402375959533e-06, "loss": 0.73942351, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19628906, "step": 9172, "time_per_iteration": 2.872443675994873 }, { "auxiliary_loss_clip": 0.01412787, "auxiliary_loss_mlp": 0.01037618, "balance_loss_clip": 1.24724579, "balance_loss_mlp": 1.01768589, "epoch": 0.5515105967232827, "flos": 22280555318400.0, "grad_norm": 1.7573338556975262, "language_loss": 0.77055025, "learning_rate": 1.7639534934547474e-06, "loss": 0.79505432, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19921875, "step": 9173, "time_per_iteration": 2.874814748764038 }, { "auxiliary_loss_clip": 0.01408263, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.24424613, "balance_loss_mlp": 1.0150162, "epoch": 0.5515707199759508, "flos": 22565996634240.0, "grad_norm": 2.588638353416107, "language_loss": 0.76004046, "learning_rate": 1.7635667582644484e-06, "loss": 0.78446341, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19018555, "step": 9174, "time_per_iteration": 2.855783700942993 }, { "auxiliary_loss_clip": 0.01419899, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.25249386, "balance_loss_mlp": 1.01592088, "epoch": 0.5516308432286187, "flos": 28302204758400.0, "grad_norm": 2.0022832172105107, "language_loss": 0.73603451, "learning_rate": 1.7631800320397217e-06, "loss": 0.76058829, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19567871, "step": 9175, "time_per_iteration": 2.9203622341156006 }, { "auxiliary_loss_clip": 0.01408131, "auxiliary_loss_mlp": 0.01034136, "balance_loss_clip": 1.2433548, "balance_loss_mlp": 1.01490748, "epoch": 0.5516909664812867, "flos": 18772144830720.0, "grad_norm": 1.8413960100493747, "language_loss": 0.6973623, "learning_rate": 1.7627933147952318e-06, "loss": 0.72178495, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19238281, "step": 9176, "time_per_iteration": 2.8106584548950195 }, { "auxiliary_loss_clip": 0.01408194, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 1.24547851, "balance_loss_mlp": 1.01283789, "epoch": 0.5517510897339546, "flos": 27750442694400.0, "grad_norm": 1.6354286188960747, "language_loss": 0.71338803, "learning_rate": 1.7624066065456435e-06, "loss": 0.73779958, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20117188, "step": 9177, "time_per_iteration": 2.897047758102417 }, { "auxiliary_loss_clip": 0.01416365, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.25022936, "balance_loss_mlp": 1.01518226, "epoch": 0.5518112129866226, "flos": 18413307169920.0, "grad_norm": 1.6774299880871542, "language_loss": 0.80288601, "learning_rate": 1.7620199073056204e-06, "loss": 0.82740378, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20227051, "step": 9178, "time_per_iteration": 2.8258657455444336 }, { "auxiliary_loss_clip": 0.01432263, "auxiliary_loss_mlp": 0.01039888, "balance_loss_clip": 1.26222777, "balance_loss_mlp": 1.01937175, "epoch": 0.5518713362392905, "flos": 25093578188160.0, "grad_norm": 1.5941628618138666, "language_loss": 0.76223242, "learning_rate": 1.761633217089826e-06, "loss": 0.78695393, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20507812, "step": 9179, "time_per_iteration": 2.8562726974487305 }, { "auxiliary_loss_clip": 0.0141435, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.2493217, "balance_loss_mlp": 1.01604152, "epoch": 0.5519314594919585, "flos": 36552828199680.0, "grad_norm": 1.6896127172281417, "language_loss": 0.70872533, "learning_rate": 1.761246535912924e-06, "loss": 0.73322618, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19702148, "step": 9180, "time_per_iteration": 2.967099189758301 }, { "auxiliary_loss_clip": 0.01416677, "auxiliary_loss_mlp": 0.01037261, "balance_loss_clip": 1.25074077, "balance_loss_mlp": 1.01628029, "epoch": 0.5519915827446265, "flos": 20458224748800.0, "grad_norm": 1.764710868450375, "language_loss": 0.68027431, "learning_rate": 1.7608598637895776e-06, "loss": 0.70481372, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.2097168, "step": 9181, "time_per_iteration": 2.846630096435547 }, { "auxiliary_loss_clip": 0.01431307, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.25971186, "balance_loss_mlp": 1.01554966, "epoch": 0.5520517059972945, "flos": 23778325802880.0, "grad_norm": 2.35549704852929, "language_loss": 0.79626477, "learning_rate": 1.7604732007344486e-06, "loss": 0.82094252, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20922852, "step": 9182, "time_per_iteration": 2.892591714859009 }, { "auxiliary_loss_clip": 0.01417203, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.24898088, "balance_loss_mlp": 1.01315212, "epoch": 0.5521118292499624, "flos": 22205620650240.0, "grad_norm": 3.16564790796765, "language_loss": 0.83848405, "learning_rate": 1.7600865467622003e-06, "loss": 0.86298704, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19934082, "step": 9183, "time_per_iteration": 4.289189100265503 }, { "auxiliary_loss_clip": 0.01405937, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.24113655, "balance_loss_mlp": 1.01176751, "epoch": 0.5521719525026304, "flos": 23592912036480.0, "grad_norm": 1.288063980745063, "language_loss": 0.6804738, "learning_rate": 1.7596999018874936e-06, "loss": 0.70484579, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19482422, "step": 9184, "time_per_iteration": 4.278069257736206 }, { "auxiliary_loss_clip": 0.0141068, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.24536335, "balance_loss_mlp": 1.0117358, "epoch": 0.5522320757552983, "flos": 26147984446080.0, "grad_norm": 1.5729762248420704, "language_loss": 0.77150178, "learning_rate": 1.7593132661249917e-06, "loss": 0.79593253, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20654297, "step": 9185, "time_per_iteration": 4.260830402374268 }, { "auxiliary_loss_clip": 0.01419899, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.25253344, "balance_loss_mlp": 1.02308273, "epoch": 0.5522921990079663, "flos": 24685622657280.0, "grad_norm": 1.9935410701657963, "language_loss": 0.74460506, "learning_rate": 1.7589266394893536e-06, "loss": 0.76923847, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20361328, "step": 9186, "time_per_iteration": 2.8752036094665527 }, { "auxiliary_loss_clip": 0.01429227, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.26080596, "balance_loss_mlp": 1.01731753, "epoch": 0.5523523222606344, "flos": 22758604323840.0, "grad_norm": 2.1455094395112217, "language_loss": 0.66742527, "learning_rate": 1.7585400219952421e-06, "loss": 0.69208819, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19763184, "step": 9187, "time_per_iteration": 2.8720691204071045 }, { "auxiliary_loss_clip": 0.01424505, "auxiliary_loss_mlp": 0.01034883, "balance_loss_clip": 1.25719237, "balance_loss_mlp": 1.01523757, "epoch": 0.5524124455133023, "flos": 19765370839680.0, "grad_norm": 4.998499500343574, "language_loss": 0.78588784, "learning_rate": 1.758153413657318e-06, "loss": 0.81048173, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19665527, "step": 9188, "time_per_iteration": 2.8594326972961426 }, { "auxiliary_loss_clip": 0.01416663, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.25061989, "balance_loss_mlp": 1.01518047, "epoch": 0.5524725687659703, "flos": 23305253725440.0, "grad_norm": 1.8102656894487326, "language_loss": 0.82758927, "learning_rate": 1.7577668144902394e-06, "loss": 0.85210389, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19628906, "step": 9189, "time_per_iteration": 2.8807060718536377 }, { "auxiliary_loss_clip": 0.01416304, "auxiliary_loss_mlp": 0.01032744, "balance_loss_clip": 1.25225127, "balance_loss_mlp": 1.01218045, "epoch": 0.5525326920186382, "flos": 24872619991680.0, "grad_norm": 1.3154899229975119, "language_loss": 0.77203107, "learning_rate": 1.7573802245086684e-06, "loss": 0.79652154, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20544434, "step": 9190, "time_per_iteration": 2.869014263153076 }, { "auxiliary_loss_clip": 0.0143361, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.26100039, "balance_loss_mlp": 1.01449072, "epoch": 0.5525928152713062, "flos": 13743540420480.0, "grad_norm": 2.7238300374719056, "language_loss": 0.80434698, "learning_rate": 1.7569936437272627e-06, "loss": 0.82903063, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20263672, "step": 9191, "time_per_iteration": 2.8163223266601562 }, { "auxiliary_loss_clip": 0.01420573, "auxiliary_loss_mlp": 0.0103817, "balance_loss_clip": 1.25472331, "balance_loss_mlp": 1.0184412, "epoch": 0.5526529385239741, "flos": 13077498695040.0, "grad_norm": 2.069404222315648, "language_loss": 0.69438744, "learning_rate": 1.7566070721606829e-06, "loss": 0.71897489, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19714355, "step": 9192, "time_per_iteration": 2.8063876628875732 }, { "auxiliary_loss_clip": 0.01414292, "auxiliary_loss_mlp": 0.01035913, "balance_loss_clip": 1.25039577, "balance_loss_mlp": 1.01669693, "epoch": 0.5527130617766421, "flos": 23158642014720.0, "grad_norm": 2.013441039073215, "language_loss": 0.7815702, "learning_rate": 1.756220509823588e-06, "loss": 0.80607224, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19226074, "step": 9193, "time_per_iteration": 2.836169958114624 }, { "auxiliary_loss_clip": 0.01410259, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.24575758, "balance_loss_mlp": 1.01665998, "epoch": 0.55277318502931, "flos": 21294794701440.0, "grad_norm": 1.9676124675269684, "language_loss": 0.79464185, "learning_rate": 1.7558339567306344e-06, "loss": 0.81910759, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19641113, "step": 9194, "time_per_iteration": 2.8515470027923584 }, { "auxiliary_loss_clip": 0.0143172, "auxiliary_loss_mlp": 0.01037527, "balance_loss_clip": 1.2583909, "balance_loss_mlp": 1.01715374, "epoch": 0.5528333082819781, "flos": 38338256995200.0, "grad_norm": 2.0012023834639265, "language_loss": 0.70325541, "learning_rate": 1.7554474128964825e-06, "loss": 0.72794795, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20361328, "step": 9195, "time_per_iteration": 2.995591402053833 }, { "auxiliary_loss_clip": 0.01432579, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.25924468, "balance_loss_mlp": 1.01481342, "epoch": 0.552893431534646, "flos": 13561655748480.0, "grad_norm": 4.437874115784483, "language_loss": 0.7518003, "learning_rate": 1.7550608783357887e-06, "loss": 0.77647436, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.20019531, "step": 9196, "time_per_iteration": 2.8335769176483154 }, { "auxiliary_loss_clip": 0.01417032, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.25215244, "balance_loss_mlp": 1.0178237, "epoch": 0.552953554787314, "flos": 21948077393280.0, "grad_norm": 1.5009993245466737, "language_loss": 0.77504855, "learning_rate": 1.7546743530632115e-06, "loss": 0.7995981, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20092773, "step": 9197, "time_per_iteration": 2.836165428161621 }, { "auxiliary_loss_clip": 0.01409879, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.24623275, "balance_loss_mlp": 1.0129745, "epoch": 0.5530136780399819, "flos": 43674789386880.0, "grad_norm": 1.5022509385703051, "language_loss": 0.76980966, "learning_rate": 1.754287837093407e-06, "loss": 0.7942307, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19238281, "step": 9198, "time_per_iteration": 3.1392149925231934 }, { "auxiliary_loss_clip": 0.01414737, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.2492981, "balance_loss_mlp": 1.01415205, "epoch": 0.5530738012926499, "flos": 25056223966080.0, "grad_norm": 8.912757707948662, "language_loss": 0.79782701, "learning_rate": 1.7539013304410327e-06, "loss": 0.82230079, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18481445, "step": 9199, "time_per_iteration": 2.854294538497925 }, { "auxiliary_loss_clip": 0.01406636, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.24257565, "balance_loss_mlp": 1.01703012, "epoch": 0.553133924545318, "flos": 16480768970880.0, "grad_norm": 1.801779231346644, "language_loss": 0.6401788, "learning_rate": 1.7535148331207443e-06, "loss": 0.66460466, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18908691, "step": 9200, "time_per_iteration": 2.8390984535217285 }, { "auxiliary_loss_clip": 0.01429496, "auxiliary_loss_mlp": 0.01037068, "balance_loss_clip": 1.25890565, "balance_loss_mlp": 1.01605117, "epoch": 0.5531940477979859, "flos": 24616434078720.0, "grad_norm": 1.454438209389934, "language_loss": 0.66725814, "learning_rate": 1.7531283451471978e-06, "loss": 0.6919238, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21020508, "step": 9201, "time_per_iteration": 2.880108594894409 }, { "auxiliary_loss_clip": 0.01429662, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.26409912, "balance_loss_mlp": 1.01801109, "epoch": 0.5532541710506539, "flos": 22168628386560.0, "grad_norm": 2.2356712301169224, "language_loss": 0.61858606, "learning_rate": 1.7527418665350502e-06, "loss": 0.64326793, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20532227, "step": 9202, "time_per_iteration": 2.876905918121338 }, { "auxiliary_loss_clip": 0.01399678, "auxiliary_loss_mlp": 0.01035042, "balance_loss_clip": 1.2379663, "balance_loss_mlp": 1.01515841, "epoch": 0.5533142943033218, "flos": 21407128836480.0, "grad_norm": 1.4704429421879965, "language_loss": 0.65283406, "learning_rate": 1.7523553972989548e-06, "loss": 0.67718124, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19873047, "step": 9203, "time_per_iteration": 2.8461949825286865 }, { "auxiliary_loss_clip": 0.01416522, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.25109816, "balance_loss_mlp": 1.01380539, "epoch": 0.5533744175559898, "flos": 23561122924800.0, "grad_norm": 1.8391345588480252, "language_loss": 0.64408815, "learning_rate": 1.7519689374535683e-06, "loss": 0.66859198, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20068359, "step": 9204, "time_per_iteration": 3.003610372543335 }, { "auxiliary_loss_clip": 0.01398033, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.23652864, "balance_loss_mlp": 1.01308644, "epoch": 0.5534345408086577, "flos": 24072273141120.0, "grad_norm": 2.00345854403708, "language_loss": 0.78001666, "learning_rate": 1.7515824870135445e-06, "loss": 0.80432057, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19274902, "step": 9205, "time_per_iteration": 2.8933606147766113 }, { "auxiliary_loss_clip": 0.01398523, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.23779058, "balance_loss_mlp": 1.01549828, "epoch": 0.5534946640613257, "flos": 33786434736000.0, "grad_norm": 1.8840794173613205, "language_loss": 0.72919661, "learning_rate": 1.751196045993537e-06, "loss": 0.75354135, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20446777, "step": 9206, "time_per_iteration": 4.523120880126953 }, { "auxiliary_loss_clip": 0.01418609, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.25232375, "balance_loss_mlp": 1.01723647, "epoch": 0.5535547873139937, "flos": 15167733580800.0, "grad_norm": 2.0369270287556245, "language_loss": 0.76077515, "learning_rate": 1.7508096144082012e-06, "loss": 0.78532898, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19543457, "step": 9207, "time_per_iteration": 2.850191116333008 }, { "auxiliary_loss_clip": 0.0142886, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.25689507, "balance_loss_mlp": 1.01246262, "epoch": 0.5536149105666617, "flos": 16989159254400.0, "grad_norm": 2.3519025494515793, "language_loss": 0.63673484, "learning_rate": 1.750423192272189e-06, "loss": 0.66136253, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.21435547, "step": 9208, "time_per_iteration": 2.939732074737549 }, { "auxiliary_loss_clip": 0.01421846, "auxiliary_loss_mlp": 0.01034299, "balance_loss_clip": 1.25171947, "balance_loss_mlp": 1.01470089, "epoch": 0.5536750338193296, "flos": 18158931048960.0, "grad_norm": 2.0255170405615397, "language_loss": 0.65221727, "learning_rate": 1.7500367796001547e-06, "loss": 0.67677873, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19604492, "step": 9209, "time_per_iteration": 2.8680806159973145 }, { "auxiliary_loss_clip": 0.0141018, "auxiliary_loss_mlp": 0.01036845, "balance_loss_clip": 1.2452929, "balance_loss_mlp": 1.01600742, "epoch": 0.5537351570719976, "flos": 22758151875840.0, "grad_norm": 2.2166993799110406, "language_loss": 0.83873928, "learning_rate": 1.7496503764067513e-06, "loss": 0.86320949, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20849609, "step": 9210, "time_per_iteration": 2.8926303386688232 }, { "auxiliary_loss_clip": 0.01416244, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.25239062, "balance_loss_mlp": 1.0143609, "epoch": 0.5537952803246655, "flos": 26366318444160.0, "grad_norm": 1.7220019171202197, "language_loss": 0.73350996, "learning_rate": 1.74926398270663e-06, "loss": 0.75801861, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20251465, "step": 9211, "time_per_iteration": 2.927570104598999 }, { "auxiliary_loss_clip": 0.01424956, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.25442648, "balance_loss_mlp": 1.01600051, "epoch": 0.5538554035773335, "flos": 18045601528320.0, "grad_norm": 2.1088253189996737, "language_loss": 0.6796459, "learning_rate": 1.7488775985144437e-06, "loss": 0.70426118, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20568848, "step": 9212, "time_per_iteration": 2.866010904312134 }, { "auxiliary_loss_clip": 0.0141657, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.24584687, "balance_loss_mlp": 1.01279509, "epoch": 0.5539155268300014, "flos": 31698778803840.0, "grad_norm": 1.4160599044896138, "language_loss": 0.52450621, "learning_rate": 1.7484912238448443e-06, "loss": 0.54900849, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20849609, "step": 9213, "time_per_iteration": 2.9189345836639404 }, { "auxiliary_loss_clip": 0.01421778, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.25391328, "balance_loss_mlp": 1.01708055, "epoch": 0.5539756500826695, "flos": 15201784932480.0, "grad_norm": 1.8412303452432042, "language_loss": 0.86802304, "learning_rate": 1.7481048587124827e-06, "loss": 0.89261436, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20288086, "step": 9214, "time_per_iteration": 2.8368709087371826 }, { "auxiliary_loss_clip": 0.01406519, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.24217498, "balance_loss_mlp": 1.01462984, "epoch": 0.5540357733353375, "flos": 26362653615360.0, "grad_norm": 1.623618854730382, "language_loss": 0.70911413, "learning_rate": 1.7477185031320108e-06, "loss": 0.73352408, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1986084, "step": 9215, "time_per_iteration": 2.8909409046173096 }, { "auxiliary_loss_clip": 0.01421575, "auxiliary_loss_mlp": 0.01035962, "balance_loss_clip": 1.25242436, "balance_loss_mlp": 1.0156244, "epoch": 0.5540958965880054, "flos": 21333370533120.0, "grad_norm": 1.594206212948758, "language_loss": 0.73872721, "learning_rate": 1.7473321571180773e-06, "loss": 0.76330256, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20349121, "step": 9216, "time_per_iteration": 2.903143882751465 }, { "auxiliary_loss_clip": 0.0140819, "auxiliary_loss_mlp": 0.01041681, "balance_loss_clip": 1.24578953, "balance_loss_mlp": 1.02172542, "epoch": 0.5541560198406734, "flos": 25677989015040.0, "grad_norm": 2.4403332655548304, "language_loss": 0.72662246, "learning_rate": 1.7469458206853345e-06, "loss": 0.75112116, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19946289, "step": 9217, "time_per_iteration": 2.861161708831787 }, { "auxiliary_loss_clip": 0.01415335, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.25100279, "balance_loss_mlp": 1.01333976, "epoch": 0.5542161430933413, "flos": 21948348862080.0, "grad_norm": 1.6659257658059903, "language_loss": 0.78924412, "learning_rate": 1.7465594938484315e-06, "loss": 0.81372368, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19287109, "step": 9218, "time_per_iteration": 4.258141279220581 }, { "auxiliary_loss_clip": 0.01421345, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.25280154, "balance_loss_mlp": 1.01549494, "epoch": 0.5542762663460093, "flos": 19580454766080.0, "grad_norm": 2.56049766497957, "language_loss": 0.73069715, "learning_rate": 1.7461731766220176e-06, "loss": 0.75527787, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.2121582, "step": 9219, "time_per_iteration": 5.690054178237915 }, { "auxiliary_loss_clip": 0.01428171, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.25945377, "balance_loss_mlp": 1.01529872, "epoch": 0.5543363895986773, "flos": 19508189541120.0, "grad_norm": 1.4930965608391995, "language_loss": 0.7224474, "learning_rate": 1.7457868690207426e-06, "loss": 0.74709022, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20812988, "step": 9220, "time_per_iteration": 2.9086062908172607 }, { "auxiliary_loss_clip": 0.01409959, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.24628723, "balance_loss_mlp": 1.01288795, "epoch": 0.5543965128513453, "flos": 22645229558400.0, "grad_norm": 1.610922436656548, "language_loss": 0.80456412, "learning_rate": 1.7454005710592547e-06, "loss": 0.82899433, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20166016, "step": 9221, "time_per_iteration": 2.842595338821411 }, { "auxiliary_loss_clip": 0.01412032, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.24885941, "balance_loss_mlp": 1.01937342, "epoch": 0.5544566361040132, "flos": 25999834412160.0, "grad_norm": 1.7039732439949604, "language_loss": 0.84409475, "learning_rate": 1.7450142827522027e-06, "loss": 0.8686167, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20800781, "step": 9222, "time_per_iteration": 2.8829965591430664 }, { "auxiliary_loss_clip": 0.01419257, "auxiliary_loss_mlp": 0.01041256, "balance_loss_clip": 1.24984837, "balance_loss_mlp": 1.02021563, "epoch": 0.5545167593566812, "flos": 28269465505920.0, "grad_norm": 1.7254794986261688, "language_loss": 0.76626468, "learning_rate": 1.7446280041142344e-06, "loss": 0.79086977, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.21032715, "step": 9223, "time_per_iteration": 2.8942532539367676 }, { "auxiliary_loss_clip": 0.01417053, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.24959016, "balance_loss_mlp": 1.01695633, "epoch": 0.5545768826093491, "flos": 28488523420800.0, "grad_norm": 1.507401974586855, "language_loss": 0.8256005, "learning_rate": 1.7442417351599986e-06, "loss": 0.85014516, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20458984, "step": 9224, "time_per_iteration": 2.9010984897613525 }, { "auxiliary_loss_clip": 0.01422363, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.25489759, "balance_loss_mlp": 1.01984715, "epoch": 0.5546370058620171, "flos": 18487065473280.0, "grad_norm": 1.975020045648244, "language_loss": 0.58196604, "learning_rate": 1.743855475904141e-06, "loss": 0.60659015, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20202637, "step": 9225, "time_per_iteration": 2.8291940689086914 }, { "auxiliary_loss_clip": 0.01418124, "auxiliary_loss_mlp": 0.01035641, "balance_loss_clip": 1.24927175, "balance_loss_mlp": 1.01576853, "epoch": 0.554697129114685, "flos": 22940986688640.0, "grad_norm": 1.5678424600177723, "language_loss": 0.68233633, "learning_rate": 1.7434692263613098e-06, "loss": 0.70687395, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.1986084, "step": 9226, "time_per_iteration": 2.8624730110168457 }, { "auxiliary_loss_clip": 0.01419819, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.25186694, "balance_loss_mlp": 1.01707602, "epoch": 0.5547572523673531, "flos": 21806759324160.0, "grad_norm": 4.618002408653813, "language_loss": 0.7545352, "learning_rate": 1.7430829865461518e-06, "loss": 0.77910411, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.1998291, "step": 9227, "time_per_iteration": 2.8682520389556885 }, { "auxiliary_loss_clip": 0.01435506, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.26647329, "balance_loss_mlp": 1.01417756, "epoch": 0.5548173756200211, "flos": 22352141871360.0, "grad_norm": 1.5576251326732928, "language_loss": 0.74364966, "learning_rate": 1.7426967564733118e-06, "loss": 0.76834249, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19604492, "step": 9228, "time_per_iteration": 2.980915069580078 }, { "auxiliary_loss_clip": 0.01432521, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.26523566, "balance_loss_mlp": 1.01358664, "epoch": 0.554877498872689, "flos": 17867834133120.0, "grad_norm": 1.7808331930101902, "language_loss": 0.76950371, "learning_rate": 1.7423105361574373e-06, "loss": 0.79416406, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19909668, "step": 9229, "time_per_iteration": 2.850552558898926 }, { "auxiliary_loss_clip": 0.01425061, "auxiliary_loss_mlp": 0.01045709, "balance_loss_clip": 1.25621772, "balance_loss_mlp": 1.02481198, "epoch": 0.554937622125357, "flos": 17247245448960.0, "grad_norm": 1.4252960391919423, "language_loss": 0.69392419, "learning_rate": 1.741924325613172e-06, "loss": 0.71863192, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20910645, "step": 9230, "time_per_iteration": 2.8476011753082275 }, { "auxiliary_loss_clip": 0.01432851, "auxiliary_loss_mlp": 0.01038543, "balance_loss_clip": 1.26247191, "balance_loss_mlp": 1.01821828, "epoch": 0.5549977453780249, "flos": 25377390691200.0, "grad_norm": 2.4032648541210895, "language_loss": 0.69675225, "learning_rate": 1.741538124855163e-06, "loss": 0.72146624, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20324707, "step": 9231, "time_per_iteration": 2.8714869022369385 }, { "auxiliary_loss_clip": 0.01444207, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.27080202, "balance_loss_mlp": 1.01804042, "epoch": 0.555057868630693, "flos": 25089098952960.0, "grad_norm": 1.7495695639287634, "language_loss": 0.7849642, "learning_rate": 1.7411519338980548e-06, "loss": 0.80979389, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20727539, "step": 9232, "time_per_iteration": 2.882852077484131 }, { "auxiliary_loss_clip": 0.01414326, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.2501204, "balance_loss_mlp": 1.01311815, "epoch": 0.5551179918833609, "flos": 26115380928000.0, "grad_norm": 1.7272922386620237, "language_loss": 0.83804309, "learning_rate": 1.7407657527564898e-06, "loss": 0.86251104, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19335938, "step": 9233, "time_per_iteration": 2.8633804321289062 }, { "auxiliary_loss_clip": 0.01437391, "auxiliary_loss_mlp": 0.01038595, "balance_loss_clip": 1.26534951, "balance_loss_mlp": 1.01875877, "epoch": 0.5551781151360289, "flos": 19392597780480.0, "grad_norm": 2.024008225896809, "language_loss": 0.76064992, "learning_rate": 1.7403795814451142e-06, "loss": 0.78540981, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.19848633, "step": 9234, "time_per_iteration": 2.841935634613037 }, { "auxiliary_loss_clip": 0.01419435, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.2537241, "balance_loss_mlp": 1.01511276, "epoch": 0.5552382383886968, "flos": 21735987177600.0, "grad_norm": 1.9233048336552236, "language_loss": 0.66656262, "learning_rate": 1.7399934199785706e-06, "loss": 0.69109857, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19055176, "step": 9235, "time_per_iteration": 2.842550754547119 }, { "auxiliary_loss_clip": 0.01429017, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.26183391, "balance_loss_mlp": 1.01493979, "epoch": 0.5552983616413648, "flos": 14364400573440.0, "grad_norm": 1.8626865383138118, "language_loss": 0.69477957, "learning_rate": 1.7396072683715029e-06, "loss": 0.71941978, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20068359, "step": 9236, "time_per_iteration": 2.8254916667938232 }, { "auxiliary_loss_clip": 0.01418376, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.25495124, "balance_loss_mlp": 1.0165484, "epoch": 0.5553584848940327, "flos": 25488141258240.0, "grad_norm": 1.572757364299027, "language_loss": 0.86476958, "learning_rate": 1.7392211266385536e-06, "loss": 0.88931406, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19519043, "step": 9237, "time_per_iteration": 2.870772361755371 }, { "auxiliary_loss_clip": 0.01417655, "auxiliary_loss_mlp": 0.01037892, "balance_loss_clip": 1.25427485, "balance_loss_mlp": 1.01806784, "epoch": 0.5554186081467007, "flos": 22174012517760.0, "grad_norm": 3.2005170364381152, "language_loss": 0.74582595, "learning_rate": 1.7388349947943652e-06, "loss": 0.77038139, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19824219, "step": 9238, "time_per_iteration": 2.91479229927063 }, { "auxiliary_loss_clip": 0.01434514, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.26324713, "balance_loss_mlp": 1.0163548, "epoch": 0.5554787313993687, "flos": 49763862858240.0, "grad_norm": 1.8519414027964267, "language_loss": 0.7908051, "learning_rate": 1.73844887285358e-06, "loss": 0.81551003, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.19641113, "step": 9239, "time_per_iteration": 3.106022596359253 }, { "auxiliary_loss_clip": 0.0143394, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.26566958, "balance_loss_mlp": 1.01657414, "epoch": 0.5555388546520367, "flos": 22137517946880.0, "grad_norm": 2.3376443846190447, "language_loss": 0.80747378, "learning_rate": 1.7380627608308393e-06, "loss": 0.83217126, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19238281, "step": 9240, "time_per_iteration": 4.314273118972778 }, { "auxiliary_loss_clip": 0.01417105, "auxiliary_loss_mlp": 0.01044246, "balance_loss_clip": 1.25164926, "balance_loss_mlp": 1.02409923, "epoch": 0.5555989779047047, "flos": 24693133294080.0, "grad_norm": 2.178501707522861, "language_loss": 0.66640037, "learning_rate": 1.737676658740786e-06, "loss": 0.69101393, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20141602, "step": 9241, "time_per_iteration": 2.9098565578460693 }, { "auxiliary_loss_clip": 0.01428309, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.26077306, "balance_loss_mlp": 1.0215342, "epoch": 0.5556591011573726, "flos": 16114918366080.0, "grad_norm": 2.050148132496567, "language_loss": 0.7352308, "learning_rate": 1.7372905665980594e-06, "loss": 0.75992614, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19689941, "step": 9242, "time_per_iteration": 2.8350276947021484 }, { "auxiliary_loss_clip": 0.01432142, "auxiliary_loss_mlp": 0.01043459, "balance_loss_clip": 1.26338899, "balance_loss_mlp": 1.02308667, "epoch": 0.5557192244100406, "flos": 12941655246720.0, "grad_norm": 1.8766084468539421, "language_loss": 0.65139353, "learning_rate": 1.7369044844173012e-06, "loss": 0.67614961, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20373535, "step": 9243, "time_per_iteration": 2.852041482925415 }, { "auxiliary_loss_clip": 0.01423938, "auxiliary_loss_mlp": 0.01037992, "balance_loss_clip": 1.25721049, "balance_loss_mlp": 1.01884675, "epoch": 0.5557793476627085, "flos": 23121921219840.0, "grad_norm": 6.706419277091673, "language_loss": 0.75821471, "learning_rate": 1.7365184122131509e-06, "loss": 0.78283405, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19140625, "step": 9244, "time_per_iteration": 2.8342297077178955 }, { "auxiliary_loss_clip": 0.01411375, "auxiliary_loss_mlp": 0.01040215, "balance_loss_clip": 1.24973071, "balance_loss_mlp": 1.0205934, "epoch": 0.5558394709153766, "flos": 21436429484160.0, "grad_norm": 2.6354787384983007, "language_loss": 0.75685728, "learning_rate": 1.7361323500002486e-06, "loss": 0.78137326, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19616699, "step": 9245, "time_per_iteration": 2.836925983428955 }, { "auxiliary_loss_clip": 0.01441661, "auxiliary_loss_mlp": 0.01040462, "balance_loss_clip": 1.27015018, "balance_loss_mlp": 1.01945734, "epoch": 0.5558995941680445, "flos": 25088103567360.0, "grad_norm": 1.9305676850147917, "language_loss": 0.80472702, "learning_rate": 1.7357462977932348e-06, "loss": 0.82954824, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.21020508, "step": 9246, "time_per_iteration": 2.8689868450164795 }, { "auxiliary_loss_clip": 0.01437042, "auxiliary_loss_mlp": 0.01037168, "balance_loss_clip": 1.26835179, "balance_loss_mlp": 1.01736712, "epoch": 0.5559597174207125, "flos": 20020742346240.0, "grad_norm": 2.399745345100065, "language_loss": 0.7439183, "learning_rate": 1.7353602556067471e-06, "loss": 0.76866043, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19812012, "step": 9247, "time_per_iteration": 2.981285333633423 }, { "auxiliary_loss_clip": 0.01420917, "auxiliary_loss_mlp": 0.01037554, "balance_loss_clip": 1.2533536, "balance_loss_mlp": 1.01673985, "epoch": 0.5560198406733804, "flos": 16844131111680.0, "grad_norm": 2.840685804059471, "language_loss": 0.77285171, "learning_rate": 1.7349742234554254e-06, "loss": 0.79743648, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20800781, "step": 9248, "time_per_iteration": 2.8557651042938232 }, { "auxiliary_loss_clip": 0.01211684, "auxiliary_loss_mlp": 0.01022165, "balance_loss_clip": 1.11866426, "balance_loss_mlp": 1.00480831, "epoch": 0.5560799639260484, "flos": 70731129300480.0, "grad_norm": 0.847614633551443, "language_loss": 0.59502852, "learning_rate": 1.7345882013539081e-06, "loss": 0.61736703, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.17382812, "step": 9249, "time_per_iteration": 3.4729621410369873 }, { "auxiliary_loss_clip": 0.01431435, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.2606771, "balance_loss_mlp": 1.01443839, "epoch": 0.5561400871787163, "flos": 23158913483520.0, "grad_norm": 1.913626393946407, "language_loss": 0.80674154, "learning_rate": 1.734202189316832e-06, "loss": 0.83139145, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19116211, "step": 9250, "time_per_iteration": 2.842432737350464 }, { "auxiliary_loss_clip": 0.01434941, "auxiliary_loss_mlp": 0.01033275, "balance_loss_clip": 1.26349056, "balance_loss_mlp": 1.01341462, "epoch": 0.5562002104313843, "flos": 17575334628480.0, "grad_norm": 4.140769766009932, "language_loss": 0.70562959, "learning_rate": 1.733816187358836e-06, "loss": 0.73031169, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.1986084, "step": 9251, "time_per_iteration": 2.8029706478118896 }, { "auxiliary_loss_clip": 0.01427532, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 1.25880814, "balance_loss_mlp": 1.01480913, "epoch": 0.5562603336840523, "flos": 25056133476480.0, "grad_norm": 1.6235100342155677, "language_loss": 0.76054144, "learning_rate": 1.7334301954945569e-06, "loss": 0.7851603, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19543457, "step": 9252, "time_per_iteration": 2.8871986865997314 }, { "auxiliary_loss_clip": 0.01439233, "auxiliary_loss_mlp": 0.01036303, "balance_loss_clip": 1.26668549, "balance_loss_mlp": 1.01641941, "epoch": 0.5563204569367203, "flos": 29070807742080.0, "grad_norm": 2.3376794423607814, "language_loss": 0.73558795, "learning_rate": 1.7330442137386313e-06, "loss": 0.76034331, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.19885254, "step": 9253, "time_per_iteration": 4.358929395675659 }, { "auxiliary_loss_clip": 0.01429785, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.26221323, "balance_loss_mlp": 1.01034296, "epoch": 0.5563805801893883, "flos": 22100480438400.0, "grad_norm": 1.969050528774385, "language_loss": 0.83608669, "learning_rate": 1.7326582421056965e-06, "loss": 0.86068535, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.1973877, "step": 9254, "time_per_iteration": 4.280235528945923 }, { "auxiliary_loss_clip": 0.01220104, "auxiliary_loss_mlp": 0.0101876, "balance_loss_clip": 1.1225481, "balance_loss_mlp": 0.99940091, "epoch": 0.5564407034420562, "flos": 58661724170880.0, "grad_norm": 0.8717556817510829, "language_loss": 0.64900929, "learning_rate": 1.732272280610387e-06, "loss": 0.67139798, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.19335938, "step": 9255, "time_per_iteration": 4.573627233505249 }, { "auxiliary_loss_clip": 0.01428673, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.26257312, "balance_loss_mlp": 1.01786184, "epoch": 0.5565008266947242, "flos": 23122690381440.0, "grad_norm": 2.3795448880263432, "language_loss": 0.70203328, "learning_rate": 1.7318863292673399e-06, "loss": 0.72668505, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.1862793, "step": 9256, "time_per_iteration": 2.866562604904175 }, { "auxiliary_loss_clip": 0.01410969, "auxiliary_loss_mlp": 0.01030729, "balance_loss_clip": 1.24761224, "balance_loss_mlp": 1.01182282, "epoch": 0.5565609499473921, "flos": 21587972878080.0, "grad_norm": 1.5915660823393525, "language_loss": 0.76202178, "learning_rate": 1.73150038809119e-06, "loss": 0.78643876, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18908691, "step": 9257, "time_per_iteration": 2.8297667503356934 }, { "auxiliary_loss_clip": 0.01431724, "auxiliary_loss_mlp": 0.01032435, "balance_loss_clip": 1.26294756, "balance_loss_mlp": 1.01214552, "epoch": 0.5566210732000602, "flos": 18378893859840.0, "grad_norm": 2.1865536709232276, "language_loss": 0.61827201, "learning_rate": 1.7311144570965724e-06, "loss": 0.64291358, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20300293, "step": 9258, "time_per_iteration": 2.84818959236145 }, { "auxiliary_loss_clip": 0.01433467, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.26387882, "balance_loss_mlp": 1.01742935, "epoch": 0.5566811964527281, "flos": 25714528830720.0, "grad_norm": 1.6310406172683192, "language_loss": 0.80072582, "learning_rate": 1.7307285362981215e-06, "loss": 0.82544202, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20715332, "step": 9259, "time_per_iteration": 2.8979134559631348 }, { "auxiliary_loss_clip": 0.01415865, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.24972653, "balance_loss_mlp": 1.01355553, "epoch": 0.5567413197053961, "flos": 26955525219840.0, "grad_norm": 1.896720767264879, "language_loss": 0.8204664, "learning_rate": 1.7303426257104712e-06, "loss": 0.84495622, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19567871, "step": 9260, "time_per_iteration": 2.885531425476074 }, { "auxiliary_loss_clip": 0.01432133, "auxiliary_loss_mlp": 0.01037254, "balance_loss_clip": 1.26328135, "balance_loss_mlp": 1.01691675, "epoch": 0.556801442958064, "flos": 20860479434880.0, "grad_norm": 1.422876516680739, "language_loss": 0.69380534, "learning_rate": 1.729956725348256e-06, "loss": 0.71849918, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20349121, "step": 9261, "time_per_iteration": 2.8547275066375732 }, { "auxiliary_loss_clip": 0.01216215, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.1207794, "balance_loss_mlp": 1.00626886, "epoch": 0.556861566210732, "flos": 70527979572480.0, "grad_norm": 0.7432492172157791, "language_loss": 0.61172593, "learning_rate": 1.729570835226108e-06, "loss": 0.63415956, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.20898438, "step": 9262, "time_per_iteration": 3.3514251708984375 }, { "auxiliary_loss_clip": 0.01432686, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.26388597, "balance_loss_mlp": 1.01422799, "epoch": 0.5569216894633999, "flos": 25348044798720.0, "grad_norm": 2.0333074872899446, "language_loss": 0.65219796, "learning_rate": 1.7291849553586622e-06, "loss": 0.67687142, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.2043457, "step": 9263, "time_per_iteration": 2.8993165493011475 }, { "auxiliary_loss_clip": 0.01429936, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.26358628, "balance_loss_mlp": 1.01434755, "epoch": 0.556981812716068, "flos": 22649165856000.0, "grad_norm": 4.357551981511657, "language_loss": 0.73649466, "learning_rate": 1.7287990857605497e-06, "loss": 0.76112884, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19128418, "step": 9264, "time_per_iteration": 2.876309394836426 }, { "auxiliary_loss_clip": 0.01428633, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.26083446, "balance_loss_mlp": 1.01496863, "epoch": 0.5570419359687359, "flos": 11043168399360.0, "grad_norm": 2.217781855285657, "language_loss": 0.77826941, "learning_rate": 1.7284132264464022e-06, "loss": 0.80290139, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19592285, "step": 9265, "time_per_iteration": 2.8559341430664062 }, { "auxiliary_loss_clip": 0.01414961, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.25272107, "balance_loss_mlp": 1.01786196, "epoch": 0.5571020592214039, "flos": 22834127174400.0, "grad_norm": 1.5818533634717795, "language_loss": 0.71660846, "learning_rate": 1.7280273774308536e-06, "loss": 0.74112946, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19274902, "step": 9266, "time_per_iteration": 2.954575777053833 }, { "auxiliary_loss_clip": 0.01414241, "auxiliary_loss_mlp": 0.01036404, "balance_loss_clip": 1.24916267, "balance_loss_mlp": 1.01685381, "epoch": 0.5571621824740719, "flos": 22937548083840.0, "grad_norm": 1.8483822437606747, "language_loss": 0.69238961, "learning_rate": 1.727641538728533e-06, "loss": 0.71689606, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19543457, "step": 9267, "time_per_iteration": 2.848322629928589 }, { "auxiliary_loss_clip": 0.01416821, "auxiliary_loss_mlp": 0.01034803, "balance_loss_clip": 1.25416505, "balance_loss_mlp": 1.01565826, "epoch": 0.5572223057267398, "flos": 22977119301120.0, "grad_norm": 2.0282062803527965, "language_loss": 0.75365686, "learning_rate": 1.7272557103540736e-06, "loss": 0.77817309, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19140625, "step": 9268, "time_per_iteration": 2.8956868648529053 }, { "auxiliary_loss_clip": 0.01420458, "auxiliary_loss_mlp": 0.01035608, "balance_loss_clip": 1.25604057, "balance_loss_mlp": 1.01697588, "epoch": 0.5572824289794078, "flos": 20969963147520.0, "grad_norm": 2.0142127660046323, "language_loss": 0.75951433, "learning_rate": 1.726869892322104e-06, "loss": 0.78407502, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18640137, "step": 9269, "time_per_iteration": 2.8706235885620117 }, { "auxiliary_loss_clip": 0.01417318, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.25098372, "balance_loss_mlp": 1.0155127, "epoch": 0.5573425522320757, "flos": 25052332913280.0, "grad_norm": 1.8008261050843994, "language_loss": 0.837165, "learning_rate": 1.726484084647256e-06, "loss": 0.86169821, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20495605, "step": 9270, "time_per_iteration": 2.882671594619751 }, { "auxiliary_loss_clip": 0.01433964, "auxiliary_loss_mlp": 0.01039542, "balance_loss_clip": 1.26463532, "balance_loss_mlp": 1.02026582, "epoch": 0.5574026754847438, "flos": 23670154189440.0, "grad_norm": 1.8727066345973866, "language_loss": 0.80649883, "learning_rate": 1.7260982873441591e-06, "loss": 0.83123386, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19287109, "step": 9271, "time_per_iteration": 2.854628086090088 }, { "auxiliary_loss_clip": 0.01426142, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 1.26070237, "balance_loss_mlp": 1.01819396, "epoch": 0.5574627987374117, "flos": 24790853358720.0, "grad_norm": 1.825897085425875, "language_loss": 0.90769219, "learning_rate": 1.725712500427442e-06, "loss": 0.93232864, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19311523, "step": 9272, "time_per_iteration": 2.8624749183654785 }, { "auxiliary_loss_clip": 0.01409868, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.24770832, "balance_loss_mlp": 1.01951432, "epoch": 0.5575229219900797, "flos": 21844882707840.0, "grad_norm": 1.8957955201404233, "language_loss": 0.84707189, "learning_rate": 1.7253267239117347e-06, "loss": 0.87154806, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18225098, "step": 9273, "time_per_iteration": 2.8766167163848877 }, { "auxiliary_loss_clip": 0.01413011, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.24861872, "balance_loss_mlp": 1.01924944, "epoch": 0.5575830452427476, "flos": 27825739320960.0, "grad_norm": 2.9992908999921197, "language_loss": 0.75172174, "learning_rate": 1.7249409578116655e-06, "loss": 0.77624249, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19799805, "step": 9274, "time_per_iteration": 2.897397994995117 }, { "auxiliary_loss_clip": 0.01437291, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.26378989, "balance_loss_mlp": 1.01983619, "epoch": 0.5576431684954156, "flos": 17819440179840.0, "grad_norm": 3.0783446678549473, "language_loss": 0.7973572, "learning_rate": 1.7245552021418629e-06, "loss": 0.8221367, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.20825195, "step": 9275, "time_per_iteration": 4.236073732376099 }, { "auxiliary_loss_clip": 0.01430793, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.26365089, "balance_loss_mlp": 1.01432955, "epoch": 0.5577032917480835, "flos": 15495279822720.0, "grad_norm": 1.7890290772592448, "language_loss": 0.76463413, "learning_rate": 1.7241694569169546e-06, "loss": 0.78928578, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20031738, "step": 9276, "time_per_iteration": 2.8965048789978027 }, { "auxiliary_loss_clip": 0.01416858, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.25064206, "balance_loss_mlp": 1.01674294, "epoch": 0.5577634150007516, "flos": 21589692180480.0, "grad_norm": 2.0599009634448646, "language_loss": 0.76853293, "learning_rate": 1.7237837221515678e-06, "loss": 0.79305786, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18884277, "step": 9277, "time_per_iteration": 2.8652896881103516 }, { "auxiliary_loss_clip": 0.01407785, "auxiliary_loss_mlp": 0.01038339, "balance_loss_clip": 1.24409163, "balance_loss_mlp": 1.01946831, "epoch": 0.5578235382534195, "flos": 21148997397120.0, "grad_norm": 1.4742552708690304, "language_loss": 0.72323096, "learning_rate": 1.7233979978603304e-06, "loss": 0.74769217, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18859863, "step": 9278, "time_per_iteration": 2.8807456493377686 }, { "auxiliary_loss_clip": 0.01419731, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.25138164, "balance_loss_mlp": 1.01260078, "epoch": 0.5578836615060875, "flos": 26516368759680.0, "grad_norm": 1.697758220821337, "language_loss": 0.76115042, "learning_rate": 1.723012284057868e-06, "loss": 0.7856791, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20544434, "step": 9279, "time_per_iteration": 2.9149298667907715 }, { "auxiliary_loss_clip": 0.01424347, "auxiliary_loss_mlp": 0.01038112, "balance_loss_clip": 1.2568419, "balance_loss_mlp": 1.01790607, "epoch": 0.5579437847587555, "flos": 20162558108160.0, "grad_norm": 1.9944024837492509, "language_loss": 0.68205965, "learning_rate": 1.7226265807588082e-06, "loss": 0.70668423, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20202637, "step": 9280, "time_per_iteration": 2.8445005416870117 }, { "auxiliary_loss_clip": 0.01434038, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.26390243, "balance_loss_mlp": 1.02019715, "epoch": 0.5580039080114234, "flos": 26112937708800.0, "grad_norm": 1.8286381368691422, "language_loss": 0.73993313, "learning_rate": 1.7222408879777763e-06, "loss": 0.76467228, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19677734, "step": 9281, "time_per_iteration": 2.9101452827453613 }, { "auxiliary_loss_clip": 0.0141895, "auxiliary_loss_mlp": 0.01040492, "balance_loss_clip": 1.25440645, "balance_loss_mlp": 1.02053607, "epoch": 0.5580640312640914, "flos": 13779537298560.0, "grad_norm": 3.4304962154970475, "language_loss": 0.75196344, "learning_rate": 1.7218552057293974e-06, "loss": 0.77655786, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19958496, "step": 9282, "time_per_iteration": 2.829041004180908 }, { "auxiliary_loss_clip": 0.01414664, "auxiliary_loss_mlp": 0.01036441, "balance_loss_clip": 1.250126, "balance_loss_mlp": 1.01671159, "epoch": 0.5581241545167593, "flos": 17684954075520.0, "grad_norm": 1.7217431304903226, "language_loss": 0.66650379, "learning_rate": 1.721469534028297e-06, "loss": 0.69101483, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19726562, "step": 9283, "time_per_iteration": 2.8560469150543213 }, { "auxiliary_loss_clip": 0.01419436, "auxiliary_loss_mlp": 0.01033512, "balance_loss_clip": 1.25245571, "balance_loss_mlp": 1.01476002, "epoch": 0.5581842777694274, "flos": 19577559098880.0, "grad_norm": 2.162969930711974, "language_loss": 0.83748436, "learning_rate": 1.7210838728890994e-06, "loss": 0.86201382, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18737793, "step": 9284, "time_per_iteration": 2.8415160179138184 }, { "auxiliary_loss_clip": 0.01422443, "auxiliary_loss_mlp": 0.01036008, "balance_loss_clip": 1.25695956, "balance_loss_mlp": 1.01693463, "epoch": 0.5582444010220953, "flos": 20604972193920.0, "grad_norm": 3.165396790514728, "language_loss": 0.85994667, "learning_rate": 1.7206982223264304e-06, "loss": 0.88453114, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1907959, "step": 9285, "time_per_iteration": 2.865011692047119 }, { "auxiliary_loss_clip": 0.01414836, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.25034177, "balance_loss_mlp": 1.0210638, "epoch": 0.5583045242747633, "flos": 19144872645120.0, "grad_norm": 2.1443568091571255, "language_loss": 0.75049198, "learning_rate": 1.720312582354912e-06, "loss": 0.77505147, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20056152, "step": 9286, "time_per_iteration": 2.83902907371521 }, { "auxiliary_loss_clip": 0.01415815, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.25092292, "balance_loss_mlp": 1.01477766, "epoch": 0.5583646475274312, "flos": 27466177743360.0, "grad_norm": 1.6057836200072089, "language_loss": 0.74798, "learning_rate": 1.7199269529891684e-06, "loss": 0.77248812, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20214844, "step": 9287, "time_per_iteration": 2.9636423587799072 }, { "auxiliary_loss_clip": 0.01435097, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.26438236, "balance_loss_mlp": 1.01770294, "epoch": 0.5584247707800992, "flos": 23662779287040.0, "grad_norm": 1.5682248037557764, "language_loss": 0.75770253, "learning_rate": 1.7195413342438233e-06, "loss": 0.78242576, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.1953125, "step": 9288, "time_per_iteration": 4.363052606582642 }, { "auxiliary_loss_clip": 0.01424051, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.25793839, "balance_loss_mlp": 1.01746917, "epoch": 0.5584848940327671, "flos": 13706774380800.0, "grad_norm": 6.124889762055855, "language_loss": 0.7923367, "learning_rate": 1.7191557261334984e-06, "loss": 0.81694871, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19677734, "step": 9289, "time_per_iteration": 4.371615886688232 }, { "auxiliary_loss_clip": 0.01446536, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.27421403, "balance_loss_mlp": 1.01983571, "epoch": 0.5585450172854352, "flos": 27027564220800.0, "grad_norm": 1.7483631623971752, "language_loss": 0.62133467, "learning_rate": 1.718770128672817e-06, "loss": 0.64620894, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21057129, "step": 9290, "time_per_iteration": 4.2945849895477295 }, { "auxiliary_loss_clip": 0.01424829, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.2553668, "balance_loss_mlp": 1.01091075, "epoch": 0.5586051405381031, "flos": 23196131971200.0, "grad_norm": 1.8062052815106115, "language_loss": 0.68405575, "learning_rate": 1.7183845418764e-06, "loss": 0.70861113, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19812012, "step": 9291, "time_per_iteration": 2.8358399868011475 }, { "auxiliary_loss_clip": 0.0141675, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.24974561, "balance_loss_mlp": 1.01656759, "epoch": 0.5586652637907711, "flos": 20784866094720.0, "grad_norm": 1.8476504235218487, "language_loss": 0.85069847, "learning_rate": 1.7179989657588698e-06, "loss": 0.87523013, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19836426, "step": 9292, "time_per_iteration": 2.8832287788391113 }, { "auxiliary_loss_clip": 0.01415172, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.25100327, "balance_loss_mlp": 1.01695204, "epoch": 0.5587253870434391, "flos": 28231477856640.0, "grad_norm": 4.029888371957987, "language_loss": 0.7477597, "learning_rate": 1.7176134003348476e-06, "loss": 0.77227372, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19274902, "step": 9293, "time_per_iteration": 2.879192590713501 }, { "auxiliary_loss_clip": 0.0141109, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.2460475, "balance_loss_mlp": 1.01832652, "epoch": 0.558785510296107, "flos": 26627209816320.0, "grad_norm": 1.6827269062399426, "language_loss": 0.7279743, "learning_rate": 1.7172278456189523e-06, "loss": 0.75247073, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20227051, "step": 9294, "time_per_iteration": 2.8775336742401123 }, { "auxiliary_loss_clip": 0.0141989, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.25350988, "balance_loss_mlp": 1.0206697, "epoch": 0.558845633548775, "flos": 20166313426560.0, "grad_norm": 3.003460357299817, "language_loss": 0.68947786, "learning_rate": 1.716842301625806e-06, "loss": 0.71407753, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19396973, "step": 9295, "time_per_iteration": 2.8540284633636475 }, { "auxiliary_loss_clip": 0.01413907, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.24801052, "balance_loss_mlp": 1.01707697, "epoch": 0.5589057568014429, "flos": 24360926837760.0, "grad_norm": 1.5515043894462173, "language_loss": 0.81789756, "learning_rate": 1.7164567683700281e-06, "loss": 0.84241295, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20544434, "step": 9296, "time_per_iteration": 2.9652867317199707 }, { "auxiliary_loss_clip": 0.01409637, "auxiliary_loss_mlp": 0.0104146, "balance_loss_clip": 1.24465823, "balance_loss_mlp": 1.0203954, "epoch": 0.558965880054111, "flos": 21114900800640.0, "grad_norm": 1.701714122889957, "language_loss": 0.65689182, "learning_rate": 1.7160712458662379e-06, "loss": 0.6814028, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.21057129, "step": 9297, "time_per_iteration": 2.8729748725891113 }, { "auxiliary_loss_clip": 0.01433684, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.26402259, "balance_loss_mlp": 1.01977015, "epoch": 0.5590260033067789, "flos": 18443648448000.0, "grad_norm": 1.566592526248745, "language_loss": 0.76452398, "learning_rate": 1.7156857341290544e-06, "loss": 0.78926212, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20361328, "step": 9298, "time_per_iteration": 2.850032329559326 }, { "auxiliary_loss_clip": 0.01224613, "auxiliary_loss_mlp": 0.01064895, "balance_loss_clip": 1.12164044, "balance_loss_mlp": 1.03628433, "epoch": 0.5590861265594469, "flos": 70608841309440.0, "grad_norm": 0.7209466849920116, "language_loss": 0.52462107, "learning_rate": 1.7153002331730967e-06, "loss": 0.54751611, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.28515625, "step": 9299, "time_per_iteration": 3.482156991958618 }, { "auxiliary_loss_clip": 0.01401917, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.23959577, "balance_loss_mlp": 1.01681828, "epoch": 0.5591462498121148, "flos": 30676614105600.0, "grad_norm": 1.818869366568032, "language_loss": 0.70368147, "learning_rate": 1.7149147430129824e-06, "loss": 0.72808009, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.21130371, "step": 9300, "time_per_iteration": 2.998533010482788 }, { "auxiliary_loss_clip": 0.01426893, "auxiliary_loss_mlp": 0.0104564, "balance_loss_clip": 1.25639915, "balance_loss_mlp": 1.02424169, "epoch": 0.5592063730647828, "flos": 18159564476160.0, "grad_norm": 1.705624328466361, "language_loss": 0.82484972, "learning_rate": 1.7145292636633293e-06, "loss": 0.8495751, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21398926, "step": 9301, "time_per_iteration": 2.8691420555114746 }, { "auxiliary_loss_clip": 0.0142428, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.25579929, "balance_loss_mlp": 1.01664114, "epoch": 0.5592664963174507, "flos": 24070870552320.0, "grad_norm": 1.924345217538142, "language_loss": 0.68566865, "learning_rate": 1.714143795138756e-06, "loss": 0.7102893, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21142578, "step": 9302, "time_per_iteration": 2.896989107131958 }, { "auxiliary_loss_clip": 0.01426183, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.2552917, "balance_loss_mlp": 1.01704884, "epoch": 0.5593266195701188, "flos": 19837228861440.0, "grad_norm": 1.6307211442437868, "language_loss": 0.70959425, "learning_rate": 1.713758337453878e-06, "loss": 0.73423934, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21276855, "step": 9303, "time_per_iteration": 2.861273765563965 }, { "auxiliary_loss_clip": 0.01406283, "auxiliary_loss_mlp": 0.01049962, "balance_loss_clip": 1.24444389, "balance_loss_mlp": 1.03006601, "epoch": 0.5593867428227867, "flos": 25311504983040.0, "grad_norm": 1.6023447496843732, "language_loss": 0.73489279, "learning_rate": 1.7133728906233124e-06, "loss": 0.75945526, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19885254, "step": 9304, "time_per_iteration": 2.9268555641174316 }, { "auxiliary_loss_clip": 0.01402024, "auxiliary_loss_mlp": 0.01051206, "balance_loss_clip": 1.23684478, "balance_loss_mlp": 1.02980781, "epoch": 0.5594468660754547, "flos": 12940795595520.0, "grad_norm": 2.7310462241503353, "language_loss": 0.78705919, "learning_rate": 1.7129874546616763e-06, "loss": 0.81159151, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.21411133, "step": 9305, "time_per_iteration": 2.8147459030151367 }, { "auxiliary_loss_clip": 0.01393784, "auxiliary_loss_mlp": 0.01049584, "balance_loss_clip": 1.2325691, "balance_loss_mlp": 1.02826929, "epoch": 0.5595069893281227, "flos": 19072109727360.0, "grad_norm": 2.5432460525039757, "language_loss": 0.70187515, "learning_rate": 1.7126020295835836e-06, "loss": 0.72630882, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.21313477, "step": 9306, "time_per_iteration": 2.844775438308716 }, { "auxiliary_loss_clip": 0.0123112, "auxiliary_loss_mlp": 0.0104833, "balance_loss_clip": 1.12491226, "balance_loss_mlp": 1.01838517, "epoch": 0.5595671125807906, "flos": 70301546755200.0, "grad_norm": 0.9144979160315362, "language_loss": 0.60335124, "learning_rate": 1.7122166154036518e-06, "loss": 0.62614584, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 0.29882812, "step": 9307, "time_per_iteration": 3.4641947746276855 }, { "auxiliary_loss_clip": 0.0139548, "auxiliary_loss_mlp": 0.01044742, "balance_loss_clip": 1.23317146, "balance_loss_mlp": 1.0235343, "epoch": 0.5596272358334586, "flos": 20674658465280.0, "grad_norm": 1.8246570396371218, "language_loss": 0.74453717, "learning_rate": 1.7118312121364943e-06, "loss": 0.76893938, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.21203613, "step": 9308, "time_per_iteration": 2.8633666038513184 }, { "auxiliary_loss_clip": 0.01401732, "auxiliary_loss_mlp": 0.01054485, "balance_loss_clip": 1.23444819, "balance_loss_mlp": 1.03048825, "epoch": 0.5596873590861265, "flos": 25050568366080.0, "grad_norm": 1.7826291089004214, "language_loss": 0.70911634, "learning_rate": 1.7114458197967257e-06, "loss": 0.73367852, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.24023438, "step": 9309, "time_per_iteration": 2.9289212226867676 }, { "auxiliary_loss_clip": 0.01413355, "auxiliary_loss_mlp": 0.0104225, "balance_loss_clip": 1.24766254, "balance_loss_mlp": 1.01973128, "epoch": 0.5597474823387946, "flos": 25969538378880.0, "grad_norm": 2.899117471551678, "language_loss": 0.76131952, "learning_rate": 1.7110604383989613e-06, "loss": 0.78587556, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.22521973, "step": 9310, "time_per_iteration": 4.369547128677368 }, { "auxiliary_loss_clip": 0.01424579, "auxiliary_loss_mlp": 0.01042764, "balance_loss_clip": 1.25530648, "balance_loss_mlp": 1.02094924, "epoch": 0.5598076055914625, "flos": 26188686783360.0, "grad_norm": 1.9349595045523655, "language_loss": 0.703529, "learning_rate": 1.7106750679578133e-06, "loss": 0.72820234, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21813965, "step": 9311, "time_per_iteration": 2.9144012928009033 }, { "auxiliary_loss_clip": 0.01404755, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.24051404, "balance_loss_mlp": 1.01965272, "epoch": 0.5598677288441305, "flos": 11663168901120.0, "grad_norm": 4.080888023727629, "language_loss": 0.73353374, "learning_rate": 1.7102897084878962e-06, "loss": 0.75799286, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.21496582, "step": 9312, "time_per_iteration": 2.848167657852173 }, { "auxiliary_loss_clip": 0.01408004, "auxiliary_loss_mlp": 0.01037981, "balance_loss_clip": 1.24433696, "balance_loss_mlp": 1.01667845, "epoch": 0.5599278520967984, "flos": 22976983566720.0, "grad_norm": 1.9420917527393908, "language_loss": 0.90240407, "learning_rate": 1.709904360003822e-06, "loss": 0.92686391, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.2130127, "step": 9313, "time_per_iteration": 2.840688467025757 }, { "auxiliary_loss_clip": 0.01405407, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.24227023, "balance_loss_mlp": 1.01648188, "epoch": 0.5599879753494664, "flos": 21225560878080.0, "grad_norm": 1.6355564578980195, "language_loss": 0.78113729, "learning_rate": 1.709519022520204e-06, "loss": 0.80556852, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.21228027, "step": 9314, "time_per_iteration": 2.8981707096099854 }, { "auxiliary_loss_clip": 0.01404077, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.2403512, "balance_loss_mlp": 1.01465499, "epoch": 0.5600480986021343, "flos": 31915303009920.0, "grad_norm": 2.0085464865945655, "language_loss": 0.7098068, "learning_rate": 1.7091336960516537e-06, "loss": 0.73419952, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20532227, "step": 9315, "time_per_iteration": 2.9505648612976074 }, { "auxiliary_loss_clip": 0.01424053, "auxiliary_loss_mlp": 0.010385, "balance_loss_clip": 1.25474882, "balance_loss_mlp": 1.01798379, "epoch": 0.5601082218548024, "flos": 28487889993600.0, "grad_norm": 1.7916021451333635, "language_loss": 0.67531383, "learning_rate": 1.7087483806127824e-06, "loss": 0.69993937, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20507812, "step": 9316, "time_per_iteration": 2.908461093902588 }, { "auxiliary_loss_clip": 0.01395683, "auxiliary_loss_mlp": 0.01034305, "balance_loss_clip": 1.23247313, "balance_loss_mlp": 1.01362264, "epoch": 0.5601683451074703, "flos": 24107727081600.0, "grad_norm": 1.8772187334713977, "language_loss": 0.87311065, "learning_rate": 1.7083630762182022e-06, "loss": 0.89741051, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20678711, "step": 9317, "time_per_iteration": 2.8824527263641357 }, { "auxiliary_loss_clip": 0.0141445, "auxiliary_loss_mlp": 0.01040446, "balance_loss_clip": 1.24603486, "balance_loss_mlp": 1.01865423, "epoch": 0.5602284683601383, "flos": 26366544668160.0, "grad_norm": 1.6855291213773649, "language_loss": 0.78416234, "learning_rate": 1.7079777828825233e-06, "loss": 0.80871129, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21777344, "step": 9318, "time_per_iteration": 2.887216806411743 }, { "auxiliary_loss_clip": 0.01406962, "auxiliary_loss_mlp": 0.01044556, "balance_loss_clip": 1.24191833, "balance_loss_mlp": 1.02455282, "epoch": 0.5602885916128063, "flos": 24506497918080.0, "grad_norm": 1.6035752381846726, "language_loss": 0.76457369, "learning_rate": 1.7075925006203558e-06, "loss": 0.78908885, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20007324, "step": 9319, "time_per_iteration": 2.9100589752197266 }, { "auxiliary_loss_clip": 0.01407235, "auxiliary_loss_mlp": 0.01040574, "balance_loss_clip": 1.24284172, "balance_loss_mlp": 1.02010536, "epoch": 0.5603487148654742, "flos": 27356558296320.0, "grad_norm": 16.766503018733943, "language_loss": 0.86186934, "learning_rate": 1.7072072294463101e-06, "loss": 0.88634747, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20458984, "step": 9320, "time_per_iteration": 2.929385185241699 }, { "auxiliary_loss_clip": 0.01244405, "auxiliary_loss_mlp": 0.01063641, "balance_loss_clip": 1.14190555, "balance_loss_mlp": 1.03865516, "epoch": 0.5604088381181422, "flos": 54115376532480.0, "grad_norm": 0.7644657537327092, "language_loss": 0.52555829, "learning_rate": 1.706821969374996e-06, "loss": 0.54863876, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 1.03125, "router_z_loss_mlp": 0.24902344, "step": 9321, "time_per_iteration": 3.173736333847046 }, { "auxiliary_loss_clip": 0.01402749, "auxiliary_loss_mlp": 0.01040432, "balance_loss_clip": 1.24119782, "balance_loss_mlp": 1.0199399, "epoch": 0.5604689613708101, "flos": 22246458721920.0, "grad_norm": 1.3664775023855107, "language_loss": 0.75285757, "learning_rate": 1.7064367204210216e-06, "loss": 0.77728927, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20471191, "step": 9322, "time_per_iteration": 2.9062600135803223 }, { "auxiliary_loss_clip": 0.01404462, "auxiliary_loss_mlp": 0.01037679, "balance_loss_clip": 1.23807192, "balance_loss_mlp": 1.01576853, "epoch": 0.5605290846234782, "flos": 35311107893760.0, "grad_norm": 1.847533923289402, "language_loss": 0.74081314, "learning_rate": 1.7060514825989963e-06, "loss": 0.76523453, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21911621, "step": 9323, "time_per_iteration": 4.436866760253906 }, { "auxiliary_loss_clip": 0.01418372, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.24987829, "balance_loss_mlp": 1.01657867, "epoch": 0.5605892078761461, "flos": 20272132310400.0, "grad_norm": 1.6100945449504744, "language_loss": 0.62854695, "learning_rate": 1.7056662559235286e-06, "loss": 0.65310991, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21362305, "step": 9324, "time_per_iteration": 2.8644447326660156 }, { "auxiliary_loss_clip": 0.01401056, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.23616254, "balance_loss_mlp": 1.01568079, "epoch": 0.5606493311288141, "flos": 17316479272320.0, "grad_norm": 1.8741131451055681, "language_loss": 0.88587064, "learning_rate": 1.705281040409226e-06, "loss": 0.91025913, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.22131348, "step": 9325, "time_per_iteration": 5.599449872970581 }, { "auxiliary_loss_clip": 0.01410726, "auxiliary_loss_mlp": 0.01037533, "balance_loss_clip": 1.24296522, "balance_loss_mlp": 1.01606297, "epoch": 0.560709454381482, "flos": 21662998035840.0, "grad_norm": 1.6330646843795364, "language_loss": 0.74878407, "learning_rate": 1.7048958360706952e-06, "loss": 0.77326661, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.21472168, "step": 9326, "time_per_iteration": 2.852649450302124 }, { "auxiliary_loss_clip": 0.0142531, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.25514007, "balance_loss_mlp": 1.01160789, "epoch": 0.56076957763415, "flos": 20313106116480.0, "grad_norm": 2.6814447180793337, "language_loss": 0.79156345, "learning_rate": 1.7045106429225447e-06, "loss": 0.81614792, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21520996, "step": 9327, "time_per_iteration": 2.8337314128875732 }, { "auxiliary_loss_clip": 0.01407243, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.24119925, "balance_loss_mlp": 1.01407385, "epoch": 0.5608297008868179, "flos": 25056721658880.0, "grad_norm": 1.3883721348505498, "language_loss": 0.79077792, "learning_rate": 1.7041254609793795e-06, "loss": 0.81520498, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21386719, "step": 9328, "time_per_iteration": 2.9429502487182617 }, { "auxiliary_loss_clip": 0.01395138, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.23244667, "balance_loss_mlp": 1.01267934, "epoch": 0.560889824139486, "flos": 19876800078720.0, "grad_norm": 1.4897740457775261, "language_loss": 0.74595141, "learning_rate": 1.7037402902558066e-06, "loss": 0.77025175, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.22192383, "step": 9329, "time_per_iteration": 2.895230770111084 }, { "auxiliary_loss_clip": 0.01430742, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.25829935, "balance_loss_mlp": 1.01523519, "epoch": 0.5609499473921539, "flos": 22939584099840.0, "grad_norm": 1.8491541623266425, "language_loss": 0.84281659, "learning_rate": 1.7033551307664324e-06, "loss": 0.86749816, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.22167969, "step": 9330, "time_per_iteration": 2.8667471408843994 }, { "auxiliary_loss_clip": 0.01235737, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.13549376, "balance_loss_mlp": 1.00432777, "epoch": 0.5610100706448219, "flos": 53064770837760.0, "grad_norm": 0.7169877640485568, "language_loss": 0.57893485, "learning_rate": 1.7029699825258603e-06, "loss": 0.60157007, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.234375, "step": 9331, "time_per_iteration": 3.4176065921783447 }, { "auxiliary_loss_clip": 0.01410596, "auxiliary_loss_mlp": 0.010367, "balance_loss_clip": 1.24234819, "balance_loss_mlp": 1.01461017, "epoch": 0.5610701938974898, "flos": 21844882707840.0, "grad_norm": 1.8924933167931834, "language_loss": 0.82623172, "learning_rate": 1.7025848455486971e-06, "loss": 0.85070467, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.22106934, "step": 9332, "time_per_iteration": 2.849066734313965 }, { "auxiliary_loss_clip": 0.01428777, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.25762749, "balance_loss_mlp": 1.01989448, "epoch": 0.5611303171501578, "flos": 17466122384640.0, "grad_norm": 1.9329738822333147, "language_loss": 0.8256464, "learning_rate": 1.7021997198495454e-06, "loss": 0.85036618, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.23278809, "step": 9333, "time_per_iteration": 2.858982563018799 }, { "auxiliary_loss_clip": 0.01415006, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.246786, "balance_loss_mlp": 1.01457143, "epoch": 0.5611904404028258, "flos": 22647989491200.0, "grad_norm": 1.841649338890242, "language_loss": 0.73362577, "learning_rate": 1.7018146054430108e-06, "loss": 0.75813776, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21618652, "step": 9334, "time_per_iteration": 2.889134168624878 }, { "auxiliary_loss_clip": 0.01411417, "auxiliary_loss_mlp": 0.01040017, "balance_loss_clip": 1.2457211, "balance_loss_mlp": 1.01906037, "epoch": 0.5612505636554938, "flos": 14323652991360.0, "grad_norm": 1.7754154452143311, "language_loss": 0.72288465, "learning_rate": 1.7014295023436961e-06, "loss": 0.74739897, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.2097168, "step": 9335, "time_per_iteration": 2.831958770751953 }, { "auxiliary_loss_clip": 0.0141692, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.24908781, "balance_loss_mlp": 1.01531732, "epoch": 0.5613106869081618, "flos": 16516901583360.0, "grad_norm": 1.7175507800729208, "language_loss": 0.77346587, "learning_rate": 1.701044410566205e-06, "loss": 0.79800069, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21240234, "step": 9336, "time_per_iteration": 2.8406970500946045 }, { "auxiliary_loss_clip": 0.01404825, "auxiliary_loss_mlp": 0.01039696, "balance_loss_clip": 1.24020505, "balance_loss_mlp": 1.01982379, "epoch": 0.5613708101608297, "flos": 24068563067520.0, "grad_norm": 2.628127408423442, "language_loss": 0.65858203, "learning_rate": 1.7006593301251393e-06, "loss": 0.68302727, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19873047, "step": 9337, "time_per_iteration": 2.843745708465576 }, { "auxiliary_loss_clip": 0.01229456, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.1280508, "balance_loss_mlp": 0.99747032, "epoch": 0.5614309334134977, "flos": 64931415459840.0, "grad_norm": 0.8806738508813645, "language_loss": 0.62647903, "learning_rate": 1.700274261035102e-06, "loss": 0.64903438, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 1.015625, "router_z_loss_mlp": 0.28515625, "step": 9338, "time_per_iteration": 3.328169107437134 }, { "auxiliary_loss_clip": 0.01411027, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.24548769, "balance_loss_mlp": 1.01606607, "epoch": 0.5614910566661656, "flos": 32930907212160.0, "grad_norm": 2.047629599986269, "language_loss": 0.66775191, "learning_rate": 1.6998892033106946e-06, "loss": 0.69223118, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20849609, "step": 9339, "time_per_iteration": 2.949659585952759 }, { "auxiliary_loss_clip": 0.01400207, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.23661411, "balance_loss_mlp": 1.01169384, "epoch": 0.5615511799188336, "flos": 18598268488320.0, "grad_norm": 1.7716903093737688, "language_loss": 0.70817304, "learning_rate": 1.6995041569665184e-06, "loss": 0.73250186, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20983887, "step": 9340, "time_per_iteration": 2.870175361633301 }, { "auxiliary_loss_clip": 0.01389237, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.22969866, "balance_loss_mlp": 1.01385999, "epoch": 0.5616113031715015, "flos": 22830190876800.0, "grad_norm": 1.6141062334516045, "language_loss": 0.78106487, "learning_rate": 1.6991191220171756e-06, "loss": 0.80529654, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.20092773, "step": 9341, "time_per_iteration": 2.8883588314056396 }, { "auxiliary_loss_clip": 0.01413864, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.24569845, "balance_loss_mlp": 1.01267874, "epoch": 0.5616714264241696, "flos": 22355761455360.0, "grad_norm": 1.5484100035509594, "language_loss": 0.80557287, "learning_rate": 1.6987340984772653e-06, "loss": 0.83004069, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20227051, "step": 9342, "time_per_iteration": 2.8724279403686523 }, { "auxiliary_loss_clip": 0.01420599, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.25048566, "balance_loss_mlp": 1.01484156, "epoch": 0.5617315496768375, "flos": 18817235913600.0, "grad_norm": 1.7554286137194293, "language_loss": 0.77332783, "learning_rate": 1.6983490863613882e-06, "loss": 0.79789448, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.2121582, "step": 9343, "time_per_iteration": 2.8901476860046387 }, { "auxiliary_loss_clip": 0.01409825, "auxiliary_loss_mlp": 0.01040092, "balance_loss_clip": 1.24594426, "balance_loss_mlp": 1.01727533, "epoch": 0.5617916729295055, "flos": 18378486656640.0, "grad_norm": 1.726057266246324, "language_loss": 0.70127022, "learning_rate": 1.6979640856841442e-06, "loss": 0.7257694, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.22827148, "step": 9344, "time_per_iteration": 2.848414659500122 }, { "auxiliary_loss_clip": 0.01403613, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.23727751, "balance_loss_mlp": 1.01596367, "epoch": 0.5618517961821734, "flos": 28190639784960.0, "grad_norm": 1.8077263449864864, "language_loss": 0.67399824, "learning_rate": 1.6975790964601318e-06, "loss": 0.69839871, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20471191, "step": 9345, "time_per_iteration": 2.8788208961486816 }, { "auxiliary_loss_clip": 0.01413976, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.24861753, "balance_loss_mlp": 1.01501155, "epoch": 0.5619119194348414, "flos": 15495279822720.0, "grad_norm": 2.2772681082243946, "language_loss": 0.87691343, "learning_rate": 1.6971941187039512e-06, "loss": 0.90139359, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19030762, "step": 9346, "time_per_iteration": 4.216963529586792 }, { "auxiliary_loss_clip": 0.01398171, "auxiliary_loss_mlp": 0.01036375, "balance_loss_clip": 1.23459649, "balance_loss_mlp": 1.01562071, "epoch": 0.5619720426875094, "flos": 29139589117440.0, "grad_norm": 2.809654557608602, "language_loss": 0.59511828, "learning_rate": 1.6968091524301993e-06, "loss": 0.61946368, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20739746, "step": 9347, "time_per_iteration": 2.896257162094116 }, { "auxiliary_loss_clip": 0.01410704, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.24358332, "balance_loss_mlp": 1.01295018, "epoch": 0.5620321659401774, "flos": 18012636051840.0, "grad_norm": 2.1874516860300184, "language_loss": 0.7088939, "learning_rate": 1.6964241976534745e-06, "loss": 0.73334312, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.21276855, "step": 9348, "time_per_iteration": 2.857466220855713 }, { "auxiliary_loss_clip": 0.01424189, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.25444722, "balance_loss_mlp": 1.01411676, "epoch": 0.5620922891928454, "flos": 20604157787520.0, "grad_norm": 2.122950236091542, "language_loss": 0.79793125, "learning_rate": 1.6960392543883754e-06, "loss": 0.82253003, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.2154541, "step": 9349, "time_per_iteration": 2.8628158569335938 }, { "auxiliary_loss_clip": 0.01409928, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.24472141, "balance_loss_mlp": 1.01644957, "epoch": 0.5621524124455133, "flos": 26298306230400.0, "grad_norm": 3.9983993034414484, "language_loss": 0.68516552, "learning_rate": 1.6956543226494975e-06, "loss": 0.70963514, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20593262, "step": 9350, "time_per_iteration": 2.8944284915924072 }, { "auxiliary_loss_clip": 0.01420165, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.25065672, "balance_loss_mlp": 1.01763368, "epoch": 0.5622125356981813, "flos": 12757960782720.0, "grad_norm": 5.494201160770851, "language_loss": 0.79982817, "learning_rate": 1.6952694024514381e-06, "loss": 0.82441646, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21020508, "step": 9351, "time_per_iteration": 2.846269130706787 }, { "auxiliary_loss_clip": 0.01437847, "auxiliary_loss_mlp": 0.0104052, "balance_loss_clip": 1.26592577, "balance_loss_mlp": 1.01913333, "epoch": 0.5622726589508492, "flos": 23815815759360.0, "grad_norm": 1.4588508595375442, "language_loss": 0.5978151, "learning_rate": 1.6948844938087945e-06, "loss": 0.62259877, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21386719, "step": 9352, "time_per_iteration": 2.863811492919922 }, { "auxiliary_loss_clip": 0.01394603, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.23470068, "balance_loss_mlp": 1.0115515, "epoch": 0.5623327822035172, "flos": 24729130172160.0, "grad_norm": 1.7845986716832944, "language_loss": 0.72836679, "learning_rate": 1.6944995967361604e-06, "loss": 0.75263023, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.20178223, "step": 9353, "time_per_iteration": 2.898775577545166 }, { "auxiliary_loss_clip": 0.01419994, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.25182569, "balance_loss_mlp": 1.01937795, "epoch": 0.5623929054561851, "flos": 14025181173120.0, "grad_norm": 3.1412465205016797, "language_loss": 0.77755934, "learning_rate": 1.6941147112481327e-06, "loss": 0.80216038, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20739746, "step": 9354, "time_per_iteration": 2.7942707538604736 }, { "auxiliary_loss_clip": 0.01431758, "auxiliary_loss_mlp": 0.01037168, "balance_loss_clip": 1.2605269, "balance_loss_mlp": 1.01520979, "epoch": 0.5624530287088532, "flos": 20714184437760.0, "grad_norm": 2.0608824246219863, "language_loss": 0.7387377, "learning_rate": 1.6937298373593056e-06, "loss": 0.76342702, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21948242, "step": 9355, "time_per_iteration": 2.9006314277648926 }, { "auxiliary_loss_clip": 0.01410145, "auxiliary_loss_mlp": 0.01034318, "balance_loss_clip": 1.2442559, "balance_loss_mlp": 1.01443362, "epoch": 0.5625131519615211, "flos": 21480977629440.0, "grad_norm": 1.6989513527288505, "language_loss": 0.7432791, "learning_rate": 1.693344975084274e-06, "loss": 0.76772374, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19885254, "step": 9356, "time_per_iteration": 2.8853390216827393 }, { "auxiliary_loss_clip": 0.01407872, "auxiliary_loss_mlp": 0.01039565, "balance_loss_clip": 1.24416196, "balance_loss_mlp": 1.01803565, "epoch": 0.5625732752141891, "flos": 18707344997760.0, "grad_norm": 1.9723809837186166, "language_loss": 0.84349364, "learning_rate": 1.6929601244376318e-06, "loss": 0.86796802, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.21520996, "step": 9357, "time_per_iteration": 2.8349740505218506 }, { "auxiliary_loss_clip": 0.01411244, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.24705672, "balance_loss_mlp": 1.01392484, "epoch": 0.562633398466857, "flos": 16225578443520.0, "grad_norm": 2.0023594186351588, "language_loss": 0.73083436, "learning_rate": 1.6925752854339722e-06, "loss": 0.75529158, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20568848, "step": 9358, "time_per_iteration": 4.248685598373413 }, { "auxiliary_loss_clip": 0.01404114, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.23988008, "balance_loss_mlp": 1.01701689, "epoch": 0.562693521719525, "flos": 22502101697280.0, "grad_norm": 1.7105102547204254, "language_loss": 0.78646964, "learning_rate": 1.6921904580878885e-06, "loss": 0.81088638, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20544434, "step": 9359, "time_per_iteration": 4.311997175216675 }, { "auxiliary_loss_clip": 0.01416093, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.24921036, "balance_loss_mlp": 1.01722574, "epoch": 0.562753644972193, "flos": 25340353182720.0, "grad_norm": 1.8964015898555722, "language_loss": 0.71313059, "learning_rate": 1.6918056424139736e-06, "loss": 0.73765856, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19482422, "step": 9360, "time_per_iteration": 4.2870965003967285 }, { "auxiliary_loss_clip": 0.01223791, "auxiliary_loss_mlp": 0.01051149, "balance_loss_clip": 1.12468517, "balance_loss_mlp": 1.02578092, "epoch": 0.562813768224861, "flos": 67420466426880.0, "grad_norm": 0.7777117777699426, "language_loss": 0.55594409, "learning_rate": 1.6914208384268197e-06, "loss": 0.57869351, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 0.9921875, "router_z_loss_mlp": 0.25390625, "step": 9361, "time_per_iteration": 3.3018205165863037 }, { "auxiliary_loss_clip": 0.01404653, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.24203253, "balance_loss_mlp": 1.01737213, "epoch": 0.562873891477529, "flos": 23341567317120.0, "grad_norm": 1.4739897129490012, "language_loss": 0.82213289, "learning_rate": 1.691036046141018e-06, "loss": 0.84655291, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1998291, "step": 9362, "time_per_iteration": 2.8663251399993896 }, { "auxiliary_loss_clip": 0.01408437, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.24287617, "balance_loss_mlp": 1.0189271, "epoch": 0.5629340147301969, "flos": 38487176190720.0, "grad_norm": 2.0603222722126153, "language_loss": 0.75718373, "learning_rate": 1.6906512655711614e-06, "loss": 0.78166431, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20703125, "step": 9363, "time_per_iteration": 2.9943294525146484 }, { "auxiliary_loss_clip": 0.01419933, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.25236869, "balance_loss_mlp": 1.01462865, "epoch": 0.5629941379828649, "flos": 29254004513280.0, "grad_norm": 1.6907564539087825, "language_loss": 0.83718908, "learning_rate": 1.690266496731839e-06, "loss": 0.86173713, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20227051, "step": 9364, "time_per_iteration": 2.9175331592559814 }, { "auxiliary_loss_clip": 0.01399491, "auxiliary_loss_mlp": 0.01038837, "balance_loss_clip": 1.2382338, "balance_loss_mlp": 1.02019286, "epoch": 0.5630542612355328, "flos": 19428639903360.0, "grad_norm": 2.112363532664369, "language_loss": 0.66463292, "learning_rate": 1.689881739637642e-06, "loss": 0.68901616, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18640137, "step": 9365, "time_per_iteration": 2.8665270805358887 }, { "auxiliary_loss_clip": 0.01438667, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.26545334, "balance_loss_mlp": 1.01480079, "epoch": 0.5631143844882008, "flos": 22274673494400.0, "grad_norm": 3.0530219738535824, "language_loss": 0.82323456, "learning_rate": 1.6894969943031611e-06, "loss": 0.84797609, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20690918, "step": 9366, "time_per_iteration": 2.821389675140381 }, { "auxiliary_loss_clip": 0.01398512, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.23671603, "balance_loss_mlp": 1.02064848, "epoch": 0.5631745077408687, "flos": 22975354753920.0, "grad_norm": 1.4316957510975195, "language_loss": 0.74765313, "learning_rate": 1.6891122607429845e-06, "loss": 0.77204579, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20117188, "step": 9367, "time_per_iteration": 2.8589718341827393 }, { "auxiliary_loss_clip": 0.01222432, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.1255362, "balance_loss_mlp": 1.00885057, "epoch": 0.5632346309935368, "flos": 65113725317760.0, "grad_norm": 0.6355456449918558, "language_loss": 0.53555715, "learning_rate": 1.6887275389717028e-06, "loss": 0.55811608, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.24609375, "step": 9368, "time_per_iteration": 3.5262792110443115 }, { "auxiliary_loss_clip": 0.01418707, "auxiliary_loss_mlp": 0.01042219, "balance_loss_clip": 1.25434899, "balance_loss_mlp": 1.02263308, "epoch": 0.5632947542462047, "flos": 23013251913600.0, "grad_norm": 1.6119780567689044, "language_loss": 0.69318819, "learning_rate": 1.6883428290039046e-06, "loss": 0.71779752, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19580078, "step": 9369, "time_per_iteration": 2.8820393085479736 }, { "auxiliary_loss_clip": 0.01406259, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.24030542, "balance_loss_mlp": 1.0212847, "epoch": 0.5633548774988727, "flos": 30494910412800.0, "grad_norm": 1.714091383526198, "language_loss": 0.76295698, "learning_rate": 1.6879581308541763e-06, "loss": 0.78743184, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19934082, "step": 9370, "time_per_iteration": 2.9321036338806152 }, { "auxiliary_loss_clip": 0.01429197, "auxiliary_loss_mlp": 0.0105185, "balance_loss_clip": 1.25883889, "balance_loss_mlp": 1.03023767, "epoch": 0.5634150007515406, "flos": 18524193471360.0, "grad_norm": 1.9598298666554268, "language_loss": 0.7643494, "learning_rate": 1.687573444537108e-06, "loss": 0.78915989, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21606445, "step": 9371, "time_per_iteration": 2.8234100341796875 }, { "auxiliary_loss_clip": 0.01411089, "auxiliary_loss_mlp": 0.0104755, "balance_loss_clip": 1.24779141, "balance_loss_mlp": 1.02791619, "epoch": 0.5634751240042086, "flos": 19253994399360.0, "grad_norm": 1.8302339502994007, "language_loss": 0.76747006, "learning_rate": 1.687188770067285e-06, "loss": 0.79205644, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19628906, "step": 9372, "time_per_iteration": 2.8431994915008545 }, { "auxiliary_loss_clip": 0.01407976, "auxiliary_loss_mlp": 0.01044474, "balance_loss_clip": 1.2453239, "balance_loss_mlp": 1.0238272, "epoch": 0.5635352472568766, "flos": 12028114609920.0, "grad_norm": 3.064069483457172, "language_loss": 0.72621918, "learning_rate": 1.6868041074592956e-06, "loss": 0.75074369, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.2064209, "step": 9373, "time_per_iteration": 2.8267507553100586 }, { "auxiliary_loss_clip": 0.01417551, "auxiliary_loss_mlp": 0.01045344, "balance_loss_clip": 1.2511313, "balance_loss_mlp": 1.02357697, "epoch": 0.5635953705095446, "flos": 21881648747520.0, "grad_norm": 1.9323783826923924, "language_loss": 0.84033948, "learning_rate": 1.6864194567277264e-06, "loss": 0.86496842, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21765137, "step": 9374, "time_per_iteration": 2.96347975730896 }, { "auxiliary_loss_clip": 0.01406106, "auxiliary_loss_mlp": 0.01040547, "balance_loss_clip": 1.24189246, "balance_loss_mlp": 1.0211401, "epoch": 0.5636554937622126, "flos": 27137681360640.0, "grad_norm": 1.5082911657906233, "language_loss": 0.67234302, "learning_rate": 1.6860348178871618e-06, "loss": 0.69680953, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1940918, "step": 9375, "time_per_iteration": 2.9098799228668213 }, { "auxiliary_loss_clip": 0.01418046, "auxiliary_loss_mlp": 0.01043079, "balance_loss_clip": 1.24935555, "balance_loss_mlp": 1.0225271, "epoch": 0.5637156170148805, "flos": 12932470552320.0, "grad_norm": 2.074510503932805, "language_loss": 0.81165183, "learning_rate": 1.6856501909521889e-06, "loss": 0.83626306, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20544434, "step": 9376, "time_per_iteration": 2.7961621284484863 }, { "auxiliary_loss_clip": 0.01423293, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.25147486, "balance_loss_mlp": 1.01908457, "epoch": 0.5637757402675485, "flos": 45567213431040.0, "grad_norm": 1.36438214249065, "language_loss": 0.70256335, "learning_rate": 1.6852655759373925e-06, "loss": 0.72719026, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.203125, "step": 9377, "time_per_iteration": 3.052825450897217 }, { "auxiliary_loss_clip": 0.01406413, "auxiliary_loss_mlp": 0.0103999, "balance_loss_clip": 1.24455607, "balance_loss_mlp": 1.02035618, "epoch": 0.5638358635202164, "flos": 20895661906560.0, "grad_norm": 1.3331957922916715, "language_loss": 0.75091738, "learning_rate": 1.6848809728573565e-06, "loss": 0.77538139, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19628906, "step": 9378, "time_per_iteration": 2.8639626502990723 }, { "auxiliary_loss_clip": 0.01436299, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.26227987, "balance_loss_mlp": 1.01489305, "epoch": 0.5638959867728844, "flos": 18815697590400.0, "grad_norm": 2.268732276837131, "language_loss": 0.83194983, "learning_rate": 1.6844963817266656e-06, "loss": 0.8566612, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.19934082, "step": 9379, "time_per_iteration": 2.8374645709991455 }, { "auxiliary_loss_clip": 0.01415682, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.24827003, "balance_loss_mlp": 1.01864481, "epoch": 0.5639561100255523, "flos": 27501767418240.0, "grad_norm": 2.223143489132079, "language_loss": 0.73289323, "learning_rate": 1.6841118025599042e-06, "loss": 0.75743294, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19641113, "step": 9380, "time_per_iteration": 2.8951053619384766 }, { "auxiliary_loss_clip": 0.01421396, "auxiliary_loss_mlp": 0.01042668, "balance_loss_clip": 1.25323105, "balance_loss_mlp": 1.02119863, "epoch": 0.5640162332782204, "flos": 18085444214400.0, "grad_norm": 3.2341472460551066, "language_loss": 0.75731438, "learning_rate": 1.6837272353716542e-06, "loss": 0.781955, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21472168, "step": 9381, "time_per_iteration": 4.275495529174805 }, { "auxiliary_loss_clip": 0.01422864, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.2526722, "balance_loss_mlp": 1.01759052, "epoch": 0.5640763565308883, "flos": 20894168828160.0, "grad_norm": 2.322667603813712, "language_loss": 0.73521984, "learning_rate": 1.683342680176499e-06, "loss": 0.75982118, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19677734, "step": 9382, "time_per_iteration": 2.852339029312134 }, { "auxiliary_loss_clip": 0.01227916, "auxiliary_loss_mlp": 0.01024033, "balance_loss_clip": 1.12773645, "balance_loss_mlp": 1.00228965, "epoch": 0.5641364797835563, "flos": 64477753401600.0, "grad_norm": 0.7581882613420264, "language_loss": 0.54447323, "learning_rate": 1.682958136989022e-06, "loss": 0.5669927, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 0.21777344, "step": 9383, "time_per_iteration": 3.487288475036621 }, { "auxiliary_loss_clip": 0.01442321, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.26962399, "balance_loss_mlp": 1.01478302, "epoch": 0.5641966030362242, "flos": 18670126510080.0, "grad_norm": 1.8181744635838362, "language_loss": 0.71667194, "learning_rate": 1.6825736058238033e-06, "loss": 0.74144518, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20227051, "step": 9384, "time_per_iteration": 2.847229242324829 }, { "auxiliary_loss_clip": 0.01416249, "auxiliary_loss_mlp": 0.01033702, "balance_loss_clip": 1.24791849, "balance_loss_mlp": 1.01316214, "epoch": 0.5642567262888922, "flos": 22502599390080.0, "grad_norm": 1.9385148323712222, "language_loss": 0.76499945, "learning_rate": 1.6821890866954263e-06, "loss": 0.78949904, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20532227, "step": 9385, "time_per_iteration": 2.877686023712158 }, { "auxiliary_loss_clip": 0.01410278, "auxiliary_loss_mlp": 0.01038545, "balance_loss_clip": 1.24477279, "balance_loss_mlp": 1.01912558, "epoch": 0.5643168495415603, "flos": 13011251028480.0, "grad_norm": 2.1146457356709565, "language_loss": 0.82908708, "learning_rate": 1.6818045796184703e-06, "loss": 0.85357535, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19421387, "step": 9386, "time_per_iteration": 2.8118197917938232 }, { "auxiliary_loss_clip": 0.01437712, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.26500678, "balance_loss_mlp": 1.01553965, "epoch": 0.5643769727942282, "flos": 18597544571520.0, "grad_norm": 1.8255916140831692, "language_loss": 0.71131074, "learning_rate": 1.681420084607516e-06, "loss": 0.73604774, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.20458984, "step": 9387, "time_per_iteration": 2.8321290016174316 }, { "auxiliary_loss_clip": 0.01437643, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.26595187, "balance_loss_mlp": 1.01619887, "epoch": 0.5644370960468962, "flos": 33820350353280.0, "grad_norm": 1.5963940916579629, "language_loss": 0.75297099, "learning_rate": 1.6810356016771452e-06, "loss": 0.77771366, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20422363, "step": 9388, "time_per_iteration": 2.968747615814209 }, { "auxiliary_loss_clip": 0.01405198, "auxiliary_loss_mlp": 0.0103162, "balance_loss_clip": 1.24193406, "balance_loss_mlp": 1.01298797, "epoch": 0.5644972192995641, "flos": 21224746471680.0, "grad_norm": 1.5038401033374895, "language_loss": 0.82912266, "learning_rate": 1.6806511308419353e-06, "loss": 0.85349089, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18640137, "step": 9389, "time_per_iteration": 2.884032964706421 }, { "auxiliary_loss_clip": 0.01427182, "auxiliary_loss_mlp": 0.0103569, "balance_loss_clip": 1.25746167, "balance_loss_mlp": 1.0150435, "epoch": 0.5645573425522321, "flos": 18596820654720.0, "grad_norm": 1.859611929736712, "language_loss": 0.64393973, "learning_rate": 1.680266672116467e-06, "loss": 0.66856843, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.2064209, "step": 9390, "time_per_iteration": 2.8411383628845215 }, { "auxiliary_loss_clip": 0.01411961, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.24688363, "balance_loss_mlp": 1.01220465, "epoch": 0.5646174658049, "flos": 18122888926080.0, "grad_norm": 1.7063661023909833, "language_loss": 0.93209946, "learning_rate": 1.6798822255153192e-06, "loss": 0.95653713, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19616699, "step": 9391, "time_per_iteration": 2.836562395095825 }, { "auxiliary_loss_clip": 0.01438898, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.26500416, "balance_loss_mlp": 1.01496673, "epoch": 0.564677589057568, "flos": 28341730730880.0, "grad_norm": 2.466757371544214, "language_loss": 0.62037551, "learning_rate": 1.6794977910530684e-06, "loss": 0.64513099, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.2166748, "step": 9392, "time_per_iteration": 4.329622268676758 }, { "auxiliary_loss_clip": 0.01421063, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.2527976, "balance_loss_mlp": 1.01288581, "epoch": 0.564737712310236, "flos": 22173876783360.0, "grad_norm": 2.3602645247450003, "language_loss": 0.82508719, "learning_rate": 1.6791133687442937e-06, "loss": 0.8496291, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20239258, "step": 9393, "time_per_iteration": 2.88104248046875 }, { "auxiliary_loss_clip": 0.01423514, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.25686729, "balance_loss_mlp": 1.01245511, "epoch": 0.564797835562904, "flos": 20968651048320.0, "grad_norm": 1.972706552632908, "language_loss": 0.87693, "learning_rate": 1.6787289586035725e-06, "loss": 0.90149105, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20141602, "step": 9394, "time_per_iteration": 4.339195966720581 }, { "auxiliary_loss_clip": 0.01420822, "auxiliary_loss_mlp": 0.01040691, "balance_loss_clip": 1.2549994, "balance_loss_mlp": 1.02092648, "epoch": 0.5648579588155719, "flos": 17429130120960.0, "grad_norm": 1.7747426413288732, "language_loss": 0.86112571, "learning_rate": 1.6783445606454814e-06, "loss": 0.88574082, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19763184, "step": 9395, "time_per_iteration": 4.272717475891113 }, { "auxiliary_loss_clip": 0.01226234, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 1.13356709, "balance_loss_mlp": 1.00263298, "epoch": 0.5649180820682399, "flos": 69963114499200.0, "grad_norm": 0.795948793216419, "language_loss": 0.58376384, "learning_rate": 1.677960174884597e-06, "loss": 0.60627568, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.22363281, "step": 9396, "time_per_iteration": 3.3553919792175293 }, { "auxiliary_loss_clip": 0.0144666, "auxiliary_loss_mlp": 0.01033283, "balance_loss_clip": 1.27568412, "balance_loss_mlp": 1.01355386, "epoch": 0.5649782053209078, "flos": 24983913496320.0, "grad_norm": 1.7967393232975348, "language_loss": 0.71360409, "learning_rate": 1.6775758013354943e-06, "loss": 0.7384035, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.1973877, "step": 9397, "time_per_iteration": 2.900918483734131 }, { "auxiliary_loss_clip": 0.01438305, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.26708269, "balance_loss_mlp": 1.01602387, "epoch": 0.5650383285735758, "flos": 21736801584000.0, "grad_norm": 2.3240852399449223, "language_loss": 0.67901731, "learning_rate": 1.67719144001275e-06, "loss": 0.70376104, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20043945, "step": 9398, "time_per_iteration": 2.8741259574890137 }, { "auxiliary_loss_clip": 0.01238678, "auxiliary_loss_mlp": 0.01024849, "balance_loss_clip": 1.14254212, "balance_loss_mlp": 0.99986249, "epoch": 0.5650984518262439, "flos": 65933961897600.0, "grad_norm": 0.7804560745935144, "language_loss": 0.58168489, "learning_rate": 1.6768070909309386e-06, "loss": 0.60432017, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.24902344, "step": 9399, "time_per_iteration": 3.268998146057129 }, { "auxiliary_loss_clip": 0.01438163, "auxiliary_loss_mlp": 0.01038433, "balance_loss_clip": 1.26536059, "balance_loss_mlp": 1.01773834, "epoch": 0.5651585750789118, "flos": 21042409351680.0, "grad_norm": 1.8202441845040518, "language_loss": 0.73517835, "learning_rate": 1.6764227541046347e-06, "loss": 0.75994432, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.20690918, "step": 9400, "time_per_iteration": 2.8744728565216064 }, { "auxiliary_loss_clip": 0.01451148, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.27694178, "balance_loss_mlp": 1.0203867, "epoch": 0.5652186983315798, "flos": 18561185735040.0, "grad_norm": 2.1055788907332804, "language_loss": 0.61871833, "learning_rate": 1.676038429548412e-06, "loss": 0.64364374, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.21008301, "step": 9401, "time_per_iteration": 2.9127774238586426 }, { "auxiliary_loss_clip": 0.01436551, "auxiliary_loss_mlp": 0.01034951, "balance_loss_clip": 1.26808512, "balance_loss_mlp": 1.01540065, "epoch": 0.5652788215842477, "flos": 18487834634880.0, "grad_norm": 1.8836278288537285, "language_loss": 0.81901073, "learning_rate": 1.6756541172768453e-06, "loss": 0.8437258, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19543457, "step": 9402, "time_per_iteration": 2.8449625968933105 }, { "auxiliary_loss_clip": 0.01425072, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.25865054, "balance_loss_mlp": 1.0227468, "epoch": 0.5653389448369157, "flos": 30056070666240.0, "grad_norm": 1.3837350678578444, "language_loss": 0.78417832, "learning_rate": 1.6752698173045068e-06, "loss": 0.80884427, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.18762207, "step": 9403, "time_per_iteration": 2.9033427238464355 }, { "auxiliary_loss_clip": 0.01438379, "auxiliary_loss_mlp": 0.01044512, "balance_loss_clip": 1.27000928, "balance_loss_mlp": 1.02436519, "epoch": 0.5653990680895836, "flos": 16736321456640.0, "grad_norm": 1.688410070589302, "language_loss": 0.69972134, "learning_rate": 1.6748855296459685e-06, "loss": 0.72455025, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20141602, "step": 9404, "time_per_iteration": 2.8530681133270264 }, { "auxiliary_loss_clip": 0.01423742, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.2598207, "balance_loss_mlp": 1.01717854, "epoch": 0.5654591913422516, "flos": 14546420979840.0, "grad_norm": 1.8894020963364817, "language_loss": 0.67997372, "learning_rate": 1.6745012543158045e-06, "loss": 0.70457578, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19274902, "step": 9405, "time_per_iteration": 2.815533399581909 }, { "auxiliary_loss_clip": 0.01420662, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.26054168, "balance_loss_mlp": 1.01742983, "epoch": 0.5655193145949196, "flos": 26220113936640.0, "grad_norm": 1.7506548530627937, "language_loss": 0.75133622, "learning_rate": 1.6741169913285852e-06, "loss": 0.77590823, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19104004, "step": 9406, "time_per_iteration": 2.8814337253570557 }, { "auxiliary_loss_clip": 0.01451575, "auxiliary_loss_mlp": 0.01040103, "balance_loss_clip": 1.28044248, "balance_loss_mlp": 1.0195632, "epoch": 0.5655794378475876, "flos": 25057581310080.0, "grad_norm": 4.235114589794394, "language_loss": 0.80161226, "learning_rate": 1.673732740698882e-06, "loss": 0.82652903, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.20544434, "step": 9407, "time_per_iteration": 2.8819329738616943 }, { "auxiliary_loss_clip": 0.01431928, "auxiliary_loss_mlp": 0.01039565, "balance_loss_clip": 1.26857901, "balance_loss_mlp": 1.02017009, "epoch": 0.5656395611002555, "flos": 31046084294400.0, "grad_norm": 1.4342920188084056, "language_loss": 0.72090453, "learning_rate": 1.6733485024412666e-06, "loss": 0.74561942, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19396973, "step": 9408, "time_per_iteration": 2.9531404972076416 }, { "auxiliary_loss_clip": 0.01427705, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.26391757, "balance_loss_mlp": 1.0143851, "epoch": 0.5656996843529235, "flos": 20239031099520.0, "grad_norm": 2.2292977384186052, "language_loss": 0.81928909, "learning_rate": 1.672964276570308e-06, "loss": 0.8439039, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19396973, "step": 9409, "time_per_iteration": 2.8472046852111816 }, { "auxiliary_loss_clip": 0.01439677, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.27089834, "balance_loss_mlp": 1.01451159, "epoch": 0.5657598076055914, "flos": 21006095760000.0, "grad_norm": 1.6612052019295793, "language_loss": 0.78912699, "learning_rate": 1.6725800631005776e-06, "loss": 0.81386429, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19543457, "step": 9410, "time_per_iteration": 2.8426504135131836 }, { "auxiliary_loss_clip": 0.01437124, "auxiliary_loss_mlp": 0.01036932, "balance_loss_clip": 1.26987588, "balance_loss_mlp": 1.01736927, "epoch": 0.5658199308582594, "flos": 11553277985280.0, "grad_norm": 2.383238828255604, "language_loss": 0.8424896, "learning_rate": 1.6721958620466432e-06, "loss": 0.86723024, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19555664, "step": 9411, "time_per_iteration": 2.834829568862915 }, { "auxiliary_loss_clip": 0.01452636, "auxiliary_loss_mlp": 0.01036761, "balance_loss_clip": 1.28040481, "balance_loss_mlp": 1.01644742, "epoch": 0.5658800541109275, "flos": 14179891703040.0, "grad_norm": 7.261702149814612, "language_loss": 0.68351585, "learning_rate": 1.6718116734230749e-06, "loss": 0.70840979, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20300293, "step": 9412, "time_per_iteration": 2.8223440647125244 }, { "auxiliary_loss_clip": 0.01421868, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.26023149, "balance_loss_mlp": 1.01352525, "epoch": 0.5659401773635954, "flos": 27315720224640.0, "grad_norm": 1.40825707180426, "language_loss": 0.58803034, "learning_rate": 1.6714274972444413e-06, "loss": 0.61257613, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19189453, "step": 9413, "time_per_iteration": 2.8971059322357178 }, { "auxiliary_loss_clip": 0.01423166, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.25879955, "balance_loss_mlp": 1.01635754, "epoch": 0.5660003006162634, "flos": 16737135863040.0, "grad_norm": 1.689984000509327, "language_loss": 0.70497054, "learning_rate": 1.6710433335253092e-06, "loss": 0.72955352, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18798828, "step": 9414, "time_per_iteration": 2.815650224685669 }, { "auxiliary_loss_clip": 0.01430463, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.26534247, "balance_loss_mlp": 1.01580513, "epoch": 0.5660604238689313, "flos": 21663586218240.0, "grad_norm": 1.5307436858923236, "language_loss": 0.7880283, "learning_rate": 1.670659182280247e-06, "loss": 0.81267828, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.18713379, "step": 9415, "time_per_iteration": 4.318682670593262 }, { "auxiliary_loss_clip": 0.01239351, "auxiliary_loss_mlp": 0.01019786, "balance_loss_clip": 1.14269853, "balance_loss_mlp": 1.0009985, "epoch": 0.5661205471215993, "flos": 68854432464000.0, "grad_norm": 0.6834210269193465, "language_loss": 0.49173048, "learning_rate": 1.670275043523822e-06, "loss": 0.51432186, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 0.96484375, "router_z_loss_mlp": 0.1875, "step": 9416, "time_per_iteration": 3.5066428184509277 }, { "auxiliary_loss_clip": 0.01443836, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.27535439, "balance_loss_mlp": 1.01690745, "epoch": 0.5661806703742672, "flos": 28633596808320.0, "grad_norm": 2.3201586426157017, "language_loss": 0.64116001, "learning_rate": 1.6698909172706e-06, "loss": 0.6659683, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20080566, "step": 9417, "time_per_iteration": 2.957254409790039 }, { "auxiliary_loss_clip": 0.01436984, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.26676369, "balance_loss_mlp": 1.01299, "epoch": 0.5662407936269352, "flos": 21407988487680.0, "grad_norm": 1.658988816320876, "language_loss": 0.69352567, "learning_rate": 1.6695068035351479e-06, "loss": 0.71822447, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.19885254, "step": 9418, "time_per_iteration": 2.9104931354522705 }, { "auxiliary_loss_clip": 0.01431905, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.26424956, "balance_loss_mlp": 1.01454866, "epoch": 0.5663009168796032, "flos": 25669392503040.0, "grad_norm": 1.7886751878880403, "language_loss": 0.65221971, "learning_rate": 1.6691227023320304e-06, "loss": 0.67688638, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20227051, "step": 9419, "time_per_iteration": 2.9190573692321777 }, { "auxiliary_loss_clip": 0.01234458, "auxiliary_loss_mlp": 0.01025605, "balance_loss_clip": 1.13892555, "balance_loss_mlp": 1.00367022, "epoch": 0.5663610401322712, "flos": 67965459753600.0, "grad_norm": 0.7534772980186489, "language_loss": 0.59746611, "learning_rate": 1.6687386136758135e-06, "loss": 0.62006676, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.21972656, "step": 9420, "time_per_iteration": 3.353546619415283 }, { "auxiliary_loss_clip": 0.01430826, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.26598978, "balance_loss_mlp": 1.01897073, "epoch": 0.5664211633849391, "flos": 24620415621120.0, "grad_norm": 1.7420575134703122, "language_loss": 0.75812018, "learning_rate": 1.6683545375810618e-06, "loss": 0.78280628, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18798828, "step": 9421, "time_per_iteration": 2.8828439712524414 }, { "auxiliary_loss_clip": 0.01435542, "auxiliary_loss_mlp": 0.01035761, "balance_loss_clip": 1.26540816, "balance_loss_mlp": 1.01668715, "epoch": 0.5664812866376071, "flos": 11654391409920.0, "grad_norm": 2.1195294468722357, "language_loss": 0.74239552, "learning_rate": 1.6679704740623389e-06, "loss": 0.76710856, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.1907959, "step": 9422, "time_per_iteration": 2.841552972793579 }, { "auxiliary_loss_clip": 0.0142642, "auxiliary_loss_mlp": 0.01034903, "balance_loss_clip": 1.26231456, "balance_loss_mlp": 1.01683092, "epoch": 0.566541409890275, "flos": 24654557462400.0, "grad_norm": 1.8289011604541323, "language_loss": 0.82164085, "learning_rate": 1.6675864231342085e-06, "loss": 0.84625411, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18066406, "step": 9423, "time_per_iteration": 2.8847339153289795 }, { "auxiliary_loss_clip": 0.01434456, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.26784575, "balance_loss_mlp": 1.01741958, "epoch": 0.566601533142943, "flos": 22280057625600.0, "grad_norm": 1.7173130125778444, "language_loss": 0.81583792, "learning_rate": 1.6672023848112353e-06, "loss": 0.84055144, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19482422, "step": 9424, "time_per_iteration": 2.8601438999176025 }, { "auxiliary_loss_clip": 0.01438195, "auxiliary_loss_mlp": 0.01033056, "balance_loss_clip": 1.26757312, "balance_loss_mlp": 1.01350617, "epoch": 0.5666616563956111, "flos": 29983805441280.0, "grad_norm": 1.9357972786342548, "language_loss": 0.79486442, "learning_rate": 1.6668183591079805e-06, "loss": 0.81957698, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19543457, "step": 9425, "time_per_iteration": 2.897944927215576 }, { "auxiliary_loss_clip": 0.01421783, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.25585485, "balance_loss_mlp": 1.01396883, "epoch": 0.566721779648279, "flos": 17789415615360.0, "grad_norm": 2.0877704395054213, "language_loss": 0.59876502, "learning_rate": 1.6664343460390064e-06, "loss": 0.62332016, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19763184, "step": 9426, "time_per_iteration": 2.831064462661743 }, { "auxiliary_loss_clip": 0.01441147, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.27021086, "balance_loss_mlp": 1.01458192, "epoch": 0.566781902900947, "flos": 21043540471680.0, "grad_norm": 1.5933838572721544, "language_loss": 0.82286489, "learning_rate": 1.6660503456188764e-06, "loss": 0.847615, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19274902, "step": 9427, "time_per_iteration": 4.355990886688232 }, { "auxiliary_loss_clip": 0.01423334, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.26056004, "balance_loss_mlp": 1.01465487, "epoch": 0.5668420261536149, "flos": 23158913483520.0, "grad_norm": 1.7717698194449667, "language_loss": 0.86842209, "learning_rate": 1.6656663578621498e-06, "loss": 0.89299583, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19384766, "step": 9428, "time_per_iteration": 2.897840976715088 }, { "auxiliary_loss_clip": 0.01438728, "auxiliary_loss_mlp": 0.01040245, "balance_loss_clip": 1.2664783, "balance_loss_mlp": 1.02009869, "epoch": 0.5669021494062829, "flos": 22611856878720.0, "grad_norm": 2.7325118607728913, "language_loss": 0.73919731, "learning_rate": 1.6652823827833886e-06, "loss": 0.76398706, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20129395, "step": 9429, "time_per_iteration": 4.237279891967773 }, { "auxiliary_loss_clip": 0.01437018, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.26581514, "balance_loss_mlp": 1.01581979, "epoch": 0.5669622726589508, "flos": 17389377924480.0, "grad_norm": 1.7631268607340798, "language_loss": 0.76056099, "learning_rate": 1.6648984203971538e-06, "loss": 0.78528923, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.1998291, "step": 9430, "time_per_iteration": 4.221285104751587 }, { "auxiliary_loss_clip": 0.01434374, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.26724231, "balance_loss_mlp": 1.01503515, "epoch": 0.5670223959116188, "flos": 18770606507520.0, "grad_norm": 2.037654132148481, "language_loss": 0.7358259, "learning_rate": 1.6645144707180032e-06, "loss": 0.76051855, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.1986084, "step": 9431, "time_per_iteration": 2.852400541305542 }, { "auxiliary_loss_clip": 0.01393186, "auxiliary_loss_mlp": 0.01031747, "balance_loss_clip": 1.23572087, "balance_loss_mlp": 1.01281655, "epoch": 0.5670825191642868, "flos": 13561293790080.0, "grad_norm": 1.7425923165099155, "language_loss": 0.74225903, "learning_rate": 1.6641305337604984e-06, "loss": 0.7665084, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18933105, "step": 9432, "time_per_iteration": 2.8428280353546143 }, { "auxiliary_loss_clip": 0.01424315, "auxiliary_loss_mlp": 0.0103437, "balance_loss_clip": 1.25721073, "balance_loss_mlp": 1.01466429, "epoch": 0.5671426424169548, "flos": 22064076357120.0, "grad_norm": 1.4640797497492521, "language_loss": 0.78866804, "learning_rate": 1.663746609539197e-06, "loss": 0.81325489, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19714355, "step": 9433, "time_per_iteration": 2.871584415435791 }, { "auxiliary_loss_clip": 0.01428109, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.25704789, "balance_loss_mlp": 1.01402521, "epoch": 0.5672027656696227, "flos": 21333732491520.0, "grad_norm": 3.061552338031281, "language_loss": 0.64415681, "learning_rate": 1.6633626980686582e-06, "loss": 0.66879785, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.21972656, "step": 9434, "time_per_iteration": 2.847593069076538 }, { "auxiliary_loss_clip": 0.01408463, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.24457812, "balance_loss_mlp": 1.01438928, "epoch": 0.5672628889222907, "flos": 23524628353920.0, "grad_norm": 1.8061187349951455, "language_loss": 0.66941649, "learning_rate": 1.6629787993634399e-06, "loss": 0.69384038, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19543457, "step": 9435, "time_per_iteration": 2.9053030014038086 }, { "auxiliary_loss_clip": 0.01407147, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.24265754, "balance_loss_mlp": 1.01047516, "epoch": 0.5673230121749586, "flos": 27132342474240.0, "grad_norm": 34.46757317767153, "language_loss": 0.72155058, "learning_rate": 1.6625949134380984e-06, "loss": 0.74591887, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19189453, "step": 9436, "time_per_iteration": 2.8863418102264404 }, { "auxiliary_loss_clip": 0.01429011, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.25790048, "balance_loss_mlp": 1.01515937, "epoch": 0.5673831354276266, "flos": 31155160803840.0, "grad_norm": 1.6100964268055542, "language_loss": 0.74743372, "learning_rate": 1.6622110403071921e-06, "loss": 0.77207905, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20361328, "step": 9437, "time_per_iteration": 2.947248697280884 }, { "auxiliary_loss_clip": 0.01427966, "auxiliary_loss_mlp": 0.01034853, "balance_loss_clip": 1.25898719, "balance_loss_mlp": 1.01506472, "epoch": 0.5674432586802945, "flos": 27684783210240.0, "grad_norm": 1.9088402119251382, "language_loss": 0.61716866, "learning_rate": 1.661827179985277e-06, "loss": 0.64179695, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19775391, "step": 9438, "time_per_iteration": 2.9599976539611816 }, { "auxiliary_loss_clip": 0.01419722, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.25071859, "balance_loss_mlp": 1.0153712, "epoch": 0.5675033819329626, "flos": 26626485899520.0, "grad_norm": 1.4536720182862295, "language_loss": 0.75945854, "learning_rate": 1.661443332486909e-06, "loss": 0.78400385, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19445801, "step": 9439, "time_per_iteration": 2.8907687664031982 }, { "auxiliary_loss_clip": 0.01413379, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.2486304, "balance_loss_mlp": 1.01270962, "epoch": 0.5675635051856306, "flos": 19107563667840.0, "grad_norm": 1.9506268761534913, "language_loss": 0.84168601, "learning_rate": 1.6610594978266438e-06, "loss": 0.86615717, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.21032715, "step": 9440, "time_per_iteration": 2.8396666049957275 }, { "auxiliary_loss_clip": 0.01433925, "auxiliary_loss_mlp": 0.01038772, "balance_loss_clip": 1.26058745, "balance_loss_mlp": 1.01792192, "epoch": 0.5676236284382985, "flos": 17575198894080.0, "grad_norm": 2.0667624243172926, "language_loss": 0.76768792, "learning_rate": 1.6606756760190365e-06, "loss": 0.79241484, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20849609, "step": 9441, "time_per_iteration": 2.8016974925994873 }, { "auxiliary_loss_clip": 0.01420666, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.25374007, "balance_loss_mlp": 1.01561987, "epoch": 0.5676837516909665, "flos": 15960750773760.0, "grad_norm": 1.8099527254814918, "language_loss": 0.8363477, "learning_rate": 1.6602918670786413e-06, "loss": 0.86090779, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19726562, "step": 9442, "time_per_iteration": 2.8361682891845703 }, { "auxiliary_loss_clip": 0.01398651, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.23849964, "balance_loss_mlp": 1.0153017, "epoch": 0.5677438749436344, "flos": 18304954577280.0, "grad_norm": 2.0092365533127996, "language_loss": 0.75162971, "learning_rate": 1.6599080710200126e-06, "loss": 0.77598137, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.21228027, "step": 9443, "time_per_iteration": 2.8197946548461914 }, { "auxiliary_loss_clip": 0.01422529, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.25566483, "balance_loss_mlp": 1.01216388, "epoch": 0.5678039981963025, "flos": 17940325582080.0, "grad_norm": 1.8382722535720921, "language_loss": 0.78273636, "learning_rate": 1.6595242878577046e-06, "loss": 0.80728316, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1998291, "step": 9444, "time_per_iteration": 2.895387649536133 }, { "auxiliary_loss_clip": 0.01431482, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.25951064, "balance_loss_mlp": 1.02061558, "epoch": 0.5678641214489704, "flos": 19325354728320.0, "grad_norm": 2.0585487914692755, "language_loss": 0.81391823, "learning_rate": 1.6591405176062687e-06, "loss": 0.83865047, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.21130371, "step": 9445, "time_per_iteration": 2.8279154300689697 }, { "auxiliary_loss_clip": 0.0141918, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.25174046, "balance_loss_mlp": 1.01336598, "epoch": 0.5679242447016384, "flos": 27763970889600.0, "grad_norm": 1.4110182194907486, "language_loss": 0.71606338, "learning_rate": 1.658756760280259e-06, "loss": 0.7405926, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20361328, "step": 9446, "time_per_iteration": 2.917187213897705 }, { "auxiliary_loss_clip": 0.01430781, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.25838065, "balance_loss_mlp": 1.01409972, "epoch": 0.5679843679543063, "flos": 23779637902080.0, "grad_norm": 1.7287073250120715, "language_loss": 0.74734282, "learning_rate": 1.6583730158942276e-06, "loss": 0.77199399, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20227051, "step": 9447, "time_per_iteration": 2.8806142807006836 }, { "auxiliary_loss_clip": 0.01431203, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.25962043, "balance_loss_mlp": 1.01435745, "epoch": 0.5680444912069743, "flos": 25602511409280.0, "grad_norm": 2.099535486194818, "language_loss": 0.75725633, "learning_rate": 1.657989284462725e-06, "loss": 0.78192043, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20849609, "step": 9448, "time_per_iteration": 2.8827052116394043 }, { "auxiliary_loss_clip": 0.01442634, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.26969826, "balance_loss_mlp": 1.01236558, "epoch": 0.5681046144596422, "flos": 23706241557120.0, "grad_norm": 2.2618494194320937, "language_loss": 0.77563429, "learning_rate": 1.6576055660003038e-06, "loss": 0.80037498, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.19067383, "step": 9449, "time_per_iteration": 2.916572093963623 }, { "auxiliary_loss_clip": 0.01423397, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.25406039, "balance_loss_mlp": 1.0156157, "epoch": 0.5681647377123102, "flos": 28012238962560.0, "grad_norm": 1.563858319919621, "language_loss": 0.75545263, "learning_rate": 1.6572218605215128e-06, "loss": 0.7800464, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20361328, "step": 9450, "time_per_iteration": 4.3708531856536865 }, { "auxiliary_loss_clip": 0.01421643, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.25242448, "balance_loss_mlp": 1.01535606, "epoch": 0.5682248609649782, "flos": 22758106631040.0, "grad_norm": 2.131644361973487, "language_loss": 0.68131918, "learning_rate": 1.6568381680409038e-06, "loss": 0.70589614, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20690918, "step": 9451, "time_per_iteration": 2.8501923084259033 }, { "auxiliary_loss_clip": 0.01440515, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.26388502, "balance_loss_mlp": 1.01334488, "epoch": 0.5682849842176462, "flos": 21298640509440.0, "grad_norm": 1.9320860383102612, "language_loss": 0.73342329, "learning_rate": 1.656454488573026e-06, "loss": 0.75817662, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 1.76660156, "router_z_loss_mlp": 0.21472168, "step": 9452, "time_per_iteration": 2.8510501384735107 }, { "auxiliary_loss_clip": 0.01408227, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.24287081, "balance_loss_mlp": 1.01276731, "epoch": 0.5683451074703142, "flos": 21151440616320.0, "grad_norm": 1.5957458862831349, "language_loss": 0.71441233, "learning_rate": 1.656070822132428e-06, "loss": 0.7388227, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20031738, "step": 9453, "time_per_iteration": 2.8485419750213623 }, { "auxiliary_loss_clip": 0.01418598, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.25103998, "balance_loss_mlp": 1.01559126, "epoch": 0.5684052307229821, "flos": 22354223132160.0, "grad_norm": 1.8277144652504675, "language_loss": 0.70571893, "learning_rate": 1.6556871687336592e-06, "loss": 0.73026192, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20117188, "step": 9454, "time_per_iteration": 2.8821299076080322 }, { "auxiliary_loss_clip": 0.01410977, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.24604082, "balance_loss_mlp": 1.01446557, "epoch": 0.5684653539756501, "flos": 21808614360960.0, "grad_norm": 2.4453218506474834, "language_loss": 0.61521524, "learning_rate": 1.6553035283912671e-06, "loss": 0.63965428, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18469238, "step": 9455, "time_per_iteration": 2.849801778793335 }, { "auxiliary_loss_clip": 0.01438011, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.26573896, "balance_loss_mlp": 1.01300144, "epoch": 0.568525477228318, "flos": 23009134636800.0, "grad_norm": 2.071200064679158, "language_loss": 0.7463913, "learning_rate": 1.6549199011198e-06, "loss": 0.77110493, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20336914, "step": 9456, "time_per_iteration": 2.8493847846984863 }, { "auxiliary_loss_clip": 0.01426719, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.2593894, "balance_loss_mlp": 1.01473069, "epoch": 0.568585600480986, "flos": 21401111278080.0, "grad_norm": 3.0590334984810243, "language_loss": 0.77667254, "learning_rate": 1.6545362869338048e-06, "loss": 0.80128384, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19689941, "step": 9457, "time_per_iteration": 2.8741726875305176 }, { "auxiliary_loss_clip": 0.01427272, "auxiliary_loss_mlp": 0.01038695, "balance_loss_clip": 1.25744987, "balance_loss_mlp": 1.0172255, "epoch": 0.568645723733654, "flos": 30019304626560.0, "grad_norm": 1.9225646972249055, "language_loss": 0.6721437, "learning_rate": 1.6541526858478285e-06, "loss": 0.69680333, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.21459961, "step": 9458, "time_per_iteration": 2.912330150604248 }, { "auxiliary_loss_clip": 0.01436731, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.2642827, "balance_loss_mlp": 1.01258218, "epoch": 0.568705846986322, "flos": 20422408849920.0, "grad_norm": 2.0025162586942704, "language_loss": 0.6886763, "learning_rate": 1.6537690978764167e-06, "loss": 0.71337354, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.20397949, "step": 9459, "time_per_iteration": 2.8779401779174805 }, { "auxiliary_loss_clip": 0.0143537, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.26312733, "balance_loss_mlp": 1.02014112, "epoch": 0.5687659702389899, "flos": 17465579447040.0, "grad_norm": 2.6105928192726537, "language_loss": 0.77320999, "learning_rate": 1.6533855230341155e-06, "loss": 0.79796302, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.19787598, "step": 9460, "time_per_iteration": 2.920724630355835 }, { "auxiliary_loss_clip": 0.01426022, "auxiliary_loss_mlp": 0.01038777, "balance_loss_clip": 1.25464058, "balance_loss_mlp": 1.01839221, "epoch": 0.5688260934916579, "flos": 25415785543680.0, "grad_norm": 1.6081728217505364, "language_loss": 0.72451061, "learning_rate": 1.65300196133547e-06, "loss": 0.74915862, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.20361328, "step": 9461, "time_per_iteration": 2.906696081161499 }, { "auxiliary_loss_clip": 0.01416942, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.24965274, "balance_loss_mlp": 1.01372766, "epoch": 0.5688862167443258, "flos": 21615825692160.0, "grad_norm": 1.915012585529728, "language_loss": 0.7332058, "learning_rate": 1.6526184127950249e-06, "loss": 0.757716, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20349121, "step": 9462, "time_per_iteration": 4.274590730667114 }, { "auxiliary_loss_clip": 0.01413806, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.25033355, "balance_loss_mlp": 1.01812232, "epoch": 0.5689463399969938, "flos": 22429157800320.0, "grad_norm": 2.0733788858133657, "language_loss": 0.73934746, "learning_rate": 1.6522348774273246e-06, "loss": 0.76385283, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18615723, "step": 9463, "time_per_iteration": 2.864074468612671 }, { "auxiliary_loss_clip": 0.01421525, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.25273442, "balance_loss_mlp": 1.01310003, "epoch": 0.5690064632496618, "flos": 18306085697280.0, "grad_norm": 1.8412639573453236, "language_loss": 0.75299937, "learning_rate": 1.6518513552469123e-06, "loss": 0.77754343, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19799805, "step": 9464, "time_per_iteration": 4.274746417999268 }, { "auxiliary_loss_clip": 0.01417068, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.2481761, "balance_loss_mlp": 1.01550663, "epoch": 0.5690665865023298, "flos": 21589058753280.0, "grad_norm": 1.6836548974331096, "language_loss": 0.8511641, "learning_rate": 1.6514678462683312e-06, "loss": 0.87569809, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20825195, "step": 9465, "time_per_iteration": 4.217571973800659 }, { "auxiliary_loss_clip": 0.01409459, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.2449441, "balance_loss_mlp": 1.01375377, "epoch": 0.5691267097549978, "flos": 24431020312320.0, "grad_norm": 1.7250420363162762, "language_loss": 0.72971869, "learning_rate": 1.651084350506125e-06, "loss": 0.75414044, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18945312, "step": 9466, "time_per_iteration": 2.8731305599212646 }, { "auxiliary_loss_clip": 0.01220271, "auxiliary_loss_mlp": 0.01019828, "balance_loss_clip": 1.12631011, "balance_loss_mlp": 1.00237584, "epoch": 0.5691868330076657, "flos": 61692058391040.0, "grad_norm": 0.7152723500551653, "language_loss": 0.55504262, "learning_rate": 1.6507008679748343e-06, "loss": 0.5774436, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.17480469, "step": 9467, "time_per_iteration": 3.3957624435424805 }, { "auxiliary_loss_clip": 0.01420951, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.25322247, "balance_loss_mlp": 1.01183915, "epoch": 0.5692469562603337, "flos": 21335270814720.0, "grad_norm": 5.882338844244037, "language_loss": 0.6402272, "learning_rate": 1.6503173986890023e-06, "loss": 0.66476226, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20727539, "step": 9468, "time_per_iteration": 2.91056489944458 }, { "auxiliary_loss_clip": 0.0141677, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.25133514, "balance_loss_mlp": 1.01509714, "epoch": 0.5693070795130016, "flos": 23378378601600.0, "grad_norm": 1.8572052199355693, "language_loss": 0.79472923, "learning_rate": 1.64993394266317e-06, "loss": 0.81925076, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20275879, "step": 9469, "time_per_iteration": 2.900360107421875 }, { "auxiliary_loss_clip": 0.01427132, "auxiliary_loss_mlp": 0.01041771, "balance_loss_clip": 1.25538158, "balance_loss_mlp": 1.02090907, "epoch": 0.5693672027656697, "flos": 18706394856960.0, "grad_norm": 4.075816497729076, "language_loss": 0.70578182, "learning_rate": 1.6495504999118769e-06, "loss": 0.73047084, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.20861816, "step": 9470, "time_per_iteration": 2.795785427093506 }, { "auxiliary_loss_clip": 0.01416448, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.24881709, "balance_loss_mlp": 1.01682854, "epoch": 0.5694273260183376, "flos": 20458948665600.0, "grad_norm": 1.6512052386773992, "language_loss": 0.75051957, "learning_rate": 1.6491670704496644e-06, "loss": 0.77505308, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20068359, "step": 9471, "time_per_iteration": 2.8562755584716797 }, { "auxiliary_loss_clip": 0.01411371, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.24724829, "balance_loss_mlp": 1.01234412, "epoch": 0.5694874492710056, "flos": 17612191157760.0, "grad_norm": 1.6915788343008675, "language_loss": 0.59218585, "learning_rate": 1.6487836542910716e-06, "loss": 0.61661494, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19189453, "step": 9472, "time_per_iteration": 2.8294436931610107 }, { "auxiliary_loss_clip": 0.01403346, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.24108601, "balance_loss_mlp": 1.01279342, "epoch": 0.5695475725236735, "flos": 13378730446080.0, "grad_norm": 2.1116510186060475, "language_loss": 0.74635136, "learning_rate": 1.648400251450638e-06, "loss": 0.77070928, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19641113, "step": 9473, "time_per_iteration": 2.8105525970458984 }, { "auxiliary_loss_clip": 0.01220162, "auxiliary_loss_mlp": 0.010207, "balance_loss_clip": 1.12506247, "balance_loss_mlp": 1.00591838, "epoch": 0.5696076957763415, "flos": 68206488658560.0, "grad_norm": 0.6531212367226398, "language_loss": 0.57648313, "learning_rate": 1.6480168619429023e-06, "loss": 0.59889174, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.14746094, "step": 9474, "time_per_iteration": 3.410151720046997 }, { "auxiliary_loss_clip": 0.01411816, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.24826908, "balance_loss_mlp": 1.01615286, "epoch": 0.5696678190290094, "flos": 33851234568960.0, "grad_norm": 6.065464204212192, "language_loss": 0.54140127, "learning_rate": 1.6476334857824017e-06, "loss": 0.56588674, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20581055, "step": 9475, "time_per_iteration": 2.9617111682891846 }, { "auxiliary_loss_clip": 0.0142484, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.25766659, "balance_loss_mlp": 1.01382637, "epoch": 0.5697279422816774, "flos": 26367042360960.0, "grad_norm": 2.0108881186602585, "language_loss": 0.80096334, "learning_rate": 1.647250122983675e-06, "loss": 0.82555318, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20300293, "step": 9476, "time_per_iteration": 2.943243980407715 }, { "auxiliary_loss_clip": 0.01439932, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.26993012, "balance_loss_mlp": 1.01804614, "epoch": 0.5697880655343454, "flos": 22940986688640.0, "grad_norm": 2.079497958060551, "language_loss": 0.6745497, "learning_rate": 1.6468667735612592e-06, "loss": 0.69933695, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20751953, "step": 9477, "time_per_iteration": 2.8671066761016846 }, { "auxiliary_loss_clip": 0.01428981, "auxiliary_loss_mlp": 0.01036232, "balance_loss_clip": 1.26000118, "balance_loss_mlp": 1.01484609, "epoch": 0.5698481887870134, "flos": 26772735651840.0, "grad_norm": 2.214859598873471, "language_loss": 0.71487045, "learning_rate": 1.6464834375296906e-06, "loss": 0.73952258, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21374512, "step": 9478, "time_per_iteration": 2.8818464279174805 }, { "auxiliary_loss_clip": 0.01401503, "auxiliary_loss_mlp": 0.01035146, "balance_loss_clip": 1.24136662, "balance_loss_mlp": 1.0143559, "epoch": 0.5699083120396814, "flos": 15750741818880.0, "grad_norm": 1.5787095941094638, "language_loss": 0.69525969, "learning_rate": 1.6461001149035055e-06, "loss": 0.71962619, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.2076416, "step": 9479, "time_per_iteration": 2.8226168155670166 }, { "auxiliary_loss_clip": 0.01401501, "auxiliary_loss_mlp": 0.01036933, "balance_loss_clip": 1.24023104, "balance_loss_mlp": 1.01740623, "epoch": 0.5699684352923493, "flos": 19546855862400.0, "grad_norm": 1.4311482583878463, "language_loss": 0.72036511, "learning_rate": 1.6457168056972392e-06, "loss": 0.74474943, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1953125, "step": 9480, "time_per_iteration": 2.8688602447509766 }, { "auxiliary_loss_clip": 0.0141878, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.25404835, "balance_loss_mlp": 1.01509929, "epoch": 0.5700285585450173, "flos": 16262027769600.0, "grad_norm": 2.1217300978077724, "language_loss": 0.72948015, "learning_rate": 1.6453335099254276e-06, "loss": 0.75402641, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.2076416, "step": 9481, "time_per_iteration": 2.8214454650878906 }, { "auxiliary_loss_clip": 0.01417065, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.25188279, "balance_loss_mlp": 1.01507413, "epoch": 0.5700886817976852, "flos": 19874492593920.0, "grad_norm": 1.795180129698411, "language_loss": 0.79204834, "learning_rate": 1.6449502276026041e-06, "loss": 0.81656468, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19494629, "step": 9482, "time_per_iteration": 2.8182265758514404 }, { "auxiliary_loss_clip": 0.01406186, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.241925, "balance_loss_mlp": 1.01335919, "epoch": 0.5701488050503533, "flos": 23852174595840.0, "grad_norm": 1.51885201109169, "language_loss": 0.78351671, "learning_rate": 1.6445669587433043e-06, "loss": 0.80790734, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19519043, "step": 9483, "time_per_iteration": 2.8460726737976074 }, { "auxiliary_loss_clip": 0.01417536, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.25093007, "balance_loss_mlp": 1.01683784, "epoch": 0.5702089283030212, "flos": 23670199434240.0, "grad_norm": 1.578253470239627, "language_loss": 0.81667703, "learning_rate": 1.6441837033620612e-06, "loss": 0.8412205, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19970703, "step": 9484, "time_per_iteration": 2.851006031036377 }, { "auxiliary_loss_clip": 0.01413152, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.24600029, "balance_loss_mlp": 1.01239681, "epoch": 0.5702690515556892, "flos": 27902574270720.0, "grad_norm": 2.9261440213714613, "language_loss": 0.60751534, "learning_rate": 1.6438004614734073e-06, "loss": 0.63198149, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.21069336, "step": 9485, "time_per_iteration": 4.371880054473877 }, { "auxiliary_loss_clip": 0.0141635, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.24900556, "balance_loss_mlp": 1.01622391, "epoch": 0.5703291748083571, "flos": 24034421226240.0, "grad_norm": 2.556852608642169, "language_loss": 0.66351944, "learning_rate": 1.6434172330918757e-06, "loss": 0.6880427, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19750977, "step": 9486, "time_per_iteration": 2.863821268081665 }, { "auxiliary_loss_clip": 0.01215331, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.12159324, "balance_loss_mlp": 1.01077127, "epoch": 0.5703892980610251, "flos": 57056116769280.0, "grad_norm": 0.6701509433190653, "language_loss": 0.48041552, "learning_rate": 1.6430340182319978e-06, "loss": 0.50288254, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20605469, "step": 9487, "time_per_iteration": 3.4431607723236084 }, { "auxiliary_loss_clip": 0.01419753, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.2529192, "balance_loss_mlp": 1.01707006, "epoch": 0.570449421313693, "flos": 24361243551360.0, "grad_norm": 1.4947224234991785, "language_loss": 0.87648553, "learning_rate": 1.6426508169083067e-06, "loss": 0.90105855, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20483398, "step": 9488, "time_per_iteration": 2.931861162185669 }, { "auxiliary_loss_clip": 0.01424109, "auxiliary_loss_mlp": 0.0103379, "balance_loss_clip": 1.25440454, "balance_loss_mlp": 1.01446581, "epoch": 0.570509544566361, "flos": 24839745004800.0, "grad_norm": 1.4250455256532184, "language_loss": 0.79058492, "learning_rate": 1.6422676291353314e-06, "loss": 0.81516391, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.1932373, "step": 9489, "time_per_iteration": 2.9101195335388184 }, { "auxiliary_loss_clip": 0.01412705, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.24739802, "balance_loss_mlp": 1.02034616, "epoch": 0.570569667819029, "flos": 21407174081280.0, "grad_norm": 1.8420314337451194, "language_loss": 0.70890808, "learning_rate": 1.641884454927604e-06, "loss": 0.73343039, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19165039, "step": 9490, "time_per_iteration": 2.856544256210327 }, { "auxiliary_loss_clip": 0.01423365, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.25746441, "balance_loss_mlp": 1.0190804, "epoch": 0.570629791071697, "flos": 23225839822080.0, "grad_norm": 2.12319997976811, "language_loss": 0.76598084, "learning_rate": 1.6415012942996548e-06, "loss": 0.79060239, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19714355, "step": 9491, "time_per_iteration": 2.8958823680877686 }, { "auxiliary_loss_clip": 0.0122125, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.12666225, "balance_loss_mlp": 1.02079809, "epoch": 0.570689914324365, "flos": 65314097130240.0, "grad_norm": 0.801386007240681, "language_loss": 0.5746851, "learning_rate": 1.641118147266011e-06, "loss": 0.59729058, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.18457031, "step": 9492, "time_per_iteration": 3.340087890625 }, { "auxiliary_loss_clip": 0.01414901, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.25011301, "balance_loss_mlp": 1.02149594, "epoch": 0.5707500375770329, "flos": 21151712085120.0, "grad_norm": 4.093888198481086, "language_loss": 0.72963572, "learning_rate": 1.6407350138412035e-06, "loss": 0.75419837, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19885254, "step": 9493, "time_per_iteration": 2.8598716259002686 }, { "auxiliary_loss_clip": 0.01439427, "auxiliary_loss_mlp": 0.01037577, "balance_loss_clip": 1.26835144, "balance_loss_mlp": 1.01750207, "epoch": 0.5708101608297009, "flos": 20822265561600.0, "grad_norm": 1.8106270963586066, "language_loss": 0.7845037, "learning_rate": 1.6403518940397606e-06, "loss": 0.80927372, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20068359, "step": 9494, "time_per_iteration": 2.8673572540283203 }, { "auxiliary_loss_clip": 0.01444272, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.27114928, "balance_loss_mlp": 1.01656008, "epoch": 0.5708702840823688, "flos": 25823424360960.0, "grad_norm": 2.2901655610866207, "language_loss": 0.8079139, "learning_rate": 1.6399687878762096e-06, "loss": 0.83273059, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20837402, "step": 9495, "time_per_iteration": 2.903404474258423 }, { "auxiliary_loss_clip": 0.01453598, "auxiliary_loss_mlp": 0.01043953, "balance_loss_clip": 1.27886939, "balance_loss_mlp": 1.02263844, "epoch": 0.5709304073350369, "flos": 23660426557440.0, "grad_norm": 5.645404682163034, "language_loss": 0.67103297, "learning_rate": 1.6395856953650784e-06, "loss": 0.6960085, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.21313477, "step": 9496, "time_per_iteration": 2.8979718685150146 }, { "auxiliary_loss_clip": 0.01432376, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.26089859, "balance_loss_mlp": 1.0161128, "epoch": 0.5709905305877048, "flos": 16116230465280.0, "grad_norm": 7.305600688108984, "language_loss": 0.70400262, "learning_rate": 1.6392026165208938e-06, "loss": 0.72868764, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20007324, "step": 9497, "time_per_iteration": 4.27473783493042 }, { "auxiliary_loss_clip": 0.01430534, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.26066828, "balance_loss_mlp": 1.013659, "epoch": 0.5710506538403728, "flos": 24760738304640.0, "grad_norm": 2.9983893503404135, "language_loss": 0.82050073, "learning_rate": 1.638819551358182e-06, "loss": 0.8451519, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.20922852, "step": 9498, "time_per_iteration": 2.8640806674957275 }, { "auxiliary_loss_clip": 0.0143141, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.26072454, "balance_loss_mlp": 1.0194087, "epoch": 0.5711107770930407, "flos": 21992580293760.0, "grad_norm": 1.974863980733816, "language_loss": 0.67181462, "learning_rate": 1.638436499891469e-06, "loss": 0.69653404, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.21118164, "step": 9499, "time_per_iteration": 4.237907648086548 }, { "auxiliary_loss_clip": 0.01429597, "auxiliary_loss_mlp": 0.01035068, "balance_loss_clip": 1.2609061, "balance_loss_mlp": 1.01511264, "epoch": 0.5711709003457087, "flos": 19583667146880.0, "grad_norm": 1.5018218256361155, "language_loss": 0.72986007, "learning_rate": 1.6380534621352805e-06, "loss": 0.75450671, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19958496, "step": 9500, "time_per_iteration": 4.216063022613525 }, { "auxiliary_loss_clip": 0.01437239, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.26543617, "balance_loss_mlp": 1.01084566, "epoch": 0.5712310235983766, "flos": 24253162427520.0, "grad_norm": 2.082957913938169, "language_loss": 0.77475381, "learning_rate": 1.6376704381041407e-06, "loss": 0.79943955, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.20483398, "step": 9501, "time_per_iteration": 3.0606400966644287 }, { "auxiliary_loss_clip": 0.01434046, "auxiliary_loss_mlp": 0.01035333, "balance_loss_clip": 1.26444685, "balance_loss_mlp": 1.01549685, "epoch": 0.5712911468510447, "flos": 21005960025600.0, "grad_norm": 1.688759608228194, "language_loss": 0.7561754, "learning_rate": 1.6372874278125742e-06, "loss": 0.78086919, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.19824219, "step": 9502, "time_per_iteration": 2.8382177352905273 }, { "auxiliary_loss_clip": 0.01425399, "auxiliary_loss_mlp": 0.01036363, "balance_loss_clip": 1.25880051, "balance_loss_mlp": 1.01644349, "epoch": 0.5713512701037126, "flos": 18926402912640.0, "grad_norm": 2.514422205451015, "language_loss": 0.83032072, "learning_rate": 1.636904431275105e-06, "loss": 0.85493833, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19934082, "step": 9503, "time_per_iteration": 2.8908681869506836 }, { "auxiliary_loss_clip": 0.01426842, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.2604661, "balance_loss_mlp": 1.0155611, "epoch": 0.5714113933563806, "flos": 17420488364160.0, "grad_norm": 2.0589880389036623, "language_loss": 0.8651346, "learning_rate": 1.6365214485062553e-06, "loss": 0.88976091, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20227051, "step": 9504, "time_per_iteration": 2.79015851020813 }, { "auxiliary_loss_clip": 0.01418991, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.25415123, "balance_loss_mlp": 1.01292038, "epoch": 0.5714715166090486, "flos": 20202943731840.0, "grad_norm": 2.0082949324275376, "language_loss": 0.76005912, "learning_rate": 1.6361384795205496e-06, "loss": 0.78457493, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19665527, "step": 9505, "time_per_iteration": 2.811351776123047 }, { "auxiliary_loss_clip": 0.01420096, "auxiliary_loss_mlp": 0.01034878, "balance_loss_clip": 1.2541182, "balance_loss_mlp": 1.01526809, "epoch": 0.5715316398617165, "flos": 18560869021440.0, "grad_norm": 1.4514664605425311, "language_loss": 0.8220377, "learning_rate": 1.635755524332509e-06, "loss": 0.84658742, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19604492, "step": 9506, "time_per_iteration": 2.851440906524658 }, { "auxiliary_loss_clip": 0.01421123, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.25488639, "balance_loss_mlp": 1.0124948, "epoch": 0.5715917631143845, "flos": 18487246452480.0, "grad_norm": 1.9662333328430843, "language_loss": 0.78312016, "learning_rate": 1.6353725829566552e-06, "loss": 0.80765563, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19934082, "step": 9507, "time_per_iteration": 2.853846311569214 }, { "auxiliary_loss_clip": 0.0142781, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.25789738, "balance_loss_mlp": 1.01308298, "epoch": 0.5716518863670524, "flos": 24029489543040.0, "grad_norm": 1.550007792930395, "language_loss": 0.69687253, "learning_rate": 1.63498965540751e-06, "loss": 0.72149128, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.2097168, "step": 9508, "time_per_iteration": 2.8868308067321777 }, { "auxiliary_loss_clip": 0.01432215, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.26163042, "balance_loss_mlp": 1.01569724, "epoch": 0.5717120096197205, "flos": 17827855712640.0, "grad_norm": 2.0947844690105333, "language_loss": 0.8101151, "learning_rate": 1.634606741699593e-06, "loss": 0.83479744, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20324707, "step": 9509, "time_per_iteration": 2.829927682876587 }, { "auxiliary_loss_clip": 0.01415826, "auxiliary_loss_mlp": 0.01031332, "balance_loss_clip": 1.25167871, "balance_loss_mlp": 1.01122093, "epoch": 0.5717721328723884, "flos": 21874590558720.0, "grad_norm": 1.9158664632967035, "language_loss": 0.73731232, "learning_rate": 1.6342238418474255e-06, "loss": 0.76178396, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2010498, "step": 9510, "time_per_iteration": 2.8271336555480957 }, { "auxiliary_loss_clip": 0.01427037, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.2600832, "balance_loss_mlp": 1.01485658, "epoch": 0.5718322561250564, "flos": 28448952203520.0, "grad_norm": 1.53935491248813, "language_loss": 0.70458221, "learning_rate": 1.6338409558655264e-06, "loss": 0.72920179, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20056152, "step": 9511, "time_per_iteration": 2.879603147506714 }, { "auxiliary_loss_clip": 0.01425286, "auxiliary_loss_mlp": 0.01038873, "balance_loss_clip": 1.25647831, "balance_loss_mlp": 1.0179162, "epoch": 0.5718923793777243, "flos": 13559393508480.0, "grad_norm": 2.1578421711967697, "language_loss": 0.62093818, "learning_rate": 1.6334580837684152e-06, "loss": 0.64557981, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20959473, "step": 9512, "time_per_iteration": 2.830775499343872 }, { "auxiliary_loss_clip": 0.01415423, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.24961925, "balance_loss_mlp": 1.01605439, "epoch": 0.5719525026303923, "flos": 17831068093440.0, "grad_norm": 2.557844938257377, "language_loss": 0.76497895, "learning_rate": 1.6330752255706104e-06, "loss": 0.78948122, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1875, "step": 9513, "time_per_iteration": 2.8179476261138916 }, { "auxiliary_loss_clip": 0.01221817, "auxiliary_loss_mlp": 0.0103404, "balance_loss_clip": 1.12570024, "balance_loss_mlp": 1.01458478, "epoch": 0.5720126258830602, "flos": 61323927563520.0, "grad_norm": 0.8874807727193097, "language_loss": 0.66853786, "learning_rate": 1.6326923812866288e-06, "loss": 0.69109643, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.19433594, "step": 9514, "time_per_iteration": 3.3427693843841553 }, { "auxiliary_loss_clip": 0.0144128, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.27004552, "balance_loss_mlp": 1.01552749, "epoch": 0.5720727491357283, "flos": 23998650572160.0, "grad_norm": 2.0054396893750224, "language_loss": 0.82261151, "learning_rate": 1.63230955093099e-06, "loss": 0.84738755, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20825195, "step": 9515, "time_per_iteration": 2.8941762447357178 }, { "auxiliary_loss_clip": 0.01409042, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.24661458, "balance_loss_mlp": 1.01444435, "epoch": 0.5721328723883962, "flos": 23416185271680.0, "grad_norm": 1.5700018916518101, "language_loss": 0.860587, "learning_rate": 1.6319267345182092e-06, "loss": 0.88502097, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19909668, "step": 9516, "time_per_iteration": 2.8979694843292236 }, { "auxiliary_loss_clip": 0.01429154, "auxiliary_loss_mlp": 0.01036746, "balance_loss_clip": 1.26209617, "balance_loss_mlp": 1.01584864, "epoch": 0.5721929956410642, "flos": 18813616329600.0, "grad_norm": 1.7572092527946557, "language_loss": 0.88324738, "learning_rate": 1.6315439320628038e-06, "loss": 0.90790641, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.2088623, "step": 9517, "time_per_iteration": 2.843526840209961 }, { "auxiliary_loss_clip": 0.01420149, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.25444901, "balance_loss_mlp": 1.01514006, "epoch": 0.5722531188937322, "flos": 27207548611200.0, "grad_norm": 1.6674365453802922, "language_loss": 0.86046076, "learning_rate": 1.6311611435792893e-06, "loss": 0.88501561, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.2019043, "step": 9518, "time_per_iteration": 2.873119831085205 }, { "auxiliary_loss_clip": 0.01413574, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.25145113, "balance_loss_mlp": 1.01271892, "epoch": 0.5723132421464001, "flos": 15204182906880.0, "grad_norm": 1.9316760452950092, "language_loss": 0.79981303, "learning_rate": 1.6307783690821812e-06, "loss": 0.82427454, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19873047, "step": 9519, "time_per_iteration": 2.872514486312866 }, { "auxiliary_loss_clip": 0.01420218, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.25503504, "balance_loss_mlp": 1.01671731, "epoch": 0.5723733653990681, "flos": 27610798682880.0, "grad_norm": 3.6522574749398293, "language_loss": 0.83572119, "learning_rate": 1.6303956085859944e-06, "loss": 0.86028606, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1953125, "step": 9520, "time_per_iteration": 4.39677095413208 }, { "auxiliary_loss_clip": 0.01440141, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.26951396, "balance_loss_mlp": 1.01635456, "epoch": 0.572433488651736, "flos": 18231874945920.0, "grad_norm": 2.1263273079446052, "language_loss": 0.73889863, "learning_rate": 1.630012862105243e-06, "loss": 0.76367003, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.2064209, "step": 9521, "time_per_iteration": 2.8062448501586914 }, { "auxiliary_loss_clip": 0.01425293, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.25852346, "balance_loss_mlp": 1.01629555, "epoch": 0.5724936119044041, "flos": 31261703604480.0, "grad_norm": 1.5178492224270694, "language_loss": 0.78953665, "learning_rate": 1.6296301296544415e-06, "loss": 0.81415337, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.2010498, "step": 9522, "time_per_iteration": 2.9147608280181885 }, { "auxiliary_loss_clip": 0.01416347, "auxiliary_loss_mlp": 0.01033863, "balance_loss_clip": 1.25425398, "balance_loss_mlp": 1.01474166, "epoch": 0.572553735157072, "flos": 19209943946880.0, "grad_norm": 1.7068541284664465, "language_loss": 0.724886, "learning_rate": 1.629247411248102e-06, "loss": 0.7493881, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19116211, "step": 9523, "time_per_iteration": 2.8089489936828613 }, { "auxiliary_loss_clip": 0.01407206, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.24471247, "balance_loss_mlp": 1.01085293, "epoch": 0.57261385840974, "flos": 21224746471680.0, "grad_norm": 1.6391920514711746, "language_loss": 0.70805359, "learning_rate": 1.628864706900738e-06, "loss": 0.73242092, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18676758, "step": 9524, "time_per_iteration": 2.8604204654693604 }, { "auxiliary_loss_clip": 0.01413978, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.24995553, "balance_loss_mlp": 1.01296473, "epoch": 0.5726739816624079, "flos": 33997574810880.0, "grad_norm": 1.294532231739259, "language_loss": 0.66388881, "learning_rate": 1.6284820166268615e-06, "loss": 0.68835753, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19921875, "step": 9525, "time_per_iteration": 2.9671006202697754 }, { "auxiliary_loss_clip": 0.01413538, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.24994135, "balance_loss_mlp": 1.01373053, "epoch": 0.5727341049150759, "flos": 24285630211200.0, "grad_norm": 1.6065963345647825, "language_loss": 0.73884594, "learning_rate": 1.628099340440984e-06, "loss": 0.76331282, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1940918, "step": 9526, "time_per_iteration": 2.8433845043182373 }, { "auxiliary_loss_clip": 0.01407944, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.2460041, "balance_loss_mlp": 1.01184368, "epoch": 0.5727942281677438, "flos": 28411733715840.0, "grad_norm": 1.5754941119492267, "language_loss": 0.81187916, "learning_rate": 1.6277166783576176e-06, "loss": 0.83627284, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19555664, "step": 9527, "time_per_iteration": 2.942884683609009 }, { "auxiliary_loss_clip": 0.01409509, "auxiliary_loss_mlp": 0.01039259, "balance_loss_clip": 1.24800563, "balance_loss_mlp": 1.01914871, "epoch": 0.5728543514204119, "flos": 19546222435200.0, "grad_norm": 1.5267328106237117, "language_loss": 0.72874755, "learning_rate": 1.6273340303912713e-06, "loss": 0.75323522, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.2010498, "step": 9528, "time_per_iteration": 2.8228275775909424 }, { "auxiliary_loss_clip": 0.01415962, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.25261092, "balance_loss_mlp": 1.01212454, "epoch": 0.5729144746730798, "flos": 21516657793920.0, "grad_norm": 1.9587400244919435, "language_loss": 0.86786616, "learning_rate": 1.6269513965564557e-06, "loss": 0.89234215, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19506836, "step": 9529, "time_per_iteration": 2.8684070110321045 }, { "auxiliary_loss_clip": 0.01221773, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 1.12775195, "balance_loss_mlp": 1.01318705, "epoch": 0.5729745979257478, "flos": 58709575186560.0, "grad_norm": 0.7602970773718527, "language_loss": 0.56152934, "learning_rate": 1.6265687768676813e-06, "loss": 0.5840506, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.171875, "step": 9530, "time_per_iteration": 3.305128574371338 }, { "auxiliary_loss_clip": 0.01444046, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.2749933, "balance_loss_mlp": 1.01930249, "epoch": 0.5730347211784158, "flos": 18561411959040.0, "grad_norm": 1.704356429975184, "language_loss": 0.6754207, "learning_rate": 1.6261861713394553e-06, "loss": 0.70024061, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.18652344, "step": 9531, "time_per_iteration": 2.9308788776397705 }, { "auxiliary_loss_clip": 0.01421951, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.25667703, "balance_loss_mlp": 1.01395869, "epoch": 0.5730948444310837, "flos": 38044807349760.0, "grad_norm": 1.9407483552989404, "language_loss": 0.76065183, "learning_rate": 1.6258035799862876e-06, "loss": 0.78520441, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19348145, "step": 9532, "time_per_iteration": 2.9904747009277344 }, { "auxiliary_loss_clip": 0.01413161, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.24812078, "balance_loss_mlp": 1.01602447, "epoch": 0.5731549676837517, "flos": 25237610945280.0, "grad_norm": 1.3334348394814057, "language_loss": 0.79526341, "learning_rate": 1.625421002822686e-06, "loss": 0.81974614, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19091797, "step": 9533, "time_per_iteration": 4.348853826522827 }, { "auxiliary_loss_clip": 0.01413162, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.25103736, "balance_loss_mlp": 1.01196384, "epoch": 0.5732150909364196, "flos": 23378785804800.0, "grad_norm": 1.6736005906560716, "language_loss": 0.85862541, "learning_rate": 1.6250384398631574e-06, "loss": 0.88307071, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19396973, "step": 9534, "time_per_iteration": 4.376283645629883 }, { "auxiliary_loss_clip": 0.01415651, "auxiliary_loss_mlp": 0.01037219, "balance_loss_clip": 1.25158405, "balance_loss_mlp": 1.01669121, "epoch": 0.5732752141890877, "flos": 23090629800960.0, "grad_norm": 1.8273271761365368, "language_loss": 0.76057446, "learning_rate": 1.6246558911222085e-06, "loss": 0.7851032, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20507812, "step": 9535, "time_per_iteration": 4.2519567012786865 }, { "auxiliary_loss_clip": 0.01434717, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.26540184, "balance_loss_mlp": 1.01488733, "epoch": 0.5733353374417556, "flos": 24362827119360.0, "grad_norm": 2.1228061448288007, "language_loss": 0.71772772, "learning_rate": 1.624273356614346e-06, "loss": 0.74243271, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20898438, "step": 9536, "time_per_iteration": 2.857120990753174 }, { "auxiliary_loss_clip": 0.01417228, "auxiliary_loss_mlp": 0.01037558, "balance_loss_clip": 1.25436497, "balance_loss_mlp": 1.01788807, "epoch": 0.5733954606944236, "flos": 27210308544000.0, "grad_norm": 1.8834504599634612, "language_loss": 0.70550221, "learning_rate": 1.6238908363540755e-06, "loss": 0.73005009, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19677734, "step": 9537, "time_per_iteration": 2.890341281890869 }, { "auxiliary_loss_clip": 0.01417808, "auxiliary_loss_mlp": 0.01038322, "balance_loss_clip": 1.25283015, "balance_loss_mlp": 1.01891494, "epoch": 0.5734555839470915, "flos": 28776317466240.0, "grad_norm": 1.8907529337853262, "language_loss": 0.63458312, "learning_rate": 1.623508330355902e-06, "loss": 0.6591444, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.1940918, "step": 9538, "time_per_iteration": 2.9428534507751465 }, { "auxiliary_loss_clip": 0.01419994, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.25418139, "balance_loss_mlp": 1.01430488, "epoch": 0.5735157071997595, "flos": 22977255035520.0, "grad_norm": 1.9298084288917203, "language_loss": 0.83407229, "learning_rate": 1.6231258386343306e-06, "loss": 0.85862654, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21130371, "step": 9539, "time_per_iteration": 2.8669750690460205 }, { "auxiliary_loss_clip": 0.01427652, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.25866675, "balance_loss_mlp": 1.015818, "epoch": 0.5735758304524274, "flos": 18998758627200.0, "grad_norm": 2.0609671103577902, "language_loss": 0.73481762, "learning_rate": 1.6227433612038647e-06, "loss": 0.75945508, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20275879, "step": 9540, "time_per_iteration": 2.814688205718994 }, { "auxiliary_loss_clip": 0.01412521, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.2477355, "balance_loss_mlp": 1.01677299, "epoch": 0.5736359537050955, "flos": 28408430845440.0, "grad_norm": 2.8203460593542573, "language_loss": 0.81252217, "learning_rate": 1.6223608980790089e-06, "loss": 0.83700794, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19287109, "step": 9541, "time_per_iteration": 2.9061129093170166 }, { "auxiliary_loss_clip": 0.01434804, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.26551282, "balance_loss_mlp": 1.01932335, "epoch": 0.5736960769577634, "flos": 15634199917440.0, "grad_norm": 4.225179032884918, "language_loss": 0.65386033, "learning_rate": 1.6219784492742654e-06, "loss": 0.67859852, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19689941, "step": 9542, "time_per_iteration": 2.8120384216308594 }, { "auxiliary_loss_clip": 0.01420831, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.25589061, "balance_loss_mlp": 1.02180946, "epoch": 0.5737562002104314, "flos": 18012952765440.0, "grad_norm": 2.2170858832765257, "language_loss": 0.83612823, "learning_rate": 1.6215960148041365e-06, "loss": 0.8607415, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18688965, "step": 9543, "time_per_iteration": 2.8360350131988525 }, { "auxiliary_loss_clip": 0.01423512, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.2552731, "balance_loss_mlp": 1.02257991, "epoch": 0.5738163234630994, "flos": 20706357087360.0, "grad_norm": 1.751282260302756, "language_loss": 0.74906731, "learning_rate": 1.6212135946831257e-06, "loss": 0.77373028, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20202637, "step": 9544, "time_per_iteration": 2.9210257530212402 }, { "auxiliary_loss_clip": 0.01425973, "auxiliary_loss_mlp": 0.01037364, "balance_loss_clip": 1.25774169, "balance_loss_mlp": 1.01758695, "epoch": 0.5738764467157673, "flos": 23159818379520.0, "grad_norm": 1.7378467268109927, "language_loss": 0.76671624, "learning_rate": 1.620831188925733e-06, "loss": 0.79134965, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19763184, "step": 9545, "time_per_iteration": 2.9093708992004395 }, { "auxiliary_loss_clip": 0.01420014, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.25349736, "balance_loss_mlp": 1.01674461, "epoch": 0.5739365699684353, "flos": 29503810909440.0, "grad_norm": 4.103101787022907, "language_loss": 0.57093716, "learning_rate": 1.620448797546459e-06, "loss": 0.59550881, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20422363, "step": 9546, "time_per_iteration": 2.9276201725006104 }, { "auxiliary_loss_clip": 0.01425277, "auxiliary_loss_mlp": 0.01036239, "balance_loss_clip": 1.25730324, "balance_loss_mlp": 1.0169034, "epoch": 0.5739966932211032, "flos": 14035225518720.0, "grad_norm": 2.4857200736378813, "language_loss": 0.7805407, "learning_rate": 1.6200664205598055e-06, "loss": 0.80515587, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.19311523, "step": 9547, "time_per_iteration": 2.8136696815490723 }, { "auxiliary_loss_clip": 0.01425324, "auxiliary_loss_mlp": 0.01039681, "balance_loss_clip": 1.25819063, "balance_loss_mlp": 1.01899791, "epoch": 0.5740568164737713, "flos": 19071250076160.0, "grad_norm": 2.975573972296334, "language_loss": 0.75599027, "learning_rate": 1.6196840579802704e-06, "loss": 0.78064024, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20678711, "step": 9548, "time_per_iteration": 2.826310396194458 }, { "auxiliary_loss_clip": 0.01417129, "auxiliary_loss_mlp": 0.01042432, "balance_loss_clip": 1.25215089, "balance_loss_mlp": 1.02210724, "epoch": 0.5741169397264392, "flos": 22138106129280.0, "grad_norm": 2.9577904376407, "language_loss": 0.70302999, "learning_rate": 1.619301709822355e-06, "loss": 0.72762549, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20336914, "step": 9549, "time_per_iteration": 2.8783011436462402 }, { "auxiliary_loss_clip": 0.01426039, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.26112318, "balance_loss_mlp": 1.01919675, "epoch": 0.5741770629791072, "flos": 24947735639040.0, "grad_norm": 1.5352606983842598, "language_loss": 0.79949892, "learning_rate": 1.6189193761005564e-06, "loss": 0.82414651, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19506836, "step": 9550, "time_per_iteration": 2.8792431354522705 }, { "auxiliary_loss_clip": 0.01429951, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.26307523, "balance_loss_mlp": 1.01697123, "epoch": 0.5742371862317751, "flos": 18809272828800.0, "grad_norm": 1.9316810859985574, "language_loss": 0.68306887, "learning_rate": 1.6185370568293727e-06, "loss": 0.70774519, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20703125, "step": 9551, "time_per_iteration": 2.847304582595825 }, { "auxiliary_loss_clip": 0.01440759, "auxiliary_loss_mlp": 0.01043553, "balance_loss_clip": 1.26909328, "balance_loss_mlp": 1.02242959, "epoch": 0.5742973094844431, "flos": 24470953488000.0, "grad_norm": 2.0949910831289467, "language_loss": 0.72689509, "learning_rate": 1.6181547520233031e-06, "loss": 0.75173819, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.21118164, "step": 9552, "time_per_iteration": 2.8693933486938477 }, { "auxiliary_loss_clip": 0.01425042, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.26100612, "balance_loss_mlp": 1.01276207, "epoch": 0.574357432737111, "flos": 21662726567040.0, "grad_norm": 2.651156061226727, "language_loss": 0.80676121, "learning_rate": 1.617772461696843e-06, "loss": 0.83133221, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19287109, "step": 9553, "time_per_iteration": 2.8567821979522705 }, { "auxiliary_loss_clip": 0.01437375, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.26685858, "balance_loss_mlp": 1.01481938, "epoch": 0.5744175559897791, "flos": 16553350909440.0, "grad_norm": 2.258008785980967, "language_loss": 0.84068257, "learning_rate": 1.6173901858644895e-06, "loss": 0.86540049, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19592285, "step": 9554, "time_per_iteration": 2.866321325302124 }, { "auxiliary_loss_clip": 0.01437337, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.26746595, "balance_loss_mlp": 1.02083039, "epoch": 0.574477679242447, "flos": 24218432403840.0, "grad_norm": 1.4331481925124931, "language_loss": 0.71815902, "learning_rate": 1.6170079245407385e-06, "loss": 0.74294782, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20715332, "step": 9555, "time_per_iteration": 4.306546688079834 }, { "auxiliary_loss_clip": 0.01432725, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.26550865, "balance_loss_mlp": 1.01587749, "epoch": 0.574537802495115, "flos": 14911230954240.0, "grad_norm": 2.166929401136669, "language_loss": 0.73883015, "learning_rate": 1.6166256777400853e-06, "loss": 0.76351905, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20300293, "step": 9556, "time_per_iteration": 2.8328654766082764 }, { "auxiliary_loss_clip": 0.01412477, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.24806213, "balance_loss_mlp": 1.01446342, "epoch": 0.5745979257477829, "flos": 24945382909440.0, "grad_norm": 2.0390443354187187, "language_loss": 0.74979794, "learning_rate": 1.6162434454770248e-06, "loss": 0.77427799, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.21069336, "step": 9557, "time_per_iteration": 2.8800582885742188 }, { "auxiliary_loss_clip": 0.0142505, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.25853133, "balance_loss_mlp": 1.0172255, "epoch": 0.5746580490004509, "flos": 17243987823360.0, "grad_norm": 2.02212664695032, "language_loss": 0.68894887, "learning_rate": 1.6158612277660514e-06, "loss": 0.71357083, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19934082, "step": 9558, "time_per_iteration": 2.827829360961914 }, { "auxiliary_loss_clip": 0.01439619, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.26677299, "balance_loss_mlp": 1.01733422, "epoch": 0.5747181722531189, "flos": 13195578919680.0, "grad_norm": 2.496132899961028, "language_loss": 0.71952289, "learning_rate": 1.615479024621659e-06, "loss": 0.74431497, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.22241211, "step": 9559, "time_per_iteration": 2.8413195610046387 }, { "auxiliary_loss_clip": 0.01420839, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.25608587, "balance_loss_mlp": 1.01510525, "epoch": 0.5747782955057869, "flos": 22972459086720.0, "grad_norm": 1.7016791454251394, "language_loss": 0.79914033, "learning_rate": 1.6150968360583398e-06, "loss": 0.82369339, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19372559, "step": 9560, "time_per_iteration": 2.8604722023010254 }, { "auxiliary_loss_clip": 0.01430299, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.26101768, "balance_loss_mlp": 1.01694322, "epoch": 0.5748384187584549, "flos": 23413425338880.0, "grad_norm": 1.9916162822667793, "language_loss": 0.65101969, "learning_rate": 1.614714662090588e-06, "loss": 0.67569625, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20410156, "step": 9561, "time_per_iteration": 2.875861883163452 }, { "auxiliary_loss_clip": 0.01452665, "auxiliary_loss_mlp": 0.01039145, "balance_loss_clip": 1.27849436, "balance_loss_mlp": 1.01749706, "epoch": 0.5748985420111228, "flos": 17794528277760.0, "grad_norm": 10.516035198460475, "language_loss": 0.72344041, "learning_rate": 1.6143325027328945e-06, "loss": 0.74835849, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21643066, "step": 9562, "time_per_iteration": 2.8049819469451904 }, { "auxiliary_loss_clip": 0.01432933, "auxiliary_loss_mlp": 0.01036952, "balance_loss_clip": 1.26508522, "balance_loss_mlp": 1.01792645, "epoch": 0.5749586652637908, "flos": 19876076161920.0, "grad_norm": 1.4945400050753446, "language_loss": 0.84474027, "learning_rate": 1.613950357999751e-06, "loss": 0.86943913, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19030762, "step": 9563, "time_per_iteration": 2.8334243297576904 }, { "auxiliary_loss_clip": 0.01439105, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.26676774, "balance_loss_mlp": 1.0175072, "epoch": 0.5750187885164587, "flos": 21296921207040.0, "grad_norm": 1.9393347653174426, "language_loss": 0.58768857, "learning_rate": 1.6135682279056488e-06, "loss": 0.61246514, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21044922, "step": 9564, "time_per_iteration": 2.842576026916504 }, { "auxiliary_loss_clip": 0.01397458, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.23724508, "balance_loss_mlp": 1.01498127, "epoch": 0.5750789117691267, "flos": 18813299616000.0, "grad_norm": 2.271382244061916, "language_loss": 0.76563203, "learning_rate": 1.613186112465078e-06, "loss": 0.78996813, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.21179199, "step": 9565, "time_per_iteration": 2.822629690170288 }, { "auxiliary_loss_clip": 0.01222744, "auxiliary_loss_mlp": 0.01059471, "balance_loss_clip": 1.12820506, "balance_loss_mlp": 1.03486598, "epoch": 0.5751390350217946, "flos": 70697982844800.0, "grad_norm": 0.7429110730220063, "language_loss": 0.60757947, "learning_rate": 1.6128040116925287e-06, "loss": 0.63040161, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.24511719, "step": 9566, "time_per_iteration": 3.4485089778900146 }, { "auxiliary_loss_clip": 0.01427413, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.26046824, "balance_loss_mlp": 1.01490426, "epoch": 0.5751991582744627, "flos": 14254419168000.0, "grad_norm": 1.97416771907183, "language_loss": 0.76655155, "learning_rate": 1.6124219256024901e-06, "loss": 0.79117703, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20214844, "step": 9567, "time_per_iteration": 2.8615429401397705 }, { "auxiliary_loss_clip": 0.01417491, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.25172365, "balance_loss_mlp": 1.01512504, "epoch": 0.5752592815271306, "flos": 18336110261760.0, "grad_norm": 1.3954716258947677, "language_loss": 0.75165492, "learning_rate": 1.6120398542094504e-06, "loss": 0.77618766, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20666504, "step": 9568, "time_per_iteration": 4.267677545547485 }, { "auxiliary_loss_clip": 0.01422093, "auxiliary_loss_mlp": 0.01032582, "balance_loss_clip": 1.25459337, "balance_loss_mlp": 1.01222086, "epoch": 0.5753194047797986, "flos": 20932427946240.0, "grad_norm": 1.582773656486258, "language_loss": 0.72274375, "learning_rate": 1.6116577975278994e-06, "loss": 0.74729049, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20349121, "step": 9569, "time_per_iteration": 4.320723533630371 }, { "auxiliary_loss_clip": 0.01420295, "auxiliary_loss_mlp": 0.01041439, "balance_loss_clip": 1.25295377, "balance_loss_mlp": 1.01982594, "epoch": 0.5753795280324665, "flos": 19291303376640.0, "grad_norm": 2.889669900598554, "language_loss": 0.5639959, "learning_rate": 1.6112757555723223e-06, "loss": 0.58861327, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.21618652, "step": 9570, "time_per_iteration": 4.243241310119629 }, { "auxiliary_loss_clip": 0.01408948, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.24538743, "balance_loss_mlp": 1.01973927, "epoch": 0.5754396512851345, "flos": 21662862301440.0, "grad_norm": 1.4931114289916747, "language_loss": 0.64696705, "learning_rate": 1.6108937283572082e-06, "loss": 0.67146045, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20654297, "step": 9571, "time_per_iteration": 2.905670642852783 }, { "auxiliary_loss_clip": 0.01411058, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.24552405, "balance_loss_mlp": 1.01586938, "epoch": 0.5754997745378025, "flos": 51038729619840.0, "grad_norm": 1.6079505050111795, "language_loss": 0.67880315, "learning_rate": 1.6105117158970434e-06, "loss": 0.70327318, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20092773, "step": 9572, "time_per_iteration": 3.1498560905456543 }, { "auxiliary_loss_clip": 0.01410305, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.24618959, "balance_loss_mlp": 1.01825356, "epoch": 0.5755598977904705, "flos": 22867183140480.0, "grad_norm": 1.6564921837005748, "language_loss": 0.73163533, "learning_rate": 1.6101297182063123e-06, "loss": 0.75613815, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.21704102, "step": 9573, "time_per_iteration": 2.842282772064209 }, { "auxiliary_loss_clip": 0.01409034, "auxiliary_loss_mlp": 0.01033359, "balance_loss_clip": 1.24837971, "balance_loss_mlp": 1.01457131, "epoch": 0.5756200210431385, "flos": 38487176190720.0, "grad_norm": 1.8001822030984374, "language_loss": 0.76878279, "learning_rate": 1.6097477352995022e-06, "loss": 0.79320669, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18811035, "step": 9574, "time_per_iteration": 3.0207772254943848 }, { "auxiliary_loss_clip": 0.0144358, "auxiliary_loss_mlp": 0.01038977, "balance_loss_clip": 1.27037239, "balance_loss_mlp": 1.01738787, "epoch": 0.5756801442958064, "flos": 23919281913600.0, "grad_norm": 3.890396386877448, "language_loss": 0.67286688, "learning_rate": 1.6093657671910968e-06, "loss": 0.69769251, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.21569824, "step": 9575, "time_per_iteration": 2.8627254962921143 }, { "auxiliary_loss_clip": 0.01403803, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.2424705, "balance_loss_mlp": 1.01420975, "epoch": 0.5757402675484744, "flos": 21115036535040.0, "grad_norm": 1.4358875820676233, "language_loss": 0.80280131, "learning_rate": 1.6089838138955804e-06, "loss": 0.82719254, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.21105957, "step": 9576, "time_per_iteration": 2.8811585903167725 }, { "auxiliary_loss_clip": 0.01420466, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.25617206, "balance_loss_mlp": 1.01546621, "epoch": 0.5758003908011423, "flos": 20569020560640.0, "grad_norm": 2.637115430720199, "language_loss": 0.71000242, "learning_rate": 1.6086018754274372e-06, "loss": 0.73455894, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19714355, "step": 9577, "time_per_iteration": 2.825751304626465 }, { "auxiliary_loss_clip": 0.01420236, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.25228119, "balance_loss_mlp": 1.01690841, "epoch": 0.5758605140538103, "flos": 16481945335680.0, "grad_norm": 2.0072066267817124, "language_loss": 0.66838574, "learning_rate": 1.6082199518011504e-06, "loss": 0.69295502, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.19787598, "step": 9578, "time_per_iteration": 2.8282041549682617 }, { "auxiliary_loss_clip": 0.01417122, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.25257552, "balance_loss_mlp": 1.01254666, "epoch": 0.5759206373064782, "flos": 21297328410240.0, "grad_norm": 1.599003805009062, "language_loss": 0.73434865, "learning_rate": 1.6078380430312016e-06, "loss": 0.75884241, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19702148, "step": 9579, "time_per_iteration": 2.8292505741119385 }, { "auxiliary_loss_clip": 0.01435285, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.26400018, "balance_loss_mlp": 1.0133971, "epoch": 0.5759807605591463, "flos": 26079067336320.0, "grad_norm": 2.4174328209053852, "language_loss": 0.66628897, "learning_rate": 1.6074561491320742e-06, "loss": 0.69098324, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20751953, "step": 9580, "time_per_iteration": 2.9403624534606934 }, { "auxiliary_loss_clip": 0.01412302, "auxiliary_loss_mlp": 0.01035945, "balance_loss_clip": 1.24585176, "balance_loss_mlp": 1.01499987, "epoch": 0.5760408838118142, "flos": 18880542668160.0, "grad_norm": 2.1142892414711105, "language_loss": 0.86011469, "learning_rate": 1.6070742701182486e-06, "loss": 0.88459712, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20935059, "step": 9581, "time_per_iteration": 2.844252347946167 }, { "auxiliary_loss_clip": 0.01458896, "auxiliary_loss_mlp": 0.01040192, "balance_loss_clip": 1.28678691, "balance_loss_mlp": 1.01896119, "epoch": 0.5761010070644822, "flos": 15386701006080.0, "grad_norm": 2.0718823693040256, "language_loss": 0.68749315, "learning_rate": 1.6066924060042057e-06, "loss": 0.712484, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.21228027, "step": 9582, "time_per_iteration": 2.833075523376465 }, { "auxiliary_loss_clip": 0.01215791, "auxiliary_loss_mlp": 0.01049125, "balance_loss_clip": 1.12376189, "balance_loss_mlp": 1.02700007, "epoch": 0.5761611303171501, "flos": 71508464530560.0, "grad_norm": 0.6635219380741795, "language_loss": 0.57224357, "learning_rate": 1.6063105568044271e-06, "loss": 0.59489274, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.22167969, "step": 9583, "time_per_iteration": 3.5074870586395264 }, { "auxiliary_loss_clip": 0.01417746, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.25224674, "balance_loss_mlp": 1.01276231, "epoch": 0.5762212535698181, "flos": 16252978809600.0, "grad_norm": 1.7843484743298361, "language_loss": 0.83281118, "learning_rate": 1.6059287225333912e-06, "loss": 0.85731578, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19946289, "step": 9584, "time_per_iteration": 2.828568696975708 }, { "auxiliary_loss_clip": 0.01225705, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.12884545, "balance_loss_mlp": 1.01113844, "epoch": 0.5762813768224861, "flos": 70219255167360.0, "grad_norm": 0.6287756609342996, "language_loss": 0.49619007, "learning_rate": 1.6055469032055773e-06, "loss": 0.51877594, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.21777344, "step": 9585, "time_per_iteration": 3.389831304550171 }, { "auxiliary_loss_clip": 0.0141373, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.24769115, "balance_loss_mlp": 1.01335871, "epoch": 0.5763415000751541, "flos": 20526734655360.0, "grad_norm": 1.5513011508405017, "language_loss": 0.8579042, "learning_rate": 1.605165098835465e-06, "loss": 0.88237983, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20458984, "step": 9586, "time_per_iteration": 2.8331639766693115 }, { "auxiliary_loss_clip": 0.01414065, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.2479744, "balance_loss_mlp": 1.01438653, "epoch": 0.5764016233278221, "flos": 15824092919040.0, "grad_norm": 1.6537595481364107, "language_loss": 0.80499583, "learning_rate": 1.6047833094375308e-06, "loss": 0.82948977, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20935059, "step": 9587, "time_per_iteration": 2.879678726196289 }, { "auxiliary_loss_clip": 0.01423418, "auxiliary_loss_mlp": 0.01037727, "balance_loss_clip": 1.25662589, "balance_loss_mlp": 1.01644838, "epoch": 0.57646174658049, "flos": 20780794062720.0, "grad_norm": 1.8720312761335045, "language_loss": 0.66755772, "learning_rate": 1.6044015350262542e-06, "loss": 0.69216919, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21289062, "step": 9588, "time_per_iteration": 2.9449524879455566 }, { "auxiliary_loss_clip": 0.01419182, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.25110805, "balance_loss_mlp": 1.01480854, "epoch": 0.576521869833158, "flos": 23560353763200.0, "grad_norm": 1.8885624535290177, "language_loss": 0.79448062, "learning_rate": 1.6040197756161104e-06, "loss": 0.81903201, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21154785, "step": 9589, "time_per_iteration": 4.370093822479248 }, { "auxiliary_loss_clip": 0.01396854, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.23489976, "balance_loss_mlp": 1.01348245, "epoch": 0.5765819930858259, "flos": 20276249587200.0, "grad_norm": 2.116985814696345, "language_loss": 0.8057307, "learning_rate": 1.6036380312215762e-06, "loss": 0.83003151, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19750977, "step": 9590, "time_per_iteration": 2.838031053543091 }, { "auxiliary_loss_clip": 0.01427822, "auxiliary_loss_mlp": 0.01032152, "balance_loss_clip": 1.26109099, "balance_loss_mlp": 1.01208913, "epoch": 0.5766421163384939, "flos": 23159139707520.0, "grad_norm": 1.8856156118005807, "language_loss": 0.63818425, "learning_rate": 1.6032563018571283e-06, "loss": 0.66278398, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20080566, "step": 9591, "time_per_iteration": 2.837923288345337 }, { "auxiliary_loss_clip": 0.01417692, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.25080609, "balance_loss_mlp": 1.01640213, "epoch": 0.5767022395911618, "flos": 25860009421440.0, "grad_norm": 1.5167557531385518, "language_loss": 0.78387856, "learning_rate": 1.6028745875372406e-06, "loss": 0.80842113, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20141602, "step": 9592, "time_per_iteration": 2.882469892501831 }, { "auxiliary_loss_clip": 0.01217321, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.1212635, "balance_loss_mlp": 1.01184297, "epoch": 0.5767623628438299, "flos": 68327871753600.0, "grad_norm": 0.7350499726264537, "language_loss": 0.59753281, "learning_rate": 1.6024928882763885e-06, "loss": 0.62002379, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 0.9609375, "router_z_loss_mlp": 0.19921875, "step": 9593, "time_per_iteration": 3.4963326454162598 }, { "auxiliary_loss_clip": 0.01432185, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.26171219, "balance_loss_mlp": 1.01388693, "epoch": 0.5768224860964978, "flos": 30200103423360.0, "grad_norm": 2.0924087788161074, "language_loss": 0.71531767, "learning_rate": 1.6021112040890463e-06, "loss": 0.73999715, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21862793, "step": 9594, "time_per_iteration": 2.9057695865631104 }, { "auxiliary_loss_clip": 0.01418276, "auxiliary_loss_mlp": 0.01039628, "balance_loss_clip": 1.25193191, "balance_loss_mlp": 1.02005434, "epoch": 0.5768826093491658, "flos": 17904102480000.0, "grad_norm": 2.1106477122275304, "language_loss": 0.71573049, "learning_rate": 1.6017295349896863e-06, "loss": 0.74030954, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19567871, "step": 9595, "time_per_iteration": 2.843494415283203 }, { "auxiliary_loss_clip": 0.01413084, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.24761248, "balance_loss_mlp": 1.01411057, "epoch": 0.5769427326018337, "flos": 17466122384640.0, "grad_norm": 1.9824875285738517, "language_loss": 0.69971752, "learning_rate": 1.6013478809927828e-06, "loss": 0.72419012, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20068359, "step": 9596, "time_per_iteration": 2.837700128555298 }, { "auxiliary_loss_clip": 0.01440765, "auxiliary_loss_mlp": 0.01040604, "balance_loss_clip": 1.26713872, "balance_loss_mlp": 1.01917017, "epoch": 0.5770028558545017, "flos": 39436396992000.0, "grad_norm": 2.147558485661204, "language_loss": 0.68229485, "learning_rate": 1.6009662421128074e-06, "loss": 0.7071085, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.21435547, "step": 9597, "time_per_iteration": 3.0175962448120117 }, { "auxiliary_loss_clip": 0.01415753, "auxiliary_loss_mlp": 0.01036339, "balance_loss_clip": 1.24956489, "balance_loss_mlp": 1.01644325, "epoch": 0.5770629791071697, "flos": 21544510608000.0, "grad_norm": 1.8924729523168022, "language_loss": 0.82062888, "learning_rate": 1.6005846183642323e-06, "loss": 0.84514982, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19873047, "step": 9598, "time_per_iteration": 2.825298547744751 }, { "auxiliary_loss_clip": 0.01425756, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.25743723, "balance_loss_mlp": 1.0140065, "epoch": 0.5771231023598377, "flos": 20896612047360.0, "grad_norm": 1.7958867892416215, "language_loss": 0.7361927, "learning_rate": 1.6002030097615277e-06, "loss": 0.76079184, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20153809, "step": 9599, "time_per_iteration": 2.83890438079834 }, { "auxiliary_loss_clip": 0.0140623, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.24416876, "balance_loss_mlp": 1.0132283, "epoch": 0.5771832256125057, "flos": 18086258620800.0, "grad_norm": 2.292385817507339, "language_loss": 0.78933167, "learning_rate": 1.5998214163191663e-06, "loss": 0.81371939, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19299316, "step": 9600, "time_per_iteration": 2.8331172466278076 }, { "auxiliary_loss_clip": 0.01427326, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.25778604, "balance_loss_mlp": 1.01981449, "epoch": 0.5772433488651736, "flos": 26370164252160.0, "grad_norm": 1.7113576102411356, "language_loss": 0.73006511, "learning_rate": 1.5994398380516163e-06, "loss": 0.75473517, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.19873047, "step": 9601, "time_per_iteration": 2.875852584838867 }, { "auxiliary_loss_clip": 0.01414668, "auxiliary_loss_mlp": 0.01035951, "balance_loss_clip": 1.2507534, "balance_loss_mlp": 1.01482737, "epoch": 0.5773034721178416, "flos": 19690028968320.0, "grad_norm": 1.7058665889078708, "language_loss": 0.69148374, "learning_rate": 1.599058274973348e-06, "loss": 0.71598995, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.21142578, "step": 9602, "time_per_iteration": 2.838705539703369 }, { "auxiliary_loss_clip": 0.01396994, "auxiliary_loss_mlp": 0.01040007, "balance_loss_clip": 1.23765135, "balance_loss_mlp": 1.02036166, "epoch": 0.5773635953705095, "flos": 25093849656960.0, "grad_norm": 1.527887390502405, "language_loss": 0.73743999, "learning_rate": 1.5986767270988297e-06, "loss": 0.76181, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19641113, "step": 9603, "time_per_iteration": 4.263092756271362 }, { "auxiliary_loss_clip": 0.0141325, "auxiliary_loss_mlp": 0.0103553, "balance_loss_clip": 1.24895012, "balance_loss_mlp": 1.01613426, "epoch": 0.5774237186231775, "flos": 21042907044480.0, "grad_norm": 1.6294794917873814, "language_loss": 0.77594185, "learning_rate": 1.5982951944425298e-06, "loss": 0.80042964, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19396973, "step": 9604, "time_per_iteration": 4.4003331661224365 }, { "auxiliary_loss_clip": 0.01422813, "auxiliary_loss_mlp": 0.01036478, "balance_loss_clip": 1.25548756, "balance_loss_mlp": 1.01567626, "epoch": 0.5774838418758454, "flos": 15240134540160.0, "grad_norm": 1.6634882962032873, "language_loss": 0.837524, "learning_rate": 1.5979136770189174e-06, "loss": 0.86211693, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20812988, "step": 9605, "time_per_iteration": 4.197317361831665 }, { "auxiliary_loss_clip": 0.01452494, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.27742732, "balance_loss_mlp": 1.0170064, "epoch": 0.5775439651285135, "flos": 23592052385280.0, "grad_norm": 1.5894050009088756, "language_loss": 0.78632814, "learning_rate": 1.5975321748424581e-06, "loss": 0.81122994, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.20678711, "step": 9606, "time_per_iteration": 2.85898756980896 }, { "auxiliary_loss_clip": 0.01415554, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.24989283, "balance_loss_mlp": 1.0159632, "epoch": 0.5776040883811814, "flos": 18049492581120.0, "grad_norm": 1.761725318161477, "language_loss": 0.74180353, "learning_rate": 1.597150687927619e-06, "loss": 0.76631546, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1965332, "step": 9607, "time_per_iteration": 2.8024539947509766 }, { "auxiliary_loss_clip": 0.01426683, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.25929523, "balance_loss_mlp": 1.01889408, "epoch": 0.5776642116338494, "flos": 18633858163200.0, "grad_norm": 2.0253264297958857, "language_loss": 0.69928145, "learning_rate": 1.5967692162888664e-06, "loss": 0.72393513, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19787598, "step": 9608, "time_per_iteration": 2.848527431488037 }, { "auxiliary_loss_clip": 0.01416867, "auxiliary_loss_mlp": 0.01039145, "balance_loss_clip": 1.2496897, "balance_loss_mlp": 1.01835454, "epoch": 0.5777243348865173, "flos": 28414222179840.0, "grad_norm": 1.7471931396889104, "language_loss": 0.77358437, "learning_rate": 1.596387759940665e-06, "loss": 0.79814446, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20800781, "step": 9609, "time_per_iteration": 2.9093523025512695 }, { "auxiliary_loss_clip": 0.01406307, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.24074125, "balance_loss_mlp": 1.01253915, "epoch": 0.5777844581391853, "flos": 24035371367040.0, "grad_norm": 1.8320377933634127, "language_loss": 0.78219056, "learning_rate": 1.5960063188974808e-06, "loss": 0.80658031, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.2010498, "step": 9610, "time_per_iteration": 2.842288017272949 }, { "auxiliary_loss_clip": 0.01422465, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.25669622, "balance_loss_mlp": 1.01661539, "epoch": 0.5778445813918534, "flos": 17784257708160.0, "grad_norm": 2.179219040735565, "language_loss": 0.6989606, "learning_rate": 1.5956248931737777e-06, "loss": 0.72355616, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20471191, "step": 9611, "time_per_iteration": 2.84462833404541 }, { "auxiliary_loss_clip": 0.01413098, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.24954891, "balance_loss_mlp": 1.01386046, "epoch": 0.5779047046445213, "flos": 22242431934720.0, "grad_norm": 1.834851675157946, "language_loss": 0.83500773, "learning_rate": 1.5952434827840185e-06, "loss": 0.85948682, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20947266, "step": 9612, "time_per_iteration": 2.8602166175842285 }, { "auxiliary_loss_clip": 0.01412651, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.24850702, "balance_loss_mlp": 1.01404536, "epoch": 0.5779648278971893, "flos": 21444392568960.0, "grad_norm": 1.6650491066534918, "language_loss": 0.80110729, "learning_rate": 1.594862087742667e-06, "loss": 0.82557189, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19750977, "step": 9613, "time_per_iteration": 2.8393123149871826 }, { "auxiliary_loss_clip": 0.01405607, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.24204922, "balance_loss_mlp": 1.01541138, "epoch": 0.5780249511498572, "flos": 19035479422080.0, "grad_norm": 1.9041929212471937, "language_loss": 0.78186792, "learning_rate": 1.5944807080641863e-06, "loss": 0.80628407, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20593262, "step": 9614, "time_per_iteration": 2.833939552307129 }, { "auxiliary_loss_clip": 0.01419383, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.25114131, "balance_loss_mlp": 1.01644897, "epoch": 0.5780850744025252, "flos": 12130766357760.0, "grad_norm": 3.6386443646621887, "language_loss": 0.82369852, "learning_rate": 1.5940993437630375e-06, "loss": 0.84826273, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20593262, "step": 9615, "time_per_iteration": 2.8209636211395264 }, { "auxiliary_loss_clip": 0.0142063, "auxiliary_loss_mlp": 0.0103991, "balance_loss_clip": 1.25190294, "balance_loss_mlp": 1.02081251, "epoch": 0.5781451976551931, "flos": 25054866622080.0, "grad_norm": 1.5577939982528795, "language_loss": 0.6750465, "learning_rate": 1.5937179948536825e-06, "loss": 0.6996519, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19091797, "step": 9616, "time_per_iteration": 2.8806276321411133 }, { "auxiliary_loss_clip": 0.01404374, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.24265385, "balance_loss_mlp": 1.0196743, "epoch": 0.5782053209078611, "flos": 19255170764160.0, "grad_norm": 1.7448875836798061, "language_loss": 0.78801596, "learning_rate": 1.5933366613505812e-06, "loss": 0.81245923, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.20275879, "step": 9617, "time_per_iteration": 2.8146486282348633 }, { "auxiliary_loss_clip": 0.01407482, "auxiliary_loss_mlp": 0.01039944, "balance_loss_clip": 1.24389744, "balance_loss_mlp": 1.01949918, "epoch": 0.578265444160529, "flos": 26004720850560.0, "grad_norm": 1.50095810234385, "language_loss": 0.76080847, "learning_rate": 1.5929553432681947e-06, "loss": 0.78528273, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.2043457, "step": 9618, "time_per_iteration": 2.8852405548095703 }, { "auxiliary_loss_clip": 0.01411582, "auxiliary_loss_mlp": 0.0103618, "balance_loss_clip": 1.24803841, "balance_loss_mlp": 1.01661789, "epoch": 0.5783255674131971, "flos": 21808569116160.0, "grad_norm": 1.7132596656387997, "language_loss": 0.82130975, "learning_rate": 1.5925740406209826e-06, "loss": 0.84578735, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19567871, "step": 9619, "time_per_iteration": 2.8386099338531494 }, { "auxiliary_loss_clip": 0.01411469, "auxiliary_loss_mlp": 0.01042609, "balance_loss_clip": 1.24641538, "balance_loss_mlp": 1.02304673, "epoch": 0.578385690665865, "flos": 24800309521920.0, "grad_norm": 1.834069995924176, "language_loss": 0.73240411, "learning_rate": 1.5921927534234039e-06, "loss": 0.75694489, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19555664, "step": 9620, "time_per_iteration": 2.865807056427002 }, { "auxiliary_loss_clip": 0.01429839, "auxiliary_loss_mlp": 0.01040361, "balance_loss_clip": 1.26335335, "balance_loss_mlp": 1.02054858, "epoch": 0.578445813918533, "flos": 21222167518080.0, "grad_norm": 1.5909783185512139, "language_loss": 0.78326213, "learning_rate": 1.591811481689916e-06, "loss": 0.80796415, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19787598, "step": 9621, "time_per_iteration": 2.8440253734588623 }, { "auxiliary_loss_clip": 0.01421698, "auxiliary_loss_mlp": 0.01038268, "balance_loss_clip": 1.25558519, "balance_loss_mlp": 1.01852667, "epoch": 0.5785059371712009, "flos": 25057852778880.0, "grad_norm": 1.4019016669559574, "language_loss": 0.71212596, "learning_rate": 1.5914302254349787e-06, "loss": 0.73672563, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19750977, "step": 9622, "time_per_iteration": 2.888073444366455 }, { "auxiliary_loss_clip": 0.01217027, "auxiliary_loss_mlp": 0.01033537, "balance_loss_clip": 1.12587953, "balance_loss_mlp": 1.01131654, "epoch": 0.5785660604238689, "flos": 70877333808000.0, "grad_norm": 0.7777259577716826, "language_loss": 0.56019592, "learning_rate": 1.5910489846730476e-06, "loss": 0.58270156, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.22265625, "step": 9623, "time_per_iteration": 3.388606309890747 }, { "auxiliary_loss_clip": 0.01417477, "auxiliary_loss_mlp": 0.01043099, "balance_loss_clip": 1.24971581, "balance_loss_mlp": 1.02257133, "epoch": 0.578626183676537, "flos": 31662555701760.0, "grad_norm": 1.9168647914571713, "language_loss": 0.72048259, "learning_rate": 1.5906677594185799e-06, "loss": 0.7450884, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20532227, "step": 9624, "time_per_iteration": 2.933692693710327 }, { "auxiliary_loss_clip": 0.01413407, "auxiliary_loss_mlp": 0.01045239, "balance_loss_clip": 1.24838221, "balance_loss_mlp": 1.02490222, "epoch": 0.5786863069292049, "flos": 21873504683520.0, "grad_norm": 2.437695218053764, "language_loss": 0.83242714, "learning_rate": 1.5902865496860322e-06, "loss": 0.85701364, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20349121, "step": 9625, "time_per_iteration": 4.262711048126221 }, { "auxiliary_loss_clip": 0.01420048, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.25616479, "balance_loss_mlp": 1.0178802, "epoch": 0.5787464301818729, "flos": 23374668528000.0, "grad_norm": 1.5527125035245037, "language_loss": 0.70528024, "learning_rate": 1.5899053554898591e-06, "loss": 0.72986668, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20727539, "step": 9626, "time_per_iteration": 2.9079980850219727 }, { "auxiliary_loss_clip": 0.01410475, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.24646783, "balance_loss_mlp": 1.01803136, "epoch": 0.5788065534345408, "flos": 30015096860160.0, "grad_norm": 4.270699692625408, "language_loss": 0.72456765, "learning_rate": 1.5895241768445166e-06, "loss": 0.74905491, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.20227051, "step": 9627, "time_per_iteration": 2.9150376319885254 }, { "auxiliary_loss_clip": 0.01418061, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.25417519, "balance_loss_mlp": 1.01764798, "epoch": 0.5788666766872088, "flos": 24536658216960.0, "grad_norm": 1.7334315434430667, "language_loss": 0.8428638, "learning_rate": 1.589143013764458e-06, "loss": 0.8674196, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19885254, "step": 9628, "time_per_iteration": 2.8801112174987793 }, { "auxiliary_loss_clip": 0.01427272, "auxiliary_loss_mlp": 0.01039518, "balance_loss_clip": 1.26155567, "balance_loss_mlp": 1.01906204, "epoch": 0.5789267999398767, "flos": 23743414800000.0, "grad_norm": 1.5867481812755941, "language_loss": 0.72934943, "learning_rate": 1.5887618662641376e-06, "loss": 0.75401735, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20458984, "step": 9629, "time_per_iteration": 2.892033338546753 }, { "auxiliary_loss_clip": 0.01418725, "auxiliary_loss_mlp": 0.01039893, "balance_loss_clip": 1.25384867, "balance_loss_mlp": 1.01916194, "epoch": 0.5789869231925447, "flos": 21143070328320.0, "grad_norm": 2.065372319135669, "language_loss": 0.750741, "learning_rate": 1.5883807343580087e-06, "loss": 0.77532715, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20739746, "step": 9630, "time_per_iteration": 2.9415769577026367 }, { "auxiliary_loss_clip": 0.01399114, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.23911309, "balance_loss_mlp": 1.01738894, "epoch": 0.5790470464452127, "flos": 21218095486080.0, "grad_norm": 1.537372840187083, "language_loss": 0.79492116, "learning_rate": 1.587999618060523e-06, "loss": 0.81928098, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19482422, "step": 9631, "time_per_iteration": 2.8702595233917236 }, { "auxiliary_loss_clip": 0.01423249, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.25823879, "balance_loss_mlp": 1.01875949, "epoch": 0.5791071696978807, "flos": 23414873172480.0, "grad_norm": 9.662733625350944, "language_loss": 0.75739115, "learning_rate": 1.5876185173861333e-06, "loss": 0.78201067, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19934082, "step": 9632, "time_per_iteration": 2.8572511672973633 }, { "auxiliary_loss_clip": 0.01410774, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.2462523, "balance_loss_mlp": 1.0127852, "epoch": 0.5791672929505486, "flos": 24217075059840.0, "grad_norm": 2.124403997800271, "language_loss": 0.79579717, "learning_rate": 1.5872374323492915e-06, "loss": 0.82023478, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.2019043, "step": 9633, "time_per_iteration": 2.8659183979034424 }, { "auxiliary_loss_clip": 0.01452831, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.27888763, "balance_loss_mlp": 1.01975942, "epoch": 0.5792274162032166, "flos": 24358528863360.0, "grad_norm": 1.753991951687585, "language_loss": 0.78801966, "learning_rate": 1.5868563629644464e-06, "loss": 0.81295431, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20861816, "step": 9634, "time_per_iteration": 2.8666832447052 }, { "auxiliary_loss_clip": 0.01431893, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.26334524, "balance_loss_mlp": 1.01684773, "epoch": 0.5792875394558845, "flos": 20459220134400.0, "grad_norm": 2.5498845316499823, "language_loss": 0.64774567, "learning_rate": 1.5864753092460502e-06, "loss": 0.67243969, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20666504, "step": 9635, "time_per_iteration": 2.8647119998931885 }, { "auxiliary_loss_clip": 0.01411968, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.25012696, "balance_loss_mlp": 1.01405442, "epoch": 0.5793476627085525, "flos": 24070780062720.0, "grad_norm": 1.4630507628619012, "language_loss": 0.78032589, "learning_rate": 1.5860942712085516e-06, "loss": 0.80477893, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19287109, "step": 9636, "time_per_iteration": 2.874861478805542 }, { "auxiliary_loss_clip": 0.01410257, "auxiliary_loss_mlp": 0.01036135, "balance_loss_clip": 1.24932826, "balance_loss_mlp": 1.01762199, "epoch": 0.5794077859612206, "flos": 22064393070720.0, "grad_norm": 1.576172456139333, "language_loss": 0.69591963, "learning_rate": 1.5857132488663998e-06, "loss": 0.72038364, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18505859, "step": 9637, "time_per_iteration": 2.8470871448516846 }, { "auxiliary_loss_clip": 0.01435388, "auxiliary_loss_mlp": 0.01038974, "balance_loss_clip": 1.26678252, "balance_loss_mlp": 1.01832688, "epoch": 0.5794679092138885, "flos": 11441667767040.0, "grad_norm": 13.814267988394317, "language_loss": 0.73239106, "learning_rate": 1.585332242234043e-06, "loss": 0.75713468, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.2064209, "step": 9638, "time_per_iteration": 4.2034752368927 }, { "auxiliary_loss_clip": 0.01421954, "auxiliary_loss_mlp": 0.01040582, "balance_loss_clip": 1.25773251, "balance_loss_mlp": 1.02113891, "epoch": 0.5795280324665565, "flos": 18889546383360.0, "grad_norm": 1.5656758674596514, "language_loss": 0.73185062, "learning_rate": 1.5849512513259291e-06, "loss": 0.75647593, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19445801, "step": 9639, "time_per_iteration": 4.262020587921143 }, { "auxiliary_loss_clip": 0.01417103, "auxiliary_loss_mlp": 0.01038806, "balance_loss_clip": 1.25280678, "balance_loss_mlp": 1.01920772, "epoch": 0.5795881557192244, "flos": 13013739492480.0, "grad_norm": 4.007521070679836, "language_loss": 0.69990987, "learning_rate": 1.5845702761565054e-06, "loss": 0.72446895, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19604492, "step": 9640, "time_per_iteration": 4.309274196624756 }, { "auxiliary_loss_clip": 0.01453974, "auxiliary_loss_mlp": 0.01043175, "balance_loss_clip": 1.28020322, "balance_loss_mlp": 1.02168131, "epoch": 0.5796482789718924, "flos": 19940468791680.0, "grad_norm": 2.769298514180623, "language_loss": 0.78119987, "learning_rate": 1.5841893167402183e-06, "loss": 0.80617142, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21496582, "step": 9641, "time_per_iteration": 2.7930023670196533 }, { "auxiliary_loss_clip": 0.01419821, "auxiliary_loss_mlp": 0.01038372, "balance_loss_clip": 1.25481629, "balance_loss_mlp": 1.01809442, "epoch": 0.5797084022245603, "flos": 21660238103040.0, "grad_norm": 2.103226355990795, "language_loss": 0.74886531, "learning_rate": 1.5838083730915143e-06, "loss": 0.77344722, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20288086, "step": 9642, "time_per_iteration": 2.841566562652588 }, { "auxiliary_loss_clip": 0.01415094, "auxiliary_loss_mlp": 0.01041646, "balance_loss_clip": 1.2508713, "balance_loss_mlp": 1.02082014, "epoch": 0.5797685254772283, "flos": 26042798989440.0, "grad_norm": 1.4937907130960841, "language_loss": 0.74237978, "learning_rate": 1.5834274452248378e-06, "loss": 0.76694715, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20812988, "step": 9643, "time_per_iteration": 2.8780648708343506 }, { "auxiliary_loss_clip": 0.01427635, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.26114643, "balance_loss_mlp": 1.01649523, "epoch": 0.5798286487298963, "flos": 22713920444160.0, "grad_norm": 3.8865801566266223, "language_loss": 0.68518865, "learning_rate": 1.5830465331546352e-06, "loss": 0.70983517, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20507812, "step": 9644, "time_per_iteration": 2.8448522090911865 }, { "auxiliary_loss_clip": 0.01435206, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.26650763, "balance_loss_mlp": 1.01765049, "epoch": 0.5798887719825643, "flos": 23159411176320.0, "grad_norm": 2.0564735405782266, "language_loss": 0.86839861, "learning_rate": 1.5826656368953496e-06, "loss": 0.89314032, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21313477, "step": 9645, "time_per_iteration": 2.8298254013061523 }, { "auxiliary_loss_clip": 0.01425137, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.25997949, "balance_loss_mlp": 1.01581144, "epoch": 0.5799488952352322, "flos": 24436811646720.0, "grad_norm": 2.2907869005974737, "language_loss": 0.76128471, "learning_rate": 1.5822847564614244e-06, "loss": 0.78589129, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19714355, "step": 9646, "time_per_iteration": 2.899134635925293 }, { "auxiliary_loss_clip": 0.01436424, "auxiliary_loss_mlp": 0.01039859, "balance_loss_clip": 1.26713765, "balance_loss_mlp": 1.01775765, "epoch": 0.5800090184879002, "flos": 38409210120960.0, "grad_norm": 2.03549269623621, "language_loss": 0.59862489, "learning_rate": 1.5819038918673038e-06, "loss": 0.62338769, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.2208252, "step": 9647, "time_per_iteration": 3.032667398452759 }, { "auxiliary_loss_clip": 0.01433655, "auxiliary_loss_mlp": 0.01045947, "balance_loss_clip": 1.26467729, "balance_loss_mlp": 1.02543068, "epoch": 0.5800691417405681, "flos": 19792454492160.0, "grad_norm": 1.589211301148527, "language_loss": 0.84715652, "learning_rate": 1.5815230431274288e-06, "loss": 0.87195253, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20507812, "step": 9648, "time_per_iteration": 2.9254708290100098 }, { "auxiliary_loss_clip": 0.01214758, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.12095559, "balance_loss_mlp": 1.00376916, "epoch": 0.5801292649932361, "flos": 70343968394880.0, "grad_norm": 0.8604722644748034, "language_loss": 0.63157737, "learning_rate": 1.581142210256242e-06, "loss": 0.65396678, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20410156, "step": 9649, "time_per_iteration": 3.407658338546753 }, { "auxiliary_loss_clip": 0.01405054, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.24384511, "balance_loss_mlp": 1.01509881, "epoch": 0.5801893882459042, "flos": 18744246771840.0, "grad_norm": 1.8771501263449424, "language_loss": 0.82366675, "learning_rate": 1.5807613932681857e-06, "loss": 0.84806156, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1932373, "step": 9650, "time_per_iteration": 2.8750314712524414 }, { "auxiliary_loss_clip": 0.01425657, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.25614154, "balance_loss_mlp": 1.01493013, "epoch": 0.5802495114985721, "flos": 15604989759360.0, "grad_norm": 2.14037973103482, "language_loss": 0.77429354, "learning_rate": 1.580380592177698e-06, "loss": 0.79889977, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20019531, "step": 9651, "time_per_iteration": 2.8271162509918213 }, { "auxiliary_loss_clip": 0.01429727, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.26090932, "balance_loss_mlp": 1.017308, "epoch": 0.5803096347512401, "flos": 18263392588800.0, "grad_norm": 2.2856666126433285, "language_loss": 0.74991155, "learning_rate": 1.5799998069992213e-06, "loss": 0.77457595, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.1940918, "step": 9652, "time_per_iteration": 2.9189822673797607 }, { "auxiliary_loss_clip": 0.01417885, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 1.24969983, "balance_loss_mlp": 1.01708949, "epoch": 0.580369758003908, "flos": 22903406242560.0, "grad_norm": 4.400847956487363, "language_loss": 0.77783048, "learning_rate": 1.579619037747193e-06, "loss": 0.80239141, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21118164, "step": 9653, "time_per_iteration": 2.8709278106689453 }, { "auxiliary_loss_clip": 0.01419052, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.25171161, "balance_loss_mlp": 1.01508391, "epoch": 0.580429881256576, "flos": 18706937794560.0, "grad_norm": 2.2308169713829287, "language_loss": 0.75700247, "learning_rate": 1.5792382844360534e-06, "loss": 0.78155243, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20849609, "step": 9654, "time_per_iteration": 2.825124740600586 }, { "auxiliary_loss_clip": 0.01400842, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.23925114, "balance_loss_mlp": 1.01522446, "epoch": 0.5804900045092439, "flos": 24692952314880.0, "grad_norm": 1.8439425104151423, "language_loss": 0.71393049, "learning_rate": 1.5788575470802408e-06, "loss": 0.73829186, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20056152, "step": 9655, "time_per_iteration": 2.8343758583068848 }, { "auxiliary_loss_clip": 0.01423433, "auxiliary_loss_mlp": 0.01034612, "balance_loss_clip": 1.25214159, "balance_loss_mlp": 1.01297569, "epoch": 0.580550127761912, "flos": 23123278563840.0, "grad_norm": 2.1005716896837603, "language_loss": 0.70085263, "learning_rate": 1.5784768256941915e-06, "loss": 0.72543311, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21630859, "step": 9656, "time_per_iteration": 2.8374574184417725 }, { "auxiliary_loss_clip": 0.01392963, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 1.23334038, "balance_loss_mlp": 1.01043904, "epoch": 0.5806102510145799, "flos": 18484079316480.0, "grad_norm": 1.670552258171982, "language_loss": 0.72103113, "learning_rate": 1.5780961202923433e-06, "loss": 0.74525577, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19055176, "step": 9657, "time_per_iteration": 2.8063547611236572 }, { "auxiliary_loss_clip": 0.01437333, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.26663828, "balance_loss_mlp": 1.01776123, "epoch": 0.5806703742672479, "flos": 23926385347200.0, "grad_norm": 2.4987999101303564, "language_loss": 0.7201649, "learning_rate": 1.5777154308891328e-06, "loss": 0.74492252, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.20678711, "step": 9658, "time_per_iteration": 2.9283196926116943 }, { "auxiliary_loss_clip": 0.0120973, "auxiliary_loss_mlp": 0.01028751, "balance_loss_clip": 1.1197058, "balance_loss_mlp": 1.00576723, "epoch": 0.5807304975199158, "flos": 66343184282880.0, "grad_norm": 0.6479624456958403, "language_loss": 0.5359453, "learning_rate": 1.5773347574989953e-06, "loss": 0.55833012, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.22949219, "step": 9659, "time_per_iteration": 4.812680959701538 }, { "auxiliary_loss_clip": 0.01424726, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.25564694, "balance_loss_mlp": 1.01872468, "epoch": 0.5807906207725838, "flos": 31734911416320.0, "grad_norm": 3.6420768443223683, "language_loss": 0.63312209, "learning_rate": 1.576954100136366e-06, "loss": 0.65776789, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21105957, "step": 9660, "time_per_iteration": 2.938620090484619 }, { "auxiliary_loss_clip": 0.01411904, "auxiliary_loss_mlp": 0.01034269, "balance_loss_clip": 1.24419188, "balance_loss_mlp": 1.01403928, "epoch": 0.5808507440252517, "flos": 23810567362560.0, "grad_norm": 1.593740415267695, "language_loss": 0.66413468, "learning_rate": 1.5765734588156797e-06, "loss": 0.68859637, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20227051, "step": 9661, "time_per_iteration": 2.8557522296905518 }, { "auxiliary_loss_clip": 0.01392883, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.23334396, "balance_loss_mlp": 1.01097786, "epoch": 0.5809108672779197, "flos": 13706231443200.0, "grad_norm": 2.6670945630162213, "language_loss": 0.75088805, "learning_rate": 1.5761928335513704e-06, "loss": 0.77513587, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20935059, "step": 9662, "time_per_iteration": 2.8302438259124756 }, { "auxiliary_loss_clip": 0.01214195, "auxiliary_loss_mlp": 0.01052736, "balance_loss_clip": 1.12023628, "balance_loss_mlp": 1.02946651, "epoch": 0.5809709905305876, "flos": 69170278285440.0, "grad_norm": 0.8847474026625254, "language_loss": 0.5849157, "learning_rate": 1.5758122243578709e-06, "loss": 0.60758501, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.23242188, "step": 9663, "time_per_iteration": 3.3706986904144287 }, { "auxiliary_loss_clip": 0.01404862, "auxiliary_loss_mlp": 0.01030255, "balance_loss_clip": 1.24237251, "balance_loss_mlp": 1.01076448, "epoch": 0.5810311137832557, "flos": 19836776413440.0, "grad_norm": 2.351922942396373, "language_loss": 0.82888025, "learning_rate": 1.5754316312496152e-06, "loss": 0.85323137, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19470215, "step": 9664, "time_per_iteration": 2.8335611820220947 }, { "auxiliary_loss_clip": 0.01408164, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.23928022, "balance_loss_mlp": 1.01241338, "epoch": 0.5810912370359237, "flos": 29249570522880.0, "grad_norm": 1.6119931103610399, "language_loss": 0.82341415, "learning_rate": 1.5750510542410337e-06, "loss": 0.84783196, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21203613, "step": 9665, "time_per_iteration": 2.9228031635284424 }, { "auxiliary_loss_clip": 0.01435346, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.26398349, "balance_loss_mlp": 1.01672828, "epoch": 0.5811513602885916, "flos": 22796275259520.0, "grad_norm": 1.6137206480338748, "language_loss": 0.81821132, "learning_rate": 1.5746704933465599e-06, "loss": 0.84293824, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20629883, "step": 9666, "time_per_iteration": 2.8480751514434814 }, { "auxiliary_loss_clip": 0.01413193, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.24985802, "balance_loss_mlp": 1.01574397, "epoch": 0.5812114835412596, "flos": 18743658589440.0, "grad_norm": 1.927148951646313, "language_loss": 0.81023264, "learning_rate": 1.5742899485806227e-06, "loss": 0.8347162, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19421387, "step": 9667, "time_per_iteration": 2.8306288719177246 }, { "auxiliary_loss_clip": 0.01441449, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.26641202, "balance_loss_mlp": 1.01651382, "epoch": 0.5812716067939275, "flos": 26442203253120.0, "grad_norm": 3.022829467062817, "language_loss": 0.79354572, "learning_rate": 1.573909419957653e-06, "loss": 0.81834054, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.21520996, "step": 9668, "time_per_iteration": 2.8757975101470947 }, { "auxiliary_loss_clip": 0.01410786, "auxiliary_loss_mlp": 0.01040352, "balance_loss_clip": 1.24442458, "balance_loss_mlp": 1.02007437, "epoch": 0.5813317300465956, "flos": 43413355077120.0, "grad_norm": 1.7779705162503576, "language_loss": 0.65692675, "learning_rate": 1.5735289074920819e-06, "loss": 0.68143821, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20275879, "step": 9669, "time_per_iteration": 3.0489182472229004 }, { "auxiliary_loss_clip": 0.01405791, "auxiliary_loss_mlp": 0.01042599, "balance_loss_clip": 1.24131405, "balance_loss_mlp": 1.02105784, "epoch": 0.5813918532992635, "flos": 24794925390720.0, "grad_norm": 1.4745824876898979, "language_loss": 0.7407136, "learning_rate": 1.5731484111983363e-06, "loss": 0.76519758, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.2154541, "step": 9670, "time_per_iteration": 2.9266796112060547 }, { "auxiliary_loss_clip": 0.01416773, "auxiliary_loss_mlp": 0.01038793, "balance_loss_clip": 1.24924731, "balance_loss_mlp": 1.0186584, "epoch": 0.5814519765519315, "flos": 22867771322880.0, "grad_norm": 2.0199678771878298, "language_loss": 0.79506385, "learning_rate": 1.5727679310908464e-06, "loss": 0.81961954, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20141602, "step": 9671, "time_per_iteration": 2.8541440963745117 }, { "auxiliary_loss_clip": 0.01445218, "auxiliary_loss_mlp": 0.01042285, "balance_loss_clip": 1.27219164, "balance_loss_mlp": 1.02123249, "epoch": 0.5815120998045994, "flos": 24071413489920.0, "grad_norm": 2.033699395922381, "language_loss": 0.62801576, "learning_rate": 1.5723874671840399e-06, "loss": 0.6528908, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21032715, "step": 9672, "time_per_iteration": 4.229781627655029 }, { "auxiliary_loss_clip": 0.01410273, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.24586141, "balance_loss_mlp": 1.01457453, "epoch": 0.5815722230572674, "flos": 24290064201600.0, "grad_norm": 1.6086933957418081, "language_loss": 0.82325524, "learning_rate": 1.572007019492342e-06, "loss": 0.84770179, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19799805, "step": 9673, "time_per_iteration": 2.961327075958252 }, { "auxiliary_loss_clip": 0.01421899, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.25221586, "balance_loss_mlp": 1.01652801, "epoch": 0.5816323463099353, "flos": 22210552333440.0, "grad_norm": 2.049307579270645, "language_loss": 0.89059794, "learning_rate": 1.5716265880301817e-06, "loss": 0.91519082, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20861816, "step": 9674, "time_per_iteration": 2.8462719917297363 }, { "auxiliary_loss_clip": 0.01415659, "auxiliary_loss_mlp": 0.0103676, "balance_loss_clip": 1.25016499, "balance_loss_mlp": 1.01709032, "epoch": 0.5816924695626033, "flos": 24145217038080.0, "grad_norm": 1.646159890888627, "language_loss": 0.79586762, "learning_rate": 1.571246172811984e-06, "loss": 0.82039183, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19665527, "step": 9675, "time_per_iteration": 4.34076452255249 }, { "auxiliary_loss_clip": 0.01424015, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.257146, "balance_loss_mlp": 1.01609516, "epoch": 0.5817525928152713, "flos": 21334049205120.0, "grad_norm": 2.8290256239270213, "language_loss": 0.70876747, "learning_rate": 1.5708657738521748e-06, "loss": 0.73337501, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20654297, "step": 9676, "time_per_iteration": 2.8427958488464355 }, { "auxiliary_loss_clip": 0.01424719, "auxiliary_loss_mlp": 0.01040486, "balance_loss_clip": 1.25625145, "balance_loss_mlp": 1.02075696, "epoch": 0.5818127160679393, "flos": 26943580592640.0, "grad_norm": 2.560486599265088, "language_loss": 0.6486333, "learning_rate": 1.5704853911651779e-06, "loss": 0.67328537, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19726562, "step": 9677, "time_per_iteration": 2.8650524616241455 }, { "auxiliary_loss_clip": 0.01218561, "auxiliary_loss_mlp": 0.01033328, "balance_loss_clip": 1.12005293, "balance_loss_mlp": 1.00509918, "epoch": 0.5818728393206073, "flos": 63953002483200.0, "grad_norm": 0.8048764321380758, "language_loss": 0.54221833, "learning_rate": 1.5701050247654182e-06, "loss": 0.56473732, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 0.984375, "router_z_loss_mlp": 0.28320312, "step": 9678, "time_per_iteration": 3.4022960662841797 }, { "auxiliary_loss_clip": 0.0121324, "auxiliary_loss_mlp": 0.01037177, "balance_loss_clip": 1.11514449, "balance_loss_mlp": 1.01104641, "epoch": 0.5819329625732752, "flos": 64982886059520.0, "grad_norm": 0.7487936075394347, "language_loss": 0.56282324, "learning_rate": 1.569724674667319e-06, "loss": 0.58532739, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 0.9765625, "router_z_loss_mlp": 0.26171875, "step": 9679, "time_per_iteration": 3.2118566036224365 }, { "auxiliary_loss_clip": 0.01423609, "auxiliary_loss_mlp": 0.01042017, "balance_loss_clip": 1.25541556, "balance_loss_mlp": 1.02142978, "epoch": 0.5819930858259432, "flos": 21225198919680.0, "grad_norm": 1.5938441270552801, "language_loss": 0.66086036, "learning_rate": 1.5693443408853032e-06, "loss": 0.6855166, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20581055, "step": 9680, "time_per_iteration": 2.852072238922119 }, { "auxiliary_loss_clip": 0.01422856, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.25579154, "balance_loss_mlp": 1.01415455, "epoch": 0.5820532090786111, "flos": 19466582307840.0, "grad_norm": 2.2313199376645008, "language_loss": 0.8416512, "learning_rate": 1.5689640234337933e-06, "loss": 0.86622649, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20507812, "step": 9681, "time_per_iteration": 2.8397393226623535 }, { "auxiliary_loss_clip": 0.0141893, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.25355279, "balance_loss_mlp": 1.01616573, "epoch": 0.5821133323312792, "flos": 17721629625600.0, "grad_norm": 1.791458377621193, "language_loss": 0.76984024, "learning_rate": 1.5685837223272109e-06, "loss": 0.79439384, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20275879, "step": 9682, "time_per_iteration": 2.833104372024536 }, { "auxiliary_loss_clip": 0.01427628, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.2572161, "balance_loss_mlp": 1.01495767, "epoch": 0.5821734555839471, "flos": 24582563706240.0, "grad_norm": 2.1529484069121465, "language_loss": 0.75832772, "learning_rate": 1.568203437579977e-06, "loss": 0.78296089, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20727539, "step": 9683, "time_per_iteration": 2.8651559352874756 }, { "auxiliary_loss_clip": 0.01432917, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 1.26151764, "balance_loss_mlp": 1.01363981, "epoch": 0.5822335788366151, "flos": 22392346515840.0, "grad_norm": 2.2989960855594154, "language_loss": 0.74830556, "learning_rate": 1.5678231692065116e-06, "loss": 0.77297199, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20092773, "step": 9684, "time_per_iteration": 2.8428680896759033 }, { "auxiliary_loss_clip": 0.01429947, "auxiliary_loss_mlp": 0.01043939, "balance_loss_clip": 1.26024687, "balance_loss_mlp": 1.02317262, "epoch": 0.582293702089283, "flos": 26733300168960.0, "grad_norm": 2.161050554772547, "language_loss": 0.79112697, "learning_rate": 1.5674429172212348e-06, "loss": 0.81586587, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.2076416, "step": 9685, "time_per_iteration": 2.8790948390960693 }, { "auxiliary_loss_clip": 0.01425972, "auxiliary_loss_mlp": 0.0104226, "balance_loss_clip": 1.25707865, "balance_loss_mlp": 1.02180338, "epoch": 0.582353825341951, "flos": 17357815036800.0, "grad_norm": 1.693622388726344, "language_loss": 0.76161206, "learning_rate": 1.5670626816385667e-06, "loss": 0.7862944, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20458984, "step": 9686, "time_per_iteration": 2.8611233234405518 }, { "auxiliary_loss_clip": 0.01212111, "auxiliary_loss_mlp": 0.01024441, "balance_loss_clip": 1.117311, "balance_loss_mlp": 0.9971658, "epoch": 0.5824139485946189, "flos": 55500993861120.0, "grad_norm": 0.8142254599066522, "language_loss": 0.57390594, "learning_rate": 1.5666824624729244e-06, "loss": 0.59627151, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.2734375, "step": 9687, "time_per_iteration": 3.1717722415924072 }, { "auxiliary_loss_clip": 0.01426124, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.25716054, "balance_loss_mlp": 1.01165211, "epoch": 0.582474071847287, "flos": 20312698913280.0, "grad_norm": 3.0611420396577205, "language_loss": 0.70981491, "learning_rate": 1.566302259738727e-06, "loss": 0.73440552, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21264648, "step": 9688, "time_per_iteration": 2.8849117755889893 }, { "auxiliary_loss_clip": 0.01418339, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.25107467, "balance_loss_mlp": 1.01461875, "epoch": 0.5825341950999549, "flos": 23888442942720.0, "grad_norm": 2.397390264003874, "language_loss": 0.66238022, "learning_rate": 1.5659220734503918e-06, "loss": 0.68690968, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.1998291, "step": 9689, "time_per_iteration": 2.896416425704956 }, { "auxiliary_loss_clip": 0.01421131, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.25619447, "balance_loss_mlp": 1.0177089, "epoch": 0.5825943183526229, "flos": 23123595277440.0, "grad_norm": 1.8927864218734844, "language_loss": 0.7424801, "learning_rate": 1.5655419036223341e-06, "loss": 0.76708198, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.21350098, "step": 9690, "time_per_iteration": 2.8680717945098877 }, { "auxiliary_loss_clip": 0.01424799, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.2553612, "balance_loss_mlp": 1.01815951, "epoch": 0.5826544416052909, "flos": 22867861812480.0, "grad_norm": 1.8665242399882351, "language_loss": 0.76331794, "learning_rate": 1.5651617502689717e-06, "loss": 0.78796935, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.22180176, "step": 9691, "time_per_iteration": 2.839571237564087 }, { "auxiliary_loss_clip": 0.01412853, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.24444783, "balance_loss_mlp": 1.01475811, "epoch": 0.5827145648579588, "flos": 31513817485440.0, "grad_norm": 1.6841847879511873, "language_loss": 0.81338853, "learning_rate": 1.5647816134047184e-06, "loss": 0.83786863, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20410156, "step": 9692, "time_per_iteration": 2.8897199630737305 }, { "auxiliary_loss_clip": 0.01208009, "auxiliary_loss_mlp": 0.01023425, "balance_loss_clip": 1.11375308, "balance_loss_mlp": 0.9967224, "epoch": 0.5827746881106268, "flos": 69843179237760.0, "grad_norm": 0.7596682392227048, "language_loss": 0.56978083, "learning_rate": 1.5644014930439907e-06, "loss": 0.59209514, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 0.94140625, "router_z_loss_mlp": 0.26757812, "step": 9693, "time_per_iteration": 3.330054998397827 }, { "auxiliary_loss_clip": 0.01419218, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.25144732, "balance_loss_mlp": 1.0171206, "epoch": 0.5828348113632947, "flos": 23122780871040.0, "grad_norm": 1.764733363679559, "language_loss": 0.79567814, "learning_rate": 1.5640213892012025e-06, "loss": 0.82025248, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.2109375, "step": 9694, "time_per_iteration": 4.3018248081207275 }, { "auxiliary_loss_clip": 0.01396337, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.23617756, "balance_loss_mlp": 1.01338339, "epoch": 0.5828949346159628, "flos": 21883458539520.0, "grad_norm": 1.3184454041143983, "language_loss": 0.77102661, "learning_rate": 1.5636413018907656e-06, "loss": 0.79531467, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19091797, "step": 9695, "time_per_iteration": 2.8470022678375244 }, { "auxiliary_loss_clip": 0.01212185, "auxiliary_loss_mlp": 0.01023038, "balance_loss_clip": 1.11669779, "balance_loss_mlp": 0.99614471, "epoch": 0.5829550578686307, "flos": 65997648593280.0, "grad_norm": 0.7769868158984526, "language_loss": 0.55081832, "learning_rate": 1.563261231127095e-06, "loss": 0.57317054, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 0.953125, "router_z_loss_mlp": 0.26953125, "step": 9696, "time_per_iteration": 3.3739840984344482 }, { "auxiliary_loss_clip": 0.01425434, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.25724792, "balance_loss_mlp": 1.01299107, "epoch": 0.5830151811212987, "flos": 16298477095680.0, "grad_norm": 2.1411064077493345, "language_loss": 0.77801967, "learning_rate": 1.5628811769246021e-06, "loss": 0.80260897, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20495605, "step": 9697, "time_per_iteration": 2.817671060562134 }, { "auxiliary_loss_clip": 0.0142503, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.25431085, "balance_loss_mlp": 1.01768613, "epoch": 0.5830753043739666, "flos": 24179087410560.0, "grad_norm": 1.715919874126014, "language_loss": 0.7821691, "learning_rate": 1.5625011392976991e-06, "loss": 0.80681401, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21777344, "step": 9698, "time_per_iteration": 2.8826565742492676 }, { "auxiliary_loss_clip": 0.01411112, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.24587297, "balance_loss_mlp": 1.01881647, "epoch": 0.5831354276266346, "flos": 27072067121280.0, "grad_norm": 1.8108074775284826, "language_loss": 0.84124774, "learning_rate": 1.5621211182607966e-06, "loss": 0.86575437, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20739746, "step": 9699, "time_per_iteration": 2.9753732681274414 }, { "auxiliary_loss_clip": 0.01421283, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.25099277, "balance_loss_mlp": 1.01486611, "epoch": 0.5831955508793025, "flos": 23634066821760.0, "grad_norm": 2.2487772090029763, "language_loss": 0.66762424, "learning_rate": 1.561741113828305e-06, "loss": 0.69219768, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21191406, "step": 9700, "time_per_iteration": 2.955373764038086 }, { "auxiliary_loss_clip": 0.01423996, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.25561738, "balance_loss_mlp": 1.01743698, "epoch": 0.5832556741319705, "flos": 24984139720320.0, "grad_norm": 1.5813117954852438, "language_loss": 0.71552503, "learning_rate": 1.5613611260146344e-06, "loss": 0.74015617, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21691895, "step": 9701, "time_per_iteration": 2.923311710357666 }, { "auxiliary_loss_clip": 0.01408708, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.2418226, "balance_loss_mlp": 1.01636207, "epoch": 0.5833157973846385, "flos": 23231902625280.0, "grad_norm": 13.706303535976698, "language_loss": 0.86112154, "learning_rate": 1.5609811548341936e-06, "loss": 0.88557857, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.2064209, "step": 9702, "time_per_iteration": 2.8524200916290283 }, { "auxiliary_loss_clip": 0.01405209, "auxiliary_loss_mlp": 0.01036295, "balance_loss_clip": 1.24109197, "balance_loss_mlp": 1.01612508, "epoch": 0.5833759206373065, "flos": 21987150917760.0, "grad_norm": 1.5600128491478518, "language_loss": 0.78371543, "learning_rate": 1.560601200301392e-06, "loss": 0.8081305, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20178223, "step": 9703, "time_per_iteration": 2.8563156127929688 }, { "auxiliary_loss_clip": 0.01422738, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.25290561, "balance_loss_mlp": 1.01642728, "epoch": 0.5834360438899745, "flos": 21772436503680.0, "grad_norm": 1.7458935014644952, "language_loss": 0.72066379, "learning_rate": 1.5602212624306366e-06, "loss": 0.74525821, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20275879, "step": 9704, "time_per_iteration": 2.8372626304626465 }, { "auxiliary_loss_clip": 0.01423305, "auxiliary_loss_mlp": 0.01037785, "balance_loss_clip": 1.25456917, "balance_loss_mlp": 1.01778185, "epoch": 0.5834961671426424, "flos": 15999779053440.0, "grad_norm": 1.7348376578404974, "language_loss": 0.82279408, "learning_rate": 1.559841341236335e-06, "loss": 0.84740496, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.1998291, "step": 9705, "time_per_iteration": 2.842571496963501 }, { "auxiliary_loss_clip": 0.01421575, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.25360191, "balance_loss_mlp": 1.01265836, "epoch": 0.5835562903953104, "flos": 22828064371200.0, "grad_norm": 1.6209014300463165, "language_loss": 0.80654061, "learning_rate": 1.5594614367328937e-06, "loss": 0.83108008, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.1973877, "step": 9706, "time_per_iteration": 2.8453478813171387 }, { "auxiliary_loss_clip": 0.01415916, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.25005603, "balance_loss_mlp": 1.01638305, "epoch": 0.5836164136479783, "flos": 48484516861440.0, "grad_norm": 1.747309918742351, "language_loss": 0.75973254, "learning_rate": 1.5590815489347187e-06, "loss": 0.7842651, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20947266, "step": 9707, "time_per_iteration": 4.4875876903533936 }, { "auxiliary_loss_clip": 0.0140191, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.23857784, "balance_loss_mlp": 1.01390481, "epoch": 0.5836765369006464, "flos": 26917130367360.0, "grad_norm": 2.062395980263255, "language_loss": 0.82534826, "learning_rate": 1.5587016778562163e-06, "loss": 0.84970641, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20007324, "step": 9708, "time_per_iteration": 2.88557505607605 }, { "auxiliary_loss_clip": 0.01414949, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.25026846, "balance_loss_mlp": 1.01664567, "epoch": 0.5837366601533143, "flos": 20093640998400.0, "grad_norm": 1.5586350508639089, "language_loss": 0.79298002, "learning_rate": 1.5583218235117896e-06, "loss": 0.81749803, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.2019043, "step": 9709, "time_per_iteration": 2.864179849624634 }, { "auxiliary_loss_clip": 0.01210554, "auxiliary_loss_mlp": 0.01046193, "balance_loss_clip": 1.11683726, "balance_loss_mlp": 1.02473533, "epoch": 0.5837967834059823, "flos": 65394958867200.0, "grad_norm": 0.7706546265753152, "language_loss": 0.56527823, "learning_rate": 1.557941985915844e-06, "loss": 0.58784568, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.21484375, "step": 9710, "time_per_iteration": 6.105860233306885 }, { "auxiliary_loss_clip": 0.0141018, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.24632573, "balance_loss_mlp": 1.0146482, "epoch": 0.5838569066586502, "flos": 25349764101120.0, "grad_norm": 1.45963145397657, "language_loss": 0.66170299, "learning_rate": 1.5575621650827833e-06, "loss": 0.68615431, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.203125, "step": 9711, "time_per_iteration": 2.8632452487945557 }, { "auxiliary_loss_clip": 0.0143401, "auxiliary_loss_mlp": 0.01039902, "balance_loss_clip": 1.26131749, "balance_loss_mlp": 1.01838458, "epoch": 0.5839170299113182, "flos": 22237455006720.0, "grad_norm": 1.918735942555622, "language_loss": 0.7976433, "learning_rate": 1.5571823610270085e-06, "loss": 0.82238245, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.21520996, "step": 9712, "time_per_iteration": 2.8469126224517822 }, { "auxiliary_loss_clip": 0.01409694, "auxiliary_loss_mlp": 0.01032393, "balance_loss_clip": 1.24368906, "balance_loss_mlp": 1.01210403, "epoch": 0.5839771531639861, "flos": 22210099885440.0, "grad_norm": 8.447012770526932, "language_loss": 0.7393434, "learning_rate": 1.5568025737629234e-06, "loss": 0.76376426, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20300293, "step": 9713, "time_per_iteration": 2.8470218181610107 }, { "auxiliary_loss_clip": 0.01426276, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.25399649, "balance_loss_mlp": 1.0172286, "epoch": 0.5840372764166541, "flos": 22429610248320.0, "grad_norm": 5.5438793445392385, "language_loss": 0.70712888, "learning_rate": 1.5564228033049292e-06, "loss": 0.73177993, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.21606445, "step": 9714, "time_per_iteration": 2.842230796813965 }, { "auxiliary_loss_clip": 0.01421276, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.251351, "balance_loss_mlp": 1.01242852, "epoch": 0.5840973996693221, "flos": 19837681309440.0, "grad_norm": 1.8134360816003714, "language_loss": 0.80482936, "learning_rate": 1.5560430496674268e-06, "loss": 0.82937944, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21313477, "step": 9715, "time_per_iteration": 3.0888350009918213 }, { "auxiliary_loss_clip": 0.01406116, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.24066699, "balance_loss_mlp": 1.0110209, "epoch": 0.5841575229219901, "flos": 21153069429120.0, "grad_norm": 1.989117865866564, "language_loss": 0.74667025, "learning_rate": 1.5556633128648167e-06, "loss": 0.77104294, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20129395, "step": 9716, "time_per_iteration": 2.8566577434539795 }, { "auxiliary_loss_clip": 0.01400602, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.23708248, "balance_loss_mlp": 1.01506674, "epoch": 0.5842176461746581, "flos": 24649942492800.0, "grad_norm": 1.8996633850271123, "language_loss": 0.75274879, "learning_rate": 1.5552835929114976e-06, "loss": 0.77710509, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19970703, "step": 9717, "time_per_iteration": 2.8826334476470947 }, { "auxiliary_loss_clip": 0.01411661, "auxiliary_loss_mlp": 0.01038806, "balance_loss_clip": 1.24576128, "balance_loss_mlp": 1.01852894, "epoch": 0.584277769427326, "flos": 19139398024320.0, "grad_norm": 2.2245987653615007, "language_loss": 0.81622529, "learning_rate": 1.5549038898218697e-06, "loss": 0.84072989, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20275879, "step": 9718, "time_per_iteration": 2.849442481994629 }, { "auxiliary_loss_clip": 0.01406182, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.24128866, "balance_loss_mlp": 1.01508343, "epoch": 0.584337892679994, "flos": 22685524692480.0, "grad_norm": 3.3902088437828617, "language_loss": 0.68191838, "learning_rate": 1.5545242036103306e-06, "loss": 0.70634311, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.2121582, "step": 9719, "time_per_iteration": 2.8596949577331543 }, { "auxiliary_loss_clip": 0.01408322, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.24099588, "balance_loss_mlp": 1.01447237, "epoch": 0.5843980159326619, "flos": 31296297893760.0, "grad_norm": 3.0566714578921683, "language_loss": 0.76728868, "learning_rate": 1.5541445342912786e-06, "loss": 0.79172707, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21044922, "step": 9720, "time_per_iteration": 2.942124128341675 }, { "auxiliary_loss_clip": 0.01419948, "auxiliary_loss_mlp": 0.01039574, "balance_loss_clip": 1.25076485, "balance_loss_mlp": 1.01897478, "epoch": 0.58445813918533, "flos": 22758559079040.0, "grad_norm": 1.6918560297080556, "language_loss": 0.83246863, "learning_rate": 1.5537648818791105e-06, "loss": 0.85706377, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20605469, "step": 9721, "time_per_iteration": 2.8595967292785645 }, { "auxiliary_loss_clip": 0.01203196, "auxiliary_loss_mlp": 0.010337, "balance_loss_clip": 1.11259675, "balance_loss_mlp": 1.00985813, "epoch": 0.5845182624379979, "flos": 60714559589760.0, "grad_norm": 0.9330605472687717, "language_loss": 0.71373463, "learning_rate": 1.5533852463882226e-06, "loss": 0.73610353, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.23828125, "step": 9722, "time_per_iteration": 3.398693561553955 }, { "auxiliary_loss_clip": 0.01409672, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.24361157, "balance_loss_mlp": 1.01577246, "epoch": 0.5845783856906659, "flos": 16371059034240.0, "grad_norm": 2.3128844263992177, "language_loss": 0.90350217, "learning_rate": 1.5530056278330113e-06, "loss": 0.92795497, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19836426, "step": 9723, "time_per_iteration": 2.807891368865967 }, { "auxiliary_loss_clip": 0.01403476, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.23950005, "balance_loss_mlp": 1.01647615, "epoch": 0.5846385089433338, "flos": 20092826592000.0, "grad_norm": 1.6512253282716858, "language_loss": 0.69302917, "learning_rate": 1.5526260262278709e-06, "loss": 0.71742713, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19836426, "step": 9724, "time_per_iteration": 2.871147632598877 }, { "auxiliary_loss_clip": 0.01413641, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.24495327, "balance_loss_mlp": 1.01527524, "epoch": 0.5846986321960018, "flos": 17319917877120.0, "grad_norm": 2.25761214802707, "language_loss": 0.87255704, "learning_rate": 1.552246441587197e-06, "loss": 0.89705175, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20544434, "step": 9725, "time_per_iteration": 2.8130106925964355 }, { "auxiliary_loss_clip": 0.01429999, "auxiliary_loss_mlp": 0.01041097, "balance_loss_clip": 1.2596103, "balance_loss_mlp": 1.02009273, "epoch": 0.5847587554486697, "flos": 17205773950080.0, "grad_norm": 1.5457931101314528, "language_loss": 0.83382905, "learning_rate": 1.5518668739253821e-06, "loss": 0.85853994, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20996094, "step": 9726, "time_per_iteration": 2.8753154277801514 }, { "auxiliary_loss_clip": 0.0143113, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.26199031, "balance_loss_mlp": 1.01448333, "epoch": 0.5848188787013378, "flos": 24538241784960.0, "grad_norm": 1.9164949846457244, "language_loss": 0.67620242, "learning_rate": 1.5514873232568206e-06, "loss": 0.70086324, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20458984, "step": 9727, "time_per_iteration": 2.870342493057251 }, { "auxiliary_loss_clip": 0.01421995, "auxiliary_loss_mlp": 0.01037348, "balance_loss_clip": 1.25446153, "balance_loss_mlp": 1.01770234, "epoch": 0.5848790019540057, "flos": 20637349488000.0, "grad_norm": 1.7431372379091086, "language_loss": 0.82298315, "learning_rate": 1.5511077895959055e-06, "loss": 0.84757662, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19641113, "step": 9728, "time_per_iteration": 2.8334927558898926 }, { "auxiliary_loss_clip": 0.01395764, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.23472357, "balance_loss_mlp": 1.01588917, "epoch": 0.5849391252066737, "flos": 22428976821120.0, "grad_norm": 2.399218908270919, "language_loss": 0.78991067, "learning_rate": 1.550728272957027e-06, "loss": 0.81422317, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19592285, "step": 9729, "time_per_iteration": 2.8994829654693604 }, { "auxiliary_loss_clip": 0.01415175, "auxiliary_loss_mlp": 0.01041196, "balance_loss_clip": 1.24756444, "balance_loss_mlp": 1.02062023, "epoch": 0.5849992484593417, "flos": 25421893591680.0, "grad_norm": 1.902152354615498, "language_loss": 0.71950924, "learning_rate": 1.5503487733545782e-06, "loss": 0.74407297, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20581055, "step": 9730, "time_per_iteration": 4.361703634262085 }, { "auxiliary_loss_clip": 0.01413202, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.24465179, "balance_loss_mlp": 1.01511991, "epoch": 0.5850593717120096, "flos": 21073972239360.0, "grad_norm": 1.5852362300709648, "language_loss": 0.79171592, "learning_rate": 1.5499692908029482e-06, "loss": 0.81620717, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20812988, "step": 9731, "time_per_iteration": 2.822007179260254 }, { "auxiliary_loss_clip": 0.01405986, "auxiliary_loss_mlp": 0.0103639, "balance_loss_clip": 1.23977923, "balance_loss_mlp": 1.01688719, "epoch": 0.5851194949646776, "flos": 25312500368640.0, "grad_norm": 5.280574158717184, "language_loss": 0.7115221, "learning_rate": 1.549589825316528e-06, "loss": 0.73594582, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19482422, "step": 9732, "time_per_iteration": 2.886577606201172 }, { "auxiliary_loss_clip": 0.01430701, "auxiliary_loss_mlp": 0.0104181, "balance_loss_clip": 1.25970888, "balance_loss_mlp": 1.0203042, "epoch": 0.5851796182173455, "flos": 23597888964480.0, "grad_norm": 1.7887547448390424, "language_loss": 0.5388974, "learning_rate": 1.5492103769097075e-06, "loss": 0.56362259, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21508789, "step": 9733, "time_per_iteration": 2.872915744781494 }, { "auxiliary_loss_clip": 0.01417814, "auxiliary_loss_mlp": 0.01043549, "balance_loss_clip": 1.25116599, "balance_loss_mlp": 1.02167439, "epoch": 0.5852397414700136, "flos": 24832370102400.0, "grad_norm": 2.0317040395203176, "language_loss": 0.88521552, "learning_rate": 1.5488309455968739e-06, "loss": 0.90982914, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21875, "step": 9734, "time_per_iteration": 2.8412466049194336 }, { "auxiliary_loss_clip": 0.01395394, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.23458493, "balance_loss_mlp": 1.01912642, "epoch": 0.5852998647226815, "flos": 19947255511680.0, "grad_norm": 1.7300119149582647, "language_loss": 0.72994834, "learning_rate": 1.5484515313924163e-06, "loss": 0.75429785, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.2043457, "step": 9735, "time_per_iteration": 2.8420207500457764 }, { "auxiliary_loss_clip": 0.01420021, "auxiliary_loss_mlp": 0.0104309, "balance_loss_clip": 1.25127113, "balance_loss_mlp": 1.02268147, "epoch": 0.5853599879753495, "flos": 16727272496640.0, "grad_norm": 2.8425307563094337, "language_loss": 0.75422007, "learning_rate": 1.5480721343107217e-06, "loss": 0.77885121, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20410156, "step": 9736, "time_per_iteration": 2.806025981903076 }, { "auxiliary_loss_clip": 0.01409224, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.24268341, "balance_loss_mlp": 1.01418591, "epoch": 0.5854201112280174, "flos": 44472512039040.0, "grad_norm": 1.4823143901071398, "language_loss": 0.70851648, "learning_rate": 1.5476927543661772e-06, "loss": 0.73294681, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19628906, "step": 9737, "time_per_iteration": 3.0624618530273438 }, { "auxiliary_loss_clip": 0.01409191, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.24556518, "balance_loss_mlp": 1.01672864, "epoch": 0.5854802344806854, "flos": 20348786280960.0, "grad_norm": 1.8397011856927914, "language_loss": 0.83081049, "learning_rate": 1.547313391573169e-06, "loss": 0.8552683, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.1986084, "step": 9738, "time_per_iteration": 2.8310129642486572 }, { "auxiliary_loss_clip": 0.01421359, "auxiliary_loss_mlp": 0.01037443, "balance_loss_clip": 1.25210381, "balance_loss_mlp": 1.01680779, "epoch": 0.5855403577333533, "flos": 20930301440640.0, "grad_norm": 1.6962774774899998, "language_loss": 0.69210565, "learning_rate": 1.546934045946082e-06, "loss": 0.71669364, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20629883, "step": 9739, "time_per_iteration": 2.84096097946167 }, { "auxiliary_loss_clip": 0.01424089, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.25490987, "balance_loss_mlp": 1.01585603, "epoch": 0.5856004809860214, "flos": 20458269993600.0, "grad_norm": 2.2719069233604765, "language_loss": 0.59862345, "learning_rate": 1.5465547174993017e-06, "loss": 0.62321758, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19445801, "step": 9740, "time_per_iteration": 2.8300857543945312 }, { "auxiliary_loss_clip": 0.01416964, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.2482996, "balance_loss_mlp": 1.0160284, "epoch": 0.5856606042386893, "flos": 19648828938240.0, "grad_norm": 1.7155908534521276, "language_loss": 0.75711799, "learning_rate": 1.5461754062472113e-06, "loss": 0.78165865, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21081543, "step": 9741, "time_per_iteration": 2.8338370323181152 }, { "auxiliary_loss_clip": 0.0141827, "auxiliary_loss_mlp": 0.01038246, "balance_loss_clip": 1.25143301, "balance_loss_mlp": 1.01782513, "epoch": 0.5857207274913573, "flos": 21695330085120.0, "grad_norm": 1.6953138960915284, "language_loss": 0.76763189, "learning_rate": 1.5457961122041959e-06, "loss": 0.79219699, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.2043457, "step": 9742, "time_per_iteration": 4.276289224624634 }, { "auxiliary_loss_clip": 0.01404172, "auxiliary_loss_mlp": 0.01038729, "balance_loss_clip": 1.23949766, "balance_loss_mlp": 1.01894069, "epoch": 0.5857808507440253, "flos": 23192421897600.0, "grad_norm": 2.454588667504928, "language_loss": 0.76001614, "learning_rate": 1.5454168353846369e-06, "loss": 0.78444517, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19787598, "step": 9743, "time_per_iteration": 2.9690585136413574 }, { "auxiliary_loss_clip": 0.01408785, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.24623775, "balance_loss_mlp": 1.01304245, "epoch": 0.5858409739966932, "flos": 27246757870080.0, "grad_norm": 1.7325491897041374, "language_loss": 0.81814384, "learning_rate": 1.5450375758029172e-06, "loss": 0.8425678, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20581055, "step": 9744, "time_per_iteration": 4.278918981552124 }, { "auxiliary_loss_clip": 0.01434043, "auxiliary_loss_mlp": 0.01033649, "balance_loss_clip": 1.26322079, "balance_loss_mlp": 1.01383615, "epoch": 0.5859010972493612, "flos": 27867120330240.0, "grad_norm": 1.9582588728380177, "language_loss": 0.72696918, "learning_rate": 1.5446583334734183e-06, "loss": 0.75164616, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.19787598, "step": 9745, "time_per_iteration": 4.308863639831543 }, { "auxiliary_loss_clip": 0.01205847, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.1119597, "balance_loss_mlp": 1.01269686, "epoch": 0.5859612205020291, "flos": 70041804485760.0, "grad_norm": 0.7310070387014701, "language_loss": 0.53341258, "learning_rate": 1.5442791084105204e-06, "loss": 0.55580688, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20898438, "step": 9746, "time_per_iteration": 3.462275266647339 }, { "auxiliary_loss_clip": 0.01422557, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.25373399, "balance_loss_mlp": 1.01380777, "epoch": 0.5860213437546972, "flos": 24065350686720.0, "grad_norm": 1.9716939642908846, "language_loss": 0.74194396, "learning_rate": 1.5438999006286054e-06, "loss": 0.76652646, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21899414, "step": 9747, "time_per_iteration": 2.859715461730957 }, { "auxiliary_loss_clip": 0.01421859, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.25294447, "balance_loss_mlp": 1.0159651, "epoch": 0.5860814670073651, "flos": 18955477336320.0, "grad_norm": 1.9266191051005657, "language_loss": 0.81247878, "learning_rate": 1.543520710142051e-06, "loss": 0.8370682, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21118164, "step": 9748, "time_per_iteration": 2.849375009536743 }, { "auxiliary_loss_clip": 0.01416903, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.2477957, "balance_loss_mlp": 1.01371908, "epoch": 0.5861415902600331, "flos": 22571245031040.0, "grad_norm": 1.6776698696459076, "language_loss": 0.72543001, "learning_rate": 1.5431415369652375e-06, "loss": 0.74995375, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21740723, "step": 9749, "time_per_iteration": 2.828810691833496 }, { "auxiliary_loss_clip": 0.01401564, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.23793912, "balance_loss_mlp": 1.01234972, "epoch": 0.586201713512701, "flos": 14400080737920.0, "grad_norm": 2.4014663441266713, "language_loss": 0.75809509, "learning_rate": 1.5427623811125428e-06, "loss": 0.78244537, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.21130371, "step": 9750, "time_per_iteration": 2.830185890197754 }, { "auxiliary_loss_clip": 0.01404267, "auxiliary_loss_mlp": 0.0103814, "balance_loss_clip": 1.23937488, "balance_loss_mlp": 1.01514459, "epoch": 0.586261836765369, "flos": 19507556113920.0, "grad_norm": 1.6437950299689497, "language_loss": 0.72261631, "learning_rate": 1.542383242598344e-06, "loss": 0.74704033, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.22998047, "step": 9751, "time_per_iteration": 2.8693981170654297 }, { "auxiliary_loss_clip": 0.01435894, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.26125252, "balance_loss_mlp": 1.01011157, "epoch": 0.5863219600180369, "flos": 20710926812160.0, "grad_norm": 1.8699852607352607, "language_loss": 0.75624955, "learning_rate": 1.5420041214370184e-06, "loss": 0.7809248, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 1.74609375, "router_z_loss_mlp": 0.21520996, "step": 9752, "time_per_iteration": 2.8478024005889893 }, { "auxiliary_loss_clip": 0.01419155, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.25143194, "balance_loss_mlp": 1.01461196, "epoch": 0.586382083270705, "flos": 19801639186560.0, "grad_norm": 1.8434144446791156, "language_loss": 0.78529, "learning_rate": 1.541625017642943e-06, "loss": 0.8098433, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21569824, "step": 9753, "time_per_iteration": 2.8551583290100098 }, { "auxiliary_loss_clip": 0.01403275, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.24091196, "balance_loss_mlp": 1.01147485, "epoch": 0.5864422065233729, "flos": 16507173951360.0, "grad_norm": 1.7236126437697146, "language_loss": 0.71891791, "learning_rate": 1.5412459312304927e-06, "loss": 0.74327928, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.21386719, "step": 9754, "time_per_iteration": 2.977651357650757 }, { "auxiliary_loss_clip": 0.0140937, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.24121499, "balance_loss_mlp": 1.01499867, "epoch": 0.5865023297760409, "flos": 20423178011520.0, "grad_norm": 1.8643068303245285, "language_loss": 0.73119134, "learning_rate": 1.540866862214043e-06, "loss": 0.75564796, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.21313477, "step": 9755, "time_per_iteration": 2.94720721244812 }, { "auxiliary_loss_clip": 0.01219343, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.12231946, "balance_loss_mlp": 1.00446892, "epoch": 0.5865624530287089, "flos": 63379658649600.0, "grad_norm": 0.7426920270506249, "language_loss": 0.56943321, "learning_rate": 1.540487810607967e-06, "loss": 0.59201658, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 0.96875, "router_z_loss_mlp": 0.34570312, "step": 9756, "time_per_iteration": 3.354534387588501 }, { "auxiliary_loss_clip": 0.01417797, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.2518692, "balance_loss_mlp": 1.01282632, "epoch": 0.5866225762813768, "flos": 27027654710400.0, "grad_norm": 2.115363992682989, "language_loss": 0.77410996, "learning_rate": 1.5401087764266396e-06, "loss": 0.7986182, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20202637, "step": 9757, "time_per_iteration": 2.9005088806152344 }, { "auxiliary_loss_clip": 0.01212804, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.11777997, "balance_loss_mlp": 1.01959538, "epoch": 0.5866826995340448, "flos": 73019473758720.0, "grad_norm": 0.85788848032698, "language_loss": 0.60561132, "learning_rate": 1.5397297596844337e-06, "loss": 0.62816906, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 0.94921875, "router_z_loss_mlp": 0.23339844, "step": 9758, "time_per_iteration": 3.3590505123138428 }, { "auxiliary_loss_clip": 0.0142059, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.24923062, "balance_loss_mlp": 1.01250577, "epoch": 0.5867428227867127, "flos": 21295020925440.0, "grad_norm": 3.6657645710813136, "language_loss": 0.73211336, "learning_rate": 1.5393507603957212e-06, "loss": 0.75666618, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.22180176, "step": 9759, "time_per_iteration": 2.8505609035491943 }, { "auxiliary_loss_clip": 0.01407993, "auxiliary_loss_mlp": 0.01041475, "balance_loss_clip": 1.24138796, "balance_loss_mlp": 1.02004147, "epoch": 0.5868029460393808, "flos": 33480497525760.0, "grad_norm": 1.8508978607633233, "language_loss": 0.73830545, "learning_rate": 1.5389717785748742e-06, "loss": 0.7628001, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21435547, "step": 9760, "time_per_iteration": 2.968231678009033 }, { "auxiliary_loss_clip": 0.01410351, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.24344063, "balance_loss_mlp": 1.01364183, "epoch": 0.5868630692920487, "flos": 17897089536000.0, "grad_norm": 1.8065495457034064, "language_loss": 0.73122311, "learning_rate": 1.5385928142362637e-06, "loss": 0.75567269, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20983887, "step": 9761, "time_per_iteration": 2.865251064300537 }, { "auxiliary_loss_clip": 0.01413776, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.24372983, "balance_loss_mlp": 1.0107106, "epoch": 0.5869231925447167, "flos": 21045485998080.0, "grad_norm": 2.072671030302661, "language_loss": 0.7596218, "learning_rate": 1.5382138673942597e-06, "loss": 0.78409696, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.23046875, "step": 9762, "time_per_iteration": 2.865057945251465 }, { "auxiliary_loss_clip": 0.01406283, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.2428292, "balance_loss_mlp": 1.01577544, "epoch": 0.5869833157973846, "flos": 74761603280640.0, "grad_norm": 1.4177386353826051, "language_loss": 0.72976625, "learning_rate": 1.5378349380632317e-06, "loss": 0.75420725, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.22021484, "step": 9763, "time_per_iteration": 3.285033702850342 }, { "auxiliary_loss_clip": 0.01406929, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.2407577, "balance_loss_mlp": 1.01474929, "epoch": 0.5870434390500526, "flos": 17647826077440.0, "grad_norm": 1.6042708575208982, "language_loss": 0.80775142, "learning_rate": 1.53745602625755e-06, "loss": 0.83218044, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21228027, "step": 9764, "time_per_iteration": 2.8754215240478516 }, { "auxiliary_loss_clip": 0.01409266, "auxiliary_loss_mlp": 0.01036748, "balance_loss_clip": 1.24235678, "balance_loss_mlp": 1.01431251, "epoch": 0.5871035623027205, "flos": 21515888632320.0, "grad_norm": 1.6824022303170003, "language_loss": 0.79853934, "learning_rate": 1.5370771319915819e-06, "loss": 0.82299948, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.2244873, "step": 9765, "time_per_iteration": 4.228884220123291 }, { "auxiliary_loss_clip": 0.01395292, "auxiliary_loss_mlp": 0.01034881, "balance_loss_clip": 1.23209095, "balance_loss_mlp": 1.01369715, "epoch": 0.5871636855553886, "flos": 13559438753280.0, "grad_norm": 1.6642002589617249, "language_loss": 0.8401767, "learning_rate": 1.5366982552796947e-06, "loss": 0.86447847, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.21179199, "step": 9766, "time_per_iteration": 2.8089590072631836 }, { "auxiliary_loss_clip": 0.01420132, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.24911237, "balance_loss_mlp": 1.01749527, "epoch": 0.5872238088080565, "flos": 26223914499840.0, "grad_norm": 1.5625989381754892, "language_loss": 0.6981132, "learning_rate": 1.536319396136257e-06, "loss": 0.72270012, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21057129, "step": 9767, "time_per_iteration": 2.8680531978607178 }, { "auxiliary_loss_clip": 0.01415054, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.24596286, "balance_loss_mlp": 1.01387298, "epoch": 0.5872839320607245, "flos": 30677609491200.0, "grad_norm": 2.341163651940572, "language_loss": 0.64439225, "learning_rate": 1.5359405545756336e-06, "loss": 0.66890061, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21911621, "step": 9768, "time_per_iteration": 2.957693576812744 }, { "auxiliary_loss_clip": 0.01207296, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.11502314, "balance_loss_mlp": 1.0049088, "epoch": 0.5873440553133924, "flos": 60336239402880.0, "grad_norm": 0.713354606398054, "language_loss": 0.54042667, "learning_rate": 1.5355617306121914e-06, "loss": 0.56285203, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.30273438, "step": 9769, "time_per_iteration": 3.3547139167785645 }, { "auxiliary_loss_clip": 0.01406119, "auxiliary_loss_mlp": 0.01038855, "balance_loss_clip": 1.23936796, "balance_loss_mlp": 1.01714647, "epoch": 0.5874041785660604, "flos": 21548175436800.0, "grad_norm": 4.8035300741887275, "language_loss": 0.71010739, "learning_rate": 1.5351829242602945e-06, "loss": 0.73455715, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.21704102, "step": 9770, "time_per_iteration": 2.862474203109741 }, { "auxiliary_loss_clip": 0.01402673, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.23687553, "balance_loss_mlp": 1.0112884, "epoch": 0.5874643018187284, "flos": 24399547914240.0, "grad_norm": 3.157778078545333, "language_loss": 0.6853174, "learning_rate": 1.5348041355343077e-06, "loss": 0.70967269, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21557617, "step": 9771, "time_per_iteration": 2.8613970279693604 }, { "auxiliary_loss_clip": 0.01418015, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.24702597, "balance_loss_mlp": 1.01435947, "epoch": 0.5875244250713964, "flos": 28159484100480.0, "grad_norm": 1.527585659132837, "language_loss": 0.67021358, "learning_rate": 1.5344253644485954e-06, "loss": 0.69475007, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.21276855, "step": 9772, "time_per_iteration": 2.9079885482788086 }, { "auxiliary_loss_clip": 0.01429966, "auxiliary_loss_mlp": 0.01046363, "balance_loss_clip": 1.25811386, "balance_loss_mlp": 1.02011299, "epoch": 0.5875845483240644, "flos": 25823514850560.0, "grad_norm": 1.5402411330176966, "language_loss": 0.75222141, "learning_rate": 1.534046611017519e-06, "loss": 0.77698469, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.26257324, "step": 9773, "time_per_iteration": 2.8697237968444824 }, { "auxiliary_loss_clip": 0.01428768, "auxiliary_loss_mlp": 0.01038747, "balance_loss_clip": 1.25894475, "balance_loss_mlp": 1.01669335, "epoch": 0.5876446715767323, "flos": 26917854284160.0, "grad_norm": 3.892099943724368, "language_loss": 0.54574144, "learning_rate": 1.5336678752554421e-06, "loss": 0.57041657, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.22045898, "step": 9774, "time_per_iteration": 2.907550573348999 }, { "auxiliary_loss_clip": 0.01415341, "auxiliary_loss_mlp": 0.01039431, "balance_loss_clip": 1.24889088, "balance_loss_mlp": 1.01634049, "epoch": 0.5877047948294003, "flos": 36698670748800.0, "grad_norm": 2.158202766055543, "language_loss": 0.65643072, "learning_rate": 1.5332891571767264e-06, "loss": 0.68097842, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.23095703, "step": 9775, "time_per_iteration": 2.953894853591919 }, { "auxiliary_loss_clip": 0.0141156, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.24375391, "balance_loss_mlp": 1.01350045, "epoch": 0.5877649180820682, "flos": 26736105346560.0, "grad_norm": 2.290949921398518, "language_loss": 0.74032336, "learning_rate": 1.5329104567957326e-06, "loss": 0.76479185, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21801758, "step": 9776, "time_per_iteration": 2.8983731269836426 }, { "auxiliary_loss_clip": 0.01428107, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.25787973, "balance_loss_mlp": 1.01548827, "epoch": 0.5878250413347362, "flos": 21041911658880.0, "grad_norm": 2.1124310729131643, "language_loss": 0.75294745, "learning_rate": 1.532531774126821e-06, "loss": 0.77759719, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21386719, "step": 9777, "time_per_iteration": 4.353303670883179 }, { "auxiliary_loss_clip": 0.01395105, "auxiliary_loss_mlp": 0.01039358, "balance_loss_clip": 1.23311949, "balance_loss_mlp": 1.01871109, "epoch": 0.5878851645874041, "flos": 25495335181440.0, "grad_norm": 1.508216958948938, "language_loss": 0.74983591, "learning_rate": 1.5321531091843512e-06, "loss": 0.77418053, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.2064209, "step": 9778, "time_per_iteration": 2.9330766201019287 }, { "auxiliary_loss_clip": 0.01421128, "auxiliary_loss_mlp": 0.01041999, "balance_loss_clip": 1.2532115, "balance_loss_mlp": 1.01937318, "epoch": 0.5879452878400722, "flos": 23779683146880.0, "grad_norm": 1.9796569777331752, "language_loss": 0.70636737, "learning_rate": 1.5317744619826824e-06, "loss": 0.73099864, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.22631836, "step": 9779, "time_per_iteration": 5.665513038635254 }, { "auxiliary_loss_clip": 0.01421975, "auxiliary_loss_mlp": 0.01037173, "balance_loss_clip": 1.25111818, "balance_loss_mlp": 1.01508367, "epoch": 0.5880054110927401, "flos": 17834189984640.0, "grad_norm": 2.6973363494466165, "language_loss": 0.67763543, "learning_rate": 1.5313958325361727e-06, "loss": 0.70222694, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.22094727, "step": 9780, "time_per_iteration": 2.8982720375061035 }, { "auxiliary_loss_clip": 0.01426545, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.25741792, "balance_loss_mlp": 1.01818252, "epoch": 0.5880655343454081, "flos": 19472735600640.0, "grad_norm": 2.3411123256132482, "language_loss": 0.72802824, "learning_rate": 1.5310172208591807e-06, "loss": 0.75269306, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21765137, "step": 9781, "time_per_iteration": 2.883565902709961 }, { "auxiliary_loss_clip": 0.01416467, "auxiliary_loss_mlp": 0.01036568, "balance_loss_clip": 1.25019097, "balance_loss_mlp": 1.01521742, "epoch": 0.588125657598076, "flos": 21407536039680.0, "grad_norm": 1.461959583639037, "language_loss": 0.71420193, "learning_rate": 1.5306386269660622e-06, "loss": 0.73873234, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21362305, "step": 9782, "time_per_iteration": 2.894538402557373 }, { "auxiliary_loss_clip": 0.01423959, "auxiliary_loss_mlp": 0.01041084, "balance_loss_clip": 1.25302982, "balance_loss_mlp": 1.01868439, "epoch": 0.588185780850744, "flos": 16043331813120.0, "grad_norm": 2.4817468152936417, "language_loss": 0.70771277, "learning_rate": 1.5302600508711741e-06, "loss": 0.73236322, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.22399902, "step": 9783, "time_per_iteration": 2.8804900646209717 }, { "auxiliary_loss_clip": 0.01427601, "auxiliary_loss_mlp": 0.01041019, "balance_loss_clip": 1.25758314, "balance_loss_mlp": 1.01897693, "epoch": 0.588245904103412, "flos": 23737623465600.0, "grad_norm": 1.7645899897311215, "language_loss": 0.70276618, "learning_rate": 1.5298814925888719e-06, "loss": 0.7274524, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.22045898, "step": 9784, "time_per_iteration": 2.841827869415283 }, { "auxiliary_loss_clip": 0.01433426, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.26061177, "balance_loss_mlp": 1.01454067, "epoch": 0.58830602735608, "flos": 33815644894080.0, "grad_norm": 1.8946453844456843, "language_loss": 0.69687265, "learning_rate": 1.5295029521335102e-06, "loss": 0.72156954, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.21728516, "step": 9785, "time_per_iteration": 3.025907516479492 }, { "auxiliary_loss_clip": 0.01403351, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.23694491, "balance_loss_mlp": 1.01228452, "epoch": 0.588366150608748, "flos": 17099140659840.0, "grad_norm": 1.9117576067123427, "language_loss": 0.77874821, "learning_rate": 1.5291244295194448e-06, "loss": 0.80311215, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.2076416, "step": 9786, "time_per_iteration": 2.822808027267456 }, { "auxiliary_loss_clip": 0.014168, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.24734592, "balance_loss_mlp": 1.01564658, "epoch": 0.5884262738614159, "flos": 22137291722880.0, "grad_norm": 1.5745817365537436, "language_loss": 0.80008978, "learning_rate": 1.5287459247610276e-06, "loss": 0.82463658, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.22229004, "step": 9787, "time_per_iteration": 2.8554141521453857 }, { "auxiliary_loss_clip": 0.01417322, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.24770713, "balance_loss_mlp": 1.01695514, "epoch": 0.5884863971140839, "flos": 21041459210880.0, "grad_norm": 1.782752941092352, "language_loss": 0.67306292, "learning_rate": 1.5283674378726116e-06, "loss": 0.6976198, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21411133, "step": 9788, "time_per_iteration": 2.8718106746673584 }, { "auxiliary_loss_clip": 0.01408543, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.2432518, "balance_loss_mlp": 1.01763058, "epoch": 0.5885465203667518, "flos": 23815544290560.0, "grad_norm": 2.4386922212561086, "language_loss": 0.8101629, "learning_rate": 1.5279889688685506e-06, "loss": 0.83463353, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20874023, "step": 9789, "time_per_iteration": 2.863879680633545 }, { "auxiliary_loss_clip": 0.0140865, "auxiliary_loss_mlp": 0.01039799, "balance_loss_clip": 1.24321222, "balance_loss_mlp": 1.01687503, "epoch": 0.5886066436194198, "flos": 18889998831360.0, "grad_norm": 1.5431900061695556, "language_loss": 0.70795816, "learning_rate": 1.5276105177631944e-06, "loss": 0.73244262, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.22900391, "step": 9790, "time_per_iteration": 2.8621883392333984 }, { "auxiliary_loss_clip": 0.01405254, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.23927236, "balance_loss_mlp": 1.01820946, "epoch": 0.5886667668720877, "flos": 24800807214720.0, "grad_norm": 1.741889108708411, "language_loss": 0.84905821, "learning_rate": 1.527232084570895e-06, "loss": 0.87350899, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21618652, "step": 9791, "time_per_iteration": 2.8779773712158203 }, { "auxiliary_loss_clip": 0.01426631, "auxiliary_loss_mlp": 0.01045088, "balance_loss_clip": 1.25780535, "balance_loss_mlp": 1.02324915, "epoch": 0.5887268901247558, "flos": 21623834021760.0, "grad_norm": 1.63804883010965, "language_loss": 0.77347553, "learning_rate": 1.5268536693060026e-06, "loss": 0.79819274, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21826172, "step": 9792, "time_per_iteration": 2.850757360458374 }, { "auxiliary_loss_clip": 0.01424348, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.25166464, "balance_loss_mlp": 1.0152638, "epoch": 0.5887870133774237, "flos": 20490466308480.0, "grad_norm": 8.233835187147315, "language_loss": 0.69769752, "learning_rate": 1.5264752719828662e-06, "loss": 0.72231108, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.2175293, "step": 9793, "time_per_iteration": 2.8211119174957275 }, { "auxiliary_loss_clip": 0.01408167, "auxiliary_loss_mlp": 0.0103816, "balance_loss_clip": 1.24195313, "balance_loss_mlp": 1.01597476, "epoch": 0.5888471366300917, "flos": 19215101854080.0, "grad_norm": 1.8123655091741104, "language_loss": 0.60476613, "learning_rate": 1.5260968926158353e-06, "loss": 0.62922943, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.22180176, "step": 9794, "time_per_iteration": 2.8413476943969727 }, { "auxiliary_loss_clip": 0.01425097, "auxiliary_loss_mlp": 0.01043766, "balance_loss_clip": 1.25526524, "balance_loss_mlp": 1.02164114, "epoch": 0.5889072598827596, "flos": 19982030780160.0, "grad_norm": 1.5973947531394181, "language_loss": 0.66176975, "learning_rate": 1.525718531219257e-06, "loss": 0.68645835, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.22131348, "step": 9795, "time_per_iteration": 2.8608076572418213 }, { "auxiliary_loss_clip": 0.01409209, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 1.24433267, "balance_loss_mlp": 1.0207566, "epoch": 0.5889673831354276, "flos": 20751131456640.0, "grad_norm": 1.7100737307571143, "language_loss": 0.75115013, "learning_rate": 1.5253401878074801e-06, "loss": 0.77567536, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.22558594, "step": 9796, "time_per_iteration": 2.8349525928497314 }, { "auxiliary_loss_clip": 0.01420096, "auxiliary_loss_mlp": 0.01034342, "balance_loss_clip": 1.252267, "balance_loss_mlp": 1.0128243, "epoch": 0.5890275063880956, "flos": 25311731207040.0, "grad_norm": 1.5355591333692875, "language_loss": 0.83964109, "learning_rate": 1.5249618623948507e-06, "loss": 0.86418551, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21508789, "step": 9797, "time_per_iteration": 2.92546010017395 }, { "auxiliary_loss_clip": 0.0141409, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.24711323, "balance_loss_mlp": 1.01711917, "epoch": 0.5890876296407636, "flos": 11772833592960.0, "grad_norm": 2.464395152957588, "language_loss": 0.80028337, "learning_rate": 1.5245835549957152e-06, "loss": 0.82480228, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20678711, "step": 9798, "time_per_iteration": 2.8469839096069336 }, { "auxiliary_loss_clip": 0.01407411, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.24295354, "balance_loss_mlp": 1.01968908, "epoch": 0.5891477528934316, "flos": 13597335912960.0, "grad_norm": 3.5299519711984937, "language_loss": 0.74721336, "learning_rate": 1.5242052656244186e-06, "loss": 0.77168947, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20507812, "step": 9799, "time_per_iteration": 2.8385612964630127 }, { "auxiliary_loss_clip": 0.01428796, "auxiliary_loss_mlp": 0.0104015, "balance_loss_clip": 1.25709832, "balance_loss_mlp": 1.01633215, "epoch": 0.5892078761460995, "flos": 15057525951360.0, "grad_norm": 2.491596002619927, "language_loss": 0.77625465, "learning_rate": 1.5238269942953064e-06, "loss": 0.80094409, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.23815918, "step": 9800, "time_per_iteration": 4.290425062179565 }, { "auxiliary_loss_clip": 0.01429815, "auxiliary_loss_mlp": 0.01041667, "balance_loss_clip": 1.25987935, "balance_loss_mlp": 1.01900554, "epoch": 0.5892679993987675, "flos": 15786557717760.0, "grad_norm": 1.8725256492668771, "language_loss": 0.7988987, "learning_rate": 1.523448741022722e-06, "loss": 0.82361352, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.22680664, "step": 9801, "time_per_iteration": 2.9512622356414795 }, { "auxiliary_loss_clip": 0.01428657, "auxiliary_loss_mlp": 0.01039974, "balance_loss_clip": 1.25820458, "balance_loss_mlp": 1.01912439, "epoch": 0.5893281226514354, "flos": 25276186776960.0, "grad_norm": 1.7107336230308925, "language_loss": 0.66743052, "learning_rate": 1.5230705058210088e-06, "loss": 0.69211686, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20849609, "step": 9802, "time_per_iteration": 2.9146339893341064 }, { "auxiliary_loss_clip": 0.01409924, "auxiliary_loss_mlp": 0.01040865, "balance_loss_clip": 1.24332333, "balance_loss_mlp": 1.01828694, "epoch": 0.5893882459041034, "flos": 19466220349440.0, "grad_norm": 1.5254193940694714, "language_loss": 0.78666866, "learning_rate": 1.5226922887045108e-06, "loss": 0.8111766, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.22595215, "step": 9803, "time_per_iteration": 2.8247745037078857 }, { "auxiliary_loss_clip": 0.01431093, "auxiliary_loss_mlp": 0.01043146, "balance_loss_clip": 1.26012897, "balance_loss_mlp": 1.02075887, "epoch": 0.5894483691567713, "flos": 20644498166400.0, "grad_norm": 1.532774695877252, "language_loss": 0.73625517, "learning_rate": 1.5223140896875686e-06, "loss": 0.76099759, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.22375488, "step": 9804, "time_per_iteration": 2.875318765640259 }, { "auxiliary_loss_clip": 0.01413199, "auxiliary_loss_mlp": 0.0103982, "balance_loss_clip": 1.24652433, "balance_loss_mlp": 1.01868439, "epoch": 0.5895084924094394, "flos": 17785343583360.0, "grad_norm": 1.5133686442275884, "language_loss": 0.75602233, "learning_rate": 1.5219359087845234e-06, "loss": 0.78055251, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.21142578, "step": 9805, "time_per_iteration": 2.8312366008758545 }, { "auxiliary_loss_clip": 0.01451029, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.27554917, "balance_loss_mlp": 1.02105355, "epoch": 0.5895686156621073, "flos": 20130814241280.0, "grad_norm": 1.6678052450980936, "language_loss": 0.79024547, "learning_rate": 1.5215577460097174e-06, "loss": 0.81517076, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 1.75390625, "router_z_loss_mlp": 0.2043457, "step": 9806, "time_per_iteration": 2.8377721309661865 }, { "auxiliary_loss_clip": 0.01416201, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.24745727, "balance_loss_mlp": 1.01721048, "epoch": 0.5896287389147753, "flos": 20859800762880.0, "grad_norm": 1.9986619461404835, "language_loss": 0.77634662, "learning_rate": 1.5211796013774887e-06, "loss": 0.80089653, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21569824, "step": 9807, "time_per_iteration": 2.82161545753479 }, { "auxiliary_loss_clip": 0.01439912, "auxiliary_loss_mlp": 0.01035968, "balance_loss_clip": 1.26606822, "balance_loss_mlp": 1.01422453, "epoch": 0.5896888621674432, "flos": 14545968531840.0, "grad_norm": 1.9070503563223435, "language_loss": 0.75025046, "learning_rate": 1.5208014749021786e-06, "loss": 0.77500927, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 1.73828125, "router_z_loss_mlp": 0.21728516, "step": 9808, "time_per_iteration": 2.8112308979034424 }, { "auxiliary_loss_clip": 0.01433607, "auxiliary_loss_mlp": 0.01043768, "balance_loss_clip": 1.26245964, "balance_loss_mlp": 1.02073693, "epoch": 0.5897489854201112, "flos": 20896566802560.0, "grad_norm": 2.840396125797323, "language_loss": 0.72971278, "learning_rate": 1.5204233665981236e-06, "loss": 0.75448656, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.23010254, "step": 9809, "time_per_iteration": 2.8702399730682373 }, { "auxiliary_loss_clip": 0.01433572, "auxiliary_loss_mlp": 0.01041505, "balance_loss_clip": 1.25963759, "balance_loss_mlp": 1.01966596, "epoch": 0.5898091086727792, "flos": 20020742346240.0, "grad_norm": 9.092244163769664, "language_loss": 0.84294456, "learning_rate": 1.5200452764796627e-06, "loss": 0.86769533, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.21850586, "step": 9810, "time_per_iteration": 2.842733144760132 }, { "auxiliary_loss_clip": 0.01413662, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.24925053, "balance_loss_mlp": 1.01828766, "epoch": 0.5898692319254472, "flos": 16261892035200.0, "grad_norm": 1.569066174995993, "language_loss": 0.82339287, "learning_rate": 1.5196672045611336e-06, "loss": 0.84792894, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.21643066, "step": 9811, "time_per_iteration": 2.8572592735290527 }, { "auxiliary_loss_clip": 0.01429093, "auxiliary_loss_mlp": 0.01035339, "balance_loss_clip": 1.25552237, "balance_loss_mlp": 1.01407194, "epoch": 0.5899293551781152, "flos": 20458134259200.0, "grad_norm": 2.3300491936524668, "language_loss": 0.78061247, "learning_rate": 1.5192891508568715e-06, "loss": 0.80525672, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 1.73730469, "router_z_loss_mlp": 0.21252441, "step": 9812, "time_per_iteration": 4.254642009735107 }, { "auxiliary_loss_clip": 0.01440012, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.27018762, "balance_loss_mlp": 1.02051544, "epoch": 0.5899894784307831, "flos": 13889744928000.0, "grad_norm": 2.0547205341995443, "language_loss": 0.71239495, "learning_rate": 1.5189111153812133e-06, "loss": 0.73719454, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.19421387, "step": 9813, "time_per_iteration": 2.8059709072113037 }, { "auxiliary_loss_clip": 0.01428811, "auxiliary_loss_mlp": 0.01042551, "balance_loss_clip": 1.25828612, "balance_loss_mlp": 1.02115273, "epoch": 0.5900496016834511, "flos": 20093324284800.0, "grad_norm": 1.7191213356721373, "language_loss": 0.72640765, "learning_rate": 1.518533098148494e-06, "loss": 0.75112128, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.21398926, "step": 9814, "time_per_iteration": 5.659720182418823 }, { "auxiliary_loss_clip": 0.01422501, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.2545023, "balance_loss_mlp": 1.01499605, "epoch": 0.590109724936119, "flos": 20266883913600.0, "grad_norm": 1.707298452055488, "language_loss": 0.79332733, "learning_rate": 1.5181550991730476e-06, "loss": 0.81791353, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.21130371, "step": 9815, "time_per_iteration": 2.821929693222046 }, { "auxiliary_loss_clip": 0.01447879, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.27183163, "balance_loss_mlp": 1.01571608, "epoch": 0.590169848188787, "flos": 24243570529920.0, "grad_norm": 2.874969415202789, "language_loss": 0.77173007, "learning_rate": 1.5177771184692083e-06, "loss": 0.79658091, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 1.76171875, "router_z_loss_mlp": 0.21484375, "step": 9816, "time_per_iteration": 2.872300863265991 }, { "auxiliary_loss_clip": 0.01426874, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.25790787, "balance_loss_mlp": 1.01373529, "epoch": 0.590229971441455, "flos": 17793532892160.0, "grad_norm": 2.0108342607588288, "language_loss": 0.82088459, "learning_rate": 1.517399156051309e-06, "loss": 0.84549505, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.2043457, "step": 9817, "time_per_iteration": 2.7997539043426514 }, { "auxiliary_loss_clip": 0.01435377, "auxiliary_loss_mlp": 0.01038219, "balance_loss_clip": 1.26394582, "balance_loss_mlp": 1.01698744, "epoch": 0.590290094694123, "flos": 22247092149120.0, "grad_norm": 1.9881139051613224, "language_loss": 0.77521455, "learning_rate": 1.517021211933682e-06, "loss": 0.79995048, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21252441, "step": 9818, "time_per_iteration": 2.859215497970581 }, { "auxiliary_loss_clip": 0.01428797, "auxiliary_loss_mlp": 0.01037158, "balance_loss_clip": 1.26003504, "balance_loss_mlp": 1.01653504, "epoch": 0.5903502179467909, "flos": 19107880381440.0, "grad_norm": 1.7839089271966944, "language_loss": 0.67288888, "learning_rate": 1.5166432861306592e-06, "loss": 0.69754839, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20617676, "step": 9819, "time_per_iteration": 2.8198301792144775 }, { "auxiliary_loss_clip": 0.01434914, "auxiliary_loss_mlp": 0.0103594, "balance_loss_clip": 1.26578212, "balance_loss_mlp": 1.01482844, "epoch": 0.5904103411994589, "flos": 24244113467520.0, "grad_norm": 1.8906756088886163, "language_loss": 0.79193616, "learning_rate": 1.5162653786565714e-06, "loss": 0.81664473, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.21105957, "step": 9820, "time_per_iteration": 2.8816215991973877 }, { "auxiliary_loss_clip": 0.01195161, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.10387301, "balance_loss_mlp": 1.00639367, "epoch": 0.5904704644521268, "flos": 64904196072960.0, "grad_norm": 0.9184882652932346, "language_loss": 0.65103376, "learning_rate": 1.5158874895257487e-06, "loss": 0.67326963, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.22070312, "step": 9821, "time_per_iteration": 3.379164695739746 }, { "auxiliary_loss_clip": 0.01411877, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.24549174, "balance_loss_mlp": 1.0169661, "epoch": 0.5905305877047948, "flos": 19619528290560.0, "grad_norm": 2.0820411119405495, "language_loss": 0.62166971, "learning_rate": 1.515509618752521e-06, "loss": 0.64617008, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21203613, "step": 9822, "time_per_iteration": 2.8700110912323 }, { "auxiliary_loss_clip": 0.01433779, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.26404595, "balance_loss_mlp": 1.01934624, "epoch": 0.5905907109574628, "flos": 18998984851200.0, "grad_norm": 1.8574427812301137, "language_loss": 0.83319175, "learning_rate": 1.5151317663512173e-06, "loss": 0.85792607, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20300293, "step": 9823, "time_per_iteration": 2.895549774169922 }, { "auxiliary_loss_clip": 0.01422322, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.25537133, "balance_loss_mlp": 1.01508129, "epoch": 0.5906508342101308, "flos": 22210597578240.0, "grad_norm": 2.3349097802913366, "language_loss": 0.74213779, "learning_rate": 1.514753932336165e-06, "loss": 0.76671529, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20361328, "step": 9824, "time_per_iteration": 2.848313570022583 }, { "auxiliary_loss_clip": 0.01460946, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.28164411, "balance_loss_mlp": 1.01678109, "epoch": 0.5907109574627988, "flos": 20896476312960.0, "grad_norm": 2.048505234010724, "language_loss": 0.84476238, "learning_rate": 1.514376116721693e-06, "loss": 0.86975431, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 1.79101562, "router_z_loss_mlp": 0.21472168, "step": 9825, "time_per_iteration": 2.8185672760009766 }, { "auxiliary_loss_clip": 0.01415531, "auxiliary_loss_mlp": 0.01036504, "balance_loss_clip": 1.25041485, "balance_loss_mlp": 1.01753759, "epoch": 0.5907710807154667, "flos": 21516522059520.0, "grad_norm": 1.6892036925207452, "language_loss": 0.77247977, "learning_rate": 1.5139983195221272e-06, "loss": 0.79700017, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1895752, "step": 9826, "time_per_iteration": 2.8062500953674316 }, { "auxiliary_loss_clip": 0.0142199, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.25596547, "balance_loss_mlp": 1.02051139, "epoch": 0.5908312039681347, "flos": 22028622416640.0, "grad_norm": 1.6389661203483195, "language_loss": 0.73298317, "learning_rate": 1.513620540751793e-06, "loss": 0.75760472, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.1965332, "step": 9827, "time_per_iteration": 2.8308017253875732 }, { "auxiliary_loss_clip": 0.01433485, "auxiliary_loss_mlp": 0.01038445, "balance_loss_clip": 1.26346004, "balance_loss_mlp": 1.01915646, "epoch": 0.5908913272208026, "flos": 18488920510080.0, "grad_norm": 1.8883134152676848, "language_loss": 0.80421823, "learning_rate": 1.5132427804250178e-06, "loss": 0.82893741, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19262695, "step": 9828, "time_per_iteration": 2.886791944503784 }, { "auxiliary_loss_clip": 0.01442161, "auxiliary_loss_mlp": 0.01042912, "balance_loss_clip": 1.2714746, "balance_loss_mlp": 1.02222931, "epoch": 0.5909514504734706, "flos": 12319663973760.0, "grad_norm": 2.355532176065077, "language_loss": 0.88997984, "learning_rate": 1.5128650385561241e-06, "loss": 0.91483057, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20678711, "step": 9829, "time_per_iteration": 2.837496042251587 }, { "auxiliary_loss_clip": 0.0120109, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.10858297, "balance_loss_mlp": 1.01141667, "epoch": 0.5910115737261386, "flos": 70244755251840.0, "grad_norm": 0.7567898033891322, "language_loss": 0.57944632, "learning_rate": 1.5124873151594376e-06, "loss": 0.601807, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.23535156, "step": 9830, "time_per_iteration": 3.272918939590454 }, { "auxiliary_loss_clip": 0.01448852, "auxiliary_loss_mlp": 0.01041263, "balance_loss_clip": 1.27323079, "balance_loss_mlp": 1.01929247, "epoch": 0.5910716969788066, "flos": 22027853255040.0, "grad_norm": 3.0435483691038985, "language_loss": 0.76972967, "learning_rate": 1.5121096102492812e-06, "loss": 0.79463083, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.21984863, "step": 9831, "time_per_iteration": 2.8328471183776855 }, { "auxiliary_loss_clip": 0.01408039, "auxiliary_loss_mlp": 0.01037525, "balance_loss_clip": 1.24583125, "balance_loss_mlp": 1.01805782, "epoch": 0.5911318202314745, "flos": 21261557756160.0, "grad_norm": 2.201991716113376, "language_loss": 0.78284991, "learning_rate": 1.5117319238399767e-06, "loss": 0.80730557, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19470215, "step": 9832, "time_per_iteration": 2.85086989402771 }, { "auxiliary_loss_clip": 0.01413259, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.24769163, "balance_loss_mlp": 1.01652408, "epoch": 0.5911919434841425, "flos": 17830525155840.0, "grad_norm": 2.0139928913487344, "language_loss": 0.84351945, "learning_rate": 1.511354255945847e-06, "loss": 0.86802518, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20800781, "step": 9833, "time_per_iteration": 2.844409465789795 }, { "auxiliary_loss_clip": 0.01438533, "auxiliary_loss_mlp": 0.01040922, "balance_loss_clip": 1.26866651, "balance_loss_mlp": 1.02058518, "epoch": 0.5912520667368104, "flos": 20384149731840.0, "grad_norm": 1.837537096996121, "language_loss": 0.75134313, "learning_rate": 1.5109766065812123e-06, "loss": 0.77613771, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20336914, "step": 9834, "time_per_iteration": 2.844228744506836 }, { "auxiliary_loss_clip": 0.01435217, "auxiliary_loss_mlp": 0.01041646, "balance_loss_clip": 1.26636147, "balance_loss_mlp": 1.02124953, "epoch": 0.5913121899894784, "flos": 17939375441280.0, "grad_norm": 2.548979008978037, "language_loss": 0.78923368, "learning_rate": 1.5105989757603942e-06, "loss": 0.81400228, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20410156, "step": 9835, "time_per_iteration": 4.2609264850616455 }, { "auxiliary_loss_clip": 0.01439196, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.26811576, "balance_loss_mlp": 1.01756382, "epoch": 0.5913723132421465, "flos": 22136975009280.0, "grad_norm": 2.6202291647373475, "language_loss": 0.74790889, "learning_rate": 1.5102213634977117e-06, "loss": 0.77268237, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20593262, "step": 9836, "time_per_iteration": 2.858978509902954 }, { "auxiliary_loss_clip": 0.01445785, "auxiliary_loss_mlp": 0.01037696, "balance_loss_clip": 1.27433634, "balance_loss_mlp": 1.01802671, "epoch": 0.5914324364948144, "flos": 15704338636800.0, "grad_norm": 6.244583898013082, "language_loss": 0.83616656, "learning_rate": 1.5098437698074841e-06, "loss": 0.86100137, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.19665527, "step": 9837, "time_per_iteration": 2.815629005432129 }, { "auxiliary_loss_clip": 0.01433485, "auxiliary_loss_mlp": 0.01041695, "balance_loss_clip": 1.26306248, "balance_loss_mlp": 1.02036798, "epoch": 0.5914925597474824, "flos": 22757563693440.0, "grad_norm": 1.6396847184436647, "language_loss": 0.80621326, "learning_rate": 1.5094661947040304e-06, "loss": 0.83096504, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.21325684, "step": 9838, "time_per_iteration": 2.861156940460205 }, { "auxiliary_loss_clip": 0.01430996, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.26068902, "balance_loss_mlp": 1.01887476, "epoch": 0.5915526830001503, "flos": 18301018279680.0, "grad_norm": 1.7796335544552642, "language_loss": 0.70946956, "learning_rate": 1.5090886382016673e-06, "loss": 0.73417044, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.20202637, "step": 9839, "time_per_iteration": 2.830792188644409 }, { "auxiliary_loss_clip": 0.01439969, "auxiliary_loss_mlp": 0.01039488, "balance_loss_clip": 1.26910424, "balance_loss_mlp": 1.01966369, "epoch": 0.5916128062528183, "flos": 17027825575680.0, "grad_norm": 2.0276362048274867, "language_loss": 0.65676427, "learning_rate": 1.5087111003147124e-06, "loss": 0.68155885, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19824219, "step": 9840, "time_per_iteration": 2.8305978775024414 }, { "auxiliary_loss_clip": 0.01433527, "auxiliary_loss_mlp": 0.01038055, "balance_loss_clip": 1.26425648, "balance_loss_mlp": 1.01737201, "epoch": 0.5916729295054862, "flos": 24765036560640.0, "grad_norm": 1.9815151343688935, "language_loss": 0.82455796, "learning_rate": 1.5083335810574813e-06, "loss": 0.8492738, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20690918, "step": 9841, "time_per_iteration": 2.8630168437957764 }, { "auxiliary_loss_clip": 0.01420941, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.25508428, "balance_loss_mlp": 1.01757252, "epoch": 0.5917330527581542, "flos": 15965953925760.0, "grad_norm": 2.3036597982101843, "language_loss": 0.69720495, "learning_rate": 1.507956080444291e-06, "loss": 0.72177893, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18884277, "step": 9842, "time_per_iteration": 2.846956968307495 }, { "auxiliary_loss_clip": 0.01427488, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.25701082, "balance_loss_mlp": 1.0184449, "epoch": 0.5917931760108222, "flos": 23810024424960.0, "grad_norm": 2.109083812385693, "language_loss": 0.8324421, "learning_rate": 1.5075785984894549e-06, "loss": 0.85710239, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20080566, "step": 9843, "time_per_iteration": 2.9326016902923584 }, { "auxiliary_loss_clip": 0.01435101, "auxiliary_loss_mlp": 0.01036913, "balance_loss_clip": 1.26489687, "balance_loss_mlp": 1.01627791, "epoch": 0.5918532992634902, "flos": 23257764668160.0, "grad_norm": 2.516648784544326, "language_loss": 0.83166873, "learning_rate": 1.5072011352072875e-06, "loss": 0.85638881, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2064209, "step": 9844, "time_per_iteration": 2.9020986557006836 }, { "auxiliary_loss_clip": 0.01446096, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.27581847, "balance_loss_mlp": 1.0180918, "epoch": 0.5919134225161581, "flos": 19509184926720.0, "grad_norm": 1.7743798037059866, "language_loss": 0.74821007, "learning_rate": 1.5068236906121032e-06, "loss": 0.77305698, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20495605, "step": 9845, "time_per_iteration": 2.876032590866089 }, { "auxiliary_loss_clip": 0.0143191, "auxiliary_loss_mlp": 0.01039423, "balance_loss_clip": 1.26034451, "balance_loss_mlp": 1.01840615, "epoch": 0.5919735457688261, "flos": 38815310615040.0, "grad_norm": 1.921905535415222, "language_loss": 0.6461674, "learning_rate": 1.506446264718213e-06, "loss": 0.67088073, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20996094, "step": 9846, "time_per_iteration": 2.984344720840454 }, { "auxiliary_loss_clip": 0.01409417, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.24826229, "balance_loss_mlp": 1.01249397, "epoch": 0.592033669021494, "flos": 22174193496960.0, "grad_norm": 2.5911058121346198, "language_loss": 0.76839292, "learning_rate": 1.506068857539931e-06, "loss": 0.79279733, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18530273, "step": 9847, "time_per_iteration": 4.232517719268799 }, { "auxiliary_loss_clip": 0.01430461, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.26053083, "balance_loss_mlp": 1.01706433, "epoch": 0.592093792274162, "flos": 22721250101760.0, "grad_norm": 1.8351872403908078, "language_loss": 0.6336273, "learning_rate": 1.5056914690915667e-06, "loss": 0.65831256, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21020508, "step": 9848, "time_per_iteration": 2.834757089614868 }, { "auxiliary_loss_clip": 0.01437622, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.26877284, "balance_loss_mlp": 1.02170753, "epoch": 0.59215391552683, "flos": 22539274940160.0, "grad_norm": 2.0157867504798777, "language_loss": 0.76931739, "learning_rate": 1.5053140993874312e-06, "loss": 0.79409665, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.18591309, "step": 9849, "time_per_iteration": 5.7194085121154785 }, { "auxiliary_loss_clip": 0.01424068, "auxiliary_loss_mlp": 0.01040349, "balance_loss_clip": 1.25528049, "balance_loss_mlp": 1.01911736, "epoch": 0.592214038779498, "flos": 24509800788480.0, "grad_norm": 2.068446430839621, "language_loss": 0.75840145, "learning_rate": 1.5049367484418353e-06, "loss": 0.78304565, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21240234, "step": 9850, "time_per_iteration": 2.8519582748413086 }, { "auxiliary_loss_clip": 0.01422352, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.25538874, "balance_loss_mlp": 1.0165875, "epoch": 0.592274162032166, "flos": 21840358227840.0, "grad_norm": 1.7698384225323622, "language_loss": 0.76201868, "learning_rate": 1.5045594162690868e-06, "loss": 0.78659916, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19116211, "step": 9851, "time_per_iteration": 2.821364164352417 }, { "auxiliary_loss_clip": 0.01438588, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.26857615, "balance_loss_mlp": 1.01819026, "epoch": 0.5923342852848339, "flos": 24618877297920.0, "grad_norm": 2.0784979707902753, "language_loss": 0.7193979, "learning_rate": 1.5041821028834954e-06, "loss": 0.74416041, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.19458008, "step": 9852, "time_per_iteration": 2.881312131881714 }, { "auxiliary_loss_clip": 0.01440464, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.26894295, "balance_loss_mlp": 1.02257824, "epoch": 0.5923944085375019, "flos": 19947707959680.0, "grad_norm": 1.67736125376662, "language_loss": 0.80529499, "learning_rate": 1.5038048082993685e-06, "loss": 0.83012515, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.1998291, "step": 9853, "time_per_iteration": 2.82205867767334 }, { "auxiliary_loss_clip": 0.01429613, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.263219, "balance_loss_mlp": 1.01848507, "epoch": 0.5924545317901698, "flos": 28670408092800.0, "grad_norm": 1.704437822220621, "language_loss": 0.68161786, "learning_rate": 1.5034275325310124e-06, "loss": 0.70628554, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18688965, "step": 9854, "time_per_iteration": 2.9279284477233887 }, { "auxiliary_loss_clip": 0.01427568, "auxiliary_loss_mlp": 0.0103221, "balance_loss_clip": 1.26116681, "balance_loss_mlp": 1.01337481, "epoch": 0.5925146550428378, "flos": 19874356859520.0, "grad_norm": 1.8434622346720118, "language_loss": 0.89849806, "learning_rate": 1.5030502755927344e-06, "loss": 0.92309582, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18859863, "step": 9855, "time_per_iteration": 2.807422637939453 }, { "auxiliary_loss_clip": 0.0141957, "auxiliary_loss_mlp": 0.01036023, "balance_loss_clip": 1.25477743, "balance_loss_mlp": 1.01580501, "epoch": 0.5925747782955058, "flos": 15131555723520.0, "grad_norm": 1.7735537821551608, "language_loss": 0.87482584, "learning_rate": 1.5026730374988397e-06, "loss": 0.8993817, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20214844, "step": 9856, "time_per_iteration": 2.8977530002593994 }, { "auxiliary_loss_clip": 0.01441789, "auxiliary_loss_mlp": 0.010344, "balance_loss_clip": 1.27184534, "balance_loss_mlp": 1.01498103, "epoch": 0.5926349015481738, "flos": 18414528779520.0, "grad_norm": 1.9464620819118943, "language_loss": 0.78045321, "learning_rate": 1.5022958182636332e-06, "loss": 0.80521506, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.19421387, "step": 9857, "time_per_iteration": 2.9011240005493164 }, { "auxiliary_loss_clip": 0.01437387, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.27088714, "balance_loss_mlp": 1.01678658, "epoch": 0.5926950248008417, "flos": 23121197303040.0, "grad_norm": 2.087253280532357, "language_loss": 0.65928543, "learning_rate": 1.501918617901419e-06, "loss": 0.68401861, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19128418, "step": 9858, "time_per_iteration": 2.857067346572876 }, { "auxiliary_loss_clip": 0.01416318, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.25249922, "balance_loss_mlp": 1.01375127, "epoch": 0.5927551480535097, "flos": 28044525767040.0, "grad_norm": 1.7950700186965656, "language_loss": 0.77197385, "learning_rate": 1.501541436426501e-06, "loss": 0.79645598, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18164062, "step": 9859, "time_per_iteration": 2.929734945297241 }, { "auxiliary_loss_clip": 0.01426683, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.25782275, "balance_loss_mlp": 1.01793921, "epoch": 0.5928152713061776, "flos": 21808659605760.0, "grad_norm": 2.7861747931085814, "language_loss": 0.76318878, "learning_rate": 1.5011642738531818e-06, "loss": 0.78783238, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.1973877, "step": 9860, "time_per_iteration": 2.8330183029174805 }, { "auxiliary_loss_clip": 0.01423596, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.25728858, "balance_loss_mlp": 1.01558137, "epoch": 0.5928753945588456, "flos": 24327644647680.0, "grad_norm": 1.6791021315138142, "language_loss": 0.76766443, "learning_rate": 1.500787130195763e-06, "loss": 0.79225254, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19641113, "step": 9861, "time_per_iteration": 2.901172637939453 }, { "auxiliary_loss_clip": 0.0140828, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.24409616, "balance_loss_mlp": 1.01449442, "epoch": 0.5929355178115137, "flos": 26474716281600.0, "grad_norm": 1.6263844772816096, "language_loss": 0.71433747, "learning_rate": 1.5004100054685465e-06, "loss": 0.73875868, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19360352, "step": 9862, "time_per_iteration": 2.918715476989746 }, { "auxiliary_loss_clip": 0.01428467, "auxiliary_loss_mlp": 0.01035317, "balance_loss_clip": 1.26077044, "balance_loss_mlp": 1.01643443, "epoch": 0.5929956410641816, "flos": 24975000270720.0, "grad_norm": 1.9019725923045085, "language_loss": 0.78924012, "learning_rate": 1.500032899685832e-06, "loss": 0.81387794, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.1887207, "step": 9863, "time_per_iteration": 2.881592273712158 }, { "auxiliary_loss_clip": 0.01429065, "auxiliary_loss_mlp": 0.01038552, "balance_loss_clip": 1.26189542, "balance_loss_mlp": 1.01919198, "epoch": 0.5930557643168496, "flos": 26218123165440.0, "grad_norm": 1.9155807614867755, "language_loss": 0.71169627, "learning_rate": 1.499655812861921e-06, "loss": 0.73637235, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19348145, "step": 9864, "time_per_iteration": 2.9026668071746826 }, { "auxiliary_loss_clip": 0.01424616, "auxiliary_loss_mlp": 0.01036346, "balance_loss_clip": 1.25764048, "balance_loss_mlp": 1.01618779, "epoch": 0.5931158875695175, "flos": 27866396413440.0, "grad_norm": 7.0926175482617, "language_loss": 0.67992318, "learning_rate": 1.4992787450111112e-06, "loss": 0.7045328, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20153809, "step": 9865, "time_per_iteration": 2.925739288330078 }, { "auxiliary_loss_clip": 0.0143625, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.26700819, "balance_loss_mlp": 1.018839, "epoch": 0.5931760108221855, "flos": 15421973967360.0, "grad_norm": 1.9585127214013762, "language_loss": 0.7873624, "learning_rate": 1.4989016961477015e-06, "loss": 0.81210583, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19238281, "step": 9866, "time_per_iteration": 2.817072868347168 }, { "auxiliary_loss_clip": 0.0141794, "auxiliary_loss_mlp": 0.01035778, "balance_loss_clip": 1.2555728, "balance_loss_mlp": 1.01626313, "epoch": 0.5932361340748534, "flos": 30200193912960.0, "grad_norm": 2.005992784753267, "language_loss": 0.73281038, "learning_rate": 1.4985246662859903e-06, "loss": 0.75734752, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19519043, "step": 9867, "time_per_iteration": 2.8962535858154297 }, { "auxiliary_loss_clip": 0.01424962, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.25975263, "balance_loss_mlp": 1.01889312, "epoch": 0.5932962573275214, "flos": 20167308812160.0, "grad_norm": 1.5915688865612314, "language_loss": 0.67446417, "learning_rate": 1.4981476554402732e-06, "loss": 0.69911271, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.21008301, "step": 9868, "time_per_iteration": 2.8464162349700928 }, { "auxiliary_loss_clip": 0.01427422, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.25979197, "balance_loss_mlp": 1.02075028, "epoch": 0.5933563805801894, "flos": 25456487880960.0, "grad_norm": 2.116641274180415, "language_loss": 0.76156712, "learning_rate": 1.4977706636248478e-06, "loss": 0.78625345, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20458984, "step": 9869, "time_per_iteration": 2.9266693592071533 }, { "auxiliary_loss_clip": 0.01432097, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.26327455, "balance_loss_mlp": 1.01588559, "epoch": 0.5934165038328574, "flos": 60014719998720.0, "grad_norm": 1.830872330237942, "language_loss": 0.74939156, "learning_rate": 1.4973936908540091e-06, "loss": 0.77407956, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20825195, "step": 9870, "time_per_iteration": 4.532441854476929 }, { "auxiliary_loss_clip": 0.01433471, "auxiliary_loss_mlp": 0.01038773, "balance_loss_clip": 1.2641654, "balance_loss_mlp": 1.01909113, "epoch": 0.5934766270855253, "flos": 24429889192320.0, "grad_norm": 2.259079945015092, "language_loss": 0.72064435, "learning_rate": 1.4970167371420517e-06, "loss": 0.74536681, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19677734, "step": 9871, "time_per_iteration": 2.8672282695770264 }, { "auxiliary_loss_clip": 0.01423807, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.25643623, "balance_loss_mlp": 1.01572835, "epoch": 0.5935367503381933, "flos": 23523497233920.0, "grad_norm": 1.8926684452916953, "language_loss": 0.75291479, "learning_rate": 1.496639802503271e-06, "loss": 0.77750278, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19274902, "step": 9872, "time_per_iteration": 2.856052875518799 }, { "auxiliary_loss_clip": 0.01437306, "auxiliary_loss_mlp": 0.01044591, "balance_loss_clip": 1.26667345, "balance_loss_mlp": 1.02417088, "epoch": 0.5935968735908612, "flos": 18956427477120.0, "grad_norm": 2.2581569956118646, "language_loss": 0.79976487, "learning_rate": 1.4962628869519583e-06, "loss": 0.82458389, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.2043457, "step": 9873, "time_per_iteration": 2.8186724185943604 }, { "auxiliary_loss_clip": 0.01425554, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.25893259, "balance_loss_mlp": 1.0167985, "epoch": 0.5936569968435292, "flos": 25494294551040.0, "grad_norm": 1.7499656760481874, "language_loss": 0.85596675, "learning_rate": 1.4958859905024078e-06, "loss": 0.8805964, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20617676, "step": 9874, "time_per_iteration": 2.8692786693573 }, { "auxiliary_loss_clip": 0.01197799, "auxiliary_loss_mlp": 0.01025669, "balance_loss_clip": 1.10660827, "balance_loss_mlp": 1.00611877, "epoch": 0.5937171200961973, "flos": 66407848381440.0, "grad_norm": 0.7072058944302858, "language_loss": 0.60157132, "learning_rate": 1.4955091131689115e-06, "loss": 0.623806, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.1953125, "step": 9875, "time_per_iteration": 3.425703287124634 }, { "auxiliary_loss_clip": 0.01439628, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.26704669, "balance_loss_mlp": 1.01348829, "epoch": 0.5937772433488652, "flos": 14911819136640.0, "grad_norm": 3.511494805887591, "language_loss": 0.78591108, "learning_rate": 1.4951322549657594e-06, "loss": 0.81065047, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.20825195, "step": 9876, "time_per_iteration": 2.832049608230591 }, { "auxiliary_loss_clip": 0.01414893, "auxiliary_loss_mlp": 0.01033939, "balance_loss_clip": 1.25182843, "balance_loss_mlp": 1.01351857, "epoch": 0.5938373666015332, "flos": 22570792583040.0, "grad_norm": 1.6889936249811792, "language_loss": 0.76082176, "learning_rate": 1.494755415907243e-06, "loss": 0.78531003, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20422363, "step": 9877, "time_per_iteration": 2.8527491092681885 }, { "auxiliary_loss_clip": 0.01436834, "auxiliary_loss_mlp": 0.01034583, "balance_loss_clip": 1.26812887, "balance_loss_mlp": 1.01276755, "epoch": 0.5938974898542011, "flos": 18449711251200.0, "grad_norm": 2.777969795878775, "language_loss": 0.82647258, "learning_rate": 1.4943785960076522e-06, "loss": 0.85118675, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21801758, "step": 9878, "time_per_iteration": 2.8352272510528564 }, { "auxiliary_loss_clip": 0.01437443, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.26776171, "balance_loss_mlp": 1.01713049, "epoch": 0.5939576131068691, "flos": 45602712616320.0, "grad_norm": 2.2502488109752643, "language_loss": 0.71276057, "learning_rate": 1.4940017952812754e-06, "loss": 0.73751259, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20617676, "step": 9879, "time_per_iteration": 3.1024248600006104 }, { "auxiliary_loss_clip": 0.01418874, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.25489569, "balance_loss_mlp": 1.01389956, "epoch": 0.594017736359537, "flos": 23598477146880.0, "grad_norm": 1.5451617236148434, "language_loss": 0.57869309, "learning_rate": 1.493625013742401e-06, "loss": 0.60322273, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.20178223, "step": 9880, "time_per_iteration": 2.8984971046447754 }, { "auxiliary_loss_clip": 0.01422992, "auxiliary_loss_mlp": 0.01036849, "balance_loss_clip": 1.25686717, "balance_loss_mlp": 1.01683426, "epoch": 0.594077859612205, "flos": 29468311724160.0, "grad_norm": 1.916694986476144, "language_loss": 0.78418171, "learning_rate": 1.4932482514053177e-06, "loss": 0.80878013, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20007324, "step": 9881, "time_per_iteration": 2.9815361499786377 }, { "auxiliary_loss_clip": 0.01424544, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.25634241, "balance_loss_mlp": 1.0128901, "epoch": 0.594137982864873, "flos": 16808767660800.0, "grad_norm": 2.098255446756221, "language_loss": 0.83528721, "learning_rate": 1.4928715082843112e-06, "loss": 0.85985637, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19482422, "step": 9882, "time_per_iteration": 4.307379722595215 }, { "auxiliary_loss_clip": 0.0142636, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 1.2604444, "balance_loss_mlp": 1.01418912, "epoch": 0.594198106117541, "flos": 12757915537920.0, "grad_norm": 2.279454979894283, "language_loss": 0.80336481, "learning_rate": 1.492494784393667e-06, "loss": 0.82795626, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18603516, "step": 9883, "time_per_iteration": 2.7944443225860596 }, { "auxiliary_loss_clip": 0.01438105, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.26765847, "balance_loss_mlp": 1.0189085, "epoch": 0.5942582293702089, "flos": 21006457718400.0, "grad_norm": 1.8124491237032592, "language_loss": 0.75521201, "learning_rate": 1.4921180797476725e-06, "loss": 0.77999473, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21264648, "step": 9884, "time_per_iteration": 5.68001127243042 }, { "auxiliary_loss_clip": 0.01432393, "auxiliary_loss_mlp": 0.01035575, "balance_loss_clip": 1.26407063, "balance_loss_mlp": 1.01491642, "epoch": 0.5943183526228769, "flos": 28302657206400.0, "grad_norm": 1.9355153962290743, "language_loss": 0.6736542, "learning_rate": 1.4917413943606106e-06, "loss": 0.69833386, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20666504, "step": 9885, "time_per_iteration": 2.887298345565796 }, { "auxiliary_loss_clip": 0.01425831, "auxiliary_loss_mlp": 0.0103717, "balance_loss_clip": 1.26006126, "balance_loss_mlp": 1.01792932, "epoch": 0.5943784758755448, "flos": 26625807227520.0, "grad_norm": 2.5238925370733374, "language_loss": 0.77978694, "learning_rate": 1.4913647282467667e-06, "loss": 0.80441689, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19250488, "step": 9886, "time_per_iteration": 2.978957414627075 }, { "auxiliary_loss_clip": 0.01197817, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.10818744, "balance_loss_mlp": 1.00932205, "epoch": 0.5944385991282128, "flos": 64220300634240.0, "grad_norm": 0.848722819766542, "language_loss": 0.64704263, "learning_rate": 1.490988081420423e-06, "loss": 0.66932094, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.20703125, "step": 9887, "time_per_iteration": 3.250133752822876 }, { "auxiliary_loss_clip": 0.01428926, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.26177406, "balance_loss_mlp": 1.01498652, "epoch": 0.5944987223808808, "flos": 19581133438080.0, "grad_norm": 1.95296062315955, "language_loss": 0.70246494, "learning_rate": 1.4906114538958615e-06, "loss": 0.72710049, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19641113, "step": 9888, "time_per_iteration": 2.8967599868774414 }, { "auxiliary_loss_clip": 0.01430053, "auxiliary_loss_mlp": 0.01040332, "balance_loss_clip": 1.26283073, "balance_loss_mlp": 1.02003098, "epoch": 0.5945588456335488, "flos": 26188867762560.0, "grad_norm": 1.9681810352617994, "language_loss": 0.79974824, "learning_rate": 1.490234845687366e-06, "loss": 0.8244521, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.203125, "step": 9889, "time_per_iteration": 2.8805134296417236 }, { "auxiliary_loss_clip": 0.01420687, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.25463462, "balance_loss_mlp": 1.01518857, "epoch": 0.5946189688862168, "flos": 20455555305600.0, "grad_norm": 1.707502151579249, "language_loss": 0.72079355, "learning_rate": 1.4898582568092154e-06, "loss": 0.74534971, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.1973877, "step": 9890, "time_per_iteration": 2.8844571113586426 }, { "auxiliary_loss_clip": 0.01437811, "auxiliary_loss_mlp": 0.01038808, "balance_loss_clip": 1.26870775, "balance_loss_mlp": 1.01893544, "epoch": 0.5946790921388847, "flos": 13443801747840.0, "grad_norm": 2.425890565367597, "language_loss": 0.70286226, "learning_rate": 1.489481687275691e-06, "loss": 0.72762847, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.1986084, "step": 9891, "time_per_iteration": 2.8462204933166504 }, { "auxiliary_loss_clip": 0.01420339, "auxiliary_loss_mlp": 0.01035969, "balance_loss_clip": 1.25465345, "balance_loss_mlp": 1.01666903, "epoch": 0.5947392153915527, "flos": 20421911157120.0, "grad_norm": 1.9390211287665915, "language_loss": 0.54621994, "learning_rate": 1.4891051371010726e-06, "loss": 0.57078302, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19311523, "step": 9892, "time_per_iteration": 2.8993146419525146 }, { "auxiliary_loss_clip": 0.01200465, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.11110806, "balance_loss_mlp": 1.00886142, "epoch": 0.5947993386442206, "flos": 65650239884160.0, "grad_norm": 0.6537102418789985, "language_loss": 0.54607761, "learning_rate": 1.4887286062996375e-06, "loss": 0.5683769, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.20605469, "step": 9893, "time_per_iteration": 3.413184881210327 }, { "auxiliary_loss_clip": 0.01409398, "auxiliary_loss_mlp": 0.01035822, "balance_loss_clip": 1.24674439, "balance_loss_mlp": 1.01596212, "epoch": 0.5948594618968887, "flos": 23192919590400.0, "grad_norm": 2.1032485779837335, "language_loss": 0.75681806, "learning_rate": 1.4883520948856658e-06, "loss": 0.78127027, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1986084, "step": 9894, "time_per_iteration": 2.866065740585327 }, { "auxiliary_loss_clip": 0.0141962, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.25231504, "balance_loss_mlp": 1.01816285, "epoch": 0.5949195851495566, "flos": 13634871114240.0, "grad_norm": 2.431023739867387, "language_loss": 0.78687662, "learning_rate": 1.487975602873434e-06, "loss": 0.81145364, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19934082, "step": 9895, "time_per_iteration": 2.8491034507751465 }, { "auxiliary_loss_clip": 0.01449385, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.27897871, "balance_loss_mlp": 1.02184796, "epoch": 0.5949797084022246, "flos": 19759308036480.0, "grad_norm": 1.691662524800222, "language_loss": 0.79738003, "learning_rate": 1.4875991302772182e-06, "loss": 0.82229853, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20605469, "step": 9896, "time_per_iteration": 2.8587896823883057 }, { "auxiliary_loss_clip": 0.01432278, "auxiliary_loss_mlp": 0.01037275, "balance_loss_clip": 1.26424706, "balance_loss_mlp": 1.01691389, "epoch": 0.5950398316548925, "flos": 25784441326080.0, "grad_norm": 1.4963758756030519, "language_loss": 0.8423475, "learning_rate": 1.4872226771112954e-06, "loss": 0.86704296, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20349121, "step": 9897, "time_per_iteration": 2.886740207672119 }, { "auxiliary_loss_clip": 0.01431066, "auxiliary_loss_mlp": 0.01037482, "balance_loss_clip": 1.26255155, "balance_loss_mlp": 1.01793206, "epoch": 0.5950999549075605, "flos": 23049294036480.0, "grad_norm": 1.8083325483797432, "language_loss": 0.72244495, "learning_rate": 1.486846243389939e-06, "loss": 0.74713045, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19555664, "step": 9898, "time_per_iteration": 2.846064567565918 }, { "auxiliary_loss_clip": 0.01434998, "auxiliary_loss_mlp": 0.01039811, "balance_loss_clip": 1.26301265, "balance_loss_mlp": 1.01913989, "epoch": 0.5951600781602284, "flos": 32457518421120.0, "grad_norm": 13.003859577839917, "language_loss": 0.64774364, "learning_rate": 1.4864698291274251e-06, "loss": 0.67249167, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.20666504, "step": 9899, "time_per_iteration": 2.939169406890869 }, { "auxiliary_loss_clip": 0.01422437, "auxiliary_loss_mlp": 0.01036285, "balance_loss_clip": 1.2571646, "balance_loss_mlp": 1.01791441, "epoch": 0.5952202014128964, "flos": 23810205404160.0, "grad_norm": 1.7518648961733951, "language_loss": 0.72912264, "learning_rate": 1.4860934343380267e-06, "loss": 0.75370991, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18383789, "step": 9900, "time_per_iteration": 2.8975331783294678 }, { "auxiliary_loss_clip": 0.01416117, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.25222659, "balance_loss_mlp": 1.01822495, "epoch": 0.5952803246655644, "flos": 22502418410880.0, "grad_norm": 2.2736544324101717, "language_loss": 0.85591525, "learning_rate": 1.4857170590360169e-06, "loss": 0.88047135, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.21289062, "step": 9901, "time_per_iteration": 2.8299620151519775 }, { "auxiliary_loss_clip": 0.012034, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.11221242, "balance_loss_mlp": 1.011271, "epoch": 0.5953404479182324, "flos": 51259705799040.0, "grad_norm": 0.781208399653823, "language_loss": 0.58212829, "learning_rate": 1.4853407032356674e-06, "loss": 0.60450959, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.234375, "step": 9902, "time_per_iteration": 3.2500903606414795 }, { "auxiliary_loss_clip": 0.01421558, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.25407743, "balance_loss_mlp": 1.011935, "epoch": 0.5954005711709004, "flos": 23123142829440.0, "grad_norm": 1.6981087205192982, "language_loss": 0.78165293, "learning_rate": 1.4849643669512503e-06, "loss": 0.80618608, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19812012, "step": 9903, "time_per_iteration": 2.9618070125579834 }, { "auxiliary_loss_clip": 0.01419932, "auxiliary_loss_mlp": 0.01032873, "balance_loss_clip": 1.25392604, "balance_loss_mlp": 1.01285791, "epoch": 0.5954606944235683, "flos": 35968281638400.0, "grad_norm": 1.7425380883459551, "language_loss": 0.79084349, "learning_rate": 1.4845880501970362e-06, "loss": 0.81537151, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20031738, "step": 9904, "time_per_iteration": 2.9704926013946533 }, { "auxiliary_loss_clip": 0.0144294, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.27035642, "balance_loss_mlp": 1.01866376, "epoch": 0.5955208176762363, "flos": 30455339195520.0, "grad_norm": 1.3844497954274668, "language_loss": 0.73121512, "learning_rate": 1.4842117529872942e-06, "loss": 0.75604236, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.21105957, "step": 9905, "time_per_iteration": 4.322917461395264 }, { "auxiliary_loss_clip": 0.01407781, "auxiliary_loss_mlp": 0.01038515, "balance_loss_clip": 1.24166715, "balance_loss_mlp": 1.01752234, "epoch": 0.5955809409289042, "flos": 17649228666240.0, "grad_norm": 1.7816492806695519, "language_loss": 0.70746499, "learning_rate": 1.483835475336295e-06, "loss": 0.73192799, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20996094, "step": 9906, "time_per_iteration": 2.817312717437744 }, { "auxiliary_loss_clip": 0.01417283, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.25024581, "balance_loss_mlp": 1.01677716, "epoch": 0.5956410641815723, "flos": 24290561894400.0, "grad_norm": 1.7871048799217477, "language_loss": 0.75403965, "learning_rate": 1.4834592172583057e-06, "loss": 0.77858716, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20690918, "step": 9907, "time_per_iteration": 2.849928617477417 }, { "auxiliary_loss_clip": 0.0142571, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.25829673, "balance_loss_mlp": 1.01689219, "epoch": 0.5957011874342402, "flos": 35747368686720.0, "grad_norm": 1.8316266773205743, "language_loss": 0.68391484, "learning_rate": 1.483082978767595e-06, "loss": 0.70854771, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20678711, "step": 9908, "time_per_iteration": 2.9540486335754395 }, { "auxiliary_loss_clip": 0.0140903, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.24433982, "balance_loss_mlp": 1.01447296, "epoch": 0.5957613106869082, "flos": 21253187468160.0, "grad_norm": 3.3209808383837913, "language_loss": 0.77524483, "learning_rate": 1.4827067598784298e-06, "loss": 0.79968977, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20983887, "step": 9909, "time_per_iteration": 2.8728034496307373 }, { "auxiliary_loss_clip": 0.01203374, "auxiliary_loss_mlp": 0.01021593, "balance_loss_clip": 1.11200833, "balance_loss_mlp": 1.00251961, "epoch": 0.5958214339395761, "flos": 65970456468480.0, "grad_norm": 0.9250427387541176, "language_loss": 0.73536485, "learning_rate": 1.4823305606050753e-06, "loss": 0.75761455, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.19042969, "step": 9910, "time_per_iteration": 3.402803897857666 }, { "auxiliary_loss_clip": 0.01421357, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.25299978, "balance_loss_mlp": 1.01139402, "epoch": 0.5958815571922441, "flos": 23228283041280.0, "grad_norm": 1.6506235477489333, "language_loss": 0.70679414, "learning_rate": 1.481954380961799e-06, "loss": 0.73133588, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21411133, "step": 9911, "time_per_iteration": 2.8778703212738037 }, { "auxiliary_loss_clip": 0.01443352, "auxiliary_loss_mlp": 0.01037448, "balance_loss_clip": 1.26987743, "balance_loss_mlp": 1.01682496, "epoch": 0.595941680444912, "flos": 16545659293440.0, "grad_norm": 2.0194738611878815, "language_loss": 0.66628361, "learning_rate": 1.4815782209628631e-06, "loss": 0.6910916, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20605469, "step": 9912, "time_per_iteration": 2.811274528503418 }, { "auxiliary_loss_clip": 0.0142411, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.25693965, "balance_loss_mlp": 1.01484537, "epoch": 0.59600180369758, "flos": 27830444780160.0, "grad_norm": 1.8950507029364791, "language_loss": 0.73449641, "learning_rate": 1.4812020806225337e-06, "loss": 0.75908768, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20166016, "step": 9913, "time_per_iteration": 2.8912205696105957 }, { "auxiliary_loss_clip": 0.01433945, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.26186395, "balance_loss_mlp": 1.01565504, "epoch": 0.596061926950248, "flos": 29502091607040.0, "grad_norm": 4.477185169591049, "language_loss": 0.80823386, "learning_rate": 1.4808259599550738e-06, "loss": 0.83293891, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20898438, "step": 9914, "time_per_iteration": 2.982706069946289 }, { "auxiliary_loss_clip": 0.0141616, "auxiliary_loss_mlp": 0.01033859, "balance_loss_clip": 1.25132012, "balance_loss_mlp": 1.01403463, "epoch": 0.596122050202916, "flos": 16845533700480.0, "grad_norm": 1.73319835733686, "language_loss": 0.68069762, "learning_rate": 1.4804498589747448e-06, "loss": 0.70519781, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19824219, "step": 9915, "time_per_iteration": 2.8324806690216064 }, { "auxiliary_loss_clip": 0.01427663, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.2582494, "balance_loss_mlp": 1.01579165, "epoch": 0.596182173455584, "flos": 21006593452800.0, "grad_norm": 1.658135840475047, "language_loss": 0.79826069, "learning_rate": 1.4800737776958095e-06, "loss": 0.82289875, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20349121, "step": 9916, "time_per_iteration": 2.8590149879455566 }, { "auxiliary_loss_clip": 0.01436066, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.26481545, "balance_loss_mlp": 1.01314521, "epoch": 0.5962422967082519, "flos": 16072949174400.0, "grad_norm": 1.7627919144793953, "language_loss": 0.83714473, "learning_rate": 1.4796977161325286e-06, "loss": 0.86183721, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20031738, "step": 9917, "time_per_iteration": 4.251293182373047 }, { "auxiliary_loss_clip": 0.01415814, "auxiliary_loss_mlp": 0.01034547, "balance_loss_clip": 1.25075209, "balance_loss_mlp": 1.01415038, "epoch": 0.5963024199609199, "flos": 12174590586240.0, "grad_norm": 1.7920281195697247, "language_loss": 0.78356713, "learning_rate": 1.4793216742991625e-06, "loss": 0.80807072, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20385742, "step": 9918, "time_per_iteration": 2.8501601219177246 }, { "auxiliary_loss_clip": 0.01418744, "auxiliary_loss_mlp": 0.01041915, "balance_loss_clip": 1.2532872, "balance_loss_mlp": 1.02168512, "epoch": 0.5963625432135878, "flos": 28085906776320.0, "grad_norm": 1.6479972622885097, "language_loss": 0.79149818, "learning_rate": 1.4789456522099707e-06, "loss": 0.81610483, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20239258, "step": 9919, "time_per_iteration": 5.7042951583862305 }, { "auxiliary_loss_clip": 0.01421961, "auxiliary_loss_mlp": 0.0103499, "balance_loss_clip": 1.25474036, "balance_loss_mlp": 1.01467633, "epoch": 0.5964226664662559, "flos": 19868610769920.0, "grad_norm": 2.277950339013918, "language_loss": 0.78388411, "learning_rate": 1.4785696498792122e-06, "loss": 0.80845368, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.203125, "step": 9920, "time_per_iteration": 2.8314244747161865 }, { "auxiliary_loss_clip": 0.01437907, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.26951945, "balance_loss_mlp": 1.01931441, "epoch": 0.5964827897189238, "flos": 12940297902720.0, "grad_norm": 3.049525442228704, "language_loss": 0.83497584, "learning_rate": 1.4781936673211446e-06, "loss": 0.85974944, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20153809, "step": 9921, "time_per_iteration": 2.829024076461792 }, { "auxiliary_loss_clip": 0.01411421, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.24654472, "balance_loss_mlp": 1.01652372, "epoch": 0.5965429129715918, "flos": 18160016924160.0, "grad_norm": 2.7814967646847846, "language_loss": 0.81089163, "learning_rate": 1.4778177045500252e-06, "loss": 0.835374, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20288086, "step": 9922, "time_per_iteration": 2.831235408782959 }, { "auxiliary_loss_clip": 0.01408688, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.24373579, "balance_loss_mlp": 1.01328611, "epoch": 0.5966030362242597, "flos": 21773477134080.0, "grad_norm": 1.8332506029164501, "language_loss": 0.77436846, "learning_rate": 1.477441761580111e-06, "loss": 0.79878747, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19909668, "step": 9923, "time_per_iteration": 2.866919755935669 }, { "auxiliary_loss_clip": 0.01437854, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 1.26688063, "balance_loss_mlp": 1.01583719, "epoch": 0.5966631594769277, "flos": 18816511996800.0, "grad_norm": 2.03478562973171, "language_loss": 0.76276445, "learning_rate": 1.4770658384256573e-06, "loss": 0.787516, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21472168, "step": 9924, "time_per_iteration": 2.834747314453125 }, { "auxiliary_loss_clip": 0.01422868, "auxiliary_loss_mlp": 0.0103934, "balance_loss_clip": 1.25885582, "balance_loss_mlp": 1.01875234, "epoch": 0.5967232827295956, "flos": 14072263027200.0, "grad_norm": 2.2346607450815767, "language_loss": 0.67303663, "learning_rate": 1.4766899351009204e-06, "loss": 0.69765866, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20581055, "step": 9925, "time_per_iteration": 2.8318073749542236 }, { "auxiliary_loss_clip": 0.01420897, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.25637448, "balance_loss_mlp": 1.02043211, "epoch": 0.5967834059822636, "flos": 17247109714560.0, "grad_norm": 1.890542901047272, "language_loss": 0.72253376, "learning_rate": 1.4763140516201528e-06, "loss": 0.74715155, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20446777, "step": 9926, "time_per_iteration": 2.858304023742676 }, { "auxiliary_loss_clip": 0.01429374, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.25981712, "balance_loss_mlp": 1.01533461, "epoch": 0.5968435292349316, "flos": 42534996912000.0, "grad_norm": 2.010218982363157, "language_loss": 0.71287513, "learning_rate": 1.4759381879976088e-06, "loss": 0.73752475, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20251465, "step": 9927, "time_per_iteration": 3.037299871444702 }, { "auxiliary_loss_clip": 0.0143152, "auxiliary_loss_mlp": 0.01039685, "balance_loss_clip": 1.25957465, "balance_loss_mlp": 1.01914489, "epoch": 0.5969036524875996, "flos": 37644317210880.0, "grad_norm": 2.0340757040301023, "language_loss": 0.64379483, "learning_rate": 1.4755623442475415e-06, "loss": 0.66850686, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20544434, "step": 9928, "time_per_iteration": 3.0375545024871826 }, { "auxiliary_loss_clip": 0.01411467, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.24626517, "balance_loss_mlp": 1.01208699, "epoch": 0.5969637757402676, "flos": 23158642014720.0, "grad_norm": 1.7265241808903646, "language_loss": 0.70278549, "learning_rate": 1.4751865203842022e-06, "loss": 0.72722375, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20263672, "step": 9929, "time_per_iteration": 2.9572389125823975 }, { "auxiliary_loss_clip": 0.01400744, "auxiliary_loss_mlp": 0.010382, "balance_loss_clip": 1.23949862, "balance_loss_mlp": 1.01808953, "epoch": 0.5970238989929355, "flos": 24030620663040.0, "grad_norm": 1.9837695836223377, "language_loss": 0.78363276, "learning_rate": 1.4748107164218431e-06, "loss": 0.80802226, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.2010498, "step": 9930, "time_per_iteration": 2.8604960441589355 }, { "auxiliary_loss_clip": 0.01444928, "auxiliary_loss_mlp": 0.01036373, "balance_loss_clip": 1.27213454, "balance_loss_mlp": 1.01492691, "epoch": 0.5970840222456035, "flos": 19436195784960.0, "grad_norm": 1.7039550583737546, "language_loss": 0.70354915, "learning_rate": 1.4744349323747146e-06, "loss": 0.72836214, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21459961, "step": 9931, "time_per_iteration": 2.842463731765747 }, { "auxiliary_loss_clip": 0.01207422, "auxiliary_loss_mlp": 0.01038926, "balance_loss_clip": 1.11656106, "balance_loss_mlp": 1.02242756, "epoch": 0.5971441454982714, "flos": 63002994537600.0, "grad_norm": 0.8597695292956713, "language_loss": 0.64259553, "learning_rate": 1.474059168257065e-06, "loss": 0.66505903, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.16503906, "step": 9932, "time_per_iteration": 3.299004554748535 }, { "auxiliary_loss_clip": 0.01413518, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 1.24742794, "balance_loss_mlp": 1.01259017, "epoch": 0.5972042687509395, "flos": 20275797139200.0, "grad_norm": 1.9209217520821718, "language_loss": 0.75267494, "learning_rate": 1.4736834240831454e-06, "loss": 0.77714092, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.2052002, "step": 9933, "time_per_iteration": 2.8653359413146973 }, { "auxiliary_loss_clip": 0.01206105, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.11561394, "balance_loss_mlp": 1.00947571, "epoch": 0.5972643920036074, "flos": 71688448920960.0, "grad_norm": 0.6602046078011982, "language_loss": 0.52044594, "learning_rate": 1.473307699867203e-06, "loss": 0.54278111, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.1796875, "step": 9934, "time_per_iteration": 3.4048213958740234 }, { "auxiliary_loss_clip": 0.01208986, "auxiliary_loss_mlp": 0.01037755, "balance_loss_clip": 1.11799705, "balance_loss_mlp": 1.01982641, "epoch": 0.5973245152562754, "flos": 56919893379840.0, "grad_norm": 0.8283827550397656, "language_loss": 0.54230714, "learning_rate": 1.4729319956234849e-06, "loss": 0.56477451, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.1796875, "step": 9935, "time_per_iteration": 3.2338685989379883 }, { "auxiliary_loss_clip": 0.01416362, "auxiliary_loss_mlp": 0.01040978, "balance_loss_clip": 1.24845731, "balance_loss_mlp": 1.01832819, "epoch": 0.5973846385089433, "flos": 24173431810560.0, "grad_norm": 1.765983113685319, "language_loss": 0.6674161, "learning_rate": 1.4725563113662394e-06, "loss": 0.69198954, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.22644043, "step": 9936, "time_per_iteration": 2.9413180351257324 }, { "auxiliary_loss_clip": 0.01431306, "auxiliary_loss_mlp": 0.01035783, "balance_loss_clip": 1.26169562, "balance_loss_mlp": 1.01568413, "epoch": 0.5974447617616113, "flos": 17678167355520.0, "grad_norm": 1.9460496678895458, "language_loss": 0.68030095, "learning_rate": 1.4721806471097103e-06, "loss": 0.70497185, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.2010498, "step": 9937, "time_per_iteration": 2.9272842407226562 }, { "auxiliary_loss_clip": 0.01428644, "auxiliary_loss_mlp": 0.01034239, "balance_loss_clip": 1.25907993, "balance_loss_mlp": 1.01271033, "epoch": 0.5975048850142792, "flos": 22902591836160.0, "grad_norm": 2.1922047281702812, "language_loss": 0.78561693, "learning_rate": 1.4718050028681442e-06, "loss": 0.81024575, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21520996, "step": 9938, "time_per_iteration": 2.9505600929260254 }, { "auxiliary_loss_clip": 0.01430705, "auxiliary_loss_mlp": 0.0103834, "balance_loss_clip": 1.2621336, "balance_loss_mlp": 1.01596391, "epoch": 0.5975650082669473, "flos": 24363867749760.0, "grad_norm": 1.8231987535158651, "language_loss": 0.76648831, "learning_rate": 1.4714293786557855e-06, "loss": 0.79117882, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.22375488, "step": 9939, "time_per_iteration": 2.931148052215576 }, { "auxiliary_loss_clip": 0.0143547, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.26213551, "balance_loss_mlp": 1.01687694, "epoch": 0.5976251315196152, "flos": 20933197107840.0, "grad_norm": 2.5158932394194786, "language_loss": 0.69492805, "learning_rate": 1.471053774486878e-06, "loss": 0.71966863, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21728516, "step": 9940, "time_per_iteration": 4.341262578964233 }, { "auxiliary_loss_clip": 0.01413654, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.24969852, "balance_loss_mlp": 1.01397657, "epoch": 0.5976852547722832, "flos": 35857938274560.0, "grad_norm": 1.360144234209964, "language_loss": 0.70686722, "learning_rate": 1.470678190375664e-06, "loss": 0.73134041, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19677734, "step": 9941, "time_per_iteration": 2.9766056537628174 }, { "auxiliary_loss_clip": 0.01415743, "auxiliary_loss_mlp": 0.01036184, "balance_loss_clip": 1.24955821, "balance_loss_mlp": 1.01507163, "epoch": 0.5977453780249512, "flos": 12862648546560.0, "grad_norm": 2.739663306110588, "language_loss": 0.7825948, "learning_rate": 1.470302626336386e-06, "loss": 0.80711406, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.21105957, "step": 9942, "time_per_iteration": 2.831942558288574 }, { "auxiliary_loss_clip": 0.0142312, "auxiliary_loss_mlp": 0.01036273, "balance_loss_clip": 1.25253654, "balance_loss_mlp": 1.01615047, "epoch": 0.5978055012776191, "flos": 20968832027520.0, "grad_norm": 1.7013274192150403, "language_loss": 0.75902462, "learning_rate": 1.4699270823832857e-06, "loss": 0.78361857, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20117188, "step": 9943, "time_per_iteration": 2.8651227951049805 }, { "auxiliary_loss_clip": 0.01427486, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.26006842, "balance_loss_mlp": 1.01609349, "epoch": 0.5978656245302871, "flos": 34071197379840.0, "grad_norm": 2.114572442349852, "language_loss": 0.63020086, "learning_rate": 1.4695515585306032e-06, "loss": 0.65483642, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19970703, "step": 9944, "time_per_iteration": 2.9593162536621094 }, { "auxiliary_loss_clip": 0.01430103, "auxiliary_loss_mlp": 0.01035205, "balance_loss_clip": 1.26333141, "balance_loss_mlp": 1.01465321, "epoch": 0.597925747782955, "flos": 37386050037120.0, "grad_norm": 1.6194787097154162, "language_loss": 0.73112476, "learning_rate": 1.4691760547925795e-06, "loss": 0.75577784, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20568848, "step": 9945, "time_per_iteration": 2.9799981117248535 }, { "auxiliary_loss_clip": 0.01419584, "auxiliary_loss_mlp": 0.01033206, "balance_loss_clip": 1.25154352, "balance_loss_mlp": 1.01300049, "epoch": 0.5979858710356231, "flos": 25385851468800.0, "grad_norm": 2.5675449181521, "language_loss": 0.68121505, "learning_rate": 1.4688005711834522e-06, "loss": 0.70574296, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20214844, "step": 9946, "time_per_iteration": 2.860201597213745 }, { "auxiliary_loss_clip": 0.01443571, "auxiliary_loss_mlp": 0.01040642, "balance_loss_clip": 1.26988876, "balance_loss_mlp": 1.01924419, "epoch": 0.598045994288291, "flos": 13706186198400.0, "grad_norm": 6.324691890564547, "language_loss": 0.89784765, "learning_rate": 1.468425107717461e-06, "loss": 0.9226898, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21398926, "step": 9947, "time_per_iteration": 2.8091392517089844 }, { "auxiliary_loss_clip": 0.01408077, "auxiliary_loss_mlp": 0.01040453, "balance_loss_clip": 1.24460244, "balance_loss_mlp": 1.02053297, "epoch": 0.598106117540959, "flos": 21991449173760.0, "grad_norm": 2.1116338137624977, "language_loss": 0.72730184, "learning_rate": 1.4680496644088432e-06, "loss": 0.75178719, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19921875, "step": 9948, "time_per_iteration": 2.8411688804626465 }, { "auxiliary_loss_clip": 0.01421996, "auxiliary_loss_mlp": 0.01038772, "balance_loss_clip": 1.25304389, "balance_loss_mlp": 1.01695657, "epoch": 0.5981662407936269, "flos": 20569337274240.0, "grad_norm": 2.1443071760378207, "language_loss": 0.90131724, "learning_rate": 1.4676742412718347e-06, "loss": 0.92592496, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21801758, "step": 9949, "time_per_iteration": 2.895925998687744 }, { "auxiliary_loss_clip": 0.0143696, "auxiliary_loss_mlp": 0.01036231, "balance_loss_clip": 1.26867831, "balance_loss_mlp": 1.01590574, "epoch": 0.5982263640462949, "flos": 14071855824000.0, "grad_norm": 1.9863850405199435, "language_loss": 0.71550465, "learning_rate": 1.467298838320673e-06, "loss": 0.74023652, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.203125, "step": 9950, "time_per_iteration": 2.838127613067627 }, { "auxiliary_loss_clip": 0.01436497, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.26759696, "balance_loss_mlp": 1.01677847, "epoch": 0.5982864872989628, "flos": 17714842905600.0, "grad_norm": 1.7879886012218047, "language_loss": 0.79022026, "learning_rate": 1.4669234555695921e-06, "loss": 0.81496197, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.2088623, "step": 9951, "time_per_iteration": 2.872558355331421 }, { "auxiliary_loss_clip": 0.01433274, "auxiliary_loss_mlp": 0.01044542, "balance_loss_clip": 1.26377749, "balance_loss_mlp": 1.02352571, "epoch": 0.5983466105516309, "flos": 16773766168320.0, "grad_norm": 1.5240895302143922, "language_loss": 0.74575186, "learning_rate": 1.4665480930328275e-06, "loss": 0.77052999, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.21032715, "step": 9952, "time_per_iteration": 4.402590036392212 }, { "auxiliary_loss_clip": 0.0143116, "auxiliary_loss_mlp": 0.01037888, "balance_loss_clip": 1.26067472, "balance_loss_mlp": 1.01579821, "epoch": 0.5984067338042988, "flos": 20051038379520.0, "grad_norm": 7.414314923600157, "language_loss": 0.79719228, "learning_rate": 1.466172750724613e-06, "loss": 0.82188278, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.22106934, "step": 9953, "time_per_iteration": 2.857473134994507 }, { "auxiliary_loss_clip": 0.01418897, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.25199032, "balance_loss_mlp": 1.01771414, "epoch": 0.5984668570569668, "flos": 26330276321280.0, "grad_norm": 1.4421645822891698, "language_loss": 0.70096767, "learning_rate": 1.4657974286591807e-06, "loss": 0.72553939, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20568848, "step": 9954, "time_per_iteration": 5.742401361465454 }, { "auxiliary_loss_clip": 0.01438523, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.26806629, "balance_loss_mlp": 1.0164485, "epoch": 0.5985269803096348, "flos": 20603162401920.0, "grad_norm": 2.6079539361033586, "language_loss": 0.74026698, "learning_rate": 1.4654221268507637e-06, "loss": 0.76501942, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20263672, "step": 9955, "time_per_iteration": 2.9300239086151123 }, { "auxiliary_loss_clip": 0.01420806, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.25339913, "balance_loss_mlp": 1.01626706, "epoch": 0.5985871035623027, "flos": 26875432644480.0, "grad_norm": 5.347663577640157, "language_loss": 0.6895467, "learning_rate": 1.4650468453135934e-06, "loss": 0.71411973, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20239258, "step": 9956, "time_per_iteration": 3.0039994716644287 }, { "auxiliary_loss_clip": 0.01435932, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.2655735, "balance_loss_mlp": 1.01418328, "epoch": 0.5986472268149707, "flos": 19619166332160.0, "grad_norm": 2.6101486320467226, "language_loss": 0.74506783, "learning_rate": 1.4646715840618999e-06, "loss": 0.76978457, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.21569824, "step": 9957, "time_per_iteration": 2.82476806640625 }, { "auxiliary_loss_clip": 0.01406004, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.2443974, "balance_loss_mlp": 1.00993752, "epoch": 0.5987073500676386, "flos": 21803546943360.0, "grad_norm": 2.1378030572601623, "language_loss": 0.85128629, "learning_rate": 1.4642963431099138e-06, "loss": 0.87564719, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.20153809, "step": 9958, "time_per_iteration": 2.8701822757720947 }, { "auxiliary_loss_clip": 0.01438973, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.2689867, "balance_loss_mlp": 1.01395905, "epoch": 0.5987674733203067, "flos": 24324522756480.0, "grad_norm": 1.9552927654593348, "language_loss": 0.67177123, "learning_rate": 1.463921122471864e-06, "loss": 0.69650316, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20263672, "step": 9959, "time_per_iteration": 2.9065139293670654 }, { "auxiliary_loss_clip": 0.01430947, "auxiliary_loss_mlp": 0.01037428, "balance_loss_clip": 1.26348424, "balance_loss_mlp": 1.01750863, "epoch": 0.5988275965729746, "flos": 21328981787520.0, "grad_norm": 1.6555438057872447, "language_loss": 0.83910429, "learning_rate": 1.4635459221619796e-06, "loss": 0.86378807, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19934082, "step": 9960, "time_per_iteration": 2.866654872894287 }, { "auxiliary_loss_clip": 0.01422909, "auxiliary_loss_mlp": 0.01035385, "balance_loss_clip": 1.25636995, "balance_loss_mlp": 1.01478601, "epoch": 0.5988877198256426, "flos": 25128670170240.0, "grad_norm": 1.5083637312816112, "language_loss": 0.80116212, "learning_rate": 1.4631707421944868e-06, "loss": 0.82574505, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20581055, "step": 9961, "time_per_iteration": 2.8827714920043945 }, { "auxiliary_loss_clip": 0.01421114, "auxiliary_loss_mlp": 0.01034446, "balance_loss_clip": 1.25417161, "balance_loss_mlp": 1.01406133, "epoch": 0.5989478430783105, "flos": 26439443320320.0, "grad_norm": 2.135565947247819, "language_loss": 0.68027341, "learning_rate": 1.4627955825836136e-06, "loss": 0.70482898, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20385742, "step": 9962, "time_per_iteration": 2.868591547012329 }, { "auxiliary_loss_clip": 0.01421818, "auxiliary_loss_mlp": 0.01035007, "balance_loss_clip": 1.25529456, "balance_loss_mlp": 1.01415682, "epoch": 0.5990079663309785, "flos": 25790187415680.0, "grad_norm": 1.5091448543558166, "language_loss": 0.74762809, "learning_rate": 1.4624204433435857e-06, "loss": 0.77219629, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20849609, "step": 9963, "time_per_iteration": 2.9152467250823975 }, { "auxiliary_loss_clip": 0.01414435, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.25038993, "balance_loss_mlp": 1.01044011, "epoch": 0.5990680895836464, "flos": 36845915886720.0, "grad_norm": 2.673158830895831, "language_loss": 0.68808794, "learning_rate": 1.4620453244886281e-06, "loss": 0.71253908, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20214844, "step": 9964, "time_per_iteration": 2.968250274658203 }, { "auxiliary_loss_clip": 0.01409271, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.24787641, "balance_loss_mlp": 1.01622045, "epoch": 0.5991282128363145, "flos": 24144085918080.0, "grad_norm": 3.188712596812824, "language_loss": 0.77418762, "learning_rate": 1.4616702260329662e-06, "loss": 0.79865348, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.2109375, "step": 9965, "time_per_iteration": 2.8654401302337646 }, { "auxiliary_loss_clip": 0.01414926, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.24754572, "balance_loss_mlp": 1.01174831, "epoch": 0.5991883360889824, "flos": 10310652783360.0, "grad_norm": 2.1386746866083346, "language_loss": 0.7826888, "learning_rate": 1.4612951479908229e-06, "loss": 0.80716252, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20690918, "step": 9966, "time_per_iteration": 2.816499710083008 }, { "auxiliary_loss_clip": 0.01417983, "auxiliary_loss_mlp": 0.01029793, "balance_loss_clip": 1.25211596, "balance_loss_mlp": 1.00910997, "epoch": 0.5992484593416504, "flos": 23961703553280.0, "grad_norm": 1.6112019788425704, "language_loss": 0.74538392, "learning_rate": 1.460920090376422e-06, "loss": 0.7698617, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20678711, "step": 9967, "time_per_iteration": 2.882910966873169 }, { "auxiliary_loss_clip": 0.01435483, "auxiliary_loss_mlp": 0.01038827, "balance_loss_clip": 1.26146936, "balance_loss_mlp": 1.01727402, "epoch": 0.5993085825943184, "flos": 11950646232960.0, "grad_norm": 2.5436232139242154, "language_loss": 0.69142509, "learning_rate": 1.4605450532039847e-06, "loss": 0.71616822, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 1.74121094, "router_z_loss_mlp": 0.21557617, "step": 9968, "time_per_iteration": 2.8864922523498535 }, { "auxiliary_loss_clip": 0.01432358, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.26281357, "balance_loss_mlp": 1.01390648, "epoch": 0.5993687058469863, "flos": 19036384318080.0, "grad_norm": 1.7291576602180945, "language_loss": 0.79960942, "learning_rate": 1.4601700364877334e-06, "loss": 0.8242836, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.21142578, "step": 9969, "time_per_iteration": 2.842644453048706 }, { "auxiliary_loss_clip": 0.01415484, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.24802732, "balance_loss_mlp": 1.01259089, "epoch": 0.5994288290996543, "flos": 14291094718080.0, "grad_norm": 2.250379239589451, "language_loss": 0.81925118, "learning_rate": 1.4597950402418889e-06, "loss": 0.84375107, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.21911621, "step": 9970, "time_per_iteration": 2.839820623397827 }, { "auxiliary_loss_clip": 0.01431772, "auxiliary_loss_mlp": 0.0103605, "balance_loss_clip": 1.2613771, "balance_loss_mlp": 1.01360321, "epoch": 0.5994889523523222, "flos": 19215554302080.0, "grad_norm": 2.520664474800559, "language_loss": 0.62202847, "learning_rate": 1.4594200644806697e-06, "loss": 0.64670676, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.2244873, "step": 9971, "time_per_iteration": 2.843364953994751 }, { "auxiliary_loss_clip": 0.01415664, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.25241125, "balance_loss_mlp": 1.0109961, "epoch": 0.5995490756049903, "flos": 28048462064640.0, "grad_norm": 1.726288315920987, "language_loss": 0.79427546, "learning_rate": 1.4590451092182962e-06, "loss": 0.81874025, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19824219, "step": 9972, "time_per_iteration": 3.009695053100586 }, { "auxiliary_loss_clip": 0.01429089, "auxiliary_loss_mlp": 0.01036838, "balance_loss_clip": 1.25641131, "balance_loss_mlp": 1.01539254, "epoch": 0.5996091988576582, "flos": 29063161370880.0, "grad_norm": 2.046955661869127, "language_loss": 0.77238023, "learning_rate": 1.4586701744689864e-06, "loss": 0.79703951, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21447754, "step": 9973, "time_per_iteration": 2.904505968093872 }, { "auxiliary_loss_clip": 0.01413162, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.24664855, "balance_loss_mlp": 1.0142101, "epoch": 0.5996693221103262, "flos": 20823803884800.0, "grad_norm": 1.9167236990750331, "language_loss": 0.66056538, "learning_rate": 1.4582952602469578e-06, "loss": 0.68505251, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21337891, "step": 9974, "time_per_iteration": 2.835796594619751 }, { "auxiliary_loss_clip": 0.01411229, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.24378633, "balance_loss_mlp": 1.01525092, "epoch": 0.5997294453629941, "flos": 23779411678080.0, "grad_norm": 1.4311967805244572, "language_loss": 0.75582767, "learning_rate": 1.457920366566428e-06, "loss": 0.78029728, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20483398, "step": 9975, "time_per_iteration": 4.282090187072754 }, { "auxiliary_loss_clip": 0.01419222, "auxiliary_loss_mlp": 0.01034966, "balance_loss_clip": 1.25218105, "balance_loss_mlp": 1.01447368, "epoch": 0.5997895686156621, "flos": 20969917902720.0, "grad_norm": 2.0614981982518104, "language_loss": 0.78167152, "learning_rate": 1.457545493441611e-06, "loss": 0.80621344, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20495605, "step": 9976, "time_per_iteration": 2.8894498348236084 }, { "auxiliary_loss_clip": 0.01410009, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.24467254, "balance_loss_mlp": 1.01393175, "epoch": 0.59984969186833, "flos": 28376958447360.0, "grad_norm": 4.083608941861804, "language_loss": 0.76136589, "learning_rate": 1.4571706408867237e-06, "loss": 0.78582072, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.2154541, "step": 9977, "time_per_iteration": 2.8656837940216064 }, { "auxiliary_loss_clip": 0.01422697, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.25367892, "balance_loss_mlp": 1.0136981, "epoch": 0.5999098151209981, "flos": 22576357693440.0, "grad_norm": 2.1834637264469006, "language_loss": 0.69687814, "learning_rate": 1.4567958089159802e-06, "loss": 0.72145253, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21044922, "step": 9978, "time_per_iteration": 2.8588674068450928 }, { "auxiliary_loss_clip": 0.0142624, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.25671244, "balance_loss_mlp": 1.0120976, "epoch": 0.599969938373666, "flos": 18777483717120.0, "grad_norm": 2.1496258522459377, "language_loss": 0.82634258, "learning_rate": 1.456420997543594e-06, "loss": 0.85094208, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21606445, "step": 9979, "time_per_iteration": 2.8395378589630127 }, { "auxiliary_loss_clip": 0.01392899, "auxiliary_loss_mlp": 0.01037783, "balance_loss_clip": 1.23281693, "balance_loss_mlp": 1.01657581, "epoch": 0.600030061626334, "flos": 11334491539200.0, "grad_norm": 2.0540262500565545, "language_loss": 0.70419484, "learning_rate": 1.4560462067837782e-06, "loss": 0.72850162, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.21191406, "step": 9980, "time_per_iteration": 2.8454771041870117 }, { "auxiliary_loss_clip": 0.01431451, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 1.25912619, "balance_loss_mlp": 1.01297975, "epoch": 0.600090184879002, "flos": 16586633099520.0, "grad_norm": 3.051188424247574, "language_loss": 0.69942749, "learning_rate": 1.4556714366507445e-06, "loss": 0.72408992, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21826172, "step": 9981, "time_per_iteration": 2.8002071380615234 }, { "auxiliary_loss_clip": 0.0140705, "auxiliary_loss_mlp": 0.01036791, "balance_loss_clip": 1.24273562, "balance_loss_mlp": 1.01669216, "epoch": 0.6001503081316699, "flos": 23627958773760.0, "grad_norm": 2.1439053628956994, "language_loss": 0.79819328, "learning_rate": 1.4552966871587048e-06, "loss": 0.82263166, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20092773, "step": 9982, "time_per_iteration": 2.8563287258148193 }, { "auxiliary_loss_clip": 0.01411861, "auxiliary_loss_mlp": 0.01034836, "balance_loss_clip": 1.24719119, "balance_loss_mlp": 1.01273429, "epoch": 0.6002104313843379, "flos": 20677373153280.0, "grad_norm": 1.4511020653176327, "language_loss": 0.73582089, "learning_rate": 1.4549219583218686e-06, "loss": 0.76028788, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.22106934, "step": 9983, "time_per_iteration": 2.886061668395996 }, { "auxiliary_loss_clip": 0.01414344, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.24672461, "balance_loss_mlp": 1.01356781, "epoch": 0.6002705546370058, "flos": 22465335657600.0, "grad_norm": 2.2811898992076127, "language_loss": 0.79665464, "learning_rate": 1.454547250154447e-06, "loss": 0.82114488, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.21105957, "step": 9984, "time_per_iteration": 2.8749990463256836 }, { "auxiliary_loss_clip": 0.01417758, "auxiliary_loss_mlp": 0.01037074, "balance_loss_clip": 1.25025535, "balance_loss_mlp": 1.0163914, "epoch": 0.6003306778896739, "flos": 25202880921600.0, "grad_norm": 1.6828953163525888, "language_loss": 0.83855426, "learning_rate": 1.4541725626706485e-06, "loss": 0.86310256, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20678711, "step": 9985, "time_per_iteration": 2.99177622795105 }, { "auxiliary_loss_clip": 0.01412172, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.2453202, "balance_loss_mlp": 1.01747561, "epoch": 0.6003908011423418, "flos": 26698977348480.0, "grad_norm": 1.7524742693159485, "language_loss": 0.72147119, "learning_rate": 1.4537978958846809e-06, "loss": 0.74597347, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20568848, "step": 9986, "time_per_iteration": 4.32145619392395 }, { "auxiliary_loss_clip": 0.01411602, "auxiliary_loss_mlp": 0.01040424, "balance_loss_clip": 1.24508238, "balance_loss_mlp": 1.01901436, "epoch": 0.6004509243950098, "flos": 22575317063040.0, "grad_norm": 1.4520838306209765, "language_loss": 0.72469234, "learning_rate": 1.4534232498107514e-06, "loss": 0.74921256, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.2142334, "step": 9987, "time_per_iteration": 2.913203239440918 }, { "auxiliary_loss_clip": 0.01409104, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.24496531, "balance_loss_mlp": 1.01567745, "epoch": 0.6005110476476777, "flos": 19728831024000.0, "grad_norm": 1.831814193323785, "language_loss": 0.85854495, "learning_rate": 1.4530486244630673e-06, "loss": 0.88299161, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19885254, "step": 9988, "time_per_iteration": 5.693461656570435 }, { "auxiliary_loss_clip": 0.01414166, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.24737799, "balance_loss_mlp": 1.01826632, "epoch": 0.6005711709003457, "flos": 17721539136000.0, "grad_norm": 1.9725637441131074, "language_loss": 0.66598058, "learning_rate": 1.4526740198558346e-06, "loss": 0.6905117, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20654297, "step": 9989, "time_per_iteration": 2.8730051517486572 }, { "auxiliary_loss_clip": 0.0141585, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.24980855, "balance_loss_mlp": 1.01996446, "epoch": 0.6006312941530136, "flos": 18523469554560.0, "grad_norm": 1.5012237621616775, "language_loss": 0.81024188, "learning_rate": 1.452299436003257e-06, "loss": 0.83479834, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19824219, "step": 9990, "time_per_iteration": 2.941312551498413 }, { "auxiliary_loss_clip": 0.01434814, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.26492417, "balance_loss_mlp": 1.01910686, "epoch": 0.6006914174056817, "flos": 21399030017280.0, "grad_norm": 2.1063666614280194, "language_loss": 0.83459985, "learning_rate": 1.4519248729195403e-06, "loss": 0.85933709, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.19799805, "step": 9991, "time_per_iteration": 2.8937768936157227 }, { "auxiliary_loss_clip": 0.01409981, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.24629867, "balance_loss_mlp": 1.01857841, "epoch": 0.6007515406583496, "flos": 12758232251520.0, "grad_norm": 1.7228719878783214, "language_loss": 0.8342126, "learning_rate": 1.4515503306188878e-06, "loss": 0.85869908, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20092773, "step": 9992, "time_per_iteration": 2.8034322261810303 }, { "auxiliary_loss_clip": 0.01411041, "auxiliary_loss_mlp": 0.01038204, "balance_loss_clip": 1.24570584, "balance_loss_mlp": 1.01591206, "epoch": 0.6008116639110176, "flos": 19215871015680.0, "grad_norm": 1.9319271320037874, "language_loss": 0.67164063, "learning_rate": 1.4511758091155008e-06, "loss": 0.69613308, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.22290039, "step": 9993, "time_per_iteration": 2.9650959968566895 }, { "auxiliary_loss_clip": 0.01417312, "auxiliary_loss_mlp": 0.01037712, "balance_loss_clip": 1.24987221, "balance_loss_mlp": 1.01694608, "epoch": 0.6008717871636855, "flos": 17064048677760.0, "grad_norm": 2.565875090816295, "language_loss": 0.82301581, "learning_rate": 1.4508013084235826e-06, "loss": 0.84756601, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20751953, "step": 9994, "time_per_iteration": 2.884915828704834 }, { "auxiliary_loss_clip": 0.01397074, "auxiliary_loss_mlp": 0.01032759, "balance_loss_clip": 1.23611879, "balance_loss_mlp": 1.01331639, "epoch": 0.6009319104163535, "flos": 20307224292480.0, "grad_norm": 1.9232116831753792, "language_loss": 0.73107553, "learning_rate": 1.4504268285573337e-06, "loss": 0.75537384, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19433594, "step": 9995, "time_per_iteration": 2.8680977821350098 }, { "auxiliary_loss_clip": 0.01426248, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.25735557, "balance_loss_mlp": 1.01243711, "epoch": 0.6009920336690215, "flos": 21847190192640.0, "grad_norm": 1.6495150594564798, "language_loss": 0.81568038, "learning_rate": 1.4500523695309546e-06, "loss": 0.84027374, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20629883, "step": 9996, "time_per_iteration": 2.9244449138641357 }, { "auxiliary_loss_clip": 0.01410907, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.24557757, "balance_loss_mlp": 1.01679981, "epoch": 0.6010521569216895, "flos": 22604934424320.0, "grad_norm": 1.6624707059383168, "language_loss": 0.78758281, "learning_rate": 1.4496779313586447e-06, "loss": 0.81206876, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20898438, "step": 9997, "time_per_iteration": 2.840813159942627 }, { "auxiliary_loss_clip": 0.01435709, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.26511931, "balance_loss_mlp": 1.01734269, "epoch": 0.6011122801743575, "flos": 19181005257600.0, "grad_norm": 1.6069534951859916, "language_loss": 0.73528969, "learning_rate": 1.4493035140546028e-06, "loss": 0.7600252, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20507812, "step": 9998, "time_per_iteration": 2.903951644897461 }, { "auxiliary_loss_clip": 0.0140993, "auxiliary_loss_mlp": 0.01036446, "balance_loss_clip": 1.24456048, "balance_loss_mlp": 1.01552463, "epoch": 0.6011724034270254, "flos": 25020996249600.0, "grad_norm": 1.5355229879150307, "language_loss": 0.72967315, "learning_rate": 1.448929117633027e-06, "loss": 0.75413692, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20910645, "step": 9999, "time_per_iteration": 2.8664000034332275 }, { "auxiliary_loss_clip": 0.01427171, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.25596333, "balance_loss_mlp": 1.018255, "epoch": 0.6012325266796934, "flos": 21807392751360.0, "grad_norm": 2.4302224172613904, "language_loss": 0.78661084, "learning_rate": 1.4485547421081142e-06, "loss": 0.81127167, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20666504, "step": 10000, "time_per_iteration": 2.8526506423950195 }, { "auxiliary_loss_clip": 0.01441936, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.26872432, "balance_loss_mlp": 1.01648593, "epoch": 0.6012926499323613, "flos": 19582400292480.0, "grad_norm": 2.3240031762050606, "language_loss": 0.78006399, "learning_rate": 1.4481803874940608e-06, "loss": 0.80486548, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 1.72851562, "router_z_loss_mlp": 0.21716309, "step": 10001, "time_per_iteration": 2.8284196853637695 }, { "auxiliary_loss_clip": 0.01428564, "auxiliary_loss_mlp": 0.01034147, "balance_loss_clip": 1.25706601, "balance_loss_mlp": 1.01208115, "epoch": 0.6013527731850293, "flos": 34874213673600.0, "grad_norm": 1.655743372298529, "language_loss": 0.59012389, "learning_rate": 1.4478060538050624e-06, "loss": 0.61475098, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.22045898, "step": 10002, "time_per_iteration": 2.9833273887634277 }, { "auxiliary_loss_clip": 0.01432154, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.262537, "balance_loss_mlp": 1.01501, "epoch": 0.6014128964376972, "flos": 23301634141440.0, "grad_norm": 1.6487940447877627, "language_loss": 0.78614652, "learning_rate": 1.447431741055314e-06, "loss": 0.81082332, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20532227, "step": 10003, "time_per_iteration": 2.8668084144592285 }, { "auxiliary_loss_clip": 0.01424772, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.25529838, "balance_loss_mlp": 1.01598454, "epoch": 0.6014730196903653, "flos": 24830107862400.0, "grad_norm": 2.030993092591522, "language_loss": 0.77980292, "learning_rate": 1.4470574492590091e-06, "loss": 0.80442667, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.21606445, "step": 10004, "time_per_iteration": 2.8453733921051025 }, { "auxiliary_loss_clip": 0.01427728, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.25920188, "balance_loss_mlp": 1.01454961, "epoch": 0.6015331429430332, "flos": 23122735626240.0, "grad_norm": 1.4849398613325957, "language_loss": 0.72943699, "learning_rate": 1.4466831784303408e-06, "loss": 0.75406647, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20678711, "step": 10005, "time_per_iteration": 2.8858256340026855 }, { "auxiliary_loss_clip": 0.01413422, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.25086534, "balance_loss_mlp": 1.01467323, "epoch": 0.6015932661957012, "flos": 19208903316480.0, "grad_norm": 1.9763752601409366, "language_loss": 0.76228809, "learning_rate": 1.4463089285835026e-06, "loss": 0.7867716, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20239258, "step": 10006, "time_per_iteration": 2.8283817768096924 }, { "auxiliary_loss_clip": 0.01424452, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.25552964, "balance_loss_mlp": 1.01582623, "epoch": 0.6016533894483691, "flos": 18122662702080.0, "grad_norm": 1.7974043117910554, "language_loss": 0.74869061, "learning_rate": 1.445934699732685e-06, "loss": 0.77330601, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21264648, "step": 10007, "time_per_iteration": 2.8655953407287598 }, { "auxiliary_loss_clip": 0.01411024, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.24469554, "balance_loss_mlp": 1.01590657, "epoch": 0.6017135127010371, "flos": 16225578443520.0, "grad_norm": 1.7033930072149734, "language_loss": 0.70636612, "learning_rate": 1.4455604918920785e-06, "loss": 0.73084235, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20690918, "step": 10008, "time_per_iteration": 2.954418182373047 }, { "auxiliary_loss_clip": 0.01422087, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.2548542, "balance_loss_mlp": 1.01626539, "epoch": 0.6017736359537051, "flos": 23455620754560.0, "grad_norm": 1.8151348355207302, "language_loss": 0.77385181, "learning_rate": 1.4451863050758748e-06, "loss": 0.79843903, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20349121, "step": 10009, "time_per_iteration": 2.9332690238952637 }, { "auxiliary_loss_clip": 0.01409721, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.24357104, "balance_loss_mlp": 1.01409936, "epoch": 0.601833759206373, "flos": 23524628353920.0, "grad_norm": 2.1600378512799905, "language_loss": 0.75031084, "learning_rate": 1.4448121392982608e-06, "loss": 0.7747575, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20837402, "step": 10010, "time_per_iteration": 4.3161962032318115 }, { "auxiliary_loss_clip": 0.0120699, "auxiliary_loss_mlp": 0.01042138, "balance_loss_clip": 1.113801, "balance_loss_mlp": 1.01638854, "epoch": 0.6018938824590411, "flos": 64026082114560.0, "grad_norm": 0.8370993538455997, "language_loss": 0.5512141, "learning_rate": 1.4444379945734268e-06, "loss": 0.57370543, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.2578125, "step": 10011, "time_per_iteration": 3.4164257049560547 }, { "auxiliary_loss_clip": 0.01421216, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 1.25369883, "balance_loss_mlp": 1.02213919, "epoch": 0.601954005711709, "flos": 34652893518720.0, "grad_norm": 1.3725490756580894, "language_loss": 0.62279308, "learning_rate": 1.44406387091556e-06, "loss": 0.64741462, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.18786621, "step": 10012, "time_per_iteration": 2.9786477088928223 }, { "auxiliary_loss_clip": 0.01422201, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.25699711, "balance_loss_mlp": 1.01539505, "epoch": 0.602014128964377, "flos": 19436422008960.0, "grad_norm": 2.2785517775468636, "language_loss": 0.75623471, "learning_rate": 1.4436897683388462e-06, "loss": 0.78080642, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19567871, "step": 10013, "time_per_iteration": 2.853588581085205 }, { "auxiliary_loss_clip": 0.01404946, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.24274027, "balance_loss_mlp": 1.0151794, "epoch": 0.6020742522170449, "flos": 28341006814080.0, "grad_norm": 1.5798438943810529, "language_loss": 0.81602323, "learning_rate": 1.4433156868574732e-06, "loss": 0.84041405, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.1895752, "step": 10014, "time_per_iteration": 2.911649703979492 }, { "auxiliary_loss_clip": 0.01405314, "auxiliary_loss_mlp": 0.01040445, "balance_loss_clip": 1.24314332, "balance_loss_mlp": 1.02057266, "epoch": 0.6021343754697129, "flos": 22757201735040.0, "grad_norm": 1.4345887813063787, "language_loss": 0.73009348, "learning_rate": 1.442941626485624e-06, "loss": 0.75455105, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19873047, "step": 10015, "time_per_iteration": 2.8793442249298096 }, { "auxiliary_loss_clip": 0.01201253, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.10958898, "balance_loss_mlp": 1.01183569, "epoch": 0.6021944987223808, "flos": 65779749060480.0, "grad_norm": 0.830439642638239, "language_loss": 0.54879618, "learning_rate": 1.4425675872374848e-06, "loss": 0.57117879, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.25195312, "step": 10016, "time_per_iteration": 3.2668616771698 }, { "auxiliary_loss_clip": 0.01411631, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.24611521, "balance_loss_mlp": 1.01883066, "epoch": 0.6022546219750489, "flos": 16113244308480.0, "grad_norm": 1.5000256431311787, "language_loss": 0.83156085, "learning_rate": 1.4421935691272381e-06, "loss": 0.85607761, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.21203613, "step": 10017, "time_per_iteration": 2.826338291168213 }, { "auxiliary_loss_clip": 0.01415046, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.25180674, "balance_loss_mlp": 1.02017188, "epoch": 0.6023147452277168, "flos": 25521513937920.0, "grad_norm": 1.7717220575553094, "language_loss": 0.84399569, "learning_rate": 1.4418195721690677e-06, "loss": 0.86855328, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20544434, "step": 10018, "time_per_iteration": 2.902188301086426 }, { "auxiliary_loss_clip": 0.01434097, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.26217449, "balance_loss_mlp": 1.02206612, "epoch": 0.6023748684803848, "flos": 22645817740800.0, "grad_norm": 1.6672475963439857, "language_loss": 0.78820264, "learning_rate": 1.4414455963771549e-06, "loss": 0.81296957, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.2052002, "step": 10019, "time_per_iteration": 2.8547585010528564 }, { "auxiliary_loss_clip": 0.01420536, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.25369751, "balance_loss_mlp": 1.01748621, "epoch": 0.6024349917330527, "flos": 26220792608640.0, "grad_norm": 1.7076671137202029, "language_loss": 0.74540508, "learning_rate": 1.441071641765681e-06, "loss": 0.76999569, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.21020508, "step": 10020, "time_per_iteration": 2.902641534805298 }, { "auxiliary_loss_clip": 0.01418487, "auxiliary_loss_mlp": 0.01044504, "balance_loss_clip": 1.24993134, "balance_loss_mlp": 1.02309418, "epoch": 0.6024951149857207, "flos": 21261693490560.0, "grad_norm": 1.6868826381474988, "language_loss": 0.64826679, "learning_rate": 1.4406977083488264e-06, "loss": 0.67289668, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21398926, "step": 10021, "time_per_iteration": 4.280750751495361 }, { "auxiliary_loss_clip": 0.01427319, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.25921917, "balance_loss_mlp": 1.01350021, "epoch": 0.6025552382383887, "flos": 26954846547840.0, "grad_norm": 1.4406051086451284, "language_loss": 0.80988163, "learning_rate": 1.4403237961407704e-06, "loss": 0.83449751, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20751953, "step": 10022, "time_per_iteration": 2.902776002883911 }, { "auxiliary_loss_clip": 0.01456946, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.28404737, "balance_loss_mlp": 1.01523542, "epoch": 0.6026153614910567, "flos": 31696426074240.0, "grad_norm": 1.4342665301888817, "language_loss": 0.67063439, "learning_rate": 1.439949905155693e-06, "loss": 0.69556606, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20983887, "step": 10023, "time_per_iteration": 4.515506029129028 }, { "auxiliary_loss_clip": 0.01430828, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.26156878, "balance_loss_mlp": 1.02047348, "epoch": 0.6026754847437247, "flos": 29324143232640.0, "grad_norm": 1.8015004174132123, "language_loss": 0.75147098, "learning_rate": 1.4395760354077707e-06, "loss": 0.77619338, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20947266, "step": 10024, "time_per_iteration": 2.918546676635742 }, { "auxiliary_loss_clip": 0.01423903, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.25720906, "balance_loss_mlp": 1.01847386, "epoch": 0.6027356079963926, "flos": 23597300782080.0, "grad_norm": 1.6272524232943106, "language_loss": 0.72961003, "learning_rate": 1.4392021869111815e-06, "loss": 0.75423598, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20214844, "step": 10025, "time_per_iteration": 2.8653087615966797 }, { "auxiliary_loss_clip": 0.01437668, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.26581359, "balance_loss_mlp": 1.01657116, "epoch": 0.6027957312490606, "flos": 20823532416000.0, "grad_norm": 2.167089353766132, "language_loss": 0.69048917, "learning_rate": 1.4388283596801016e-06, "loss": 0.71524286, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.21130371, "step": 10026, "time_per_iteration": 2.8980712890625 }, { "auxiliary_loss_clip": 0.0140845, "auxiliary_loss_mlp": 0.01039672, "balance_loss_clip": 1.24547172, "balance_loss_mlp": 1.01935875, "epoch": 0.6028558545017285, "flos": 19944721802880.0, "grad_norm": 1.937786198465296, "language_loss": 0.80917436, "learning_rate": 1.4384545537287061e-06, "loss": 0.8336556, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.203125, "step": 10027, "time_per_iteration": 2.8540515899658203 }, { "auxiliary_loss_clip": 0.01439578, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.26726723, "balance_loss_mlp": 1.01805687, "epoch": 0.6029159777543965, "flos": 22831231507200.0, "grad_norm": 4.27059145727633, "language_loss": 0.71485114, "learning_rate": 1.438080769071171e-06, "loss": 0.7396313, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20397949, "step": 10028, "time_per_iteration": 2.853317975997925 }, { "auxiliary_loss_clip": 0.01426167, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.25806558, "balance_loss_mlp": 1.01750851, "epoch": 0.6029761010070644, "flos": 23597888964480.0, "grad_norm": 1.8323808397172243, "language_loss": 0.85025918, "learning_rate": 1.437707005721669e-06, "loss": 0.87491453, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21850586, "step": 10029, "time_per_iteration": 2.889514446258545 }, { "auxiliary_loss_clip": 0.01407668, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.24416089, "balance_loss_mlp": 1.01711094, "epoch": 0.6030362242597325, "flos": 13670325054720.0, "grad_norm": 1.8719044439040922, "language_loss": 0.8099972, "learning_rate": 1.437333263694373e-06, "loss": 0.83445108, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20617676, "step": 10030, "time_per_iteration": 2.825075149536133 }, { "auxiliary_loss_clip": 0.01431516, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.26199794, "balance_loss_mlp": 1.01927245, "epoch": 0.6030963475124004, "flos": 24432830104320.0, "grad_norm": 1.72533821907604, "language_loss": 0.7169866, "learning_rate": 1.4369595430034572e-06, "loss": 0.74169964, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20507812, "step": 10031, "time_per_iteration": 2.8874855041503906 }, { "auxiliary_loss_clip": 0.01438356, "auxiliary_loss_mlp": 0.01038688, "balance_loss_clip": 1.26607192, "balance_loss_mlp": 1.01744461, "epoch": 0.6031564707650684, "flos": 29656304444160.0, "grad_norm": 1.807129412015419, "language_loss": 0.73945045, "learning_rate": 1.4365858436630912e-06, "loss": 0.76422083, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21252441, "step": 10032, "time_per_iteration": 2.9415769577026367 }, { "auxiliary_loss_clip": 0.01432795, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 1.26209974, "balance_loss_mlp": 1.01311946, "epoch": 0.6032165940177363, "flos": 16627697395200.0, "grad_norm": 1.80950428041311, "language_loss": 0.68766516, "learning_rate": 1.4362121656874465e-06, "loss": 0.71234053, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.21618652, "step": 10033, "time_per_iteration": 2.9123125076293945 }, { "auxiliary_loss_clip": 0.01416061, "auxiliary_loss_mlp": 0.01038528, "balance_loss_clip": 1.2507627, "balance_loss_mlp": 1.01684391, "epoch": 0.6032767172704043, "flos": 17495694501120.0, "grad_norm": 2.339841352605086, "language_loss": 0.76537716, "learning_rate": 1.4358385090906934e-06, "loss": 0.78992307, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.21691895, "step": 10034, "time_per_iteration": 2.835196018218994 }, { "auxiliary_loss_clip": 0.01438459, "auxiliary_loss_mlp": 0.0104348, "balance_loss_clip": 1.26775694, "balance_loss_mlp": 1.02211773, "epoch": 0.6033368405230723, "flos": 26844050736000.0, "grad_norm": 1.8636821281588323, "language_loss": 0.7474668, "learning_rate": 1.4354648738870004e-06, "loss": 0.77228618, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.21374512, "step": 10035, "time_per_iteration": 2.931239128112793 }, { "auxiliary_loss_clip": 0.01414545, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.25044179, "balance_loss_mlp": 1.01255465, "epoch": 0.6033969637757403, "flos": 16918703821440.0, "grad_norm": 1.5784937972275932, "language_loss": 0.87241161, "learning_rate": 1.435091260090536e-06, "loss": 0.89689684, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2142334, "step": 10036, "time_per_iteration": 2.8254477977752686 }, { "auxiliary_loss_clip": 0.01432565, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.26205945, "balance_loss_mlp": 1.0160439, "epoch": 0.6034570870284083, "flos": 22940443751040.0, "grad_norm": 1.94296253725499, "language_loss": 0.70504129, "learning_rate": 1.4347176677154676e-06, "loss": 0.72974378, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.21618652, "step": 10037, "time_per_iteration": 2.88252592086792 }, { "auxiliary_loss_clip": 0.01429115, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.263008, "balance_loss_mlp": 1.01268423, "epoch": 0.6035172102810762, "flos": 23376387830400.0, "grad_norm": 2.173334094124257, "language_loss": 0.85695803, "learning_rate": 1.4343440967759616e-06, "loss": 0.88159072, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.21459961, "step": 10038, "time_per_iteration": 2.884089231491089 }, { "auxiliary_loss_clip": 0.01422272, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.25369859, "balance_loss_mlp": 1.01565194, "epoch": 0.6035773335337442, "flos": 20896974005760.0, "grad_norm": 1.9747241953360997, "language_loss": 0.77625299, "learning_rate": 1.4339705472861846e-06, "loss": 0.8008374, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20507812, "step": 10039, "time_per_iteration": 2.834707498550415 }, { "auxiliary_loss_clip": 0.01411841, "auxiliary_loss_mlp": 0.01032251, "balance_loss_clip": 1.24633312, "balance_loss_mlp": 1.01160407, "epoch": 0.6036374567864121, "flos": 24947056967040.0, "grad_norm": 1.5703846065339016, "language_loss": 0.72170699, "learning_rate": 1.433597019260301e-06, "loss": 0.74614787, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20654297, "step": 10040, "time_per_iteration": 2.8654778003692627 }, { "auxiliary_loss_clip": 0.0143412, "auxiliary_loss_mlp": 0.01039937, "balance_loss_clip": 1.26354539, "balance_loss_mlp": 1.01800275, "epoch": 0.6036975800390801, "flos": 23158506280320.0, "grad_norm": 1.9482389368463588, "language_loss": 0.79184294, "learning_rate": 1.433223512712475e-06, "loss": 0.81658351, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.21948242, "step": 10041, "time_per_iteration": 2.8346941471099854 }, { "auxiliary_loss_clip": 0.01423414, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.25762081, "balance_loss_mlp": 1.01430404, "epoch": 0.603757703291748, "flos": 18669855041280.0, "grad_norm": 1.7299218145969342, "language_loss": 0.76306069, "learning_rate": 1.4328500276568704e-06, "loss": 0.78765035, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.21252441, "step": 10042, "time_per_iteration": 2.80134654045105 }, { "auxiliary_loss_clip": 0.0141882, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.25350654, "balance_loss_mlp": 1.01612163, "epoch": 0.6038178265444161, "flos": 19692064984320.0, "grad_norm": 1.7169004632223308, "language_loss": 0.85350287, "learning_rate": 1.4324765641076498e-06, "loss": 0.8780604, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20812988, "step": 10043, "time_per_iteration": 2.836409330368042 }, { "auxiliary_loss_clip": 0.01434092, "auxiliary_loss_mlp": 0.01041229, "balance_loss_clip": 1.26326048, "balance_loss_mlp": 1.02017665, "epoch": 0.603877949797084, "flos": 22648532428800.0, "grad_norm": 2.70247007563294, "language_loss": 0.6977663, "learning_rate": 1.432103122078974e-06, "loss": 0.72251958, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21044922, "step": 10044, "time_per_iteration": 2.8495242595672607 }, { "auxiliary_loss_clip": 0.01432094, "auxiliary_loss_mlp": 0.01033992, "balance_loss_clip": 1.26361656, "balance_loss_mlp": 1.01292801, "epoch": 0.603938073049752, "flos": 25458976344960.0, "grad_norm": 1.536693063233035, "language_loss": 0.78590083, "learning_rate": 1.4317297015850057e-06, "loss": 0.81056172, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21057129, "step": 10045, "time_per_iteration": 4.283295154571533 }, { "auxiliary_loss_clip": 0.01413267, "auxiliary_loss_mlp": 0.01037143, "balance_loss_clip": 1.24851012, "balance_loss_mlp": 1.01640093, "epoch": 0.6039981963024199, "flos": 22348748511360.0, "grad_norm": 2.3457492413190963, "language_loss": 0.78150034, "learning_rate": 1.4313563026399036e-06, "loss": 0.80600446, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20739746, "step": 10046, "time_per_iteration": 2.838019609451294 }, { "auxiliary_loss_clip": 0.0142888, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.26023483, "balance_loss_mlp": 1.01533306, "epoch": 0.6040583195550879, "flos": 20712600869760.0, "grad_norm": 1.6496707273121405, "language_loss": 0.87825215, "learning_rate": 1.430982925257827e-06, "loss": 0.90289265, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19836426, "step": 10047, "time_per_iteration": 2.8313379287719727 }, { "auxiliary_loss_clip": 0.01420387, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.25710416, "balance_loss_mlp": 1.01431262, "epoch": 0.604118442807756, "flos": 27174764113920.0, "grad_norm": 2.445771342892238, "language_loss": 0.76390481, "learning_rate": 1.4306095694529358e-06, "loss": 0.78844631, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19470215, "step": 10048, "time_per_iteration": 2.8957207202911377 }, { "auxiliary_loss_clip": 0.01446282, "auxiliary_loss_mlp": 0.0104116, "balance_loss_clip": 1.27181101, "balance_loss_mlp": 1.018677, "epoch": 0.6041785660604239, "flos": 30893319290880.0, "grad_norm": 1.9212023163638519, "language_loss": 0.66783249, "learning_rate": 1.430236235239386e-06, "loss": 0.69270688, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.22509766, "step": 10049, "time_per_iteration": 2.918947219848633 }, { "auxiliary_loss_clip": 0.0143711, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.26982307, "balance_loss_mlp": 1.01991415, "epoch": 0.6042386893130919, "flos": 19947798449280.0, "grad_norm": 1.70742961389772, "language_loss": 0.67220199, "learning_rate": 1.429862922631336e-06, "loss": 0.6969763, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20397949, "step": 10050, "time_per_iteration": 2.8502004146575928 }, { "auxiliary_loss_clip": 0.0143161, "auxiliary_loss_mlp": 0.01038585, "balance_loss_clip": 1.26491499, "balance_loss_mlp": 1.01783037, "epoch": 0.6042988125657598, "flos": 32428624976640.0, "grad_norm": 1.7445249806418974, "language_loss": 0.70434201, "learning_rate": 1.4294896316429408e-06, "loss": 0.72904396, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20751953, "step": 10051, "time_per_iteration": 3.060915470123291 }, { "auxiliary_loss_clip": 0.01413615, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.24801457, "balance_loss_mlp": 1.01614523, "epoch": 0.6043589358184278, "flos": 17429763548160.0, "grad_norm": 2.316501530028133, "language_loss": 0.65810746, "learning_rate": 1.4291163622883553e-06, "loss": 0.68261755, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.21228027, "step": 10052, "time_per_iteration": 2.8138670921325684 }, { "auxiliary_loss_clip": 0.01426016, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.25756741, "balance_loss_mlp": 1.0151968, "epoch": 0.6044190590710957, "flos": 27684330762240.0, "grad_norm": 2.1083682395432723, "language_loss": 0.69293624, "learning_rate": 1.4287431145817358e-06, "loss": 0.71755272, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.2043457, "step": 10053, "time_per_iteration": 2.8695178031921387 }, { "auxiliary_loss_clip": 0.0121152, "auxiliary_loss_mlp": 0.01042066, "balance_loss_clip": 1.11764359, "balance_loss_mlp": 1.02041721, "epoch": 0.6044791823237637, "flos": 65344438408320.0, "grad_norm": 0.7289237466168756, "language_loss": 0.60479277, "learning_rate": 1.4283698885372336e-06, "loss": 0.62732863, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.21679688, "step": 10054, "time_per_iteration": 3.5018398761749268 }, { "auxiliary_loss_clip": 0.01402784, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.24033308, "balance_loss_mlp": 1.01323581, "epoch": 0.6045393055764317, "flos": 24501430500480.0, "grad_norm": 1.9812865588149222, "language_loss": 0.86460638, "learning_rate": 1.4279966841690027e-06, "loss": 0.88898921, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.22253418, "step": 10055, "time_per_iteration": 2.8799567222595215 }, { "auxiliary_loss_clip": 0.01425778, "auxiliary_loss_mlp": 0.01039187, "balance_loss_clip": 1.25918603, "balance_loss_mlp": 1.01799154, "epoch": 0.6045994288290997, "flos": 19061567688960.0, "grad_norm": 3.7606989623449585, "language_loss": 0.74579144, "learning_rate": 1.4276235014911952e-06, "loss": 0.77044111, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.21191406, "step": 10056, "time_per_iteration": 4.288909435272217 }, { "auxiliary_loss_clip": 0.01418155, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.25586152, "balance_loss_mlp": 1.01721585, "epoch": 0.6046595520817676, "flos": 26587502864640.0, "grad_norm": 1.7104904147974485, "language_loss": 0.80948287, "learning_rate": 1.4272503405179616e-06, "loss": 0.83404005, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20324707, "step": 10057, "time_per_iteration": 2.992302656173706 }, { "auxiliary_loss_clip": 0.01419503, "auxiliary_loss_mlp": 0.01042197, "balance_loss_clip": 1.25539827, "balance_loss_mlp": 1.01977324, "epoch": 0.6047196753344356, "flos": 13588286952960.0, "grad_norm": 2.1697783373570605, "language_loss": 0.75311887, "learning_rate": 1.4268772012634527e-06, "loss": 0.77773589, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2244873, "step": 10058, "time_per_iteration": 4.287184238433838 }, { "auxiliary_loss_clip": 0.0141695, "auxiliary_loss_mlp": 0.01038849, "balance_loss_clip": 1.25332999, "balance_loss_mlp": 1.01829743, "epoch": 0.6047797985871035, "flos": 25531467793920.0, "grad_norm": 1.8619015183490688, "language_loss": 0.72467053, "learning_rate": 1.4265040837418176e-06, "loss": 0.74922848, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20556641, "step": 10059, "time_per_iteration": 2.87035870552063 }, { "auxiliary_loss_clip": 0.01424132, "auxiliary_loss_mlp": 0.01035595, "balance_loss_clip": 1.25684309, "balance_loss_mlp": 1.01445901, "epoch": 0.6048399218397715, "flos": 20529404098560.0, "grad_norm": 1.46962679298768, "language_loss": 0.76864612, "learning_rate": 1.4261309879672054e-06, "loss": 0.79324335, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21130371, "step": 10060, "time_per_iteration": 2.8537654876708984 }, { "auxiliary_loss_clip": 0.01424269, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.25785184, "balance_loss_mlp": 1.01483774, "epoch": 0.6049000450924396, "flos": 20417522411520.0, "grad_norm": 1.9808512718605469, "language_loss": 0.74630964, "learning_rate": 1.4257579139537628e-06, "loss": 0.77090657, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20581055, "step": 10061, "time_per_iteration": 2.824930191040039 }, { "auxiliary_loss_clip": 0.0142856, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.26022577, "balance_loss_mlp": 1.01585555, "epoch": 0.6049601683451075, "flos": 20751267191040.0, "grad_norm": 1.7152994558422956, "language_loss": 0.68238109, "learning_rate": 1.425384861715639e-06, "loss": 0.70702732, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20227051, "step": 10062, "time_per_iteration": 2.8743603229522705 }, { "auxiliary_loss_clip": 0.01405901, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.24253285, "balance_loss_mlp": 1.0157969, "epoch": 0.6050202915977755, "flos": 20092464633600.0, "grad_norm": 1.9097120412926312, "language_loss": 0.72187734, "learning_rate": 1.425011831266978e-06, "loss": 0.74630713, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.21276855, "step": 10063, "time_per_iteration": 2.841268539428711 }, { "auxiliary_loss_clip": 0.01420707, "auxiliary_loss_mlp": 0.01041058, "balance_loss_clip": 1.25583065, "balance_loss_mlp": 1.02029192, "epoch": 0.6050804148504434, "flos": 15969256796160.0, "grad_norm": 1.8103612584916575, "language_loss": 0.85041749, "learning_rate": 1.424638822621926e-06, "loss": 0.87503517, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.2076416, "step": 10064, "time_per_iteration": 2.8366026878356934 }, { "auxiliary_loss_clip": 0.01418349, "auxiliary_loss_mlp": 0.01040687, "balance_loss_clip": 1.25274086, "balance_loss_mlp": 1.01924086, "epoch": 0.6051405381031114, "flos": 17465443712640.0, "grad_norm": 2.495623206344067, "language_loss": 0.80898535, "learning_rate": 1.4242658357946278e-06, "loss": 0.83357573, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.21447754, "step": 10065, "time_per_iteration": 2.8143727779388428 }, { "auxiliary_loss_clip": 0.01445837, "auxiliary_loss_mlp": 0.01034288, "balance_loss_clip": 1.27409029, "balance_loss_mlp": 1.01327133, "epoch": 0.6052006613557793, "flos": 11407390191360.0, "grad_norm": 1.9503287349897236, "language_loss": 0.794447, "learning_rate": 1.423892870799226e-06, "loss": 0.81924832, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21020508, "step": 10066, "time_per_iteration": 2.8488786220550537 }, { "auxiliary_loss_clip": 0.01424126, "auxiliary_loss_mlp": 0.01035229, "balance_loss_clip": 1.25834405, "balance_loss_mlp": 1.01492715, "epoch": 0.6052607846084473, "flos": 24760964528640.0, "grad_norm": 1.7149704471830003, "language_loss": 0.74137741, "learning_rate": 1.4235199276498655e-06, "loss": 0.76597095, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.203125, "step": 10067, "time_per_iteration": 2.9268276691436768 }, { "auxiliary_loss_clip": 0.01430357, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.26474214, "balance_loss_mlp": 1.01492214, "epoch": 0.6053209078611153, "flos": 20750995722240.0, "grad_norm": 1.3650825447048458, "language_loss": 0.69389862, "learning_rate": 1.4231470063606863e-06, "loss": 0.7185573, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20581055, "step": 10068, "time_per_iteration": 2.955702781677246 }, { "auxiliary_loss_clip": 0.01420067, "auxiliary_loss_mlp": 0.01035596, "balance_loss_clip": 1.25137091, "balance_loss_mlp": 1.01589084, "epoch": 0.6053810311137833, "flos": 18962490280320.0, "grad_norm": 2.244835546165123, "language_loss": 0.87588, "learning_rate": 1.4227741069458303e-06, "loss": 0.90043664, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19714355, "step": 10069, "time_per_iteration": 2.826185703277588 }, { "auxiliary_loss_clip": 0.01422577, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.25720406, "balance_loss_mlp": 1.01484823, "epoch": 0.6054411543664512, "flos": 23961613063680.0, "grad_norm": 1.480612070665327, "language_loss": 0.83948439, "learning_rate": 1.4224012294194387e-06, "loss": 0.86405623, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19750977, "step": 10070, "time_per_iteration": 2.8723835945129395 }, { "auxiliary_loss_clip": 0.01430925, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.26206672, "balance_loss_mlp": 1.01764894, "epoch": 0.6055012776191192, "flos": 20603479115520.0, "grad_norm": 1.5255306272332645, "language_loss": 0.8683579, "learning_rate": 1.4220283737956496e-06, "loss": 0.89305186, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20800781, "step": 10071, "time_per_iteration": 2.856146812438965 }, { "auxiliary_loss_clip": 0.01436008, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.26582587, "balance_loss_mlp": 1.01502442, "epoch": 0.6055614008717871, "flos": 30309949094400.0, "grad_norm": 3.0656283922975387, "language_loss": 0.77812189, "learning_rate": 1.421655540088603e-06, "loss": 0.80283022, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.19812012, "step": 10072, "time_per_iteration": 2.9437766075134277 }, { "auxiliary_loss_clip": 0.01431841, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.26262164, "balance_loss_mlp": 1.01330948, "epoch": 0.6056215241244551, "flos": 27136233527040.0, "grad_norm": 1.6561982527122652, "language_loss": 0.74793661, "learning_rate": 1.4212827283124367e-06, "loss": 0.77260184, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21362305, "step": 10073, "time_per_iteration": 2.893334150314331 }, { "auxiliary_loss_clip": 0.0121335, "auxiliary_loss_mlp": 0.01054855, "balance_loss_clip": 1.11893952, "balance_loss_mlp": 1.03358769, "epoch": 0.6056816473771232, "flos": 56031762337920.0, "grad_norm": 0.7692476441483127, "language_loss": 0.5518555, "learning_rate": 1.4209099384812863e-06, "loss": 0.57453752, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.21289062, "step": 10074, "time_per_iteration": 3.396829128265381 }, { "auxiliary_loss_clip": 0.01427518, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.26275468, "balance_loss_mlp": 1.0139395, "epoch": 0.6057417706297911, "flos": 23559810825600.0, "grad_norm": 1.6271795752035823, "language_loss": 0.82043785, "learning_rate": 1.4205371706092894e-06, "loss": 0.84504604, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19348145, "step": 10075, "time_per_iteration": 2.867440938949585 }, { "auxiliary_loss_clip": 0.01427238, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.25907898, "balance_loss_mlp": 1.01170087, "epoch": 0.6058018938824591, "flos": 27755419622400.0, "grad_norm": 3.074878958282947, "language_loss": 0.78654784, "learning_rate": 1.4201644247105813e-06, "loss": 0.81113642, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19921875, "step": 10076, "time_per_iteration": 2.9752860069274902 }, { "auxiliary_loss_clip": 0.01425817, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.2562921, "balance_loss_mlp": 1.01914024, "epoch": 0.605862017135127, "flos": 22793515326720.0, "grad_norm": 1.828903986489873, "language_loss": 0.73537314, "learning_rate": 1.4197917007992964e-06, "loss": 0.7600168, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.1940918, "step": 10077, "time_per_iteration": 2.9972007274627686 }, { "auxiliary_loss_clip": 0.01433721, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.26625001, "balance_loss_mlp": 1.01487875, "epoch": 0.605922140387795, "flos": 21224882206080.0, "grad_norm": 1.6302412032632612, "language_loss": 0.56173289, "learning_rate": 1.4194189988895682e-06, "loss": 0.58641607, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19726562, "step": 10078, "time_per_iteration": 2.89125394821167 }, { "auxiliary_loss_clip": 0.01433159, "auxiliary_loss_mlp": 0.01034696, "balance_loss_clip": 1.26288629, "balance_loss_mlp": 1.0148592, "epoch": 0.6059822636404629, "flos": 27278954184960.0, "grad_norm": 1.5224686474927496, "language_loss": 0.70931816, "learning_rate": 1.4190463189955297e-06, "loss": 0.73399675, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.19848633, "step": 10079, "time_per_iteration": 4.323460578918457 }, { "auxiliary_loss_clip": 0.01423629, "auxiliary_loss_mlp": 0.01036421, "balance_loss_clip": 1.2572844, "balance_loss_mlp": 1.01615584, "epoch": 0.606042386893131, "flos": 20641059561600.0, "grad_norm": 2.250304284406604, "language_loss": 0.633219, "learning_rate": 1.4186736611313131e-06, "loss": 0.65781951, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20251465, "step": 10080, "time_per_iteration": 2.842203378677368 }, { "auxiliary_loss_clip": 0.0142592, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.25837922, "balance_loss_mlp": 1.01591969, "epoch": 0.6061025101457989, "flos": 23012482752000.0, "grad_norm": 1.658797494600291, "language_loss": 0.71906775, "learning_rate": 1.4183010253110492e-06, "loss": 0.743689, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20288086, "step": 10081, "time_per_iteration": 2.854065418243408 }, { "auxiliary_loss_clip": 0.01431662, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.26396298, "balance_loss_mlp": 1.01717186, "epoch": 0.6061626333984669, "flos": 29911133013120.0, "grad_norm": 1.6593466785502682, "language_loss": 0.69647634, "learning_rate": 1.4179284115488691e-06, "loss": 0.72116518, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20056152, "step": 10082, "time_per_iteration": 2.954836368560791 }, { "auxiliary_loss_clip": 0.01434146, "auxiliary_loss_mlp": 0.01033681, "balance_loss_clip": 1.26620007, "balance_loss_mlp": 1.01440525, "epoch": 0.6062227566511348, "flos": 25019955619200.0, "grad_norm": 2.907212178470548, "language_loss": 0.66490144, "learning_rate": 1.4175558198589015e-06, "loss": 0.68957967, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.19287109, "step": 10083, "time_per_iteration": 2.874065399169922 }, { "auxiliary_loss_clip": 0.01443921, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.27439213, "balance_loss_mlp": 1.01748657, "epoch": 0.6062828799038028, "flos": 19473414272640.0, "grad_norm": 1.8323303789221732, "language_loss": 0.74760342, "learning_rate": 1.4171832502552764e-06, "loss": 0.77241039, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.19274902, "step": 10084, "time_per_iteration": 2.840383768081665 }, { "auxiliary_loss_clip": 0.01417908, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.25159919, "balance_loss_mlp": 1.01443076, "epoch": 0.6063430031564707, "flos": 13597471647360.0, "grad_norm": 2.461807128961348, "language_loss": 0.7396906, "learning_rate": 1.4168107027521204e-06, "loss": 0.76422024, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20629883, "step": 10085, "time_per_iteration": 2.812438488006592 }, { "auxiliary_loss_clip": 0.01423043, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.25747979, "balance_loss_mlp": 1.0147686, "epoch": 0.6064031264091387, "flos": 23265184815360.0, "grad_norm": 2.2003983314769933, "language_loss": 0.76999092, "learning_rate": 1.4164381773635605e-06, "loss": 0.79455817, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.18896484, "step": 10086, "time_per_iteration": 2.8749682903289795 }, { "auxiliary_loss_clip": 0.01418508, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.25351739, "balance_loss_mlp": 1.01681519, "epoch": 0.6064632496618068, "flos": 22469407689600.0, "grad_norm": 1.3531479301470306, "language_loss": 0.73350137, "learning_rate": 1.4160656741037246e-06, "loss": 0.75804877, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19421387, "step": 10087, "time_per_iteration": 2.877917528152466 }, { "auxiliary_loss_clip": 0.01422341, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.25856662, "balance_loss_mlp": 1.01502752, "epoch": 0.6065233729144747, "flos": 25129439331840.0, "grad_norm": 1.6036406518268422, "language_loss": 0.84146088, "learning_rate": 1.4156931929867355e-06, "loss": 0.86601877, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.1842041, "step": 10088, "time_per_iteration": 2.8852734565734863 }, { "auxiliary_loss_clip": 0.0142134, "auxiliary_loss_mlp": 0.01035128, "balance_loss_clip": 1.25667977, "balance_loss_mlp": 1.01482701, "epoch": 0.6065834961671427, "flos": 23487862314240.0, "grad_norm": 2.1816153465833095, "language_loss": 0.72101092, "learning_rate": 1.4153207340267201e-06, "loss": 0.74557561, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20300293, "step": 10089, "time_per_iteration": 2.8861777782440186 }, { "auxiliary_loss_clip": 0.01426726, "auxiliary_loss_mlp": 0.01034734, "balance_loss_clip": 1.2606318, "balance_loss_mlp": 1.01500452, "epoch": 0.6066436194198106, "flos": 17028232778880.0, "grad_norm": 2.9726808424664255, "language_loss": 0.83606994, "learning_rate": 1.4149482972378009e-06, "loss": 0.86068451, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19726562, "step": 10090, "time_per_iteration": 2.8743653297424316 }, { "auxiliary_loss_clip": 0.01453826, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.27800465, "balance_loss_mlp": 1.01790667, "epoch": 0.6067037426724786, "flos": 18523831512960.0, "grad_norm": 2.2801283107408, "language_loss": 0.76909328, "learning_rate": 1.4145758826341e-06, "loss": 0.7940166, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.20593262, "step": 10091, "time_per_iteration": 4.320406198501587 }, { "auxiliary_loss_clip": 0.01425417, "auxiliary_loss_mlp": 0.01035828, "balance_loss_clip": 1.26072383, "balance_loss_mlp": 1.01574111, "epoch": 0.6067638659251465, "flos": 22356123413760.0, "grad_norm": 1.6586412268259023, "language_loss": 0.80166662, "learning_rate": 1.4142034902297415e-06, "loss": 0.8262791, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20092773, "step": 10092, "time_per_iteration": 4.348551273345947 }, { "auxiliary_loss_clip": 0.01418786, "auxiliary_loss_mlp": 0.01039463, "balance_loss_clip": 1.25190687, "balance_loss_mlp": 1.0201993, "epoch": 0.6068239891778145, "flos": 12456683786880.0, "grad_norm": 6.1082751585282296, "language_loss": 0.76473641, "learning_rate": 1.4138311200388444e-06, "loss": 0.78931892, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19287109, "step": 10093, "time_per_iteration": 2.9546127319335938 }, { "auxiliary_loss_clip": 0.01414271, "auxiliary_loss_mlp": 0.0103313, "balance_loss_clip": 1.25149226, "balance_loss_mlp": 1.01315022, "epoch": 0.6068841124304825, "flos": 23196403440000.0, "grad_norm": 2.003270783522232, "language_loss": 0.8786301, "learning_rate": 1.4134587720755304e-06, "loss": 0.90310413, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1998291, "step": 10094, "time_per_iteration": 2.8740999698638916 }, { "auxiliary_loss_clip": 0.01423226, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.25848532, "balance_loss_mlp": 1.01197541, "epoch": 0.6069442356831505, "flos": 18597046878720.0, "grad_norm": 1.568352499144357, "language_loss": 0.73019326, "learning_rate": 1.413086446353919e-06, "loss": 0.75475109, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20593262, "step": 10095, "time_per_iteration": 2.8532907962799072 }, { "auxiliary_loss_clip": 0.01421314, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.25517046, "balance_loss_mlp": 1.01491046, "epoch": 0.6070043589358184, "flos": 20970325105920.0, "grad_norm": 2.1264768339000786, "language_loss": 0.7763626, "learning_rate": 1.4127141428881273e-06, "loss": 0.8009181, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19311523, "step": 10096, "time_per_iteration": 2.912102460861206 }, { "auxiliary_loss_clip": 0.01431138, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.26334691, "balance_loss_mlp": 1.01920271, "epoch": 0.6070644821884864, "flos": 11699889696000.0, "grad_norm": 1.8822534594187885, "language_loss": 0.80851829, "learning_rate": 1.4123418616922749e-06, "loss": 0.83321917, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19726562, "step": 10097, "time_per_iteration": 2.836203098297119 }, { "auxiliary_loss_clip": 0.01410907, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.24871349, "balance_loss_mlp": 1.01252639, "epoch": 0.6071246054411543, "flos": 19318070315520.0, "grad_norm": 1.5300611884914288, "language_loss": 0.68417788, "learning_rate": 1.411969602780478e-06, "loss": 0.70859975, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1875, "step": 10098, "time_per_iteration": 2.8667104244232178 }, { "auxiliary_loss_clip": 0.01416392, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.25212479, "balance_loss_mlp": 1.01499414, "epoch": 0.6071847286938223, "flos": 17758033706880.0, "grad_norm": 2.005935836725218, "language_loss": 0.81232518, "learning_rate": 1.4115973661668523e-06, "loss": 0.83683175, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19274902, "step": 10099, "time_per_iteration": 2.8730127811431885 }, { "auxiliary_loss_clip": 0.01432071, "auxiliary_loss_mlp": 0.01038812, "balance_loss_clip": 1.26165271, "balance_loss_mlp": 1.0184629, "epoch": 0.6072448519464904, "flos": 22647627532800.0, "grad_norm": 1.7395607239957527, "language_loss": 0.71509218, "learning_rate": 1.4112251518655133e-06, "loss": 0.73980093, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20349121, "step": 10100, "time_per_iteration": 2.8692426681518555 }, { "auxiliary_loss_clip": 0.01435334, "auxiliary_loss_mlp": 0.01038776, "balance_loss_clip": 1.26857233, "balance_loss_mlp": 1.01793778, "epoch": 0.6073049751991583, "flos": 19546901107200.0, "grad_norm": 2.133749033704126, "language_loss": 0.71812844, "learning_rate": 1.4108529598905764e-06, "loss": 0.7428695, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20849609, "step": 10101, "time_per_iteration": 2.8186120986938477 }, { "auxiliary_loss_clip": 0.01422228, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.25703263, "balance_loss_mlp": 1.01404774, "epoch": 0.6073650984518263, "flos": 28306186300800.0, "grad_norm": 1.8713904567450006, "language_loss": 0.69684887, "learning_rate": 1.410480790256154e-06, "loss": 0.72141027, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19873047, "step": 10102, "time_per_iteration": 2.9369587898254395 }, { "auxiliary_loss_clip": 0.01434461, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.26597905, "balance_loss_mlp": 1.01256311, "epoch": 0.6074252217044942, "flos": 25674957613440.0, "grad_norm": 2.066322947459648, "language_loss": 0.7453438, "learning_rate": 1.4101086429763589e-06, "loss": 0.77001339, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19934082, "step": 10103, "time_per_iteration": 2.8683276176452637 }, { "auxiliary_loss_clip": 0.01454796, "auxiliary_loss_mlp": 0.01039198, "balance_loss_clip": 1.28114152, "balance_loss_mlp": 1.01950431, "epoch": 0.6074853449571622, "flos": 22867454609280.0, "grad_norm": 1.5630486315383372, "language_loss": 0.77085108, "learning_rate": 1.4097365180653032e-06, "loss": 0.79579103, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.19702148, "step": 10104, "time_per_iteration": 2.8466057777404785 }, { "auxiliary_loss_clip": 0.0121091, "auxiliary_loss_mlp": 0.01055487, "balance_loss_clip": 1.11896706, "balance_loss_mlp": 1.03746212, "epoch": 0.6075454682098301, "flos": 67141947565440.0, "grad_norm": 0.7236744207171859, "language_loss": 0.56102598, "learning_rate": 1.4093644155370977e-06, "loss": 0.58368993, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.18066406, "step": 10105, "time_per_iteration": 3.4074668884277344 }, { "auxiliary_loss_clip": 0.0121107, "auxiliary_loss_mlp": 0.01064982, "balance_loss_clip": 1.12093854, "balance_loss_mlp": 1.04600394, "epoch": 0.6076055914624982, "flos": 70740522236160.0, "grad_norm": 0.7831622439921496, "language_loss": 0.56858915, "learning_rate": 1.4089923354058533e-06, "loss": 0.59134966, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.18945312, "step": 10106, "time_per_iteration": 3.3174498081207275 }, { "auxiliary_loss_clip": 0.01418586, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.25406075, "balance_loss_mlp": 1.01484179, "epoch": 0.6076657147151661, "flos": 28375827327360.0, "grad_norm": 1.756896408627934, "language_loss": 0.69618869, "learning_rate": 1.4086202776856784e-06, "loss": 0.72071189, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18896484, "step": 10107, "time_per_iteration": 2.912600040435791 }, { "auxiliary_loss_clip": 0.01441758, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.27101731, "balance_loss_mlp": 1.01113355, "epoch": 0.6077258379678341, "flos": 15058249868160.0, "grad_norm": 1.6613806020027926, "language_loss": 0.81354719, "learning_rate": 1.4082482423906815e-06, "loss": 0.83826482, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.18884277, "step": 10108, "time_per_iteration": 2.87605881690979 }, { "auxiliary_loss_clip": 0.0142944, "auxiliary_loss_mlp": 0.01036638, "balance_loss_clip": 1.25945449, "balance_loss_mlp": 1.01630068, "epoch": 0.607785961220502, "flos": 36179195489280.0, "grad_norm": 1.8958697554232684, "language_loss": 0.71926618, "learning_rate": 1.4078762295349714e-06, "loss": 0.743927, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.20336914, "step": 10109, "time_per_iteration": 2.9811882972717285 }, { "auxiliary_loss_clip": 0.01404446, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.2435993, "balance_loss_mlp": 1.01354122, "epoch": 0.60784608447317, "flos": 22533347871360.0, "grad_norm": 1.6361417775256606, "language_loss": 0.80901331, "learning_rate": 1.407504239132653e-06, "loss": 0.83338726, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19396973, "step": 10110, "time_per_iteration": 2.875927209854126 }, { "auxiliary_loss_clip": 0.01423465, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.25600863, "balance_loss_mlp": 1.01494479, "epoch": 0.6079062077258379, "flos": 23851495923840.0, "grad_norm": 2.678791910701082, "language_loss": 0.72114229, "learning_rate": 1.4071322711978338e-06, "loss": 0.74572015, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19360352, "step": 10111, "time_per_iteration": 2.8420586585998535 }, { "auxiliary_loss_clip": 0.01431525, "auxiliary_loss_mlp": 0.01038538, "balance_loss_clip": 1.26159692, "balance_loss_mlp": 1.01787901, "epoch": 0.6079663309785059, "flos": 23377292726400.0, "grad_norm": 2.7334102668644933, "language_loss": 0.65331966, "learning_rate": 1.4067603257446186e-06, "loss": 0.6780203, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20654297, "step": 10112, "time_per_iteration": 2.8827414512634277 }, { "auxiliary_loss_clip": 0.01203155, "auxiliary_loss_mlp": 0.01023114, "balance_loss_clip": 1.11422968, "balance_loss_mlp": 1.00671101, "epoch": 0.6080264542311739, "flos": 71415205718400.0, "grad_norm": 0.6509523165913869, "language_loss": 0.49565384, "learning_rate": 1.4063884027871105e-06, "loss": 0.51791656, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.1640625, "step": 10113, "time_per_iteration": 3.4065239429473877 }, { "auxiliary_loss_clip": 0.01205416, "auxiliary_loss_mlp": 0.01022868, "balance_loss_clip": 1.11458111, "balance_loss_mlp": 1.0025543, "epoch": 0.6080865774838419, "flos": 66560794364160.0, "grad_norm": 0.8449235158214998, "language_loss": 0.57009709, "learning_rate": 1.4060165023394147e-06, "loss": 0.59237993, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.203125, "step": 10114, "time_per_iteration": 4.616581678390503 }, { "auxiliary_loss_clip": 0.01430315, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.26233912, "balance_loss_mlp": 1.01226521, "epoch": 0.6081467007365099, "flos": 19217183114880.0, "grad_norm": 2.0633522053060984, "language_loss": 0.71682966, "learning_rate": 1.4056446244156317e-06, "loss": 0.74145007, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19470215, "step": 10115, "time_per_iteration": 2.87162709236145 }, { "auxiliary_loss_clip": 0.01425477, "auxiliary_loss_mlp": 0.0103748, "balance_loss_clip": 1.25943995, "balance_loss_mlp": 1.01759553, "epoch": 0.6082068239891778, "flos": 24177684821760.0, "grad_norm": 1.616913455947331, "language_loss": 0.73286021, "learning_rate": 1.4052727690298642e-06, "loss": 0.7574898, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19873047, "step": 10116, "time_per_iteration": 2.937653064727783 }, { "auxiliary_loss_clip": 0.01438173, "auxiliary_loss_mlp": 0.01041775, "balance_loss_clip": 1.26838684, "balance_loss_mlp": 1.02091324, "epoch": 0.6082669472418458, "flos": 37427023843200.0, "grad_norm": 2.24701180660584, "language_loss": 0.54612648, "learning_rate": 1.4049009361962138e-06, "loss": 0.57092595, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20849609, "step": 10117, "time_per_iteration": 3.005751848220825 }, { "auxiliary_loss_clip": 0.01424504, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.25680304, "balance_loss_mlp": 1.02064097, "epoch": 0.6083270704945137, "flos": 15093839543040.0, "grad_norm": 1.721983565983604, "language_loss": 0.70732605, "learning_rate": 1.4045291259287786e-06, "loss": 0.73196745, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.18969727, "step": 10118, "time_per_iteration": 2.8599250316619873 }, { "auxiliary_loss_clip": 0.01421611, "auxiliary_loss_mlp": 0.01040284, "balance_loss_clip": 1.25496936, "balance_loss_mlp": 1.02056718, "epoch": 0.6083871937471818, "flos": 20678368538880.0, "grad_norm": 3.5683144194271375, "language_loss": 0.75048733, "learning_rate": 1.4041573382416588e-06, "loss": 0.77510625, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19726562, "step": 10119, "time_per_iteration": 2.8657584190368652 }, { "auxiliary_loss_clip": 0.01423256, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.25787544, "balance_loss_mlp": 1.02399302, "epoch": 0.6084473169998497, "flos": 21516838773120.0, "grad_norm": 1.732719927502735, "language_loss": 0.68313682, "learning_rate": 1.4037855731489525e-06, "loss": 0.70781243, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20324707, "step": 10120, "time_per_iteration": 2.8730695247650146 }, { "auxiliary_loss_clip": 0.01445988, "auxiliary_loss_mlp": 0.0104696, "balance_loss_clip": 1.27523732, "balance_loss_mlp": 1.02609801, "epoch": 0.6085074402525177, "flos": 26881314468480.0, "grad_norm": 1.677126820575194, "language_loss": 0.75314432, "learning_rate": 1.4034138306647571e-06, "loss": 0.77807379, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20861816, "step": 10121, "time_per_iteration": 2.8982303142547607 }, { "auxiliary_loss_clip": 0.01421195, "auxiliary_loss_mlp": 0.01040305, "balance_loss_clip": 1.25611925, "balance_loss_mlp": 1.02101707, "epoch": 0.6085675635051856, "flos": 10897099626240.0, "grad_norm": 2.1440082105779013, "language_loss": 0.81230414, "learning_rate": 1.4030421108031685e-06, "loss": 0.83691913, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19287109, "step": 10122, "time_per_iteration": 2.814009666442871 }, { "auxiliary_loss_clip": 0.01429409, "auxiliary_loss_mlp": 0.01041489, "balance_loss_clip": 1.26423025, "balance_loss_mlp": 1.02177179, "epoch": 0.6086276867578536, "flos": 34875978220800.0, "grad_norm": 1.5738349025424445, "language_loss": 0.56411862, "learning_rate": 1.402670413578284e-06, "loss": 0.58882767, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19714355, "step": 10123, "time_per_iteration": 2.9800362586975098 }, { "auxiliary_loss_clip": 0.01412724, "auxiliary_loss_mlp": 0.01048674, "balance_loss_clip": 1.25042987, "balance_loss_mlp": 1.02853966, "epoch": 0.6086878100105215, "flos": 20057327406720.0, "grad_norm": 1.8054022382702537, "language_loss": 0.75065553, "learning_rate": 1.4022987390041965e-06, "loss": 0.77526951, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20129395, "step": 10124, "time_per_iteration": 2.8338046073913574 }, { "auxiliary_loss_clip": 0.01426835, "auxiliary_loss_mlp": 0.01047744, "balance_loss_clip": 1.25889575, "balance_loss_mlp": 1.02817023, "epoch": 0.6087479332631895, "flos": 18341539637760.0, "grad_norm": 3.345788673276808, "language_loss": 0.65454865, "learning_rate": 1.4019270870950006e-06, "loss": 0.67929441, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19567871, "step": 10125, "time_per_iteration": 2.845608711242676 }, { "auxiliary_loss_clip": 0.01415106, "auxiliary_loss_mlp": 0.01041972, "balance_loss_clip": 1.25138593, "balance_loss_mlp": 1.02181399, "epoch": 0.6088080565158575, "flos": 24502742599680.0, "grad_norm": 8.172051263824828, "language_loss": 0.77723563, "learning_rate": 1.40155545786479e-06, "loss": 0.80180633, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20141602, "step": 10126, "time_per_iteration": 4.3261542320251465 }, { "auxiliary_loss_clip": 0.01433118, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.26334524, "balance_loss_mlp": 1.0220952, "epoch": 0.6088681797685255, "flos": 10275968004480.0, "grad_norm": 2.5036058602757687, "language_loss": 0.7268362, "learning_rate": 1.4011838513276558e-06, "loss": 0.75158405, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.19567871, "step": 10127, "time_per_iteration": 5.63316798210144 }, { "auxiliary_loss_clip": 0.01441485, "auxiliary_loss_mlp": 0.01045686, "balance_loss_clip": 1.27197373, "balance_loss_mlp": 1.02592111, "epoch": 0.6089283030211935, "flos": 21981812031360.0, "grad_norm": 2.125993011777379, "language_loss": 0.73605949, "learning_rate": 1.400812267497691e-06, "loss": 0.76093119, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.19763184, "step": 10128, "time_per_iteration": 2.8361382484436035 }, { "auxiliary_loss_clip": 0.01417847, "auxiliary_loss_mlp": 0.01038481, "balance_loss_clip": 1.25312114, "balance_loss_mlp": 1.01962245, "epoch": 0.6089884262738614, "flos": 17794121074560.0, "grad_norm": 2.055199812729614, "language_loss": 0.73817444, "learning_rate": 1.4004407063889842e-06, "loss": 0.76273763, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18847656, "step": 10129, "time_per_iteration": 2.8527722358703613 }, { "auxiliary_loss_clip": 0.01421589, "auxiliary_loss_mlp": 0.01040654, "balance_loss_clip": 1.25591242, "balance_loss_mlp": 1.02153325, "epoch": 0.6090485495265294, "flos": 36926053706880.0, "grad_norm": 1.5301158956678766, "language_loss": 0.66960865, "learning_rate": 1.400069168015626e-06, "loss": 0.69423115, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19140625, "step": 10130, "time_per_iteration": 3.040285348892212 }, { "auxiliary_loss_clip": 0.01410001, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.24768972, "balance_loss_mlp": 1.0167073, "epoch": 0.6091086727791973, "flos": 19907910518400.0, "grad_norm": 2.5659780133274905, "language_loss": 0.77616453, "learning_rate": 1.3996976523917054e-06, "loss": 0.80062342, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19177246, "step": 10131, "time_per_iteration": 2.8524441719055176 }, { "auxiliary_loss_clip": 0.01412519, "auxiliary_loss_mlp": 0.01038573, "balance_loss_clip": 1.24859715, "balance_loss_mlp": 1.01948738, "epoch": 0.6091687960318654, "flos": 22173741048960.0, "grad_norm": 3.585045848449767, "language_loss": 0.77629042, "learning_rate": 1.3993261595313093e-06, "loss": 0.8008014, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.1907959, "step": 10132, "time_per_iteration": 2.8822109699249268 }, { "auxiliary_loss_clip": 0.01411805, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.2515527, "balance_loss_mlp": 1.02079141, "epoch": 0.6092289192845333, "flos": 21473874195840.0, "grad_norm": 1.7272302200968659, "language_loss": 0.76079941, "learning_rate": 1.3989546894485261e-06, "loss": 0.78531992, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19470215, "step": 10133, "time_per_iteration": 2.8532354831695557 }, { "auxiliary_loss_clip": 0.01428089, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.26170444, "balance_loss_mlp": 1.01973677, "epoch": 0.6092890425372013, "flos": 28706812174080.0, "grad_norm": 1.8387564902042173, "language_loss": 0.64777553, "learning_rate": 1.3985832421574414e-06, "loss": 0.6724562, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20239258, "step": 10134, "time_per_iteration": 2.9171957969665527 }, { "auxiliary_loss_clip": 0.01415342, "auxiliary_loss_mlp": 0.01035335, "balance_loss_clip": 1.25058126, "balance_loss_mlp": 1.01566589, "epoch": 0.6093491657898692, "flos": 20822808499200.0, "grad_norm": 1.8641164722244383, "language_loss": 0.79434991, "learning_rate": 1.3982118176721397e-06, "loss": 0.81885672, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19665527, "step": 10135, "time_per_iteration": 2.850215435028076 }, { "auxiliary_loss_clip": 0.01420049, "auxiliary_loss_mlp": 0.01040339, "balance_loss_clip": 1.25369298, "balance_loss_mlp": 1.02156317, "epoch": 0.6094092890425372, "flos": 25457845224960.0, "grad_norm": 1.8253713796403923, "language_loss": 0.72625399, "learning_rate": 1.3978404160067069e-06, "loss": 0.75085783, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18774414, "step": 10136, "time_per_iteration": 2.96061110496521 }, { "auxiliary_loss_clip": 0.01418011, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.25296926, "balance_loss_mlp": 1.01463437, "epoch": 0.6094694122952051, "flos": 35633405738880.0, "grad_norm": 1.767733147974447, "language_loss": 0.7501539, "learning_rate": 1.3974690371752253e-06, "loss": 0.77467906, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.1986084, "step": 10137, "time_per_iteration": 2.9900147914886475 }, { "auxiliary_loss_clip": 0.01411725, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.24558604, "balance_loss_mlp": 1.01816654, "epoch": 0.6095295355478731, "flos": 24466293273600.0, "grad_norm": 1.652066552987795, "language_loss": 0.80747843, "learning_rate": 1.3970976811917785e-06, "loss": 0.83197892, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20153809, "step": 10138, "time_per_iteration": 2.86911940574646 }, { "auxiliary_loss_clip": 0.01405683, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.24445069, "balance_loss_mlp": 1.0169692, "epoch": 0.6095896588005411, "flos": 15641755799040.0, "grad_norm": 1.7289689983680092, "language_loss": 0.82447374, "learning_rate": 1.3967263480704481e-06, "loss": 0.84889269, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19238281, "step": 10139, "time_per_iteration": 2.820976495742798 }, { "auxiliary_loss_clip": 0.01434889, "auxiliary_loss_mlp": 0.01035416, "balance_loss_clip": 1.26623225, "balance_loss_mlp": 1.01561546, "epoch": 0.6096497820532091, "flos": 15556595806080.0, "grad_norm": 1.9092426701999503, "language_loss": 0.83850759, "learning_rate": 1.396355037825315e-06, "loss": 0.86321062, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19812012, "step": 10140, "time_per_iteration": 2.7832326889038086 }, { "auxiliary_loss_clip": 0.01409586, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.24342132, "balance_loss_mlp": 1.01740944, "epoch": 0.6097099053058771, "flos": 24214496106240.0, "grad_norm": 1.7460544796977036, "language_loss": 0.76149315, "learning_rate": 1.3959837504704592e-06, "loss": 0.78596276, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19958496, "step": 10141, "time_per_iteration": 2.875797986984253 }, { "auxiliary_loss_clip": 0.01407302, "auxiliary_loss_mlp": 0.0103281, "balance_loss_clip": 1.24345088, "balance_loss_mlp": 1.0133791, "epoch": 0.609770028558545, "flos": 19578916442880.0, "grad_norm": 2.785268794650899, "language_loss": 0.77421296, "learning_rate": 1.3956124860199603e-06, "loss": 0.79861403, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19421387, "step": 10142, "time_per_iteration": 2.801565408706665 }, { "auxiliary_loss_clip": 0.01412743, "auxiliary_loss_mlp": 0.01033956, "balance_loss_clip": 1.24793839, "balance_loss_mlp": 1.01335633, "epoch": 0.609830151811213, "flos": 23959169844480.0, "grad_norm": 1.7019803690971946, "language_loss": 0.7680195, "learning_rate": 1.3952412444878964e-06, "loss": 0.79248643, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20605469, "step": 10143, "time_per_iteration": 2.876600742340088 }, { "auxiliary_loss_clip": 0.01414713, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.24967933, "balance_loss_mlp": 1.01547813, "epoch": 0.6098902750638809, "flos": 16188224221440.0, "grad_norm": 1.9260475597782607, "language_loss": 0.76245922, "learning_rate": 1.3948700258883448e-06, "loss": 0.78695405, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19299316, "step": 10144, "time_per_iteration": 2.821422576904297 }, { "auxiliary_loss_clip": 0.01427437, "auxiliary_loss_mlp": 0.01033805, "balance_loss_clip": 1.26025653, "balance_loss_mlp": 1.01400399, "epoch": 0.609950398316549, "flos": 44541519638400.0, "grad_norm": 1.863055862870904, "language_loss": 0.74214035, "learning_rate": 1.394498830235383e-06, "loss": 0.76675278, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19812012, "step": 10145, "time_per_iteration": 3.101120710372925 }, { "auxiliary_loss_clip": 0.01419118, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.25398481, "balance_loss_mlp": 1.01584077, "epoch": 0.6100105215692169, "flos": 23232128849280.0, "grad_norm": 5.675786613139131, "language_loss": 0.69655931, "learning_rate": 1.3941276575430862e-06, "loss": 0.72110671, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19775391, "step": 10146, "time_per_iteration": 2.899437189102173 }, { "auxiliary_loss_clip": 0.01405707, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.24504137, "balance_loss_mlp": 1.01730061, "epoch": 0.6100706448218849, "flos": 15020850401280.0, "grad_norm": 1.5496689644941308, "language_loss": 0.77028894, "learning_rate": 1.3937565078255289e-06, "loss": 0.79470676, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18774414, "step": 10147, "time_per_iteration": 2.8510098457336426 }, { "auxiliary_loss_clip": 0.01413608, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.24933052, "balance_loss_mlp": 1.01271486, "epoch": 0.6101307680745528, "flos": 19647833552640.0, "grad_norm": 1.8808954214348996, "language_loss": 0.79128224, "learning_rate": 1.393385381096786e-06, "loss": 0.81574869, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20324707, "step": 10148, "time_per_iteration": 2.8704307079315186 }, { "auxiliary_loss_clip": 0.01420645, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.25238156, "balance_loss_mlp": 1.01268089, "epoch": 0.6101908913272208, "flos": 29947808563200.0, "grad_norm": 1.9734273260463708, "language_loss": 0.54824126, "learning_rate": 1.39301427737093e-06, "loss": 0.57278126, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20678711, "step": 10149, "time_per_iteration": 4.360315561294556 }, { "auxiliary_loss_clip": 0.01395042, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.23680055, "balance_loss_mlp": 1.01499832, "epoch": 0.6102510145798887, "flos": 21808614360960.0, "grad_norm": 1.7830318508808038, "language_loss": 0.80741203, "learning_rate": 1.3926431966620333e-06, "loss": 0.83171451, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.20202637, "step": 10150, "time_per_iteration": 2.849329710006714 }, { "auxiliary_loss_clip": 0.01426833, "auxiliary_loss_mlp": 0.01039407, "balance_loss_clip": 1.25904262, "balance_loss_mlp": 1.01809192, "epoch": 0.6103111378325567, "flos": 20716356188160.0, "grad_norm": 1.529408908439079, "language_loss": 0.70307696, "learning_rate": 1.3922721389841684e-06, "loss": 0.72773939, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.21325684, "step": 10151, "time_per_iteration": 2.8507206439971924 }, { "auxiliary_loss_clip": 0.01401903, "auxiliary_loss_mlp": 0.01035555, "balance_loss_clip": 1.23938632, "balance_loss_mlp": 1.0160284, "epoch": 0.6103712610852247, "flos": 29392019712000.0, "grad_norm": 1.7954208345729656, "language_loss": 0.71681023, "learning_rate": 1.3919011043514036e-06, "loss": 0.74118471, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19519043, "step": 10152, "time_per_iteration": 2.8810818195343018 }, { "auxiliary_loss_clip": 0.01419368, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.25361967, "balance_loss_mlp": 1.01415825, "epoch": 0.6104313843378927, "flos": 20822627520000.0, "grad_norm": 1.6754878284532166, "language_loss": 0.78963733, "learning_rate": 1.391530092777811e-06, "loss": 0.81417549, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20275879, "step": 10153, "time_per_iteration": 2.838214159011841 }, { "auxiliary_loss_clip": 0.01410198, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.24700904, "balance_loss_mlp": 1.01429462, "epoch": 0.6104915075905607, "flos": 26589719859840.0, "grad_norm": 1.6826889547493957, "language_loss": 0.80087912, "learning_rate": 1.3911591042774573e-06, "loss": 0.8253206, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.1965332, "step": 10154, "time_per_iteration": 2.9012928009033203 }, { "auxiliary_loss_clip": 0.01408753, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.24512482, "balance_loss_mlp": 1.01538754, "epoch": 0.6105516308432286, "flos": 23926566326400.0, "grad_norm": 1.8416946187938175, "language_loss": 0.70985579, "learning_rate": 1.3907881388644116e-06, "loss": 0.73428977, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19262695, "step": 10155, "time_per_iteration": 2.845160961151123 }, { "auxiliary_loss_clip": 0.01410523, "auxiliary_loss_mlp": 0.01039876, "balance_loss_clip": 1.2474407, "balance_loss_mlp": 1.01966977, "epoch": 0.6106117540958966, "flos": 31590019008000.0, "grad_norm": 1.6664425029489653, "language_loss": 0.72126138, "learning_rate": 1.3904171965527413e-06, "loss": 0.74576539, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20202637, "step": 10156, "time_per_iteration": 2.9315431118011475 }, { "auxiliary_loss_clip": 0.01400315, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.24060059, "balance_loss_mlp": 1.01401544, "epoch": 0.6106718773485645, "flos": 19617220805760.0, "grad_norm": 1.6268485834439577, "language_loss": 0.67718661, "learning_rate": 1.3900462773565114e-06, "loss": 0.70153964, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.2097168, "step": 10157, "time_per_iteration": 2.834444522857666 }, { "auxiliary_loss_clip": 0.01415012, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.24919367, "balance_loss_mlp": 1.01713955, "epoch": 0.6107320006012326, "flos": 17131472709120.0, "grad_norm": 1.9793190377249117, "language_loss": 0.73009676, "learning_rate": 1.3896753812897877e-06, "loss": 0.75461423, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19592285, "step": 10158, "time_per_iteration": 2.8232662677764893 }, { "auxiliary_loss_clip": 0.01419812, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.25426626, "balance_loss_mlp": 1.02042174, "epoch": 0.6107921238539005, "flos": 30158903393280.0, "grad_norm": 2.356034019525822, "language_loss": 0.70000005, "learning_rate": 1.389304508366635e-06, "loss": 0.7246049, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20263672, "step": 10159, "time_per_iteration": 2.9268887042999268 }, { "auxiliary_loss_clip": 0.01411691, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.24679077, "balance_loss_mlp": 1.01438773, "epoch": 0.6108522471065685, "flos": 18448942089600.0, "grad_norm": 1.974539954078257, "language_loss": 0.8023802, "learning_rate": 1.3889336586011167e-06, "loss": 0.82684129, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20031738, "step": 10160, "time_per_iteration": 4.218799352645874 }, { "auxiliary_loss_clip": 0.01200229, "auxiliary_loss_mlp": 0.01019607, "balance_loss_clip": 1.11058807, "balance_loss_mlp": 1.0035857, "epoch": 0.6109123703592364, "flos": 64169345710080.0, "grad_norm": 1.251773694211875, "language_loss": 0.61539865, "learning_rate": 1.388562832007295e-06, "loss": 0.63759702, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.16015625, "step": 10161, "time_per_iteration": 3.482314348220825 }, { "auxiliary_loss_clip": 0.01423253, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.25666952, "balance_loss_mlp": 1.01474619, "epoch": 0.6109724936119044, "flos": 20677599377280.0, "grad_norm": 1.7064409312212958, "language_loss": 0.76756406, "learning_rate": 1.3881920285992324e-06, "loss": 0.79214329, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19934082, "step": 10162, "time_per_iteration": 5.660090446472168 }, { "auxiliary_loss_clip": 0.01417848, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.2541852, "balance_loss_mlp": 1.01588535, "epoch": 0.6110326168645723, "flos": 31362771784320.0, "grad_norm": 1.7448285933147976, "language_loss": 0.72648466, "learning_rate": 1.3878212483909888e-06, "loss": 0.7510165, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19445801, "step": 10163, "time_per_iteration": 2.9347667694091797 }, { "auxiliary_loss_clip": 0.01400815, "auxiliary_loss_mlp": 0.010287, "balance_loss_clip": 1.24190247, "balance_loss_mlp": 1.00954294, "epoch": 0.6110927401172404, "flos": 25012625961600.0, "grad_norm": 1.7170669732185335, "language_loss": 0.60421747, "learning_rate": 1.387450491396625e-06, "loss": 0.62851262, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19152832, "step": 10164, "time_per_iteration": 2.8834242820739746 }, { "auxiliary_loss_clip": 0.0141251, "auxiliary_loss_mlp": 0.01034668, "balance_loss_clip": 1.24893308, "balance_loss_mlp": 1.01387811, "epoch": 0.6111528633699083, "flos": 26258735013120.0, "grad_norm": 1.9612315121955342, "language_loss": 0.7623958, "learning_rate": 1.3870797576302003e-06, "loss": 0.78686756, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.2076416, "step": 10165, "time_per_iteration": 2.9181790351867676 }, { "auxiliary_loss_clip": 0.01415522, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.25566053, "balance_loss_mlp": 1.01342034, "epoch": 0.6112129866225763, "flos": 22392617984640.0, "grad_norm": 1.5439430607899407, "language_loss": 0.80059433, "learning_rate": 1.3867090471057719e-06, "loss": 0.82507563, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19189453, "step": 10166, "time_per_iteration": 2.844480037689209 }, { "auxiliary_loss_clip": 0.01431097, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.26611423, "balance_loss_mlp": 1.01383734, "epoch": 0.6112731098752443, "flos": 25238651575680.0, "grad_norm": 1.8069376905114054, "language_loss": 0.68403023, "learning_rate": 1.3863383598373987e-06, "loss": 0.70868731, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20776367, "step": 10167, "time_per_iteration": 2.8908276557922363 }, { "auxiliary_loss_clip": 0.01417636, "auxiliary_loss_mlp": 0.01035184, "balance_loss_clip": 1.25677621, "balance_loss_mlp": 1.01630104, "epoch": 0.6113332331279122, "flos": 22903406242560.0, "grad_norm": 2.5329796624015097, "language_loss": 0.79356778, "learning_rate": 1.3859676958391364e-06, "loss": 0.81809598, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18884277, "step": 10168, "time_per_iteration": 2.878872871398926 }, { "auxiliary_loss_clip": 0.01446019, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.27335167, "balance_loss_mlp": 1.02054715, "epoch": 0.6113933563805802, "flos": 18628428787200.0, "grad_norm": 2.424484379369739, "language_loss": 0.86114353, "learning_rate": 1.3855970551250398e-06, "loss": 0.88600683, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.19763184, "step": 10169, "time_per_iteration": 2.795133590698242 }, { "auxiliary_loss_clip": 0.01420459, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.25785708, "balance_loss_mlp": 1.01506495, "epoch": 0.6114534796332481, "flos": 41881578485760.0, "grad_norm": 1.6303515651117158, "language_loss": 0.79564762, "learning_rate": 1.3852264377091652e-06, "loss": 0.82018471, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18188477, "step": 10170, "time_per_iteration": 3.0212934017181396 }, { "auxiliary_loss_clip": 0.01433249, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.26398945, "balance_loss_mlp": 1.01315463, "epoch": 0.6115136028859162, "flos": 21918550521600.0, "grad_norm": 1.8959888169273387, "language_loss": 0.69747484, "learning_rate": 1.3848558436055651e-06, "loss": 0.72214615, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20715332, "step": 10171, "time_per_iteration": 2.8802108764648438 }, { "auxiliary_loss_clip": 0.01425326, "auxiliary_loss_mlp": 0.01035681, "balance_loss_clip": 1.25874448, "balance_loss_mlp": 1.01534367, "epoch": 0.6115737261385841, "flos": 28816884069120.0, "grad_norm": 1.6200770939681983, "language_loss": 0.79607368, "learning_rate": 1.3844852728282934e-06, "loss": 0.82068372, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20349121, "step": 10172, "time_per_iteration": 2.9061429500579834 }, { "auxiliary_loss_clip": 0.01428908, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.25886571, "balance_loss_mlp": 1.01410365, "epoch": 0.6116338493912521, "flos": 21261331532160.0, "grad_norm": 1.7750722209576755, "language_loss": 0.67668027, "learning_rate": 1.3841147253914022e-06, "loss": 0.70131147, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20129395, "step": 10173, "time_per_iteration": 2.843770742416382 }, { "auxiliary_loss_clip": 0.0142552, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.26028657, "balance_loss_mlp": 1.01323342, "epoch": 0.61169397264392, "flos": 17539156771200.0, "grad_norm": 1.6967483378128967, "language_loss": 0.56804621, "learning_rate": 1.3837442013089416e-06, "loss": 0.5926289, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19519043, "step": 10174, "time_per_iteration": 2.799285650253296 }, { "auxiliary_loss_clip": 0.01422128, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.25664532, "balance_loss_mlp": 1.01625013, "epoch": 0.611754095896588, "flos": 23962156001280.0, "grad_norm": 1.8885369446742384, "language_loss": 0.67117727, "learning_rate": 1.3833737005949628e-06, "loss": 0.69576299, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.2019043, "step": 10175, "time_per_iteration": 2.8684136867523193 }, { "auxiliary_loss_clip": 0.01419284, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.25486922, "balance_loss_mlp": 1.0148778, "epoch": 0.6118142191492559, "flos": 26006032949760.0, "grad_norm": 2.304172152728961, "language_loss": 0.83415604, "learning_rate": 1.3830032232635154e-06, "loss": 0.85868895, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19140625, "step": 10176, "time_per_iteration": 2.939687490463257 }, { "auxiliary_loss_clip": 0.01416767, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.25289607, "balance_loss_mlp": 1.01639044, "epoch": 0.611874342401924, "flos": 24612633515520.0, "grad_norm": 1.9103172603248497, "language_loss": 0.7792573, "learning_rate": 1.3826327693286474e-06, "loss": 0.80379641, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20739746, "step": 10177, "time_per_iteration": 2.8368217945098877 }, { "auxiliary_loss_clip": 0.01422809, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.25669527, "balance_loss_mlp": 1.01634645, "epoch": 0.6119344656545919, "flos": 15895362758400.0, "grad_norm": 1.8795491522965506, "language_loss": 0.76570374, "learning_rate": 1.3822623388044065e-06, "loss": 0.79029262, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19714355, "step": 10178, "time_per_iteration": 2.817561388015747 }, { "auxiliary_loss_clip": 0.01430979, "auxiliary_loss_mlp": 0.01039225, "balance_loss_clip": 1.26517701, "balance_loss_mlp": 1.01929295, "epoch": 0.6119945889072599, "flos": 21662862301440.0, "grad_norm": 2.225987786006421, "language_loss": 0.6843164, "learning_rate": 1.3818919317048402e-06, "loss": 0.70901847, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19921875, "step": 10179, "time_per_iteration": 2.864372730255127 }, { "auxiliary_loss_clip": 0.01425559, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.26020265, "balance_loss_mlp": 1.01327109, "epoch": 0.6120547121599279, "flos": 13780623173760.0, "grad_norm": 2.246184398298458, "language_loss": 0.84704566, "learning_rate": 1.3815215480439933e-06, "loss": 0.87162334, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1895752, "step": 10180, "time_per_iteration": 2.9358816146850586 }, { "auxiliary_loss_clip": 0.01424162, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.26065195, "balance_loss_mlp": 1.01183558, "epoch": 0.6121148354125958, "flos": 20087487705600.0, "grad_norm": 1.8608864946783017, "language_loss": 0.77999592, "learning_rate": 1.3811511878359113e-06, "loss": 0.8045522, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19616699, "step": 10181, "time_per_iteration": 2.8458054065704346 }, { "auxiliary_loss_clip": 0.01426965, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.26059651, "balance_loss_mlp": 1.011343, "epoch": 0.6121749586652638, "flos": 13476902958720.0, "grad_norm": 2.1265578754583454, "language_loss": 0.81209403, "learning_rate": 1.3807808510946384e-06, "loss": 0.8366707, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19348145, "step": 10182, "time_per_iteration": 2.8119683265686035 }, { "auxiliary_loss_clip": 0.01404118, "auxiliary_loss_mlp": 0.01031138, "balance_loss_clip": 1.24438882, "balance_loss_mlp": 1.01319742, "epoch": 0.6122350819179317, "flos": 20130135569280.0, "grad_norm": 1.6411968106374635, "language_loss": 0.83534169, "learning_rate": 1.3804105378342177e-06, "loss": 0.8596943, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17944336, "step": 10183, "time_per_iteration": 2.8451850414276123 }, { "auxiliary_loss_clip": 0.01199097, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.1105876, "balance_loss_mlp": 1.01661742, "epoch": 0.6122952051705998, "flos": 65458356111360.0, "grad_norm": 0.7103728408370809, "language_loss": 0.62875849, "learning_rate": 1.3800402480686914e-06, "loss": 0.65107203, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.15625, "step": 10184, "time_per_iteration": 4.958518743515015 }, { "auxiliary_loss_clip": 0.0141813, "auxiliary_loss_mlp": 0.01032894, "balance_loss_clip": 1.25578117, "balance_loss_mlp": 1.0144763, "epoch": 0.6123553284232677, "flos": 20386004768640.0, "grad_norm": 7.356756652970936, "language_loss": 0.8238613, "learning_rate": 1.379669981812101e-06, "loss": 0.84837151, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18408203, "step": 10185, "time_per_iteration": 2.878382444381714 }, { "auxiliary_loss_clip": 0.01443696, "auxiliary_loss_mlp": 0.01039214, "balance_loss_clip": 1.27537942, "balance_loss_mlp": 1.01946104, "epoch": 0.6124154516759357, "flos": 23998152879360.0, "grad_norm": 1.88889619252092, "language_loss": 0.75395072, "learning_rate": 1.3792997390784868e-06, "loss": 0.77877986, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.1973877, "step": 10186, "time_per_iteration": 2.9370062351226807 }, { "auxiliary_loss_clip": 0.01410887, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.24883962, "balance_loss_mlp": 1.01523101, "epoch": 0.6124755749286036, "flos": 21478308186240.0, "grad_norm": 3.120120945302171, "language_loss": 0.79433084, "learning_rate": 1.3789295198818895e-06, "loss": 0.81878126, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18920898, "step": 10187, "time_per_iteration": 2.8535842895507812 }, { "auxiliary_loss_clip": 0.01421394, "auxiliary_loss_mlp": 0.01031428, "balance_loss_clip": 1.25682914, "balance_loss_mlp": 1.01352251, "epoch": 0.6125356981812716, "flos": 23889936021120.0, "grad_norm": 1.5979888033207086, "language_loss": 0.83585393, "learning_rate": 1.3785593242363462e-06, "loss": 0.86038208, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.17919922, "step": 10188, "time_per_iteration": 2.906890392303467 }, { "auxiliary_loss_clip": 0.0142915, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.26330912, "balance_loss_mlp": 1.01302516, "epoch": 0.6125958214339395, "flos": 14432910480000.0, "grad_norm": 1.9241581783281914, "language_loss": 0.75868297, "learning_rate": 1.378189152155896e-06, "loss": 0.78328764, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1829834, "step": 10189, "time_per_iteration": 2.932894706726074 }, { "auxiliary_loss_clip": 0.01420511, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.25748158, "balance_loss_mlp": 1.01563227, "epoch": 0.6126559446866076, "flos": 23269709295360.0, "grad_norm": 1.537318271573001, "language_loss": 0.74450326, "learning_rate": 1.3778190036545758e-06, "loss": 0.76905036, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18566895, "step": 10190, "time_per_iteration": 2.8955636024475098 }, { "auxiliary_loss_clip": 0.01435979, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.26904178, "balance_loss_mlp": 1.01616526, "epoch": 0.6127160679392755, "flos": 26874708727680.0, "grad_norm": 1.690581845625544, "language_loss": 0.68800259, "learning_rate": 1.3774488787464207e-06, "loss": 0.71272767, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20373535, "step": 10191, "time_per_iteration": 2.9143002033233643 }, { "auxiliary_loss_clip": 0.01432134, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.26508129, "balance_loss_mlp": 1.01533222, "epoch": 0.6127761911919435, "flos": 26407744698240.0, "grad_norm": 1.842549565139184, "language_loss": 0.7483114, "learning_rate": 1.377078777445467e-06, "loss": 0.77298087, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19470215, "step": 10192, "time_per_iteration": 2.891695499420166 }, { "auxiliary_loss_clip": 0.01409804, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.24821842, "balance_loss_mlp": 1.01349866, "epoch": 0.6128363144446115, "flos": 22644143683200.0, "grad_norm": 2.124915383386676, "language_loss": 0.85322058, "learning_rate": 1.3767086997657478e-06, "loss": 0.87763584, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18225098, "step": 10193, "time_per_iteration": 2.8524515628814697 }, { "auxiliary_loss_clip": 0.01427075, "auxiliary_loss_mlp": 0.01034007, "balance_loss_clip": 1.26221228, "balance_loss_mlp": 1.01451588, "epoch": 0.6128964376972794, "flos": 26769885229440.0, "grad_norm": 1.9924664626712698, "language_loss": 0.71344149, "learning_rate": 1.3763386457212979e-06, "loss": 0.73805237, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19494629, "step": 10194, "time_per_iteration": 2.8710057735443115 }, { "auxiliary_loss_clip": 0.01195189, "auxiliary_loss_mlp": 0.01024657, "balance_loss_clip": 1.10691893, "balance_loss_mlp": 1.005584, "epoch": 0.6129565609499474, "flos": 65595194945280.0, "grad_norm": 0.810953228681581, "language_loss": 0.58765924, "learning_rate": 1.375968615326149e-06, "loss": 0.60985774, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19042969, "step": 10195, "time_per_iteration": 4.677098274230957 }, { "auxiliary_loss_clip": 0.0142648, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.26317024, "balance_loss_mlp": 1.01680601, "epoch": 0.6130166842026153, "flos": 16370606586240.0, "grad_norm": 1.9417235996359883, "language_loss": 0.69553626, "learning_rate": 1.3755986085943324e-06, "loss": 0.72017568, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.2064209, "step": 10196, "time_per_iteration": 2.8481924533843994 }, { "auxiliary_loss_clip": 0.01416873, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.25429595, "balance_loss_mlp": 1.01600599, "epoch": 0.6130768074552834, "flos": 23661105229440.0, "grad_norm": 1.9524408275806746, "language_loss": 0.71706152, "learning_rate": 1.3752286255398788e-06, "loss": 0.74158728, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19677734, "step": 10197, "time_per_iteration": 5.701512813568115 }, { "auxiliary_loss_clip": 0.01426985, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.26051712, "balance_loss_mlp": 1.01864219, "epoch": 0.6131369307079513, "flos": 20057010693120.0, "grad_norm": 1.948014887575, "language_loss": 0.79701245, "learning_rate": 1.3748586661768191e-06, "loss": 0.82167017, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20153809, "step": 10198, "time_per_iteration": 2.870842933654785 }, { "auxiliary_loss_clip": 0.01434656, "auxiliary_loss_mlp": 0.01039519, "balance_loss_clip": 1.2670064, "balance_loss_mlp": 1.01973057, "epoch": 0.6131970539606193, "flos": 22681814618880.0, "grad_norm": 1.6847992774216725, "language_loss": 0.74947363, "learning_rate": 1.374488730519181e-06, "loss": 0.77421534, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19787598, "step": 10199, "time_per_iteration": 2.9064888954162598 }, { "auxiliary_loss_clip": 0.01425917, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.2586031, "balance_loss_mlp": 1.01819944, "epoch": 0.6132571772132872, "flos": 26882400343680.0, "grad_norm": 2.0011425688796045, "language_loss": 0.62146425, "learning_rate": 1.374118818580993e-06, "loss": 0.64610934, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20397949, "step": 10200, "time_per_iteration": 2.8709676265716553 }, { "auxiliary_loss_clip": 0.014263, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.26209009, "balance_loss_mlp": 1.01471543, "epoch": 0.6133173004659552, "flos": 22902772815360.0, "grad_norm": 1.8966591671073665, "language_loss": 0.69443429, "learning_rate": 1.3737489303762822e-06, "loss": 0.71903282, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18835449, "step": 10201, "time_per_iteration": 2.8562865257263184 }, { "auxiliary_loss_clip": 0.01409979, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.24821639, "balance_loss_mlp": 1.0149982, "epoch": 0.6133774237186231, "flos": 20494719319680.0, "grad_norm": 1.766956359530808, "language_loss": 0.84568369, "learning_rate": 1.3733790659190746e-06, "loss": 0.87012279, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18933105, "step": 10202, "time_per_iteration": 2.823660135269165 }, { "auxiliary_loss_clip": 0.01197081, "auxiliary_loss_mlp": 0.01020055, "balance_loss_clip": 1.10839832, "balance_loss_mlp": 1.00050473, "epoch": 0.6134375469712912, "flos": 69444996583680.0, "grad_norm": 0.8872594338418944, "language_loss": 0.67065185, "learning_rate": 1.3730092252233953e-06, "loss": 0.69282323, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.1953125, "step": 10203, "time_per_iteration": 3.4145946502685547 }, { "auxiliary_loss_clip": 0.01423641, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.25940394, "balance_loss_mlp": 1.0165484, "epoch": 0.6134976702239591, "flos": 41296443742080.0, "grad_norm": 1.9966094979241957, "language_loss": 0.62024415, "learning_rate": 1.37263940830327e-06, "loss": 0.6448282, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18225098, "step": 10204, "time_per_iteration": 3.0338988304138184 }, { "auxiliary_loss_clip": 0.01414112, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.25127196, "balance_loss_mlp": 1.01666737, "epoch": 0.6135577934766271, "flos": 22356892575360.0, "grad_norm": 1.8106066658797446, "language_loss": 0.73423666, "learning_rate": 1.3722696151727204e-06, "loss": 0.75873935, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19482422, "step": 10205, "time_per_iteration": 2.9577600955963135 }, { "auxiliary_loss_clip": 0.01406245, "auxiliary_loss_mlp": 0.01034331, "balance_loss_clip": 1.24529195, "balance_loss_mlp": 1.01363611, "epoch": 0.6136179167292951, "flos": 23737532976000.0, "grad_norm": 1.6879689493482142, "language_loss": 0.76467311, "learning_rate": 1.3718998458457701e-06, "loss": 0.78907883, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20690918, "step": 10206, "time_per_iteration": 2.9040634632110596 }, { "auxiliary_loss_clip": 0.0142004, "auxiliary_loss_mlp": 0.01037667, "balance_loss_clip": 1.2555989, "balance_loss_mlp": 1.01773477, "epoch": 0.613678039981963, "flos": 26034111987840.0, "grad_norm": 2.0569387369294896, "language_loss": 0.75685668, "learning_rate": 1.3715301003364407e-06, "loss": 0.78143382, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19946289, "step": 10207, "time_per_iteration": 2.9152138233184814 }, { "auxiliary_loss_clip": 0.01421743, "auxiliary_loss_mlp": 0.01037668, "balance_loss_clip": 1.25826967, "balance_loss_mlp": 1.01781964, "epoch": 0.613738163234631, "flos": 9864800092800.0, "grad_norm": 2.2089277306883512, "language_loss": 0.82810307, "learning_rate": 1.3711603786587525e-06, "loss": 0.85269719, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19848633, "step": 10208, "time_per_iteration": 2.829864978790283 }, { "auxiliary_loss_clip": 0.01436094, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.26826322, "balance_loss_mlp": 1.01687419, "epoch": 0.613798286487299, "flos": 33194965720320.0, "grad_norm": 1.8292039344398896, "language_loss": 0.73669219, "learning_rate": 1.3707906808267265e-06, "loss": 0.76142824, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20629883, "step": 10209, "time_per_iteration": 2.9635069370269775 }, { "auxiliary_loss_clip": 0.01422136, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.25922894, "balance_loss_mlp": 1.01173735, "epoch": 0.613858409739967, "flos": 25638417797760.0, "grad_norm": 1.630934169217141, "language_loss": 0.74993819, "learning_rate": 1.37042100685438e-06, "loss": 0.77447295, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19592285, "step": 10210, "time_per_iteration": 2.8861145973205566 }, { "auxiliary_loss_clip": 0.01195046, "auxiliary_loss_mlp": 0.01023046, "balance_loss_clip": 1.10622776, "balance_loss_mlp": 1.00597501, "epoch": 0.6139185329926349, "flos": 67224935808000.0, "grad_norm": 0.8852565804816103, "language_loss": 0.65115845, "learning_rate": 1.3700513567557325e-06, "loss": 0.67333937, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.17089844, "step": 10211, "time_per_iteration": 3.5290822982788086 }, { "auxiliary_loss_clip": 0.01416368, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.25375366, "balance_loss_mlp": 1.02057099, "epoch": 0.6139786562453029, "flos": 21553650057600.0, "grad_norm": 1.6962870466139335, "language_loss": 0.75982106, "learning_rate": 1.369681730544801e-06, "loss": 0.78439087, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20031738, "step": 10212, "time_per_iteration": 2.8657941818237305 }, { "auxiliary_loss_clip": 0.01424573, "auxiliary_loss_mlp": 0.01035739, "balance_loss_clip": 1.26094043, "balance_loss_mlp": 1.01580691, "epoch": 0.6140387794979708, "flos": 26079836497920.0, "grad_norm": 1.545782885503587, "language_loss": 0.74356866, "learning_rate": 1.3693121282356009e-06, "loss": 0.76817179, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19909668, "step": 10213, "time_per_iteration": 2.880075693130493 }, { "auxiliary_loss_clip": 0.01441955, "auxiliary_loss_mlp": 0.01036658, "balance_loss_clip": 1.27157176, "balance_loss_mlp": 1.01698816, "epoch": 0.6140989027506388, "flos": 23704748478720.0, "grad_norm": 1.4335376885962137, "language_loss": 0.73765451, "learning_rate": 1.3689425498421483e-06, "loss": 0.76244062, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.1965332, "step": 10214, "time_per_iteration": 2.879070281982422 }, { "auxiliary_loss_clip": 0.01420506, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.25515103, "balance_loss_mlp": 1.01329601, "epoch": 0.6141590260033067, "flos": 22240576897920.0, "grad_norm": 1.5886497003850317, "language_loss": 0.7514987, "learning_rate": 1.3685729953784572e-06, "loss": 0.77603543, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19873047, "step": 10215, "time_per_iteration": 2.833376169204712 }, { "auxiliary_loss_clip": 0.01422237, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.25799179, "balance_loss_mlp": 1.01575089, "epoch": 0.6142191492559748, "flos": 23880434613120.0, "grad_norm": 2.195861548756872, "language_loss": 0.79324865, "learning_rate": 1.368203464858542e-06, "loss": 0.817837, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20849609, "step": 10216, "time_per_iteration": 2.893620252609253 }, { "auxiliary_loss_clip": 0.01422797, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.25859308, "balance_loss_mlp": 1.01280236, "epoch": 0.6142792725086427, "flos": 15049110418560.0, "grad_norm": 2.1226798136557736, "language_loss": 0.80572128, "learning_rate": 1.3678339582964147e-06, "loss": 0.83027381, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19628906, "step": 10217, "time_per_iteration": 2.7954397201538086 }, { "auxiliary_loss_clip": 0.0142549, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.25947809, "balance_loss_mlp": 1.01203609, "epoch": 0.6143393957613107, "flos": 23341295848320.0, "grad_norm": 2.6706357984023033, "language_loss": 0.79819393, "learning_rate": 1.3674644757060865e-06, "loss": 0.82276201, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19262695, "step": 10218, "time_per_iteration": 2.8784401416778564 }, { "auxiliary_loss_clip": 0.0141518, "auxiliary_loss_mlp": 0.01039497, "balance_loss_clip": 1.25201464, "balance_loss_mlp": 1.0185039, "epoch": 0.6143995190139786, "flos": 20125792068480.0, "grad_norm": 1.5145614644057983, "language_loss": 0.82499397, "learning_rate": 1.367095017101569e-06, "loss": 0.84954077, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20996094, "step": 10219, "time_per_iteration": 4.261526823043823 }, { "auxiliary_loss_clip": 0.01429864, "auxiliary_loss_mlp": 0.01034626, "balance_loss_clip": 1.26420069, "balance_loss_mlp": 1.01507545, "epoch": 0.6144596422666466, "flos": 42318834664320.0, "grad_norm": 1.8110049908311496, "language_loss": 0.67884845, "learning_rate": 1.3667255824968717e-06, "loss": 0.70349336, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19555664, "step": 10220, "time_per_iteration": 3.0127596855163574 }, { "auxiliary_loss_clip": 0.01416561, "auxiliary_loss_mlp": 0.0103134, "balance_loss_clip": 1.25266242, "balance_loss_mlp": 1.01221883, "epoch": 0.6145197655193146, "flos": 21581955319680.0, "grad_norm": 3.051033930891564, "language_loss": 0.72290063, "learning_rate": 1.3663561719060041e-06, "loss": 0.74737966, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19140625, "step": 10221, "time_per_iteration": 2.8733503818511963 }, { "auxiliary_loss_clip": 0.01419588, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.25699675, "balance_loss_mlp": 1.01413441, "epoch": 0.6145798887719826, "flos": 21481203853440.0, "grad_norm": 1.818154941224552, "language_loss": 0.80170768, "learning_rate": 1.3659867853429735e-06, "loss": 0.82623243, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18762207, "step": 10222, "time_per_iteration": 2.9629812240600586 }, { "auxiliary_loss_clip": 0.01430375, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.26395547, "balance_loss_mlp": 1.01200688, "epoch": 0.6146400120246506, "flos": 20786721131520.0, "grad_norm": 3.0968230156184826, "language_loss": 0.7722767, "learning_rate": 1.365617422821788e-06, "loss": 0.79689348, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19287109, "step": 10223, "time_per_iteration": 2.836669445037842 }, { "auxiliary_loss_clip": 0.01414612, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.25376391, "balance_loss_mlp": 1.01411128, "epoch": 0.6147001352773185, "flos": 13889247235200.0, "grad_norm": 2.259572760451063, "language_loss": 0.79216588, "learning_rate": 1.3652480843564535e-06, "loss": 0.81665361, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.20043945, "step": 10224, "time_per_iteration": 2.8400509357452393 }, { "auxiliary_loss_clip": 0.01411521, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.25108099, "balance_loss_mlp": 1.01270807, "epoch": 0.6147602585299865, "flos": 56660612837760.0, "grad_norm": 1.3749490226287833, "language_loss": 0.66790825, "learning_rate": 1.3648787699609746e-06, "loss": 0.69233745, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18676758, "step": 10225, "time_per_iteration": 3.20723295211792 }, { "auxiliary_loss_clip": 0.01427211, "auxiliary_loss_mlp": 0.01033052, "balance_loss_clip": 1.26085591, "balance_loss_mlp": 1.01387107, "epoch": 0.6148203817826544, "flos": 32830020011520.0, "grad_norm": 2.220273190296515, "language_loss": 0.64476711, "learning_rate": 1.364509479649357e-06, "loss": 0.66936976, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19189453, "step": 10226, "time_per_iteration": 2.9998879432678223 }, { "auxiliary_loss_clip": 0.01436659, "auxiliary_loss_mlp": 0.01038307, "balance_loss_clip": 1.26996672, "balance_loss_mlp": 1.01746964, "epoch": 0.6148805050353224, "flos": 18340996700160.0, "grad_norm": 1.9491300340163806, "language_loss": 0.76194775, "learning_rate": 1.3641402134356037e-06, "loss": 0.78669739, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20837402, "step": 10227, "time_per_iteration": 2.87154483795166 }, { "auxiliary_loss_clip": 0.01432541, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.26406634, "balance_loss_mlp": 1.01809764, "epoch": 0.6149406282879903, "flos": 14072036803200.0, "grad_norm": 2.3676973098765606, "language_loss": 0.63299072, "learning_rate": 1.3637709713337164e-06, "loss": 0.65770864, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.21142578, "step": 10228, "time_per_iteration": 2.822516918182373 }, { "auxiliary_loss_clip": 0.01418198, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.25578713, "balance_loss_mlp": 1.01926088, "epoch": 0.6150007515406584, "flos": 25200618681600.0, "grad_norm": 1.4944923755955306, "language_loss": 0.75276786, "learning_rate": 1.3634017533576985e-06, "loss": 0.77733231, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18994141, "step": 10229, "time_per_iteration": 2.8910796642303467 }, { "auxiliary_loss_clip": 0.01431672, "auxiliary_loss_mlp": 0.01033538, "balance_loss_clip": 1.26809025, "balance_loss_mlp": 1.01371324, "epoch": 0.6150608747933263, "flos": 21955452295680.0, "grad_norm": 1.7360388537797682, "language_loss": 0.78945124, "learning_rate": 1.3630325595215493e-06, "loss": 0.81410336, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19824219, "step": 10230, "time_per_iteration": 4.334232807159424 }, { "auxiliary_loss_clip": 0.0143586, "auxiliary_loss_mlp": 0.01038053, "balance_loss_clip": 1.26960921, "balance_loss_mlp": 1.0189085, "epoch": 0.6151209980459943, "flos": 30129602745600.0, "grad_norm": 5.059867026927376, "language_loss": 0.74017549, "learning_rate": 1.36266338983927e-06, "loss": 0.76491463, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19152832, "step": 10231, "time_per_iteration": 2.908796787261963 }, { "auxiliary_loss_clip": 0.01419195, "auxiliary_loss_mlp": 0.01034746, "balance_loss_clip": 1.25551653, "balance_loss_mlp": 1.01573181, "epoch": 0.6151811212986622, "flos": 30020571480960.0, "grad_norm": 1.785882833069432, "language_loss": 0.70938563, "learning_rate": 1.362294244324858e-06, "loss": 0.73392504, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19030762, "step": 10232, "time_per_iteration": 4.258838653564453 }, { "auxiliary_loss_clip": 0.01404041, "auxiliary_loss_mlp": 0.01033359, "balance_loss_clip": 1.24535334, "balance_loss_mlp": 1.01496518, "epoch": 0.6152412445513302, "flos": 18880316444160.0, "grad_norm": 3.5403645539011146, "language_loss": 0.92196214, "learning_rate": 1.3619251229923126e-06, "loss": 0.94633615, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18408203, "step": 10233, "time_per_iteration": 4.265609979629517 }, { "auxiliary_loss_clip": 0.01422036, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.26006222, "balance_loss_mlp": 1.01658654, "epoch": 0.6153013678039982, "flos": 25714845544320.0, "grad_norm": 1.7447863186205352, "language_loss": 0.7229321, "learning_rate": 1.3615560258556306e-06, "loss": 0.74749935, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18078613, "step": 10234, "time_per_iteration": 2.8695971965789795 }, { "auxiliary_loss_clip": 0.01434954, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.26799321, "balance_loss_mlp": 1.01595247, "epoch": 0.6153614910566662, "flos": 28521534142080.0, "grad_norm": 1.9949210220309233, "language_loss": 0.67677164, "learning_rate": 1.3611869529288077e-06, "loss": 0.70147741, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19677734, "step": 10235, "time_per_iteration": 2.948618173599243 }, { "auxiliary_loss_clip": 0.01441183, "auxiliary_loss_mlp": 0.0103691, "balance_loss_clip": 1.27225649, "balance_loss_mlp": 1.01811075, "epoch": 0.6154216143093342, "flos": 23560172784000.0, "grad_norm": 1.7820527269893371, "language_loss": 0.816953, "learning_rate": 1.3608179042258398e-06, "loss": 0.84173393, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.18811035, "step": 10236, "time_per_iteration": 2.866508722305298 }, { "auxiliary_loss_clip": 0.0143313, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.26608694, "balance_loss_mlp": 1.0134095, "epoch": 0.6154817375620021, "flos": 22758513834240.0, "grad_norm": 1.722917619542013, "language_loss": 0.80850661, "learning_rate": 1.360448879760721e-06, "loss": 0.83316052, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18859863, "step": 10237, "time_per_iteration": 2.8357560634613037 }, { "auxiliary_loss_clip": 0.01418186, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.25488818, "balance_loss_mlp": 1.01653886, "epoch": 0.6155418608146701, "flos": 27174583134720.0, "grad_norm": 1.6859222454060037, "language_loss": 0.76546758, "learning_rate": 1.3600798795474449e-06, "loss": 0.79001617, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.20117188, "step": 10238, "time_per_iteration": 2.9183290004730225 }, { "auxiliary_loss_clip": 0.01199217, "auxiliary_loss_mlp": 0.01015661, "balance_loss_clip": 1.10653615, "balance_loss_mlp": 1.00002098, "epoch": 0.615601984067338, "flos": 68838234825600.0, "grad_norm": 0.7570692365341908, "language_loss": 0.5766443, "learning_rate": 1.3597109036000036e-06, "loss": 0.59879303, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.15625, "step": 10239, "time_per_iteration": 3.3379664421081543 }, { "auxiliary_loss_clip": 0.01425214, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.25915241, "balance_loss_mlp": 1.01190674, "epoch": 0.615662107320006, "flos": 15523947043200.0, "grad_norm": 2.302545996864993, "language_loss": 0.78364581, "learning_rate": 1.3593419519323892e-06, "loss": 0.80821753, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20043945, "step": 10240, "time_per_iteration": 2.8647491931915283 }, { "auxiliary_loss_clip": 0.01429583, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.26142573, "balance_loss_mlp": 1.01746142, "epoch": 0.615722230572674, "flos": 21072117202560.0, "grad_norm": 2.0331446547231673, "language_loss": 0.73339796, "learning_rate": 1.3589730245585922e-06, "loss": 0.75806844, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.19995117, "step": 10241, "time_per_iteration": 2.876960039138794 }, { "auxiliary_loss_clip": 0.0140958, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.24927223, "balance_loss_mlp": 1.01388454, "epoch": 0.615782353825342, "flos": 23266813628160.0, "grad_norm": 3.7667199391176642, "language_loss": 0.73159635, "learning_rate": 1.3586041214926018e-06, "loss": 0.75602084, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18994141, "step": 10242, "time_per_iteration": 2.9131641387939453 }, { "auxiliary_loss_clip": 0.01416124, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.2528882, "balance_loss_mlp": 1.01209331, "epoch": 0.6158424770780099, "flos": 21113317232640.0, "grad_norm": 1.797828511400935, "language_loss": 0.73101151, "learning_rate": 1.3582352427484086e-06, "loss": 0.75547171, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.17810059, "step": 10243, "time_per_iteration": 2.8623111248016357 }, { "auxiliary_loss_clip": 0.01194206, "auxiliary_loss_mlp": 0.01020599, "balance_loss_clip": 1.10413396, "balance_loss_mlp": 1.00371861, "epoch": 0.6159026003306779, "flos": 70367540935680.0, "grad_norm": 0.758882536191643, "language_loss": 0.56849653, "learning_rate": 1.3578663883399984e-06, "loss": 0.5906446, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.16894531, "step": 10244, "time_per_iteration": 3.4159066677093506 }, { "auxiliary_loss_clip": 0.01420088, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.25663817, "balance_loss_mlp": 1.01285005, "epoch": 0.6159627235833458, "flos": 33887186202240.0, "grad_norm": 1.6910410982463737, "language_loss": 0.6403811, "learning_rate": 1.3574975582813593e-06, "loss": 0.66490531, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19494629, "step": 10245, "time_per_iteration": 2.98595929145813 }, { "auxiliary_loss_clip": 0.014097, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.24752736, "balance_loss_mlp": 1.01252687, "epoch": 0.6160228468360138, "flos": 26585919296640.0, "grad_norm": 1.8105541034223454, "language_loss": 0.7978763, "learning_rate": 1.3571287525864771e-06, "loss": 0.82228845, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19006348, "step": 10246, "time_per_iteration": 2.908010244369507 }, { "auxiliary_loss_clip": 0.01440763, "auxiliary_loss_mlp": 0.01047872, "balance_loss_clip": 1.27261508, "balance_loss_mlp": 1.02674794, "epoch": 0.6160829700886818, "flos": 17199937370880.0, "grad_norm": 2.519936790371734, "language_loss": 0.87984157, "learning_rate": 1.3567599712693368e-06, "loss": 0.904728, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21118164, "step": 10247, "time_per_iteration": 2.889995574951172 }, { "auxiliary_loss_clip": 0.01434536, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.26852894, "balance_loss_mlp": 1.0149132, "epoch": 0.6161430933413498, "flos": 23634157311360.0, "grad_norm": 1.5839160320004646, "language_loss": 0.80779576, "learning_rate": 1.3563912143439235e-06, "loss": 0.83248115, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19104004, "step": 10248, "time_per_iteration": 2.937851667404175 }, { "auxiliary_loss_clip": 0.01414711, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.2533164, "balance_loss_mlp": 1.01254654, "epoch": 0.6162032165940178, "flos": 23012663731200.0, "grad_norm": 2.619655182289342, "language_loss": 0.87065256, "learning_rate": 1.3560224818242191e-06, "loss": 0.89510697, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18188477, "step": 10249, "time_per_iteration": 2.844805955886841 }, { "auxiliary_loss_clip": 0.01424761, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.25959635, "balance_loss_mlp": 1.01413155, "epoch": 0.6162633398466857, "flos": 39436713705600.0, "grad_norm": 2.464460805578947, "language_loss": 0.70338607, "learning_rate": 1.3556537737242072e-06, "loss": 0.72797906, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20397949, "step": 10250, "time_per_iteration": 2.9893922805786133 }, { "auxiliary_loss_clip": 0.01408977, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.25082493, "balance_loss_mlp": 1.01166272, "epoch": 0.6163234630993537, "flos": 19253994399360.0, "grad_norm": 4.6844361695196275, "language_loss": 0.7460832, "learning_rate": 1.3552850900578692e-06, "loss": 0.77047169, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18225098, "step": 10251, "time_per_iteration": 2.8247179985046387 }, { "auxiliary_loss_clip": 0.01415372, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.25021005, "balance_loss_mlp": 1.01297879, "epoch": 0.6163835863520216, "flos": 15970478405760.0, "grad_norm": 2.095999831089002, "language_loss": 0.68740284, "learning_rate": 1.3549164308391844e-06, "loss": 0.71188509, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1986084, "step": 10252, "time_per_iteration": 2.858543634414673 }, { "auxiliary_loss_clip": 0.01200864, "auxiliary_loss_mlp": 0.01026852, "balance_loss_clip": 1.10855484, "balance_loss_mlp": 1.00558555, "epoch": 0.6164437096046896, "flos": 68136530918400.0, "grad_norm": 0.889290849872288, "language_loss": 0.58037126, "learning_rate": 1.3545477960821333e-06, "loss": 0.6026485, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.21289062, "step": 10253, "time_per_iteration": 3.4137163162231445 }, { "auxiliary_loss_clip": 0.01425037, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.25788629, "balance_loss_mlp": 1.01409125, "epoch": 0.6165038328573575, "flos": 21371177203200.0, "grad_norm": 1.4944481457770558, "language_loss": 0.80233085, "learning_rate": 1.3541791858006946e-06, "loss": 0.82691717, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19506836, "step": 10254, "time_per_iteration": 4.273070573806763 }, { "auxiliary_loss_clip": 0.0143114, "auxiliary_loss_mlp": 0.01035213, "balance_loss_clip": 1.26244497, "balance_loss_mlp": 1.01565087, "epoch": 0.6165639561100256, "flos": 21110874013440.0, "grad_norm": 1.8148366830355995, "language_loss": 0.81137228, "learning_rate": 1.353810600008846e-06, "loss": 0.83603579, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19567871, "step": 10255, "time_per_iteration": 2.881281614303589 }, { "auxiliary_loss_clip": 0.01429361, "auxiliary_loss_mlp": 0.0103584, "balance_loss_clip": 1.26237559, "balance_loss_mlp": 1.01597953, "epoch": 0.6166240793626935, "flos": 25349628366720.0, "grad_norm": 5.662700670000636, "language_loss": 0.66740024, "learning_rate": 1.3534420387205646e-06, "loss": 0.69205225, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19848633, "step": 10256, "time_per_iteration": 2.8848044872283936 }, { "auxiliary_loss_clip": 0.01420663, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.25931692, "balance_loss_mlp": 1.01437068, "epoch": 0.6166842026153615, "flos": 19692110229120.0, "grad_norm": 1.7314966486777392, "language_loss": 0.73135328, "learning_rate": 1.353073501949825e-06, "loss": 0.75589907, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19543457, "step": 10257, "time_per_iteration": 2.842000722885132 }, { "auxiliary_loss_clip": 0.01421958, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.25610554, "balance_loss_mlp": 1.01098251, "epoch": 0.6167443258680294, "flos": 19327888437120.0, "grad_norm": 3.151265864242765, "language_loss": 0.73352861, "learning_rate": 1.3527049897106034e-06, "loss": 0.75805581, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19775391, "step": 10258, "time_per_iteration": 2.845064640045166 }, { "auxiliary_loss_clip": 0.01418319, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.25142169, "balance_loss_mlp": 1.01177025, "epoch": 0.6168044491206974, "flos": 25276232021760.0, "grad_norm": 3.2788530593017486, "language_loss": 0.65258539, "learning_rate": 1.3523365020168735e-06, "loss": 0.6770761, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.18981934, "step": 10259, "time_per_iteration": 2.88090181350708 }, { "auxiliary_loss_clip": 0.014114, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.24899554, "balance_loss_mlp": 1.01564932, "epoch": 0.6168645723733654, "flos": 13226463135360.0, "grad_norm": 1.8838617716780197, "language_loss": 0.71507418, "learning_rate": 1.3519680388826084e-06, "loss": 0.73954284, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19812012, "step": 10260, "time_per_iteration": 2.8262319564819336 }, { "auxiliary_loss_clip": 0.01444176, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.27455151, "balance_loss_mlp": 1.0148474, "epoch": 0.6169246956260334, "flos": 26663478163200.0, "grad_norm": 2.007345619396919, "language_loss": 0.69026893, "learning_rate": 1.3515996003217803e-06, "loss": 0.71506429, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20507812, "step": 10261, "time_per_iteration": 2.8963143825531006 }, { "auxiliary_loss_clip": 0.01411142, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.24746597, "balance_loss_mlp": 1.01297116, "epoch": 0.6169848188787014, "flos": 23158687259520.0, "grad_norm": 1.6717329204529354, "language_loss": 0.71892631, "learning_rate": 1.3512311863483602e-06, "loss": 0.74335861, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19116211, "step": 10262, "time_per_iteration": 2.857736349105835 }, { "auxiliary_loss_clip": 0.01428133, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.26395488, "balance_loss_mlp": 1.01164758, "epoch": 0.6170449421313693, "flos": 23342517457920.0, "grad_norm": 2.3869894631201634, "language_loss": 0.70912099, "learning_rate": 1.3508627969763188e-06, "loss": 0.73371518, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.1965332, "step": 10263, "time_per_iteration": 2.8540382385253906 }, { "auxiliary_loss_clip": 0.01428754, "auxiliary_loss_mlp": 0.01036034, "balance_loss_clip": 1.26100862, "balance_loss_mlp": 1.01706743, "epoch": 0.6171050653840373, "flos": 15860089797120.0, "grad_norm": 2.0749195425037774, "language_loss": 0.76801836, "learning_rate": 1.3504944322196244e-06, "loss": 0.7926662, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.1895752, "step": 10264, "time_per_iteration": 2.886823892593384 }, { "auxiliary_loss_clip": 0.0141311, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.25065923, "balance_loss_mlp": 1.01613665, "epoch": 0.6171651886367052, "flos": 20054567473920.0, "grad_norm": 2.121941146698505, "language_loss": 0.85351449, "learning_rate": 1.350126092092247e-06, "loss": 0.87801164, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20471191, "step": 10265, "time_per_iteration": 4.34124493598938 }, { "auxiliary_loss_clip": 0.01403644, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.24294293, "balance_loss_mlp": 1.01261497, "epoch": 0.6172253118893732, "flos": 26443967800320.0, "grad_norm": 2.030236450959666, "language_loss": 0.65273404, "learning_rate": 1.349757776608153e-06, "loss": 0.67707992, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18310547, "step": 10266, "time_per_iteration": 2.933673858642578 }, { "auxiliary_loss_clip": 0.01412417, "auxiliary_loss_mlp": 0.01034302, "balance_loss_clip": 1.24779391, "balance_loss_mlp": 1.01514506, "epoch": 0.6172854351420412, "flos": 22641790953600.0, "grad_norm": 1.6687025500633408, "language_loss": 0.76188719, "learning_rate": 1.3493894857813094e-06, "loss": 0.78635442, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19152832, "step": 10267, "time_per_iteration": 5.676170110702515 }, { "auxiliary_loss_clip": 0.01429928, "auxiliary_loss_mlp": 0.01038355, "balance_loss_clip": 1.26154685, "balance_loss_mlp": 1.01835179, "epoch": 0.6173455583947092, "flos": 21222258007680.0, "grad_norm": 1.6975070343269385, "language_loss": 0.75651294, "learning_rate": 1.3490212196256818e-06, "loss": 0.78119576, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20007324, "step": 10268, "time_per_iteration": 2.8558239936828613 }, { "auxiliary_loss_clip": 0.01438536, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.26878905, "balance_loss_mlp": 1.01518416, "epoch": 0.6174056816473771, "flos": 19509727864320.0, "grad_norm": 1.6862505944540929, "language_loss": 0.76412112, "learning_rate": 1.3486529781552342e-06, "loss": 0.78885674, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.19836426, "step": 10269, "time_per_iteration": 2.826125383377075 }, { "auxiliary_loss_clip": 0.01409225, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.24484217, "balance_loss_mlp": 1.01476371, "epoch": 0.6174658049000451, "flos": 16005796611840.0, "grad_norm": 2.359998860613369, "language_loss": 0.77168977, "learning_rate": 1.3482847613839318e-06, "loss": 0.79611319, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18359375, "step": 10270, "time_per_iteration": 2.809882402420044 }, { "auxiliary_loss_clip": 0.0142233, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.25792766, "balance_loss_mlp": 1.01591909, "epoch": 0.617525928152713, "flos": 21912804432000.0, "grad_norm": 2.102801982145458, "language_loss": 0.83412564, "learning_rate": 1.347916569325736e-06, "loss": 0.85870284, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19470215, "step": 10271, "time_per_iteration": 2.8300681114196777 }, { "auxiliary_loss_clip": 0.01424427, "auxiliary_loss_mlp": 0.01036204, "balance_loss_clip": 1.25887609, "balance_loss_mlp": 1.01752377, "epoch": 0.617586051405381, "flos": 21115896186240.0, "grad_norm": 1.7600845171183308, "language_loss": 0.77995968, "learning_rate": 1.3475484019946093e-06, "loss": 0.80456597, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.18688965, "step": 10272, "time_per_iteration": 2.8330299854278564 }, { "auxiliary_loss_clip": 0.01196724, "auxiliary_loss_mlp": 0.01020147, "balance_loss_clip": 1.10555184, "balance_loss_mlp": 1.0024085, "epoch": 0.617646174658049, "flos": 58639255488000.0, "grad_norm": 0.8094277516357877, "language_loss": 0.59165466, "learning_rate": 1.347180259404513e-06, "loss": 0.61382341, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.17773438, "step": 10273, "time_per_iteration": 3.201281785964966 }, { "auxiliary_loss_clip": 0.01411921, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.25022292, "balance_loss_mlp": 1.01485562, "epoch": 0.617706297910717, "flos": 13885944364800.0, "grad_norm": 3.059517164332595, "language_loss": 0.73914707, "learning_rate": 1.3468121415694059e-06, "loss": 0.76360464, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18981934, "step": 10274, "time_per_iteration": 2.8165013790130615 }, { "auxiliary_loss_clip": 0.01422242, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.25833559, "balance_loss_mlp": 1.01572835, "epoch": 0.617766421163385, "flos": 19217816542080.0, "grad_norm": 1.7715768831864545, "language_loss": 0.78892362, "learning_rate": 1.3464440485032484e-06, "loss": 0.81350017, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19677734, "step": 10275, "time_per_iteration": 2.892542600631714 }, { "auxiliary_loss_clip": 0.01410392, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.24683046, "balance_loss_mlp": 1.01608229, "epoch": 0.6178265444160529, "flos": 22576448183040.0, "grad_norm": 1.665912470558785, "language_loss": 0.8003307, "learning_rate": 1.346075980219998e-06, "loss": 0.82479811, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20251465, "step": 10276, "time_per_iteration": 2.8324923515319824 }, { "auxiliary_loss_clip": 0.01426099, "auxiliary_loss_mlp": 0.01039309, "balance_loss_clip": 1.26068211, "balance_loss_mlp": 1.01912677, "epoch": 0.6178866676687209, "flos": 11991801018240.0, "grad_norm": 1.831212542225, "language_loss": 0.81652141, "learning_rate": 1.345707936733612e-06, "loss": 0.8411755, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20178223, "step": 10277, "time_per_iteration": 2.8627374172210693 }, { "auxiliary_loss_clip": 0.01434244, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.26608551, "balance_loss_mlp": 1.01822686, "epoch": 0.6179467909213888, "flos": 21000123446400.0, "grad_norm": 1.518157870500574, "language_loss": 0.82515377, "learning_rate": 1.3453399180580466e-06, "loss": 0.84988391, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20532227, "step": 10278, "time_per_iteration": 2.872072696685791 }, { "auxiliary_loss_clip": 0.014099, "auxiliary_loss_mlp": 0.01032561, "balance_loss_clip": 1.24681854, "balance_loss_mlp": 1.01305795, "epoch": 0.6180069141740568, "flos": 25349085429120.0, "grad_norm": 1.7391308841373785, "language_loss": 0.74408567, "learning_rate": 1.3449719242072567e-06, "loss": 0.76851028, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19494629, "step": 10279, "time_per_iteration": 2.8699381351470947 }, { "auxiliary_loss_clip": 0.01422431, "auxiliary_loss_mlp": 0.01038233, "balance_loss_clip": 1.25917721, "balance_loss_mlp": 1.01905191, "epoch": 0.6180670374267248, "flos": 19655072720640.0, "grad_norm": 1.4400008371261395, "language_loss": 0.71122867, "learning_rate": 1.3446039551951975e-06, "loss": 0.73583531, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19189453, "step": 10280, "time_per_iteration": 2.838732957839966 }, { "auxiliary_loss_clip": 0.01426328, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.26059484, "balance_loss_mlp": 1.01723123, "epoch": 0.6181271606793928, "flos": 19474590637440.0, "grad_norm": 1.6355469496507058, "language_loss": 0.72903699, "learning_rate": 1.3442360110358215e-06, "loss": 0.75366545, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19287109, "step": 10281, "time_per_iteration": 2.818643569946289 }, { "auxiliary_loss_clip": 0.01403805, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.24438965, "balance_loss_mlp": 1.01716447, "epoch": 0.6181872839320607, "flos": 25605542810880.0, "grad_norm": 1.4932057766613216, "language_loss": 0.77847052, "learning_rate": 1.3438680917430827e-06, "loss": 0.80286908, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18908691, "step": 10282, "time_per_iteration": 2.8969223499298096 }, { "auxiliary_loss_clip": 0.01430391, "auxiliary_loss_mlp": 0.01037761, "balance_loss_clip": 1.26219738, "balance_loss_mlp": 1.01710224, "epoch": 0.6182474071847287, "flos": 25561492358400.0, "grad_norm": 1.6497136675937623, "language_loss": 0.69582456, "learning_rate": 1.343500197330931e-06, "loss": 0.72050607, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20654297, "step": 10283, "time_per_iteration": 2.8804261684417725 }, { "auxiliary_loss_clip": 0.01446141, "auxiliary_loss_mlp": 0.01033647, "balance_loss_clip": 1.27335131, "balance_loss_mlp": 1.01394176, "epoch": 0.6183075304373966, "flos": 22132948222080.0, "grad_norm": 1.8687245848321017, "language_loss": 0.75346673, "learning_rate": 1.3431323278133176e-06, "loss": 0.77826464, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.19702148, "step": 10284, "time_per_iteration": 2.856048822402954 }, { "auxiliary_loss_clip": 0.01401102, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.24386525, "balance_loss_mlp": 1.01630938, "epoch": 0.6183676536900646, "flos": 22465788105600.0, "grad_norm": 1.4990264835298597, "language_loss": 0.76262605, "learning_rate": 1.3427644832041922e-06, "loss": 0.78699243, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19226074, "step": 10285, "time_per_iteration": 2.8706209659576416 }, { "auxiliary_loss_clip": 0.01414309, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.24979138, "balance_loss_mlp": 1.01445377, "epoch": 0.6184277769427327, "flos": 23373627897600.0, "grad_norm": 1.8240474647686413, "language_loss": 0.73462409, "learning_rate": 1.342396663517503e-06, "loss": 0.7591083, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19677734, "step": 10286, "time_per_iteration": 2.8997135162353516 }, { "auxiliary_loss_clip": 0.01413563, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.25048482, "balance_loss_mlp": 1.01263976, "epoch": 0.6184879001954006, "flos": 22721340591360.0, "grad_norm": 1.7025529512601867, "language_loss": 0.76804364, "learning_rate": 1.342028868767199e-06, "loss": 0.79249346, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.1875, "step": 10287, "time_per_iteration": 2.867908000946045 }, { "auxiliary_loss_clip": 0.01421738, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.25786567, "balance_loss_mlp": 1.0129106, "epoch": 0.6185480234480686, "flos": 23852446064640.0, "grad_norm": 1.6456854050267995, "language_loss": 0.73329556, "learning_rate": 1.3416610989672262e-06, "loss": 0.75783992, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19775391, "step": 10288, "time_per_iteration": 2.882108211517334 }, { "auxiliary_loss_clip": 0.01394826, "auxiliary_loss_mlp": 0.0103057, "balance_loss_clip": 1.23617196, "balance_loss_mlp": 1.01210403, "epoch": 0.6186081467007365, "flos": 45493636106880.0, "grad_norm": 1.7042012845829833, "language_loss": 0.73613572, "learning_rate": 1.3412933541315296e-06, "loss": 0.76038963, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18457031, "step": 10289, "time_per_iteration": 4.469456195831299 }, { "auxiliary_loss_clip": 0.01423481, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.256387, "balance_loss_mlp": 1.01585829, "epoch": 0.6186682699534045, "flos": 23561530128000.0, "grad_norm": 2.138977945006333, "language_loss": 0.79767597, "learning_rate": 1.340925634274056e-06, "loss": 0.82227361, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.2043457, "step": 10290, "time_per_iteration": 2.9141712188720703 }, { "auxiliary_loss_clip": 0.01429991, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.26242292, "balance_loss_mlp": 1.01341367, "epoch": 0.6187283932060724, "flos": 25784848529280.0, "grad_norm": 1.7541525066370758, "language_loss": 0.82051194, "learning_rate": 1.3405579394087475e-06, "loss": 0.84514165, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19567871, "step": 10291, "time_per_iteration": 2.899714946746826 }, { "auxiliary_loss_clip": 0.01422285, "auxiliary_loss_mlp": 0.01033503, "balance_loss_clip": 1.25659966, "balance_loss_mlp": 1.01390457, "epoch": 0.6187885164587404, "flos": 25276186776960.0, "grad_norm": 1.731830837887726, "language_loss": 0.78309983, "learning_rate": 1.3401902695495487e-06, "loss": 0.80765772, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19592285, "step": 10292, "time_per_iteration": 2.8982324600219727 }, { "auxiliary_loss_clip": 0.0143938, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.26773167, "balance_loss_mlp": 1.01602721, "epoch": 0.6188486397114084, "flos": 26262128373120.0, "grad_norm": 2.114492695611436, "language_loss": 0.73705339, "learning_rate": 1.339822624710401e-06, "loss": 0.76181769, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.21032715, "step": 10293, "time_per_iteration": 2.9023170471191406 }, { "auxiliary_loss_clip": 0.01421446, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.25696266, "balance_loss_mlp": 1.01522684, "epoch": 0.6189087629640764, "flos": 20933242352640.0, "grad_norm": 1.5845831189416137, "language_loss": 0.83554184, "learning_rate": 1.3394550049052454e-06, "loss": 0.8601051, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19665527, "step": 10294, "time_per_iteration": 2.8660778999328613 }, { "auxiliary_loss_clip": 0.01410774, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.24622667, "balance_loss_mlp": 1.01451278, "epoch": 0.6189688862167443, "flos": 14837970343680.0, "grad_norm": 6.496050594112748, "language_loss": 0.71759033, "learning_rate": 1.3390874101480225e-06, "loss": 0.74203801, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19482422, "step": 10295, "time_per_iteration": 2.794046401977539 }, { "auxiliary_loss_clip": 0.01404057, "auxiliary_loss_mlp": 0.01036054, "balance_loss_clip": 1.24190307, "balance_loss_mlp": 1.01618159, "epoch": 0.6190290094694123, "flos": 24297077145600.0, "grad_norm": 1.4536539537209683, "language_loss": 0.70760489, "learning_rate": 1.3387198404526705e-06, "loss": 0.73200595, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19873047, "step": 10296, "time_per_iteration": 2.9184675216674805 }, { "auxiliary_loss_clip": 0.01420812, "auxiliary_loss_mlp": 0.01037625, "balance_loss_clip": 1.25458503, "balance_loss_mlp": 1.0171212, "epoch": 0.6190891327220802, "flos": 22539908367360.0, "grad_norm": 2.0796694953303754, "language_loss": 0.72767663, "learning_rate": 1.3383522958331287e-06, "loss": 0.75226104, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20507812, "step": 10297, "time_per_iteration": 2.8540337085723877 }, { "auxiliary_loss_clip": 0.01194863, "auxiliary_loss_mlp": 0.010199, "balance_loss_clip": 1.10632896, "balance_loss_mlp": 1.00168478, "epoch": 0.6191492559747482, "flos": 67759459603200.0, "grad_norm": 0.8960907583886707, "language_loss": 0.6424948, "learning_rate": 1.3379847763033345e-06, "loss": 0.66464245, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.18261719, "step": 10298, "time_per_iteration": 3.2800772190093994 }, { "auxiliary_loss_clip": 0.01413192, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.24804771, "balance_loss_mlp": 1.02327967, "epoch": 0.6192093792274163, "flos": 22356983064960.0, "grad_norm": 1.6841677404813264, "language_loss": 0.74922061, "learning_rate": 1.3376172818772236e-06, "loss": 0.7737782, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19287109, "step": 10299, "time_per_iteration": 2.895599126815796 }, { "auxiliary_loss_clip": 0.01432412, "auxiliary_loss_mlp": 0.01036538, "balance_loss_clip": 1.26172328, "balance_loss_mlp": 1.01698768, "epoch": 0.6192695024800842, "flos": 13562062951680.0, "grad_norm": 1.901617851849146, "language_loss": 0.68843836, "learning_rate": 1.337249812568732e-06, "loss": 0.71312785, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.19555664, "step": 10300, "time_per_iteration": 4.311887979507446 }, { "auxiliary_loss_clip": 0.01413047, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.24827075, "balance_loss_mlp": 1.01815772, "epoch": 0.6193296257327522, "flos": 17422841093760.0, "grad_norm": 1.7554620577067552, "language_loss": 0.67722636, "learning_rate": 1.3368823683917939e-06, "loss": 0.70173568, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19726562, "step": 10301, "time_per_iteration": 2.823730707168579 }, { "auxiliary_loss_clip": 0.01411852, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.2462883, "balance_loss_mlp": 1.01432765, "epoch": 0.6193897489854201, "flos": 31113146367360.0, "grad_norm": 1.690167095036896, "language_loss": 0.73944747, "learning_rate": 1.3365149493603424e-06, "loss": 0.76389635, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18701172, "step": 10302, "time_per_iteration": 4.320590257644653 }, { "auxiliary_loss_clip": 0.01407964, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.24444818, "balance_loss_mlp": 1.0115962, "epoch": 0.6194498722380881, "flos": 19143017608320.0, "grad_norm": 2.084872103215293, "language_loss": 0.81492001, "learning_rate": 1.3361475554883107e-06, "loss": 0.83932996, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.21435547, "step": 10303, "time_per_iteration": 4.276179790496826 }, { "auxiliary_loss_clip": 0.01429038, "auxiliary_loss_mlp": 0.0103665, "balance_loss_clip": 1.25860095, "balance_loss_mlp": 1.01577616, "epoch": 0.619509995490756, "flos": 21845063687040.0, "grad_norm": 1.5780255990836647, "language_loss": 0.77450073, "learning_rate": 1.3357801867896307e-06, "loss": 0.79915762, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.20874023, "step": 10304, "time_per_iteration": 2.873863697052002 }, { "auxiliary_loss_clip": 0.0142614, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.25649083, "balance_loss_mlp": 1.01581502, "epoch": 0.619570118743424, "flos": 23817354082560.0, "grad_norm": 1.9886206936970547, "language_loss": 0.78648555, "learning_rate": 1.3354128432782324e-06, "loss": 0.81111395, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20874023, "step": 10305, "time_per_iteration": 2.978706121444702 }, { "auxiliary_loss_clip": 0.01426504, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.2573626, "balance_loss_mlp": 1.01725054, "epoch": 0.619630241996092, "flos": 21110421565440.0, "grad_norm": 1.6412530275777248, "language_loss": 0.7950455, "learning_rate": 1.335045524968045e-06, "loss": 0.81969517, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21191406, "step": 10306, "time_per_iteration": 2.923522472381592 }, { "auxiliary_loss_clip": 0.01410687, "auxiliary_loss_mlp": 0.0103077, "balance_loss_clip": 1.24869049, "balance_loss_mlp": 1.01177955, "epoch": 0.61969036524876, "flos": 27319747011840.0, "grad_norm": 1.7482221807783596, "language_loss": 0.81039643, "learning_rate": 1.3346782318729988e-06, "loss": 0.83481103, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18994141, "step": 10307, "time_per_iteration": 2.904693365097046 }, { "auxiliary_loss_clip": 0.01194278, "auxiliary_loss_mlp": 0.01017373, "balance_loss_clip": 1.10685229, "balance_loss_mlp": 0.9978224, "epoch": 0.6197504885014279, "flos": 51677434206720.0, "grad_norm": 0.8076171877744418, "language_loss": 0.59422278, "learning_rate": 1.3343109640070203e-06, "loss": 0.61633933, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.1953125, "step": 10308, "time_per_iteration": 3.3857014179229736 }, { "auxiliary_loss_clip": 0.01411334, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.24903154, "balance_loss_mlp": 1.0111239, "epoch": 0.6198106117540959, "flos": 30569845080960.0, "grad_norm": 1.862479757372258, "language_loss": 0.68363994, "learning_rate": 1.333943721384037e-06, "loss": 0.70805585, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19128418, "step": 10309, "time_per_iteration": 2.9414303302764893 }, { "auxiliary_loss_clip": 0.01404566, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.24287271, "balance_loss_mlp": 1.01602244, "epoch": 0.6198707350067638, "flos": 18917263463040.0, "grad_norm": 1.4480451517674497, "language_loss": 0.73052859, "learning_rate": 1.3335765040179746e-06, "loss": 0.7549417, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.20727539, "step": 10310, "time_per_iteration": 2.8381073474884033 }, { "auxiliary_loss_clip": 0.0142838, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.26142681, "balance_loss_mlp": 1.0165242, "epoch": 0.6199308582594318, "flos": 21443759141760.0, "grad_norm": 1.9809452698090997, "language_loss": 0.79574466, "learning_rate": 1.3332093119227573e-06, "loss": 0.8203963, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20275879, "step": 10311, "time_per_iteration": 2.877274513244629 }, { "auxiliary_loss_clip": 0.01410272, "auxiliary_loss_mlp": 0.01032712, "balance_loss_clip": 1.2439574, "balance_loss_mlp": 1.01267266, "epoch": 0.6199909815120999, "flos": 18416971998720.0, "grad_norm": 2.885738311527711, "language_loss": 0.73164809, "learning_rate": 1.3328421451123105e-06, "loss": 0.75607789, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20043945, "step": 10312, "time_per_iteration": 2.8203017711639404 }, { "auxiliary_loss_clip": 0.0143989, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.26911807, "balance_loss_mlp": 1.01547253, "epoch": 0.6200511047647678, "flos": 21475864967040.0, "grad_norm": 1.8287160448351079, "language_loss": 0.72885442, "learning_rate": 1.3324750036005557e-06, "loss": 0.75361085, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20288086, "step": 10313, "time_per_iteration": 2.8425076007843018 }, { "auxiliary_loss_clip": 0.01430172, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.26173019, "balance_loss_mlp": 1.01351988, "epoch": 0.6201112280174358, "flos": 18223097454720.0, "grad_norm": 3.1652499886155163, "language_loss": 0.78978574, "learning_rate": 1.332107887401416e-06, "loss": 0.81442499, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20239258, "step": 10314, "time_per_iteration": 2.8005306720733643 }, { "auxiliary_loss_clip": 0.01408085, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.24305928, "balance_loss_mlp": 1.01630616, "epoch": 0.6201713512701037, "flos": 20020787591040.0, "grad_norm": 1.677479255773925, "language_loss": 0.78536248, "learning_rate": 1.331740796528812e-06, "loss": 0.80980152, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19506836, "step": 10315, "time_per_iteration": 2.838513135910034 }, { "auxiliary_loss_clip": 0.01425303, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.25717139, "balance_loss_mlp": 1.01964712, "epoch": 0.6202314745227717, "flos": 22496762810880.0, "grad_norm": 1.9273849343531655, "language_loss": 0.76949614, "learning_rate": 1.3313737309966641e-06, "loss": 0.79413688, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19140625, "step": 10316, "time_per_iteration": 2.8956499099731445 }, { "auxiliary_loss_clip": 0.01407509, "auxiliary_loss_mlp": 0.01033805, "balance_loss_clip": 1.24130583, "balance_loss_mlp": 1.01296747, "epoch": 0.6202915977754396, "flos": 26838666604800.0, "grad_norm": 2.089232130543429, "language_loss": 0.78501731, "learning_rate": 1.3310066908188915e-06, "loss": 0.80943048, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20837402, "step": 10317, "time_per_iteration": 2.8941309452056885 }, { "auxiliary_loss_clip": 0.01203346, "auxiliary_loss_mlp": 0.01048229, "balance_loss_clip": 1.11139488, "balance_loss_mlp": 1.02553129, "epoch": 0.6203517210281076, "flos": 62774588931840.0, "grad_norm": 0.6934760283803361, "language_loss": 0.59129912, "learning_rate": 1.3306396760094122e-06, "loss": 0.61381489, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.2265625, "step": 10318, "time_per_iteration": 3.4170868396759033 }, { "auxiliary_loss_clip": 0.01418565, "auxiliary_loss_mlp": 0.01041488, "balance_loss_clip": 1.25313163, "balance_loss_mlp": 1.0197444, "epoch": 0.6204118442807756, "flos": 23414737438080.0, "grad_norm": 1.6245183008069577, "language_loss": 0.78862488, "learning_rate": 1.330272686582143e-06, "loss": 0.81322545, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.21728516, "step": 10319, "time_per_iteration": 2.905768394470215 }, { "auxiliary_loss_clip": 0.01402642, "auxiliary_loss_mlp": 0.01034689, "balance_loss_clip": 1.24111223, "balance_loss_mlp": 1.01569903, "epoch": 0.6204719675334436, "flos": 20203305690240.0, "grad_norm": 1.7785194501910797, "language_loss": 0.67394471, "learning_rate": 1.3299057225510013e-06, "loss": 0.698318, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18994141, "step": 10320, "time_per_iteration": 2.878734588623047 }, { "auxiliary_loss_clip": 0.01397975, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.2367475, "balance_loss_mlp": 1.01543808, "epoch": 0.6205320907861115, "flos": 13195805143680.0, "grad_norm": 1.6692119387511808, "language_loss": 0.77148533, "learning_rate": 1.3295387839299013e-06, "loss": 0.79581237, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19287109, "step": 10321, "time_per_iteration": 2.8377456665039062 }, { "auxiliary_loss_clip": 0.01392868, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.23192048, "balance_loss_mlp": 1.01339793, "epoch": 0.6205922140387795, "flos": 20678278049280.0, "grad_norm": 1.7270199572680083, "language_loss": 0.74660134, "learning_rate": 1.329171870732758e-06, "loss": 0.77085483, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19091797, "step": 10322, "time_per_iteration": 2.8560922145843506 }, { "auxiliary_loss_clip": 0.01403298, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.24058557, "balance_loss_mlp": 1.01506424, "epoch": 0.6206523372914474, "flos": 23888216718720.0, "grad_norm": 1.7568640670967484, "language_loss": 0.73649967, "learning_rate": 1.3288049829734845e-06, "loss": 0.76087761, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19421387, "step": 10323, "time_per_iteration": 2.88151216506958 }, { "auxiliary_loss_clip": 0.01430213, "auxiliary_loss_mlp": 0.01035114, "balance_loss_clip": 1.25839686, "balance_loss_mlp": 1.0142045, "epoch": 0.6207124605441154, "flos": 13414320120960.0, "grad_norm": 2.735372635604947, "language_loss": 0.60448581, "learning_rate": 1.3284381206659933e-06, "loss": 0.62913907, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.20922852, "step": 10324, "time_per_iteration": 4.265254735946655 }, { "auxiliary_loss_clip": 0.01423321, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.25556183, "balance_loss_mlp": 1.01520121, "epoch": 0.6207725837967835, "flos": 18925588506240.0, "grad_norm": 1.9176967149696995, "language_loss": 0.77575254, "learning_rate": 1.3280712838241956e-06, "loss": 0.80034608, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20849609, "step": 10325, "time_per_iteration": 2.8687658309936523 }, { "auxiliary_loss_clip": 0.0142728, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.25831282, "balance_loss_mlp": 1.0112021, "epoch": 0.6208327070494514, "flos": 23988787205760.0, "grad_norm": 2.167028048460525, "language_loss": 0.73253649, "learning_rate": 1.327704472462003e-06, "loss": 0.75712335, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.2019043, "step": 10326, "time_per_iteration": 2.8777382373809814 }, { "auxiliary_loss_clip": 0.01421074, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.25272584, "balance_loss_mlp": 1.01767874, "epoch": 0.6208928303021194, "flos": 22830688569600.0, "grad_norm": 2.673547344161478, "language_loss": 0.7445538, "learning_rate": 1.3273376865933234e-06, "loss": 0.76915514, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21362305, "step": 10327, "time_per_iteration": 2.8402843475341797 }, { "auxiliary_loss_clip": 0.01430951, "auxiliary_loss_mlp": 0.01033527, "balance_loss_clip": 1.26282644, "balance_loss_mlp": 1.01180744, "epoch": 0.6209529535547873, "flos": 17572936654080.0, "grad_norm": 2.4898195633204736, "language_loss": 0.81428623, "learning_rate": 1.326970926232066e-06, "loss": 0.83893102, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21704102, "step": 10328, "time_per_iteration": 2.847388505935669 }, { "auxiliary_loss_clip": 0.01415822, "auxiliary_loss_mlp": 0.01037885, "balance_loss_clip": 1.25012958, "balance_loss_mlp": 1.01603413, "epoch": 0.6210130768074553, "flos": 22020478352640.0, "grad_norm": 2.0424529646378504, "language_loss": 0.78647304, "learning_rate": 1.3266041913921396e-06, "loss": 0.81101012, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21862793, "step": 10329, "time_per_iteration": 2.8287460803985596 }, { "auxiliary_loss_clip": 0.01192513, "auxiliary_loss_mlp": 0.01036415, "balance_loss_clip": 1.10776162, "balance_loss_mlp": 1.01676977, "epoch": 0.6210732000601232, "flos": 63705259365120.0, "grad_norm": 0.8406787062293202, "language_loss": 0.6221323, "learning_rate": 1.3262374820874484e-06, "loss": 0.64442158, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 0.19628906, "step": 10330, "time_per_iteration": 3.292299270629883 }, { "auxiliary_loss_clip": 0.01426917, "auxiliary_loss_mlp": 0.01036303, "balance_loss_clip": 1.25833189, "balance_loss_mlp": 1.0157994, "epoch": 0.6211333233127913, "flos": 24254022078720.0, "grad_norm": 1.9376177623153126, "language_loss": 0.78996146, "learning_rate": 1.3258707983319002e-06, "loss": 0.81459367, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20507812, "step": 10331, "time_per_iteration": 2.885284662246704 }, { "auxiliary_loss_clip": 0.01425752, "auxiliary_loss_mlp": 0.01038847, "balance_loss_clip": 1.25665379, "balance_loss_mlp": 1.01797318, "epoch": 0.6211934465654592, "flos": 16951940766720.0, "grad_norm": 1.933695747176211, "language_loss": 0.68320429, "learning_rate": 1.3255041401393992e-06, "loss": 0.70785034, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.2088623, "step": 10332, "time_per_iteration": 2.8588366508483887 }, { "auxiliary_loss_clip": 0.01417174, "auxiliary_loss_mlp": 0.01032826, "balance_loss_clip": 1.2513082, "balance_loss_mlp": 1.01279938, "epoch": 0.6212535698181272, "flos": 15275588480640.0, "grad_norm": 1.922229973877783, "language_loss": 0.7678563, "learning_rate": 1.3251375075238476e-06, "loss": 0.79235625, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20031738, "step": 10333, "time_per_iteration": 2.9523508548736572 }, { "auxiliary_loss_clip": 0.01406662, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.24463582, "balance_loss_mlp": 1.01527607, "epoch": 0.6213136930707951, "flos": 13451900567040.0, "grad_norm": 3.078980702150917, "language_loss": 0.70944291, "learning_rate": 1.3247709004991507e-06, "loss": 0.73386514, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20275879, "step": 10334, "time_per_iteration": 2.8661539554595947 }, { "auxiliary_loss_clip": 0.01406772, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.24438024, "balance_loss_mlp": 1.01076221, "epoch": 0.6213738163234631, "flos": 18119631300480.0, "grad_norm": 2.0590818865755445, "language_loss": 0.70418203, "learning_rate": 1.3244043190792078e-06, "loss": 0.72854805, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19067383, "step": 10335, "time_per_iteration": 4.283087253570557 }, { "auxiliary_loss_clip": 0.01417427, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.25474048, "balance_loss_mlp": 1.01050103, "epoch": 0.621433939576131, "flos": 25348135288320.0, "grad_norm": 1.4260894177203194, "language_loss": 0.80615866, "learning_rate": 1.3240377632779213e-06, "loss": 0.83063453, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19628906, "step": 10336, "time_per_iteration": 2.9185190200805664 }, { "auxiliary_loss_clip": 0.01398137, "auxiliary_loss_mlp": 0.01033079, "balance_loss_clip": 1.23747969, "balance_loss_mlp": 1.01259851, "epoch": 0.621494062828799, "flos": 22575995735040.0, "grad_norm": 1.6705929916528577, "language_loss": 0.7364471, "learning_rate": 1.3236712331091907e-06, "loss": 0.76075923, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20483398, "step": 10337, "time_per_iteration": 4.31508207321167 }, { "auxiliary_loss_clip": 0.01425391, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.25589538, "balance_loss_mlp": 1.01492643, "epoch": 0.621554186081467, "flos": 27429547438080.0, "grad_norm": 3.0960265174885317, "language_loss": 0.63986719, "learning_rate": 1.3233047285869145e-06, "loss": 0.6644755, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20507812, "step": 10338, "time_per_iteration": 2.8883275985717773 }, { "auxiliary_loss_clip": 0.01420261, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.25575709, "balance_loss_mlp": 1.01117063, "epoch": 0.621614309334135, "flos": 22357164044160.0, "grad_norm": 1.4786053083553699, "language_loss": 0.72254199, "learning_rate": 1.322938249724991e-06, "loss": 0.74704981, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19348145, "step": 10339, "time_per_iteration": 2.8704187870025635 }, { "auxiliary_loss_clip": 0.01399475, "auxiliary_loss_mlp": 0.01036822, "balance_loss_clip": 1.2384963, "balance_loss_mlp": 1.01575768, "epoch": 0.621674432586803, "flos": 19290579459840.0, "grad_norm": 1.8275957207017957, "language_loss": 0.70367408, "learning_rate": 1.3225717965373166e-06, "loss": 0.728037, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.21069336, "step": 10340, "time_per_iteration": 2.846717119216919 }, { "auxiliary_loss_clip": 0.01400771, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.23865294, "balance_loss_mlp": 1.01393151, "epoch": 0.6217345558394709, "flos": 21617725973760.0, "grad_norm": 1.9106037807322824, "language_loss": 0.69989884, "learning_rate": 1.322205369037788e-06, "loss": 0.72425163, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20568848, "step": 10341, "time_per_iteration": 2.8518295288085938 }, { "auxiliary_loss_clip": 0.01417738, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.25186563, "balance_loss_mlp": 1.01325655, "epoch": 0.6217946790921389, "flos": 18013088499840.0, "grad_norm": 1.7141534917200223, "language_loss": 0.80992138, "learning_rate": 1.321838967240299e-06, "loss": 0.83444095, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.2097168, "step": 10342, "time_per_iteration": 2.830789566040039 }, { "auxiliary_loss_clip": 0.01191718, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.10582972, "balance_loss_mlp": 1.00926769, "epoch": 0.6218548023448068, "flos": 62004583359360.0, "grad_norm": 0.7889177331274366, "language_loss": 0.57358801, "learning_rate": 1.3214725911587452e-06, "loss": 0.59579241, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.19433594, "step": 10343, "time_per_iteration": 3.2910213470458984 }, { "auxiliary_loss_clip": 0.01395578, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.23515248, "balance_loss_mlp": 1.01183867, "epoch": 0.6219149255974749, "flos": 25750163750400.0, "grad_norm": 2.6103681137018704, "language_loss": 0.73526251, "learning_rate": 1.3211062408070184e-06, "loss": 0.75952554, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18896484, "step": 10344, "time_per_iteration": 2.9269955158233643 }, { "auxiliary_loss_clip": 0.01414027, "auxiliary_loss_mlp": 0.0103714, "balance_loss_clip": 1.24950695, "balance_loss_mlp": 1.01671982, "epoch": 0.6219750488501428, "flos": 25422074570880.0, "grad_norm": 1.689220590238894, "language_loss": 0.61072224, "learning_rate": 1.3207399161990105e-06, "loss": 0.63523388, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20422363, "step": 10345, "time_per_iteration": 2.901153564453125 }, { "auxiliary_loss_clip": 0.01411991, "auxiliary_loss_mlp": 0.0103531, "balance_loss_clip": 1.2474544, "balance_loss_mlp": 1.01446056, "epoch": 0.6220351721028108, "flos": 20056920203520.0, "grad_norm": 2.0506466777648122, "language_loss": 0.78758377, "learning_rate": 1.320373617348614e-06, "loss": 0.81205678, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20837402, "step": 10346, "time_per_iteration": 2.8809657096862793 }, { "auxiliary_loss_clip": 0.01415704, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.24787951, "balance_loss_mlp": 1.01435173, "epoch": 0.6220952953554787, "flos": 27499324199040.0, "grad_norm": 1.8567347889708723, "language_loss": 0.72395295, "learning_rate": 1.3200073442697171e-06, "loss": 0.74845707, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20361328, "step": 10347, "time_per_iteration": 2.896475315093994 }, { "auxiliary_loss_clip": 0.01414334, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.25070477, "balance_loss_mlp": 1.01410639, "epoch": 0.6221554186081467, "flos": 19216775911680.0, "grad_norm": 3.6075826049757804, "language_loss": 0.72865021, "learning_rate": 1.3196410969762108e-06, "loss": 0.75314063, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20617676, "step": 10348, "time_per_iteration": 2.879554510116577 }, { "auxiliary_loss_clip": 0.01187336, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.10160041, "balance_loss_mlp": 1.0098542, "epoch": 0.6222155418608146, "flos": 62980118651520.0, "grad_norm": 0.8218903598601407, "language_loss": 0.54250956, "learning_rate": 1.3192748754819815e-06, "loss": 0.56465977, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.17871094, "step": 10349, "time_per_iteration": 3.3399224281311035 }, { "auxiliary_loss_clip": 0.01423068, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.25710249, "balance_loss_mlp": 1.0122, "epoch": 0.6222756651134826, "flos": 22611223451520.0, "grad_norm": 1.951743560498765, "language_loss": 0.69923675, "learning_rate": 1.3189086798009173e-06, "loss": 0.72378463, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19506836, "step": 10350, "time_per_iteration": 2.9471845626831055 }, { "auxiliary_loss_clip": 0.01408682, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.24501204, "balance_loss_mlp": 1.01639247, "epoch": 0.6223357883661506, "flos": 21152119288320.0, "grad_norm": 1.9361258847616158, "language_loss": 0.58206069, "learning_rate": 1.3185425099469046e-06, "loss": 0.60651803, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20666504, "step": 10351, "time_per_iteration": 2.8502047061920166 }, { "auxiliary_loss_clip": 0.01190214, "auxiliary_loss_mlp": 0.01025069, "balance_loss_clip": 1.10192227, "balance_loss_mlp": 1.00570917, "epoch": 0.6223959116188186, "flos": 63797432302080.0, "grad_norm": 0.7939591424533053, "language_loss": 0.61206114, "learning_rate": 1.3181763659338276e-06, "loss": 0.63421398, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19335938, "step": 10352, "time_per_iteration": 3.242985486984253 }, { "auxiliary_loss_clip": 0.01407763, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.2460264, "balance_loss_mlp": 1.01535487, "epoch": 0.6224560348714866, "flos": 22576131469440.0, "grad_norm": 2.5465376547902427, "language_loss": 0.83192146, "learning_rate": 1.3178102477755714e-06, "loss": 0.85635173, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19909668, "step": 10353, "time_per_iteration": 2.857053756713867 }, { "auxiliary_loss_clip": 0.01405112, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.24399471, "balance_loss_mlp": 1.01254368, "epoch": 0.6225161581241545, "flos": 24108450998400.0, "grad_norm": 1.6699342103467656, "language_loss": 0.76698947, "learning_rate": 1.3174441554860195e-06, "loss": 0.79136121, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19506836, "step": 10354, "time_per_iteration": 2.898144006729126 }, { "auxiliary_loss_clip": 0.01402526, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.2407434, "balance_loss_mlp": 1.01723886, "epoch": 0.6225762813768225, "flos": 20451799987200.0, "grad_norm": 1.50157762552056, "language_loss": 0.7903322, "learning_rate": 1.3170780890790528e-06, "loss": 0.81472707, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19726562, "step": 10355, "time_per_iteration": 2.8429126739501953 }, { "auxiliary_loss_clip": 0.01417786, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.2535547, "balance_loss_mlp": 1.01380563, "epoch": 0.6226364046294904, "flos": 27209403648000.0, "grad_norm": 3.5451447466606707, "language_loss": 0.78817922, "learning_rate": 1.3167120485685538e-06, "loss": 0.8126877, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19238281, "step": 10356, "time_per_iteration": 2.973090410232544 }, { "auxiliary_loss_clip": 0.01435579, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.26431704, "balance_loss_mlp": 1.02001834, "epoch": 0.6226965278821585, "flos": 20454650409600.0, "grad_norm": 2.30340056727797, "language_loss": 0.68824887, "learning_rate": 1.3163460339684024e-06, "loss": 0.71301717, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.21228027, "step": 10357, "time_per_iteration": 2.872229814529419 }, { "auxiliary_loss_clip": 0.01428332, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.25817883, "balance_loss_mlp": 1.01491046, "epoch": 0.6227566511348264, "flos": 22172609928960.0, "grad_norm": 2.9492632425352867, "language_loss": 0.7697376, "learning_rate": 1.3159800452924778e-06, "loss": 0.7943874, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.21716309, "step": 10358, "time_per_iteration": 2.842421531677246 }, { "auxiliary_loss_clip": 0.01421337, "auxiliary_loss_mlp": 0.0103829, "balance_loss_clip": 1.25345016, "balance_loss_mlp": 1.01864409, "epoch": 0.6228167743874944, "flos": 18049945029120.0, "grad_norm": 2.108111295112494, "language_loss": 0.83286315, "learning_rate": 1.3156140825546588e-06, "loss": 0.85745943, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19665527, "step": 10359, "time_per_iteration": 4.29512619972229 }, { "auxiliary_loss_clip": 0.01398992, "auxiliary_loss_mlp": 0.01044454, "balance_loss_clip": 1.23980331, "balance_loss_mlp": 1.0234611, "epoch": 0.6228768976401623, "flos": 17750251601280.0, "grad_norm": 3.7510155864298373, "language_loss": 0.74177945, "learning_rate": 1.315248145768822e-06, "loss": 0.76621389, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.20996094, "step": 10360, "time_per_iteration": 2.805169105529785 }, { "auxiliary_loss_clip": 0.01416524, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.24986362, "balance_loss_mlp": 1.01877117, "epoch": 0.6229370208928303, "flos": 17903966745600.0, "grad_norm": 1.9791982364275265, "language_loss": 0.781353, "learning_rate": 1.3148822349488442e-06, "loss": 0.80590498, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19934082, "step": 10361, "time_per_iteration": 2.8429064750671387 }, { "auxiliary_loss_clip": 0.01413532, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.25112104, "balance_loss_mlp": 1.01750803, "epoch": 0.6229971441454982, "flos": 17356683916800.0, "grad_norm": 1.6660070088685646, "language_loss": 0.68417227, "learning_rate": 1.3145163501086005e-06, "loss": 0.70868212, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19946289, "step": 10362, "time_per_iteration": 2.83235502243042 }, { "auxiliary_loss_clip": 0.01416237, "auxiliary_loss_mlp": 0.0104197, "balance_loss_clip": 1.25164735, "balance_loss_mlp": 1.02160954, "epoch": 0.6230572673981662, "flos": 29253190106880.0, "grad_norm": 2.083099213207099, "language_loss": 0.68625456, "learning_rate": 1.3141504912619658e-06, "loss": 0.71083665, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20361328, "step": 10363, "time_per_iteration": 2.9104669094085693 }, { "auxiliary_loss_clip": 0.01422123, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.25240576, "balance_loss_mlp": 1.01818371, "epoch": 0.6231173906508342, "flos": 16334564463360.0, "grad_norm": 1.788790618088364, "language_loss": 0.87587351, "learning_rate": 1.3137846584228127e-06, "loss": 0.90048349, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20703125, "step": 10364, "time_per_iteration": 2.8866076469421387 }, { "auxiliary_loss_clip": 0.01190433, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.1010561, "balance_loss_mlp": 1.02244127, "epoch": 0.6231775139035022, "flos": 68729158316160.0, "grad_norm": 0.8859861133245455, "language_loss": 0.60857117, "learning_rate": 1.313418851605015e-06, "loss": 0.63089538, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.1953125, "step": 10365, "time_per_iteration": 3.333400011062622 }, { "auxiliary_loss_clip": 0.01449441, "auxiliary_loss_mlp": 0.01045571, "balance_loss_clip": 1.27624142, "balance_loss_mlp": 1.02368414, "epoch": 0.6232376371561702, "flos": 19828587104640.0, "grad_norm": 4.29669287413631, "language_loss": 0.76090056, "learning_rate": 1.3130530708224427e-06, "loss": 0.78585064, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.21899414, "step": 10366, "time_per_iteration": 2.8515429496765137 }, { "auxiliary_loss_clip": 0.01432849, "auxiliary_loss_mlp": 0.01042731, "balance_loss_clip": 1.26387298, "balance_loss_mlp": 1.02209604, "epoch": 0.6232977604088381, "flos": 23268668664960.0, "grad_norm": 2.0160815612541776, "language_loss": 0.77437437, "learning_rate": 1.3126873160889665e-06, "loss": 0.7991302, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20629883, "step": 10367, "time_per_iteration": 2.8505983352661133 }, { "auxiliary_loss_clip": 0.01411808, "auxiliary_loss_mlp": 0.01044456, "balance_loss_clip": 1.2504741, "balance_loss_mlp": 1.02333212, "epoch": 0.6233578836615061, "flos": 21116484368640.0, "grad_norm": 1.5366579323636682, "language_loss": 0.7902025, "learning_rate": 1.312321587418457e-06, "loss": 0.8147651, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.21130371, "step": 10368, "time_per_iteration": 2.907984495162964 }, { "auxiliary_loss_clip": 0.01416078, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 1.25023818, "balance_loss_mlp": 1.01806939, "epoch": 0.623418006914174, "flos": 23780045105280.0, "grad_norm": 1.770711521142182, "language_loss": 0.69778758, "learning_rate": 1.3119558848247811e-06, "loss": 0.72233427, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20532227, "step": 10369, "time_per_iteration": 2.8732213973999023 }, { "auxiliary_loss_clip": 0.01423436, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.25832176, "balance_loss_mlp": 1.0175277, "epoch": 0.6234781301668421, "flos": 17898356390400.0, "grad_norm": 2.110412146605329, "language_loss": 0.88336813, "learning_rate": 1.3115902083218072e-06, "loss": 0.90798473, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20715332, "step": 10370, "time_per_iteration": 4.228360176086426 }, { "auxiliary_loss_clip": 0.0141715, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.25322342, "balance_loss_mlp": 1.01470709, "epoch": 0.62353825341951, "flos": 26186243564160.0, "grad_norm": 1.4556642707085412, "language_loss": 0.66794479, "learning_rate": 1.311224557923402e-06, "loss": 0.6924597, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19628906, "step": 10371, "time_per_iteration": 2.896641254425049 }, { "auxiliary_loss_clip": 0.0139086, "auxiliary_loss_mlp": 0.01037834, "balance_loss_clip": 1.23452032, "balance_loss_mlp": 1.01873648, "epoch": 0.623598376672178, "flos": 31152536605440.0, "grad_norm": 1.3028894607517774, "language_loss": 0.78157222, "learning_rate": 1.3108589336434298e-06, "loss": 0.80585921, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19091797, "step": 10372, "time_per_iteration": 5.6874754428863525 }, { "auxiliary_loss_clip": 0.01418618, "auxiliary_loss_mlp": 0.01036906, "balance_loss_clip": 1.25188518, "balance_loss_mlp": 1.01674771, "epoch": 0.6236584999248459, "flos": 23740111929600.0, "grad_norm": 1.8759815699399536, "language_loss": 0.77739549, "learning_rate": 1.3104933354957568e-06, "loss": 0.80195069, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20153809, "step": 10373, "time_per_iteration": 2.8695085048675537 }, { "auxiliary_loss_clip": 0.01407205, "auxiliary_loss_mlp": 0.01031458, "balance_loss_clip": 1.24628425, "balance_loss_mlp": 1.01267099, "epoch": 0.6237186231775139, "flos": 21772888951680.0, "grad_norm": 1.6211541034227688, "language_loss": 0.70557612, "learning_rate": 1.3101277634942448e-06, "loss": 0.72996271, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18786621, "step": 10374, "time_per_iteration": 2.8578813076019287 }, { "auxiliary_loss_clip": 0.01427085, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.25938463, "balance_loss_mlp": 1.01579285, "epoch": 0.6237787464301818, "flos": 14947725525120.0, "grad_norm": 1.8207986027795982, "language_loss": 0.77630186, "learning_rate": 1.3097622176527577e-06, "loss": 0.80093843, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20788574, "step": 10375, "time_per_iteration": 2.9992563724517822 }, { "auxiliary_loss_clip": 0.01412227, "auxiliary_loss_mlp": 0.01035305, "balance_loss_clip": 1.25078416, "balance_loss_mlp": 1.01641095, "epoch": 0.6238388696828499, "flos": 35603878867200.0, "grad_norm": 1.5997237691571562, "language_loss": 0.71379656, "learning_rate": 1.3093966979851566e-06, "loss": 0.73827189, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18908691, "step": 10376, "time_per_iteration": 2.9938740730285645 }, { "auxiliary_loss_clip": 0.01424868, "auxiliary_loss_mlp": 0.01041116, "balance_loss_clip": 1.25787568, "balance_loss_mlp": 1.02087414, "epoch": 0.6238989929355178, "flos": 23634112066560.0, "grad_norm": 2.4870905527129383, "language_loss": 0.77906328, "learning_rate": 1.309031204505301e-06, "loss": 0.8037231, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20239258, "step": 10377, "time_per_iteration": 2.8679239749908447 }, { "auxiliary_loss_clip": 0.01415293, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.25118256, "balance_loss_mlp": 1.01596308, "epoch": 0.6239591161881858, "flos": 22096725120000.0, "grad_norm": 2.607547396239463, "language_loss": 0.69179416, "learning_rate": 1.308665737227052e-06, "loss": 0.71629167, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18493652, "step": 10378, "time_per_iteration": 2.8788669109344482 }, { "auxiliary_loss_clip": 0.01409015, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.24576831, "balance_loss_mlp": 1.01269913, "epoch": 0.6240192394408538, "flos": 24546838296960.0, "grad_norm": 2.326988346432853, "language_loss": 0.76826048, "learning_rate": 1.3083002961642675e-06, "loss": 0.7926693, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19177246, "step": 10379, "time_per_iteration": 2.931349277496338 }, { "auxiliary_loss_clip": 0.01420067, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.25356698, "balance_loss_mlp": 1.01130903, "epoch": 0.6240793626935217, "flos": 27944452972800.0, "grad_norm": 1.653768105452062, "language_loss": 0.79383671, "learning_rate": 1.3079348813308051e-06, "loss": 0.8183459, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.1953125, "step": 10380, "time_per_iteration": 2.9162395000457764 }, { "auxiliary_loss_clip": 0.01404293, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.24324822, "balance_loss_mlp": 1.01696348, "epoch": 0.6241394859461897, "flos": 22902591836160.0, "grad_norm": 1.5448015725664552, "language_loss": 0.80673206, "learning_rate": 1.3075694927405207e-06, "loss": 0.83113378, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18920898, "step": 10381, "time_per_iteration": 2.8782458305358887 }, { "auxiliary_loss_clip": 0.01418554, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.25359929, "balance_loss_mlp": 1.01198208, "epoch": 0.6241996091988576, "flos": 12758548965120.0, "grad_norm": 1.9178629666890425, "language_loss": 0.75016069, "learning_rate": 1.3072041304072718e-06, "loss": 0.77466047, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19433594, "step": 10382, "time_per_iteration": 2.8514316082000732 }, { "auxiliary_loss_clip": 0.01417026, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.25435972, "balance_loss_mlp": 1.01486897, "epoch": 0.6242597324515257, "flos": 25863040823040.0, "grad_norm": 1.5111190274983402, "language_loss": 0.78958422, "learning_rate": 1.306838794344911e-06, "loss": 0.8141067, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20349121, "step": 10383, "time_per_iteration": 2.948786735534668 }, { "auxiliary_loss_clip": 0.01410078, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.24644458, "balance_loss_mlp": 1.01079094, "epoch": 0.6243198557041936, "flos": 19947165022080.0, "grad_norm": 1.7632518023667838, "language_loss": 0.75564879, "learning_rate": 1.3064734845672925e-06, "loss": 0.78005075, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19335938, "step": 10384, "time_per_iteration": 2.826127529144287 }, { "auxiliary_loss_clip": 0.01424282, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.25708115, "balance_loss_mlp": 1.01407552, "epoch": 0.6243799789568616, "flos": 18415478920320.0, "grad_norm": 1.8679889539439598, "language_loss": 0.66835171, "learning_rate": 1.3061082010882694e-06, "loss": 0.69294679, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21166992, "step": 10385, "time_per_iteration": 2.8361148834228516 }, { "auxiliary_loss_clip": 0.01183359, "auxiliary_loss_mlp": 0.01016519, "balance_loss_clip": 1.0966922, "balance_loss_mlp": 0.9971593, "epoch": 0.6244401022095295, "flos": 66060638634240.0, "grad_norm": 0.7550878379077774, "language_loss": 0.62077135, "learning_rate": 1.305742943921692e-06, "loss": 0.64277011, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.19335938, "step": 10386, "time_per_iteration": 3.3833136558532715 }, { "auxiliary_loss_clip": 0.0141568, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.2502749, "balance_loss_mlp": 1.01410198, "epoch": 0.6245002254621975, "flos": 24581885034240.0, "grad_norm": 2.3852820475306324, "language_loss": 0.72914314, "learning_rate": 1.3053777130814128e-06, "loss": 0.75364184, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20080566, "step": 10387, "time_per_iteration": 2.892831563949585 }, { "auxiliary_loss_clip": 0.01434031, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.26344776, "balance_loss_mlp": 1.01653218, "epoch": 0.6245603487148654, "flos": 29180743902720.0, "grad_norm": 2.1307382585699997, "language_loss": 0.65897548, "learning_rate": 1.3050125085812798e-06, "loss": 0.68368906, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20800781, "step": 10388, "time_per_iteration": 2.903571128845215 }, { "auxiliary_loss_clip": 0.01416052, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.25119734, "balance_loss_mlp": 1.01482654, "epoch": 0.6246204719675335, "flos": 14797403740800.0, "grad_norm": 1.7686758299364909, "language_loss": 0.79898572, "learning_rate": 1.3046473304351417e-06, "loss": 0.82348692, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19262695, "step": 10389, "time_per_iteration": 2.808130979537964 }, { "auxiliary_loss_clip": 0.01401946, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.23999476, "balance_loss_mlp": 1.01103568, "epoch": 0.6246805952202014, "flos": 12500779484160.0, "grad_norm": 2.0965635063971826, "language_loss": 0.6147033, "learning_rate": 1.3042821786568475e-06, "loss": 0.63903129, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19812012, "step": 10390, "time_per_iteration": 2.822838306427002 }, { "auxiliary_loss_clip": 0.01435465, "auxiliary_loss_mlp": 0.01038445, "balance_loss_clip": 1.2664988, "balance_loss_mlp": 1.01881087, "epoch": 0.6247407184728694, "flos": 12794319619200.0, "grad_norm": 2.0245379916007504, "language_loss": 0.78091675, "learning_rate": 1.3039170532602416e-06, "loss": 0.80565584, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.19641113, "step": 10391, "time_per_iteration": 2.8201215267181396 }, { "auxiliary_loss_clip": 0.01427239, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.26095295, "balance_loss_mlp": 1.01742935, "epoch": 0.6248008417255374, "flos": 40645061331840.0, "grad_norm": 1.6106837147551534, "language_loss": 0.65479583, "learning_rate": 1.3035519542591718e-06, "loss": 0.67944443, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20202637, "step": 10392, "time_per_iteration": 3.085664749145508 }, { "auxiliary_loss_clip": 0.01429348, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.26143634, "balance_loss_mlp": 1.01324344, "epoch": 0.6248609649782053, "flos": 19911665836800.0, "grad_norm": 1.8525788806332424, "language_loss": 0.76956904, "learning_rate": 1.3031868816674819e-06, "loss": 0.79420418, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20910645, "step": 10393, "time_per_iteration": 2.907437324523926 }, { "auxiliary_loss_clip": 0.01418702, "auxiliary_loss_mlp": 0.01038371, "balance_loss_clip": 1.25107813, "balance_loss_mlp": 1.01693702, "epoch": 0.6249210882308733, "flos": 19692245963520.0, "grad_norm": 1.8205900095628822, "language_loss": 0.83476657, "learning_rate": 1.3028218354990142e-06, "loss": 0.85933733, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.21447754, "step": 10394, "time_per_iteration": 4.234117269515991 }, { "auxiliary_loss_clip": 0.01416713, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.24888635, "balance_loss_mlp": 1.01583266, "epoch": 0.6249812114835412, "flos": 13998685703040.0, "grad_norm": 1.8721606309424472, "language_loss": 0.76098835, "learning_rate": 1.3024568157676128e-06, "loss": 0.78551376, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.1998291, "step": 10395, "time_per_iteration": 2.841761827468872 }, { "auxiliary_loss_clip": 0.01437377, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.26803517, "balance_loss_mlp": 1.01805902, "epoch": 0.6250413347362093, "flos": 14536059920640.0, "grad_norm": 2.3202493468349665, "language_loss": 0.73249477, "learning_rate": 1.302091822487119e-06, "loss": 0.75724709, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19787598, "step": 10396, "time_per_iteration": 2.8274879455566406 }, { "auxiliary_loss_clip": 0.01423147, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.2572577, "balance_loss_mlp": 1.01157308, "epoch": 0.6251014579888772, "flos": 22972459086720.0, "grad_norm": 2.21579124143767, "language_loss": 0.76970053, "learning_rate": 1.3017268556713732e-06, "loss": 0.79424179, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19396973, "step": 10397, "time_per_iteration": 2.8524744510650635 }, { "auxiliary_loss_clip": 0.01424856, "auxiliary_loss_mlp": 0.01037701, "balance_loss_clip": 1.25833964, "balance_loss_mlp": 1.01716089, "epoch": 0.6251615812415452, "flos": 28122989529600.0, "grad_norm": 2.2397817770120247, "language_loss": 0.76013952, "learning_rate": 1.3013619153342154e-06, "loss": 0.78476512, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20556641, "step": 10398, "time_per_iteration": 2.915275812149048 }, { "auxiliary_loss_clip": 0.01427262, "auxiliary_loss_mlp": 0.01043756, "balance_loss_clip": 1.2576896, "balance_loss_mlp": 1.02259636, "epoch": 0.6252217044942131, "flos": 26735788632960.0, "grad_norm": 2.133714436018665, "language_loss": 0.75310338, "learning_rate": 1.300997001489483e-06, "loss": 0.77781355, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.21142578, "step": 10399, "time_per_iteration": 2.9015908241271973 }, { "auxiliary_loss_clip": 0.01423226, "auxiliary_loss_mlp": 0.01038595, "balance_loss_clip": 1.25709534, "balance_loss_mlp": 1.0172447, "epoch": 0.6252818277468811, "flos": 20015222480640.0, "grad_norm": 1.4795254414251267, "language_loss": 0.74802005, "learning_rate": 1.3006321141510147e-06, "loss": 0.77263826, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21350098, "step": 10400, "time_per_iteration": 2.859086513519287 }, { "auxiliary_loss_clip": 0.01187848, "auxiliary_loss_mlp": 0.01020367, "balance_loss_clip": 1.10052657, "balance_loss_mlp": 0.99690628, "epoch": 0.625341950999549, "flos": 59309731203840.0, "grad_norm": 0.8865687610414611, "language_loss": 0.56557477, "learning_rate": 1.3002672533326465e-06, "loss": 0.58765692, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.234375, "step": 10401, "time_per_iteration": 3.421266794204712 }, { "auxiliary_loss_clip": 0.01420706, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.2536478, "balance_loss_mlp": 1.01417232, "epoch": 0.625402074252217, "flos": 20166720629760.0, "grad_norm": 2.916652967263405, "language_loss": 0.83692622, "learning_rate": 1.2999024190482146e-06, "loss": 0.86148679, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21203613, "step": 10402, "time_per_iteration": 2.8425896167755127 }, { "auxiliary_loss_clip": 0.01415409, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.24945307, "balance_loss_mlp": 1.01395321, "epoch": 0.625462197504885, "flos": 29144475555840.0, "grad_norm": 1.7705567282979633, "language_loss": 0.69635504, "learning_rate": 1.2995376113115527e-06, "loss": 0.72084367, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19494629, "step": 10403, "time_per_iteration": 2.9768335819244385 }, { "auxiliary_loss_clip": 0.01419001, "auxiliary_loss_mlp": 0.01032736, "balance_loss_clip": 1.2516675, "balance_loss_mlp": 1.01183844, "epoch": 0.625522320757553, "flos": 26115652396800.0, "grad_norm": 1.9344112868785492, "language_loss": 0.72565758, "learning_rate": 1.2991728301364954e-06, "loss": 0.75017494, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20898438, "step": 10404, "time_per_iteration": 2.889516830444336 }, { "auxiliary_loss_clip": 0.01425593, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.25898933, "balance_loss_mlp": 1.01797819, "epoch": 0.625582444010221, "flos": 20640607113600.0, "grad_norm": 2.3843143671013154, "language_loss": 0.70290601, "learning_rate": 1.2988080755368742e-06, "loss": 0.7275492, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20751953, "step": 10405, "time_per_iteration": 4.283205509185791 }, { "auxiliary_loss_clip": 0.0142177, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.25762439, "balance_loss_mlp": 1.01519346, "epoch": 0.6256425672628889, "flos": 20531123400960.0, "grad_norm": 1.602562230395877, "language_loss": 0.79821181, "learning_rate": 1.2984433475265207e-06, "loss": 0.82278323, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20178223, "step": 10406, "time_per_iteration": 2.8438658714294434 }, { "auxiliary_loss_clip": 0.01412757, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.24661744, "balance_loss_mlp": 1.01261687, "epoch": 0.6257026905155569, "flos": 29540034011520.0, "grad_norm": 1.869965856920582, "language_loss": 0.69202602, "learning_rate": 1.2980786461192666e-06, "loss": 0.71648169, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20178223, "step": 10407, "time_per_iteration": 4.311597585678101 }, { "auxiliary_loss_clip": 0.01400602, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.24031734, "balance_loss_mlp": 1.01334906, "epoch": 0.6257628137682248, "flos": 24035597591040.0, "grad_norm": 1.7009670345094714, "language_loss": 0.85828596, "learning_rate": 1.2977139713289398e-06, "loss": 0.88262469, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19909668, "step": 10408, "time_per_iteration": 2.877589225769043 }, { "auxiliary_loss_clip": 0.01416874, "auxiliary_loss_mlp": 0.01035013, "balance_loss_clip": 1.25185609, "balance_loss_mlp": 1.01525974, "epoch": 0.6258229370208929, "flos": 20860931882880.0, "grad_norm": 2.00322735300357, "language_loss": 0.80362034, "learning_rate": 1.2973493231693699e-06, "loss": 0.82813919, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19750977, "step": 10409, "time_per_iteration": 2.839599609375 }, { "auxiliary_loss_clip": 0.01418145, "auxiliary_loss_mlp": 0.01034705, "balance_loss_clip": 1.25373626, "balance_loss_mlp": 1.01414108, "epoch": 0.6258830602735608, "flos": 22240350673920.0, "grad_norm": 2.1497976340671165, "language_loss": 0.70431453, "learning_rate": 1.2969847016543845e-06, "loss": 0.72884303, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20556641, "step": 10410, "time_per_iteration": 2.847909927368164 }, { "auxiliary_loss_clip": 0.01418616, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.25728035, "balance_loss_mlp": 1.01167846, "epoch": 0.6259431835262288, "flos": 25086293775360.0, "grad_norm": 1.754232610953202, "language_loss": 0.68876195, "learning_rate": 1.2966201067978086e-06, "loss": 0.71325815, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1932373, "step": 10411, "time_per_iteration": 2.876253128051758 }, { "auxiliary_loss_clip": 0.0141729, "auxiliary_loss_mlp": 0.01037091, "balance_loss_clip": 1.24983692, "balance_loss_mlp": 1.01607406, "epoch": 0.6260033067788967, "flos": 28262814520320.0, "grad_norm": 1.7538588642341113, "language_loss": 0.70150065, "learning_rate": 1.2962555386134702e-06, "loss": 0.72604442, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.21020508, "step": 10412, "time_per_iteration": 2.915454149246216 }, { "auxiliary_loss_clip": 0.01421412, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.25770521, "balance_loss_mlp": 1.01729894, "epoch": 0.6260634300315647, "flos": 23377790419200.0, "grad_norm": 1.5220157677659918, "language_loss": 0.70202541, "learning_rate": 1.2958909971151908e-06, "loss": 0.72660363, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19116211, "step": 10413, "time_per_iteration": 2.868215322494507 }, { "auxiliary_loss_clip": 0.01443894, "auxiliary_loss_mlp": 0.01035435, "balance_loss_clip": 1.27008557, "balance_loss_mlp": 1.01475239, "epoch": 0.6261235532842326, "flos": 18042932085120.0, "grad_norm": 2.5989944491132677, "language_loss": 0.81925511, "learning_rate": 1.295526482316796e-06, "loss": 0.84404838, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.20666504, "step": 10414, "time_per_iteration": 2.840371608734131 }, { "auxiliary_loss_clip": 0.01430423, "auxiliary_loss_mlp": 0.01033899, "balance_loss_clip": 1.26408887, "balance_loss_mlp": 1.0137887, "epoch": 0.6261836765369007, "flos": 22019663946240.0, "grad_norm": 1.6519593561306756, "language_loss": 0.75717497, "learning_rate": 1.2951619942321083e-06, "loss": 0.78181815, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20092773, "step": 10415, "time_per_iteration": 2.947255849838257 }, { "auxiliary_loss_clip": 0.0141989, "auxiliary_loss_mlp": 0.01033768, "balance_loss_clip": 1.25593758, "balance_loss_mlp": 1.01406324, "epoch": 0.6262437997895686, "flos": 24946695008640.0, "grad_norm": 1.6370382027458616, "language_loss": 0.7491765, "learning_rate": 1.2947975328749472e-06, "loss": 0.77371311, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19714355, "step": 10416, "time_per_iteration": 2.9239585399627686 }, { "auxiliary_loss_clip": 0.01402104, "auxiliary_loss_mlp": 0.01032414, "balance_loss_clip": 1.24166703, "balance_loss_mlp": 1.01280379, "epoch": 0.6263039230422366, "flos": 31619681614080.0, "grad_norm": 1.693384712019319, "language_loss": 0.85119843, "learning_rate": 1.2944330982591352e-06, "loss": 0.87554365, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19592285, "step": 10417, "time_per_iteration": 2.9268813133239746 }, { "auxiliary_loss_clip": 0.01427444, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.26084602, "balance_loss_mlp": 1.01563835, "epoch": 0.6263640462949046, "flos": 17648414259840.0, "grad_norm": 2.2548652809489833, "language_loss": 0.58255064, "learning_rate": 1.2940686903984904e-06, "loss": 0.60717547, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19384766, "step": 10418, "time_per_iteration": 2.862926483154297 }, { "auxiliary_loss_clip": 0.01436398, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.26519823, "balance_loss_mlp": 1.01601624, "epoch": 0.6264241695475725, "flos": 19984609733760.0, "grad_norm": 1.8259155934652063, "language_loss": 0.84837234, "learning_rate": 1.2937043093068316e-06, "loss": 0.87310159, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20507812, "step": 10419, "time_per_iteration": 2.9714460372924805 }, { "auxiliary_loss_clip": 0.01434078, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.26758599, "balance_loss_mlp": 1.017313, "epoch": 0.6264842928002405, "flos": 27355698645120.0, "grad_norm": 1.3801625844806593, "language_loss": 0.65018839, "learning_rate": 1.2933399549979762e-06, "loss": 0.67489851, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19616699, "step": 10420, "time_per_iteration": 2.916839361190796 }, { "auxiliary_loss_clip": 0.01432881, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.26426685, "balance_loss_mlp": 1.01642776, "epoch": 0.6265444160529084, "flos": 23006148480000.0, "grad_norm": 2.0234949453804263, "language_loss": 0.87424982, "learning_rate": 1.292975627485741e-06, "loss": 0.89895177, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20910645, "step": 10421, "time_per_iteration": 2.863182544708252 }, { "auxiliary_loss_clip": 0.01413863, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.24967504, "balance_loss_mlp": 1.01557231, "epoch": 0.6266045393055765, "flos": 19947934183680.0, "grad_norm": 2.2871123804957443, "language_loss": 0.80505288, "learning_rate": 1.2926113267839403e-06, "loss": 0.82954347, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19628906, "step": 10422, "time_per_iteration": 2.846493721008301 }, { "auxiliary_loss_clip": 0.01415397, "auxiliary_loss_mlp": 0.01032826, "balance_loss_clip": 1.25026536, "balance_loss_mlp": 1.01233363, "epoch": 0.6266646625582444, "flos": 24399728893440.0, "grad_norm": 4.744316780846274, "language_loss": 0.7553885, "learning_rate": 1.292247052906389e-06, "loss": 0.77987075, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20495605, "step": 10423, "time_per_iteration": 2.8802969455718994 }, { "auxiliary_loss_clip": 0.01419368, "auxiliary_loss_mlp": 0.01031429, "balance_loss_clip": 1.25394917, "balance_loss_mlp": 1.01159263, "epoch": 0.6267247858109124, "flos": 14691901570560.0, "grad_norm": 1.7781896315195567, "language_loss": 0.78785783, "learning_rate": 1.2918828058669004e-06, "loss": 0.81236577, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19836426, "step": 10424, "time_per_iteration": 2.808213472366333 }, { "auxiliary_loss_clip": 0.01413471, "auxiliary_loss_mlp": 0.01034047, "balance_loss_clip": 1.25086117, "balance_loss_mlp": 1.01349521, "epoch": 0.6267849090635803, "flos": 24939093882240.0, "grad_norm": 1.7722926122712463, "language_loss": 0.69958448, "learning_rate": 1.2915185856792868e-06, "loss": 0.7240597, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20556641, "step": 10425, "time_per_iteration": 2.8585946559906006 }, { "auxiliary_loss_clip": 0.01405229, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.24590623, "balance_loss_mlp": 1.00882375, "epoch": 0.6268450323162483, "flos": 25348768715520.0, "grad_norm": 1.545984967630754, "language_loss": 0.75394249, "learning_rate": 1.2911543923573598e-06, "loss": 0.77828777, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.20458984, "step": 10426, "time_per_iteration": 2.8993122577667236 }, { "auxiliary_loss_clip": 0.01426204, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.2591567, "balance_loss_mlp": 1.01341021, "epoch": 0.6269051555689162, "flos": 26188505804160.0, "grad_norm": 1.8632742292843785, "language_loss": 0.81088692, "learning_rate": 1.290790225914929e-06, "loss": 0.83548141, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19836426, "step": 10427, "time_per_iteration": 2.9259533882141113 }, { "auxiliary_loss_clip": 0.01435354, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.26787543, "balance_loss_mlp": 1.01727331, "epoch": 0.6269652788215843, "flos": 18265247625600.0, "grad_norm": 1.8085536287142228, "language_loss": 0.69349372, "learning_rate": 1.2904260863658034e-06, "loss": 0.71822047, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20056152, "step": 10428, "time_per_iteration": 2.8195786476135254 }, { "auxiliary_loss_clip": 0.01418388, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.2526406, "balance_loss_mlp": 1.01704144, "epoch": 0.6270254020742522, "flos": 11772878837760.0, "grad_norm": 2.4372848034378247, "language_loss": 0.7227093, "learning_rate": 1.2900619737237928e-06, "loss": 0.74726719, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20349121, "step": 10429, "time_per_iteration": 4.300502061843872 }, { "auxiliary_loss_clip": 0.01436386, "auxiliary_loss_mlp": 0.01035177, "balance_loss_clip": 1.26776695, "balance_loss_mlp": 1.01473248, "epoch": 0.6270855253269202, "flos": 23485690563840.0, "grad_norm": 1.6319749436746076, "language_loss": 0.80678141, "learning_rate": 1.2896978880027023e-06, "loss": 0.83149701, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20458984, "step": 10430, "time_per_iteration": 2.846107244491577 }, { "auxiliary_loss_clip": 0.01183178, "auxiliary_loss_mlp": 0.01020692, "balance_loss_clip": 1.09476423, "balance_loss_mlp": 0.99570608, "epoch": 0.6271456485795882, "flos": 70095247873920.0, "grad_norm": 0.7575120073679272, "language_loss": 0.59195322, "learning_rate": 1.2893338292163393e-06, "loss": 0.61399192, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.25, "step": 10431, "time_per_iteration": 3.453068733215332 }, { "auxiliary_loss_clip": 0.01189376, "auxiliary_loss_mlp": 0.0103267, "balance_loss_clip": 1.09926414, "balance_loss_mlp": 1.01149869, "epoch": 0.6272057718322561, "flos": 65190786491520.0, "grad_norm": 0.8679022115937886, "language_loss": 0.63928515, "learning_rate": 1.2889697973785095e-06, "loss": 0.66150558, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.21191406, "step": 10432, "time_per_iteration": 3.3376471996307373 }, { "auxiliary_loss_clip": 0.01416912, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.25268126, "balance_loss_mlp": 1.01621079, "epoch": 0.6272658950849241, "flos": 24400045607040.0, "grad_norm": 2.9193723217887055, "language_loss": 0.65643358, "learning_rate": 1.2886057925030153e-06, "loss": 0.68096149, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.1965332, "step": 10433, "time_per_iteration": 2.8565850257873535 }, { "auxiliary_loss_clip": 0.01426992, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.25802994, "balance_loss_mlp": 1.01302278, "epoch": 0.627326018337592, "flos": 17974557912960.0, "grad_norm": 3.1844907135922353, "language_loss": 0.62890136, "learning_rate": 1.2882418146036612e-06, "loss": 0.65350878, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.20751953, "step": 10434, "time_per_iteration": 2.7930490970611572 }, { "auxiliary_loss_clip": 0.01424969, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.25740826, "balance_loss_mlp": 1.0136615, "epoch": 0.6273861415902601, "flos": 20239755016320.0, "grad_norm": 1.837173412465903, "language_loss": 0.85056549, "learning_rate": 1.2878778636942484e-06, "loss": 0.87516081, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.2088623, "step": 10435, "time_per_iteration": 2.8522324562072754 }, { "auxiliary_loss_clip": 0.01188772, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.09710443, "balance_loss_mlp": 1.01170659, "epoch": 0.627446264842928, "flos": 64981754939520.0, "grad_norm": 0.7424138362144119, "language_loss": 0.61569983, "learning_rate": 1.2875139397885786e-06, "loss": 0.637923, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.21875, "step": 10436, "time_per_iteration": 3.3448572158813477 }, { "auxiliary_loss_clip": 0.01430742, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.26383519, "balance_loss_mlp": 1.01574302, "epoch": 0.627506388095596, "flos": 23594133646080.0, "grad_norm": 1.4976023785931207, "language_loss": 0.77999711, "learning_rate": 1.2871500429004523e-06, "loss": 0.80466342, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20141602, "step": 10437, "time_per_iteration": 2.956705093383789 }, { "auxiliary_loss_clip": 0.01189547, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.09809041, "balance_loss_mlp": 1.00866508, "epoch": 0.6275665113482639, "flos": 67613752788480.0, "grad_norm": 0.7188831473243978, "language_loss": 0.54354471, "learning_rate": 1.2867861730436667e-06, "loss": 0.56575382, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.2265625, "step": 10438, "time_per_iteration": 3.2608180046081543 }, { "auxiliary_loss_clip": 0.0141337, "auxiliary_loss_mlp": 0.0103831, "balance_loss_clip": 1.24831903, "balance_loss_mlp": 1.01848614, "epoch": 0.6276266346009319, "flos": 27648560108160.0, "grad_norm": 2.0474457819878644, "language_loss": 0.84867156, "learning_rate": 1.2864223302320214e-06, "loss": 0.87318838, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19824219, "step": 10439, "time_per_iteration": 2.9408130645751953 }, { "auxiliary_loss_clip": 0.01428036, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.25967407, "balance_loss_mlp": 1.02097929, "epoch": 0.6276867578535998, "flos": 22756070615040.0, "grad_norm": 2.173721302674834, "language_loss": 0.81012428, "learning_rate": 1.2860585144793128e-06, "loss": 0.83482331, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20898438, "step": 10440, "time_per_iteration": 4.3223717212677 }, { "auxiliary_loss_clip": 0.01406597, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.24620163, "balance_loss_mlp": 1.01350534, "epoch": 0.6277468811062679, "flos": 24654828931200.0, "grad_norm": 1.646674132256955, "language_loss": 0.75177884, "learning_rate": 1.285694725799337e-06, "loss": 0.77618074, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.20080566, "step": 10441, "time_per_iteration": 2.922445774078369 }, { "auxiliary_loss_clip": 0.01404357, "auxiliary_loss_mlp": 0.01036359, "balance_loss_clip": 1.241418, "balance_loss_mlp": 1.01572347, "epoch": 0.6278070043589358, "flos": 19687495259520.0, "grad_norm": 1.8311944498635246, "language_loss": 0.72535467, "learning_rate": 1.2853309642058884e-06, "loss": 0.74976182, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20654297, "step": 10442, "time_per_iteration": 5.707226276397705 }, { "auxiliary_loss_clip": 0.0141925, "auxiliary_loss_mlp": 0.01038345, "balance_loss_clip": 1.25221586, "balance_loss_mlp": 1.01774573, "epoch": 0.6278671276116038, "flos": 22130550247680.0, "grad_norm": 1.484251619129408, "language_loss": 0.72369796, "learning_rate": 1.284967229712762e-06, "loss": 0.74827391, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20605469, "step": 10443, "time_per_iteration": 2.887761116027832 }, { "auxiliary_loss_clip": 0.01421843, "auxiliary_loss_mlp": 0.01042881, "balance_loss_clip": 1.25644374, "balance_loss_mlp": 1.02188814, "epoch": 0.6279272508642717, "flos": 23048705854080.0, "grad_norm": 2.2840567756162944, "language_loss": 0.74100339, "learning_rate": 1.2846035223337492e-06, "loss": 0.76565069, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20983887, "step": 10444, "time_per_iteration": 2.870457172393799 }, { "auxiliary_loss_clip": 0.01414085, "auxiliary_loss_mlp": 0.01039571, "balance_loss_clip": 1.2497108, "balance_loss_mlp": 1.01949644, "epoch": 0.6279873741169397, "flos": 19832432912640.0, "grad_norm": 1.8029380679246159, "language_loss": 0.72778225, "learning_rate": 1.2842398420826423e-06, "loss": 0.7523188, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20056152, "step": 10445, "time_per_iteration": 2.8629682064056396 }, { "auxiliary_loss_clip": 0.01406891, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.24245322, "balance_loss_mlp": 1.01845217, "epoch": 0.6280474973696077, "flos": 23926068633600.0, "grad_norm": 1.5005687791185502, "language_loss": 0.70088196, "learning_rate": 1.2838761889732331e-06, "loss": 0.72533727, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20166016, "step": 10446, "time_per_iteration": 2.8603789806365967 }, { "auxiliary_loss_clip": 0.01442754, "auxiliary_loss_mlp": 0.01043369, "balance_loss_clip": 1.27118587, "balance_loss_mlp": 1.0221498, "epoch": 0.6281076206222757, "flos": 17977046376960.0, "grad_norm": 1.8662739545991374, "language_loss": 0.74600577, "learning_rate": 1.2835125630193102e-06, "loss": 0.77086699, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.2121582, "step": 10447, "time_per_iteration": 2.833699941635132 }, { "auxiliary_loss_clip": 0.01185691, "auxiliary_loss_mlp": 0.0103018, "balance_loss_clip": 1.09399891, "balance_loss_mlp": 1.00729227, "epoch": 0.6281677438749437, "flos": 66807614603520.0, "grad_norm": 0.6769109072416627, "language_loss": 0.52418864, "learning_rate": 1.2831489642346626e-06, "loss": 0.54634738, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.22851562, "step": 10448, "time_per_iteration": 3.216970682144165 }, { "auxiliary_loss_clip": 0.01439461, "auxiliary_loss_mlp": 0.01042961, "balance_loss_clip": 1.27147269, "balance_loss_mlp": 1.02230227, "epoch": 0.6282278671276116, "flos": 11663666593920.0, "grad_norm": 4.1955310728234485, "language_loss": 0.92866206, "learning_rate": 1.282785392633079e-06, "loss": 0.95348632, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20654297, "step": 10449, "time_per_iteration": 2.829805850982666 }, { "auxiliary_loss_clip": 0.01424748, "auxiliary_loss_mlp": 0.01037025, "balance_loss_clip": 1.25923657, "balance_loss_mlp": 1.01740336, "epoch": 0.6282879903802796, "flos": 42757040983680.0, "grad_norm": 1.617536897322907, "language_loss": 0.61034399, "learning_rate": 1.2824218482283438e-06, "loss": 0.63496172, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19628906, "step": 10450, "time_per_iteration": 3.065383195877075 }, { "auxiliary_loss_clip": 0.0139897, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.23766279, "balance_loss_mlp": 1.01467144, "epoch": 0.6283481136329475, "flos": 20018299127040.0, "grad_norm": 1.8631354663017858, "language_loss": 0.77363312, "learning_rate": 1.2820583310342452e-06, "loss": 0.79797512, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20556641, "step": 10451, "time_per_iteration": 2.856536626815796 }, { "auxiliary_loss_clip": 0.01428915, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.26074088, "balance_loss_mlp": 1.01232421, "epoch": 0.6284082368856155, "flos": 21913483104000.0, "grad_norm": 1.5498908654818992, "language_loss": 0.77989811, "learning_rate": 1.281694841064566e-06, "loss": 0.80452019, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.2097168, "step": 10452, "time_per_iteration": 2.8545711040496826 }, { "auxiliary_loss_clip": 0.01413551, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 1.24947035, "balance_loss_mlp": 1.0124228, "epoch": 0.6284683601382834, "flos": 25495244691840.0, "grad_norm": 1.62771451780529, "language_loss": 0.7370562, "learning_rate": 1.2813313783330904e-06, "loss": 0.7615242, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20812988, "step": 10453, "time_per_iteration": 2.9143664836883545 }, { "auxiliary_loss_clip": 0.01412908, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.24592125, "balance_loss_mlp": 1.0124042, "epoch": 0.6285284833909515, "flos": 16545749783040.0, "grad_norm": 1.8184142354329116, "language_loss": 0.81595832, "learning_rate": 1.2809679428536013e-06, "loss": 0.84042031, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20898438, "step": 10454, "time_per_iteration": 2.826732635498047 }, { "auxiliary_loss_clip": 0.01409986, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.24711871, "balance_loss_mlp": 1.01322234, "epoch": 0.6285886066436194, "flos": 22831050528000.0, "grad_norm": 7.4908219299114025, "language_loss": 0.82963854, "learning_rate": 1.2806045346398792e-06, "loss": 0.8540678, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19714355, "step": 10455, "time_per_iteration": 2.878007411956787 }, { "auxiliary_loss_clip": 0.01409465, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.24515915, "balance_loss_mlp": 1.0116086, "epoch": 0.6286487298962874, "flos": 24726008280960.0, "grad_norm": 1.537737450154947, "language_loss": 0.82819438, "learning_rate": 1.280241153705706e-06, "loss": 0.85261053, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20544434, "step": 10456, "time_per_iteration": 2.923813581466675 }, { "auxiliary_loss_clip": 0.0143816, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.2690537, "balance_loss_mlp": 1.02114642, "epoch": 0.6287088531489553, "flos": 20750588519040.0, "grad_norm": 1.4747876105320308, "language_loss": 0.73381603, "learning_rate": 1.27987780006486e-06, "loss": 0.75861102, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.2019043, "step": 10457, "time_per_iteration": 2.8872382640838623 }, { "auxiliary_loss_clip": 0.01444913, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.27180469, "balance_loss_mlp": 1.01941252, "epoch": 0.6287689764016233, "flos": 23079635314560.0, "grad_norm": 1.7549328558020942, "language_loss": 0.80856633, "learning_rate": 1.2795144737311202e-06, "loss": 0.83341622, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20654297, "step": 10458, "time_per_iteration": 2.8678884506225586 }, { "auxiliary_loss_clip": 0.01435502, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.26599145, "balance_loss_mlp": 1.01921535, "epoch": 0.6288290996542913, "flos": 32246378346240.0, "grad_norm": 2.2930436893474333, "language_loss": 0.61960661, "learning_rate": 1.2791511747182635e-06, "loss": 0.6443553, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20166016, "step": 10459, "time_per_iteration": 2.9307773113250732 }, { "auxiliary_loss_clip": 0.01434809, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.2673583, "balance_loss_mlp": 1.01729321, "epoch": 0.6288892229069593, "flos": 24651209347200.0, "grad_norm": 1.5725083044858146, "language_loss": 0.79770714, "learning_rate": 1.2787879030400666e-06, "loss": 0.82241863, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19055176, "step": 10460, "time_per_iteration": 2.8985321521759033 }, { "auxiliary_loss_clip": 0.01414485, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.25143659, "balance_loss_mlp": 1.01021194, "epoch": 0.6289493461596273, "flos": 17867336440320.0, "grad_norm": 1.9146011461421923, "language_loss": 0.74779314, "learning_rate": 1.2784246587103047e-06, "loss": 0.77224052, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20043945, "step": 10461, "time_per_iteration": 2.917844533920288 }, { "auxiliary_loss_clip": 0.01411279, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.24798441, "balance_loss_mlp": 1.01534128, "epoch": 0.6290094694122952, "flos": 22355354252160.0, "grad_norm": 1.5490756868932751, "language_loss": 0.70948958, "learning_rate": 1.2780614417427523e-06, "loss": 0.73395342, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19763184, "step": 10462, "time_per_iteration": 2.8750245571136475 }, { "auxiliary_loss_clip": 0.01395842, "auxiliary_loss_mlp": 0.01032772, "balance_loss_clip": 1.237131, "balance_loss_mlp": 1.01438951, "epoch": 0.6290695926649632, "flos": 28414086445440.0, "grad_norm": 2.0848331536643374, "language_loss": 0.72727871, "learning_rate": 1.2776982521511821e-06, "loss": 0.75156486, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18383789, "step": 10463, "time_per_iteration": 2.876756191253662 }, { "auxiliary_loss_clip": 0.01414521, "auxiliary_loss_mlp": 0.01034614, "balance_loss_clip": 1.25402486, "balance_loss_mlp": 1.01558769, "epoch": 0.6291297159176311, "flos": 21515028981120.0, "grad_norm": 1.722253207422101, "language_loss": 0.72888863, "learning_rate": 1.2773350899493665e-06, "loss": 0.75338006, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19030762, "step": 10464, "time_per_iteration": 4.22221565246582 }, { "auxiliary_loss_clip": 0.01423719, "auxiliary_loss_mlp": 0.01031588, "balance_loss_clip": 1.2598803, "balance_loss_mlp": 1.01202595, "epoch": 0.6291898391702991, "flos": 12210904177920.0, "grad_norm": 1.7296014890120686, "language_loss": 0.69878364, "learning_rate": 1.2769719551510768e-06, "loss": 0.7233367, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19580078, "step": 10465, "time_per_iteration": 2.809210777282715 }, { "auxiliary_loss_clip": 0.01188546, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.09764934, "balance_loss_mlp": 1.00737, "epoch": 0.629249962422967, "flos": 69330309719040.0, "grad_norm": 0.6823481288745018, "language_loss": 0.59858978, "learning_rate": 1.2766088477700832e-06, "loss": 0.62072635, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.17773438, "step": 10466, "time_per_iteration": 3.495004892349243 }, { "auxiliary_loss_clip": 0.0141456, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.25077701, "balance_loss_mlp": 1.01059604, "epoch": 0.6293100856756351, "flos": 40092575351040.0, "grad_norm": 2.0062083739554293, "language_loss": 0.66055286, "learning_rate": 1.276245767820154e-06, "loss": 0.68498808, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18347168, "step": 10467, "time_per_iteration": 3.0293545722961426 }, { "auxiliary_loss_clip": 0.01183649, "auxiliary_loss_mlp": 0.01018127, "balance_loss_clip": 1.09412014, "balance_loss_mlp": 1.00057936, "epoch": 0.629370208928303, "flos": 67528728529920.0, "grad_norm": 0.7949460276035607, "language_loss": 0.56927502, "learning_rate": 1.2758827153150586e-06, "loss": 0.59129274, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.17578125, "step": 10468, "time_per_iteration": 3.076260566711426 }, { "auxiliary_loss_clip": 0.0118411, "auxiliary_loss_mlp": 0.01022218, "balance_loss_clip": 1.09503436, "balance_loss_mlp": 1.00152326, "epoch": 0.629430332180971, "flos": 60691140766080.0, "grad_norm": 0.749324227185622, "language_loss": 0.58106005, "learning_rate": 1.2755196902685626e-06, "loss": 0.60312343, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20703125, "step": 10469, "time_per_iteration": 3.1962838172912598 }, { "auxiliary_loss_clip": 0.01186606, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.0953064, "balance_loss_mlp": 1.00481534, "epoch": 0.6294904554336389, "flos": 66903814327680.0, "grad_norm": 0.6784350698288011, "language_loss": 0.52156997, "learning_rate": 1.2751566926944329e-06, "loss": 0.54369587, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.21191406, "step": 10470, "time_per_iteration": 3.31520676612854 }, { "auxiliary_loss_clip": 0.01401563, "auxiliary_loss_mlp": 0.01041045, "balance_loss_clip": 1.24087751, "balance_loss_mlp": 1.01720285, "epoch": 0.6295505786863069, "flos": 42538209292800.0, "grad_norm": 2.0532743423588227, "language_loss": 0.75341415, "learning_rate": 1.2747937226064342e-06, "loss": 0.7778402, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.23864746, "step": 10471, "time_per_iteration": 3.0668468475341797 }, { "auxiliary_loss_clip": 0.01425602, "auxiliary_loss_mlp": 0.01030867, "balance_loss_clip": 1.25845683, "balance_loss_mlp": 1.01138783, "epoch": 0.629610701938975, "flos": 17393042753280.0, "grad_norm": 3.8947935143821173, "language_loss": 0.64441806, "learning_rate": 1.2744307800183297e-06, "loss": 0.66898274, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19506836, "step": 10472, "time_per_iteration": 2.8521456718444824 }, { "auxiliary_loss_clip": 0.01429945, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.26271629, "balance_loss_mlp": 1.01862597, "epoch": 0.6296708251916429, "flos": 24253479141120.0, "grad_norm": 1.7460331712075927, "language_loss": 0.69901478, "learning_rate": 1.2740678649438828e-06, "loss": 0.72370398, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20336914, "step": 10473, "time_per_iteration": 2.9144556522369385 }, { "auxiliary_loss_clip": 0.01415461, "auxiliary_loss_mlp": 0.01033825, "balance_loss_clip": 1.25048494, "balance_loss_mlp": 1.01467991, "epoch": 0.6297309484443109, "flos": 19287186099840.0, "grad_norm": 1.56849971526424, "language_loss": 0.75192773, "learning_rate": 1.2737049773968554e-06, "loss": 0.77642065, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19140625, "step": 10474, "time_per_iteration": 2.861717939376831 }, { "auxiliary_loss_clip": 0.01418699, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.25377297, "balance_loss_mlp": 1.01229882, "epoch": 0.6297910716969788, "flos": 30674442355200.0, "grad_norm": 1.5506835346899481, "language_loss": 0.67497927, "learning_rate": 1.2733421173910081e-06, "loss": 0.69948173, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19238281, "step": 10475, "time_per_iteration": 4.3348987102508545 }, { "auxiliary_loss_clip": 0.0140768, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.24690163, "balance_loss_mlp": 1.0135988, "epoch": 0.6298511949496468, "flos": 14429652854400.0, "grad_norm": 1.8734908205639436, "language_loss": 0.90925145, "learning_rate": 1.272979284940101e-06, "loss": 0.93365151, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18725586, "step": 10476, "time_per_iteration": 2.8133223056793213 }, { "auxiliary_loss_clip": 0.01410116, "auxiliary_loss_mlp": 0.01031947, "balance_loss_clip": 1.24756241, "balance_loss_mlp": 1.01389885, "epoch": 0.6299113182023147, "flos": 23524764088320.0, "grad_norm": 1.6795969339461663, "language_loss": 0.76636708, "learning_rate": 1.2726164800578913e-06, "loss": 0.79078764, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18054199, "step": 10477, "time_per_iteration": 5.7560248374938965 }, { "auxiliary_loss_clip": 0.01414244, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.24927104, "balance_loss_mlp": 1.01357925, "epoch": 0.6299714414549827, "flos": 22684755530880.0, "grad_norm": 1.7799748566108777, "language_loss": 0.70957315, "learning_rate": 1.272253702758138e-06, "loss": 0.73406011, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20874023, "step": 10478, "time_per_iteration": 2.8443644046783447 }, { "auxiliary_loss_clip": 0.01439945, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.26820803, "balance_loss_mlp": 1.01058364, "epoch": 0.6300315647076506, "flos": 14509881164160.0, "grad_norm": 2.3122887164335943, "language_loss": 0.68252206, "learning_rate": 1.2718909530545974e-06, "loss": 0.7072227, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.19519043, "step": 10479, "time_per_iteration": 2.8902485370635986 }, { "auxiliary_loss_clip": 0.01425815, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.26214135, "balance_loss_mlp": 1.01298642, "epoch": 0.6300916879603187, "flos": 21881693992320.0, "grad_norm": 1.7532213100340635, "language_loss": 0.7449671, "learning_rate": 1.2715282309610245e-06, "loss": 0.76955211, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19689941, "step": 10480, "time_per_iteration": 2.893348217010498 }, { "auxiliary_loss_clip": 0.01427821, "auxiliary_loss_mlp": 0.01036754, "balance_loss_clip": 1.26003742, "balance_loss_mlp": 1.01567805, "epoch": 0.6301518112129866, "flos": 21843751587840.0, "grad_norm": 2.0302104967358914, "language_loss": 0.79028219, "learning_rate": 1.2711655364911744e-06, "loss": 0.81492794, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.2109375, "step": 10481, "time_per_iteration": 2.8486900329589844 }, { "auxiliary_loss_clip": 0.01184584, "auxiliary_loss_mlp": 0.01020171, "balance_loss_clip": 1.0948385, "balance_loss_mlp": 1.00128829, "epoch": 0.6302119344656546, "flos": 44356259589120.0, "grad_norm": 0.8805446139748399, "language_loss": 0.61850524, "learning_rate": 1.2708028696588e-06, "loss": 0.64055276, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.18847656, "step": 10482, "time_per_iteration": 3.0975542068481445 }, { "auxiliary_loss_clip": 0.0144939, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.27662659, "balance_loss_mlp": 1.01367807, "epoch": 0.6302720577183225, "flos": 11225188805760.0, "grad_norm": 1.8398530057066655, "language_loss": 0.83628422, "learning_rate": 1.2704402304776541e-06, "loss": 0.86112106, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.20605469, "step": 10483, "time_per_iteration": 2.902813196182251 }, { "auxiliary_loss_clip": 0.01409101, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.24932432, "balance_loss_mlp": 1.01245904, "epoch": 0.6303321809709905, "flos": 27976558798080.0, "grad_norm": 1.6308405764019855, "language_loss": 0.73385346, "learning_rate": 1.270077618961487e-06, "loss": 0.75825524, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18615723, "step": 10484, "time_per_iteration": 2.894984006881714 }, { "auxiliary_loss_clip": 0.01421179, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 1.25494039, "balance_loss_mlp": 1.01481032, "epoch": 0.6303923042236586, "flos": 28232563731840.0, "grad_norm": 1.6846028250305394, "language_loss": 0.75164866, "learning_rate": 1.2697150351240506e-06, "loss": 0.77620196, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19335938, "step": 10485, "time_per_iteration": 2.9643726348876953 }, { "auxiliary_loss_clip": 0.01437498, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.26544642, "balance_loss_mlp": 1.01221538, "epoch": 0.6304524274763265, "flos": 27641139960960.0, "grad_norm": 2.1196606568993044, "language_loss": 0.82301438, "learning_rate": 1.269352478979093e-06, "loss": 0.84769857, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.18713379, "step": 10486, "time_per_iteration": 2.8804924488067627 }, { "auxiliary_loss_clip": 0.01415774, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.25143385, "balance_loss_mlp": 1.01688766, "epoch": 0.6305125507289945, "flos": 17320234590720.0, "grad_norm": 1.677190623256512, "language_loss": 0.64554262, "learning_rate": 1.2689899505403628e-06, "loss": 0.67005634, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18713379, "step": 10487, "time_per_iteration": 2.8390207290649414 }, { "auxiliary_loss_clip": 0.01414049, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.25101101, "balance_loss_mlp": 1.01703095, "epoch": 0.6305726739816624, "flos": 25818311698560.0, "grad_norm": 1.6060041433631136, "language_loss": 0.67954862, "learning_rate": 1.2686274498216065e-06, "loss": 0.70404959, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18994141, "step": 10488, "time_per_iteration": 2.864760637283325 }, { "auxiliary_loss_clip": 0.014256, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.25828815, "balance_loss_mlp": 1.01573133, "epoch": 0.6306327972343304, "flos": 21807121282560.0, "grad_norm": 1.8358348520941656, "language_loss": 0.68310589, "learning_rate": 1.2682649768365706e-06, "loss": 0.70771027, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19104004, "step": 10489, "time_per_iteration": 2.8544108867645264 }, { "auxiliary_loss_clip": 0.01445921, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.27011156, "balance_loss_mlp": 1.01465416, "epoch": 0.6306929204869983, "flos": 20787173579520.0, "grad_norm": 2.4165003375801954, "language_loss": 0.71109509, "learning_rate": 1.2679025315990007e-06, "loss": 0.73589766, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 1.75683594, "router_z_loss_mlp": 0.19689941, "step": 10490, "time_per_iteration": 2.8539507389068604 }, { "auxiliary_loss_clip": 0.01423985, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.25748289, "balance_loss_mlp": 1.01936817, "epoch": 0.6307530437396663, "flos": 23663322224640.0, "grad_norm": 1.8540239787011796, "language_loss": 0.79216623, "learning_rate": 1.2675401141226393e-06, "loss": 0.81680089, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.2010498, "step": 10491, "time_per_iteration": 2.8690590858459473 }, { "auxiliary_loss_clip": 0.01418932, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.25510073, "balance_loss_mlp": 1.01813293, "epoch": 0.6308131669923343, "flos": 24730125557760.0, "grad_norm": 1.931086256739605, "language_loss": 0.56476295, "learning_rate": 1.2671777244212308e-06, "loss": 0.58932936, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19580078, "step": 10492, "time_per_iteration": 2.8923726081848145 }, { "auxiliary_loss_clip": 0.01424407, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.25702465, "balance_loss_mlp": 1.01688111, "epoch": 0.6308732902450023, "flos": 22575724266240.0, "grad_norm": 1.8839054977085326, "language_loss": 0.65507388, "learning_rate": 1.2668153625085168e-06, "loss": 0.67969167, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20483398, "step": 10493, "time_per_iteration": 2.853484630584717 }, { "auxiliary_loss_clip": 0.01424396, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.25886631, "balance_loss_mlp": 1.014274, "epoch": 0.6309334134976702, "flos": 24654828931200.0, "grad_norm": 1.413643785023352, "language_loss": 0.83106321, "learning_rate": 1.2664530283982367e-06, "loss": 0.85564423, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.1940918, "step": 10494, "time_per_iteration": 2.891767978668213 }, { "auxiliary_loss_clip": 0.01430827, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.26401114, "balance_loss_mlp": 1.01687169, "epoch": 0.6309935367503382, "flos": 41443010208000.0, "grad_norm": 1.70609007257018, "language_loss": 0.80165601, "learning_rate": 1.2660907221041317e-06, "loss": 0.82632899, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19628906, "step": 10495, "time_per_iteration": 3.04705810546875 }, { "auxiliary_loss_clip": 0.0143459, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.26600599, "balance_loss_mlp": 1.01597714, "epoch": 0.6310536600030061, "flos": 15126443061120.0, "grad_norm": 2.295818361052776, "language_loss": 0.70975423, "learning_rate": 1.2657284436399403e-06, "loss": 0.73445851, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19848633, "step": 10496, "time_per_iteration": 2.8809518814086914 }, { "auxiliary_loss_clip": 0.01426487, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.25865579, "balance_loss_mlp": 1.01851916, "epoch": 0.6311137832556741, "flos": 15239998805760.0, "grad_norm": 2.1092866126516805, "language_loss": 0.81362325, "learning_rate": 1.2653661930193997e-06, "loss": 0.8382746, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20153809, "step": 10497, "time_per_iteration": 2.8308322429656982 }, { "auxiliary_loss_clip": 0.01423023, "auxiliary_loss_mlp": 0.01037315, "balance_loss_clip": 1.2579906, "balance_loss_mlp": 1.0187186, "epoch": 0.6311739065083422, "flos": 22028984375040.0, "grad_norm": 1.8842482119237882, "language_loss": 0.75372469, "learning_rate": 1.265003970256247e-06, "loss": 0.77832812, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18591309, "step": 10498, "time_per_iteration": 2.8629560470581055 }, { "auxiliary_loss_clip": 0.01417835, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.25262737, "balance_loss_mlp": 1.01346207, "epoch": 0.6312340297610101, "flos": 22720978632960.0, "grad_norm": 2.2903007369616057, "language_loss": 0.70583129, "learning_rate": 1.264641775364217e-06, "loss": 0.73033392, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18981934, "step": 10499, "time_per_iteration": 4.32206654548645 }, { "auxiliary_loss_clip": 0.01419516, "auxiliary_loss_mlp": 0.0103948, "balance_loss_clip": 1.2576797, "balance_loss_mlp": 1.02059722, "epoch": 0.6312941530136781, "flos": 24290833363200.0, "grad_norm": 1.8430327668933115, "language_loss": 0.70604908, "learning_rate": 1.2642796083570448e-06, "loss": 0.7306391, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18896484, "step": 10500, "time_per_iteration": 2.911339282989502 }, { "auxiliary_loss_clip": 0.01422316, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.25806713, "balance_loss_mlp": 1.0164113, "epoch": 0.631354276266346, "flos": 21735987177600.0, "grad_norm": 1.7882081907757024, "language_loss": 0.74526304, "learning_rate": 1.2639174692484634e-06, "loss": 0.76984096, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19042969, "step": 10501, "time_per_iteration": 2.8315303325653076 }, { "auxiliary_loss_clip": 0.01416959, "auxiliary_loss_mlp": 0.01042807, "balance_loss_clip": 1.25270033, "balance_loss_mlp": 1.02213585, "epoch": 0.631414399519014, "flos": 24035869059840.0, "grad_norm": 1.9630180680924847, "language_loss": 0.76118946, "learning_rate": 1.2635553580522053e-06, "loss": 0.78578711, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20654297, "step": 10502, "time_per_iteration": 2.8702878952026367 }, { "auxiliary_loss_clip": 0.01439619, "auxiliary_loss_mlp": 0.01039339, "balance_loss_clip": 1.26853657, "balance_loss_mlp": 1.01930022, "epoch": 0.6314745227716819, "flos": 24326332548480.0, "grad_norm": 3.0653528436270863, "language_loss": 0.8611083, "learning_rate": 1.2631932747820022e-06, "loss": 0.88589787, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20031738, "step": 10503, "time_per_iteration": 2.8569366931915283 }, { "auxiliary_loss_clip": 0.0142467, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.25800788, "balance_loss_mlp": 1.01736832, "epoch": 0.6315346460243499, "flos": 23376478320000.0, "grad_norm": 2.079009800493109, "language_loss": 0.87146699, "learning_rate": 1.2628312194515838e-06, "loss": 0.896083, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19555664, "step": 10504, "time_per_iteration": 2.9101741313934326 }, { "auxiliary_loss_clip": 0.01446957, "auxiliary_loss_mlp": 0.01040364, "balance_loss_clip": 1.27428555, "balance_loss_mlp": 1.0208497, "epoch": 0.6315947692770179, "flos": 20268376992000.0, "grad_norm": 1.6415632667106703, "language_loss": 0.77539301, "learning_rate": 1.2624691920746793e-06, "loss": 0.80026615, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.19506836, "step": 10505, "time_per_iteration": 2.8791158199310303 }, { "auxiliary_loss_clip": 0.01424374, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.25766659, "balance_loss_mlp": 1.01769352, "epoch": 0.6316548925296859, "flos": 25277001183360.0, "grad_norm": 1.8986532173010484, "language_loss": 0.82276058, "learning_rate": 1.2621071926650166e-06, "loss": 0.84737825, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19714355, "step": 10506, "time_per_iteration": 2.8691341876983643 }, { "auxiliary_loss_clip": 0.01430548, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.26421356, "balance_loss_mlp": 1.01930547, "epoch": 0.6317150157823538, "flos": 22941077178240.0, "grad_norm": 1.8893211574641704, "language_loss": 0.75416791, "learning_rate": 1.2617452212363238e-06, "loss": 0.77886301, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.1965332, "step": 10507, "time_per_iteration": 2.861525058746338 }, { "auxiliary_loss_clip": 0.01441766, "auxiliary_loss_mlp": 0.01042201, "balance_loss_clip": 1.27263606, "balance_loss_mlp": 1.02243638, "epoch": 0.6317751390350218, "flos": 22536876965760.0, "grad_norm": 1.792535568939442, "language_loss": 0.68628514, "learning_rate": 1.2613832778023258e-06, "loss": 0.7111249, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19775391, "step": 10508, "time_per_iteration": 2.8924825191497803 }, { "auxiliary_loss_clip": 0.01421017, "auxiliary_loss_mlp": 0.01039372, "balance_loss_clip": 1.25573885, "balance_loss_mlp": 1.01965451, "epoch": 0.6318352622876897, "flos": 23305163235840.0, "grad_norm": 1.9281275185590812, "language_loss": 0.713359, "learning_rate": 1.2610213623767478e-06, "loss": 0.73796296, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19714355, "step": 10509, "time_per_iteration": 2.8950679302215576 }, { "auxiliary_loss_clip": 0.01422941, "auxiliary_loss_mlp": 0.0103887, "balance_loss_clip": 1.25794923, "balance_loss_mlp": 1.01930773, "epoch": 0.6318953855403577, "flos": 20713777234560.0, "grad_norm": 1.5260279726563561, "language_loss": 0.80030364, "learning_rate": 1.2606594749733143e-06, "loss": 0.82492173, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19567871, "step": 10510, "time_per_iteration": 4.252953052520752 }, { "auxiliary_loss_clip": 0.01422586, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.2551527, "balance_loss_mlp": 1.01646447, "epoch": 0.6319555087930258, "flos": 22830236121600.0, "grad_norm": 1.5834329066402621, "language_loss": 0.71119618, "learning_rate": 1.2602976156057469e-06, "loss": 0.73577666, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.18994141, "step": 10511, "time_per_iteration": 2.898562431335449 }, { "auxiliary_loss_clip": 0.0142574, "auxiliary_loss_mlp": 0.01038501, "balance_loss_clip": 1.26306009, "balance_loss_mlp": 1.01901007, "epoch": 0.6320156320456937, "flos": 19979542316160.0, "grad_norm": 1.8134956262785868, "language_loss": 0.80726171, "learning_rate": 1.2599357842877684e-06, "loss": 0.83190411, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19494629, "step": 10512, "time_per_iteration": 5.691515684127808 }, { "auxiliary_loss_clip": 0.01436341, "auxiliary_loss_mlp": 0.01039214, "balance_loss_clip": 1.26985204, "balance_loss_mlp": 1.01941371, "epoch": 0.6320757552983617, "flos": 27024035126400.0, "grad_norm": 1.7187220387010815, "language_loss": 0.71289921, "learning_rate": 1.2595739810330994e-06, "loss": 0.73765481, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19824219, "step": 10513, "time_per_iteration": 2.905351400375366 }, { "auxiliary_loss_clip": 0.0143809, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.26722598, "balance_loss_mlp": 1.01380348, "epoch": 0.6321358785510296, "flos": 23706422536320.0, "grad_norm": 1.6906892345051308, "language_loss": 0.67816818, "learning_rate": 1.259212205855459e-06, "loss": 0.70288217, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.19470215, "step": 10514, "time_per_iteration": 2.8747355937957764 }, { "auxiliary_loss_clip": 0.01409146, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.24623489, "balance_loss_mlp": 1.01776993, "epoch": 0.6321960018036976, "flos": 26006485397760.0, "grad_norm": 2.8304227953892336, "language_loss": 0.75332016, "learning_rate": 1.2588504587685663e-06, "loss": 0.77777708, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18798828, "step": 10515, "time_per_iteration": 2.939897298812866 }, { "auxiliary_loss_clip": 0.01409215, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.24830627, "balance_loss_mlp": 1.01541042, "epoch": 0.6322561250563655, "flos": 22831457731200.0, "grad_norm": 1.9131957798509478, "language_loss": 0.90651405, "learning_rate": 1.2584887397861379e-06, "loss": 0.93095446, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19396973, "step": 10516, "time_per_iteration": 2.869004726409912 }, { "auxiliary_loss_clip": 0.01447067, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.27319884, "balance_loss_mlp": 1.01353383, "epoch": 0.6323162483090335, "flos": 18997446528000.0, "grad_norm": 1.6796464906271946, "language_loss": 0.82083821, "learning_rate": 1.2581270489218911e-06, "loss": 0.84563923, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.19519043, "step": 10517, "time_per_iteration": 2.840836524963379 }, { "auxiliary_loss_clip": 0.01422464, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.2579248, "balance_loss_mlp": 1.01435542, "epoch": 0.6323763715617015, "flos": 19874718817920.0, "grad_norm": 2.7719133546030723, "language_loss": 0.78543961, "learning_rate": 1.257765386189541e-06, "loss": 0.80999553, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18774414, "step": 10518, "time_per_iteration": 2.898045063018799 }, { "auxiliary_loss_clip": 0.01420302, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.2573514, "balance_loss_mlp": 1.01470613, "epoch": 0.6324364948143695, "flos": 22792655675520.0, "grad_norm": 1.5124320164087828, "language_loss": 0.85788226, "learning_rate": 1.2574037516028018e-06, "loss": 0.88241374, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18139648, "step": 10519, "time_per_iteration": 2.9048378467559814 }, { "auxiliary_loss_clip": 0.01404907, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.24406362, "balance_loss_mlp": 1.01195729, "epoch": 0.6324966180670374, "flos": 22245870539520.0, "grad_norm": 1.543518392375552, "language_loss": 0.7330997, "learning_rate": 1.2570421451753867e-06, "loss": 0.75745869, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19018555, "step": 10520, "time_per_iteration": 2.9655165672302246 }, { "auxiliary_loss_clip": 0.01419057, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.25381947, "balance_loss_mlp": 1.01236176, "epoch": 0.6325567413197054, "flos": 21699130648320.0, "grad_norm": 1.9346482062674295, "language_loss": 0.72702128, "learning_rate": 1.2566805669210081e-06, "loss": 0.75153339, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19787598, "step": 10521, "time_per_iteration": 2.8854384422302246 }, { "auxiliary_loss_clip": 0.01427194, "auxiliary_loss_mlp": 0.01035222, "balance_loss_clip": 1.26037276, "balance_loss_mlp": 1.01498032, "epoch": 0.6326168645723733, "flos": 19946441105280.0, "grad_norm": 2.037451467991628, "language_loss": 0.72805476, "learning_rate": 1.256319016853377e-06, "loss": 0.75267899, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20251465, "step": 10522, "time_per_iteration": 2.8542914390563965 }, { "auxiliary_loss_clip": 0.01416596, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.25352395, "balance_loss_mlp": 1.01192236, "epoch": 0.6326769878250413, "flos": 20240071729920.0, "grad_norm": 1.701708250380814, "language_loss": 0.81812918, "learning_rate": 1.2559574949862023e-06, "loss": 0.84261817, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20385742, "step": 10523, "time_per_iteration": 2.87210750579834 }, { "auxiliary_loss_clip": 0.01420129, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.25536811, "balance_loss_mlp": 1.01118779, "epoch": 0.6327371110777094, "flos": 20785137563520.0, "grad_norm": 1.9058313190830092, "language_loss": 0.74522936, "learning_rate": 1.255596001333195e-06, "loss": 0.76973349, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19091797, "step": 10524, "time_per_iteration": 2.8323171138763428 }, { "auxiliary_loss_clip": 0.01441937, "auxiliary_loss_mlp": 0.01037792, "balance_loss_clip": 1.26971507, "balance_loss_mlp": 1.01763332, "epoch": 0.6327972343303773, "flos": 30348796394880.0, "grad_norm": 2.031117756121758, "language_loss": 0.84974396, "learning_rate": 1.2552345359080615e-06, "loss": 0.87454116, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20141602, "step": 10525, "time_per_iteration": 2.9047927856445312 }, { "auxiliary_loss_clip": 0.0141916, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.25464296, "balance_loss_mlp": 1.01289928, "epoch": 0.6328573575830453, "flos": 17101086186240.0, "grad_norm": 1.9708637319865374, "language_loss": 0.66927826, "learning_rate": 1.2548730987245093e-06, "loss": 0.69378328, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18432617, "step": 10526, "time_per_iteration": 2.8409132957458496 }, { "auxiliary_loss_clip": 0.01436708, "auxiliary_loss_mlp": 0.01037805, "balance_loss_clip": 1.2675277, "balance_loss_mlp": 1.01794505, "epoch": 0.6329174808357132, "flos": 25058305226880.0, "grad_norm": 1.6176299707856039, "language_loss": 0.74040598, "learning_rate": 1.254511689796244e-06, "loss": 0.76515114, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.19836426, "step": 10527, "time_per_iteration": 2.8826372623443604 }, { "auxiliary_loss_clip": 0.01413817, "auxiliary_loss_mlp": 0.01034133, "balance_loss_clip": 1.25242472, "balance_loss_mlp": 1.01573896, "epoch": 0.6329776040883812, "flos": 16845578945280.0, "grad_norm": 1.9729898440053824, "language_loss": 0.71718144, "learning_rate": 1.2541503091369693e-06, "loss": 0.74166095, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18395996, "step": 10528, "time_per_iteration": 2.8388054370880127 }, { "auxiliary_loss_clip": 0.01426374, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.26032686, "balance_loss_mlp": 1.0149107, "epoch": 0.6330377273410491, "flos": 13524618240000.0, "grad_norm": 2.7303451262002643, "language_loss": 0.67797667, "learning_rate": 1.2537889567603905e-06, "loss": 0.70259255, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20300293, "step": 10529, "time_per_iteration": 2.8005096912384033 }, { "auxiliary_loss_clip": 0.01442041, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.27126873, "balance_loss_mlp": 1.01295519, "epoch": 0.6330978505937171, "flos": 21547587254400.0, "grad_norm": 1.8203635008792771, "language_loss": 0.76705837, "learning_rate": 1.2534276326802092e-06, "loss": 0.79180872, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20031738, "step": 10530, "time_per_iteration": 2.861442804336548 }, { "auxiliary_loss_clip": 0.01458999, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.2883389, "balance_loss_mlp": 1.01814246, "epoch": 0.6331579738463851, "flos": 25020317577600.0, "grad_norm": 1.673562030902328, "language_loss": 0.73910457, "learning_rate": 1.2530663369101259e-06, "loss": 0.76406938, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.19348145, "step": 10531, "time_per_iteration": 2.8916373252868652 }, { "auxiliary_loss_clip": 0.01423073, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.25963473, "balance_loss_mlp": 1.01956427, "epoch": 0.6332180970990531, "flos": 14984265340800.0, "grad_norm": 2.4017957497904185, "language_loss": 0.80553567, "learning_rate": 1.2527050694638432e-06, "loss": 0.83016348, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20129395, "step": 10532, "time_per_iteration": 2.860258102416992 }, { "auxiliary_loss_clip": 0.01410544, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.24768066, "balance_loss_mlp": 1.01427722, "epoch": 0.633278220351721, "flos": 22716182684160.0, "grad_norm": 1.5451254221100283, "language_loss": 0.75251019, "learning_rate": 1.2523438303550582e-06, "loss": 0.77693868, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18029785, "step": 10533, "time_per_iteration": 2.8877639770507812 }, { "auxiliary_loss_clip": 0.01438459, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.26699817, "balance_loss_mlp": 1.01731253, "epoch": 0.633338343604389, "flos": 12610172707200.0, "grad_norm": 2.4406799942180206, "language_loss": 0.77860999, "learning_rate": 1.2519826195974706e-06, "loss": 0.80336857, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20092773, "step": 10534, "time_per_iteration": 4.242862701416016 }, { "auxiliary_loss_clip": 0.01430046, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.26432216, "balance_loss_mlp": 1.01650703, "epoch": 0.6333984668570569, "flos": 25971710129280.0, "grad_norm": 1.7330164188574384, "language_loss": 0.86815417, "learning_rate": 1.251621437204777e-06, "loss": 0.89282161, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20214844, "step": 10535, "time_per_iteration": 2.921600818634033 }, { "auxiliary_loss_clip": 0.01424366, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.2585237, "balance_loss_mlp": 1.01508784, "epoch": 0.6334585901097249, "flos": 23669656496640.0, "grad_norm": 2.763463269017597, "language_loss": 0.77311385, "learning_rate": 1.2512602831906733e-06, "loss": 0.79770827, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.1998291, "step": 10536, "time_per_iteration": 2.9528253078460693 }, { "auxiliary_loss_clip": 0.01425968, "auxiliary_loss_mlp": 0.01031428, "balance_loss_clip": 1.26197505, "balance_loss_mlp": 1.01180601, "epoch": 0.633518713362393, "flos": 28770480887040.0, "grad_norm": 1.8092952742929462, "language_loss": 0.61058491, "learning_rate": 1.250899157568855e-06, "loss": 0.6351589, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19616699, "step": 10537, "time_per_iteration": 2.929633855819702 }, { "auxiliary_loss_clip": 0.01193576, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.10155034, "balance_loss_mlp": 1.01623094, "epoch": 0.6335788366150609, "flos": 70448791893120.0, "grad_norm": 0.7813614224732381, "language_loss": 0.5248059, "learning_rate": 1.2505380603530155e-06, "loss": 0.54712427, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.22070312, "step": 10538, "time_per_iteration": 3.4122090339660645 }, { "auxiliary_loss_clip": 0.01440156, "auxiliary_loss_mlp": 0.01037019, "balance_loss_clip": 1.26846313, "balance_loss_mlp": 1.01626432, "epoch": 0.6336389598677289, "flos": 23742464659200.0, "grad_norm": 1.9199142985724913, "language_loss": 0.83990854, "learning_rate": 1.250176991556848e-06, "loss": 0.86468029, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20751953, "step": 10539, "time_per_iteration": 2.868924140930176 }, { "auxiliary_loss_clip": 0.0143099, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.26110244, "balance_loss_mlp": 1.01356649, "epoch": 0.6336990831203968, "flos": 29288191599360.0, "grad_norm": 1.709345226528231, "language_loss": 0.87604523, "learning_rate": 1.2498159511940438e-06, "loss": 0.90070117, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21020508, "step": 10540, "time_per_iteration": 2.9496352672576904 }, { "auxiliary_loss_clip": 0.01420355, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.25611067, "balance_loss_mlp": 1.01123989, "epoch": 0.6337592063730648, "flos": 29108976370560.0, "grad_norm": 3.7001424451825806, "language_loss": 0.72816366, "learning_rate": 1.2494549392782943e-06, "loss": 0.75266773, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18811035, "step": 10541, "time_per_iteration": 2.93251371383667 }, { "auxiliary_loss_clip": 0.01457119, "auxiliary_loss_mlp": 0.01034488, "balance_loss_clip": 1.28488922, "balance_loss_mlp": 1.01436543, "epoch": 0.6338193296257327, "flos": 34717693351680.0, "grad_norm": 2.0763623427892273, "language_loss": 0.8587181, "learning_rate": 1.2490939558232887e-06, "loss": 0.88363409, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 1.72070312, "router_z_loss_mlp": 0.20129395, "step": 10542, "time_per_iteration": 3.008507251739502 }, { "auxiliary_loss_clip": 0.01415425, "auxiliary_loss_mlp": 0.01031271, "balance_loss_clip": 1.25064671, "balance_loss_mlp": 1.01167297, "epoch": 0.6338794528784008, "flos": 16695438140160.0, "grad_norm": 1.6106011262312403, "language_loss": 0.78413427, "learning_rate": 1.2487330008427153e-06, "loss": 0.80860126, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19616699, "step": 10543, "time_per_iteration": 2.8744728565216064 }, { "auxiliary_loss_clip": 0.01405568, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.24530458, "balance_loss_mlp": 1.0151825, "epoch": 0.6339395761310687, "flos": 22356892575360.0, "grad_norm": 2.2845949708900903, "language_loss": 0.74164867, "learning_rate": 1.2483720743502618e-06, "loss": 0.76605213, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19604492, "step": 10544, "time_per_iteration": 4.366663455963135 }, { "auxiliary_loss_clip": 0.01446581, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.27563477, "balance_loss_mlp": 1.01263916, "epoch": 0.6339996993837367, "flos": 18561050000640.0, "grad_norm": 2.8147173447636686, "language_loss": 0.6920526, "learning_rate": 1.2480111763596144e-06, "loss": 0.71685147, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20654297, "step": 10545, "time_per_iteration": 2.8865580558776855 }, { "auxiliary_loss_clip": 0.01405339, "auxiliary_loss_mlp": 0.0103391, "balance_loss_clip": 1.24261796, "balance_loss_mlp": 1.01446688, "epoch": 0.6340598226364046, "flos": 12977290166400.0, "grad_norm": 2.0043148903065866, "language_loss": 0.72218841, "learning_rate": 1.2476503068844592e-06, "loss": 0.74658084, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19458008, "step": 10546, "time_per_iteration": 4.264515399932861 }, { "auxiliary_loss_clip": 0.01405071, "auxiliary_loss_mlp": 0.01035518, "balance_loss_clip": 1.24500895, "balance_loss_mlp": 1.01555037, "epoch": 0.6341199458890726, "flos": 26699294062080.0, "grad_norm": 1.3576352523981463, "language_loss": 0.78601122, "learning_rate": 1.2472894659384792e-06, "loss": 0.81041718, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19958496, "step": 10547, "time_per_iteration": 4.348164319992065 }, { "auxiliary_loss_clip": 0.0143357, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.26420236, "balance_loss_mlp": 1.01523089, "epoch": 0.6341800691417405, "flos": 18743568099840.0, "grad_norm": 1.878477914759935, "language_loss": 0.64017516, "learning_rate": 1.2469286535353578e-06, "loss": 0.66486156, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19836426, "step": 10548, "time_per_iteration": 2.835376024246216 }, { "auxiliary_loss_clip": 0.0142116, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.25475979, "balance_loss_mlp": 1.01407897, "epoch": 0.6342401923944085, "flos": 26260454315520.0, "grad_norm": 1.6897725618060273, "language_loss": 0.62642962, "learning_rate": 1.2465678696887785e-06, "loss": 0.65097713, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19519043, "step": 10549, "time_per_iteration": 2.916325330734253 }, { "auxiliary_loss_clip": 0.01415569, "auxiliary_loss_mlp": 0.01033787, "balance_loss_clip": 1.25073195, "balance_loss_mlp": 1.01406944, "epoch": 0.6343003156470765, "flos": 24691640215680.0, "grad_norm": 1.7800709781878616, "language_loss": 0.74584419, "learning_rate": 1.2462071144124197e-06, "loss": 0.7703377, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19714355, "step": 10550, "time_per_iteration": 2.8998491764068604 }, { "auxiliary_loss_clip": 0.01192626, "auxiliary_loss_mlp": 0.01018224, "balance_loss_clip": 1.10013199, "balance_loss_mlp": 0.99628985, "epoch": 0.6343604388997445, "flos": 69835035173760.0, "grad_norm": 0.7241466515962187, "language_loss": 0.57686305, "learning_rate": 1.2458463877199638e-06, "loss": 0.59897149, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.21972656, "step": 10551, "time_per_iteration": 3.3953652381896973 }, { "auxiliary_loss_clip": 0.01415556, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.24998713, "balance_loss_mlp": 1.0132426, "epoch": 0.6344205621524125, "flos": 21992806517760.0, "grad_norm": 2.119479702480203, "language_loss": 0.67833209, "learning_rate": 1.2454856896250881e-06, "loss": 0.70282435, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20410156, "step": 10552, "time_per_iteration": 2.8635590076446533 }, { "auxiliary_loss_clip": 0.01428623, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 1.25920224, "balance_loss_mlp": 1.01029742, "epoch": 0.6344806854050804, "flos": 20458541462400.0, "grad_norm": 1.6433394350698887, "language_loss": 0.83575642, "learning_rate": 1.24512502014147e-06, "loss": 0.8603456, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19970703, "step": 10553, "time_per_iteration": 2.8822944164276123 }, { "auxiliary_loss_clip": 0.01434637, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.26499844, "balance_loss_mlp": 1.01831174, "epoch": 0.6345408086577484, "flos": 40524085440000.0, "grad_norm": 3.1703811654028393, "language_loss": 0.55994231, "learning_rate": 1.2447643792827879e-06, "loss": 0.58467948, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20776367, "step": 10554, "time_per_iteration": 2.997152090072632 }, { "auxiliary_loss_clip": 0.01426271, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.25935173, "balance_loss_mlp": 1.01558876, "epoch": 0.6346009319104163, "flos": 21371312937600.0, "grad_norm": 1.7265979553720885, "language_loss": 0.71625817, "learning_rate": 1.2444037670627153e-06, "loss": 0.74087524, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19848633, "step": 10555, "time_per_iteration": 2.873615026473999 }, { "auxiliary_loss_clip": 0.01190227, "auxiliary_loss_mlp": 0.01032749, "balance_loss_clip": 1.09946394, "balance_loss_mlp": 1.01644158, "epoch": 0.6346610551630844, "flos": 71393216745600.0, "grad_norm": 0.7866523840235039, "language_loss": 0.55497593, "learning_rate": 1.2440431834949276e-06, "loss": 0.57720566, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.16308594, "step": 10556, "time_per_iteration": 3.3035776615142822 }, { "auxiliary_loss_clip": 0.01429278, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.2592206, "balance_loss_mlp": 1.01530325, "epoch": 0.6347211784157523, "flos": 25422617508480.0, "grad_norm": 2.0608695851191583, "language_loss": 0.68971574, "learning_rate": 1.2436826285930985e-06, "loss": 0.71436799, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2064209, "step": 10557, "time_per_iteration": 2.8911702632904053 }, { "auxiliary_loss_clip": 0.01417474, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.25124598, "balance_loss_mlp": 1.01022053, "epoch": 0.6347813016684203, "flos": 15751330001280.0, "grad_norm": 1.5402664149742429, "language_loss": 0.70862573, "learning_rate": 1.2433221023709002e-06, "loss": 0.73310518, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20263672, "step": 10558, "time_per_iteration": 2.8304927349090576 }, { "auxiliary_loss_clip": 0.01413045, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.24721932, "balance_loss_mlp": 1.0136373, "epoch": 0.6348414249210882, "flos": 21473240768640.0, "grad_norm": 1.5535912548941604, "language_loss": 0.78892314, "learning_rate": 1.2429616048420031e-06, "loss": 0.81339675, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20678711, "step": 10559, "time_per_iteration": 2.8969550132751465 }, { "auxiliary_loss_clip": 0.01426814, "auxiliary_loss_mlp": 0.01038156, "balance_loss_clip": 1.25940561, "balance_loss_mlp": 1.01743698, "epoch": 0.6349015481737562, "flos": 21663088525440.0, "grad_norm": 1.6554719594091163, "language_loss": 0.69258499, "learning_rate": 1.242601136020078e-06, "loss": 0.71723461, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20727539, "step": 10560, "time_per_iteration": 2.8830788135528564 }, { "auxiliary_loss_clip": 0.01424044, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.25664949, "balance_loss_mlp": 1.01489294, "epoch": 0.6349616714264241, "flos": 22203765613440.0, "grad_norm": 1.6864240167462674, "language_loss": 0.77578455, "learning_rate": 1.2422406959187939e-06, "loss": 0.80037892, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.2052002, "step": 10561, "time_per_iteration": 2.908672332763672 }, { "auxiliary_loss_clip": 0.01430751, "auxiliary_loss_mlp": 0.01036716, "balance_loss_clip": 1.26137996, "balance_loss_mlp": 1.01630735, "epoch": 0.6350217946790921, "flos": 25421169674880.0, "grad_norm": 1.8462626674297673, "language_loss": 0.73129296, "learning_rate": 1.2418802845518178e-06, "loss": 0.75596762, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20410156, "step": 10562, "time_per_iteration": 2.9103801250457764 }, { "auxiliary_loss_clip": 0.01433511, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.2648387, "balance_loss_mlp": 1.0153091, "epoch": 0.63508191793176, "flos": 19728378576000.0, "grad_norm": 2.1972553024895807, "language_loss": 0.81327558, "learning_rate": 1.2415199019328185e-06, "loss": 0.83796954, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20568848, "step": 10563, "time_per_iteration": 2.960369348526001 }, { "auxiliary_loss_clip": 0.01429755, "auxiliary_loss_mlp": 0.01037955, "balance_loss_clip": 1.26174676, "balance_loss_mlp": 1.01761806, "epoch": 0.6351420411844281, "flos": 18196059047040.0, "grad_norm": 4.580599084713165, "language_loss": 0.81768715, "learning_rate": 1.2411595480754597e-06, "loss": 0.84236425, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20336914, "step": 10564, "time_per_iteration": 2.855085611343384 }, { "auxiliary_loss_clip": 0.01418043, "auxiliary_loss_mlp": 0.01035088, "balance_loss_clip": 1.25170648, "balance_loss_mlp": 1.0163604, "epoch": 0.6352021644370961, "flos": 33738809944320.0, "grad_norm": 1.741208974667203, "language_loss": 0.73491901, "learning_rate": 1.240799222993407e-06, "loss": 0.75945032, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.18737793, "step": 10565, "time_per_iteration": 2.994623899459839 }, { "auxiliary_loss_clip": 0.01434738, "auxiliary_loss_mlp": 0.01035656, "balance_loss_clip": 1.26567805, "balance_loss_mlp": 1.01475894, "epoch": 0.635262287689764, "flos": 20384149731840.0, "grad_norm": 3.135448863318975, "language_loss": 0.70233369, "learning_rate": 1.240438926700324e-06, "loss": 0.72703767, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20898438, "step": 10566, "time_per_iteration": 2.860374689102173 }, { "auxiliary_loss_clip": 0.01413688, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.25135374, "balance_loss_mlp": 1.01645255, "epoch": 0.635322410942432, "flos": 27536497441920.0, "grad_norm": 12.913596653768852, "language_loss": 0.70407057, "learning_rate": 1.2400786592098725e-06, "loss": 0.72855753, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1854248, "step": 10567, "time_per_iteration": 2.938150644302368 }, { "auxiliary_loss_clip": 0.01415558, "auxiliary_loss_mlp": 0.01032865, "balance_loss_clip": 1.25348902, "balance_loss_mlp": 1.01361287, "epoch": 0.6353825341950999, "flos": 21553604812800.0, "grad_norm": 2.2653138809590065, "language_loss": 0.85105354, "learning_rate": 1.2397184205357154e-06, "loss": 0.87553775, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19238281, "step": 10568, "time_per_iteration": 2.889739751815796 }, { "auxiliary_loss_clip": 0.01419647, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.25323343, "balance_loss_mlp": 1.01526165, "epoch": 0.635442657447768, "flos": 31772446617600.0, "grad_norm": 2.077714547770781, "language_loss": 0.85312241, "learning_rate": 1.2393582106915113e-06, "loss": 0.87766558, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19421387, "step": 10569, "time_per_iteration": 4.331791162490845 }, { "auxiliary_loss_clip": 0.01401536, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.23854494, "balance_loss_mlp": 1.01706183, "epoch": 0.6355027807004359, "flos": 19838405226240.0, "grad_norm": 1.577611732564103, "language_loss": 0.70315707, "learning_rate": 1.2389980296909198e-06, "loss": 0.72754008, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19702148, "step": 10570, "time_per_iteration": 2.8639445304870605 }, { "auxiliary_loss_clip": 0.01423435, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.25497532, "balance_loss_mlp": 1.01319909, "epoch": 0.6355629039531039, "flos": 30384476559360.0, "grad_norm": 1.707781046795074, "language_loss": 0.67111802, "learning_rate": 1.2386378775476e-06, "loss": 0.69568372, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19934082, "step": 10571, "time_per_iteration": 2.903986930847168 }, { "auxiliary_loss_clip": 0.01430272, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 1.26195788, "balance_loss_mlp": 1.01528406, "epoch": 0.6356230272057718, "flos": 17941275722880.0, "grad_norm": 1.6740597913142996, "language_loss": 0.71968341, "learning_rate": 1.2382777542752074e-06, "loss": 0.74435496, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.21618652, "step": 10572, "time_per_iteration": 2.8243138790130615 }, { "auxiliary_loss_clip": 0.01417017, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.25312257, "balance_loss_mlp": 1.0141319, "epoch": 0.6356831504584398, "flos": 25387163568000.0, "grad_norm": 1.4754539912150084, "language_loss": 0.81496662, "learning_rate": 1.2379176598873992e-06, "loss": 0.8394742, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19616699, "step": 10573, "time_per_iteration": 2.927483558654785 }, { "auxiliary_loss_clip": 0.01426024, "auxiliary_loss_mlp": 0.01041413, "balance_loss_clip": 1.25889492, "balance_loss_mlp": 1.02115965, "epoch": 0.6357432737111077, "flos": 46516931925120.0, "grad_norm": 1.995876584604385, "language_loss": 0.69803137, "learning_rate": 1.2375575943978303e-06, "loss": 0.72270572, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20251465, "step": 10574, "time_per_iteration": 3.102177381515503 }, { "auxiliary_loss_clip": 0.01422901, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.25821173, "balance_loss_mlp": 1.01605785, "epoch": 0.6358033969637757, "flos": 17283151837440.0, "grad_norm": 2.2642023523274086, "language_loss": 0.87420493, "learning_rate": 1.2371975578201525e-06, "loss": 0.89880967, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.21533203, "step": 10575, "time_per_iteration": 2.8774237632751465 }, { "auxiliary_loss_clip": 0.01432007, "auxiliary_loss_mlp": 0.01040029, "balance_loss_clip": 1.26723969, "balance_loss_mlp": 1.02062201, "epoch": 0.6358635202164437, "flos": 27136866954240.0, "grad_norm": 1.5471851917253399, "language_loss": 0.72336543, "learning_rate": 1.2368375501680204e-06, "loss": 0.7480858, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19396973, "step": 10576, "time_per_iteration": 2.8966774940490723 }, { "auxiliary_loss_clip": 0.01431117, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.26402283, "balance_loss_mlp": 1.01930499, "epoch": 0.6359236434691117, "flos": 27536225973120.0, "grad_norm": 2.681853939996624, "language_loss": 0.69916642, "learning_rate": 1.236477571455085e-06, "loss": 0.72387522, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20458984, "step": 10577, "time_per_iteration": 2.956602096557617 }, { "auxiliary_loss_clip": 0.01416463, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.25184834, "balance_loss_mlp": 1.01761866, "epoch": 0.6359837667217797, "flos": 39362819667840.0, "grad_norm": 1.7625520190110893, "language_loss": 0.73049545, "learning_rate": 1.2361176216949964e-06, "loss": 0.75502837, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19213867, "step": 10578, "time_per_iteration": 3.0018527507781982 }, { "auxiliary_loss_clip": 0.01193301, "auxiliary_loss_mlp": 0.01026446, "balance_loss_clip": 1.10213971, "balance_loss_mlp": 1.00737286, "epoch": 0.6360438899744476, "flos": 56439672624000.0, "grad_norm": 0.7097616740963882, "language_loss": 0.5457049, "learning_rate": 1.2357577009014044e-06, "loss": 0.56790239, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.19042969, "step": 10579, "time_per_iteration": 4.81935715675354 }, { "auxiliary_loss_clip": 0.01409296, "auxiliary_loss_mlp": 0.01036608, "balance_loss_clip": 1.24564791, "balance_loss_mlp": 1.01706934, "epoch": 0.6361040132271156, "flos": 24983958741120.0, "grad_norm": 2.754332172615228, "language_loss": 0.7817418, "learning_rate": 1.2353978090879568e-06, "loss": 0.80620086, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19555664, "step": 10580, "time_per_iteration": 2.8833963871002197 }, { "auxiliary_loss_clip": 0.0142142, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.2572304, "balance_loss_mlp": 1.01288867, "epoch": 0.6361641364797835, "flos": 23269845029760.0, "grad_norm": 1.8998657723042809, "language_loss": 0.67131698, "learning_rate": 1.235037946268301e-06, "loss": 0.69584846, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18859863, "step": 10581, "time_per_iteration": 4.316204071044922 }, { "auxiliary_loss_clip": 0.01408869, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.24583614, "balance_loss_mlp": 1.01711893, "epoch": 0.6362242597324516, "flos": 26005580501760.0, "grad_norm": 1.3440964918187783, "language_loss": 0.69150364, "learning_rate": 1.2346781124560828e-06, "loss": 0.71595347, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18981934, "step": 10582, "time_per_iteration": 4.347703456878662 }, { "auxiliary_loss_clip": 0.01427786, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.26064873, "balance_loss_mlp": 1.02040148, "epoch": 0.6362843829851195, "flos": 25714393096320.0, "grad_norm": 1.7224745402780957, "language_loss": 0.85513926, "learning_rate": 1.2343183076649473e-06, "loss": 0.87981468, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19335938, "step": 10583, "time_per_iteration": 2.8928844928741455 }, { "auxiliary_loss_clip": 0.01412062, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.25054669, "balance_loss_mlp": 1.01947212, "epoch": 0.6363445062377875, "flos": 20532887948160.0, "grad_norm": 1.5227455626226185, "language_loss": 0.76192576, "learning_rate": 1.233958531908538e-06, "loss": 0.78644627, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20507812, "step": 10584, "time_per_iteration": 2.8696115016937256 }, { "auxiliary_loss_clip": 0.01417218, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.25055683, "balance_loss_mlp": 1.01737976, "epoch": 0.6364046294904554, "flos": 19473233293440.0, "grad_norm": 2.0630833970609532, "language_loss": 0.73611951, "learning_rate": 1.2335987852004985e-06, "loss": 0.76067394, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20825195, "step": 10585, "time_per_iteration": 2.868460178375244 }, { "auxiliary_loss_clip": 0.01425073, "auxiliary_loss_mlp": 0.01033077, "balance_loss_clip": 1.26008201, "balance_loss_mlp": 1.01374125, "epoch": 0.6364647527431234, "flos": 21005643312000.0, "grad_norm": 1.7944357164275786, "language_loss": 0.83554518, "learning_rate": 1.2332390675544697e-06, "loss": 0.86012673, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19335938, "step": 10586, "time_per_iteration": 2.8866946697235107 }, { "auxiliary_loss_clip": 0.01409184, "auxiliary_loss_mlp": 0.01030315, "balance_loss_clip": 1.24659169, "balance_loss_mlp": 1.0110743, "epoch": 0.6365248759957913, "flos": 25780550273280.0, "grad_norm": 1.5685212476703394, "language_loss": 0.73136413, "learning_rate": 1.2328793789840918e-06, "loss": 0.75575912, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19238281, "step": 10587, "time_per_iteration": 2.958512306213379 }, { "auxiliary_loss_clip": 0.01428326, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.26136446, "balance_loss_mlp": 1.0147289, "epoch": 0.6365849992484593, "flos": 22465788105600.0, "grad_norm": 1.936855784689104, "language_loss": 0.77571714, "learning_rate": 1.2325197195030058e-06, "loss": 0.80033326, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18566895, "step": 10588, "time_per_iteration": 2.9962804317474365 }, { "auxiliary_loss_clip": 0.01400661, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.24122858, "balance_loss_mlp": 1.01160669, "epoch": 0.6366451225011273, "flos": 19034936484480.0, "grad_norm": 1.3668919056259032, "language_loss": 0.80311882, "learning_rate": 1.2321600891248478e-06, "loss": 0.82743812, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1965332, "step": 10589, "time_per_iteration": 2.9305076599121094 }, { "auxiliary_loss_clip": 0.01415845, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.25303173, "balance_loss_mlp": 1.01192307, "epoch": 0.6367052457537953, "flos": 25239285002880.0, "grad_norm": 2.108781839617669, "language_loss": 0.68065667, "learning_rate": 1.231800487863257e-06, "loss": 0.70512891, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19470215, "step": 10590, "time_per_iteration": 2.8894801139831543 }, { "auxiliary_loss_clip": 0.01426443, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.25595176, "balance_loss_mlp": 1.01109374, "epoch": 0.6367653690064633, "flos": 19217816542080.0, "grad_norm": 1.7479076855526154, "language_loss": 0.79758489, "learning_rate": 1.2314409157318685e-06, "loss": 0.82215655, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.19616699, "step": 10591, "time_per_iteration": 2.8550961017608643 }, { "auxiliary_loss_clip": 0.01407532, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.24612641, "balance_loss_mlp": 1.01417434, "epoch": 0.6368254922591312, "flos": 23556417465600.0, "grad_norm": 1.7488806719252232, "language_loss": 0.89462864, "learning_rate": 1.231081372744317e-06, "loss": 0.91903448, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18884277, "step": 10592, "time_per_iteration": 2.907463312149048 }, { "auxiliary_loss_clip": 0.01402388, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.24125302, "balance_loss_mlp": 1.01173925, "epoch": 0.6368856155117992, "flos": 26478109641600.0, "grad_norm": 1.3397571402311352, "language_loss": 0.69192624, "learning_rate": 1.2307218589142376e-06, "loss": 0.71625769, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19006348, "step": 10593, "time_per_iteration": 2.932339668273926 }, { "auxiliary_loss_clip": 0.01406732, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.24401164, "balance_loss_mlp": 1.01391792, "epoch": 0.6369457387644671, "flos": 33705346775040.0, "grad_norm": 1.7149351663367853, "language_loss": 0.64017713, "learning_rate": 1.2303623742552618e-06, "loss": 0.66457868, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19494629, "step": 10594, "time_per_iteration": 3.0017402172088623 }, { "auxiliary_loss_clip": 0.01190368, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.09842455, "balance_loss_mlp": 1.01252711, "epoch": 0.6370058620171352, "flos": 70940821541760.0, "grad_norm": 0.7870682531777012, "language_loss": 0.54687274, "learning_rate": 1.230002918781022e-06, "loss": 0.56910294, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.20117188, "step": 10595, "time_per_iteration": 3.5273349285125732 }, { "auxiliary_loss_clip": 0.01430058, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.26218736, "balance_loss_mlp": 1.01487017, "epoch": 0.6370659852698031, "flos": 21151757329920.0, "grad_norm": 1.7176365065226327, "language_loss": 0.67989755, "learning_rate": 1.2296434925051493e-06, "loss": 0.70454401, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19714355, "step": 10596, "time_per_iteration": 2.892890214920044 }, { "auxiliary_loss_clip": 0.01409918, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.24642992, "balance_loss_mlp": 1.01145482, "epoch": 0.6371261085224711, "flos": 20202853242240.0, "grad_norm": 2.1852321942164568, "language_loss": 0.80372751, "learning_rate": 1.2292840954412718e-06, "loss": 0.82814223, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20092773, "step": 10597, "time_per_iteration": 2.9108800888061523 }, { "auxiliary_loss_clip": 0.01426753, "auxiliary_loss_mlp": 0.01035943, "balance_loss_clip": 1.26053691, "balance_loss_mlp": 1.01611876, "epoch": 0.637186231775139, "flos": 19693241349120.0, "grad_norm": 1.6315838106475793, "language_loss": 0.74797535, "learning_rate": 1.2289247276030189e-06, "loss": 0.77260226, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19824219, "step": 10598, "time_per_iteration": 2.8771023750305176 }, { "auxiliary_loss_clip": 0.01418334, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.25267899, "balance_loss_mlp": 1.01477051, "epoch": 0.637246355027807, "flos": 13077001002240.0, "grad_norm": 3.293320990180704, "language_loss": 0.690162, "learning_rate": 1.2285653890040176e-06, "loss": 0.71469462, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20178223, "step": 10599, "time_per_iteration": 2.875793695449829 }, { "auxiliary_loss_clip": 0.01428189, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.25862646, "balance_loss_mlp": 1.01917505, "epoch": 0.6373064782804749, "flos": 18231603477120.0, "grad_norm": 2.275020919271935, "language_loss": 0.81714725, "learning_rate": 1.2282060796578942e-06, "loss": 0.8418231, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20214844, "step": 10600, "time_per_iteration": 2.861290693283081 }, { "auxiliary_loss_clip": 0.01409905, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.24651229, "balance_loss_mlp": 1.01412392, "epoch": 0.637366601533143, "flos": 24509122116480.0, "grad_norm": 1.4510349697095504, "language_loss": 0.80258715, "learning_rate": 1.2278467995782732e-06, "loss": 0.82703066, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20336914, "step": 10601, "time_per_iteration": 2.929255247116089 }, { "auxiliary_loss_clip": 0.01416891, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.25001729, "balance_loss_mlp": 1.01383436, "epoch": 0.6374267247858109, "flos": 26370119007360.0, "grad_norm": 2.105873162889238, "language_loss": 0.67679965, "learning_rate": 1.2274875487787797e-06, "loss": 0.70130908, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20214844, "step": 10602, "time_per_iteration": 2.916118621826172 }, { "auxiliary_loss_clip": 0.0141652, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.25027072, "balance_loss_mlp": 1.01559663, "epoch": 0.6374868480384789, "flos": 20380303923840.0, "grad_norm": 1.650262179358348, "language_loss": 0.80250895, "learning_rate": 1.2271283272730354e-06, "loss": 0.82702506, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19494629, "step": 10603, "time_per_iteration": 2.870232105255127 }, { "auxiliary_loss_clip": 0.01416429, "auxiliary_loss_mlp": 0.01031619, "balance_loss_clip": 1.25123394, "balance_loss_mlp": 1.01099634, "epoch": 0.6375469712911469, "flos": 21006095760000.0, "grad_norm": 1.9192523967253292, "language_loss": 0.7809096, "learning_rate": 1.2267691350746621e-06, "loss": 0.80539006, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20629883, "step": 10604, "time_per_iteration": 4.327500581741333 }, { "auxiliary_loss_clip": 0.01430147, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.26006079, "balance_loss_mlp": 1.01334321, "epoch": 0.6376070945438148, "flos": 19724351788800.0, "grad_norm": 1.6029820786924245, "language_loss": 0.77452469, "learning_rate": 1.226409972197281e-06, "loss": 0.79915321, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19360352, "step": 10605, "time_per_iteration": 3.0158066749572754 }, { "auxiliary_loss_clip": 0.01425567, "auxiliary_loss_mlp": 0.01033239, "balance_loss_clip": 1.25685799, "balance_loss_mlp": 1.01198435, "epoch": 0.6376672177964828, "flos": 21516657793920.0, "grad_norm": 2.324611814441127, "language_loss": 0.66280955, "learning_rate": 1.2260508386545106e-06, "loss": 0.6873976, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21264648, "step": 10606, "time_per_iteration": 2.919983386993408 }, { "auxiliary_loss_clip": 0.0139559, "auxiliary_loss_mlp": 0.01036854, "balance_loss_clip": 1.23555243, "balance_loss_mlp": 1.01648116, "epoch": 0.6377273410491507, "flos": 18853232791680.0, "grad_norm": 1.5589402164559836, "language_loss": 0.75836182, "learning_rate": 1.225691734459971e-06, "loss": 0.78268623, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.20361328, "step": 10607, "time_per_iteration": 2.8723268508911133 }, { "auxiliary_loss_clip": 0.01427652, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.26156688, "balance_loss_mlp": 1.02097714, "epoch": 0.6377874643018188, "flos": 53080027614720.0, "grad_norm": 5.980226397178839, "language_loss": 0.66144705, "learning_rate": 1.225332659627278e-06, "loss": 0.68613017, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19689941, "step": 10608, "time_per_iteration": 3.1558682918548584 }, { "auxiliary_loss_clip": 0.01190254, "auxiliary_loss_mlp": 0.01021066, "balance_loss_clip": 1.09873652, "balance_loss_mlp": 1.00046635, "epoch": 0.6378475875544867, "flos": 65163458632320.0, "grad_norm": 0.7109534433152319, "language_loss": 0.51855457, "learning_rate": 1.2249736141700475e-06, "loss": 0.54066765, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.20605469, "step": 10609, "time_per_iteration": 3.384315013885498 }, { "auxiliary_loss_clip": 0.01401672, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.2394278, "balance_loss_mlp": 1.01489842, "epoch": 0.6379077108071547, "flos": 23013070934400.0, "grad_norm": 1.5666766323443047, "language_loss": 0.75608134, "learning_rate": 1.2246145981018965e-06, "loss": 0.78043419, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18737793, "step": 10610, "time_per_iteration": 2.921985149383545 }, { "auxiliary_loss_clip": 0.01191757, "auxiliary_loss_mlp": 0.01021953, "balance_loss_clip": 1.10152173, "balance_loss_mlp": 1.00097179, "epoch": 0.6379678340598226, "flos": 67636827636480.0, "grad_norm": 0.8503768416156406, "language_loss": 0.63187242, "learning_rate": 1.2242556114364364e-06, "loss": 0.65400958, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.20996094, "step": 10611, "time_per_iteration": 3.3566787242889404 }, { "auxiliary_loss_clip": 0.01417855, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.25246584, "balance_loss_mlp": 1.01522398, "epoch": 0.6380279573124906, "flos": 29692210832640.0, "grad_norm": 2.198029529750739, "language_loss": 0.73653013, "learning_rate": 1.223896654187282e-06, "loss": 0.76105344, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19250488, "step": 10612, "time_per_iteration": 2.974773406982422 }, { "auxiliary_loss_clip": 0.01190947, "auxiliary_loss_mlp": 0.01014853, "balance_loss_clip": 1.10014606, "balance_loss_mlp": 0.99816418, "epoch": 0.6380880805651585, "flos": 66512581390080.0, "grad_norm": 0.7174302918635455, "language_loss": 0.57945591, "learning_rate": 1.2235377263680446e-06, "loss": 0.60151386, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.16699219, "step": 10613, "time_per_iteration": 3.201345205307007 }, { "auxiliary_loss_clip": 0.01420807, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.25543833, "balance_loss_mlp": 1.0158093, "epoch": 0.6381482038178266, "flos": 23925570940800.0, "grad_norm": 1.9732364729060767, "language_loss": 0.76043212, "learning_rate": 1.2231788279923334e-06, "loss": 0.78499901, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20080566, "step": 10614, "time_per_iteration": 4.344970703125 }, { "auxiliary_loss_clip": 0.01410409, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.24769926, "balance_loss_mlp": 1.01673031, "epoch": 0.6382083270704945, "flos": 24253614875520.0, "grad_norm": 2.262292130660365, "language_loss": 0.80941999, "learning_rate": 1.2228199590737599e-06, "loss": 0.83389175, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20031738, "step": 10615, "time_per_iteration": 2.909511089324951 }, { "auxiliary_loss_clip": 0.01192863, "auxiliary_loss_mlp": 0.01024557, "balance_loss_clip": 1.10013855, "balance_loss_mlp": 1.0005244, "epoch": 0.6382684503231625, "flos": 70811086141440.0, "grad_norm": 0.6582320655047929, "language_loss": 0.55674541, "learning_rate": 1.2224611196259305e-06, "loss": 0.57891959, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.24023438, "step": 10616, "time_per_iteration": 3.352346658706665 }, { "auxiliary_loss_clip": 0.01429428, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.2627126, "balance_loss_mlp": 1.01542401, "epoch": 0.6383285735758305, "flos": 16553486643840.0, "grad_norm": 4.783911857562708, "language_loss": 0.85377115, "learning_rate": 1.2221023096624538e-06, "loss": 0.87841743, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19775391, "step": 10617, "time_per_iteration": 5.810904026031494 }, { "auxiliary_loss_clip": 0.01421031, "auxiliary_loss_mlp": 0.01038587, "balance_loss_clip": 1.2545526, "balance_loss_mlp": 1.01854837, "epoch": 0.6383886968284984, "flos": 14435489433600.0, "grad_norm": 1.772428926348425, "language_loss": 0.8751663, "learning_rate": 1.221743529196936e-06, "loss": 0.89976251, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20043945, "step": 10618, "time_per_iteration": 2.852480173110962 }, { "auxiliary_loss_clip": 0.0142712, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.2609098, "balance_loss_mlp": 1.01736641, "epoch": 0.6384488200811664, "flos": 17938244321280.0, "grad_norm": 2.4304630797388436, "language_loss": 0.74075896, "learning_rate": 1.2213847782429806e-06, "loss": 0.76539278, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.18884277, "step": 10619, "time_per_iteration": 2.8696787357330322 }, { "auxiliary_loss_clip": 0.0144455, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.2713306, "balance_loss_mlp": 1.02162051, "epoch": 0.6385089433338343, "flos": 18524872143360.0, "grad_norm": 1.9117435515245458, "language_loss": 0.76809472, "learning_rate": 1.221026056814193e-06, "loss": 0.79297388, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.21728516, "step": 10620, "time_per_iteration": 2.834789752960205 }, { "auxiliary_loss_clip": 0.01424594, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.25993049, "balance_loss_mlp": 1.01450121, "epoch": 0.6385690665865024, "flos": 24764267399040.0, "grad_norm": 2.354045156599609, "language_loss": 0.72596782, "learning_rate": 1.2206673649241752e-06, "loss": 0.75055391, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19506836, "step": 10621, "time_per_iteration": 2.8915207386016846 }, { "auxiliary_loss_clip": 0.01395112, "auxiliary_loss_mlp": 0.01029838, "balance_loss_clip": 1.23581576, "balance_loss_mlp": 1.01076484, "epoch": 0.6386291898391703, "flos": 20130180814080.0, "grad_norm": 1.7326989499827687, "language_loss": 0.788149, "learning_rate": 1.220308702586529e-06, "loss": 0.81239855, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19055176, "step": 10622, "time_per_iteration": 2.897085189819336 }, { "auxiliary_loss_clip": 0.01400834, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.23960865, "balance_loss_mlp": 1.01289821, "epoch": 0.6386893130918383, "flos": 16874472389760.0, "grad_norm": 1.8656485860660403, "language_loss": 0.75478286, "learning_rate": 1.2199500698148546e-06, "loss": 0.77910721, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18676758, "step": 10623, "time_per_iteration": 2.8348002433776855 }, { "auxiliary_loss_clip": 0.01404102, "auxiliary_loss_mlp": 0.01034302, "balance_loss_clip": 1.2434181, "balance_loss_mlp": 1.01526415, "epoch": 0.6387494363445062, "flos": 22976576363520.0, "grad_norm": 2.105012327800066, "language_loss": 0.76819652, "learning_rate": 1.2195914666227527e-06, "loss": 0.79258054, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19018555, "step": 10624, "time_per_iteration": 2.9461028575897217 }, { "auxiliary_loss_clip": 0.01421476, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.25560713, "balance_loss_mlp": 1.01604056, "epoch": 0.6388095595971742, "flos": 22868223770880.0, "grad_norm": 1.8815024198633832, "language_loss": 0.81112021, "learning_rate": 1.21923289302382e-06, "loss": 0.83569729, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20202637, "step": 10625, "time_per_iteration": 2.892503261566162 }, { "auxiliary_loss_clip": 0.01435147, "auxiliary_loss_mlp": 0.01041965, "balance_loss_clip": 1.2671243, "balance_loss_mlp": 1.02131808, "epoch": 0.6388696828498421, "flos": 17320506059520.0, "grad_norm": 4.8590372101689425, "language_loss": 0.73667407, "learning_rate": 1.218874349031654e-06, "loss": 0.76144516, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.2064209, "step": 10626, "time_per_iteration": 2.8904950618743896 }, { "auxiliary_loss_clip": 0.01431019, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.26452756, "balance_loss_mlp": 1.01484311, "epoch": 0.6389298061025102, "flos": 17137445022720.0, "grad_norm": 1.7914528375793113, "language_loss": 0.73458624, "learning_rate": 1.2185158346598517e-06, "loss": 0.75924754, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20263672, "step": 10627, "time_per_iteration": 2.8895771503448486 }, { "auxiliary_loss_clip": 0.01441741, "auxiliary_loss_mlp": 0.01039246, "balance_loss_clip": 1.27066541, "balance_loss_mlp": 1.01762176, "epoch": 0.6389899293551781, "flos": 27722906593920.0, "grad_norm": 1.6079667470696684, "language_loss": 0.67835248, "learning_rate": 1.2181573499220064e-06, "loss": 0.70316231, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.21630859, "step": 10628, "time_per_iteration": 2.922659158706665 }, { "auxiliary_loss_clip": 0.0140187, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.24035263, "balance_loss_mlp": 1.01469791, "epoch": 0.6390500526078461, "flos": 21225741857280.0, "grad_norm": 1.763853472525078, "language_loss": 0.68809628, "learning_rate": 1.2177988948317135e-06, "loss": 0.71245688, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19470215, "step": 10629, "time_per_iteration": 2.912322998046875 }, { "auxiliary_loss_clip": 0.01437087, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.26388943, "balance_loss_mlp": 1.02114654, "epoch": 0.6391101758605141, "flos": 21591320993280.0, "grad_norm": 1.5218047276149684, "language_loss": 0.75778008, "learning_rate": 1.2174404694025646e-06, "loss": 0.78258324, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.22094727, "step": 10630, "time_per_iteration": 2.944474935531616 }, { "auxiliary_loss_clip": 0.01415585, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.25176167, "balance_loss_mlp": 1.01865625, "epoch": 0.639170299113182, "flos": 19909720310400.0, "grad_norm": 1.6302352487402056, "language_loss": 0.70919836, "learning_rate": 1.2170820736481511e-06, "loss": 0.73374021, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19934082, "step": 10631, "time_per_iteration": 2.9247424602508545 }, { "auxiliary_loss_clip": 0.01186176, "auxiliary_loss_mlp": 0.01027647, "balance_loss_clip": 1.09630311, "balance_loss_mlp": 1.00533152, "epoch": 0.63923042236585, "flos": 69907119419520.0, "grad_norm": 0.7736767569588264, "language_loss": 0.63027972, "learning_rate": 1.2167237075820646e-06, "loss": 0.65241796, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.22363281, "step": 10632, "time_per_iteration": 3.394996404647827 }, { "auxiliary_loss_clip": 0.01411491, "auxiliary_loss_mlp": 0.01039406, "balance_loss_clip": 1.24773991, "balance_loss_mlp": 1.0188663, "epoch": 0.639290545618518, "flos": 22685162734080.0, "grad_norm": 1.7549325404893397, "language_loss": 0.67521405, "learning_rate": 1.216365371217893e-06, "loss": 0.69972301, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20544434, "step": 10633, "time_per_iteration": 2.8727967739105225 }, { "auxiliary_loss_clip": 0.01418026, "auxiliary_loss_mlp": 0.01034473, "balance_loss_clip": 1.25359845, "balance_loss_mlp": 1.01448166, "epoch": 0.639350668871186, "flos": 19838857674240.0, "grad_norm": 1.8832953461727837, "language_loss": 0.82526946, "learning_rate": 1.216007064569225e-06, "loss": 0.84979451, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19995117, "step": 10634, "time_per_iteration": 2.8772552013397217 }, { "auxiliary_loss_clip": 0.01423851, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.25953674, "balance_loss_mlp": 1.01608789, "epoch": 0.6394107921238539, "flos": 20561736147840.0, "grad_norm": 1.6351288090620961, "language_loss": 0.75842631, "learning_rate": 1.2156487876496483e-06, "loss": 0.7830233, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19763184, "step": 10635, "time_per_iteration": 2.8638575077056885 }, { "auxiliary_loss_clip": 0.01419533, "auxiliary_loss_mlp": 0.01037367, "balance_loss_clip": 1.25356746, "balance_loss_mlp": 1.01630259, "epoch": 0.6394709153765219, "flos": 25785843914880.0, "grad_norm": 1.9103929695053528, "language_loss": 0.72311699, "learning_rate": 1.2152905404727475e-06, "loss": 0.74768591, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21057129, "step": 10636, "time_per_iteration": 2.9158835411071777 }, { "auxiliary_loss_clip": 0.01436455, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.26676583, "balance_loss_mlp": 1.0181272, "epoch": 0.6395310386291898, "flos": 17539111526400.0, "grad_norm": 1.7827667256817197, "language_loss": 0.74897647, "learning_rate": 1.2149323230521085e-06, "loss": 0.77372301, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20068359, "step": 10637, "time_per_iteration": 2.8634557723999023 }, { "auxiliary_loss_clip": 0.01429432, "auxiliary_loss_mlp": 0.01036246, "balance_loss_clip": 1.26219583, "balance_loss_mlp": 1.01617146, "epoch": 0.6395911618818578, "flos": 18597454081920.0, "grad_norm": 1.7638654148147994, "language_loss": 0.787718, "learning_rate": 1.2145741354013143e-06, "loss": 0.81237477, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20080566, "step": 10638, "time_per_iteration": 4.283633708953857 }, { "auxiliary_loss_clip": 0.01419982, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.25548196, "balance_loss_mlp": 1.01789415, "epoch": 0.6396512851345257, "flos": 28378361036160.0, "grad_norm": 1.5567233425345377, "language_loss": 0.82814515, "learning_rate": 1.2142159775339478e-06, "loss": 0.85272509, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20117188, "step": 10639, "time_per_iteration": 2.9391729831695557 }, { "auxiliary_loss_clip": 0.01188109, "auxiliary_loss_mlp": 0.0101556, "balance_loss_clip": 1.09906125, "balance_loss_mlp": 0.99543744, "epoch": 0.6397114083871938, "flos": 70755995957760.0, "grad_norm": 0.8322793018720489, "language_loss": 0.59084594, "learning_rate": 1.21385784946359e-06, "loss": 0.61288267, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20117188, "step": 10640, "time_per_iteration": 3.330916166305542 }, { "auxiliary_loss_clip": 0.01411993, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 1.25044847, "balance_loss_mlp": 1.01173711, "epoch": 0.6397715316398617, "flos": 18149610620160.0, "grad_norm": 1.8221305477584664, "language_loss": 0.79428267, "learning_rate": 1.2134997512038215e-06, "loss": 0.81870389, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18395996, "step": 10641, "time_per_iteration": 2.8732833862304688 }, { "auxiliary_loss_clip": 0.0143476, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.26246238, "balance_loss_mlp": 1.01316845, "epoch": 0.6398316548925297, "flos": 25750616198400.0, "grad_norm": 1.6235313928436281, "language_loss": 0.64310747, "learning_rate": 1.2131416827682209e-06, "loss": 0.66778135, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 1.72265625, "router_z_loss_mlp": 0.19458008, "step": 10642, "time_per_iteration": 2.961947202682495 }, { "auxiliary_loss_clip": 0.01185897, "auxiliary_loss_mlp": 0.01032676, "balance_loss_clip": 1.09587097, "balance_loss_mlp": 1.0136981, "epoch": 0.6398917781451977, "flos": 71240632721280.0, "grad_norm": 1.1528374812429372, "language_loss": 0.56034714, "learning_rate": 1.2127836441703667e-06, "loss": 0.58253288, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.18945312, "step": 10643, "time_per_iteration": 3.2471749782562256 }, { "auxiliary_loss_clip": 0.01445083, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.2747376, "balance_loss_mlp": 1.01310587, "epoch": 0.6399519013978656, "flos": 20531530604160.0, "grad_norm": 1.7599455108568705, "language_loss": 0.77586806, "learning_rate": 1.2124256354238358e-06, "loss": 0.80063701, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.18713379, "step": 10644, "time_per_iteration": 2.9127166271209717 }, { "auxiliary_loss_clip": 0.01424452, "auxiliary_loss_mlp": 0.01038549, "balance_loss_clip": 1.26067805, "balance_loss_mlp": 1.01792586, "epoch": 0.6400120246505336, "flos": 24471179712000.0, "grad_norm": 1.4673891422580556, "language_loss": 0.82872653, "learning_rate": 1.212067656542203e-06, "loss": 0.85335654, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.2064209, "step": 10645, "time_per_iteration": 2.9292383193969727 }, { "auxiliary_loss_clip": 0.0142456, "auxiliary_loss_mlp": 0.01038865, "balance_loss_clip": 1.25484347, "balance_loss_mlp": 1.01821744, "epoch": 0.6400721479032015, "flos": 28377772853760.0, "grad_norm": 2.5186558208453578, "language_loss": 0.74340606, "learning_rate": 1.2117097075390447e-06, "loss": 0.7680403, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.2064209, "step": 10646, "time_per_iteration": 2.918893337249756 }, { "auxiliary_loss_clip": 0.0142159, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.256845, "balance_loss_mlp": 1.01753163, "epoch": 0.6401322711558696, "flos": 17824145639040.0, "grad_norm": 2.2304574692842, "language_loss": 0.80047709, "learning_rate": 1.2113517884279327e-06, "loss": 0.82506406, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19567871, "step": 10647, "time_per_iteration": 3.002793550491333 }, { "auxiliary_loss_clip": 0.01425092, "auxiliary_loss_mlp": 0.01034311, "balance_loss_clip": 1.26329124, "balance_loss_mlp": 1.01545215, "epoch": 0.6401923944085375, "flos": 26041894093440.0, "grad_norm": 1.6022407143921893, "language_loss": 0.76441002, "learning_rate": 1.2109938992224399e-06, "loss": 0.78900409, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18859863, "step": 10648, "time_per_iteration": 2.9292759895324707 }, { "auxiliary_loss_clip": 0.01416814, "auxiliary_loss_mlp": 0.01033693, "balance_loss_clip": 1.25257564, "balance_loss_mlp": 1.01485825, "epoch": 0.6402525176612055, "flos": 23596893578880.0, "grad_norm": 3.0806887890717416, "language_loss": 0.79655683, "learning_rate": 1.210636039936138e-06, "loss": 0.82106185, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18835449, "step": 10649, "time_per_iteration": 4.324416637420654 }, { "auxiliary_loss_clip": 0.0142662, "auxiliary_loss_mlp": 0.01042193, "balance_loss_clip": 1.2605722, "balance_loss_mlp": 1.02213025, "epoch": 0.6403126409138734, "flos": 18050623701120.0, "grad_norm": 1.7752478916176784, "language_loss": 0.75973666, "learning_rate": 1.2102782105825956e-06, "loss": 0.78442478, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20043945, "step": 10650, "time_per_iteration": 2.828914165496826 }, { "auxiliary_loss_clip": 0.01412915, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.25132704, "balance_loss_mlp": 1.01294398, "epoch": 0.6403727641665414, "flos": 21989096444160.0, "grad_norm": 1.421939643236227, "language_loss": 0.71454763, "learning_rate": 1.2099204111753833e-06, "loss": 0.739003, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19677734, "step": 10651, "time_per_iteration": 2.853471279144287 }, { "auxiliary_loss_clip": 0.0142193, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.25642169, "balance_loss_mlp": 1.01954782, "epoch": 0.6404328874192093, "flos": 24905178264960.0, "grad_norm": 2.3841513333747595, "language_loss": 0.64629447, "learning_rate": 1.2095626417280684e-06, "loss": 0.67090356, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19421387, "step": 10652, "time_per_iteration": 5.7885777950286865 }, { "auxiliary_loss_clip": 0.01428772, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 1.26294708, "balance_loss_mlp": 1.01513839, "epoch": 0.6404930106718774, "flos": 17604635276160.0, "grad_norm": 1.9232193597009615, "language_loss": 0.80037957, "learning_rate": 1.2092049022542168e-06, "loss": 0.82500798, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18945312, "step": 10653, "time_per_iteration": 2.847811460494995 }, { "auxiliary_loss_clip": 0.01457545, "auxiliary_loss_mlp": 0.01040116, "balance_loss_clip": 1.28304648, "balance_loss_mlp": 1.01985049, "epoch": 0.6405531339245453, "flos": 20167716015360.0, "grad_norm": 2.2960601970604086, "language_loss": 0.72050655, "learning_rate": 1.2088471927673952e-06, "loss": 0.74548328, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.20263672, "step": 10654, "time_per_iteration": 2.8639028072357178 }, { "auxiliary_loss_clip": 0.01440767, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.26968098, "balance_loss_mlp": 1.01698852, "epoch": 0.6406132571772133, "flos": 21951696977280.0, "grad_norm": 1.7498641606610295, "language_loss": 0.73086828, "learning_rate": 1.2084895132811666e-06, "loss": 0.75565159, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.20556641, "step": 10655, "time_per_iteration": 2.8841168880462646 }, { "auxiliary_loss_clip": 0.01422337, "auxiliary_loss_mlp": 0.01037241, "balance_loss_clip": 1.25652099, "balance_loss_mlp": 1.01732111, "epoch": 0.6406733804298813, "flos": 28779937050240.0, "grad_norm": 1.500264076200686, "language_loss": 0.83653414, "learning_rate": 1.2081318638090952e-06, "loss": 0.86112994, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19909668, "step": 10656, "time_per_iteration": 2.934028148651123 }, { "auxiliary_loss_clip": 0.01428696, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.26124632, "balance_loss_mlp": 1.0196991, "epoch": 0.6407335036825492, "flos": 17466122384640.0, "grad_norm": 3.7321961948797844, "language_loss": 0.73102641, "learning_rate": 1.2077742443647433e-06, "loss": 0.75571024, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19995117, "step": 10657, "time_per_iteration": 2.8729894161224365 }, { "auxiliary_loss_clip": 0.01433272, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.26715958, "balance_loss_mlp": 1.01986885, "epoch": 0.6407936269352172, "flos": 22134893748480.0, "grad_norm": 1.6013287787312804, "language_loss": 0.77738333, "learning_rate": 1.2074166549616707e-06, "loss": 0.80210936, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19470215, "step": 10658, "time_per_iteration": 2.8620684146881104 }, { "auxiliary_loss_clip": 0.01428599, "auxiliary_loss_mlp": 0.01035545, "balance_loss_clip": 1.26034474, "balance_loss_mlp": 1.01528001, "epoch": 0.6408537501878852, "flos": 23120563875840.0, "grad_norm": 1.6232958390410503, "language_loss": 0.76657844, "learning_rate": 1.2070590956134386e-06, "loss": 0.79121989, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20288086, "step": 10659, "time_per_iteration": 2.890638828277588 }, { "auxiliary_loss_clip": 0.01428027, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.26003706, "balance_loss_mlp": 1.01661825, "epoch": 0.6409138734405532, "flos": 16481221418880.0, "grad_norm": 2.1044077916865067, "language_loss": 0.78435314, "learning_rate": 1.2067015663336046e-06, "loss": 0.80899191, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19226074, "step": 10660, "time_per_iteration": 2.8354198932647705 }, { "auxiliary_loss_clip": 0.01447006, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.27504802, "balance_loss_mlp": 1.01657939, "epoch": 0.6409739966932211, "flos": 22786638117120.0, "grad_norm": 1.8987962621829833, "language_loss": 0.69675279, "learning_rate": 1.206344067135727e-06, "loss": 0.72158909, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.20043945, "step": 10661, "time_per_iteration": 2.8755786418914795 }, { "auxiliary_loss_clip": 0.01417303, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.25559759, "balance_loss_mlp": 1.01140189, "epoch": 0.6410341199458891, "flos": 25162042849920.0, "grad_norm": 1.5763908432023974, "language_loss": 0.76929688, "learning_rate": 1.205986598033362e-06, "loss": 0.79376435, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18041992, "step": 10662, "time_per_iteration": 2.913311243057251 }, { "auxiliary_loss_clip": 0.01428486, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.26235533, "balance_loss_mlp": 1.01279962, "epoch": 0.641094243198557, "flos": 27055869482880.0, "grad_norm": 2.5191459750923406, "language_loss": 0.69972992, "learning_rate": 1.2056291590400644e-06, "loss": 0.72433501, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19238281, "step": 10663, "time_per_iteration": 2.9283523559570312 }, { "auxiliary_loss_clip": 0.0142447, "auxiliary_loss_mlp": 0.0103591, "balance_loss_clip": 1.25887358, "balance_loss_mlp": 1.01631224, "epoch": 0.641154366451225, "flos": 25385534755200.0, "grad_norm": 2.2262658125064956, "language_loss": 0.68544781, "learning_rate": 1.205271750169389e-06, "loss": 0.71005166, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19604492, "step": 10664, "time_per_iteration": 2.9669623374938965 }, { "auxiliary_loss_clip": 0.01427165, "auxiliary_loss_mlp": 0.01034329, "balance_loss_clip": 1.26291978, "balance_loss_mlp": 1.01568484, "epoch": 0.6412144897038929, "flos": 25163671662720.0, "grad_norm": 2.0391274395868626, "language_loss": 0.66914469, "learning_rate": 1.2049143714348881e-06, "loss": 0.69375968, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18652344, "step": 10665, "time_per_iteration": 2.906723976135254 }, { "auxiliary_loss_clip": 0.01419699, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.25699663, "balance_loss_mlp": 1.01418304, "epoch": 0.641274612956561, "flos": 23451232008960.0, "grad_norm": 1.6841320535206803, "language_loss": 0.64887869, "learning_rate": 1.2045570228501145e-06, "loss": 0.6734035, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18615723, "step": 10666, "time_per_iteration": 2.9523122310638428 }, { "auxiliary_loss_clip": 0.01428809, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.26217437, "balance_loss_mlp": 1.01387572, "epoch": 0.6413347362092289, "flos": 19436964946560.0, "grad_norm": 1.9525967246029787, "language_loss": 0.71790564, "learning_rate": 1.2041997044286176e-06, "loss": 0.74252433, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19189453, "step": 10667, "time_per_iteration": 2.8985817432403564 }, { "auxiliary_loss_clip": 0.01466095, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.2880882, "balance_loss_mlp": 1.01692605, "epoch": 0.6413948594618969, "flos": 17204099892480.0, "grad_norm": 7.531950515906734, "language_loss": 0.78758931, "learning_rate": 1.2038424161839484e-06, "loss": 0.81262624, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 1.78027344, "router_z_loss_mlp": 0.20666504, "step": 10668, "time_per_iteration": 2.9091477394104004 }, { "auxiliary_loss_clip": 0.01431334, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.26624274, "balance_loss_mlp": 1.0141952, "epoch": 0.6414549827145648, "flos": 22279062240000.0, "grad_norm": 2.152229645046003, "language_loss": 0.68659908, "learning_rate": 1.2034851581296544e-06, "loss": 0.71125817, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20397949, "step": 10669, "time_per_iteration": 2.916707754135132 }, { "auxiliary_loss_clip": 0.01452395, "auxiliary_loss_mlp": 0.01036864, "balance_loss_clip": 1.27983439, "balance_loss_mlp": 1.01690793, "epoch": 0.6415151059672328, "flos": 19648105021440.0, "grad_norm": 1.8464743520543008, "language_loss": 0.79502165, "learning_rate": 1.2031279302792825e-06, "loss": 0.81991428, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.19970703, "step": 10670, "time_per_iteration": 2.852468729019165 }, { "auxiliary_loss_clip": 0.01437834, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.26782632, "balance_loss_mlp": 1.0186193, "epoch": 0.6415752292199008, "flos": 14873876732160.0, "grad_norm": 2.2207479308350093, "language_loss": 0.89163488, "learning_rate": 1.20277073264638e-06, "loss": 0.91640091, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20153809, "step": 10671, "time_per_iteration": 2.8192851543426514 }, { "auxiliary_loss_clip": 0.01420009, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.25844097, "balance_loss_mlp": 1.01381326, "epoch": 0.6416353524725688, "flos": 13743540420480.0, "grad_norm": 1.9193543933643726, "language_loss": 0.70426619, "learning_rate": 1.2024135652444907e-06, "loss": 0.72879136, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18701172, "step": 10672, "time_per_iteration": 2.849375009536743 }, { "auxiliary_loss_clip": 0.0144819, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.27579701, "balance_loss_mlp": 1.01340795, "epoch": 0.6416954757252368, "flos": 24545571442560.0, "grad_norm": 2.0584038140715255, "language_loss": 0.75240296, "learning_rate": 1.2020564280871593e-06, "loss": 0.77722096, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 1.72363281, "router_z_loss_mlp": 0.20214844, "step": 10673, "time_per_iteration": 4.343919038772583 }, { "auxiliary_loss_clip": 0.01437586, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.27070141, "balance_loss_mlp": 1.01630449, "epoch": 0.6417555989779047, "flos": 27721684984320.0, "grad_norm": 9.377517444487138, "language_loss": 0.70192856, "learning_rate": 1.2016993211879283e-06, "loss": 0.72667408, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20654297, "step": 10674, "time_per_iteration": 2.953443765640259 }, { "auxiliary_loss_clip": 0.01452658, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.2810154, "balance_loss_mlp": 1.01340759, "epoch": 0.6418157222305727, "flos": 20565898669440.0, "grad_norm": 1.8069554665396148, "language_loss": 0.67641377, "learning_rate": 1.201342244560338e-06, "loss": 0.70127201, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.1973877, "step": 10675, "time_per_iteration": 2.8756954669952393 }, { "auxiliary_loss_clip": 0.01434542, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.26970649, "balance_loss_mlp": 1.01412261, "epoch": 0.6418758454832406, "flos": 22611766389120.0, "grad_norm": 1.8935539110079342, "language_loss": 0.67515373, "learning_rate": 1.2009851982179307e-06, "loss": 0.69984519, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20483398, "step": 10676, "time_per_iteration": 2.8884360790252686 }, { "auxiliary_loss_clip": 0.01423655, "auxiliary_loss_mlp": 0.01033356, "balance_loss_clip": 1.25718617, "balance_loss_mlp": 1.0121007, "epoch": 0.6419359687359086, "flos": 27384999292800.0, "grad_norm": 1.7809758788584291, "language_loss": 0.76946425, "learning_rate": 1.2006281821742446e-06, "loss": 0.79403442, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21264648, "step": 10677, "time_per_iteration": 2.935551404953003 }, { "auxiliary_loss_clip": 0.01190475, "auxiliary_loss_mlp": 0.01023203, "balance_loss_clip": 1.10204387, "balance_loss_mlp": 1.00708568, "epoch": 0.6419960919885765, "flos": 67281093884160.0, "grad_norm": 0.7665629148441414, "language_loss": 0.60796797, "learning_rate": 1.200271196442818e-06, "loss": 0.63010478, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16113281, "step": 10678, "time_per_iteration": 3.4241528511047363 }, { "auxiliary_loss_clip": 0.01415066, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.25327182, "balance_loss_mlp": 1.01425266, "epoch": 0.6420562152412446, "flos": 19911484857600.0, "grad_norm": 1.6971540601049537, "language_loss": 0.6813162, "learning_rate": 1.1999142410371875e-06, "loss": 0.70579916, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18969727, "step": 10679, "time_per_iteration": 2.897221326828003 }, { "auxiliary_loss_clip": 0.0144488, "auxiliary_loss_mlp": 0.01039037, "balance_loss_clip": 1.27552247, "balance_loss_mlp": 1.01837778, "epoch": 0.6421163384939125, "flos": 24800852459520.0, "grad_norm": 2.376874515234927, "language_loss": 0.7344048, "learning_rate": 1.1995573159708897e-06, "loss": 0.75924397, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20666504, "step": 10680, "time_per_iteration": 2.9345006942749023 }, { "auxiliary_loss_clip": 0.01429974, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.26430452, "balance_loss_mlp": 1.01504111, "epoch": 0.6421764617465805, "flos": 25603687774080.0, "grad_norm": 1.8341236258258593, "language_loss": 0.6937654, "learning_rate": 1.1992004212574582e-06, "loss": 0.71839756, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18212891, "step": 10681, "time_per_iteration": 2.9172184467315674 }, { "auxiliary_loss_clip": 0.01421641, "auxiliary_loss_mlp": 0.01032813, "balance_loss_clip": 1.25648963, "balance_loss_mlp": 1.01335835, "epoch": 0.6422365849992484, "flos": 14142220767360.0, "grad_norm": 1.872058628280906, "language_loss": 0.75711232, "learning_rate": 1.198843556910427e-06, "loss": 0.78165686, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19458008, "step": 10682, "time_per_iteration": 2.837601900100708 }, { "auxiliary_loss_clip": 0.01406371, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.24697948, "balance_loss_mlp": 1.01278877, "epoch": 0.6422967082519164, "flos": 22394427776640.0, "grad_norm": 1.5111909971224204, "language_loss": 0.79768056, "learning_rate": 1.1984867229433287e-06, "loss": 0.82205731, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18530273, "step": 10683, "time_per_iteration": 2.9045791625976562 }, { "auxiliary_loss_clip": 0.01430182, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.26372552, "balance_loss_mlp": 1.01296806, "epoch": 0.6423568315045844, "flos": 14656176161280.0, "grad_norm": 1.8447997025607277, "language_loss": 0.68560451, "learning_rate": 1.1981299193696941e-06, "loss": 0.71024299, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20678711, "step": 10684, "time_per_iteration": 4.350924015045166 }, { "auxiliary_loss_clip": 0.01427995, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.26060963, "balance_loss_mlp": 1.01253033, "epoch": 0.6424169547572524, "flos": 26845272345600.0, "grad_norm": 1.8508719947285983, "language_loss": 0.72685158, "learning_rate": 1.1977731462030533e-06, "loss": 0.75144541, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.18859863, "step": 10685, "time_per_iteration": 2.9081976413726807 }, { "auxiliary_loss_clip": 0.0142107, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.25874758, "balance_loss_mlp": 1.01665998, "epoch": 0.6424770780099204, "flos": 22716770866560.0, "grad_norm": 1.4569821208569986, "language_loss": 0.75700456, "learning_rate": 1.197416403456935e-06, "loss": 0.78156984, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18786621, "step": 10686, "time_per_iteration": 2.8941800594329834 }, { "auxiliary_loss_clip": 0.01432115, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.26200795, "balance_loss_mlp": 1.01632726, "epoch": 0.6425372012625883, "flos": 28479655440000.0, "grad_norm": 2.6517935294205337, "language_loss": 0.69253433, "learning_rate": 1.197059691144867e-06, "loss": 0.7172184, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.19958496, "step": 10687, "time_per_iteration": 4.522422790527344 }, { "auxiliary_loss_clip": 0.01441026, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.27206433, "balance_loss_mlp": 1.01741958, "epoch": 0.6425973245152563, "flos": 29363759694720.0, "grad_norm": 1.7661022084918028, "language_loss": 0.6712476, "learning_rate": 1.1967030092803767e-06, "loss": 0.69602895, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19689941, "step": 10688, "time_per_iteration": 2.932551145553589 }, { "auxiliary_loss_clip": 0.0142935, "auxiliary_loss_mlp": 0.01035752, "balance_loss_clip": 1.26289177, "balance_loss_mlp": 1.0167501, "epoch": 0.6426574477679242, "flos": 16437713904000.0, "grad_norm": 1.7150861968859532, "language_loss": 0.73864454, "learning_rate": 1.1963463578769876e-06, "loss": 0.76329553, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18994141, "step": 10689, "time_per_iteration": 3.020660400390625 }, { "auxiliary_loss_clip": 0.01410759, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.24812031, "balance_loss_mlp": 1.01163769, "epoch": 0.6427175710205922, "flos": 21846104317440.0, "grad_norm": 2.1334692388235137, "language_loss": 0.73208773, "learning_rate": 1.195989736948226e-06, "loss": 0.75649667, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18505859, "step": 10690, "time_per_iteration": 2.899162530899048 }, { "auxiliary_loss_clip": 0.01423285, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.25997949, "balance_loss_mlp": 1.01206636, "epoch": 0.6427776942732601, "flos": 17795930866560.0, "grad_norm": 1.8397615616458913, "language_loss": 0.78285342, "learning_rate": 1.1956331465076143e-06, "loss": 0.80740619, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19946289, "step": 10691, "time_per_iteration": 2.887563943862915 }, { "auxiliary_loss_clip": 0.01433303, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.26460516, "balance_loss_mlp": 1.01540029, "epoch": 0.6428378175259282, "flos": 15094291991040.0, "grad_norm": 1.645920146953257, "language_loss": 0.75615132, "learning_rate": 1.1952765865686738e-06, "loss": 0.78083098, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19262695, "step": 10692, "time_per_iteration": 2.852415084838867 }, { "auxiliary_loss_clip": 0.01429112, "auxiliary_loss_mlp": 0.01038675, "balance_loss_clip": 1.26243806, "balance_loss_mlp": 1.01873112, "epoch": 0.6428979407785961, "flos": 23852265085440.0, "grad_norm": 1.808450138857936, "language_loss": 0.62432265, "learning_rate": 1.1949200571449263e-06, "loss": 0.64900053, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19946289, "step": 10693, "time_per_iteration": 2.888190746307373 }, { "auxiliary_loss_clip": 0.01430688, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.26105404, "balance_loss_mlp": 1.01487494, "epoch": 0.6429580640312641, "flos": 32939729948160.0, "grad_norm": 1.9219967714160997, "language_loss": 0.61382437, "learning_rate": 1.1945635582498903e-06, "loss": 0.63848358, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20361328, "step": 10694, "time_per_iteration": 3.0111265182495117 }, { "auxiliary_loss_clip": 0.01426918, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.26052904, "balance_loss_mlp": 1.01907802, "epoch": 0.643018187283932, "flos": 21077999026560.0, "grad_norm": 1.3842545962455544, "language_loss": 0.80352914, "learning_rate": 1.1942070898970853e-06, "loss": 0.82817996, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1907959, "step": 10695, "time_per_iteration": 2.8511664867401123 }, { "auxiliary_loss_clip": 0.01417617, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.25081968, "balance_loss_mlp": 1.01497746, "epoch": 0.6430783105366, "flos": 26736014856960.0, "grad_norm": 1.6600577459499624, "language_loss": 0.74419242, "learning_rate": 1.1938506521000285e-06, "loss": 0.768713, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19445801, "step": 10696, "time_per_iteration": 2.917100191116333 }, { "auxiliary_loss_clip": 0.01422182, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.25923085, "balance_loss_mlp": 1.01006281, "epoch": 0.643138433789268, "flos": 23707553656320.0, "grad_norm": 1.68381558531295, "language_loss": 0.75997633, "learning_rate": 1.1934942448722347e-06, "loss": 0.78449583, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19714355, "step": 10697, "time_per_iteration": 2.886347770690918 }, { "auxiliary_loss_clip": 0.0142138, "auxiliary_loss_mlp": 0.01035929, "balance_loss_clip": 1.25947118, "balance_loss_mlp": 1.01698661, "epoch": 0.643198557041936, "flos": 34214958668160.0, "grad_norm": 1.9410327597725991, "language_loss": 0.66734207, "learning_rate": 1.1931378682272208e-06, "loss": 0.69191515, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18945312, "step": 10698, "time_per_iteration": 2.9949021339416504 }, { "auxiliary_loss_clip": 0.01195049, "auxiliary_loss_mlp": 0.01014534, "balance_loss_clip": 1.10547543, "balance_loss_mlp": 1.00080156, "epoch": 0.643258680294604, "flos": 67658255688960.0, "grad_norm": 0.8591675424222531, "language_loss": 0.63507801, "learning_rate": 1.1927815221784996e-06, "loss": 0.65717387, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.13769531, "step": 10699, "time_per_iteration": 3.35263729095459 }, { "auxiliary_loss_clip": 0.01411584, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.25230908, "balance_loss_mlp": 1.01202381, "epoch": 0.6433188035472719, "flos": 25195325040000.0, "grad_norm": 1.6588616924309962, "language_loss": 0.69976437, "learning_rate": 1.1924252067395838e-06, "loss": 0.72418404, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18359375, "step": 10700, "time_per_iteration": 2.899168014526367 }, { "auxiliary_loss_clip": 0.01422034, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.2558794, "balance_loss_mlp": 1.01117396, "epoch": 0.6433789267999399, "flos": 24984546923520.0, "grad_norm": 1.8658578572892917, "language_loss": 0.73898029, "learning_rate": 1.1920689219239855e-06, "loss": 0.76351202, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19958496, "step": 10701, "time_per_iteration": 2.941054105758667 }, { "auxiliary_loss_clip": 0.01444837, "auxiliary_loss_mlp": 0.01035008, "balance_loss_clip": 1.27330661, "balance_loss_mlp": 1.01436067, "epoch": 0.6434390500526078, "flos": 17574836935680.0, "grad_norm": 1.8696570200189369, "language_loss": 0.82702291, "learning_rate": 1.1917126677452144e-06, "loss": 0.85182142, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.20629883, "step": 10702, "time_per_iteration": 2.8928802013397217 }, { "auxiliary_loss_clip": 0.01419668, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.25590801, "balance_loss_mlp": 1.02037311, "epoch": 0.6434991733052758, "flos": 20851656698880.0, "grad_norm": 1.805814768839921, "language_loss": 0.75468886, "learning_rate": 1.1913564442167798e-06, "loss": 0.77927643, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18701172, "step": 10703, "time_per_iteration": 2.8814473152160645 }, { "auxiliary_loss_clip": 0.01196709, "auxiliary_loss_mlp": 0.01022506, "balance_loss_clip": 1.10511088, "balance_loss_mlp": 1.00448132, "epoch": 0.6435592965579437, "flos": 66126117139200.0, "grad_norm": 0.6623677001978532, "language_loss": 0.54623955, "learning_rate": 1.1910002513521898e-06, "loss": 0.56843168, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.18066406, "step": 10704, "time_per_iteration": 3.375680685043335 }, { "auxiliary_loss_clip": 0.01428477, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.26223564, "balance_loss_mlp": 1.01463437, "epoch": 0.6436194198106118, "flos": 23779321188480.0, "grad_norm": 1.593680684976102, "language_loss": 0.77865839, "learning_rate": 1.1906440891649519e-06, "loss": 0.80327046, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.18103027, "step": 10705, "time_per_iteration": 2.9029922485351562 }, { "auxiliary_loss_clip": 0.01427463, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.26184249, "balance_loss_mlp": 1.01907718, "epoch": 0.6436795430632797, "flos": 20240026485120.0, "grad_norm": 1.6576529138690521, "language_loss": 0.7986517, "learning_rate": 1.1902879576685708e-06, "loss": 0.82330275, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.18566895, "step": 10706, "time_per_iteration": 2.93033766746521 }, { "auxiliary_loss_clip": 0.01419598, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.25433564, "balance_loss_mlp": 1.01283264, "epoch": 0.6437396663159477, "flos": 20311205834880.0, "grad_norm": 2.5579114984113116, "language_loss": 0.80862391, "learning_rate": 1.1899318568765518e-06, "loss": 0.83314812, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19995117, "step": 10707, "time_per_iteration": 2.9159953594207764 }, { "auxiliary_loss_clip": 0.01423327, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.25796103, "balance_loss_mlp": 1.01460862, "epoch": 0.6437997895686156, "flos": 23889166859520.0, "grad_norm": 1.7545105355431025, "language_loss": 0.86014867, "learning_rate": 1.1895757868023978e-06, "loss": 0.88471782, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18969727, "step": 10708, "time_per_iteration": 4.296573877334595 }, { "auxiliary_loss_clip": 0.01458293, "auxiliary_loss_mlp": 0.01038439, "balance_loss_clip": 1.28401041, "balance_loss_mlp": 1.01814938, "epoch": 0.6438599128212836, "flos": 18998532403200.0, "grad_norm": 2.1707738992867793, "language_loss": 0.66716301, "learning_rate": 1.1892197474596106e-06, "loss": 0.69213033, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 1.7421875, "router_z_loss_mlp": 0.20288086, "step": 10709, "time_per_iteration": 2.877901077270508 }, { "auxiliary_loss_clip": 0.01425767, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.26174247, "balance_loss_mlp": 1.01622844, "epoch": 0.6439200360739517, "flos": 24106686451200.0, "grad_norm": 1.8261284951769534, "language_loss": 0.81272042, "learning_rate": 1.1888637388616929e-06, "loss": 0.83732778, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18737793, "step": 10710, "time_per_iteration": 2.8937854766845703 }, { "auxiliary_loss_clip": 0.01422719, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.25771606, "balance_loss_mlp": 1.01266146, "epoch": 0.6439801593266196, "flos": 31913945665920.0, "grad_norm": 1.7614913686423783, "language_loss": 0.67037034, "learning_rate": 1.1885077610221425e-06, "loss": 0.69491571, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19165039, "step": 10711, "time_per_iteration": 2.946032762527466 }, { "auxiliary_loss_clip": 0.01433548, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.26506233, "balance_loss_mlp": 1.01605248, "epoch": 0.6440402825792876, "flos": 27137319402240.0, "grad_norm": 1.5955118602219833, "language_loss": 0.79086792, "learning_rate": 1.1881518139544597e-06, "loss": 0.81555873, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19482422, "step": 10712, "time_per_iteration": 2.906083345413208 }, { "auxiliary_loss_clip": 0.01429817, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.26107526, "balance_loss_mlp": 1.01736307, "epoch": 0.6441004058319555, "flos": 20677418398080.0, "grad_norm": 1.5893081109515568, "language_loss": 0.8354193, "learning_rate": 1.1877958976721417e-06, "loss": 0.86009252, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20141602, "step": 10713, "time_per_iteration": 2.853761672973633 }, { "auxiliary_loss_clip": 0.01405005, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.24518359, "balance_loss_mlp": 1.01385951, "epoch": 0.6441605290846235, "flos": 26035786045440.0, "grad_norm": 1.395930042385437, "language_loss": 0.79337871, "learning_rate": 1.187440012188684e-06, "loss": 0.81775886, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19152832, "step": 10714, "time_per_iteration": 2.9263272285461426 }, { "auxiliary_loss_clip": 0.01431247, "auxiliary_loss_mlp": 0.01035302, "balance_loss_clip": 1.26587379, "balance_loss_mlp": 1.01593053, "epoch": 0.6442206523372914, "flos": 24910155192960.0, "grad_norm": 1.4347535334971606, "language_loss": 0.82028353, "learning_rate": 1.187084157517583e-06, "loss": 0.84494901, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19372559, "step": 10715, "time_per_iteration": 2.950824499130249 }, { "auxiliary_loss_clip": 0.01431827, "auxiliary_loss_mlp": 0.01038797, "balance_loss_clip": 1.26330137, "balance_loss_mlp": 1.01797116, "epoch": 0.6442807755899594, "flos": 25167426981120.0, "grad_norm": 1.8577050634505778, "language_loss": 0.81449395, "learning_rate": 1.186728333672332e-06, "loss": 0.83920026, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20825195, "step": 10716, "time_per_iteration": 2.938124656677246 }, { "auxiliary_loss_clip": 0.01441731, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.2708056, "balance_loss_mlp": 1.01309657, "epoch": 0.6443408988426274, "flos": 27355924869120.0, "grad_norm": 1.6895320121029476, "language_loss": 0.7835021, "learning_rate": 1.186372540666424e-06, "loss": 0.8082391, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.18859863, "step": 10717, "time_per_iteration": 2.9358303546905518 }, { "auxiliary_loss_clip": 0.01402607, "auxiliary_loss_mlp": 0.01033678, "balance_loss_clip": 1.24292934, "balance_loss_mlp": 1.01447296, "epoch": 0.6444010220952954, "flos": 27939928492800.0, "grad_norm": 1.7225602624529082, "language_loss": 0.69111943, "learning_rate": 1.1860167785133513e-06, "loss": 0.71548223, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19189453, "step": 10718, "time_per_iteration": 4.376666784286499 }, { "auxiliary_loss_clip": 0.01195843, "auxiliary_loss_mlp": 0.01012771, "balance_loss_clip": 1.10432959, "balance_loss_mlp": 0.99560511, "epoch": 0.6444611453479633, "flos": 71241628106880.0, "grad_norm": 0.7731634279231949, "language_loss": 0.49737352, "learning_rate": 1.185661047226603e-06, "loss": 0.51945966, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.171875, "step": 10719, "time_per_iteration": 3.5296919345855713 }, { "auxiliary_loss_clip": 0.01434392, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.26548624, "balance_loss_mlp": 1.02007937, "epoch": 0.6445212686006313, "flos": 22713875199360.0, "grad_norm": 1.8037411369799676, "language_loss": 0.79063082, "learning_rate": 1.18530534681967e-06, "loss": 0.81538117, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20581055, "step": 10720, "time_per_iteration": 2.926892042160034 }, { "auxiliary_loss_clip": 0.01429791, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.26546133, "balance_loss_mlp": 1.01744413, "epoch": 0.6445813918532992, "flos": 21188749593600.0, "grad_norm": 4.4375291925736144, "language_loss": 0.7745651, "learning_rate": 1.18494967730604e-06, "loss": 0.79923159, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1940918, "step": 10721, "time_per_iteration": 2.908325672149658 }, { "auxiliary_loss_clip": 0.0142525, "auxiliary_loss_mlp": 0.01034299, "balance_loss_clip": 1.25805497, "balance_loss_mlp": 1.01443911, "epoch": 0.6446415151059672, "flos": 25202790432000.0, "grad_norm": 2.3882175405704347, "language_loss": 0.73944461, "learning_rate": 1.1845940386991995e-06, "loss": 0.76404011, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19836426, "step": 10722, "time_per_iteration": 5.807856321334839 }, { "auxiliary_loss_clip": 0.01437298, "auxiliary_loss_mlp": 0.01038538, "balance_loss_clip": 1.27203536, "balance_loss_mlp": 1.01938152, "epoch": 0.6447016383586353, "flos": 25313360019840.0, "grad_norm": 1.6328168014541293, "language_loss": 0.78843397, "learning_rate": 1.184238431012635e-06, "loss": 0.81319237, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19152832, "step": 10723, "time_per_iteration": 2.93548583984375 }, { "auxiliary_loss_clip": 0.01441927, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.271227, "balance_loss_mlp": 1.01922166, "epoch": 0.6447617616113032, "flos": 27713043227520.0, "grad_norm": 1.5689093655521364, "language_loss": 0.59007174, "learning_rate": 1.1838828542598312e-06, "loss": 0.61488754, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.2043457, "step": 10724, "time_per_iteration": 2.9897994995117188 }, { "auxiliary_loss_clip": 0.01418338, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.25674415, "balance_loss_mlp": 1.0196476, "epoch": 0.6448218848639712, "flos": 23049384526080.0, "grad_norm": 1.8718510971645606, "language_loss": 0.84589779, "learning_rate": 1.183527308454271e-06, "loss": 0.87046903, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19128418, "step": 10725, "time_per_iteration": 2.9101297855377197 }, { "auxiliary_loss_clip": 0.0141823, "auxiliary_loss_mlp": 0.01040073, "balance_loss_clip": 1.25401974, "balance_loss_mlp": 1.0193429, "epoch": 0.6448820081166391, "flos": 24506135959680.0, "grad_norm": 4.063805667903456, "language_loss": 0.82714474, "learning_rate": 1.1831717936094368e-06, "loss": 0.85172772, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20751953, "step": 10726, "time_per_iteration": 2.8951659202575684 }, { "auxiliary_loss_clip": 0.01444781, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.27376211, "balance_loss_mlp": 1.01783526, "epoch": 0.6449421313693071, "flos": 22429519758720.0, "grad_norm": 1.7920537206404201, "language_loss": 0.82170647, "learning_rate": 1.1828163097388108e-06, "loss": 0.84653473, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20214844, "step": 10727, "time_per_iteration": 2.8644561767578125 }, { "auxiliary_loss_clip": 0.01454479, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.2801398, "balance_loss_mlp": 1.02097654, "epoch": 0.645002254621975, "flos": 20234235150720.0, "grad_norm": 1.8068615813701856, "language_loss": 0.79684073, "learning_rate": 1.1824608568558717e-06, "loss": 0.82180417, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 1.74316406, "router_z_loss_mlp": 0.20861816, "step": 10728, "time_per_iteration": 2.8891477584838867 }, { "auxiliary_loss_clip": 0.01423743, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.25813699, "balance_loss_mlp": 1.01819372, "epoch": 0.645062377874643, "flos": 27867029840640.0, "grad_norm": 1.8647059992801507, "language_loss": 0.75230628, "learning_rate": 1.1821054349740988e-06, "loss": 0.77691901, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.1932373, "step": 10729, "time_per_iteration": 2.9148471355438232 }, { "auxiliary_loss_clip": 0.01432384, "auxiliary_loss_mlp": 0.01036966, "balance_loss_clip": 1.26527596, "balance_loss_mlp": 1.01746297, "epoch": 0.645122501127311, "flos": 25312455123840.0, "grad_norm": 1.5752941509913736, "language_loss": 0.66752511, "learning_rate": 1.1817500441069706e-06, "loss": 0.69221866, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19506836, "step": 10730, "time_per_iteration": 2.909738063812256 }, { "auxiliary_loss_clip": 0.01427763, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.26152372, "balance_loss_mlp": 1.01804757, "epoch": 0.645182624379979, "flos": 18816692976000.0, "grad_norm": 1.640571364440011, "language_loss": 0.65431768, "learning_rate": 1.1813946842679614e-06, "loss": 0.67897546, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19970703, "step": 10731, "time_per_iteration": 2.942887306213379 }, { "auxiliary_loss_clip": 0.01416444, "auxiliary_loss_mlp": 0.01038904, "balance_loss_clip": 1.2531743, "balance_loss_mlp": 1.01838851, "epoch": 0.6452427476326469, "flos": 18341177679360.0, "grad_norm": 1.6475484631221913, "language_loss": 0.68493098, "learning_rate": 1.1810393554705492e-06, "loss": 0.70948446, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.2052002, "step": 10732, "time_per_iteration": 2.902336597442627 }, { "auxiliary_loss_clip": 0.01415075, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.25189662, "balance_loss_mlp": 1.02002263, "epoch": 0.6453028708853149, "flos": 22795234629120.0, "grad_norm": 2.5434230556995656, "language_loss": 0.76631153, "learning_rate": 1.1806840577282055e-06, "loss": 0.79086018, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19763184, "step": 10733, "time_per_iteration": 2.91213321685791 }, { "auxiliary_loss_clip": 0.01462361, "auxiliary_loss_mlp": 0.01043164, "balance_loss_clip": 1.28967249, "balance_loss_mlp": 1.02184987, "epoch": 0.6453629941379828, "flos": 23955143057280.0, "grad_norm": 2.07320248012528, "language_loss": 0.68391204, "learning_rate": 1.1803287910544048e-06, "loss": 0.70896727, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 1.72753906, "router_z_loss_mlp": 0.21313477, "step": 10734, "time_per_iteration": 2.9263014793395996 }, { "auxiliary_loss_clip": 0.01412434, "auxiliary_loss_mlp": 0.01040651, "balance_loss_clip": 1.25304675, "balance_loss_mlp": 1.02077889, "epoch": 0.6454231173906508, "flos": 17685451768320.0, "grad_norm": 2.7099464956219537, "language_loss": 0.74485689, "learning_rate": 1.1799735554626191e-06, "loss": 0.76938778, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.1986084, "step": 10735, "time_per_iteration": 2.851369857788086 }, { "auxiliary_loss_clip": 0.01425506, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.25980115, "balance_loss_mlp": 1.01871943, "epoch": 0.6454832406433189, "flos": 23302674771840.0, "grad_norm": 1.8731594838543324, "language_loss": 0.75529397, "learning_rate": 1.1796183509663176e-06, "loss": 0.77993274, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.1965332, "step": 10736, "time_per_iteration": 2.92983078956604 }, { "auxiliary_loss_clip": 0.01459813, "auxiliary_loss_mlp": 0.01039134, "balance_loss_clip": 1.28721285, "balance_loss_mlp": 1.01911902, "epoch": 0.6455433638959868, "flos": 20166856364160.0, "grad_norm": 5.6473538090312205, "language_loss": 0.71657181, "learning_rate": 1.1792631775789708e-06, "loss": 0.74156123, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20019531, "step": 10737, "time_per_iteration": 2.8788201808929443 }, { "auxiliary_loss_clip": 0.01194785, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.10301113, "balance_loss_mlp": 1.01343429, "epoch": 0.6456034871486548, "flos": 66564685416960.0, "grad_norm": 0.7991951359951835, "language_loss": 0.58481252, "learning_rate": 1.1789080353140464e-06, "loss": 0.60705304, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.15820312, "step": 10738, "time_per_iteration": 3.4525833129882812 }, { "auxiliary_loss_clip": 0.01422634, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.25806427, "balance_loss_mlp": 1.01296854, "epoch": 0.6456636104013227, "flos": 24216532122240.0, "grad_norm": 1.6489125173554904, "language_loss": 0.75407732, "learning_rate": 1.1785529241850118e-06, "loss": 0.77863169, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19848633, "step": 10739, "time_per_iteration": 2.9349186420440674 }, { "auxiliary_loss_clip": 0.01440272, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.26998663, "balance_loss_mlp": 1.01453984, "epoch": 0.6457237336539907, "flos": 23634835983360.0, "grad_norm": 1.8355693402391666, "language_loss": 0.72498274, "learning_rate": 1.1781978442053324e-06, "loss": 0.74973106, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.20031738, "step": 10740, "time_per_iteration": 2.901472568511963 }, { "auxiliary_loss_clip": 0.01192819, "auxiliary_loss_mlp": 0.01019994, "balance_loss_clip": 1.10255015, "balance_loss_mlp": 1.00282812, "epoch": 0.6457838569066586, "flos": 65879251655040.0, "grad_norm": 0.6646151101911751, "language_loss": 0.55369961, "learning_rate": 1.1778427953884733e-06, "loss": 0.57582772, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.171875, "step": 10741, "time_per_iteration": 3.347278594970703 }, { "auxiliary_loss_clip": 0.01417762, "auxiliary_loss_mlp": 0.01036869, "balance_loss_clip": 1.25422955, "balance_loss_mlp": 1.01709199, "epoch": 0.6458439801593266, "flos": 22391984557440.0, "grad_norm": 1.6712121324950662, "language_loss": 0.80694562, "learning_rate": 1.1774877777478977e-06, "loss": 0.83149195, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19775391, "step": 10742, "time_per_iteration": 2.8826849460601807 }, { "auxiliary_loss_clip": 0.01410836, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.2504127, "balance_loss_mlp": 1.01286197, "epoch": 0.6459041034119946, "flos": 24799676094720.0, "grad_norm": 1.5408063950607875, "language_loss": 0.82519627, "learning_rate": 1.1771327912970678e-06, "loss": 0.84963703, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.20385742, "step": 10743, "time_per_iteration": 4.373667240142822 }, { "auxiliary_loss_clip": 0.01412838, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.25011587, "balance_loss_mlp": 1.01213253, "epoch": 0.6459642266646626, "flos": 18332581167360.0, "grad_norm": 3.0013242520569987, "language_loss": 0.72495508, "learning_rate": 1.1767778360494453e-06, "loss": 0.74940318, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19836426, "step": 10744, "time_per_iteration": 2.867992639541626 }, { "auxiliary_loss_clip": 0.01418198, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.25324368, "balance_loss_mlp": 1.01068163, "epoch": 0.6460243499173305, "flos": 43597185275520.0, "grad_norm": 1.879999344697192, "language_loss": 0.67918563, "learning_rate": 1.1764229120184896e-06, "loss": 0.70366502, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19055176, "step": 10745, "time_per_iteration": 3.0845792293548584 }, { "auxiliary_loss_clip": 0.01418678, "auxiliary_loss_mlp": 0.01037999, "balance_loss_clip": 1.25362265, "balance_loss_mlp": 1.01806736, "epoch": 0.6460844731699985, "flos": 19253179992960.0, "grad_norm": 2.7652021153453146, "language_loss": 0.74870098, "learning_rate": 1.1760680192176597e-06, "loss": 0.77326787, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19934082, "step": 10746, "time_per_iteration": 2.849994421005249 }, { "auxiliary_loss_clip": 0.01436714, "auxiliary_loss_mlp": 0.01040501, "balance_loss_clip": 1.26855433, "balance_loss_mlp": 1.02004504, "epoch": 0.6461445964226664, "flos": 27464639420160.0, "grad_norm": 1.5501179505607803, "language_loss": 0.67364407, "learning_rate": 1.175713157660413e-06, "loss": 0.69841623, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20471191, "step": 10747, "time_per_iteration": 2.942025661468506 }, { "auxiliary_loss_clip": 0.01435922, "auxiliary_loss_mlp": 0.01036383, "balance_loss_clip": 1.26839185, "balance_loss_mlp": 1.01691604, "epoch": 0.6462047196753344, "flos": 20303197505280.0, "grad_norm": 2.1338332300825718, "language_loss": 0.67746878, "learning_rate": 1.1753583273602056e-06, "loss": 0.70219183, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19470215, "step": 10748, "time_per_iteration": 2.886044979095459 }, { "auxiliary_loss_clip": 0.01435772, "auxiliary_loss_mlp": 0.0103823, "balance_loss_clip": 1.26630974, "balance_loss_mlp": 1.01782131, "epoch": 0.6462648429280025, "flos": 22028577171840.0, "grad_norm": 1.750320588054417, "language_loss": 0.7685656, "learning_rate": 1.1750035283304937e-06, "loss": 0.79330564, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20397949, "step": 10749, "time_per_iteration": 2.911282539367676 }, { "auxiliary_loss_clip": 0.0143869, "auxiliary_loss_mlp": 0.01034442, "balance_loss_clip": 1.26956773, "balance_loss_mlp": 1.01509476, "epoch": 0.6463249661806704, "flos": 27792366641280.0, "grad_norm": 1.7061669104491564, "language_loss": 0.77456731, "learning_rate": 1.17464876058473e-06, "loss": 0.79929864, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19348145, "step": 10750, "time_per_iteration": 2.9091992378234863 }, { "auxiliary_loss_clip": 0.0144583, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.27479565, "balance_loss_mlp": 1.01451182, "epoch": 0.6463850894333384, "flos": 22059732856320.0, "grad_norm": 2.2725485857624057, "language_loss": 0.69880319, "learning_rate": 1.1742940241363683e-06, "loss": 0.7236132, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.20666504, "step": 10751, "time_per_iteration": 2.8715059757232666 }, { "auxiliary_loss_clip": 0.01452805, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.2818687, "balance_loss_mlp": 1.01766539, "epoch": 0.6464452126860063, "flos": 21116258144640.0, "grad_norm": 1.690631577526552, "language_loss": 0.72108614, "learning_rate": 1.1739393189988604e-06, "loss": 0.74598879, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.19787598, "step": 10752, "time_per_iteration": 2.898308753967285 }, { "auxiliary_loss_clip": 0.01434749, "auxiliary_loss_mlp": 0.01037625, "balance_loss_clip": 1.26514173, "balance_loss_mlp": 1.0159409, "epoch": 0.6465053359386743, "flos": 16035006769920.0, "grad_norm": 1.6684512044340727, "language_loss": 0.78774977, "learning_rate": 1.1735846451856554e-06, "loss": 0.81247348, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.21691895, "step": 10753, "time_per_iteration": 4.322907209396362 }, { "auxiliary_loss_clip": 0.01422832, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.25817561, "balance_loss_mlp": 1.0208993, "epoch": 0.6465654591913422, "flos": 23407679249280.0, "grad_norm": 1.6401490286898197, "language_loss": 0.85832322, "learning_rate": 1.1732300027102041e-06, "loss": 0.88296068, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20007324, "step": 10754, "time_per_iteration": 2.8825440406799316 }, { "auxiliary_loss_clip": 0.0142953, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.26422644, "balance_loss_mlp": 1.01595438, "epoch": 0.6466255824440102, "flos": 15385615130880.0, "grad_norm": 2.071176243086444, "language_loss": 0.6096372, "learning_rate": 1.1728753915859541e-06, "loss": 0.63428295, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19067383, "step": 10755, "time_per_iteration": 2.8428735733032227 }, { "auxiliary_loss_clip": 0.01422637, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.25785935, "balance_loss_mlp": 1.01467621, "epoch": 0.6466857056966782, "flos": 16261394342400.0, "grad_norm": 2.08075005219916, "language_loss": 0.6889869, "learning_rate": 1.1725208118263518e-06, "loss": 0.7135641, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20397949, "step": 10756, "time_per_iteration": 2.9133362770080566 }, { "auxiliary_loss_clip": 0.01458343, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.28515506, "balance_loss_mlp": 1.01506758, "epoch": 0.6467458289493462, "flos": 21188025676800.0, "grad_norm": 5.817000355244816, "language_loss": 0.76100391, "learning_rate": 1.172166263444844e-06, "loss": 0.78594196, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 1.73339844, "router_z_loss_mlp": 0.20385742, "step": 10757, "time_per_iteration": 4.284683465957642 }, { "auxiliary_loss_clip": 0.01414603, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.25345182, "balance_loss_mlp": 1.01673615, "epoch": 0.6468059522020141, "flos": 17977453580160.0, "grad_norm": 1.4278562692587216, "language_loss": 0.74999726, "learning_rate": 1.1718117464548734e-06, "loss": 0.77450466, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1940918, "step": 10758, "time_per_iteration": 4.4209043979644775 }, { "auxiliary_loss_clip": 0.01433068, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.26589131, "balance_loss_mlp": 1.01440871, "epoch": 0.6468660754546821, "flos": 17897949187200.0, "grad_norm": 1.8996781043691047, "language_loss": 0.68820763, "learning_rate": 1.1714572608698845e-06, "loss": 0.71288931, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20703125, "step": 10759, "time_per_iteration": 2.8946831226348877 }, { "auxiliary_loss_clip": 0.01449413, "auxiliary_loss_mlp": 0.01039453, "balance_loss_clip": 1.27751851, "balance_loss_mlp": 1.01821017, "epoch": 0.64692619870735, "flos": 22611087717120.0, "grad_norm": 1.8233904374703835, "language_loss": 0.76313412, "learning_rate": 1.1711028067033197e-06, "loss": 0.78802276, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.21264648, "step": 10760, "time_per_iteration": 2.9149868488311768 }, { "auxiliary_loss_clip": 0.01418029, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.25472069, "balance_loss_mlp": 1.01358545, "epoch": 0.646986321960018, "flos": 49617975064320.0, "grad_norm": 1.6561867437683997, "language_loss": 0.66206241, "learning_rate": 1.1707483839686194e-06, "loss": 0.68656617, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18762207, "step": 10761, "time_per_iteration": 3.131577491760254 }, { "auxiliary_loss_clip": 0.01437139, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.27065694, "balance_loss_mlp": 1.01372814, "epoch": 0.6470464452126861, "flos": 21918550521600.0, "grad_norm": 2.0321387128391515, "language_loss": 0.70659357, "learning_rate": 1.1703939926792235e-06, "loss": 0.73130572, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20336914, "step": 10762, "time_per_iteration": 2.882758378982544 }, { "auxiliary_loss_clip": 0.01436879, "auxiliary_loss_mlp": 0.01036877, "balance_loss_clip": 1.26736486, "balance_loss_mlp": 1.01754105, "epoch": 0.647106568465354, "flos": 18113206538880.0, "grad_norm": 1.800473494091686, "language_loss": 0.82701796, "learning_rate": 1.1700396328485705e-06, "loss": 0.8517555, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.1932373, "step": 10763, "time_per_iteration": 2.8798041343688965 }, { "auxiliary_loss_clip": 0.01192675, "auxiliary_loss_mlp": 0.01012573, "balance_loss_clip": 1.10217476, "balance_loss_mlp": 0.99550217, "epoch": 0.647166691718022, "flos": 69510429843840.0, "grad_norm": 0.7155594614219051, "language_loss": 0.57803178, "learning_rate": 1.1696853044900978e-06, "loss": 0.6000843, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.17089844, "step": 10764, "time_per_iteration": 3.549649238586426 }, { "auxiliary_loss_clip": 0.01414743, "auxiliary_loss_mlp": 0.01031197, "balance_loss_clip": 1.24978161, "balance_loss_mlp": 1.01168239, "epoch": 0.6472268149706899, "flos": 34108551601920.0, "grad_norm": 1.9864320295195048, "language_loss": 0.61113292, "learning_rate": 1.1693310076172413e-06, "loss": 0.63559234, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19506836, "step": 10765, "time_per_iteration": 2.9923577308654785 }, { "auxiliary_loss_clip": 0.01415168, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.25358903, "balance_loss_mlp": 1.0112108, "epoch": 0.6472869382233579, "flos": 28122989529600.0, "grad_norm": 1.904315398402937, "language_loss": 0.63860655, "learning_rate": 1.168976742243437e-06, "loss": 0.66306686, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1965332, "step": 10766, "time_per_iteration": 2.9702703952789307 }, { "auxiliary_loss_clip": 0.01436202, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.26993954, "balance_loss_mlp": 1.01320863, "epoch": 0.6473470614760258, "flos": 22502373166080.0, "grad_norm": 1.6639522314277153, "language_loss": 0.75861239, "learning_rate": 1.1686225083821174e-06, "loss": 0.78331304, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20666504, "step": 10767, "time_per_iteration": 2.8857662677764893 }, { "auxiliary_loss_clip": 0.01427218, "auxiliary_loss_mlp": 0.01038912, "balance_loss_clip": 1.26001263, "balance_loss_mlp": 1.01819396, "epoch": 0.6474071847286939, "flos": 14547280631040.0, "grad_norm": 2.032694836779207, "language_loss": 0.7892257, "learning_rate": 1.1682683060467153e-06, "loss": 0.813887, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20727539, "step": 10768, "time_per_iteration": 2.8729171752929688 }, { "auxiliary_loss_clip": 0.0141315, "auxiliary_loss_mlp": 0.01034349, "balance_loss_clip": 1.24921846, "balance_loss_mlp": 1.01544249, "epoch": 0.6474673079813618, "flos": 24109039180800.0, "grad_norm": 1.7751970963889592, "language_loss": 0.72359508, "learning_rate": 1.167914135250663e-06, "loss": 0.74807012, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18896484, "step": 10769, "time_per_iteration": 2.9030165672302246 }, { "auxiliary_loss_clip": 0.01417808, "auxiliary_loss_mlp": 0.01033088, "balance_loss_clip": 1.25580001, "balance_loss_mlp": 1.0129056, "epoch": 0.6475274312340298, "flos": 14984491564800.0, "grad_norm": 4.878751300495201, "language_loss": 0.73656678, "learning_rate": 1.1675599960073895e-06, "loss": 0.76107574, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.2019043, "step": 10770, "time_per_iteration": 2.901657819747925 }, { "auxiliary_loss_clip": 0.01444614, "auxiliary_loss_mlp": 0.01037294, "balance_loss_clip": 1.27232695, "balance_loss_mlp": 1.01616979, "epoch": 0.6475875544866977, "flos": 25055997742080.0, "grad_norm": 1.892982682952068, "language_loss": 0.74211836, "learning_rate": 1.167205888330325e-06, "loss": 0.76693738, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21118164, "step": 10771, "time_per_iteration": 2.9919731616973877 }, { "auxiliary_loss_clip": 0.01431304, "auxiliary_loss_mlp": 0.0103991, "balance_loss_clip": 1.26576436, "balance_loss_mlp": 1.02025235, "epoch": 0.6476476777393657, "flos": 16480859460480.0, "grad_norm": 2.005794770355393, "language_loss": 0.74627662, "learning_rate": 1.1668518122328958e-06, "loss": 0.77098876, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19665527, "step": 10772, "time_per_iteration": 2.8533477783203125 }, { "auxiliary_loss_clip": 0.01404887, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.24385333, "balance_loss_mlp": 1.01262403, "epoch": 0.6477078009920336, "flos": 25823333871360.0, "grad_norm": 1.5160822614139204, "language_loss": 0.83856165, "learning_rate": 1.1664977677285305e-06, "loss": 0.86292851, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19165039, "step": 10773, "time_per_iteration": 2.964881658554077 }, { "auxiliary_loss_clip": 0.01417148, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.255059, "balance_loss_mlp": 1.01584435, "epoch": 0.6477679242447016, "flos": 17685135054720.0, "grad_norm": 1.4715790427150905, "language_loss": 0.78926563, "learning_rate": 1.1661437548306524e-06, "loss": 0.81379122, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19580078, "step": 10774, "time_per_iteration": 2.8641586303710938 }, { "auxiliary_loss_clip": 0.01432966, "auxiliary_loss_mlp": 0.01040306, "balance_loss_clip": 1.26422036, "balance_loss_mlp": 1.01890802, "epoch": 0.6478280474973696, "flos": 21042047393280.0, "grad_norm": 2.021756486080307, "language_loss": 0.70558155, "learning_rate": 1.1657897735526867e-06, "loss": 0.73031425, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21386719, "step": 10775, "time_per_iteration": 2.8877298831939697 }, { "auxiliary_loss_clip": 0.01440047, "auxiliary_loss_mlp": 0.01036979, "balance_loss_clip": 1.26983941, "balance_loss_mlp": 1.01734495, "epoch": 0.6478881707500376, "flos": 21627046402560.0, "grad_norm": 1.8142985367208029, "language_loss": 0.67052728, "learning_rate": 1.1654358239080574e-06, "loss": 0.69529754, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.19628906, "step": 10776, "time_per_iteration": 2.8824782371520996 }, { "auxiliary_loss_clip": 0.01430314, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.262501, "balance_loss_mlp": 1.01575327, "epoch": 0.6479482940027056, "flos": 18451837756800.0, "grad_norm": 2.5918182266615015, "language_loss": 0.7958796, "learning_rate": 1.1650819059101839e-06, "loss": 0.82054174, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20129395, "step": 10777, "time_per_iteration": 2.850985527038574 }, { "auxiliary_loss_clip": 0.014208, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 1.25569665, "balance_loss_mlp": 1.01380205, "epoch": 0.6480084172553735, "flos": 22174283986560.0, "grad_norm": 2.9602066145499975, "language_loss": 0.73872852, "learning_rate": 1.1647280195724896e-06, "loss": 0.76327705, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20251465, "step": 10778, "time_per_iteration": 4.284914016723633 }, { "auxiliary_loss_clip": 0.01417071, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.25377667, "balance_loss_mlp": 1.01563203, "epoch": 0.6480685405080415, "flos": 24326875486080.0, "grad_norm": 1.5905359241668007, "language_loss": 0.78661883, "learning_rate": 1.1643741649083923e-06, "loss": 0.81113791, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19189453, "step": 10779, "time_per_iteration": 2.9020681381225586 }, { "auxiliary_loss_clip": 0.01191475, "auxiliary_loss_mlp": 0.01018323, "balance_loss_clip": 1.10114992, "balance_loss_mlp": 0.99762869, "epoch": 0.6481286637607094, "flos": 59920908969600.0, "grad_norm": 0.719537901766192, "language_loss": 0.59446484, "learning_rate": 1.1640203419313095e-06, "loss": 0.61656284, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.20703125, "step": 10780, "time_per_iteration": 3.3689403533935547 }, { "auxiliary_loss_clip": 0.01421398, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.25754189, "balance_loss_mlp": 1.01345658, "epoch": 0.6481887870133775, "flos": 25495244691840.0, "grad_norm": 1.7103454507426306, "language_loss": 0.79703748, "learning_rate": 1.1636665506546599e-06, "loss": 0.82157922, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1932373, "step": 10781, "time_per_iteration": 2.927130699157715 }, { "auxiliary_loss_clip": 0.01439785, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.27131295, "balance_loss_mlp": 1.01321554, "epoch": 0.6482489102660454, "flos": 19937708858880.0, "grad_norm": 1.9777326793566872, "language_loss": 0.80099064, "learning_rate": 1.1633127910918578e-06, "loss": 0.82572877, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20812988, "step": 10782, "time_per_iteration": 2.8674728870391846 }, { "auxiliary_loss_clip": 0.01431096, "auxiliary_loss_mlp": 0.01035833, "balance_loss_clip": 1.26287234, "balance_loss_mlp": 1.01561546, "epoch": 0.6483090335187134, "flos": 26990752936320.0, "grad_norm": 1.9571870556305826, "language_loss": 0.64896226, "learning_rate": 1.1629590632563187e-06, "loss": 0.67363155, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20227051, "step": 10783, "time_per_iteration": 2.972557306289673 }, { "auxiliary_loss_clip": 0.0143114, "auxiliary_loss_mlp": 0.01034039, "balance_loss_clip": 1.26181614, "balance_loss_mlp": 1.0133208, "epoch": 0.6483691567713813, "flos": 25087651119360.0, "grad_norm": 1.7514355288779386, "language_loss": 0.8937996, "learning_rate": 1.1626053671614561e-06, "loss": 0.91845143, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20727539, "step": 10784, "time_per_iteration": 2.8838706016540527 }, { "auxiliary_loss_clip": 0.01413863, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.25021887, "balance_loss_mlp": 1.01266265, "epoch": 0.6484292800240493, "flos": 16114239694080.0, "grad_norm": 2.639473403194717, "language_loss": 0.74483955, "learning_rate": 1.1622517028206815e-06, "loss": 0.76931071, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20568848, "step": 10785, "time_per_iteration": 2.8852479457855225 }, { "auxiliary_loss_clip": 0.01409105, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.24893081, "balance_loss_mlp": 1.0133853, "epoch": 0.6484894032767172, "flos": 28852202275200.0, "grad_norm": 1.4394198430908807, "language_loss": 0.69889665, "learning_rate": 1.1618980702474071e-06, "loss": 0.72331917, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19763184, "step": 10786, "time_per_iteration": 2.9695687294006348 }, { "auxiliary_loss_clip": 0.01418454, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.25453138, "balance_loss_mlp": 1.01618862, "epoch": 0.6485495265293852, "flos": 30239267437440.0, "grad_norm": 2.059909296579217, "language_loss": 0.72100306, "learning_rate": 1.161544469455041e-06, "loss": 0.74554861, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19885254, "step": 10787, "time_per_iteration": 2.9280216693878174 }, { "auxiliary_loss_clip": 0.01424891, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.2584157, "balance_loss_mlp": 1.01566315, "epoch": 0.6486096497820532, "flos": 20091288268800.0, "grad_norm": 1.8672290681561614, "language_loss": 0.8498888, "learning_rate": 1.1611909004569934e-06, "loss": 0.87449431, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19995117, "step": 10788, "time_per_iteration": 4.353024959564209 }, { "auxiliary_loss_clip": 0.01425277, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.26053166, "balance_loss_mlp": 1.01282406, "epoch": 0.6486697730347212, "flos": 17137987960320.0, "grad_norm": 2.4795030154764213, "language_loss": 0.7812587, "learning_rate": 1.1608373632666708e-06, "loss": 0.80584705, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20751953, "step": 10789, "time_per_iteration": 2.843346357345581 }, { "auxiliary_loss_clip": 0.01408988, "auxiliary_loss_mlp": 0.01029255, "balance_loss_clip": 1.24729204, "balance_loss_mlp": 1.01046777, "epoch": 0.6487298962873892, "flos": 38926739854080.0, "grad_norm": 1.800193102271303, "language_loss": 0.77106726, "learning_rate": 1.160483857897479e-06, "loss": 0.79544961, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18762207, "step": 10790, "time_per_iteration": 3.004951238632202 }, { "auxiliary_loss_clip": 0.01417563, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.25521827, "balance_loss_mlp": 1.0134356, "epoch": 0.6487900195400571, "flos": 11955351692160.0, "grad_norm": 4.246744081048147, "language_loss": 0.61917138, "learning_rate": 1.160130384362823e-06, "loss": 0.64366984, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18859863, "step": 10791, "time_per_iteration": 2.8411202430725098 }, { "auxiliary_loss_clip": 0.01410649, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.24705398, "balance_loss_mlp": 1.01337361, "epoch": 0.6488501427927251, "flos": 22354177887360.0, "grad_norm": 1.6409577748980588, "language_loss": 0.86622524, "learning_rate": 1.1597769426761082e-06, "loss": 0.89066768, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20202637, "step": 10792, "time_per_iteration": 2.8722622394561768 }, { "auxiliary_loss_clip": 0.01429189, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.26101065, "balance_loss_mlp": 1.01008821, "epoch": 0.648910266045393, "flos": 22246322987520.0, "grad_norm": 1.8794208881363526, "language_loss": 0.78869754, "learning_rate": 1.159423532850735e-06, "loss": 0.81329405, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20361328, "step": 10793, "time_per_iteration": 5.780864715576172 }, { "auxiliary_loss_clip": 0.0143166, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.26503038, "balance_loss_mlp": 1.01362705, "epoch": 0.6489703892980611, "flos": 25312138410240.0, "grad_norm": 3.5118481304509994, "language_loss": 0.75215137, "learning_rate": 1.1590701549001055e-06, "loss": 0.77680904, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20483398, "step": 10794, "time_per_iteration": 2.914161205291748 }, { "auxiliary_loss_clip": 0.01417889, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.25306702, "balance_loss_mlp": 1.0142622, "epoch": 0.649030512550729, "flos": 24582699440640.0, "grad_norm": 1.8203083284019141, "language_loss": 0.70877624, "learning_rate": 1.158716808837621e-06, "loss": 0.73330021, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20239258, "step": 10795, "time_per_iteration": 2.9107322692871094 }, { "auxiliary_loss_clip": 0.01437418, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.26825929, "balance_loss_mlp": 1.01344323, "epoch": 0.649090635803397, "flos": 26254482001920.0, "grad_norm": 1.7451116402078855, "language_loss": 0.54861856, "learning_rate": 1.158363494676679e-06, "loss": 0.57333434, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20715332, "step": 10796, "time_per_iteration": 2.891923427581787 }, { "auxiliary_loss_clip": 0.01430087, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.26389396, "balance_loss_mlp": 1.01329041, "epoch": 0.6491507590560649, "flos": 24948414311040.0, "grad_norm": 1.4758619001465354, "language_loss": 0.78632379, "learning_rate": 1.1580102124306775e-06, "loss": 0.81095606, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19824219, "step": 10797, "time_per_iteration": 2.9002041816711426 }, { "auxiliary_loss_clip": 0.01409489, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.24944067, "balance_loss_mlp": 1.01291537, "epoch": 0.6492108823087329, "flos": 19509094437120.0, "grad_norm": 1.9160505620612323, "language_loss": 0.7071563, "learning_rate": 1.1576569621130134e-06, "loss": 0.73157144, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19104004, "step": 10798, "time_per_iteration": 2.914146661758423 }, { "auxiliary_loss_clip": 0.01418714, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 1.25390637, "balance_loss_mlp": 1.0104183, "epoch": 0.6492710055614008, "flos": 19728831024000.0, "grad_norm": 1.6809956389635154, "language_loss": 0.77230012, "learning_rate": 1.1573037437370811e-06, "loss": 0.79677838, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.18688965, "step": 10799, "time_per_iteration": 2.9596080780029297 }, { "auxiliary_loss_clip": 0.01433891, "auxiliary_loss_mlp": 0.01034547, "balance_loss_clip": 1.26273155, "balance_loss_mlp": 1.01382852, "epoch": 0.6493311288140688, "flos": 24327870871680.0, "grad_norm": 2.0868608733626117, "language_loss": 0.72311419, "learning_rate": 1.1569505573162755e-06, "loss": 0.74779856, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.20727539, "step": 10800, "time_per_iteration": 2.9081668853759766 }, { "auxiliary_loss_clip": 0.01192534, "auxiliary_loss_mlp": 0.01016541, "balance_loss_clip": 1.10063183, "balance_loss_mlp": 1.00004244, "epoch": 0.6493912520667368, "flos": 70964149875840.0, "grad_norm": 0.7765422169870086, "language_loss": 0.60327303, "learning_rate": 1.1565974028639897e-06, "loss": 0.62536383, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.16503906, "step": 10801, "time_per_iteration": 3.4809653759002686 }, { "auxiliary_loss_clip": 0.01438971, "auxiliary_loss_mlp": 0.01040218, "balance_loss_clip": 1.27185535, "balance_loss_mlp": 1.01938057, "epoch": 0.6494513753194048, "flos": 25348678225920.0, "grad_norm": 1.6563104734776464, "language_loss": 0.79599309, "learning_rate": 1.156244280393614e-06, "loss": 0.82078493, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20837402, "step": 10802, "time_per_iteration": 2.9208619594573975 }, { "auxiliary_loss_clip": 0.01419742, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.25460172, "balance_loss_mlp": 1.01890302, "epoch": 0.6495114985720728, "flos": 24692952314880.0, "grad_norm": 1.6383561074643163, "language_loss": 0.75090212, "learning_rate": 1.155891189918541e-06, "loss": 0.77549505, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.2064209, "step": 10803, "time_per_iteration": 2.913355588912964 }, { "auxiliary_loss_clip": 0.01423097, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.25746584, "balance_loss_mlp": 1.01225317, "epoch": 0.6495716218247407, "flos": 23659476416640.0, "grad_norm": 2.6692424920739666, "language_loss": 0.70923579, "learning_rate": 1.1555381314521578e-06, "loss": 0.73377728, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.18811035, "step": 10804, "time_per_iteration": 2.8485493659973145 }, { "auxiliary_loss_clip": 0.01404628, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.24214244, "balance_loss_mlp": 1.01553881, "epoch": 0.6496317450774087, "flos": 22356349637760.0, "grad_norm": 2.17417021306463, "language_loss": 0.73736525, "learning_rate": 1.1551851050078537e-06, "loss": 0.76177061, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20349121, "step": 10805, "time_per_iteration": 2.898684501647949 }, { "auxiliary_loss_clip": 0.01424601, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.25753832, "balance_loss_mlp": 1.009794, "epoch": 0.6496918683300766, "flos": 30530862046080.0, "grad_norm": 2.647061799862123, "language_loss": 0.67076588, "learning_rate": 1.1548321105990155e-06, "loss": 0.69529718, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18725586, "step": 10806, "time_per_iteration": 2.922217845916748 }, { "auxiliary_loss_clip": 0.01437755, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.26798487, "balance_loss_mlp": 1.00928235, "epoch": 0.6497519915827447, "flos": 12466728132480.0, "grad_norm": 2.6229756591887696, "language_loss": 0.79886878, "learning_rate": 1.1544791482390275e-06, "loss": 0.82353479, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.19555664, "step": 10807, "time_per_iteration": 2.8560972213745117 }, { "auxiliary_loss_clip": 0.01195043, "auxiliary_loss_mlp": 0.01009413, "balance_loss_clip": 1.10266161, "balance_loss_mlp": 0.99320096, "epoch": 0.6498121148354126, "flos": 69127947135360.0, "grad_norm": 0.7754770142799517, "language_loss": 0.58927572, "learning_rate": 1.1541262179412745e-06, "loss": 0.61132026, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.16210938, "step": 10808, "time_per_iteration": 3.520206928253174 }, { "auxiliary_loss_clip": 0.0142725, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.26474071, "balance_loss_mlp": 1.0112288, "epoch": 0.6498722380880806, "flos": 36909313130880.0, "grad_norm": 1.7015686339212714, "language_loss": 0.63571703, "learning_rate": 1.1537733197191415e-06, "loss": 0.66029763, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19580078, "step": 10809, "time_per_iteration": 3.0098509788513184 }, { "auxiliary_loss_clip": 0.01421562, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.25788927, "balance_loss_mlp": 1.01266336, "epoch": 0.6499323613407485, "flos": 29028295612800.0, "grad_norm": 1.7258266101876474, "language_loss": 0.81447649, "learning_rate": 1.153420453586008e-06, "loss": 0.83900833, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.1895752, "step": 10810, "time_per_iteration": 2.918860673904419 }, { "auxiliary_loss_clip": 0.01410708, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.25034809, "balance_loss_mlp": 1.01630592, "epoch": 0.6499924845934165, "flos": 20128506756480.0, "grad_norm": 1.6046377297650405, "language_loss": 0.72510183, "learning_rate": 1.1530676195552561e-06, "loss": 0.74956942, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19750977, "step": 10811, "time_per_iteration": 2.8658456802368164 }, { "auxiliary_loss_clip": 0.01414286, "auxiliary_loss_mlp": 0.01030899, "balance_loss_clip": 1.25513959, "balance_loss_mlp": 1.01201606, "epoch": 0.6500526078460844, "flos": 24430884577920.0, "grad_norm": 1.7663003169552878, "language_loss": 0.78361005, "learning_rate": 1.1527148176402649e-06, "loss": 0.8080619, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18884277, "step": 10812, "time_per_iteration": 2.9117116928100586 }, { "auxiliary_loss_clip": 0.01431202, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.26549864, "balance_loss_mlp": 1.01476943, "epoch": 0.6501127310987524, "flos": 23341386337920.0, "grad_norm": 1.8815826554205881, "language_loss": 0.8551116, "learning_rate": 1.152362047854413e-06, "loss": 0.87976247, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19128418, "step": 10813, "time_per_iteration": 4.422515392303467 }, { "auxiliary_loss_clip": 0.01421846, "auxiliary_loss_mlp": 0.01033571, "balance_loss_clip": 1.25796974, "balance_loss_mlp": 1.01487947, "epoch": 0.6501728543514204, "flos": 18707118773760.0, "grad_norm": 1.5676059544669247, "language_loss": 0.80493176, "learning_rate": 1.1520093102110764e-06, "loss": 0.82948595, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18701172, "step": 10814, "time_per_iteration": 2.82851505279541 }, { "auxiliary_loss_clip": 0.01449675, "auxiliary_loss_mlp": 0.0104044, "balance_loss_clip": 1.28000116, "balance_loss_mlp": 1.01992369, "epoch": 0.6502329776040884, "flos": 44215375985280.0, "grad_norm": 1.5060618998302724, "language_loss": 0.6609658, "learning_rate": 1.1516566047236328e-06, "loss": 0.68586695, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20507812, "step": 10815, "time_per_iteration": 3.0241148471832275 }, { "auxiliary_loss_clip": 0.01446317, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.27545822, "balance_loss_mlp": 1.0143187, "epoch": 0.6502931008567564, "flos": 14582915550720.0, "grad_norm": 1.8931937509034875, "language_loss": 0.76560485, "learning_rate": 1.1513039314054546e-06, "loss": 0.7904278, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.2166748, "step": 10816, "time_per_iteration": 2.828721523284912 }, { "auxiliary_loss_clip": 0.01422423, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.26086211, "balance_loss_mlp": 1.01408172, "epoch": 0.6503532241094243, "flos": 21404278414080.0, "grad_norm": 1.713786047523561, "language_loss": 0.73803806, "learning_rate": 1.1509512902699174e-06, "loss": 0.76260245, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19934082, "step": 10817, "time_per_iteration": 2.8730146884918213 }, { "auxiliary_loss_clip": 0.0142914, "auxiliary_loss_mlp": 0.01033173, "balance_loss_clip": 1.26202047, "balance_loss_mlp": 1.01299095, "epoch": 0.6504133473620923, "flos": 74764906151040.0, "grad_norm": 1.4628871938314734, "language_loss": 0.7235741, "learning_rate": 1.1505986813303916e-06, "loss": 0.7481972, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20178223, "step": 10818, "time_per_iteration": 3.275144338607788 }, { "auxiliary_loss_clip": 0.01441011, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.27258575, "balance_loss_mlp": 1.01592851, "epoch": 0.6504734706147602, "flos": 19721229897600.0, "grad_norm": 1.9788829466570557, "language_loss": 0.65609288, "learning_rate": 1.150246104600249e-06, "loss": 0.68086594, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20349121, "step": 10819, "time_per_iteration": 2.853773832321167 }, { "auxiliary_loss_clip": 0.01437776, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.2690177, "balance_loss_mlp": 1.01543868, "epoch": 0.6505335938674283, "flos": 25567871875200.0, "grad_norm": 2.9241708587515385, "language_loss": 0.84463441, "learning_rate": 1.14989356009286e-06, "loss": 0.86936587, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19934082, "step": 10820, "time_per_iteration": 2.900480270385742 }, { "auxiliary_loss_clip": 0.01439215, "auxiliary_loss_mlp": 0.01037823, "balance_loss_clip": 1.27068043, "balance_loss_mlp": 1.01753342, "epoch": 0.6505937171200962, "flos": 17830298931840.0, "grad_norm": 2.190884819955326, "language_loss": 0.78897083, "learning_rate": 1.1495410478215914e-06, "loss": 0.81374121, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20275879, "step": 10821, "time_per_iteration": 2.8452041149139404 }, { "auxiliary_loss_clip": 0.01407094, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.24794054, "balance_loss_mlp": 1.01207304, "epoch": 0.6506538403727642, "flos": 20677961335680.0, "grad_norm": 1.5892807158149256, "language_loss": 0.80268013, "learning_rate": 1.1491885677998126e-06, "loss": 0.82705057, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17871094, "step": 10822, "time_per_iteration": 2.842895984649658 }, { "auxiliary_loss_clip": 0.01413289, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.2510196, "balance_loss_mlp": 1.0125035, "epoch": 0.6507139636254321, "flos": 11726928103680.0, "grad_norm": 1.9889910299914795, "language_loss": 0.88208371, "learning_rate": 1.1488361200408883e-06, "loss": 0.90654522, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20349121, "step": 10823, "time_per_iteration": 4.246881484985352 }, { "auxiliary_loss_clip": 0.01420551, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.25519848, "balance_loss_mlp": 1.0142101, "epoch": 0.6507740868781001, "flos": 26773278589440.0, "grad_norm": 1.5621854874731897, "language_loss": 0.67218816, "learning_rate": 1.148483704558183e-06, "loss": 0.6967355, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.1998291, "step": 10824, "time_per_iteration": 2.9019312858581543 }, { "auxiliary_loss_clip": 0.01428583, "auxiliary_loss_mlp": 0.0103378, "balance_loss_clip": 1.25975156, "balance_loss_mlp": 1.01395571, "epoch": 0.650834210130768, "flos": 16480316522880.0, "grad_norm": 2.7681032417487708, "language_loss": 0.88181233, "learning_rate": 1.1481313213650607e-06, "loss": 0.90643603, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19836426, "step": 10825, "time_per_iteration": 2.932523250579834 }, { "auxiliary_loss_clip": 0.01425716, "auxiliary_loss_mlp": 0.01031274, "balance_loss_clip": 1.25713396, "balance_loss_mlp": 1.01131821, "epoch": 0.650894333383436, "flos": 17137490267520.0, "grad_norm": 2.99633255532371, "language_loss": 0.74497783, "learning_rate": 1.147778970474885e-06, "loss": 0.76954776, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19970703, "step": 10826, "time_per_iteration": 2.8330492973327637 }, { "auxiliary_loss_clip": 0.01420964, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.25506878, "balance_loss_mlp": 1.01785016, "epoch": 0.650954456636104, "flos": 18743522855040.0, "grad_norm": 5.3702311835114145, "language_loss": 0.70179725, "learning_rate": 1.1474266519010157e-06, "loss": 0.72638261, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19726562, "step": 10827, "time_per_iteration": 2.838696241378784 }, { "auxiliary_loss_clip": 0.01437621, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.26974964, "balance_loss_mlp": 1.01345277, "epoch": 0.651014579888772, "flos": 24537155909760.0, "grad_norm": 1.8224506955910706, "language_loss": 0.77761674, "learning_rate": 1.1470743656568136e-06, "loss": 0.80231589, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.18847656, "step": 10828, "time_per_iteration": 5.685888290405273 }, { "auxiliary_loss_clip": 0.01435235, "auxiliary_loss_mlp": 0.01035664, "balance_loss_clip": 1.27029979, "balance_loss_mlp": 1.01542211, "epoch": 0.65107470314144, "flos": 24071730203520.0, "grad_norm": 2.181255231826281, "language_loss": 0.89926839, "learning_rate": 1.1467221117556362e-06, "loss": 0.92397738, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20239258, "step": 10829, "time_per_iteration": 2.8604989051818848 }, { "auxiliary_loss_clip": 0.01193448, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.1003089, "balance_loss_mlp": 1.01193511, "epoch": 0.6511348263941079, "flos": 72514205366400.0, "grad_norm": 0.653131509950109, "language_loss": 0.55427498, "learning_rate": 1.1463698902108428e-06, "loss": 0.57648617, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.15722656, "step": 10830, "time_per_iteration": 3.5037951469421387 }, { "auxiliary_loss_clip": 0.0143743, "auxiliary_loss_mlp": 0.01038154, "balance_loss_clip": 1.26698542, "balance_loss_mlp": 1.01811469, "epoch": 0.6511949496467759, "flos": 23378107132800.0, "grad_norm": 1.7547823038219335, "language_loss": 0.75779724, "learning_rate": 1.1460177010357878e-06, "loss": 0.78255308, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.20031738, "step": 10831, "time_per_iteration": 2.8659768104553223 }, { "auxiliary_loss_clip": 0.01193968, "auxiliary_loss_mlp": 0.01020899, "balance_loss_clip": 1.10237336, "balance_loss_mlp": 1.00468671, "epoch": 0.6512550728994438, "flos": 67364534574720.0, "grad_norm": 0.6561742051579238, "language_loss": 0.51044691, "learning_rate": 1.145665544243828e-06, "loss": 0.53259552, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.16210938, "step": 10832, "time_per_iteration": 3.427170753479004 }, { "auxiliary_loss_clip": 0.01434551, "auxiliary_loss_mlp": 0.01034416, "balance_loss_clip": 1.26495671, "balance_loss_mlp": 1.01488972, "epoch": 0.6513151961521119, "flos": 21151440616320.0, "grad_norm": 1.9948949847177644, "language_loss": 0.84735477, "learning_rate": 1.145313419848316e-06, "loss": 0.87204444, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.19543457, "step": 10833, "time_per_iteration": 2.871377468109131 }, { "auxiliary_loss_clip": 0.01428674, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.26313829, "balance_loss_mlp": 1.01191747, "epoch": 0.6513753194047798, "flos": 15167235888000.0, "grad_norm": 3.720737024636619, "language_loss": 0.84840405, "learning_rate": 1.1449613278626049e-06, "loss": 0.87301266, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20288086, "step": 10834, "time_per_iteration": 2.8400421142578125 }, { "auxiliary_loss_clip": 0.0143296, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.2663908, "balance_loss_mlp": 1.01923752, "epoch": 0.6514354426574478, "flos": 30238724499840.0, "grad_norm": 1.5574545451022714, "language_loss": 0.77305716, "learning_rate": 1.1446092683000455e-06, "loss": 0.79777366, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19445801, "step": 10835, "time_per_iteration": 3.0243070125579834 }, { "auxiliary_loss_clip": 0.01437507, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.27133191, "balance_loss_mlp": 1.01309276, "epoch": 0.6514955659101157, "flos": 24215898695040.0, "grad_norm": 1.4570842421290429, "language_loss": 0.78132236, "learning_rate": 1.1442572411739882e-06, "loss": 0.80602717, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19885254, "step": 10836, "time_per_iteration": 2.9367830753326416 }, { "auxiliary_loss_clip": 0.01427604, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.26025009, "balance_loss_mlp": 1.01750672, "epoch": 0.6515556891627837, "flos": 12383332686720.0, "grad_norm": 2.0505950002557194, "language_loss": 0.83116281, "learning_rate": 1.143905246497783e-06, "loss": 0.85581094, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19702148, "step": 10837, "time_per_iteration": 2.8523926734924316 }, { "auxiliary_loss_clip": 0.01415026, "auxiliary_loss_mlp": 0.01033557, "balance_loss_clip": 1.25239158, "balance_loss_mlp": 1.01229048, "epoch": 0.6516158124154516, "flos": 49618518001920.0, "grad_norm": 2.669869087632107, "language_loss": 0.59803808, "learning_rate": 1.1435532842847758e-06, "loss": 0.6225239, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.21276855, "step": 10838, "time_per_iteration": 3.1150951385498047 }, { "auxiliary_loss_clip": 0.01192959, "auxiliary_loss_mlp": 0.01018813, "balance_loss_clip": 1.10060692, "balance_loss_mlp": 1.00202835, "epoch": 0.6516759356681197, "flos": 59730834988800.0, "grad_norm": 0.7304456712723424, "language_loss": 0.61035275, "learning_rate": 1.1432013545483147e-06, "loss": 0.63247049, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.16796875, "step": 10839, "time_per_iteration": 3.4331166744232178 }, { "auxiliary_loss_clip": 0.0141848, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.25602102, "balance_loss_mlp": 1.01543427, "epoch": 0.6517360589207876, "flos": 37464785268480.0, "grad_norm": 1.5900251191658636, "language_loss": 0.6832785, "learning_rate": 1.1428494573017439e-06, "loss": 0.7078023, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18457031, "step": 10840, "time_per_iteration": 3.039977550506592 }, { "auxiliary_loss_clip": 0.01412542, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.24890566, "balance_loss_mlp": 1.01450038, "epoch": 0.6517961821734556, "flos": 25385987203200.0, "grad_norm": 1.96222045931857, "language_loss": 0.74697983, "learning_rate": 1.1424975925584071e-06, "loss": 0.77144039, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19018555, "step": 10841, "time_per_iteration": 2.944181442260742 }, { "auxiliary_loss_clip": 0.01422863, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.25585675, "balance_loss_mlp": 1.01704693, "epoch": 0.6518563054261236, "flos": 28778670195840.0, "grad_norm": 2.4693666720722436, "language_loss": 0.63268518, "learning_rate": 1.142145760331648e-06, "loss": 0.65728462, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20031738, "step": 10842, "time_per_iteration": 2.9586222171783447 }, { "auxiliary_loss_clip": 0.01198548, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.1038866, "balance_loss_mlp": 1.02416945, "epoch": 0.6519164286787915, "flos": 68952921690240.0, "grad_norm": 0.8242824711016649, "language_loss": 0.56138325, "learning_rate": 1.141793960634807e-06, "loss": 0.58378017, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.16992188, "step": 10843, "time_per_iteration": 3.117135763168335 }, { "auxiliary_loss_clip": 0.01447384, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.27449393, "balance_loss_mlp": 1.02022254, "epoch": 0.6519765519314595, "flos": 20449809216000.0, "grad_norm": 1.6050341062999347, "language_loss": 0.83489698, "learning_rate": 1.1414421934812253e-06, "loss": 0.8597737, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 1.73046875, "router_z_loss_mlp": 0.20068359, "step": 10844, "time_per_iteration": 2.8662898540496826 }, { "auxiliary_loss_clip": 0.01417047, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.25093913, "balance_loss_mlp": 1.01008296, "epoch": 0.6520366751841274, "flos": 28414765117440.0, "grad_norm": 1.8549469549301951, "language_loss": 0.60625607, "learning_rate": 1.1410904588842421e-06, "loss": 0.63074839, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.2208252, "step": 10845, "time_per_iteration": 2.9218716621398926 }, { "auxiliary_loss_clip": 0.01424742, "auxiliary_loss_mlp": 0.01035068, "balance_loss_clip": 1.25708818, "balance_loss_mlp": 1.01535058, "epoch": 0.6520967984367955, "flos": 22283631964800.0, "grad_norm": 1.7747723740844616, "language_loss": 0.80064392, "learning_rate": 1.140738756857194e-06, "loss": 0.82524204, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19714355, "step": 10846, "time_per_iteration": 2.86114239692688 }, { "auxiliary_loss_clip": 0.01193036, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.10028601, "balance_loss_mlp": 1.01495123, "epoch": 0.6521569216894634, "flos": 68952803938560.0, "grad_norm": 0.7104997386678515, "language_loss": 0.60268444, "learning_rate": 1.1403870874134192e-06, "loss": 0.62495506, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.19042969, "step": 10847, "time_per_iteration": 3.3894426822662354 }, { "auxiliary_loss_clip": 0.01435802, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.26622343, "balance_loss_mlp": 1.01878262, "epoch": 0.6522170449421314, "flos": 29141941847040.0, "grad_norm": 1.4546805032112704, "language_loss": 0.81260204, "learning_rate": 1.1400354505662514e-06, "loss": 0.83735174, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.20385742, "step": 10848, "time_per_iteration": 2.9208948612213135 }, { "auxiliary_loss_clip": 0.0141928, "auxiliary_loss_mlp": 0.01037582, "balance_loss_clip": 1.25449443, "balance_loss_mlp": 1.01723289, "epoch": 0.6522771681947993, "flos": 26663342428800.0, "grad_norm": 2.333687245797798, "language_loss": 0.75752842, "learning_rate": 1.1396838463290263e-06, "loss": 0.78209704, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20324707, "step": 10849, "time_per_iteration": 4.293798208236694 }, { "auxiliary_loss_clip": 0.01417804, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.25579023, "balance_loss_mlp": 1.01484954, "epoch": 0.6523372914474673, "flos": 25750932912000.0, "grad_norm": 1.3594619538019048, "language_loss": 0.6864472, "learning_rate": 1.1393322747150752e-06, "loss": 0.71097016, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1965332, "step": 10850, "time_per_iteration": 2.929518222808838 }, { "auxiliary_loss_clip": 0.01413865, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.25128865, "balance_loss_mlp": 1.01229417, "epoch": 0.6523974147001352, "flos": 24838206681600.0, "grad_norm": 1.946083477422338, "language_loss": 0.67766792, "learning_rate": 1.1389807357377313e-06, "loss": 0.70213342, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20373535, "step": 10851, "time_per_iteration": 2.923574686050415 }, { "auxiliary_loss_clip": 0.01440494, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.27244556, "balance_loss_mlp": 1.01797962, "epoch": 0.6524575379528033, "flos": 26327425898880.0, "grad_norm": 2.294505267370526, "language_loss": 0.74465001, "learning_rate": 1.1386292294103235e-06, "loss": 0.76942885, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.1940918, "step": 10852, "time_per_iteration": 2.9678995609283447 }, { "auxiliary_loss_clip": 0.01435877, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.2647872, "balance_loss_mlp": 1.01431775, "epoch": 0.6525176612054712, "flos": 19501991003520.0, "grad_norm": 2.104392173534739, "language_loss": 0.67071384, "learning_rate": 1.1382777557461812e-06, "loss": 0.69542837, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.21264648, "step": 10853, "time_per_iteration": 2.8843600749969482 }, { "auxiliary_loss_clip": 0.01192539, "auxiliary_loss_mlp": 0.01041337, "balance_loss_clip": 1.10001087, "balance_loss_mlp": 1.01711369, "epoch": 0.6525777844581392, "flos": 71738182235520.0, "grad_norm": 0.7256362626580298, "language_loss": 0.63077074, "learning_rate": 1.137926314758634e-06, "loss": 0.65310949, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.2421875, "step": 10854, "time_per_iteration": 3.4702813625335693 }, { "auxiliary_loss_clip": 0.01435957, "auxiliary_loss_mlp": 0.01041493, "balance_loss_clip": 1.26784396, "balance_loss_mlp": 1.02001131, "epoch": 0.6526379077108072, "flos": 26664835507200.0, "grad_norm": 1.7153043686610199, "language_loss": 0.78550541, "learning_rate": 1.1375749064610072e-06, "loss": 0.81027985, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.21484375, "step": 10855, "time_per_iteration": 2.9372026920318604 }, { "auxiliary_loss_clip": 0.01406375, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.24413419, "balance_loss_mlp": 1.00888014, "epoch": 0.6526980309634751, "flos": 22830643324800.0, "grad_norm": 1.9806546869009416, "language_loss": 0.79673129, "learning_rate": 1.1372235308666256e-06, "loss": 0.82109398, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.21008301, "step": 10856, "time_per_iteration": 2.8697712421417236 }, { "auxiliary_loss_clip": 0.01422167, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.25660968, "balance_loss_mlp": 1.01380491, "epoch": 0.6527581542161431, "flos": 28376460754560.0, "grad_norm": 1.6226242877310706, "language_loss": 0.74486226, "learning_rate": 1.136872187988815e-06, "loss": 0.76943648, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.21472168, "step": 10857, "time_per_iteration": 2.9095025062561035 }, { "auxiliary_loss_clip": 0.01415923, "auxiliary_loss_mlp": 0.0103296, "balance_loss_clip": 1.25067306, "balance_loss_mlp": 1.0124557, "epoch": 0.652818277468811, "flos": 18378350922240.0, "grad_norm": 2.2092194832885825, "language_loss": 0.63922381, "learning_rate": 1.1365208778408965e-06, "loss": 0.66371262, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20495605, "step": 10858, "time_per_iteration": 4.29092812538147 }, { "auxiliary_loss_clip": 0.01420005, "auxiliary_loss_mlp": 0.01035841, "balance_loss_clip": 1.25553942, "balance_loss_mlp": 1.01515818, "epoch": 0.6528784007214791, "flos": 18044017960320.0, "grad_norm": 1.803031795774203, "language_loss": 0.79205382, "learning_rate": 1.1361696004361939e-06, "loss": 0.81661224, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20678711, "step": 10859, "time_per_iteration": 2.8555378913879395 }, { "auxiliary_loss_clip": 0.01441087, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.27100599, "balance_loss_mlp": 1.01636946, "epoch": 0.652938523974147, "flos": 22392029802240.0, "grad_norm": 1.6481859868455946, "language_loss": 0.68840373, "learning_rate": 1.1358183557880256e-06, "loss": 0.71318734, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 1.70019531, "router_z_loss_mlp": 0.20922852, "step": 10860, "time_per_iteration": 2.8458690643310547 }, { "auxiliary_loss_clip": 0.0144107, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.26989388, "balance_loss_mlp": 1.01288462, "epoch": 0.652998647226815, "flos": 16772816027520.0, "grad_norm": 1.88560659738565, "language_loss": 0.6794405, "learning_rate": 1.135467143909712e-06, "loss": 0.70418751, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20739746, "step": 10861, "time_per_iteration": 2.853938579559326 }, { "auxiliary_loss_clip": 0.01428755, "auxiliary_loss_mlp": 0.01039433, "balance_loss_clip": 1.26027691, "balance_loss_mlp": 1.01681936, "epoch": 0.6530587704794829, "flos": 35786080252800.0, "grad_norm": 1.857848964641663, "language_loss": 0.65946269, "learning_rate": 1.135115964814572e-06, "loss": 0.68414456, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.22607422, "step": 10862, "time_per_iteration": 2.9925315380096436 }, { "auxiliary_loss_clip": 0.01425479, "auxiliary_loss_mlp": 0.01040455, "balance_loss_clip": 1.2592901, "balance_loss_mlp": 1.01995063, "epoch": 0.6531188937321509, "flos": 19325083259520.0, "grad_norm": 1.6096630739754856, "language_loss": 0.77840781, "learning_rate": 1.13476481851592e-06, "loss": 0.80306721, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20495605, "step": 10863, "time_per_iteration": 5.694023847579956 }, { "auxiliary_loss_clip": 0.01426554, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.2610333, "balance_loss_mlp": 1.01649475, "epoch": 0.6531790169848188, "flos": 22904175404160.0, "grad_norm": 1.9396682627015769, "language_loss": 0.74735034, "learning_rate": 1.1344137050270739e-06, "loss": 0.77198094, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20007324, "step": 10864, "time_per_iteration": 2.893057346343994 }, { "auxiliary_loss_clip": 0.01412542, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.24735689, "balance_loss_mlp": 1.01355457, "epoch": 0.6532391402374869, "flos": 29573813894400.0, "grad_norm": 2.3514685328647986, "language_loss": 0.86252463, "learning_rate": 1.1340626243613458e-06, "loss": 0.88699222, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20654297, "step": 10865, "time_per_iteration": 2.9253711700439453 }, { "auxiliary_loss_clip": 0.01438733, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 1.27010822, "balance_loss_mlp": 1.01268554, "epoch": 0.6532992634901548, "flos": 23114048624640.0, "grad_norm": 1.6357639514701978, "language_loss": 0.82271415, "learning_rate": 1.133711576532051e-06, "loss": 0.84743118, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20288086, "step": 10866, "time_per_iteration": 2.9799749851226807 }, { "auxiliary_loss_clip": 0.01428661, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.26403427, "balance_loss_mlp": 1.01321948, "epoch": 0.6533593867428228, "flos": 26078388664320.0, "grad_norm": 1.3784516528211717, "language_loss": 0.82866263, "learning_rate": 1.1333605615524995e-06, "loss": 0.85329401, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.21240234, "step": 10867, "time_per_iteration": 2.935029983520508 }, { "auxiliary_loss_clip": 0.01425595, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.25810337, "balance_loss_mlp": 1.01234794, "epoch": 0.6534195099954908, "flos": 21221669825280.0, "grad_norm": 1.935843425736942, "language_loss": 0.82226646, "learning_rate": 1.1330095794360016e-06, "loss": 0.8468582, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.21240234, "step": 10868, "time_per_iteration": 2.852872133255005 }, { "auxiliary_loss_clip": 0.01425601, "auxiliary_loss_mlp": 0.01034889, "balance_loss_clip": 1.25633276, "balance_loss_mlp": 1.01409936, "epoch": 0.6534796332481587, "flos": 19656158595840.0, "grad_norm": 1.8941453317831618, "language_loss": 0.80563748, "learning_rate": 1.1326586301958675e-06, "loss": 0.8302424, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20788574, "step": 10869, "time_per_iteration": 2.831394672393799 }, { "auxiliary_loss_clip": 0.01430968, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.26440501, "balance_loss_mlp": 1.01719141, "epoch": 0.6535397565008267, "flos": 24032566189440.0, "grad_norm": 2.3409717268772217, "language_loss": 0.73537004, "learning_rate": 1.1323077138454063e-06, "loss": 0.76005667, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20507812, "step": 10870, "time_per_iteration": 2.8762388229370117 }, { "auxiliary_loss_clip": 0.0144319, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.27534294, "balance_loss_mlp": 1.01873469, "epoch": 0.6535998797534947, "flos": 24612090577920.0, "grad_norm": 2.2934099595834443, "language_loss": 0.75951487, "learning_rate": 1.1319568303979221e-06, "loss": 0.78434098, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20666504, "step": 10871, "time_per_iteration": 2.949042797088623 }, { "auxiliary_loss_clip": 0.0141425, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.25110543, "balance_loss_mlp": 1.01284885, "epoch": 0.6536600030061627, "flos": 23373673142400.0, "grad_norm": 1.4471908706452457, "language_loss": 0.56129181, "learning_rate": 1.1316059798667227e-06, "loss": 0.58577156, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20874023, "step": 10872, "time_per_iteration": 2.883596420288086 }, { "auxiliary_loss_clip": 0.01419092, "auxiliary_loss_mlp": 0.01035939, "balance_loss_clip": 1.25451684, "balance_loss_mlp": 1.01576841, "epoch": 0.6537201262588306, "flos": 23889031125120.0, "grad_norm": 1.611541548932472, "language_loss": 0.75629711, "learning_rate": 1.1312551622651112e-06, "loss": 0.78084737, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20166016, "step": 10873, "time_per_iteration": 2.8784608840942383 }, { "auxiliary_loss_clip": 0.01417043, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.25197649, "balance_loss_mlp": 1.01454782, "epoch": 0.6537802495114986, "flos": 24365949010560.0, "grad_norm": 1.5203515330787174, "language_loss": 0.75822592, "learning_rate": 1.1309043776063917e-06, "loss": 0.78274792, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20605469, "step": 10874, "time_per_iteration": 2.9199771881103516 }, { "auxiliary_loss_clip": 0.01423461, "auxiliary_loss_mlp": 0.01034244, "balance_loss_clip": 1.25811267, "balance_loss_mlp": 1.01360846, "epoch": 0.6538403727641665, "flos": 28007352524160.0, "grad_norm": 1.6150081202768392, "language_loss": 0.82164109, "learning_rate": 1.1305536259038642e-06, "loss": 0.84621811, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.2064209, "step": 10875, "time_per_iteration": 2.9745771884918213 }, { "auxiliary_loss_clip": 0.01429827, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.26268053, "balance_loss_mlp": 1.01892948, "epoch": 0.6539004960168345, "flos": 27575118518400.0, "grad_norm": 1.5747355303999402, "language_loss": 0.70523751, "learning_rate": 1.1302029071708314e-06, "loss": 0.72993487, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.2097168, "step": 10876, "time_per_iteration": 2.9171407222747803 }, { "auxiliary_loss_clip": 0.0141883, "auxiliary_loss_mlp": 0.01041342, "balance_loss_clip": 1.25446188, "balance_loss_mlp": 1.02003908, "epoch": 0.6539606192695024, "flos": 14536105165440.0, "grad_norm": 1.8843143956521342, "language_loss": 0.80153811, "learning_rate": 1.1298522214205908e-06, "loss": 0.82613987, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.21313477, "step": 10877, "time_per_iteration": 2.9932634830474854 }, { "auxiliary_loss_clip": 0.01415507, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.25114322, "balance_loss_mlp": 1.01417565, "epoch": 0.6540207425221705, "flos": 21626322485760.0, "grad_norm": 2.659019297056791, "language_loss": 0.80711776, "learning_rate": 1.1295015686664408e-06, "loss": 0.83161724, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20275879, "step": 10878, "time_per_iteration": 2.8777878284454346 }, { "auxiliary_loss_clip": 0.01412263, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.24630547, "balance_loss_mlp": 1.01345539, "epoch": 0.6540808657748384, "flos": 17676131339520.0, "grad_norm": 1.9716899732150166, "language_loss": 0.85163468, "learning_rate": 1.1291509489216797e-06, "loss": 0.87610745, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.2154541, "step": 10879, "time_per_iteration": 2.8494739532470703 }, { "auxiliary_loss_clip": 0.01425943, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.25758433, "balance_loss_mlp": 1.01144493, "epoch": 0.6541409890275064, "flos": 14546466224640.0, "grad_norm": 2.243738346282337, "language_loss": 0.73297203, "learning_rate": 1.128800362199601e-06, "loss": 0.75755918, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.21325684, "step": 10880, "time_per_iteration": 2.811908006668091 }, { "auxiliary_loss_clip": 0.01413216, "auxiliary_loss_mlp": 0.0104022, "balance_loss_clip": 1.24982572, "balance_loss_mlp": 1.01822615, "epoch": 0.6542011122801744, "flos": 17174165817600.0, "grad_norm": 1.9628602878786534, "language_loss": 0.85292339, "learning_rate": 1.1284498085135005e-06, "loss": 0.8774578, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.2199707, "step": 10881, "time_per_iteration": 2.817138433456421 }, { "auxiliary_loss_clip": 0.01437976, "auxiliary_loss_mlp": 0.01036338, "balance_loss_clip": 1.26881194, "balance_loss_mlp": 1.01449871, "epoch": 0.6542612355328423, "flos": 18195651843840.0, "grad_norm": 1.798973425548956, "language_loss": 0.78609115, "learning_rate": 1.1280992878766699e-06, "loss": 0.81083429, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21826172, "step": 10882, "time_per_iteration": 2.8899927139282227 }, { "auxiliary_loss_clip": 0.01427355, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.25829661, "balance_loss_mlp": 1.01938391, "epoch": 0.6543213587855103, "flos": 19802046389760.0, "grad_norm": 1.6330096991536083, "language_loss": 0.8274399, "learning_rate": 1.1277488003024024e-06, "loss": 0.85212606, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.21875, "step": 10883, "time_per_iteration": 4.268296718597412 }, { "auxiliary_loss_clip": 0.01433463, "auxiliary_loss_mlp": 0.01041628, "balance_loss_clip": 1.26457882, "balance_loss_mlp": 1.02048063, "epoch": 0.6543814820381783, "flos": 21114810311040.0, "grad_norm": 2.6330714553624603, "language_loss": 0.86018312, "learning_rate": 1.127398345803988e-06, "loss": 0.88493401, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21130371, "step": 10884, "time_per_iteration": 2.895533800125122 }, { "auxiliary_loss_clip": 0.01442608, "auxiliary_loss_mlp": 0.01040787, "balance_loss_clip": 1.27395868, "balance_loss_mlp": 1.01881647, "epoch": 0.6544416052908463, "flos": 20203803383040.0, "grad_norm": 2.316525657595679, "language_loss": 0.80916488, "learning_rate": 1.127047924394715e-06, "loss": 0.83399886, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21984863, "step": 10885, "time_per_iteration": 2.838981866836548 }, { "auxiliary_loss_clip": 0.0141728, "auxiliary_loss_mlp": 0.01037996, "balance_loss_clip": 1.25241303, "balance_loss_mlp": 1.01659834, "epoch": 0.6545017285435142, "flos": 23378831049600.0, "grad_norm": 2.319521416729139, "language_loss": 0.72437191, "learning_rate": 1.1266975360878722e-06, "loss": 0.74892467, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.21411133, "step": 10886, "time_per_iteration": 2.8583364486694336 }, { "auxiliary_loss_clip": 0.01425343, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.26113105, "balance_loss_mlp": 1.01746964, "epoch": 0.6545618517961822, "flos": 19143877259520.0, "grad_norm": 1.7428465616588742, "language_loss": 0.78831184, "learning_rate": 1.1263471808967468e-06, "loss": 0.81293714, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19726562, "step": 10887, "time_per_iteration": 2.8325719833374023 }, { "auxiliary_loss_clip": 0.01415011, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.24978495, "balance_loss_mlp": 1.01460576, "epoch": 0.6546219750488501, "flos": 14946639649920.0, "grad_norm": 1.7928425492701143, "language_loss": 0.78947902, "learning_rate": 1.1259968588346234e-06, "loss": 0.81398737, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.2121582, "step": 10888, "time_per_iteration": 2.7950119972229004 }, { "auxiliary_loss_clip": 0.01413349, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.25088692, "balance_loss_mlp": 1.01288557, "epoch": 0.6546820983015181, "flos": 36334403712000.0, "grad_norm": 1.4702230208731475, "language_loss": 0.66769993, "learning_rate": 1.1256465699147874e-06, "loss": 0.69215959, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1973877, "step": 10889, "time_per_iteration": 2.9670400619506836 }, { "auxiliary_loss_clip": 0.0141771, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.2515049, "balance_loss_mlp": 1.01471031, "epoch": 0.654742221554186, "flos": 20420870526720.0, "grad_norm": 1.843130604935648, "language_loss": 0.80480886, "learning_rate": 1.1252963141505203e-06, "loss": 0.82933867, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.2052002, "step": 10890, "time_per_iteration": 2.889808177947998 }, { "auxiliary_loss_clip": 0.0143737, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.26833034, "balance_loss_mlp": 1.01578462, "epoch": 0.6548023448068541, "flos": 24874475028480.0, "grad_norm": 2.0920338205117854, "language_loss": 0.66574907, "learning_rate": 1.1249460915551052e-06, "loss": 0.69047827, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19763184, "step": 10891, "time_per_iteration": 2.946877956390381 }, { "auxiliary_loss_clip": 0.01413685, "auxiliary_loss_mlp": 0.01040819, "balance_loss_clip": 1.24916708, "balance_loss_mlp": 1.02089894, "epoch": 0.654862468059522, "flos": 21435841301760.0, "grad_norm": 1.7686355616921339, "language_loss": 0.80330014, "learning_rate": 1.1245959021418214e-06, "loss": 0.82784516, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19934082, "step": 10892, "time_per_iteration": 2.891942262649536 }, { "auxiliary_loss_clip": 0.01445167, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.27351022, "balance_loss_mlp": 1.01693773, "epoch": 0.65492259131219, "flos": 26588950698240.0, "grad_norm": 1.8932556186018628, "language_loss": 0.78890276, "learning_rate": 1.1242457459239497e-06, "loss": 0.81372374, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.1998291, "step": 10893, "time_per_iteration": 2.8958094120025635 }, { "auxiliary_loss_clip": 0.01425288, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.25939608, "balance_loss_mlp": 1.01568162, "epoch": 0.6549827145648579, "flos": 21509825829120.0, "grad_norm": 11.68303263453298, "language_loss": 0.70960438, "learning_rate": 1.123895622914766e-06, "loss": 0.73423064, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21630859, "step": 10894, "time_per_iteration": 4.355911731719971 }, { "auxiliary_loss_clip": 0.01435853, "auxiliary_loss_mlp": 0.01035992, "balance_loss_clip": 1.2665472, "balance_loss_mlp": 1.01482034, "epoch": 0.6550428378175259, "flos": 22602626939520.0, "grad_norm": 2.8524955377829397, "language_loss": 0.63340902, "learning_rate": 1.123545533127549e-06, "loss": 0.65812743, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21154785, "step": 10895, "time_per_iteration": 2.8494338989257812 }, { "auxiliary_loss_clip": 0.01415074, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.25025749, "balance_loss_mlp": 1.01699328, "epoch": 0.655102961070194, "flos": 12831628596480.0, "grad_norm": 1.9877234913840465, "language_loss": 0.80069274, "learning_rate": 1.1231954765755722e-06, "loss": 0.82521558, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20227051, "step": 10896, "time_per_iteration": 2.891592264175415 }, { "auxiliary_loss_clip": 0.01408105, "auxiliary_loss_mlp": 0.01034191, "balance_loss_clip": 1.24703228, "balance_loss_mlp": 1.01419997, "epoch": 0.6551630843228619, "flos": 24801983579520.0, "grad_norm": 1.4011637202386802, "language_loss": 0.71027893, "learning_rate": 1.1228454532721111e-06, "loss": 0.73470187, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1998291, "step": 10897, "time_per_iteration": 4.310504674911499 }, { "auxiliary_loss_clip": 0.01432882, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.26450944, "balance_loss_mlp": 1.01749635, "epoch": 0.6552232075755299, "flos": 16732430403840.0, "grad_norm": 1.7383432717582983, "language_loss": 0.76447999, "learning_rate": 1.1224954632304391e-06, "loss": 0.7891891, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20507812, "step": 10898, "time_per_iteration": 4.183542013168335 }, { "auxiliary_loss_clip": 0.01426866, "auxiliary_loss_mlp": 0.01040527, "balance_loss_clip": 1.26252079, "balance_loss_mlp": 1.02017808, "epoch": 0.6552833308281978, "flos": 22026224442240.0, "grad_norm": 2.627376176975439, "language_loss": 0.73955989, "learning_rate": 1.122145506463827e-06, "loss": 0.76423377, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20349121, "step": 10899, "time_per_iteration": 2.8368682861328125 }, { "auxiliary_loss_clip": 0.01426423, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.25954413, "balance_loss_mlp": 1.0157243, "epoch": 0.6553434540808658, "flos": 24874158314880.0, "grad_norm": 1.9327041120856, "language_loss": 0.57322764, "learning_rate": 1.1217955829855443e-06, "loss": 0.59785187, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20263672, "step": 10900, "time_per_iteration": 2.8571484088897705 }, { "auxiliary_loss_clip": 0.01430675, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 1.26408708, "balance_loss_mlp": 1.01181853, "epoch": 0.6554035773335337, "flos": 23231269198080.0, "grad_norm": 1.6844271728232898, "language_loss": 0.77172482, "learning_rate": 1.1214456928088622e-06, "loss": 0.7963624, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21276855, "step": 10901, "time_per_iteration": 2.8535618782043457 }, { "auxiliary_loss_clip": 0.01412654, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.24737358, "balance_loss_mlp": 1.00888324, "epoch": 0.6554637005862017, "flos": 22793786795520.0, "grad_norm": 2.0953725678229937, "language_loss": 0.74257326, "learning_rate": 1.1210958359470463e-06, "loss": 0.7669819, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.1932373, "step": 10902, "time_per_iteration": 2.844329357147217 }, { "auxiliary_loss_clip": 0.01415309, "auxiliary_loss_mlp": 0.0103523, "balance_loss_clip": 1.25371838, "balance_loss_mlp": 1.0147022, "epoch": 0.6555238238388696, "flos": 21517245976320.0, "grad_norm": 1.6853182192299556, "language_loss": 0.68586099, "learning_rate": 1.1207460124133645e-06, "loss": 0.71036637, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20532227, "step": 10903, "time_per_iteration": 2.8493540287017822 }, { "auxiliary_loss_clip": 0.01443678, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.27282429, "balance_loss_mlp": 1.01498818, "epoch": 0.6555839470915377, "flos": 30531902676480.0, "grad_norm": 2.1376801818427182, "language_loss": 0.67758071, "learning_rate": 1.1203962222210832e-06, "loss": 0.70237505, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.20776367, "step": 10904, "time_per_iteration": 2.9194397926330566 }, { "auxiliary_loss_clip": 0.01430339, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.26272333, "balance_loss_mlp": 1.01827705, "epoch": 0.6556440703442056, "flos": 24653245363200.0, "grad_norm": 1.8735676526084863, "language_loss": 0.91115963, "learning_rate": 1.120046465383464e-06, "loss": 0.93586159, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21582031, "step": 10905, "time_per_iteration": 2.8457276821136475 }, { "auxiliary_loss_clip": 0.01412486, "auxiliary_loss_mlp": 0.01033719, "balance_loss_clip": 1.25120163, "balance_loss_mlp": 1.01359653, "epoch": 0.6557041935968736, "flos": 23742962352000.0, "grad_norm": 2.038286875798592, "language_loss": 0.76773643, "learning_rate": 1.1196967419137721e-06, "loss": 0.79219848, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.20141602, "step": 10906, "time_per_iteration": 2.8617193698883057 }, { "auxiliary_loss_clip": 0.01436738, "auxiliary_loss_mlp": 0.01039012, "balance_loss_clip": 1.26837862, "balance_loss_mlp": 1.01841247, "epoch": 0.6557643168495415, "flos": 11108511169920.0, "grad_norm": 2.9150660672622064, "language_loss": 0.75986099, "learning_rate": 1.119347051825267e-06, "loss": 0.7846185, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20605469, "step": 10907, "time_per_iteration": 2.8545119762420654 }, { "auxiliary_loss_clip": 0.01425166, "auxiliary_loss_mlp": 0.01036979, "balance_loss_clip": 1.2580483, "balance_loss_mlp": 1.0156281, "epoch": 0.6558244401022095, "flos": 30203361048960.0, "grad_norm": 1.8012360576596527, "language_loss": 0.72607434, "learning_rate": 1.118997395131211e-06, "loss": 0.75069571, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21362305, "step": 10908, "time_per_iteration": 2.9261651039123535 }, { "auxiliary_loss_clip": 0.01425162, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.25962257, "balance_loss_mlp": 1.01657915, "epoch": 0.6558845633548775, "flos": 17940008868480.0, "grad_norm": 2.0085107427104156, "language_loss": 0.82917017, "learning_rate": 1.118647771844861e-06, "loss": 0.8537851, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19775391, "step": 10909, "time_per_iteration": 2.813429355621338 }, { "auxiliary_loss_clip": 0.01438362, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.26879048, "balance_loss_mlp": 1.01273286, "epoch": 0.6559446866075455, "flos": 21913121145600.0, "grad_norm": 2.5087385631891914, "language_loss": 0.64845985, "learning_rate": 1.1182981819794767e-06, "loss": 0.67317939, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20849609, "step": 10910, "time_per_iteration": 2.8423595428466797 }, { "auxiliary_loss_clip": 0.01447372, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.27401447, "balance_loss_mlp": 1.01455927, "epoch": 0.6560048098602135, "flos": 14133714744960.0, "grad_norm": 3.981958061259692, "language_loss": 0.77378529, "learning_rate": 1.117948625548313e-06, "loss": 0.79861957, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.21472168, "step": 10911, "time_per_iteration": 2.787630796432495 }, { "auxiliary_loss_clip": 0.01415227, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.25347352, "balance_loss_mlp": 1.01484418, "epoch": 0.6560649331128814, "flos": 18816964444800.0, "grad_norm": 1.5029185351344643, "language_loss": 0.75645769, "learning_rate": 1.1175991025646265e-06, "loss": 0.78095376, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1953125, "step": 10912, "time_per_iteration": 2.885342597961426 }, { "auxiliary_loss_clip": 0.01464841, "auxiliary_loss_mlp": 0.01042067, "balance_loss_clip": 1.29114854, "balance_loss_mlp": 1.02126503, "epoch": 0.6561250563655494, "flos": 17061876927360.0, "grad_norm": 1.5914334685633287, "language_loss": 0.77971268, "learning_rate": 1.1172496130416697e-06, "loss": 0.80478173, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 1.73925781, "router_z_loss_mlp": 0.20812988, "step": 10913, "time_per_iteration": 2.8213424682617188 }, { "auxiliary_loss_clip": 0.01416043, "auxiliary_loss_mlp": 0.01034669, "balance_loss_clip": 1.25336099, "balance_loss_mlp": 1.01452315, "epoch": 0.6561851796182173, "flos": 22647808512000.0, "grad_norm": 1.9468487033975106, "language_loss": 0.722067, "learning_rate": 1.1169001569926961e-06, "loss": 0.74657416, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20141602, "step": 10914, "time_per_iteration": 2.86246657371521 }, { "auxiliary_loss_clip": 0.01410355, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.24667466, "balance_loss_mlp": 1.01490366, "epoch": 0.6562453028708853, "flos": 19247117189760.0, "grad_norm": 2.0814886122969094, "language_loss": 0.74573159, "learning_rate": 1.116550734430958e-06, "loss": 0.77018428, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20019531, "step": 10915, "time_per_iteration": 2.8925211429595947 }, { "auxiliary_loss_clip": 0.01433635, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.26928699, "balance_loss_mlp": 1.01489735, "epoch": 0.6563054261235532, "flos": 23810748341760.0, "grad_norm": 1.5846274575889747, "language_loss": 0.79955143, "learning_rate": 1.1162013453697042e-06, "loss": 0.82424235, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20568848, "step": 10916, "time_per_iteration": 2.8720598220825195 }, { "auxiliary_loss_clip": 0.0142823, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.26085591, "balance_loss_mlp": 1.0157001, "epoch": 0.6563655493762213, "flos": 19248157820160.0, "grad_norm": 1.8326744384420888, "language_loss": 0.77003074, "learning_rate": 1.1158519898221831e-06, "loss": 0.79466724, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19714355, "step": 10917, "time_per_iteration": 2.8184945583343506 }, { "auxiliary_loss_clip": 0.01408797, "auxiliary_loss_mlp": 0.0103179, "balance_loss_clip": 1.24483919, "balance_loss_mlp": 1.01152456, "epoch": 0.6564256726288892, "flos": 25567328937600.0, "grad_norm": 2.5769072291713475, "language_loss": 0.71373308, "learning_rate": 1.1155026678016445e-06, "loss": 0.73813903, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20263672, "step": 10918, "time_per_iteration": 2.874936819076538 }, { "auxiliary_loss_clip": 0.01404562, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.24620628, "balance_loss_mlp": 1.01165175, "epoch": 0.6564857958815572, "flos": 22210869047040.0, "grad_norm": 1.490278212800338, "language_loss": 0.76604438, "learning_rate": 1.115153379321332e-06, "loss": 0.79039502, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18835449, "step": 10919, "time_per_iteration": 4.2468461990356445 }, { "auxiliary_loss_clip": 0.01199112, "auxiliary_loss_mlp": 0.01033359, "balance_loss_clip": 1.10669446, "balance_loss_mlp": 1.01237857, "epoch": 0.6565459191342251, "flos": 58148040245760.0, "grad_norm": 0.7182174874767043, "language_loss": 0.53039926, "learning_rate": 1.1148041243944931e-06, "loss": 0.552724, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.20996094, "step": 10920, "time_per_iteration": 3.393949508666992 }, { "auxiliary_loss_clip": 0.01418591, "auxiliary_loss_mlp": 0.01036422, "balance_loss_clip": 1.25648904, "balance_loss_mlp": 1.01540589, "epoch": 0.6566060423868931, "flos": 30821687493120.0, "grad_norm": 1.5043208159886705, "language_loss": 0.65994763, "learning_rate": 1.1144549030343697e-06, "loss": 0.68449777, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.21020508, "step": 10921, "time_per_iteration": 2.999271869659424 }, { "auxiliary_loss_clip": 0.01422168, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.25773418, "balance_loss_mlp": 1.00969529, "epoch": 0.6566661656395612, "flos": 23377790419200.0, "grad_norm": 7.77535786273537, "language_loss": 0.82160378, "learning_rate": 1.114105715254205e-06, "loss": 0.84613991, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.21728516, "step": 10922, "time_per_iteration": 2.890613317489624 }, { "auxiliary_loss_clip": 0.0143226, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.26563561, "balance_loss_mlp": 1.01478922, "epoch": 0.6567262888922291, "flos": 25745684515200.0, "grad_norm": 1.806614570048082, "language_loss": 0.71799135, "learning_rate": 1.1137565610672414e-06, "loss": 0.74266481, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20275879, "step": 10923, "time_per_iteration": 2.8676459789276123 }, { "auxiliary_loss_clip": 0.01426666, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.2615521, "balance_loss_mlp": 1.01524556, "epoch": 0.6567864121448971, "flos": 17131246485120.0, "grad_norm": 2.1118114291103667, "language_loss": 0.81853962, "learning_rate": 1.1134074404867169e-06, "loss": 0.84316492, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20593262, "step": 10924, "time_per_iteration": 2.847930431365967 }, { "auxiliary_loss_clip": 0.01434463, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.26806974, "balance_loss_mlp": 1.01163089, "epoch": 0.656846535397565, "flos": 22429565003520.0, "grad_norm": 1.6692573697672237, "language_loss": 0.73308015, "learning_rate": 1.1130583535258717e-06, "loss": 0.75773126, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19030762, "step": 10925, "time_per_iteration": 2.864029884338379 }, { "auxiliary_loss_clip": 0.01424045, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.25867867, "balance_loss_mlp": 1.01338983, "epoch": 0.656906658650233, "flos": 17711449545600.0, "grad_norm": 2.409249772435587, "language_loss": 0.73184466, "learning_rate": 1.112709300197942e-06, "loss": 0.75641537, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19641113, "step": 10926, "time_per_iteration": 2.797229051589966 }, { "auxiliary_loss_clip": 0.01438318, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.26833868, "balance_loss_mlp": 1.01377141, "epoch": 0.6569667819029009, "flos": 21184134624000.0, "grad_norm": 2.719002107901623, "language_loss": 0.73225021, "learning_rate": 1.1123602805161656e-06, "loss": 0.75697863, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.20739746, "step": 10927, "time_per_iteration": 2.910715341567993 }, { "auxiliary_loss_clip": 0.01193515, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.10121405, "balance_loss_mlp": 1.01814997, "epoch": 0.6570269051555689, "flos": 68795107251840.0, "grad_norm": 0.7321475560030118, "language_loss": 0.64523602, "learning_rate": 1.112011294493775e-06, "loss": 0.66756248, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.20996094, "step": 10928, "time_per_iteration": 3.356295347213745 }, { "auxiliary_loss_clip": 0.01422543, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.25977576, "balance_loss_mlp": 1.01436615, "epoch": 0.6570870284082369, "flos": 26329507159680.0, "grad_norm": 2.0556909202252096, "language_loss": 0.78022969, "learning_rate": 1.1116623421440063e-06, "loss": 0.80480456, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20568848, "step": 10929, "time_per_iteration": 4.334246635437012 }, { "auxiliary_loss_clip": 0.01421912, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.25808167, "balance_loss_mlp": 1.01223397, "epoch": 0.6571471516609049, "flos": 26185293423360.0, "grad_norm": 1.9816009271745454, "language_loss": 0.66150987, "learning_rate": 1.1113134234800895e-06, "loss": 0.6860646, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.21325684, "step": 10930, "time_per_iteration": 2.8868889808654785 }, { "auxiliary_loss_clip": 0.01416641, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.25208902, "balance_loss_mlp": 1.01328564, "epoch": 0.6572072749135728, "flos": 20386004768640.0, "grad_norm": 1.6267753237517748, "language_loss": 0.71718812, "learning_rate": 1.110964538515258e-06, "loss": 0.74170053, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.2130127, "step": 10931, "time_per_iteration": 2.8707470893859863 }, { "auxiliary_loss_clip": 0.01436546, "auxiliary_loss_mlp": 0.01035925, "balance_loss_clip": 1.26904511, "balance_loss_mlp": 1.0160048, "epoch": 0.6572673981662408, "flos": 17137580757120.0, "grad_norm": 2.092471576048599, "language_loss": 0.6975342, "learning_rate": 1.1106156872627393e-06, "loss": 0.72225893, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19921875, "step": 10932, "time_per_iteration": 2.91862154006958 }, { "auxiliary_loss_clip": 0.01429778, "auxiliary_loss_mlp": 0.01037166, "balance_loss_clip": 1.26409256, "balance_loss_mlp": 1.01675725, "epoch": 0.6573275214189087, "flos": 41288344922880.0, "grad_norm": 1.6028697125647475, "language_loss": 0.8049227, "learning_rate": 1.1102668697357626e-06, "loss": 0.82959211, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20422363, "step": 10933, "time_per_iteration": 5.79594087600708 }, { "auxiliary_loss_clip": 0.01438626, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.27178836, "balance_loss_mlp": 1.01659513, "epoch": 0.6573876446715767, "flos": 22899605679360.0, "grad_norm": 2.1267757225063137, "language_loss": 0.74354184, "learning_rate": 1.1099180859475571e-06, "loss": 0.7682997, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20568848, "step": 10934, "time_per_iteration": 2.8708555698394775 }, { "auxiliary_loss_clip": 0.01420235, "auxiliary_loss_mlp": 0.01035115, "balance_loss_clip": 1.25732684, "balance_loss_mlp": 1.0150522, "epoch": 0.6574477679242448, "flos": 44033808026880.0, "grad_norm": 1.7868778836997965, "language_loss": 0.76729262, "learning_rate": 1.1095693359113454e-06, "loss": 0.79184616, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20068359, "step": 10935, "time_per_iteration": 3.1085095405578613 }, { "auxiliary_loss_clip": 0.01436045, "auxiliary_loss_mlp": 0.0104108, "balance_loss_clip": 1.26868474, "balance_loss_mlp": 1.01966977, "epoch": 0.6575078911769127, "flos": 24582201747840.0, "grad_norm": 1.740009048185618, "language_loss": 0.7892909, "learning_rate": 1.1092206196403538e-06, "loss": 0.81406212, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21411133, "step": 10936, "time_per_iteration": 2.8994131088256836 }, { "auxiliary_loss_clip": 0.01408279, "auxiliary_loss_mlp": 0.01038474, "balance_loss_clip": 1.2456224, "balance_loss_mlp": 1.01763642, "epoch": 0.6575680144295807, "flos": 20934056759040.0, "grad_norm": 1.8860349856068197, "language_loss": 0.70102322, "learning_rate": 1.1088719371478056e-06, "loss": 0.72549081, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20849609, "step": 10937, "time_per_iteration": 2.8665032386779785 }, { "auxiliary_loss_clip": 0.01428182, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.26272452, "balance_loss_mlp": 1.0197643, "epoch": 0.6576281376822486, "flos": 10932734545920.0, "grad_norm": 4.376563727804312, "language_loss": 0.69918716, "learning_rate": 1.1085232884469236e-06, "loss": 0.7238723, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20556641, "step": 10938, "time_per_iteration": 2.809796094894409 }, { "auxiliary_loss_clip": 0.01423112, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.25768471, "balance_loss_mlp": 1.01636291, "epoch": 0.6576882609349166, "flos": 19290986663040.0, "grad_norm": 2.633202177144158, "language_loss": 0.7171613, "learning_rate": 1.108174673550927e-06, "loss": 0.74176276, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20666504, "step": 10939, "time_per_iteration": 2.819267749786377 }, { "auxiliary_loss_clip": 0.01426554, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.25872517, "balance_loss_mlp": 1.01299095, "epoch": 0.6577483841875845, "flos": 20227719899520.0, "grad_norm": 2.1322291517574286, "language_loss": 0.78606105, "learning_rate": 1.107826092473037e-06, "loss": 0.81066906, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.21264648, "step": 10940, "time_per_iteration": 2.8114871978759766 }, { "auxiliary_loss_clip": 0.01433101, "auxiliary_loss_mlp": 0.01039298, "balance_loss_clip": 1.26399517, "balance_loss_mlp": 1.01916349, "epoch": 0.6578085074402525, "flos": 34764413247360.0, "grad_norm": 1.8988660825645325, "language_loss": 0.69356191, "learning_rate": 1.107477545226471e-06, "loss": 0.71828592, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20141602, "step": 10941, "time_per_iteration": 2.9528722763061523 }, { "auxiliary_loss_clip": 0.01428108, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.26247573, "balance_loss_mlp": 1.01582789, "epoch": 0.6578686306929205, "flos": 23479853984640.0, "grad_norm": 1.8335024656621393, "language_loss": 0.69430077, "learning_rate": 1.1071290318244448e-06, "loss": 0.7189368, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19677734, "step": 10942, "time_per_iteration": 2.8631181716918945 }, { "auxiliary_loss_clip": 0.01467916, "auxiliary_loss_mlp": 0.01047891, "balance_loss_clip": 1.29319787, "balance_loss_mlp": 1.02499056, "epoch": 0.6579287539455885, "flos": 18086077641600.0, "grad_norm": 15.467182501508834, "language_loss": 0.72393858, "learning_rate": 1.1067805522801753e-06, "loss": 0.74909663, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 1.74511719, "router_z_loss_mlp": 0.22900391, "step": 10943, "time_per_iteration": 2.84393048286438 }, { "auxiliary_loss_clip": 0.01424361, "auxiliary_loss_mlp": 0.01042484, "balance_loss_clip": 1.25995493, "balance_loss_mlp": 1.02064514, "epoch": 0.6579888771982564, "flos": 28674027676800.0, "grad_norm": 3.3885259865066084, "language_loss": 0.59427404, "learning_rate": 1.1064321066068778e-06, "loss": 0.6189425, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.21838379, "step": 10944, "time_per_iteration": 2.9354195594787598 }, { "auxiliary_loss_clip": 0.0145017, "auxiliary_loss_mlp": 0.01043919, "balance_loss_clip": 1.27724457, "balance_loss_mlp": 1.02253318, "epoch": 0.6580490004509244, "flos": 25057536065280.0, "grad_norm": 1.7617976056341416, "language_loss": 0.72972548, "learning_rate": 1.1060836948177646e-06, "loss": 0.75466633, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 1.7265625, "router_z_loss_mlp": 0.21398926, "step": 10945, "time_per_iteration": 2.8798065185546875 }, { "auxiliary_loss_clip": 0.01422608, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.25793529, "balance_loss_mlp": 1.01571155, "epoch": 0.6581091237035923, "flos": 43524693826560.0, "grad_norm": 1.7608569874731699, "language_loss": 0.7111398, "learning_rate": 1.105735316926046e-06, "loss": 0.73572665, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20349121, "step": 10946, "time_per_iteration": 3.0531089305877686 }, { "auxiliary_loss_clip": 0.01433517, "auxiliary_loss_mlp": 0.01049609, "balance_loss_clip": 1.26891041, "balance_loss_mlp": 1.02915239, "epoch": 0.6581692469562603, "flos": 22424633320320.0, "grad_norm": 2.1568307659182904, "language_loss": 0.83176839, "learning_rate": 1.105386972944934e-06, "loss": 0.85659963, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20471191, "step": 10947, "time_per_iteration": 2.8723208904266357 }, { "auxiliary_loss_clip": 0.01441986, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.27377272, "balance_loss_mlp": 1.01855326, "epoch": 0.6582293702089284, "flos": 24869588590080.0, "grad_norm": 1.6430278450456721, "language_loss": 0.7779268, "learning_rate": 1.1050386628876385e-06, "loss": 0.80272931, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.19702148, "step": 10948, "time_per_iteration": 2.8501296043395996 }, { "auxiliary_loss_clip": 0.01423887, "auxiliary_loss_mlp": 0.01040978, "balance_loss_clip": 1.25942314, "balance_loss_mlp": 1.02109396, "epoch": 0.6582894934615963, "flos": 23050108442880.0, "grad_norm": 1.8093567297954019, "language_loss": 0.79561853, "learning_rate": 1.1046903867673655e-06, "loss": 0.8202672, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19885254, "step": 10949, "time_per_iteration": 2.8416035175323486 }, { "auxiliary_loss_clip": 0.0118973, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.09778976, "balance_loss_mlp": 1.01488972, "epoch": 0.6583496167142643, "flos": 72585022757760.0, "grad_norm": 0.73537135419588, "language_loss": 0.61849815, "learning_rate": 1.104342144597323e-06, "loss": 0.64079034, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.24511719, "step": 10950, "time_per_iteration": 3.4317963123321533 }, { "auxiliary_loss_clip": 0.01416749, "auxiliary_loss_mlp": 0.01039538, "balance_loss_clip": 1.2545892, "balance_loss_mlp": 1.01992798, "epoch": 0.6584097399669322, "flos": 13086638144640.0, "grad_norm": 1.9543701806870706, "language_loss": 0.68041098, "learning_rate": 1.1039939363907178e-06, "loss": 0.70497382, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19616699, "step": 10951, "time_per_iteration": 2.8191659450531006 }, { "auxiliary_loss_clip": 0.01414353, "auxiliary_loss_mlp": 0.0103885, "balance_loss_clip": 1.25105596, "balance_loss_mlp": 1.01771438, "epoch": 0.6584698632196002, "flos": 28704595178880.0, "grad_norm": 1.3312035394806727, "language_loss": 0.77161658, "learning_rate": 1.1036457621607504e-06, "loss": 0.79614866, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.21130371, "step": 10952, "time_per_iteration": 2.94565486907959 }, { "auxiliary_loss_clip": 0.01427658, "auxiliary_loss_mlp": 0.01044636, "balance_loss_clip": 1.26346874, "balance_loss_mlp": 1.02394104, "epoch": 0.6585299864722681, "flos": 14327363064960.0, "grad_norm": 1.700846415542881, "language_loss": 0.74742758, "learning_rate": 1.1032976219206257e-06, "loss": 0.77215058, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.20690918, "step": 10953, "time_per_iteration": 4.2381911277771 }, { "auxiliary_loss_clip": 0.01428458, "auxiliary_loss_mlp": 0.01041229, "balance_loss_clip": 1.26277781, "balance_loss_mlp": 1.02064121, "epoch": 0.6585901097249361, "flos": 26809365957120.0, "grad_norm": 1.9506701604816052, "language_loss": 0.7932055, "learning_rate": 1.102949515683546e-06, "loss": 0.81790239, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20581055, "step": 10954, "time_per_iteration": 2.9020566940307617 }, { "auxiliary_loss_clip": 0.01423411, "auxiliary_loss_mlp": 0.01043206, "balance_loss_clip": 1.25811172, "balance_loss_mlp": 1.02242744, "epoch": 0.658650232977604, "flos": 18742120266240.0, "grad_norm": 4.458385453844709, "language_loss": 0.70659375, "learning_rate": 1.1026014434627096e-06, "loss": 0.73125988, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20751953, "step": 10955, "time_per_iteration": 2.80837345123291 }, { "auxiliary_loss_clip": 0.01411492, "auxiliary_loss_mlp": 0.01047273, "balance_loss_clip": 1.2508738, "balance_loss_mlp": 1.02761495, "epoch": 0.6587103562302721, "flos": 24764041175040.0, "grad_norm": 2.349923664367933, "language_loss": 0.81281447, "learning_rate": 1.1022534052713172e-06, "loss": 0.83740211, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19641113, "step": 10956, "time_per_iteration": 2.8610870838165283 }, { "auxiliary_loss_clip": 0.01423502, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.25892186, "balance_loss_mlp": 1.02156889, "epoch": 0.65877047948294, "flos": 22356304392960.0, "grad_norm": 2.1196313467990384, "language_loss": 0.82306659, "learning_rate": 1.1019054011225648e-06, "loss": 0.84771878, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20166016, "step": 10957, "time_per_iteration": 2.8464512825012207 }, { "auxiliary_loss_clip": 0.01418978, "auxiliary_loss_mlp": 0.0103811, "balance_loss_clip": 1.25607157, "balance_loss_mlp": 1.01934624, "epoch": 0.658830602735608, "flos": 45195164288640.0, "grad_norm": 5.164176164756197, "language_loss": 0.76823962, "learning_rate": 1.1015574310296506e-06, "loss": 0.7928105, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18774414, "step": 10958, "time_per_iteration": 3.078305244445801 }, { "auxiliary_loss_clip": 0.01422619, "auxiliary_loss_mlp": 0.0104126, "balance_loss_clip": 1.25912321, "balance_loss_mlp": 1.02094674, "epoch": 0.6588907259882759, "flos": 19911168144000.0, "grad_norm": 1.8148091744974095, "language_loss": 0.75646508, "learning_rate": 1.1012094950057678e-06, "loss": 0.78110385, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20324707, "step": 10959, "time_per_iteration": 2.843550682067871 }, { "auxiliary_loss_clip": 0.01424443, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.26029468, "balance_loss_mlp": 1.01580667, "epoch": 0.6589508492409439, "flos": 24144357386880.0, "grad_norm": 1.900180164673239, "language_loss": 0.65939224, "learning_rate": 1.1008615930641107e-06, "loss": 0.68398887, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19421387, "step": 10960, "time_per_iteration": 2.868428945541382 }, { "auxiliary_loss_clip": 0.01449388, "auxiliary_loss_mlp": 0.01042652, "balance_loss_clip": 1.27914333, "balance_loss_mlp": 1.02157569, "epoch": 0.659010972493612, "flos": 18231920190720.0, "grad_norm": 2.09456280934446, "language_loss": 0.82757664, "learning_rate": 1.1005137252178734e-06, "loss": 0.85249704, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21081543, "step": 10961, "time_per_iteration": 2.814931869506836 }, { "auxiliary_loss_clip": 0.01438708, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.2736609, "balance_loss_mlp": 1.01536381, "epoch": 0.6590710957462799, "flos": 27611522599680.0, "grad_norm": 1.939296058666974, "language_loss": 0.75013566, "learning_rate": 1.1001658914802453e-06, "loss": 0.77486676, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19042969, "step": 10962, "time_per_iteration": 2.9552595615386963 }, { "auxiliary_loss_clip": 0.0144326, "auxiliary_loss_mlp": 0.0104477, "balance_loss_clip": 1.27488422, "balance_loss_mlp": 1.02482593, "epoch": 0.6591312189989479, "flos": 20312608423680.0, "grad_norm": 1.790726759614983, "language_loss": 0.81044823, "learning_rate": 1.0998180918644165e-06, "loss": 0.83532852, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19946289, "step": 10963, "time_per_iteration": 2.9176297187805176 }, { "auxiliary_loss_clip": 0.01420539, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.25792909, "balance_loss_mlp": 1.01957393, "epoch": 0.6591913422516158, "flos": 12320614114560.0, "grad_norm": 1.684615616476281, "language_loss": 0.7916559, "learning_rate": 1.0994703263835754e-06, "loss": 0.81625223, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19519043, "step": 10964, "time_per_iteration": 4.3206446170806885 }, { "auxiliary_loss_clip": 0.01442578, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.27364123, "balance_loss_mlp": 1.01650786, "epoch": 0.6592514655042838, "flos": 25895508606720.0, "grad_norm": 1.6359464937062185, "language_loss": 0.74770415, "learning_rate": 1.0991225950509106e-06, "loss": 0.77248991, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.19494629, "step": 10965, "time_per_iteration": 2.874070882797241 }, { "auxiliary_loss_clip": 0.01447937, "auxiliary_loss_mlp": 0.01043582, "balance_loss_clip": 1.2761122, "balance_loss_mlp": 1.02330494, "epoch": 0.6593115887569517, "flos": 14071584355200.0, "grad_norm": 1.9832818141541908, "language_loss": 0.74094224, "learning_rate": 1.0987748978796067e-06, "loss": 0.76585746, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20288086, "step": 10966, "time_per_iteration": 2.8877007961273193 }, { "auxiliary_loss_clip": 0.01427093, "auxiliary_loss_mlp": 0.0103919, "balance_loss_clip": 1.26167679, "balance_loss_mlp": 1.01874578, "epoch": 0.6593717120096197, "flos": 24728768213760.0, "grad_norm": 1.593173671543881, "language_loss": 0.77654445, "learning_rate": 1.0984272348828487e-06, "loss": 0.80120718, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.2043457, "step": 10967, "time_per_iteration": 4.295374155044556 }, { "auxiliary_loss_clip": 0.01194011, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 1.09915709, "balance_loss_mlp": 1.00260377, "epoch": 0.6594318352622877, "flos": 55588913786880.0, "grad_norm": 0.7844241022478061, "language_loss": 0.48529333, "learning_rate": 1.0980796060738221e-06, "loss": 0.50751507, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 0.94921875, "router_z_loss_mlp": 0.25585938, "step": 10968, "time_per_iteration": 4.737373352050781 }, { "auxiliary_loss_clip": 0.01441304, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.27218175, "balance_loss_mlp": 1.01795852, "epoch": 0.6594919585149557, "flos": 17465805671040.0, "grad_norm": 1.8681780279323057, "language_loss": 0.79687047, "learning_rate": 1.0977320114657058e-06, "loss": 0.82167459, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.21142578, "step": 10969, "time_per_iteration": 2.886795997619629 }, { "auxiliary_loss_clip": 0.01429979, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.26461673, "balance_loss_mlp": 1.01926613, "epoch": 0.6595520817676236, "flos": 18232327393920.0, "grad_norm": 3.2074024256710016, "language_loss": 0.66433144, "learning_rate": 1.0973844510716817e-06, "loss": 0.68902194, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19824219, "step": 10970, "time_per_iteration": 2.8572988510131836 }, { "auxiliary_loss_clip": 0.01434292, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.26664495, "balance_loss_mlp": 1.01619589, "epoch": 0.6596122050202916, "flos": 22209783171840.0, "grad_norm": 1.669875231367687, "language_loss": 0.7733897, "learning_rate": 1.0970369249049308e-06, "loss": 0.79809344, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19885254, "step": 10971, "time_per_iteration": 2.8899338245391846 }, { "auxiliary_loss_clip": 0.0143446, "auxiliary_loss_mlp": 0.0103776, "balance_loss_clip": 1.26778173, "balance_loss_mlp": 1.0189724, "epoch": 0.6596723282729595, "flos": 14182018208640.0, "grad_norm": 2.500499564055094, "language_loss": 0.71728885, "learning_rate": 1.096689432978629e-06, "loss": 0.74201107, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18786621, "step": 10972, "time_per_iteration": 2.7915186882019043 }, { "auxiliary_loss_clip": 0.01425211, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.26029265, "balance_loss_mlp": 1.01071823, "epoch": 0.6597324515256275, "flos": 30564913397760.0, "grad_norm": 1.9122364836382735, "language_loss": 0.56761861, "learning_rate": 1.0963419753059556e-06, "loss": 0.59217072, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19287109, "step": 10973, "time_per_iteration": 2.9353761672973633 }, { "auxiliary_loss_clip": 0.0145975, "auxiliary_loss_mlp": 0.0103994, "balance_loss_clip": 1.28667617, "balance_loss_mlp": 1.02024639, "epoch": 0.6597925747782956, "flos": 17648911952640.0, "grad_norm": 1.8973052244687596, "language_loss": 0.79267734, "learning_rate": 1.0959945519000839e-06, "loss": 0.81767422, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 1.72949219, "router_z_loss_mlp": 0.19677734, "step": 10974, "time_per_iteration": 2.7895748615264893 }, { "auxiliary_loss_clip": 0.01427459, "auxiliary_loss_mlp": 0.01033654, "balance_loss_clip": 1.26007366, "balance_loss_mlp": 1.01350713, "epoch": 0.6598526980309635, "flos": 22829059756800.0, "grad_norm": 2.7104091843030234, "language_loss": 0.69063509, "learning_rate": 1.0956471627741906e-06, "loss": 0.71524626, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20141602, "step": 10975, "time_per_iteration": 2.8518686294555664 }, { "auxiliary_loss_clip": 0.01442859, "auxiliary_loss_mlp": 0.01032744, "balance_loss_clip": 1.27613187, "balance_loss_mlp": 1.01369405, "epoch": 0.6599128212836315, "flos": 21077591823360.0, "grad_norm": 1.6553419306269885, "language_loss": 0.71289349, "learning_rate": 1.0952998079414464e-06, "loss": 0.7376495, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19055176, "step": 10976, "time_per_iteration": 2.8385961055755615 }, { "auxiliary_loss_clip": 0.01421754, "auxiliary_loss_mlp": 0.01032387, "balance_loss_clip": 1.25875783, "balance_loss_mlp": 1.0129559, "epoch": 0.6599729445362994, "flos": 22173605314560.0, "grad_norm": 1.9051431461042905, "language_loss": 0.68047488, "learning_rate": 1.0949524874150243e-06, "loss": 0.70501631, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19421387, "step": 10977, "time_per_iteration": 2.9098968505859375 }, { "auxiliary_loss_clip": 0.01442846, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.27229166, "balance_loss_mlp": 1.01674151, "epoch": 0.6600330677889674, "flos": 18159112028160.0, "grad_norm": 2.455265698195825, "language_loss": 0.82163179, "learning_rate": 1.0946052012080952e-06, "loss": 0.84644234, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.21459961, "step": 10978, "time_per_iteration": 2.8498494625091553 }, { "auxiliary_loss_clip": 0.01434774, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.2664634, "balance_loss_mlp": 1.01726174, "epoch": 0.6600931910416353, "flos": 18159157272960.0, "grad_norm": 2.178262625761936, "language_loss": 0.68207771, "learning_rate": 1.0942579493338278e-06, "loss": 0.7068013, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20324707, "step": 10979, "time_per_iteration": 2.808260679244995 }, { "auxiliary_loss_clip": 0.01440701, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.27222729, "balance_loss_mlp": 1.01334262, "epoch": 0.6601533142943034, "flos": 17429673058560.0, "grad_norm": 2.1822376973683464, "language_loss": 0.74279439, "learning_rate": 1.0939107318053889e-06, "loss": 0.76753241, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19750977, "step": 10980, "time_per_iteration": 2.816565990447998 }, { "auxiliary_loss_clip": 0.01414933, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.25301838, "balance_loss_mlp": 1.01531386, "epoch": 0.6602134375469713, "flos": 28231070653440.0, "grad_norm": 1.641802742037147, "language_loss": 0.74365091, "learning_rate": 1.0935635486359459e-06, "loss": 0.76813722, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18371582, "step": 10981, "time_per_iteration": 2.8924028873443604 }, { "auxiliary_loss_clip": 0.01434561, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.26688528, "balance_loss_mlp": 1.01871002, "epoch": 0.6602735607996393, "flos": 29428423793280.0, "grad_norm": 1.9165924845988083, "language_loss": 0.69480109, "learning_rate": 1.0932163998386647e-06, "loss": 0.71953398, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20019531, "step": 10982, "time_per_iteration": 2.912083864212036 }, { "auxiliary_loss_clip": 0.01434881, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.26956725, "balance_loss_mlp": 1.0164063, "epoch": 0.6603336840523072, "flos": 18597227857920.0, "grad_norm": 1.6036143411135828, "language_loss": 0.70121622, "learning_rate": 1.0928692854267075e-06, "loss": 0.7259286, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19946289, "step": 10983, "time_per_iteration": 2.8398022651672363 }, { "auxiliary_loss_clip": 0.01450008, "auxiliary_loss_mlp": 0.01037503, "balance_loss_clip": 1.28118396, "balance_loss_mlp": 1.01679611, "epoch": 0.6603938073049752, "flos": 33267728638080.0, "grad_norm": 1.618533785857711, "language_loss": 0.70960331, "learning_rate": 1.092522205413239e-06, "loss": 0.73447841, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.20703125, "step": 10984, "time_per_iteration": 2.949765205383301 }, { "auxiliary_loss_clip": 0.01424403, "auxiliary_loss_mlp": 0.01041583, "balance_loss_clip": 1.26177406, "balance_loss_mlp": 1.0210197, "epoch": 0.6604539305576431, "flos": 17393223732480.0, "grad_norm": 6.6697318406176525, "language_loss": 0.84982377, "learning_rate": 1.0921751598114193e-06, "loss": 0.87448364, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20568848, "step": 10985, "time_per_iteration": 2.8638951778411865 }, { "auxiliary_loss_clip": 0.01433326, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.26601398, "balance_loss_mlp": 1.0227797, "epoch": 0.6605140538103111, "flos": 21260924328960.0, "grad_norm": 1.966282765104304, "language_loss": 0.74863005, "learning_rate": 1.0918281486344077e-06, "loss": 0.77338564, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19433594, "step": 10986, "time_per_iteration": 2.8699233531951904 }, { "auxiliary_loss_clip": 0.01425444, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.26223433, "balance_loss_mlp": 1.01449454, "epoch": 0.6605741770629792, "flos": 13889744928000.0, "grad_norm": 1.7563460733274847, "language_loss": 0.79782033, "learning_rate": 1.0914811718953636e-06, "loss": 0.8224172, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19750977, "step": 10987, "time_per_iteration": 2.8451921939849854 }, { "auxiliary_loss_clip": 0.01195976, "auxiliary_loss_mlp": 0.01017531, "balance_loss_clip": 1.10221434, "balance_loss_mlp": 0.99902993, "epoch": 0.6606343003156471, "flos": 69351393795840.0, "grad_norm": 0.8164921363868355, "language_loss": 0.54237878, "learning_rate": 1.0911342296074454e-06, "loss": 0.5645138, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.18457031, "step": 10988, "time_per_iteration": 4.929902076721191 }, { "auxiliary_loss_clip": 0.01426041, "auxiliary_loss_mlp": 0.01039725, "balance_loss_clip": 1.26223195, "balance_loss_mlp": 1.02038968, "epoch": 0.6606944235683151, "flos": 27284021602560.0, "grad_norm": 1.6003259157535057, "language_loss": 0.78083026, "learning_rate": 1.0907873217838077e-06, "loss": 0.80548793, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.1932373, "step": 10989, "time_per_iteration": 2.9416205883026123 }, { "auxiliary_loss_clip": 0.01421637, "auxiliary_loss_mlp": 0.01039019, "balance_loss_clip": 1.25896358, "balance_loss_mlp": 1.01936162, "epoch": 0.660754546820983, "flos": 13780985132160.0, "grad_norm": 2.1318961361213535, "language_loss": 0.77914625, "learning_rate": 1.0904404484376064e-06, "loss": 0.80375278, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19677734, "step": 10990, "time_per_iteration": 2.8615665435791016 }, { "auxiliary_loss_clip": 0.01436758, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.2688421, "balance_loss_mlp": 1.01376879, "epoch": 0.660814670073651, "flos": 15713523331200.0, "grad_norm": 1.7762384470734154, "language_loss": 0.61083078, "learning_rate": 1.0900936095819937e-06, "loss": 0.63554204, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20593262, "step": 10991, "time_per_iteration": 2.8613767623901367 }, { "auxiliary_loss_clip": 0.01425824, "auxiliary_loss_mlp": 0.01037601, "balance_loss_clip": 1.25758028, "balance_loss_mlp": 1.01753831, "epoch": 0.6608747933263189, "flos": 20859755518080.0, "grad_norm": 2.3980015223955466, "language_loss": 0.69140971, "learning_rate": 1.0897468052301234e-06, "loss": 0.71604395, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20056152, "step": 10992, "time_per_iteration": 2.92490816116333 }, { "auxiliary_loss_clip": 0.01432662, "auxiliary_loss_mlp": 0.01034563, "balance_loss_clip": 1.26317573, "balance_loss_mlp": 1.0145601, "epoch": 0.660934916578987, "flos": 20642054947200.0, "grad_norm": 2.8373544852623107, "language_loss": 0.88522524, "learning_rate": 1.0894000353951444e-06, "loss": 0.90989745, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.1998291, "step": 10993, "time_per_iteration": 2.870835065841675 }, { "auxiliary_loss_clip": 0.01450593, "auxiliary_loss_mlp": 0.0103634, "balance_loss_clip": 1.27811933, "balance_loss_mlp": 1.0150131, "epoch": 0.6609950398316549, "flos": 25123059815040.0, "grad_norm": 1.6415001594437124, "language_loss": 0.67099082, "learning_rate": 1.0890533000902078e-06, "loss": 0.69586021, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.21325684, "step": 10994, "time_per_iteration": 2.8836066722869873 }, { "auxiliary_loss_clip": 0.0142273, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.25885296, "balance_loss_mlp": 1.01633215, "epoch": 0.6610551630843229, "flos": 18670443223680.0, "grad_norm": 1.625882352999474, "language_loss": 0.78141105, "learning_rate": 1.0887065993284626e-06, "loss": 0.80601037, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20874023, "step": 10995, "time_per_iteration": 2.866201400756836 }, { "auxiliary_loss_clip": 0.01425038, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.25981593, "balance_loss_mlp": 1.01239848, "epoch": 0.6611152863369908, "flos": 23268578175360.0, "grad_norm": 1.83441985039408, "language_loss": 0.75329894, "learning_rate": 1.088359933123053e-06, "loss": 0.77785861, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.18530273, "step": 10996, "time_per_iteration": 2.8797202110290527 }, { "auxiliary_loss_clip": 0.01436584, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.27020442, "balance_loss_mlp": 1.01497638, "epoch": 0.6611754095896588, "flos": 22168809365760.0, "grad_norm": 1.7580351886437768, "language_loss": 0.6954245, "learning_rate": 1.088013301487126e-06, "loss": 0.72012925, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.18920898, "step": 10997, "time_per_iteration": 2.876394748687744 }, { "auxiliary_loss_clip": 0.0143537, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.26589465, "balance_loss_mlp": 1.01142073, "epoch": 0.6612355328423267, "flos": 13999319130240.0, "grad_norm": 2.0641019560732836, "language_loss": 0.69999003, "learning_rate": 1.0876667044338269e-06, "loss": 0.724657, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.19897461, "step": 10998, "time_per_iteration": 2.9563357830047607 }, { "auxiliary_loss_clip": 0.01200081, "auxiliary_loss_mlp": 0.01022255, "balance_loss_clip": 1.10787654, "balance_loss_mlp": 0.99688685, "epoch": 0.6612956560949947, "flos": 61482167187840.0, "grad_norm": 0.6543628429404955, "language_loss": 0.51178765, "learning_rate": 1.087320141976297e-06, "loss": 0.53401101, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.25390625, "step": 10999, "time_per_iteration": 4.768325328826904 }, { "auxiliary_loss_clip": 0.01434388, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.2638135, "balance_loss_mlp": 1.01486838, "epoch": 0.6613557793476627, "flos": 21626684444160.0, "grad_norm": 2.525818009311192, "language_loss": 0.71144152, "learning_rate": 1.086973614127679e-06, "loss": 0.73613596, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20178223, "step": 11000, "time_per_iteration": 2.8796279430389404 }, { "auxiliary_loss_clip": 0.01419475, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.25742531, "balance_loss_mlp": 1.01807332, "epoch": 0.6614159026003307, "flos": 34032938261760.0, "grad_norm": 2.92556389187988, "language_loss": 0.66006047, "learning_rate": 1.0866271209011133e-06, "loss": 0.68461919, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18334961, "step": 11001, "time_per_iteration": 2.993093967437744 }, { "auxiliary_loss_clip": 0.0143039, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.2664156, "balance_loss_mlp": 1.01385736, "epoch": 0.6614760258529987, "flos": 24107862816000.0, "grad_norm": 1.8219628087561184, "language_loss": 0.73362482, "learning_rate": 1.086280662309739e-06, "loss": 0.75826198, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19470215, "step": 11002, "time_per_iteration": 4.338241338729858 }, { "auxiliary_loss_clip": 0.01423158, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.25859475, "balance_loss_mlp": 1.01559544, "epoch": 0.6615361491056666, "flos": 14912226339840.0, "grad_norm": 2.351977397410118, "language_loss": 0.79621828, "learning_rate": 1.0859342383666928e-06, "loss": 0.8208158, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20983887, "step": 11003, "time_per_iteration": 4.222936153411865 }, { "auxiliary_loss_clip": 0.01434602, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.26663327, "balance_loss_mlp": 1.01476407, "epoch": 0.6615962723583346, "flos": 15313349905920.0, "grad_norm": 2.6619180899722084, "language_loss": 0.69675767, "learning_rate": 1.0855878490851119e-06, "loss": 0.72145814, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20690918, "step": 11004, "time_per_iteration": 2.8871705532073975 }, { "auxiliary_loss_clip": 0.01440277, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.27059758, "balance_loss_mlp": 1.01398385, "epoch": 0.6616563956110025, "flos": 18741441594240.0, "grad_norm": 2.66075156555359, "language_loss": 0.70541143, "learning_rate": 1.085241494478132e-06, "loss": 0.7301625, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20861816, "step": 11005, "time_per_iteration": 2.9882431030273438 }, { "auxiliary_loss_clip": 0.01421151, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 1.2574718, "balance_loss_mlp": 1.01091945, "epoch": 0.6617165188636706, "flos": 24504823860480.0, "grad_norm": 1.5577203097400751, "language_loss": 0.78617454, "learning_rate": 1.0848951745588855e-06, "loss": 0.81069684, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20141602, "step": 11006, "time_per_iteration": 2.9194726943969727 }, { "auxiliary_loss_clip": 0.01432857, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.26720643, "balance_loss_mlp": 1.01396322, "epoch": 0.6617766421163385, "flos": 22389088890240.0, "grad_norm": 1.4908491551080651, "language_loss": 0.76952457, "learning_rate": 1.0845488893405068e-06, "loss": 0.79419208, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19934082, "step": 11007, "time_per_iteration": 2.9044909477233887 }, { "auxiliary_loss_clip": 0.01428424, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.26269567, "balance_loss_mlp": 1.01109195, "epoch": 0.6618367653690065, "flos": 20860253210880.0, "grad_norm": 1.6115861110822407, "language_loss": 0.79215527, "learning_rate": 1.0842026388361248e-06, "loss": 0.81674612, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19567871, "step": 11008, "time_per_iteration": 2.828664779663086 }, { "auxiliary_loss_clip": 0.0144494, "auxiliary_loss_mlp": 0.01042153, "balance_loss_clip": 1.27313876, "balance_loss_mlp": 1.02100563, "epoch": 0.6618968886216744, "flos": 17721267667200.0, "grad_norm": 1.6992891888753698, "language_loss": 0.82747173, "learning_rate": 1.0838564230588715e-06, "loss": 0.85234272, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.21142578, "step": 11009, "time_per_iteration": 2.8581507205963135 }, { "auxiliary_loss_clip": 0.01198594, "auxiliary_loss_mlp": 0.01017489, "balance_loss_clip": 1.10648942, "balance_loss_mlp": 1.00108552, "epoch": 0.6619570118743424, "flos": 67065067370880.0, "grad_norm": 1.066420775521907, "language_loss": 0.67392373, "learning_rate": 1.0835102420218735e-06, "loss": 0.6960845, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.1640625, "step": 11010, "time_per_iteration": 3.308628559112549 }, { "auxiliary_loss_clip": 0.01433329, "auxiliary_loss_mlp": 0.0103591, "balance_loss_clip": 1.26607561, "balance_loss_mlp": 1.0155375, "epoch": 0.6620171351270103, "flos": 18670081265280.0, "grad_norm": 1.711432934618372, "language_loss": 0.72070307, "learning_rate": 1.0831640957382593e-06, "loss": 0.74539542, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20361328, "step": 11011, "time_per_iteration": 3.003183364868164 }, { "auxiliary_loss_clip": 0.01428206, "auxiliary_loss_mlp": 0.01036592, "balance_loss_clip": 1.26215982, "balance_loss_mlp": 1.0172441, "epoch": 0.6620772583796783, "flos": 24181168671360.0, "grad_norm": 1.4998660404920836, "language_loss": 0.72886801, "learning_rate": 1.0828179842211557e-06, "loss": 0.75351596, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19348145, "step": 11012, "time_per_iteration": 2.950166702270508 }, { "auxiliary_loss_clip": 0.01410494, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.25402236, "balance_loss_mlp": 1.01852274, "epoch": 0.6621373816323463, "flos": 23634157311360.0, "grad_norm": 2.3463813490119114, "language_loss": 0.79763502, "learning_rate": 1.0824719074836845e-06, "loss": 0.82210302, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17797852, "step": 11013, "time_per_iteration": 2.9012835025787354 }, { "auxiliary_loss_clip": 0.0142602, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.26254463, "balance_loss_mlp": 1.01119113, "epoch": 0.6621975048850143, "flos": 18451385308800.0, "grad_norm": 2.0963815646088557, "language_loss": 0.71115911, "learning_rate": 1.082125865538971e-06, "loss": 0.73573577, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.20446777, "step": 11014, "time_per_iteration": 2.8598523139953613 }, { "auxiliary_loss_clip": 0.0142071, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.25853658, "balance_loss_mlp": 1.01464629, "epoch": 0.6622576281376823, "flos": 14071901068800.0, "grad_norm": 2.004796984183188, "language_loss": 0.78101063, "learning_rate": 1.081779858400137e-06, "loss": 0.80555129, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18701172, "step": 11015, "time_per_iteration": 2.8403983116149902 }, { "auxiliary_loss_clip": 0.01421857, "auxiliary_loss_mlp": 0.01033733, "balance_loss_clip": 1.25835729, "balance_loss_mlp": 1.01381302, "epoch": 0.6623177513903502, "flos": 17027689841280.0, "grad_norm": 1.68257584952964, "language_loss": 0.82779592, "learning_rate": 1.0814338860803021e-06, "loss": 0.85235178, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19921875, "step": 11016, "time_per_iteration": 2.847731351852417 }, { "auxiliary_loss_clip": 0.01441387, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.27216482, "balance_loss_mlp": 1.01251662, "epoch": 0.6623778746430182, "flos": 17278944071040.0, "grad_norm": 2.0403059928324607, "language_loss": 0.70672214, "learning_rate": 1.0810879485925864e-06, "loss": 0.73146093, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.1998291, "step": 11017, "time_per_iteration": 2.847506523132324 }, { "auxiliary_loss_clip": 0.01416786, "auxiliary_loss_mlp": 0.01037867, "balance_loss_clip": 1.25371718, "balance_loss_mlp": 1.01725554, "epoch": 0.6624379978956861, "flos": 48808262540160.0, "grad_norm": 2.129593044946957, "language_loss": 0.7772975, "learning_rate": 1.0807420459501084e-06, "loss": 0.80184406, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20593262, "step": 11018, "time_per_iteration": 3.090596914291382 }, { "auxiliary_loss_clip": 0.01417406, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.25386882, "balance_loss_mlp": 1.01619828, "epoch": 0.6624981211483542, "flos": 18961585384320.0, "grad_norm": 1.942744994543718, "language_loss": 0.84377569, "learning_rate": 1.0803961781659841e-06, "loss": 0.86831903, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20739746, "step": 11019, "time_per_iteration": 2.845418691635132 }, { "auxiliary_loss_clip": 0.01425262, "auxiliary_loss_mlp": 0.01034564, "balance_loss_clip": 1.2620101, "balance_loss_mlp": 1.01527584, "epoch": 0.6625582444010221, "flos": 23266542159360.0, "grad_norm": 1.4545264189571288, "language_loss": 0.72199631, "learning_rate": 1.080050345253328e-06, "loss": 0.74659455, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19299316, "step": 11020, "time_per_iteration": 2.8669159412384033 }, { "auxiliary_loss_clip": 0.01457278, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.28368688, "balance_loss_mlp": 1.01664865, "epoch": 0.6626183676536901, "flos": 21404052190080.0, "grad_norm": 1.7441546112786488, "language_loss": 0.72823197, "learning_rate": 1.0797045472252554e-06, "loss": 0.75318408, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 1.73632812, "router_z_loss_mlp": 0.21289062, "step": 11021, "time_per_iteration": 2.853935956954956 }, { "auxiliary_loss_clip": 0.01434594, "auxiliary_loss_mlp": 0.01037981, "balance_loss_clip": 1.26913154, "balance_loss_mlp": 1.01751256, "epoch": 0.662678490906358, "flos": 14578572049920.0, "grad_norm": 3.562046711791596, "language_loss": 0.84117097, "learning_rate": 1.0793587840948793e-06, "loss": 0.8658967, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20471191, "step": 11022, "time_per_iteration": 2.857743740081787 }, { "auxiliary_loss_clip": 0.01457142, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.28059101, "balance_loss_mlp": 1.01949143, "epoch": 0.662738614159026, "flos": 15999507584640.0, "grad_norm": 2.317050365163501, "language_loss": 0.74430817, "learning_rate": 1.0790130558753099e-06, "loss": 0.76929218, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 1.76367188, "router_z_loss_mlp": 0.21765137, "step": 11023, "time_per_iteration": 2.7965474128723145 }, { "auxiliary_loss_clip": 0.0142859, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.26393557, "balance_loss_mlp": 1.0146178, "epoch": 0.6627987374116939, "flos": 19545679497600.0, "grad_norm": 1.8604968575581138, "language_loss": 0.75302637, "learning_rate": 1.0786673625796574e-06, "loss": 0.77765763, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19909668, "step": 11024, "time_per_iteration": 4.29595685005188 }, { "auxiliary_loss_clip": 0.01446337, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.2780205, "balance_loss_mlp": 1.01280737, "epoch": 0.662858860664362, "flos": 15710537174400.0, "grad_norm": 3.9791488322279176, "language_loss": 0.69922733, "learning_rate": 1.0783217042210306e-06, "loss": 0.72402537, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20654297, "step": 11025, "time_per_iteration": 2.9009432792663574 }, { "auxiliary_loss_clip": 0.01438253, "auxiliary_loss_mlp": 0.01041853, "balance_loss_clip": 1.27175021, "balance_loss_mlp": 1.02124143, "epoch": 0.6629189839170299, "flos": 20163191535360.0, "grad_norm": 2.103361138824014, "language_loss": 0.79861617, "learning_rate": 1.0779760808125379e-06, "loss": 0.82341725, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20617676, "step": 11026, "time_per_iteration": 2.8928418159484863 }, { "auxiliary_loss_clip": 0.01419692, "auxiliary_loss_mlp": 0.01036541, "balance_loss_clip": 1.25645065, "balance_loss_mlp": 1.01681137, "epoch": 0.6629791071696979, "flos": 20923198007040.0, "grad_norm": 1.6561361561026828, "language_loss": 0.76782769, "learning_rate": 1.0776304923672842e-06, "loss": 0.79238999, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19726562, "step": 11027, "time_per_iteration": 2.8810386657714844 }, { "auxiliary_loss_clip": 0.0144035, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.27272677, "balance_loss_mlp": 1.01837289, "epoch": 0.6630392304223659, "flos": 20855819220480.0, "grad_norm": 4.082304778947198, "language_loss": 0.70424139, "learning_rate": 1.0772849388983742e-06, "loss": 0.72903889, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.21032715, "step": 11028, "time_per_iteration": 2.8347034454345703 }, { "auxiliary_loss_clip": 0.01430523, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.26495826, "balance_loss_mlp": 1.01662052, "epoch": 0.6630993536750338, "flos": 21005552822400.0, "grad_norm": 1.8925748807286094, "language_loss": 0.80316794, "learning_rate": 1.0769394204189138e-06, "loss": 0.82783067, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19140625, "step": 11029, "time_per_iteration": 2.892699718475342 }, { "auxiliary_loss_clip": 0.01437052, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.26869535, "balance_loss_mlp": 1.01605916, "epoch": 0.6631594769277018, "flos": 18267962313600.0, "grad_norm": 1.8848817858762656, "language_loss": 0.76795733, "learning_rate": 1.0765939369420012e-06, "loss": 0.79270291, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.21447754, "step": 11030, "time_per_iteration": 2.8199734687805176 }, { "auxiliary_loss_clip": 0.01455599, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.2838223, "balance_loss_mlp": 1.01303411, "epoch": 0.6632196001803697, "flos": 17829122567040.0, "grad_norm": 2.455962154670929, "language_loss": 0.76641083, "learning_rate": 1.0762484884807391e-06, "loss": 0.79129565, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.19824219, "step": 11031, "time_per_iteration": 2.8109354972839355 }, { "auxiliary_loss_clip": 0.01441895, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.2716223, "balance_loss_mlp": 1.01789951, "epoch": 0.6632797234330378, "flos": 12675967925760.0, "grad_norm": 2.909222146518567, "language_loss": 0.75884277, "learning_rate": 1.075903075048228e-06, "loss": 0.78364146, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.20068359, "step": 11032, "time_per_iteration": 2.8275091648101807 }, { "auxiliary_loss_clip": 0.0142474, "auxiliary_loss_mlp": 0.01032707, "balance_loss_clip": 1.26031065, "balance_loss_mlp": 1.01238227, "epoch": 0.6633398466857057, "flos": 23594721828480.0, "grad_norm": 1.7711713044232624, "language_loss": 0.81153369, "learning_rate": 1.0755576966575635e-06, "loss": 0.83610821, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.203125, "step": 11033, "time_per_iteration": 2.878173589706421 }, { "auxiliary_loss_clip": 0.01432896, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.26575732, "balance_loss_mlp": 1.01594734, "epoch": 0.6633999699383737, "flos": 20641512009600.0, "grad_norm": 1.5941875014603308, "language_loss": 0.8128767, "learning_rate": 1.0752123533218451e-06, "loss": 0.8375597, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19470215, "step": 11034, "time_per_iteration": 2.866323709487915 }, { "auxiliary_loss_clip": 0.01434854, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.26936364, "balance_loss_mlp": 1.01522648, "epoch": 0.6634600931910416, "flos": 21806668834560.0, "grad_norm": 1.6795845384947061, "language_loss": 0.7631079, "learning_rate": 1.074867045054166e-06, "loss": 0.78779978, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19104004, "step": 11035, "time_per_iteration": 4.261380195617676 }, { "auxiliary_loss_clip": 0.01441196, "auxiliary_loss_mlp": 0.0102986, "balance_loss_clip": 1.26981187, "balance_loss_mlp": 1.01065516, "epoch": 0.6635202164437096, "flos": 18741803552640.0, "grad_norm": 1.9897433638394464, "language_loss": 0.83989078, "learning_rate": 1.074521771867622e-06, "loss": 0.86460137, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.1920166, "step": 11036, "time_per_iteration": 2.8304898738861084 }, { "auxiliary_loss_clip": 0.01191666, "auxiliary_loss_mlp": 0.01019375, "balance_loss_clip": 1.10119271, "balance_loss_mlp": 0.99772704, "epoch": 0.6635803396963775, "flos": 60253386894720.0, "grad_norm": 0.7851179417026694, "language_loss": 0.52346206, "learning_rate": 1.0741765337753044e-06, "loss": 0.54557252, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.21679688, "step": 11037, "time_per_iteration": 4.71683144569397 }, { "auxiliary_loss_clip": 0.01427659, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.26044178, "balance_loss_mlp": 1.01807654, "epoch": 0.6636404629490456, "flos": 29178481662720.0, "grad_norm": 1.805759948702868, "language_loss": 0.79702127, "learning_rate": 1.0738313307903052e-06, "loss": 0.82168621, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20788574, "step": 11038, "time_per_iteration": 4.329996824264526 }, { "auxiliary_loss_clip": 0.01444839, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.27671313, "balance_loss_mlp": 1.01907825, "epoch": 0.6637005862017135, "flos": 38921400967680.0, "grad_norm": 2.368792921063246, "language_loss": 0.64827871, "learning_rate": 1.073486162925716e-06, "loss": 0.67313021, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.2121582, "step": 11039, "time_per_iteration": 3.0127322673797607 }, { "auxiliary_loss_clip": 0.01438996, "auxiliary_loss_mlp": 0.01031972, "balance_loss_clip": 1.26913977, "balance_loss_mlp": 1.01170635, "epoch": 0.6637607094543815, "flos": 22793470081920.0, "grad_norm": 2.1597634048532144, "language_loss": 0.65167117, "learning_rate": 1.0731410301946237e-06, "loss": 0.67638087, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.20251465, "step": 11040, "time_per_iteration": 2.913280487060547 }, { "auxiliary_loss_clip": 0.01414539, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.25004768, "balance_loss_mlp": 1.01384211, "epoch": 0.6638208327070495, "flos": 18123250884480.0, "grad_norm": 3.244780806413256, "language_loss": 0.72762805, "learning_rate": 1.0727959326101161e-06, "loss": 0.75211012, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19824219, "step": 11041, "time_per_iteration": 2.8591063022613525 }, { "auxiliary_loss_clip": 0.01425889, "auxiliary_loss_mlp": 0.01043373, "balance_loss_clip": 1.26080537, "balance_loss_mlp": 1.02211821, "epoch": 0.6638809559597174, "flos": 29437020305280.0, "grad_norm": 2.529496351712869, "language_loss": 0.62381572, "learning_rate": 1.0724508701852806e-06, "loss": 0.64850837, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.21264648, "step": 11042, "time_per_iteration": 2.914703130722046 }, { "auxiliary_loss_clip": 0.01445482, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.27393341, "balance_loss_mlp": 1.01716185, "epoch": 0.6639410792123854, "flos": 28083916005120.0, "grad_norm": 2.9119625467128762, "language_loss": 0.68839514, "learning_rate": 1.0721058429331998e-06, "loss": 0.71322638, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 1.71484375, "router_z_loss_mlp": 0.20483398, "step": 11043, "time_per_iteration": 2.9312386512756348 }, { "auxiliary_loss_clip": 0.01418789, "auxiliary_loss_mlp": 0.0103449, "balance_loss_clip": 1.25883305, "balance_loss_mlp": 1.01581001, "epoch": 0.6640012024650533, "flos": 25567238448000.0, "grad_norm": 1.573654016784822, "language_loss": 0.84294999, "learning_rate": 1.0717608508669587e-06, "loss": 0.86748278, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18676758, "step": 11044, "time_per_iteration": 2.8979432582855225 }, { "auxiliary_loss_clip": 0.01433504, "auxiliary_loss_mlp": 0.01035455, "balance_loss_clip": 1.26810455, "balance_loss_mlp": 1.01496327, "epoch": 0.6640613257177214, "flos": 14875279320960.0, "grad_norm": 2.109222610165583, "language_loss": 0.70706749, "learning_rate": 1.0714158939996392e-06, "loss": 0.73175716, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20495605, "step": 11045, "time_per_iteration": 2.80625581741333 }, { "auxiliary_loss_clip": 0.01437573, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.26973164, "balance_loss_mlp": 1.0136677, "epoch": 0.6641214489703893, "flos": 23231314442880.0, "grad_norm": 1.569761347225777, "language_loss": 0.64933926, "learning_rate": 1.0710709723443235e-06, "loss": 0.67405498, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.20336914, "step": 11046, "time_per_iteration": 2.914215326309204 }, { "auxiliary_loss_clip": 0.01437028, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.2700479, "balance_loss_mlp": 1.01147735, "epoch": 0.6641815722230573, "flos": 37756651345920.0, "grad_norm": 1.8740582499990948, "language_loss": 0.72281349, "learning_rate": 1.070726085914088e-06, "loss": 0.7474882, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.1895752, "step": 11047, "time_per_iteration": 3.0393762588500977 }, { "auxiliary_loss_clip": 0.01434977, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.26913261, "balance_loss_mlp": 1.01960886, "epoch": 0.6642416954757252, "flos": 17940099358080.0, "grad_norm": 1.970262623586702, "language_loss": 0.77916253, "learning_rate": 1.0703812347220126e-06, "loss": 0.80391204, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20361328, "step": 11048, "time_per_iteration": 2.986861228942871 }, { "auxiliary_loss_clip": 0.01187163, "auxiliary_loss_mlp": 0.01020708, "balance_loss_clip": 1.09700966, "balance_loss_mlp": 0.99925023, "epoch": 0.6643018187283932, "flos": 52019169333120.0, "grad_norm": 0.7444373283764798, "language_loss": 0.55020601, "learning_rate": 1.0700364187811745e-06, "loss": 0.5722847, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.21484375, "step": 11049, "time_per_iteration": 3.3650968074798584 }, { "auxiliary_loss_clip": 0.01434832, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.26963425, "balance_loss_mlp": 1.01459265, "epoch": 0.6643619419810611, "flos": 30238769744640.0, "grad_norm": 1.6693653236596666, "language_loss": 0.65030479, "learning_rate": 1.069691638104648e-06, "loss": 0.67499113, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19213867, "step": 11050, "time_per_iteration": 2.9674363136291504 }, { "auxiliary_loss_clip": 0.01427558, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 1.26359713, "balance_loss_mlp": 1.01327777, "epoch": 0.6644220652337292, "flos": 22976395384320.0, "grad_norm": 2.733315299220215, "language_loss": 0.7993691, "learning_rate": 1.0693468927055085e-06, "loss": 0.82396472, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18725586, "step": 11051, "time_per_iteration": 2.9163060188293457 }, { "auxiliary_loss_clip": 0.01433917, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.26723409, "balance_loss_mlp": 1.01342773, "epoch": 0.6644821884863971, "flos": 21152119288320.0, "grad_norm": 14.21130762084279, "language_loss": 0.86170185, "learning_rate": 1.0690021825968276e-06, "loss": 0.88636857, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19311523, "step": 11052, "time_per_iteration": 2.8769726753234863 }, { "auxiliary_loss_clip": 0.01442153, "auxiliary_loss_mlp": 0.01034838, "balance_loss_clip": 1.27185023, "balance_loss_mlp": 1.01463151, "epoch": 0.6645423117390651, "flos": 20202265059840.0, "grad_norm": 4.268282098917331, "language_loss": 0.75537252, "learning_rate": 1.0686575077916776e-06, "loss": 0.78014243, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.2019043, "step": 11053, "time_per_iteration": 2.8455793857574463 }, { "auxiliary_loss_clip": 0.01423491, "auxiliary_loss_mlp": 0.01032482, "balance_loss_clip": 1.25965595, "balance_loss_mlp": 1.01367044, "epoch": 0.6646024349917331, "flos": 24362419916160.0, "grad_norm": 1.6150701955792264, "language_loss": 0.80074596, "learning_rate": 1.0683128683031278e-06, "loss": 0.8253057, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18798828, "step": 11054, "time_per_iteration": 2.9354350566864014 }, { "auxiliary_loss_clip": 0.01420702, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.25705409, "balance_loss_mlp": 1.01436758, "epoch": 0.664662558244401, "flos": 18815878569600.0, "grad_norm": 1.4932260789753116, "language_loss": 0.74292928, "learning_rate": 1.0679682641442472e-06, "loss": 0.76747203, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19189453, "step": 11055, "time_per_iteration": 2.8746349811553955 }, { "auxiliary_loss_clip": 0.01438916, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.27109814, "balance_loss_mlp": 1.01467526, "epoch": 0.664722681497069, "flos": 18962037832320.0, "grad_norm": 1.847191410660478, "language_loss": 0.73381138, "learning_rate": 1.0676236953281042e-06, "loss": 0.75854957, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20239258, "step": 11056, "time_per_iteration": 2.8484530448913574 }, { "auxiliary_loss_clip": 0.01427586, "auxiliary_loss_mlp": 0.01030049, "balance_loss_clip": 1.26121902, "balance_loss_mlp": 1.0097115, "epoch": 0.6647828047497369, "flos": 19580228542080.0, "grad_norm": 2.010839557810378, "language_loss": 0.70980263, "learning_rate": 1.0672791618677641e-06, "loss": 0.73437899, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20324707, "step": 11057, "time_per_iteration": 2.847320079803467 }, { "auxiliary_loss_clip": 0.01439613, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.27221918, "balance_loss_mlp": 1.01857924, "epoch": 0.664842928002405, "flos": 23159999358720.0, "grad_norm": 1.7382456390597079, "language_loss": 0.8141911, "learning_rate": 1.066934663776291e-06, "loss": 0.83897257, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19946289, "step": 11058, "time_per_iteration": 2.869825601577759 }, { "auxiliary_loss_clip": 0.01190136, "auxiliary_loss_mlp": 0.0101851, "balance_loss_clip": 1.10079193, "balance_loss_mlp": 1.00039053, "epoch": 0.6649030512550729, "flos": 65273485282560.0, "grad_norm": 0.790812221820074, "language_loss": 0.62647396, "learning_rate": 1.0665902010667496e-06, "loss": 0.64856046, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18164062, "step": 11059, "time_per_iteration": 4.655359983444214 }, { "auxiliary_loss_clip": 0.01425894, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.26113927, "balance_loss_mlp": 1.01257527, "epoch": 0.6649631745077409, "flos": 20204753523840.0, "grad_norm": 1.8489823462693635, "language_loss": 0.79375249, "learning_rate": 1.0662457737522008e-06, "loss": 0.81832975, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19250488, "step": 11060, "time_per_iteration": 2.8769779205322266 }, { "auxiliary_loss_clip": 0.01427172, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.26157975, "balance_loss_mlp": 1.01783919, "epoch": 0.6650232977604088, "flos": 17247516917760.0, "grad_norm": 2.1077119501333823, "language_loss": 0.80245471, "learning_rate": 1.0659013818457055e-06, "loss": 0.82710618, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20129395, "step": 11061, "time_per_iteration": 2.8112502098083496 }, { "auxiliary_loss_clip": 0.01428519, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.26384234, "balance_loss_mlp": 1.01175857, "epoch": 0.6650834210130768, "flos": 10011999985920.0, "grad_norm": 1.9269218711121516, "language_loss": 0.57078993, "learning_rate": 1.0655570253603243e-06, "loss": 0.59538031, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.1875, "step": 11062, "time_per_iteration": 2.8444645404815674 }, { "auxiliary_loss_clip": 0.01434745, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.26472616, "balance_loss_mlp": 1.0134201, "epoch": 0.6651435442657447, "flos": 10459255265280.0, "grad_norm": 1.6337774282706161, "language_loss": 0.76303196, "learning_rate": 1.0652127043091144e-06, "loss": 0.78772646, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.21264648, "step": 11063, "time_per_iteration": 2.8262743949890137 }, { "auxiliary_loss_clip": 0.01431422, "auxiliary_loss_mlp": 0.0103486, "balance_loss_clip": 1.26553988, "balance_loss_mlp": 1.01578629, "epoch": 0.6652036675184128, "flos": 22353906418560.0, "grad_norm": 1.4060071169018622, "language_loss": 0.71080381, "learning_rate": 1.0648684187051316e-06, "loss": 0.73546666, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19067383, "step": 11064, "time_per_iteration": 2.8775088787078857 }, { "auxiliary_loss_clip": 0.01188688, "auxiliary_loss_mlp": 0.01017708, "balance_loss_clip": 1.09863472, "balance_loss_mlp": 0.99987394, "epoch": 0.6652637907710807, "flos": 52934203048320.0, "grad_norm": 0.8540781495687443, "language_loss": 0.63121879, "learning_rate": 1.0645241685614322e-06, "loss": 0.65328276, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.17871094, "step": 11065, "time_per_iteration": 3.3033342361450195 }, { "auxiliary_loss_clip": 0.01437403, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.26889133, "balance_loss_mlp": 1.01629865, "epoch": 0.6653239140237487, "flos": 23113007994240.0, "grad_norm": 1.6734835490972844, "language_loss": 0.63468516, "learning_rate": 1.0641799538910708e-06, "loss": 0.65942043, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19812012, "step": 11066, "time_per_iteration": 2.872493028640747 }, { "auxiliary_loss_clip": 0.01435476, "auxiliary_loss_mlp": 0.01033488, "balance_loss_clip": 1.26757991, "balance_loss_mlp": 1.01412868, "epoch": 0.6653840372764167, "flos": 25970805233280.0, "grad_norm": 1.4657565624026951, "language_loss": 0.70306152, "learning_rate": 1.0638357747070985e-06, "loss": 0.72775114, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19360352, "step": 11067, "time_per_iteration": 2.9806556701660156 }, { "auxiliary_loss_clip": 0.01187169, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.09883559, "balance_loss_mlp": 1.01529491, "epoch": 0.6654441605290846, "flos": 66069307653120.0, "grad_norm": 0.9304117856200816, "language_loss": 0.72140974, "learning_rate": 1.0634916310225684e-06, "loss": 0.74360031, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16601562, "step": 11068, "time_per_iteration": 3.275740623474121 }, { "auxiliary_loss_clip": 0.01188927, "auxiliary_loss_mlp": 0.01022641, "balance_loss_clip": 1.09941053, "balance_loss_mlp": 1.00394917, "epoch": 0.6655042837817526, "flos": 65230945891200.0, "grad_norm": 0.7084678261711423, "language_loss": 0.57871044, "learning_rate": 1.0631475228505285e-06, "loss": 0.60082614, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.18652344, "step": 11069, "time_per_iteration": 3.3919081687927246 }, { "auxiliary_loss_clip": 0.01187625, "auxiliary_loss_mlp": 0.0102565, "balance_loss_clip": 1.09844756, "balance_loss_mlp": 1.00428772, "epoch": 0.6655644070344205, "flos": 69039800985600.0, "grad_norm": 0.7438551325511158, "language_loss": 0.63504118, "learning_rate": 1.062803450204029e-06, "loss": 0.65717393, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.21386719, "step": 11070, "time_per_iteration": 4.7253851890563965 }, { "auxiliary_loss_clip": 0.01427435, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.26114428, "balance_loss_mlp": 1.01162422, "epoch": 0.6656245302870886, "flos": 36328159929600.0, "grad_norm": 1.6898430311218327, "language_loss": 0.59392536, "learning_rate": 1.062459413096116e-06, "loss": 0.61850882, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19287109, "step": 11071, "time_per_iteration": 3.0616161823272705 }, { "auxiliary_loss_clip": 0.0142618, "auxiliary_loss_mlp": 0.01035239, "balance_loss_clip": 1.26210284, "balance_loss_mlp": 1.01525927, "epoch": 0.6656846535397565, "flos": 21803818412160.0, "grad_norm": 2.1794079979652876, "language_loss": 0.73726499, "learning_rate": 1.0621154115398364e-06, "loss": 0.76187921, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.1998291, "step": 11072, "time_per_iteration": 4.270760536193848 }, { "auxiliary_loss_clip": 0.01414358, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.25299966, "balance_loss_mlp": 1.02136278, "epoch": 0.6657447767924245, "flos": 37501687042560.0, "grad_norm": 3.4395791240180142, "language_loss": 0.71579933, "learning_rate": 1.0617714455482353e-06, "loss": 0.74034995, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19335938, "step": 11073, "time_per_iteration": 4.415718078613281 }, { "auxiliary_loss_clip": 0.01439418, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.27013576, "balance_loss_mlp": 1.01834047, "epoch": 0.6658049000450924, "flos": 16846664820480.0, "grad_norm": 2.4268388190033456, "language_loss": 0.56622148, "learning_rate": 1.061427515134354e-06, "loss": 0.59100175, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20275879, "step": 11074, "time_per_iteration": 2.8065924644470215 }, { "auxiliary_loss_clip": 0.01421598, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.25835156, "balance_loss_mlp": 1.01501989, "epoch": 0.6658650232977604, "flos": 33524095530240.0, "grad_norm": 1.9496460179119457, "language_loss": 0.72876608, "learning_rate": 1.061083620311235e-06, "loss": 0.75332385, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19177246, "step": 11075, "time_per_iteration": 2.9418752193450928 }, { "auxiliary_loss_clip": 0.01415139, "auxiliary_loss_mlp": 0.01039589, "balance_loss_clip": 1.25287974, "balance_loss_mlp": 1.0204674, "epoch": 0.6659251465504283, "flos": 37720382999040.0, "grad_norm": 1.3831723067228412, "language_loss": 0.66490567, "learning_rate": 1.0607397610919202e-06, "loss": 0.68945289, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19116211, "step": 11076, "time_per_iteration": 2.992400646209717 }, { "auxiliary_loss_clip": 0.01416411, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.25280905, "balance_loss_mlp": 1.01471043, "epoch": 0.6659852698030964, "flos": 24902508821760.0, "grad_norm": 3.7877905383057455, "language_loss": 0.76426768, "learning_rate": 1.0603959374894468e-06, "loss": 0.78878582, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20703125, "step": 11077, "time_per_iteration": 2.9186413288116455 }, { "auxiliary_loss_clip": 0.0142438, "auxiliary_loss_mlp": 0.01037207, "balance_loss_clip": 1.25827527, "balance_loss_mlp": 1.01754987, "epoch": 0.6660453930557643, "flos": 24363505791360.0, "grad_norm": 2.076873971179817, "language_loss": 0.6736269, "learning_rate": 1.0600521495168538e-06, "loss": 0.69824278, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.1965332, "step": 11078, "time_per_iteration": 2.9222872257232666 }, { "auxiliary_loss_clip": 0.01449645, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.27901483, "balance_loss_mlp": 1.02090108, "epoch": 0.6661055163084323, "flos": 10604962080000.0, "grad_norm": 2.233994828458855, "language_loss": 0.7056247, "learning_rate": 1.0597083971871783e-06, "loss": 0.73053086, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20068359, "step": 11079, "time_per_iteration": 2.892307758331299 }, { "auxiliary_loss_clip": 0.01436707, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.27179241, "balance_loss_mlp": 1.01420665, "epoch": 0.6661656395611003, "flos": 24067205723520.0, "grad_norm": 1.5161627584612951, "language_loss": 0.8069002, "learning_rate": 1.0593646805134544e-06, "loss": 0.83159506, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.18579102, "step": 11080, "time_per_iteration": 2.845039129257202 }, { "auxiliary_loss_clip": 0.01412919, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.25300765, "balance_loss_mlp": 1.01275432, "epoch": 0.6662257628137682, "flos": 23045719697280.0, "grad_norm": 6.023128034946133, "language_loss": 0.78669322, "learning_rate": 1.0590209995087157e-06, "loss": 0.81114382, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19396973, "step": 11081, "time_per_iteration": 2.896991491317749 }, { "auxiliary_loss_clip": 0.01439609, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.27141464, "balance_loss_mlp": 1.01461172, "epoch": 0.6662858860664362, "flos": 24765308029440.0, "grad_norm": 1.7345664431752867, "language_loss": 0.80412841, "learning_rate": 1.0586773541859946e-06, "loss": 0.82886797, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.19750977, "step": 11082, "time_per_iteration": 2.8748488426208496 }, { "auxiliary_loss_clip": 0.01420939, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.25704396, "balance_loss_mlp": 1.01344693, "epoch": 0.6663460093191041, "flos": 20018118147840.0, "grad_norm": 1.5053816309873678, "language_loss": 0.84378737, "learning_rate": 1.0583337445583234e-06, "loss": 0.86832297, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19165039, "step": 11083, "time_per_iteration": 2.8534185886383057 }, { "auxiliary_loss_clip": 0.01445493, "auxiliary_loss_mlp": 0.01039286, "balance_loss_clip": 1.27592373, "balance_loss_mlp": 1.01929486, "epoch": 0.6664061325717722, "flos": 17830253687040.0, "grad_norm": 2.250099841320157, "language_loss": 0.86156094, "learning_rate": 1.057990170638731e-06, "loss": 0.88640869, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.19995117, "step": 11084, "time_per_iteration": 2.84954833984375 }, { "auxiliary_loss_clip": 0.01437354, "auxiliary_loss_mlp": 0.01039036, "balance_loss_clip": 1.2681638, "balance_loss_mlp": 1.01857948, "epoch": 0.6664662558244401, "flos": 18085851417600.0, "grad_norm": 2.5021527052427546, "language_loss": 0.74320561, "learning_rate": 1.0576466324402452e-06, "loss": 0.76796949, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20458984, "step": 11085, "time_per_iteration": 2.8573334217071533 }, { "auxiliary_loss_clip": 0.01419305, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.25323558, "balance_loss_mlp": 1.0164696, "epoch": 0.6665263790771081, "flos": 21582679236480.0, "grad_norm": 1.7715154564091538, "language_loss": 0.80755067, "learning_rate": 1.057303129975894e-06, "loss": 0.832102, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19348145, "step": 11086, "time_per_iteration": 2.833106756210327 }, { "auxiliary_loss_clip": 0.01428811, "auxiliary_loss_mlp": 0.01037576, "balance_loss_clip": 1.26362872, "balance_loss_mlp": 1.01708424, "epoch": 0.666586502329776, "flos": 24217210794240.0, "grad_norm": 1.8152405315560656, "language_loss": 0.75184894, "learning_rate": 1.056959663258702e-06, "loss": 0.7765128, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20495605, "step": 11087, "time_per_iteration": 2.8348052501678467 }, { "auxiliary_loss_clip": 0.01425742, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.26110148, "balance_loss_mlp": 1.01654732, "epoch": 0.666646625582444, "flos": 22210733312640.0, "grad_norm": 1.7040851295726143, "language_loss": 0.65510464, "learning_rate": 1.0566162323016939e-06, "loss": 0.67972946, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20202637, "step": 11088, "time_per_iteration": 2.936216354370117 }, { "auxiliary_loss_clip": 0.01443676, "auxiliary_loss_mlp": 0.01031132, "balance_loss_clip": 1.27609944, "balance_loss_mlp": 1.01165295, "epoch": 0.6667067488351119, "flos": 18269093433600.0, "grad_norm": 2.3196289007575235, "language_loss": 0.64906335, "learning_rate": 1.0562728371178928e-06, "loss": 0.67381144, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19470215, "step": 11089, "time_per_iteration": 2.848558187484741 }, { "auxiliary_loss_clip": 0.01432778, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.26963913, "balance_loss_mlp": 1.0133338, "epoch": 0.66676687208778, "flos": 17244983208960.0, "grad_norm": 2.0685602873936424, "language_loss": 0.81370699, "learning_rate": 1.0559294777203221e-06, "loss": 0.83835709, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18896484, "step": 11090, "time_per_iteration": 2.860603094100952 }, { "auxiliary_loss_clip": 0.01438416, "auxiliary_loss_mlp": 0.01036716, "balance_loss_clip": 1.26887405, "balance_loss_mlp": 1.01754713, "epoch": 0.6668269953404479, "flos": 19760801114880.0, "grad_norm": 2.373551797917491, "language_loss": 0.79145694, "learning_rate": 1.0555861541219984e-06, "loss": 0.8162083, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19140625, "step": 11091, "time_per_iteration": 2.9464988708496094 }, { "auxiliary_loss_clip": 0.01417879, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.25398231, "balance_loss_mlp": 1.01619983, "epoch": 0.6668871185931159, "flos": 20568160909440.0, "grad_norm": 2.2474601469819486, "language_loss": 0.7944392, "learning_rate": 1.0552428663359425e-06, "loss": 0.8189714, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19152832, "step": 11092, "time_per_iteration": 2.885115623474121 }, { "auxiliary_loss_clip": 0.01189784, "auxiliary_loss_mlp": 0.01022726, "balance_loss_clip": 1.09935999, "balance_loss_mlp": 1.0004096, "epoch": 0.6669472418457839, "flos": 58113445956480.0, "grad_norm": 0.7598091510470407, "language_loss": 0.57820797, "learning_rate": 1.0548996143751724e-06, "loss": 0.60033309, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.22363281, "step": 11093, "time_per_iteration": 3.4036717414855957 }, { "auxiliary_loss_clip": 0.01427017, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.26234508, "balance_loss_mlp": 1.01257849, "epoch": 0.6670073650984518, "flos": 26075583486720.0, "grad_norm": 1.511615471106695, "language_loss": 0.76894677, "learning_rate": 1.054556398252703e-06, "loss": 0.79353595, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19311523, "step": 11094, "time_per_iteration": 4.3695337772369385 }, { "auxiliary_loss_clip": 0.01420026, "auxiliary_loss_mlp": 0.01036733, "balance_loss_clip": 1.25489283, "balance_loss_mlp": 1.01614535, "epoch": 0.6670674883511198, "flos": 32429801341440.0, "grad_norm": 1.6937028947056387, "language_loss": 0.73780954, "learning_rate": 1.05421321798155e-06, "loss": 0.76237702, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20593262, "step": 11095, "time_per_iteration": 3.0058164596557617 }, { "auxiliary_loss_clip": 0.01435712, "auxiliary_loss_mlp": 0.01037301, "balance_loss_clip": 1.27088642, "balance_loss_mlp": 1.0175482, "epoch": 0.6671276116037878, "flos": 18046053976320.0, "grad_norm": 2.2146656239482807, "language_loss": 0.74626327, "learning_rate": 1.053870073574727e-06, "loss": 0.77099335, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19750977, "step": 11096, "time_per_iteration": 2.935943603515625 }, { "auxiliary_loss_clip": 0.0141455, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.25367665, "balance_loss_mlp": 1.01416802, "epoch": 0.6671877348564558, "flos": 23777058948480.0, "grad_norm": 2.1124908395191704, "language_loss": 0.64624739, "learning_rate": 1.0535269650452456e-06, "loss": 0.67072642, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19189453, "step": 11097, "time_per_iteration": 2.8875958919525146 }, { "auxiliary_loss_clip": 0.01447542, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.27785182, "balance_loss_mlp": 1.01799989, "epoch": 0.6672478581091237, "flos": 20926908080640.0, "grad_norm": 1.8053566891702806, "language_loss": 0.76925945, "learning_rate": 1.0531838924061158e-06, "loss": 0.79410952, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19482422, "step": 11098, "time_per_iteration": 2.866103172302246 }, { "auxiliary_loss_clip": 0.01436905, "auxiliary_loss_mlp": 0.01036486, "balance_loss_clip": 1.26976681, "balance_loss_mlp": 1.01657796, "epoch": 0.6673079813617917, "flos": 27867482288640.0, "grad_norm": 1.4783846892718366, "language_loss": 0.75219268, "learning_rate": 1.0528408556703476e-06, "loss": 0.77692658, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19897461, "step": 11099, "time_per_iteration": 2.941845178604126 }, { "auxiliary_loss_clip": 0.01413907, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.25221837, "balance_loss_mlp": 1.01423454, "epoch": 0.6673681046144596, "flos": 21626955912960.0, "grad_norm": 2.310717342562936, "language_loss": 0.78460163, "learning_rate": 1.0524978548509502e-06, "loss": 0.80908537, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.20227051, "step": 11100, "time_per_iteration": 2.877443313598633 }, { "auxiliary_loss_clip": 0.01423189, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.26031411, "balance_loss_mlp": 1.01924324, "epoch": 0.6674282278671276, "flos": 20900503100160.0, "grad_norm": 2.0591868506382935, "language_loss": 0.61124074, "learning_rate": 1.0521548899609288e-06, "loss": 0.63585651, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19152832, "step": 11101, "time_per_iteration": 2.8667876720428467 }, { "auxiliary_loss_clip": 0.01445873, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.27264512, "balance_loss_mlp": 1.01650488, "epoch": 0.6674883511197955, "flos": 23634971717760.0, "grad_norm": 1.7262869095556992, "language_loss": 0.72170943, "learning_rate": 1.0518119610132884e-06, "loss": 0.74653685, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 1.73144531, "router_z_loss_mlp": 0.20373535, "step": 11102, "time_per_iteration": 2.8671741485595703 }, { "auxiliary_loss_clip": 0.01426863, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.26144528, "balance_loss_mlp": 1.01467645, "epoch": 0.6675484743724636, "flos": 19619121087360.0, "grad_norm": 1.5821192058812679, "language_loss": 0.84880668, "learning_rate": 1.051469068021034e-06, "loss": 0.87340599, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18408203, "step": 11103, "time_per_iteration": 2.835686683654785 }, { "auxiliary_loss_clip": 0.0142546, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.2586199, "balance_loss_mlp": 1.01124132, "epoch": 0.6676085976251315, "flos": 14327589288960.0, "grad_norm": 1.947674768385336, "language_loss": 0.78757703, "learning_rate": 1.0511262109971668e-06, "loss": 0.81213403, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18981934, "step": 11104, "time_per_iteration": 2.8343565464019775 }, { "auxiliary_loss_clip": 0.01451486, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.28038096, "balance_loss_mlp": 1.0146029, "epoch": 0.6676687208777995, "flos": 38117977470720.0, "grad_norm": 1.5517765638126149, "language_loss": 0.58710819, "learning_rate": 1.0507833899546889e-06, "loss": 0.6119585, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.18933105, "step": 11105, "time_per_iteration": 4.42592978477478 }, { "auxiliary_loss_clip": 0.01447889, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.27612472, "balance_loss_mlp": 1.01560605, "epoch": 0.6677288441304675, "flos": 23990506508160.0, "grad_norm": 1.6536392162727778, "language_loss": 0.73845309, "learning_rate": 1.0504406049066e-06, "loss": 0.76328707, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.19897461, "step": 11106, "time_per_iteration": 2.8575658798217773 }, { "auxiliary_loss_clip": 0.01425378, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.26063943, "balance_loss_mlp": 1.01408875, "epoch": 0.6677889673831354, "flos": 24181394895360.0, "grad_norm": 1.9614804299874335, "language_loss": 0.77311718, "learning_rate": 1.0500978558659e-06, "loss": 0.7977041, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19226074, "step": 11107, "time_per_iteration": 4.29552698135376 }, { "auxiliary_loss_clip": 0.0140686, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.2471714, "balance_loss_mlp": 1.01238132, "epoch": 0.6678490906358034, "flos": 22319809822080.0, "grad_norm": 2.144975013857604, "language_loss": 0.90019464, "learning_rate": 1.049755142845583e-06, "loss": 0.92457736, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19030762, "step": 11108, "time_per_iteration": 4.261826753616333 }, { "auxiliary_loss_clip": 0.01417365, "auxiliary_loss_mlp": 0.01029028, "balance_loss_clip": 1.25531745, "balance_loss_mlp": 1.01090837, "epoch": 0.6679092138884714, "flos": 36911349146880.0, "grad_norm": 1.3608397724005503, "language_loss": 0.83412933, "learning_rate": 1.049412465858646e-06, "loss": 0.85859323, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18115234, "step": 11109, "time_per_iteration": 2.9660942554473877 }, { "auxiliary_loss_clip": 0.01431011, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.26418543, "balance_loss_mlp": 1.0125221, "epoch": 0.6679693371411394, "flos": 18159383496960.0, "grad_norm": 2.443556270588572, "language_loss": 0.6962285, "learning_rate": 1.0490698249180847e-06, "loss": 0.72085893, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19494629, "step": 11110, "time_per_iteration": 2.8377926349639893 }, { "auxiliary_loss_clip": 0.01429309, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.26072001, "balance_loss_mlp": 1.01591408, "epoch": 0.6680294603938073, "flos": 27209448892800.0, "grad_norm": 1.6177040642839182, "language_loss": 0.7393024, "learning_rate": 1.04872722003689e-06, "loss": 0.7639612, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20654297, "step": 11111, "time_per_iteration": 2.902770519256592 }, { "auxiliary_loss_clip": 0.01425061, "auxiliary_loss_mlp": 0.01032883, "balance_loss_clip": 1.2603085, "balance_loss_mlp": 1.01395226, "epoch": 0.6680895836464753, "flos": 21735172771200.0, "grad_norm": 1.9525374717576611, "language_loss": 0.66157007, "learning_rate": 1.0483846512280553e-06, "loss": 0.68614954, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18933105, "step": 11112, "time_per_iteration": 2.8218181133270264 }, { "auxiliary_loss_clip": 0.01423763, "auxiliary_loss_mlp": 0.01035072, "balance_loss_clip": 1.25932693, "balance_loss_mlp": 1.01639175, "epoch": 0.6681497068991432, "flos": 19656022861440.0, "grad_norm": 1.8321413613679016, "language_loss": 0.63838267, "learning_rate": 1.048042118504569e-06, "loss": 0.66297102, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18688965, "step": 11113, "time_per_iteration": 2.841856002807617 }, { "auxiliary_loss_clip": 0.01415407, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.25457859, "balance_loss_mlp": 1.01190758, "epoch": 0.6682098301518112, "flos": 17427682287360.0, "grad_norm": 1.810489436877256, "language_loss": 0.66314411, "learning_rate": 1.047699621879422e-06, "loss": 0.68760896, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19177246, "step": 11114, "time_per_iteration": 2.826577663421631 }, { "auxiliary_loss_clip": 0.01422759, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.25887764, "balance_loss_mlp": 1.01723886, "epoch": 0.6682699534044791, "flos": 22608599253120.0, "grad_norm": 1.4731775282690613, "language_loss": 0.79131973, "learning_rate": 1.0473571613655998e-06, "loss": 0.81591249, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19287109, "step": 11115, "time_per_iteration": 2.8473198413848877 }, { "auxiliary_loss_clip": 0.01419837, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.25423908, "balance_loss_mlp": 1.01280665, "epoch": 0.6683300766571472, "flos": 24874520273280.0, "grad_norm": 1.8069303436138708, "language_loss": 0.80151606, "learning_rate": 1.0470147369760896e-06, "loss": 0.82602406, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18164062, "step": 11116, "time_per_iteration": 2.8971760272979736 }, { "auxiliary_loss_clip": 0.0144262, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.27498519, "balance_loss_mlp": 1.02139771, "epoch": 0.6683901999098151, "flos": 27138269543040.0, "grad_norm": 1.5765453631664295, "language_loss": 0.8031112, "learning_rate": 1.0466723487238768e-06, "loss": 0.82795942, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20812988, "step": 11117, "time_per_iteration": 2.9236955642700195 }, { "auxiliary_loss_clip": 0.01425277, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.25905538, "balance_loss_mlp": 1.01329517, "epoch": 0.6684503231624831, "flos": 20748326279040.0, "grad_norm": 3.3414870050218224, "language_loss": 0.66314435, "learning_rate": 1.0463299966219441e-06, "loss": 0.68773127, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.2010498, "step": 11118, "time_per_iteration": 2.8796873092651367 }, { "auxiliary_loss_clip": 0.0142745, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.26247168, "balance_loss_mlp": 1.01450777, "epoch": 0.668510446415151, "flos": 21772165034880.0, "grad_norm": 2.590461052957698, "language_loss": 0.69603598, "learning_rate": 1.0459876806832727e-06, "loss": 0.72063947, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18395996, "step": 11119, "time_per_iteration": 2.914313554763794 }, { "auxiliary_loss_clip": 0.01435133, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.26823545, "balance_loss_mlp": 1.01490128, "epoch": 0.668570569667819, "flos": 30203632517760.0, "grad_norm": 1.7042042395058659, "language_loss": 0.67934179, "learning_rate": 1.0456454009208448e-06, "loss": 0.70403177, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18981934, "step": 11120, "time_per_iteration": 2.9352755546569824 }, { "auxiliary_loss_clip": 0.01436231, "auxiliary_loss_mlp": 0.01043004, "balance_loss_clip": 1.26949275, "balance_loss_mlp": 1.02328658, "epoch": 0.668630692920487, "flos": 24181349650560.0, "grad_norm": 2.992925473320502, "language_loss": 0.73173988, "learning_rate": 1.045303157347638e-06, "loss": 0.75653219, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19714355, "step": 11121, "time_per_iteration": 2.9509148597717285 }, { "auxiliary_loss_clip": 0.01427967, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.26000786, "balance_loss_mlp": 1.01633954, "epoch": 0.668690816173155, "flos": 17466077139840.0, "grad_norm": 2.845159035875399, "language_loss": 0.70949292, "learning_rate": 1.0449609499766316e-06, "loss": 0.73412722, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19128418, "step": 11122, "time_per_iteration": 2.849296808242798 }, { "auxiliary_loss_clip": 0.01433778, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.26626718, "balance_loss_mlp": 1.01785016, "epoch": 0.668750939425823, "flos": 25014797712000.0, "grad_norm": 1.5690782188907024, "language_loss": 0.72035563, "learning_rate": 1.0446187788208015e-06, "loss": 0.74507302, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.20092773, "step": 11123, "time_per_iteration": 2.937446117401123 }, { "auxiliary_loss_clip": 0.01437597, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.26995027, "balance_loss_mlp": 1.02116632, "epoch": 0.6688110626784909, "flos": 24107229388800.0, "grad_norm": 1.5262061150681672, "language_loss": 0.79938734, "learning_rate": 1.0442766438931244e-06, "loss": 0.82416719, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19226074, "step": 11124, "time_per_iteration": 2.8772456645965576 }, { "auxiliary_loss_clip": 0.01435564, "auxiliary_loss_mlp": 0.01039784, "balance_loss_clip": 1.27037001, "balance_loss_mlp": 1.01974511, "epoch": 0.6688711859311589, "flos": 21768816919680.0, "grad_norm": 1.6291473084014227, "language_loss": 0.75172162, "learning_rate": 1.0439345452065716e-06, "loss": 0.77647507, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20056152, "step": 11125, "time_per_iteration": 2.8896243572235107 }, { "auxiliary_loss_clip": 0.01431576, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.26520371, "balance_loss_mlp": 1.01170266, "epoch": 0.6689313091838268, "flos": 22939900813440.0, "grad_norm": 2.279715372351084, "language_loss": 0.67352307, "learning_rate": 1.043592482774116e-06, "loss": 0.69815016, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19445801, "step": 11126, "time_per_iteration": 2.864891529083252 }, { "auxiliary_loss_clip": 0.01435276, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.26801658, "balance_loss_mlp": 1.01425517, "epoch": 0.6689914324364948, "flos": 20895797640960.0, "grad_norm": 11.424251048117114, "language_loss": 0.72065276, "learning_rate": 1.0432504566087305e-06, "loss": 0.74533486, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.18664551, "step": 11127, "time_per_iteration": 2.8378522396087646 }, { "auxiliary_loss_clip": 0.01445193, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.27281356, "balance_loss_mlp": 1.01661444, "epoch": 0.6690515556891627, "flos": 22758694813440.0, "grad_norm": 1.9837402075798045, "language_loss": 0.8187238, "learning_rate": 1.0429084667233827e-06, "loss": 0.84354758, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20568848, "step": 11128, "time_per_iteration": 2.8787641525268555 }, { "auxiliary_loss_clip": 0.01439455, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.27046871, "balance_loss_mlp": 1.01261365, "epoch": 0.6691116789418308, "flos": 23341929275520.0, "grad_norm": 2.800676560715508, "language_loss": 0.81197667, "learning_rate": 1.0425665131310427e-06, "loss": 0.83669364, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.19628906, "step": 11129, "time_per_iteration": 4.28815484046936 }, { "auxiliary_loss_clip": 0.01408137, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.24715698, "balance_loss_mlp": 1.01268435, "epoch": 0.6691718021944987, "flos": 32458332827520.0, "grad_norm": 2.188447262640093, "language_loss": 0.71206796, "learning_rate": 1.0422245958446762e-06, "loss": 0.73646975, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19335938, "step": 11130, "time_per_iteration": 2.9213359355926514 }, { "auxiliary_loss_clip": 0.01424473, "auxiliary_loss_mlp": 0.01041801, "balance_loss_clip": 1.26271057, "balance_loss_mlp": 1.02235782, "epoch": 0.6692319254471667, "flos": 23741876476800.0, "grad_norm": 1.5446444510145168, "language_loss": 0.70957112, "learning_rate": 1.0418827148772486e-06, "loss": 0.7342338, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19458008, "step": 11131, "time_per_iteration": 2.875110626220703 }, { "auxiliary_loss_clip": 0.01430636, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.26214588, "balance_loss_mlp": 1.01234651, "epoch": 0.6692920486998346, "flos": 14435806147200.0, "grad_norm": 3.0602631240362146, "language_loss": 0.66106319, "learning_rate": 1.0415408702417243e-06, "loss": 0.68568945, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.1965332, "step": 11132, "time_per_iteration": 2.826151132583618 }, { "auxiliary_loss_clip": 0.0143404, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.26581311, "balance_loss_mlp": 1.01197171, "epoch": 0.6693521719525026, "flos": 21517562689920.0, "grad_norm": 2.052981968941779, "language_loss": 0.75095028, "learning_rate": 1.0411990619510661e-06, "loss": 0.77561677, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.2064209, "step": 11133, "time_per_iteration": 2.942416191101074 }, { "auxiliary_loss_clip": 0.01446412, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.27625751, "balance_loss_mlp": 1.01778162, "epoch": 0.6694122952051706, "flos": 25416871418880.0, "grad_norm": 2.057315250537488, "language_loss": 0.66910547, "learning_rate": 1.0408572900182363e-06, "loss": 0.69393909, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19177246, "step": 11134, "time_per_iteration": 2.9374921321868896 }, { "auxiliary_loss_clip": 0.01444776, "auxiliary_loss_mlp": 0.01036, "balance_loss_clip": 1.27344465, "balance_loss_mlp": 1.01479292, "epoch": 0.6694724184578386, "flos": 25671564253440.0, "grad_norm": 1.728706186414479, "language_loss": 0.77416945, "learning_rate": 1.0405155544561943e-06, "loss": 0.7989772, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.21203613, "step": 11135, "time_per_iteration": 2.8583390712738037 }, { "auxiliary_loss_clip": 0.01422894, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.25998902, "balance_loss_mlp": 1.01334095, "epoch": 0.6695325417105066, "flos": 17717467104000.0, "grad_norm": 1.6313023345218096, "language_loss": 0.74407864, "learning_rate": 1.040173855277898e-06, "loss": 0.76863974, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19873047, "step": 11136, "time_per_iteration": 2.8936526775360107 }, { "auxiliary_loss_clip": 0.0144637, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.27596021, "balance_loss_mlp": 1.01358747, "epoch": 0.6695926649631745, "flos": 24470184326400.0, "grad_norm": 1.811101687926553, "language_loss": 0.63008344, "learning_rate": 1.0398321924963061e-06, "loss": 0.65487576, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 1.70507812, "router_z_loss_mlp": 0.19274902, "step": 11137, "time_per_iteration": 2.8636484146118164 }, { "auxiliary_loss_clip": 0.0143336, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.26798069, "balance_loss_mlp": 1.01582503, "epoch": 0.6696527882158425, "flos": 24290878608000.0, "grad_norm": 2.4163579637871435, "language_loss": 0.66850686, "learning_rate": 1.0394905661243724e-06, "loss": 0.69319779, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19897461, "step": 11138, "time_per_iteration": 2.8650753498077393 }, { "auxiliary_loss_clip": 0.01408496, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.24764049, "balance_loss_mlp": 1.01264477, "epoch": 0.6697129114685104, "flos": 23013070934400.0, "grad_norm": 1.5618162247398246, "language_loss": 0.73391861, "learning_rate": 1.039148976175053e-06, "loss": 0.75831848, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18847656, "step": 11139, "time_per_iteration": 2.846755266189575 }, { "auxiliary_loss_clip": 0.01404749, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.24578691, "balance_loss_mlp": 1.01494038, "epoch": 0.6697730347211784, "flos": 22648396694400.0, "grad_norm": 2.036397201057483, "language_loss": 0.71757936, "learning_rate": 1.0388074226613016e-06, "loss": 0.74196279, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18652344, "step": 11140, "time_per_iteration": 4.451754808425903 }, { "auxiliary_loss_clip": 0.01431053, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.26204634, "balance_loss_mlp": 1.01277506, "epoch": 0.6698331579738463, "flos": 28889149294080.0, "grad_norm": 1.7601380381257328, "language_loss": 0.75750124, "learning_rate": 1.0384659055960691e-06, "loss": 0.78213751, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.19799805, "step": 11141, "time_per_iteration": 3.000377893447876 }, { "auxiliary_loss_clip": 0.01429784, "auxiliary_loss_mlp": 0.01034426, "balance_loss_clip": 1.2630012, "balance_loss_mlp": 1.01488733, "epoch": 0.6698932812265144, "flos": 24217798976640.0, "grad_norm": 2.0901854843769887, "language_loss": 0.82884026, "learning_rate": 1.0381244249923052e-06, "loss": 0.85348237, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19543457, "step": 11142, "time_per_iteration": 4.307194471359253 }, { "auxiliary_loss_clip": 0.01409314, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.24594402, "balance_loss_mlp": 1.01417947, "epoch": 0.6699534044791823, "flos": 22100254214400.0, "grad_norm": 1.7102415254973589, "language_loss": 0.7097699, "learning_rate": 1.037782980862959e-06, "loss": 0.73419368, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1887207, "step": 11143, "time_per_iteration": 4.281297206878662 }, { "auxiliary_loss_clip": 0.01406355, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.24522066, "balance_loss_mlp": 1.01289189, "epoch": 0.6700135277318503, "flos": 25203378614400.0, "grad_norm": 1.5262272589293764, "language_loss": 0.70961374, "learning_rate": 1.0374415732209796e-06, "loss": 0.73398763, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18151855, "step": 11144, "time_per_iteration": 2.8696465492248535 }, { "auxiliary_loss_clip": 0.01429515, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.26533186, "balance_loss_mlp": 1.01359534, "epoch": 0.6700736509845182, "flos": 23450508092160.0, "grad_norm": 1.8308704367636814, "language_loss": 0.75090277, "learning_rate": 1.0371002020793114e-06, "loss": 0.77554023, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20617676, "step": 11145, "time_per_iteration": 2.847994804382324 }, { "auxiliary_loss_clip": 0.01427899, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.26041603, "balance_loss_mlp": 1.01284111, "epoch": 0.6701337742371862, "flos": 24400814768640.0, "grad_norm": 4.131606129087925, "language_loss": 0.71430516, "learning_rate": 1.0367588674509008e-06, "loss": 0.73891342, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20080566, "step": 11146, "time_per_iteration": 2.8610198497772217 }, { "auxiliary_loss_clip": 0.01400117, "auxiliary_loss_mlp": 0.01038616, "balance_loss_clip": 1.24055982, "balance_loss_mlp": 1.01944685, "epoch": 0.6701938974898543, "flos": 14801882976000.0, "grad_norm": 1.877995160589152, "language_loss": 0.79297107, "learning_rate": 1.0364175693486905e-06, "loss": 0.81735837, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19152832, "step": 11147, "time_per_iteration": 2.8161816596984863 }, { "auxiliary_loss_clip": 0.01422822, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.25895739, "balance_loss_mlp": 1.0158844, "epoch": 0.6702540207425222, "flos": 20162829576960.0, "grad_norm": 2.2741456182990536, "language_loss": 0.70779908, "learning_rate": 1.0360763077856218e-06, "loss": 0.73237652, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19042969, "step": 11148, "time_per_iteration": 2.8232264518737793 }, { "auxiliary_loss_clip": 0.01424379, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.2595582, "balance_loss_mlp": 1.01668763, "epoch": 0.6703141439951902, "flos": 21223796330880.0, "grad_norm": 2.0008857823438957, "language_loss": 0.70817751, "learning_rate": 1.035735082774636e-06, "loss": 0.73277628, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18811035, "step": 11149, "time_per_iteration": 2.8280773162841797 }, { "auxiliary_loss_clip": 0.01423313, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.25677514, "balance_loss_mlp": 1.011325, "epoch": 0.6703742672478581, "flos": 23122961850240.0, "grad_norm": 1.9701891807864087, "language_loss": 0.74783486, "learning_rate": 1.0353938943286727e-06, "loss": 0.7723701, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18884277, "step": 11150, "time_per_iteration": 2.8882875442504883 }, { "auxiliary_loss_clip": 0.01426883, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.26075721, "balance_loss_mlp": 1.01342726, "epoch": 0.6704343905005261, "flos": 22539229695360.0, "grad_norm": 2.7833942384166677, "language_loss": 0.79441345, "learning_rate": 1.035052742460671e-06, "loss": 0.81900382, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18737793, "step": 11151, "time_per_iteration": 2.910689115524292 }, { "auxiliary_loss_clip": 0.01183419, "auxiliary_loss_mlp": 0.01040254, "balance_loss_clip": 1.09327173, "balance_loss_mlp": 1.0198456, "epoch": 0.670494513753194, "flos": 64827469595520.0, "grad_norm": 0.8078575823219534, "language_loss": 0.55508566, "learning_rate": 1.0347116271835643e-06, "loss": 0.57732236, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20410156, "step": 11152, "time_per_iteration": 3.4526185989379883 }, { "auxiliary_loss_clip": 0.01421727, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.25489056, "balance_loss_mlp": 1.0127914, "epoch": 0.670554637005862, "flos": 23521415973120.0, "grad_norm": 2.333353269989664, "language_loss": 0.81221437, "learning_rate": 1.0343705485102896e-06, "loss": 0.83674806, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18835449, "step": 11153, "time_per_iteration": 2.8935024738311768 }, { "auxiliary_loss_clip": 0.01425813, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.25877655, "balance_loss_mlp": 1.01527762, "epoch": 0.67061476025853, "flos": 19472735600640.0, "grad_norm": 1.4726488605530594, "language_loss": 0.7674309, "learning_rate": 1.0340295064537814e-06, "loss": 0.79203284, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19091797, "step": 11154, "time_per_iteration": 2.879734992980957 }, { "auxiliary_loss_clip": 0.01438656, "auxiliary_loss_mlp": 0.01039044, "balance_loss_clip": 1.26845026, "balance_loss_mlp": 1.01939797, "epoch": 0.670674883511198, "flos": 20529177874560.0, "grad_norm": 1.400748324236154, "language_loss": 0.76500577, "learning_rate": 1.0336885010269702e-06, "loss": 0.78978276, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.19628906, "step": 11155, "time_per_iteration": 2.8642945289611816 }, { "auxiliary_loss_clip": 0.01440746, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.27413249, "balance_loss_mlp": 1.01710987, "epoch": 0.6707350067638659, "flos": 25494973223040.0, "grad_norm": 1.7676936066436593, "language_loss": 0.8259865, "learning_rate": 1.0333475322427878e-06, "loss": 0.85075676, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19165039, "step": 11156, "time_per_iteration": 2.89217209815979 }, { "auxiliary_loss_clip": 0.01409426, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.248281, "balance_loss_mlp": 1.01617122, "epoch": 0.6707951300165339, "flos": 22283677209600.0, "grad_norm": 2.367773109414181, "language_loss": 0.75600553, "learning_rate": 1.033006600114165e-06, "loss": 0.78045356, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1920166, "step": 11157, "time_per_iteration": 2.853684425354004 }, { "auxiliary_loss_clip": 0.01433207, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.265378, "balance_loss_mlp": 1.01619959, "epoch": 0.6708552532692018, "flos": 23994307071360.0, "grad_norm": 3.29698040121345, "language_loss": 0.74919713, "learning_rate": 1.0326657046540282e-06, "loss": 0.77388549, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19433594, "step": 11158, "time_per_iteration": 2.8713324069976807 }, { "auxiliary_loss_clip": 0.01427007, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.25854897, "balance_loss_mlp": 1.01413965, "epoch": 0.6709153765218698, "flos": 24948595290240.0, "grad_norm": 1.5062742102445699, "language_loss": 0.82238287, "learning_rate": 1.0323248458753044e-06, "loss": 0.8469981, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20373535, "step": 11159, "time_per_iteration": 2.8994321823120117 }, { "auxiliary_loss_clip": 0.01438357, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.27000368, "balance_loss_mlp": 1.01368499, "epoch": 0.6709754997745379, "flos": 17539202016000.0, "grad_norm": 3.228606841439439, "language_loss": 0.77728015, "learning_rate": 1.0319840237909193e-06, "loss": 0.80199206, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19140625, "step": 11160, "time_per_iteration": 2.8161447048187256 }, { "auxiliary_loss_clip": 0.01421054, "auxiliary_loss_mlp": 0.01033552, "balance_loss_clip": 1.25675631, "balance_loss_mlp": 1.013978, "epoch": 0.6710356230272058, "flos": 22101023376000.0, "grad_norm": 1.8934254798651293, "language_loss": 0.73954624, "learning_rate": 1.0316432384137978e-06, "loss": 0.76409227, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19567871, "step": 11161, "time_per_iteration": 2.8638856410980225 }, { "auxiliary_loss_clip": 0.01434315, "auxiliary_loss_mlp": 0.01038863, "balance_loss_clip": 1.26466453, "balance_loss_mlp": 1.01949191, "epoch": 0.6710957462798738, "flos": 24216984570240.0, "grad_norm": 1.7723136089099039, "language_loss": 0.69033521, "learning_rate": 1.0313024897568618e-06, "loss": 0.71506703, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.19360352, "step": 11162, "time_per_iteration": 2.9104793071746826 }, { "auxiliary_loss_clip": 0.01421996, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.25759673, "balance_loss_mlp": 1.0175786, "epoch": 0.6711558695325417, "flos": 19101998557440.0, "grad_norm": 1.7946894933192514, "language_loss": 0.70537663, "learning_rate": 1.030961777833032e-06, "loss": 0.72995687, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18457031, "step": 11163, "time_per_iteration": 2.7993149757385254 }, { "auxiliary_loss_clip": 0.01424824, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.26312685, "balance_loss_mlp": 1.01498556, "epoch": 0.6712159927852097, "flos": 25569138729600.0, "grad_norm": 1.5811604148597977, "language_loss": 0.76072586, "learning_rate": 1.0306211026552291e-06, "loss": 0.78531229, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18835449, "step": 11164, "time_per_iteration": 4.275823593139648 }, { "auxiliary_loss_clip": 0.01423045, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.25825715, "balance_loss_mlp": 1.01299644, "epoch": 0.6712761160378776, "flos": 22236912069120.0, "grad_norm": 2.0024064867941354, "language_loss": 0.6588763, "learning_rate": 1.0302804642363704e-06, "loss": 0.68342394, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18725586, "step": 11165, "time_per_iteration": 2.821869134902954 }, { "auxiliary_loss_clip": 0.01421354, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.2573483, "balance_loss_mlp": 1.01625609, "epoch": 0.6713362392905456, "flos": 22465697616000.0, "grad_norm": 2.3670901914803384, "language_loss": 0.72164333, "learning_rate": 1.0299398625893738e-06, "loss": 0.74621403, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19458008, "step": 11166, "time_per_iteration": 2.8316218852996826 }, { "auxiliary_loss_clip": 0.01424416, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.26177442, "balance_loss_mlp": 1.0142529, "epoch": 0.6713963625432136, "flos": 25641313464960.0, "grad_norm": 1.877940402667431, "language_loss": 0.77945936, "learning_rate": 1.0295992977271546e-06, "loss": 0.80403364, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18762207, "step": 11167, "time_per_iteration": 2.8686482906341553 }, { "auxiliary_loss_clip": 0.01428548, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.26118827, "balance_loss_mlp": 1.01142883, "epoch": 0.6714564857958816, "flos": 35019875243520.0, "grad_norm": 1.7129522664165657, "language_loss": 0.69379824, "learning_rate": 1.029258769662629e-06, "loss": 0.71838772, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.1895752, "step": 11168, "time_per_iteration": 2.9571127891540527 }, { "auxiliary_loss_clip": 0.01438528, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.27015233, "balance_loss_mlp": 1.02027404, "epoch": 0.6715166090485495, "flos": 26289393004800.0, "grad_norm": 1.9746275895954053, "language_loss": 0.74588192, "learning_rate": 1.0289182784087068e-06, "loss": 0.77068645, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.21655273, "step": 11169, "time_per_iteration": 2.848971366882324 }, { "auxiliary_loss_clip": 0.01440244, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.27111816, "balance_loss_mlp": 1.01361799, "epoch": 0.6715767323012175, "flos": 15932536001280.0, "grad_norm": 2.1919190267895057, "language_loss": 0.7668069, "learning_rate": 1.0285778239783005e-06, "loss": 0.79154009, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19433594, "step": 11170, "time_per_iteration": 2.7861461639404297 }, { "auxiliary_loss_clip": 0.01434538, "auxiliary_loss_mlp": 0.01031963, "balance_loss_clip": 1.26590717, "balance_loss_mlp": 1.01250851, "epoch": 0.6716368555538854, "flos": 17499811777920.0, "grad_norm": 2.3456806494638127, "language_loss": 0.75897145, "learning_rate": 1.0282374063843212e-06, "loss": 0.78363645, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19445801, "step": 11171, "time_per_iteration": 2.827265501022339 }, { "auxiliary_loss_clip": 0.01438658, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.26980472, "balance_loss_mlp": 1.0141499, "epoch": 0.6716969788065534, "flos": 16769648891520.0, "grad_norm": 1.6024656472006908, "language_loss": 0.86998653, "learning_rate": 1.0278970256396762e-06, "loss": 0.8947075, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.19299316, "step": 11172, "time_per_iteration": 2.866748571395874 }, { "auxiliary_loss_clip": 0.01425849, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.25934291, "balance_loss_mlp": 1.01406956, "epoch": 0.6717571020592215, "flos": 22719757023360.0, "grad_norm": 4.216270137151523, "language_loss": 0.63878965, "learning_rate": 1.0275566817572733e-06, "loss": 0.6633907, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20178223, "step": 11173, "time_per_iteration": 2.861457586288452 }, { "auxiliary_loss_clip": 0.01458236, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.28311241, "balance_loss_mlp": 1.01796293, "epoch": 0.6718172253118894, "flos": 18743341875840.0, "grad_norm": 2.539078668148549, "language_loss": 0.72649062, "learning_rate": 1.02721637475002e-06, "loss": 0.75145566, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 1.75097656, "router_z_loss_mlp": 0.203125, "step": 11174, "time_per_iteration": 2.816046953201294 }, { "auxiliary_loss_clip": 0.01420469, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.25847411, "balance_loss_mlp": 1.015885, "epoch": 0.6718773485645574, "flos": 15640896147840.0, "grad_norm": 2.127704778584871, "language_loss": 0.69573224, "learning_rate": 1.0268761046308178e-06, "loss": 0.72028953, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19360352, "step": 11175, "time_per_iteration": 4.3781023025512695 }, { "auxiliary_loss_clip": 0.01413576, "auxiliary_loss_mlp": 0.01033161, "balance_loss_clip": 1.25185823, "balance_loss_mlp": 1.01454067, "epoch": 0.6719374718172253, "flos": 19364292518400.0, "grad_norm": 1.8740866203121913, "language_loss": 0.74763626, "learning_rate": 1.0265358714125714e-06, "loss": 0.77210361, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18615723, "step": 11176, "time_per_iteration": 2.9132983684539795 }, { "auxiliary_loss_clip": 0.01431314, "auxiliary_loss_mlp": 0.01033277, "balance_loss_clip": 1.26257896, "balance_loss_mlp": 1.01475179, "epoch": 0.6719975950698933, "flos": 21991403928960.0, "grad_norm": 1.8377965480122378, "language_loss": 0.7437501, "learning_rate": 1.026195675108182e-06, "loss": 0.76839602, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.18518066, "step": 11177, "time_per_iteration": 4.2998597621917725 }, { "auxiliary_loss_clip": 0.01423326, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.25719023, "balance_loss_mlp": 1.01486778, "epoch": 0.6720577183225612, "flos": 25238877799680.0, "grad_norm": 2.0055842456422597, "language_loss": 0.78036511, "learning_rate": 1.025855515730551e-06, "loss": 0.80494475, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19763184, "step": 11178, "time_per_iteration": 2.880328893661499 }, { "auxiliary_loss_clip": 0.01442148, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.27287006, "balance_loss_mlp": 1.01281381, "epoch": 0.6721178415752292, "flos": 16954564965120.0, "grad_norm": 2.6189697790450728, "language_loss": 0.71288818, "learning_rate": 1.0255153932925766e-06, "loss": 0.7376259, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.18823242, "step": 11179, "time_per_iteration": 4.227755546569824 }, { "auxiliary_loss_clip": 0.01429492, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.26429701, "balance_loss_mlp": 1.01239693, "epoch": 0.6721779648278972, "flos": 21550799635200.0, "grad_norm": 1.4999145056035152, "language_loss": 0.74815279, "learning_rate": 1.0251753078071557e-06, "loss": 0.77276576, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19396973, "step": 11180, "time_per_iteration": 2.887552261352539 }, { "auxiliary_loss_clip": 0.01414608, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.25117397, "balance_loss_mlp": 1.01274359, "epoch": 0.6722380880805652, "flos": 22616698072320.0, "grad_norm": 1.5216115196224718, "language_loss": 0.75898337, "learning_rate": 1.0248352592871848e-06, "loss": 0.7834549, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19812012, "step": 11181, "time_per_iteration": 2.943776845932007 }, { "auxiliary_loss_clip": 0.01421305, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.25448966, "balance_loss_mlp": 1.01232052, "epoch": 0.6722982113332331, "flos": 15933938590080.0, "grad_norm": 1.873958847421389, "language_loss": 0.75246596, "learning_rate": 1.0244952477455585e-06, "loss": 0.77699566, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19335938, "step": 11182, "time_per_iteration": 2.8862056732177734 }, { "auxiliary_loss_clip": 0.0141601, "auxiliary_loss_mlp": 0.01037266, "balance_loss_clip": 1.25324702, "balance_loss_mlp": 1.01794171, "epoch": 0.6723583345859011, "flos": 20606239048320.0, "grad_norm": 3.1872858020171977, "language_loss": 0.70487446, "learning_rate": 1.0241552731951699e-06, "loss": 0.72940719, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19335938, "step": 11183, "time_per_iteration": 2.8479068279266357 }, { "auxiliary_loss_clip": 0.01420587, "auxiliary_loss_mlp": 0.01036485, "balance_loss_clip": 1.25617456, "balance_loss_mlp": 1.01666093, "epoch": 0.672418457838569, "flos": 21735851443200.0, "grad_norm": 1.530763082606305, "language_loss": 0.78599769, "learning_rate": 1.0238153356489112e-06, "loss": 0.81056839, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19812012, "step": 11184, "time_per_iteration": 2.823882579803467 }, { "auxiliary_loss_clip": 0.01455416, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.28180718, "balance_loss_mlp": 1.0151813, "epoch": 0.672478581091237, "flos": 21480298957440.0, "grad_norm": 2.445310921564665, "language_loss": 0.67221904, "learning_rate": 1.0234754351196743e-06, "loss": 0.69712329, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 1.73535156, "router_z_loss_mlp": 0.19812012, "step": 11185, "time_per_iteration": 2.834935188293457 }, { "auxiliary_loss_clip": 0.01426319, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.26090682, "balance_loss_mlp": 1.01667237, "epoch": 0.6725387043439051, "flos": 30858996470400.0, "grad_norm": 1.7372146392619452, "language_loss": 0.80933475, "learning_rate": 1.023135571620345e-06, "loss": 0.83396804, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20336914, "step": 11186, "time_per_iteration": 2.9240360260009766 }, { "auxiliary_loss_clip": 0.01411298, "auxiliary_loss_mlp": 0.01034592, "balance_loss_clip": 1.24991298, "balance_loss_mlp": 1.01597142, "epoch": 0.672598827596573, "flos": 24065260197120.0, "grad_norm": 2.2527238905003832, "language_loss": 0.81020319, "learning_rate": 1.022795745163813e-06, "loss": 0.83466208, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18615723, "step": 11187, "time_per_iteration": 2.890038013458252 }, { "auxiliary_loss_clip": 0.01459355, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.28602266, "balance_loss_mlp": 1.01952231, "epoch": 0.672658950849241, "flos": 21881920216320.0, "grad_norm": 2.036410489214374, "language_loss": 0.71433043, "learning_rate": 1.022455955762965e-06, "loss": 0.73931694, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.19775391, "step": 11188, "time_per_iteration": 2.862586498260498 }, { "auxiliary_loss_clip": 0.01416514, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.2546593, "balance_loss_mlp": 1.01794672, "epoch": 0.6727190741019089, "flos": 23232581297280.0, "grad_norm": 1.7101983885023246, "language_loss": 0.76592708, "learning_rate": 1.0221162034306842e-06, "loss": 0.79045719, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1854248, "step": 11189, "time_per_iteration": 2.930891752243042 }, { "auxiliary_loss_clip": 0.01436147, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.26652646, "balance_loss_mlp": 1.0140326, "epoch": 0.6727791973545769, "flos": 15787055410560.0, "grad_norm": 2.295697020851425, "language_loss": 0.76120287, "learning_rate": 1.0217764881798562e-06, "loss": 0.78589934, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19470215, "step": 11190, "time_per_iteration": 2.816148042678833 }, { "auxiliary_loss_clip": 0.01414421, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.25032699, "balance_loss_mlp": 1.01311409, "epoch": 0.6728393206072448, "flos": 21259295516160.0, "grad_norm": 1.5104859132541189, "language_loss": 0.77421969, "learning_rate": 1.0214368100233612e-06, "loss": 0.79868937, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19421387, "step": 11191, "time_per_iteration": 2.8385205268859863 }, { "auxiliary_loss_clip": 0.01418455, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.25611889, "balance_loss_mlp": 1.01313663, "epoch": 0.6728994438599128, "flos": 32135718268800.0, "grad_norm": 1.6843919186816652, "language_loss": 0.86885458, "learning_rate": 1.0210971689740802e-06, "loss": 0.89336014, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18969727, "step": 11192, "time_per_iteration": 3.0052175521850586 }, { "auxiliary_loss_clip": 0.01432333, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.26506472, "balance_loss_mlp": 1.01075613, "epoch": 0.6729595671125808, "flos": 23122735626240.0, "grad_norm": 2.2802828924453307, "language_loss": 0.76717865, "learning_rate": 1.0207575650448923e-06, "loss": 0.79181039, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.20080566, "step": 11193, "time_per_iteration": 2.8555662631988525 }, { "auxiliary_loss_clip": 0.01420961, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.25622106, "balance_loss_mlp": 1.01513338, "epoch": 0.6730196903652488, "flos": 14619636345600.0, "grad_norm": 2.109629072564097, "language_loss": 0.8001405, "learning_rate": 1.0204179982486758e-06, "loss": 0.82469571, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19396973, "step": 11194, "time_per_iteration": 2.828916311264038 }, { "auxiliary_loss_clip": 0.0142019, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.25319254, "balance_loss_mlp": 1.01307917, "epoch": 0.6730798136179167, "flos": 21115850941440.0, "grad_norm": 1.821957805780065, "language_loss": 0.90292907, "learning_rate": 1.0200784685983075e-06, "loss": 0.92744762, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18566895, "step": 11195, "time_per_iteration": 2.841773509979248 }, { "auxiliary_loss_clip": 0.01423592, "auxiliary_loss_mlp": 0.01040116, "balance_loss_clip": 1.25889146, "balance_loss_mlp": 1.02088737, "epoch": 0.6731399368705847, "flos": 28998090069120.0, "grad_norm": 1.713372310781971, "language_loss": 0.73345435, "learning_rate": 1.019738976106662e-06, "loss": 0.75809145, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19250488, "step": 11196, "time_per_iteration": 2.877742290496826 }, { "auxiliary_loss_clip": 0.01182368, "auxiliary_loss_mlp": 0.01084762, "balance_loss_clip": 1.09578657, "balance_loss_mlp": 1.06158817, "epoch": 0.6732000601232526, "flos": 64774641651840.0, "grad_norm": 0.7984003756862422, "language_loss": 0.56571376, "learning_rate": 1.0193995207866123e-06, "loss": 0.58838511, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.23144531, "step": 11197, "time_per_iteration": 3.2934730052948 }, { "auxiliary_loss_clip": 0.01416828, "auxiliary_loss_mlp": 0.01032812, "balance_loss_clip": 1.2569983, "balance_loss_mlp": 1.01414371, "epoch": 0.6732601833759206, "flos": 17210479409280.0, "grad_norm": 2.597663402591247, "language_loss": 0.7622714, "learning_rate": 1.0190601026510312e-06, "loss": 0.78676778, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18664551, "step": 11198, "time_per_iteration": 2.8126072883605957 }, { "auxiliary_loss_clip": 0.01435448, "auxiliary_loss_mlp": 0.01036413, "balance_loss_clip": 1.26756454, "balance_loss_mlp": 1.01668358, "epoch": 0.6733203066285887, "flos": 18667502311680.0, "grad_norm": 1.9467753556814487, "language_loss": 0.82441616, "learning_rate": 1.0187207217127892e-06, "loss": 0.8491348, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.1973877, "step": 11199, "time_per_iteration": 4.250103235244751 }, { "auxiliary_loss_clip": 0.01436669, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.26785219, "balance_loss_mlp": 1.01650429, "epoch": 0.6733804298812566, "flos": 35822982026880.0, "grad_norm": 1.67463560400295, "language_loss": 0.72247314, "learning_rate": 1.0183813779847552e-06, "loss": 0.7472012, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19628906, "step": 11200, "time_per_iteration": 2.9659039974212646 }, { "auxiliary_loss_clip": 0.01437149, "auxiliary_loss_mlp": 0.01038224, "balance_loss_clip": 1.26954925, "balance_loss_mlp": 1.01906657, "epoch": 0.6734405531339246, "flos": 61658559256320.0, "grad_norm": 1.4378163095899341, "language_loss": 0.64952421, "learning_rate": 1.0180420714797987e-06, "loss": 0.6742779, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19140625, "step": 11201, "time_per_iteration": 3.241058349609375 }, { "auxiliary_loss_clip": 0.01439939, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.27089405, "balance_loss_mlp": 1.01886332, "epoch": 0.6735006763865925, "flos": 20532164031360.0, "grad_norm": 1.7219050916038503, "language_loss": 0.64179194, "learning_rate": 1.0177028022107856e-06, "loss": 0.66658235, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20239258, "step": 11202, "time_per_iteration": 2.846827983856201 }, { "auxiliary_loss_clip": 0.0142671, "auxiliary_loss_mlp": 0.01034229, "balance_loss_clip": 1.26033819, "balance_loss_mlp": 1.01498878, "epoch": 0.6735607996392605, "flos": 13927099150080.0, "grad_norm": 2.094537201504983, "language_loss": 0.75993222, "learning_rate": 1.0173635701905796e-06, "loss": 0.78454167, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19226074, "step": 11203, "time_per_iteration": 2.8294644355773926 }, { "auxiliary_loss_clip": 0.01450899, "auxiliary_loss_mlp": 0.01037255, "balance_loss_clip": 1.27760839, "balance_loss_mlp": 1.01753736, "epoch": 0.6736209228919284, "flos": 18816557241600.0, "grad_norm": 1.671165476643503, "language_loss": 0.68456972, "learning_rate": 1.0170243754320456e-06, "loss": 0.7094512, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 1.734375, "router_z_loss_mlp": 0.19702148, "step": 11204, "time_per_iteration": 2.81169056892395 }, { "auxiliary_loss_clip": 0.01439559, "auxiliary_loss_mlp": 0.0103462, "balance_loss_clip": 1.27003956, "balance_loss_mlp": 1.01497412, "epoch": 0.6736810461445965, "flos": 20382068471040.0, "grad_norm": 2.1371447813929247, "language_loss": 0.74405366, "learning_rate": 1.0166852179480465e-06, "loss": 0.76879549, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 1.69726562, "router_z_loss_mlp": 0.19628906, "step": 11205, "time_per_iteration": 2.827439069747925 }, { "auxiliary_loss_clip": 0.01408871, "auxiliary_loss_mlp": 0.01032396, "balance_loss_clip": 1.247684, "balance_loss_mlp": 1.01408553, "epoch": 0.6737411693972644, "flos": 30019530850560.0, "grad_norm": 4.385556712548779, "language_loss": 0.72109318, "learning_rate": 1.0163460977514416e-06, "loss": 0.74550581, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1829834, "step": 11206, "time_per_iteration": 2.9045214653015137 }, { "auxiliary_loss_clip": 0.01466514, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.29115605, "balance_loss_mlp": 1.01973677, "epoch": 0.6738012926499324, "flos": 25458297672960.0, "grad_norm": 1.7783335739800734, "language_loss": 0.68024772, "learning_rate": 1.016007014855092e-06, "loss": 0.70531309, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 1.75488281, "router_z_loss_mlp": 0.20275879, "step": 11207, "time_per_iteration": 2.8745791912078857 }, { "auxiliary_loss_clip": 0.01417726, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.25631964, "balance_loss_mlp": 1.01785088, "epoch": 0.6738614159026003, "flos": 20786540152320.0, "grad_norm": 2.1827011863755263, "language_loss": 0.74552751, "learning_rate": 1.0156679692718553e-06, "loss": 0.77008206, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1986084, "step": 11208, "time_per_iteration": 2.82566237449646 }, { "auxiliary_loss_clip": 0.01432331, "auxiliary_loss_mlp": 0.0104412, "balance_loss_clip": 1.26353407, "balance_loss_mlp": 1.02341294, "epoch": 0.6739215391552683, "flos": 19574708676480.0, "grad_norm": 1.9458324391368427, "language_loss": 0.76215959, "learning_rate": 1.0153289610145867e-06, "loss": 0.78692412, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20703125, "step": 11209, "time_per_iteration": 2.803452968597412 }, { "auxiliary_loss_clip": 0.01405612, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.2455771, "balance_loss_mlp": 1.01737571, "epoch": 0.6739816624079362, "flos": 24398597773440.0, "grad_norm": 1.6456170043600975, "language_loss": 0.67357314, "learning_rate": 1.0149899900961428e-06, "loss": 0.69799125, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18811035, "step": 11210, "time_per_iteration": 4.2679266929626465 }, { "auxiliary_loss_clip": 0.01411303, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.25014257, "balance_loss_mlp": 1.01525259, "epoch": 0.6740417856606042, "flos": 22538279554560.0, "grad_norm": 2.5708709556369653, "language_loss": 0.80498546, "learning_rate": 1.014651056529377e-06, "loss": 0.82942826, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.17724609, "step": 11211, "time_per_iteration": 4.3047380447387695 }, { "auxiliary_loss_clip": 0.01404781, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.24246323, "balance_loss_mlp": 1.01646554, "epoch": 0.6741019089132723, "flos": 25786432097280.0, "grad_norm": 1.3964461783003972, "language_loss": 0.76820254, "learning_rate": 1.014312160327143e-06, "loss": 0.7926079, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19262695, "step": 11212, "time_per_iteration": 2.8905327320098877 }, { "auxiliary_loss_clip": 0.01415639, "auxiliary_loss_mlp": 0.01031356, "balance_loss_clip": 1.24910808, "balance_loss_mlp": 1.01219964, "epoch": 0.6741620321659402, "flos": 21115534227840.0, "grad_norm": 1.6968079917243384, "language_loss": 0.78912944, "learning_rate": 1.0139733015022905e-06, "loss": 0.81359935, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19165039, "step": 11213, "time_per_iteration": 2.852665901184082 }, { "auxiliary_loss_clip": 0.01435311, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.26730454, "balance_loss_mlp": 1.02067053, "epoch": 0.6742221554186082, "flos": 20750090826240.0, "grad_norm": 4.413172105669341, "language_loss": 0.68297416, "learning_rate": 1.0136344800676685e-06, "loss": 0.70772696, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.19311523, "step": 11214, "time_per_iteration": 4.2738142013549805 }, { "auxiliary_loss_clip": 0.01434555, "auxiliary_loss_mlp": 0.01044656, "balance_loss_clip": 1.26626837, "balance_loss_mlp": 1.02571392, "epoch": 0.6742822786712761, "flos": 37786902134400.0, "grad_norm": 1.613315426621431, "language_loss": 0.73166674, "learning_rate": 1.0132956960361263e-06, "loss": 0.75645888, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.1895752, "step": 11215, "time_per_iteration": 2.975663661956787 }, { "auxiliary_loss_clip": 0.01429765, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.26153159, "balance_loss_mlp": 1.01561785, "epoch": 0.6743424019239441, "flos": 37276656814080.0, "grad_norm": 2.185111848608758, "language_loss": 0.67444539, "learning_rate": 1.0129569494205096e-06, "loss": 0.69908118, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.18225098, "step": 11216, "time_per_iteration": 2.9590272903442383 }, { "auxiliary_loss_clip": 0.01183531, "auxiliary_loss_mlp": 0.01037541, "balance_loss_clip": 1.09696531, "balance_loss_mlp": 1.017609, "epoch": 0.674402525176612, "flos": 66032514351360.0, "grad_norm": 0.681121460388815, "language_loss": 0.56298029, "learning_rate": 1.0126182402336646e-06, "loss": 0.58519101, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.19921875, "step": 11217, "time_per_iteration": 3.4389703273773193 }, { "auxiliary_loss_clip": 0.01416925, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.25432634, "balance_loss_mlp": 1.01388645, "epoch": 0.67446264842928, "flos": 26470418025600.0, "grad_norm": 1.7387756178730809, "language_loss": 0.75140107, "learning_rate": 1.0122795684884363e-06, "loss": 0.77589273, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18347168, "step": 11218, "time_per_iteration": 2.936734437942505 }, { "auxiliary_loss_clip": 0.0143286, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.26549768, "balance_loss_mlp": 1.0169611, "epoch": 0.674522771681948, "flos": 23742871862400.0, "grad_norm": 1.6161481067398233, "language_loss": 0.66513628, "learning_rate": 1.0119409341976639e-06, "loss": 0.68983233, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19787598, "step": 11219, "time_per_iteration": 2.908695936203003 }, { "auxiliary_loss_clip": 0.01429585, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.26283193, "balance_loss_mlp": 1.01418471, "epoch": 0.674582894934616, "flos": 24765172295040.0, "grad_norm": 1.6903957586421063, "language_loss": 0.75382268, "learning_rate": 1.0116023373741904e-06, "loss": 0.77844656, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.18603516, "step": 11220, "time_per_iteration": 2.884932279586792 }, { "auxiliary_loss_clip": 0.01413808, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.24881911, "balance_loss_mlp": 1.01546955, "epoch": 0.6746430181872839, "flos": 24837301785600.0, "grad_norm": 1.5911339138647658, "language_loss": 0.71196866, "learning_rate": 1.0112637780308554e-06, "loss": 0.73646158, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20031738, "step": 11221, "time_per_iteration": 2.9021379947662354 }, { "auxiliary_loss_clip": 0.0142158, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.25726807, "balance_loss_mlp": 1.01375723, "epoch": 0.6747031414399519, "flos": 16882299740160.0, "grad_norm": 1.7259143951453721, "language_loss": 0.58972275, "learning_rate": 1.010925256180498e-06, "loss": 0.61425769, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18151855, "step": 11222, "time_per_iteration": 2.8484365940093994 }, { "auxiliary_loss_clip": 0.01429236, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.26164019, "balance_loss_mlp": 1.01150453, "epoch": 0.6747632646926198, "flos": 22795460853120.0, "grad_norm": 2.2933928167687645, "language_loss": 0.77352387, "learning_rate": 1.0105867718359528e-06, "loss": 0.79814106, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20959473, "step": 11223, "time_per_iteration": 2.8326826095581055 }, { "auxiliary_loss_clip": 0.0141933, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.25479794, "balance_loss_mlp": 1.01215553, "epoch": 0.6748233879452878, "flos": 20055291390720.0, "grad_norm": 1.6041890913579429, "language_loss": 0.7598027, "learning_rate": 1.0102483250100574e-06, "loss": 0.7843082, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.1907959, "step": 11224, "time_per_iteration": 2.827742576599121 }, { "auxiliary_loss_clip": 0.01419417, "auxiliary_loss_mlp": 0.01032942, "balance_loss_clip": 1.25589454, "balance_loss_mlp": 1.01415491, "epoch": 0.6748835111979558, "flos": 23013161424000.0, "grad_norm": 1.5643717431182482, "language_loss": 0.63308573, "learning_rate": 1.0099099157156445e-06, "loss": 0.65760928, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18774414, "step": 11225, "time_per_iteration": 2.90899658203125 }, { "auxiliary_loss_clip": 0.01405717, "auxiliary_loss_mlp": 0.0103321, "balance_loss_clip": 1.24597096, "balance_loss_mlp": 1.0149951, "epoch": 0.6749436344506238, "flos": 12203348296320.0, "grad_norm": 1.6692321059844535, "language_loss": 0.64526415, "learning_rate": 1.0095715439655462e-06, "loss": 0.66965348, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18225098, "step": 11226, "time_per_iteration": 2.817673444747925 }, { "auxiliary_loss_clip": 0.01423934, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.25685787, "balance_loss_mlp": 1.01240575, "epoch": 0.6750037577032918, "flos": 11880914716800.0, "grad_norm": 2.3332411981327725, "language_loss": 0.72527331, "learning_rate": 1.0092332097725945e-06, "loss": 0.74982852, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.19189453, "step": 11227, "time_per_iteration": 2.8295962810516357 }, { "auxiliary_loss_clip": 0.01410637, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.24777675, "balance_loss_mlp": 1.01434946, "epoch": 0.6750638809559597, "flos": 17028504247680.0, "grad_norm": 1.8972293351223453, "language_loss": 0.72733188, "learning_rate": 1.0088949131496183e-06, "loss": 0.75178099, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19934082, "step": 11228, "time_per_iteration": 2.826847791671753 }, { "auxiliary_loss_clip": 0.01186252, "auxiliary_loss_mlp": 0.01023069, "balance_loss_clip": 1.09869015, "balance_loss_mlp": 1.00294614, "epoch": 0.6751240042086277, "flos": 70984781504640.0, "grad_norm": 0.7623513704984965, "language_loss": 0.53358984, "learning_rate": 1.0085566541094482e-06, "loss": 0.55568302, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.20117188, "step": 11229, "time_per_iteration": 3.395169496536255 }, { "auxiliary_loss_clip": 0.0140433, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.24352229, "balance_loss_mlp": 1.01246583, "epoch": 0.6751841274612956, "flos": 22685569937280.0, "grad_norm": 1.7516031160688172, "language_loss": 0.81049269, "learning_rate": 1.0082184326649072e-06, "loss": 0.83485246, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19177246, "step": 11230, "time_per_iteration": 2.8386027812957764 }, { "auxiliary_loss_clip": 0.01411586, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.24990475, "balance_loss_mlp": 1.01132226, "epoch": 0.6752442507139637, "flos": 21298685754240.0, "grad_norm": 1.4240900585739449, "language_loss": 0.6667136, "learning_rate": 1.0078802488288228e-06, "loss": 0.69113553, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19299316, "step": 11231, "time_per_iteration": 2.8295722007751465 }, { "auxiliary_loss_clip": 0.0144464, "auxiliary_loss_mlp": 0.01040115, "balance_loss_clip": 1.2732935, "balance_loss_mlp": 1.01919436, "epoch": 0.6753043739666316, "flos": 28268651099520.0, "grad_norm": 1.7217560347602512, "language_loss": 0.67598951, "learning_rate": 1.0075421026140198e-06, "loss": 0.70083708, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 1.71191406, "router_z_loss_mlp": 0.20947266, "step": 11232, "time_per_iteration": 2.9281342029571533 }, { "auxiliary_loss_clip": 0.01411476, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.24823356, "balance_loss_mlp": 1.01328576, "epoch": 0.6753644972192996, "flos": 21370091328000.0, "grad_norm": 1.6148316312993396, "language_loss": 0.73143458, "learning_rate": 1.0072039940333188e-06, "loss": 0.75587958, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.1973877, "step": 11233, "time_per_iteration": 2.8647220134735107 }, { "auxiliary_loss_clip": 0.01434422, "auxiliary_loss_mlp": 0.01034598, "balance_loss_clip": 1.26741576, "balance_loss_mlp": 1.0143559, "epoch": 0.6754246204719675, "flos": 26553225288960.0, "grad_norm": 1.7509896986781204, "language_loss": 0.77492702, "learning_rate": 1.0068659230995418e-06, "loss": 0.79961717, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20227051, "step": 11234, "time_per_iteration": 4.352690696716309 }, { "auxiliary_loss_clip": 0.01420271, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.25596452, "balance_loss_mlp": 1.01696312, "epoch": 0.6754847437246355, "flos": 25567645651200.0, "grad_norm": 2.412404365666908, "language_loss": 0.76004773, "learning_rate": 1.0065278898255101e-06, "loss": 0.78461742, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19726562, "step": 11235, "time_per_iteration": 2.872056245803833 }, { "auxiliary_loss_clip": 0.01184674, "auxiliary_loss_mlp": 0.0102464, "balance_loss_clip": 1.096071, "balance_loss_mlp": 1.00384998, "epoch": 0.6755448669773034, "flos": 59539539398400.0, "grad_norm": 0.7833881521914292, "language_loss": 0.51474053, "learning_rate": 1.0061898942240387e-06, "loss": 0.5368337, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20800781, "step": 11236, "time_per_iteration": 3.3188884258270264 }, { "auxiliary_loss_clip": 0.0141174, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.24856281, "balance_loss_mlp": 1.01451349, "epoch": 0.6756049902299714, "flos": 23304891767040.0, "grad_norm": 2.1207822869406265, "language_loss": 0.76327699, "learning_rate": 1.0058519363079464e-06, "loss": 0.78773904, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19946289, "step": 11237, "time_per_iteration": 2.854684829711914 }, { "auxiliary_loss_clip": 0.01425802, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.26123214, "balance_loss_mlp": 1.01722193, "epoch": 0.6756651134826394, "flos": 31587575788800.0, "grad_norm": 1.5751279412275998, "language_loss": 0.77369905, "learning_rate": 1.0055140160900482e-06, "loss": 0.79832608, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19665527, "step": 11238, "time_per_iteration": 2.9037444591522217 }, { "auxiliary_loss_clip": 0.01431533, "auxiliary_loss_mlp": 0.01036266, "balance_loss_clip": 1.26186264, "balance_loss_mlp": 1.01590514, "epoch": 0.6757252367353074, "flos": 27283885868160.0, "grad_norm": 2.0203373056200893, "language_loss": 0.67585051, "learning_rate": 1.0051761335831587e-06, "loss": 0.7005285, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.20361328, "step": 11239, "time_per_iteration": 2.9243950843811035 }, { "auxiliary_loss_clip": 0.01406469, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.24608469, "balance_loss_mlp": 1.01373351, "epoch": 0.6757853599879754, "flos": 16838113553280.0, "grad_norm": 1.9121899035772907, "language_loss": 0.83539724, "learning_rate": 1.0048382888000898e-06, "loss": 0.85978431, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18505859, "step": 11240, "time_per_iteration": 2.8092992305755615 }, { "auxiliary_loss_clip": 0.01445857, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.27341461, "balance_loss_mlp": 1.01463604, "epoch": 0.6758454832406433, "flos": 23230002343680.0, "grad_norm": 2.3485874225806853, "language_loss": 0.7570582, "learning_rate": 1.0045004817536525e-06, "loss": 0.78186691, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 1.72460938, "router_z_loss_mlp": 0.20361328, "step": 11241, "time_per_iteration": 2.825873374938965 }, { "auxiliary_loss_clip": 0.01419395, "auxiliary_loss_mlp": 0.01040986, "balance_loss_clip": 1.2553283, "balance_loss_mlp": 1.02104235, "epoch": 0.6759056064933113, "flos": 16298160382080.0, "grad_norm": 2.970177172062965, "language_loss": 0.81076372, "learning_rate": 1.0041627124566572e-06, "loss": 0.83536756, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19946289, "step": 11242, "time_per_iteration": 2.881956100463867 }, { "auxiliary_loss_clip": 0.01424884, "auxiliary_loss_mlp": 0.01035146, "balance_loss_clip": 1.2588433, "balance_loss_mlp": 1.01640606, "epoch": 0.6759657297459792, "flos": 25933541500800.0, "grad_norm": 1.9179393313317445, "language_loss": 0.72970343, "learning_rate": 1.0038249809219109e-06, "loss": 0.75430369, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18737793, "step": 11243, "time_per_iteration": 2.8492431640625 }, { "auxiliary_loss_clip": 0.01418868, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.25369167, "balance_loss_mlp": 1.02244043, "epoch": 0.6760258529986473, "flos": 23010627715200.0, "grad_norm": 1.9155133160127673, "language_loss": 0.73504001, "learning_rate": 1.003487287162221e-06, "loss": 0.75965011, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19689941, "step": 11244, "time_per_iteration": 2.8308210372924805 }, { "auxiliary_loss_clip": 0.01428323, "auxiliary_loss_mlp": 0.01041663, "balance_loss_clip": 1.2618525, "balance_loss_mlp": 1.02175498, "epoch": 0.6760859762513152, "flos": 20969058251520.0, "grad_norm": 3.532575152755126, "language_loss": 0.86466956, "learning_rate": 1.003149631190393e-06, "loss": 0.88936937, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19897461, "step": 11245, "time_per_iteration": 4.291662931442261 }, { "auxiliary_loss_clip": 0.01445695, "auxiliary_loss_mlp": 0.0104084, "balance_loss_clip": 1.27391768, "balance_loss_mlp": 1.02090788, "epoch": 0.6761460995039832, "flos": 23633388149760.0, "grad_norm": 1.7049506634949008, "language_loss": 0.73788965, "learning_rate": 1.0028120130192327e-06, "loss": 0.76275498, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.19946289, "step": 11246, "time_per_iteration": 4.291633129119873 }, { "auxiliary_loss_clip": 0.01417204, "auxiliary_loss_mlp": 0.01030165, "balance_loss_clip": 1.25241327, "balance_loss_mlp": 1.01093626, "epoch": 0.6762062227566511, "flos": 20778984270720.0, "grad_norm": 1.691443914578042, "language_loss": 0.88532627, "learning_rate": 1.002474432661539e-06, "loss": 0.90979993, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19213867, "step": 11247, "time_per_iteration": 2.8198928833007812 }, { "auxiliary_loss_clip": 0.01189441, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.09864092, "balance_loss_mlp": 1.0174439, "epoch": 0.6762663460093191, "flos": 52847296490880.0, "grad_norm": 0.8179205197727337, "language_loss": 0.54051077, "learning_rate": 1.002136890130115e-06, "loss": 0.56281328, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.23339844, "step": 11248, "time_per_iteration": 3.3671419620513916 }, { "auxiliary_loss_clip": 0.01407438, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.24825287, "balance_loss_mlp": 1.01897001, "epoch": 0.676326469261987, "flos": 23706874984320.0, "grad_norm": 1.8698015779887829, "language_loss": 0.74731672, "learning_rate": 1.001799385437761e-06, "loss": 0.77176839, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18786621, "step": 11249, "time_per_iteration": 4.355906963348389 }, { "auxiliary_loss_clip": 0.01433759, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.26473391, "balance_loss_mlp": 1.01622963, "epoch": 0.676386592514655, "flos": 14071720089600.0, "grad_norm": 1.874217154506343, "language_loss": 0.7533493, "learning_rate": 1.0014619185972732e-06, "loss": 0.77804887, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.19970703, "step": 11250, "time_per_iteration": 2.807476282119751 }, { "auxiliary_loss_clip": 0.01423059, "auxiliary_loss_mlp": 0.01038797, "balance_loss_clip": 1.25720739, "balance_loss_mlp": 1.01946092, "epoch": 0.676446715767323, "flos": 20421865912320.0, "grad_norm": 2.2866520160158794, "language_loss": 0.75870132, "learning_rate": 1.0011244896214497e-06, "loss": 0.78331989, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19335938, "step": 11251, "time_per_iteration": 2.8696303367614746 }, { "auxiliary_loss_clip": 0.01417089, "auxiliary_loss_mlp": 0.0102999, "balance_loss_clip": 1.25420523, "balance_loss_mlp": 1.01036847, "epoch": 0.676506839019991, "flos": 21298188061440.0, "grad_norm": 1.9198920694562533, "language_loss": 0.70911598, "learning_rate": 1.0007870985230873e-06, "loss": 0.73358679, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19616699, "step": 11252, "time_per_iteration": 2.8438289165496826 }, { "auxiliary_loss_clip": 0.01419404, "auxiliary_loss_mlp": 0.01033298, "balance_loss_clip": 1.25584233, "balance_loss_mlp": 1.01458263, "epoch": 0.676566962272659, "flos": 29943917510400.0, "grad_norm": 1.8048824894933624, "language_loss": 0.67342532, "learning_rate": 1.0004497453149765e-06, "loss": 0.69795233, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18725586, "step": 11253, "time_per_iteration": 2.9105162620544434 }, { "auxiliary_loss_clip": 0.01431653, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.26293802, "balance_loss_mlp": 1.01693738, "epoch": 0.6766270855253269, "flos": 17940189847680.0, "grad_norm": 1.6998826911945832, "language_loss": 0.77629697, "learning_rate": 1.0001124300099115e-06, "loss": 0.80098718, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.2043457, "step": 11254, "time_per_iteration": 2.8724215030670166 }, { "auxiliary_loss_clip": 0.01430969, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.26363599, "balance_loss_mlp": 1.01195312, "epoch": 0.6766872087779949, "flos": 23113279463040.0, "grad_norm": 1.8736773460946687, "language_loss": 0.72942448, "learning_rate": 9.997751526206835e-07, "loss": 0.75403965, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.18591309, "step": 11255, "time_per_iteration": 2.8525168895721436 }, { "auxiliary_loss_clip": 0.01431815, "auxiliary_loss_mlp": 0.01042239, "balance_loss_clip": 1.26300859, "balance_loss_mlp": 1.02284384, "epoch": 0.6767473320306628, "flos": 26224185968640.0, "grad_norm": 2.364548215164128, "language_loss": 0.76398909, "learning_rate": 9.994379131600828e-07, "loss": 0.78872967, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.1940918, "step": 11256, "time_per_iteration": 2.901160478591919 }, { "auxiliary_loss_clip": 0.01414126, "auxiliary_loss_mlp": 0.0103582, "balance_loss_clip": 1.24965012, "balance_loss_mlp": 1.01609063, "epoch": 0.6768074552833309, "flos": 18377762739840.0, "grad_norm": 2.2596065018432254, "language_loss": 0.66147494, "learning_rate": 9.991007116408965e-07, "loss": 0.68597448, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19714355, "step": 11257, "time_per_iteration": 2.8886899948120117 }, { "auxiliary_loss_clip": 0.01404796, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.24260807, "balance_loss_mlp": 1.01674402, "epoch": 0.6768675785359988, "flos": 23050425156480.0, "grad_norm": 1.4335510845529764, "language_loss": 0.76275277, "learning_rate": 9.987635480759109e-07, "loss": 0.7871623, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19421387, "step": 11258, "time_per_iteration": 2.836700677871704 }, { "auxiliary_loss_clip": 0.01410011, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.24983335, "balance_loss_mlp": 1.01388586, "epoch": 0.6769277017886668, "flos": 33049485129600.0, "grad_norm": 1.6548869124540608, "language_loss": 0.67584044, "learning_rate": 9.984264224779127e-07, "loss": 0.70027107, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19177246, "step": 11259, "time_per_iteration": 2.933305501937866 }, { "auxiliary_loss_clip": 0.01408811, "auxiliary_loss_mlp": 0.01035726, "balance_loss_clip": 1.2444905, "balance_loss_mlp": 1.0158658, "epoch": 0.6769878250413347, "flos": 20857719502080.0, "grad_norm": 2.0513872845514602, "language_loss": 0.8618716, "learning_rate": 9.980893348596839e-07, "loss": 0.88631701, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1986084, "step": 11260, "time_per_iteration": 2.8243842124938965 }, { "auxiliary_loss_clip": 0.01439266, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.26843059, "balance_loss_mlp": 1.01639295, "epoch": 0.6770479482940027, "flos": 15604311087360.0, "grad_norm": 2.3538293372455614, "language_loss": 0.77888483, "learning_rate": 9.977522852340081e-07, "loss": 0.80363554, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 1.70800781, "router_z_loss_mlp": 0.19396973, "step": 11261, "time_per_iteration": 2.8419673442840576 }, { "auxiliary_loss_clip": 0.01422731, "auxiliary_loss_mlp": 0.01038474, "balance_loss_clip": 1.25638485, "balance_loss_mlp": 1.0176723, "epoch": 0.6771080715466706, "flos": 18629288438400.0, "grad_norm": 2.003262749703773, "language_loss": 0.88828522, "learning_rate": 9.97415273613666e-07, "loss": 0.91289723, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20800781, "step": 11262, "time_per_iteration": 2.902888298034668 }, { "auxiliary_loss_clip": 0.01425626, "auxiliary_loss_mlp": 0.01032566, "balance_loss_clip": 1.25867224, "balance_loss_mlp": 1.01327753, "epoch": 0.6771681947993387, "flos": 12503177458560.0, "grad_norm": 2.147705495014056, "language_loss": 0.74895895, "learning_rate": 9.97078300011439e-07, "loss": 0.77354091, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19287109, "step": 11263, "time_per_iteration": 2.818920373916626 }, { "auxiliary_loss_clip": 0.01436703, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.26639748, "balance_loss_mlp": 1.02010179, "epoch": 0.6772283180520066, "flos": 22247182638720.0, "grad_norm": 1.9039323514292321, "language_loss": 0.68283838, "learning_rate": 9.967413644401016e-07, "loss": 0.70761162, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.20532227, "step": 11264, "time_per_iteration": 2.8019471168518066 }, { "auxiliary_loss_clip": 0.01404884, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.24166894, "balance_loss_mlp": 1.0135541, "epoch": 0.6772884413046746, "flos": 16151593916160.0, "grad_norm": 1.9360543372571062, "language_loss": 0.74073637, "learning_rate": 9.964044669124324e-07, "loss": 0.76512593, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.20544434, "step": 11265, "time_per_iteration": 2.82668137550354 }, { "auxiliary_loss_clip": 0.01415711, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.25258005, "balance_loss_mlp": 1.01420784, "epoch": 0.6773485645573426, "flos": 19145189358720.0, "grad_norm": 1.489109419946897, "language_loss": 0.62357032, "learning_rate": 9.96067607441207e-07, "loss": 0.64806849, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19873047, "step": 11266, "time_per_iteration": 2.8105502128601074 }, { "auxiliary_loss_clip": 0.01412707, "auxiliary_loss_mlp": 0.01032067, "balance_loss_clip": 1.24838138, "balance_loss_mlp": 1.01237392, "epoch": 0.6774086878100105, "flos": 14144980700160.0, "grad_norm": 1.7634799577808051, "language_loss": 0.71933043, "learning_rate": 9.957307860391976e-07, "loss": 0.74377823, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19702148, "step": 11267, "time_per_iteration": 2.834014415740967 }, { "auxiliary_loss_clip": 0.01414252, "auxiliary_loss_mlp": 0.01031803, "balance_loss_clip": 1.24974597, "balance_loss_mlp": 1.01238418, "epoch": 0.6774688110626785, "flos": 22206389811840.0, "grad_norm": 2.457054654739568, "language_loss": 0.71318364, "learning_rate": 9.953940027191785e-07, "loss": 0.7376442, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19421387, "step": 11268, "time_per_iteration": 2.936436414718628 }, { "auxiliary_loss_clip": 0.0142622, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 1.26145279, "balance_loss_mlp": 1.01237988, "epoch": 0.6775289343153464, "flos": 23050470401280.0, "grad_norm": 2.648316782781714, "language_loss": 0.77631009, "learning_rate": 9.950572574939194e-07, "loss": 0.80089104, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19494629, "step": 11269, "time_per_iteration": 4.366933584213257 }, { "auxiliary_loss_clip": 0.01426562, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.25877237, "balance_loss_mlp": 1.0179714, "epoch": 0.6775890575680145, "flos": 18301923175680.0, "grad_norm": 2.179088861607088, "language_loss": 0.7477001, "learning_rate": 9.94720550376189e-07, "loss": 0.77235574, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21020508, "step": 11270, "time_per_iteration": 2.8500144481658936 }, { "auxiliary_loss_clip": 0.01416498, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.25231647, "balance_loss_mlp": 1.0136826, "epoch": 0.6776491808206824, "flos": 25347004168320.0, "grad_norm": 1.6856467433249251, "language_loss": 0.73650515, "learning_rate": 9.94383881378756e-07, "loss": 0.76101828, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.21130371, "step": 11271, "time_per_iteration": 2.848832368850708 }, { "auxiliary_loss_clip": 0.01426492, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.26029384, "balance_loss_mlp": 1.01294637, "epoch": 0.6777093040733504, "flos": 26038591223040.0, "grad_norm": 2.542409789613877, "language_loss": 0.68876714, "learning_rate": 9.94047250514387e-07, "loss": 0.71335548, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.1940918, "step": 11272, "time_per_iteration": 2.868018865585327 }, { "auxiliary_loss_clip": 0.0143334, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.26392078, "balance_loss_mlp": 1.01682615, "epoch": 0.6777694273260183, "flos": 18012455072640.0, "grad_norm": 1.8400628996928945, "language_loss": 0.74637687, "learning_rate": 9.937106577958481e-07, "loss": 0.77109706, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.21862793, "step": 11273, "time_per_iteration": 2.8254570960998535 }, { "auxiliary_loss_clip": 0.01408234, "auxiliary_loss_mlp": 0.01045717, "balance_loss_clip": 1.24513459, "balance_loss_mlp": 1.02438998, "epoch": 0.6778295505786863, "flos": 23451639212160.0, "grad_norm": 2.275979459116119, "language_loss": 0.70764017, "learning_rate": 9.933741032359015e-07, "loss": 0.73217964, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.21325684, "step": 11274, "time_per_iteration": 2.8286259174346924 }, { "auxiliary_loss_clip": 0.01424382, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.25731289, "balance_loss_mlp": 1.01226783, "epoch": 0.6778896738313542, "flos": 19107608912640.0, "grad_norm": 1.5338748678282261, "language_loss": 0.66405052, "learning_rate": 9.930375868473093e-07, "loss": 0.68862027, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.203125, "step": 11275, "time_per_iteration": 2.850635528564453 }, { "auxiliary_loss_clip": 0.01424107, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.25921357, "balance_loss_mlp": 1.01508141, "epoch": 0.6779497970840223, "flos": 26115018969600.0, "grad_norm": 2.1696697171242723, "language_loss": 0.73176479, "learning_rate": 9.927011086428335e-07, "loss": 0.75635523, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1986084, "step": 11276, "time_per_iteration": 2.8668644428253174 }, { "auxiliary_loss_clip": 0.01410471, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.24869335, "balance_loss_mlp": 1.01317835, "epoch": 0.6780099203366902, "flos": 19728740534400.0, "grad_norm": 1.8867708060739383, "language_loss": 0.7746889, "learning_rate": 9.923646686352317e-07, "loss": 0.79912162, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19628906, "step": 11277, "time_per_iteration": 2.8433282375335693 }, { "auxiliary_loss_clip": 0.01433158, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.26425588, "balance_loss_mlp": 1.01720738, "epoch": 0.6780700435893582, "flos": 18221378152320.0, "grad_norm": 2.507708213798488, "language_loss": 0.84744966, "learning_rate": 9.920282668372627e-07, "loss": 0.872154, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20068359, "step": 11278, "time_per_iteration": 2.789520025253296 }, { "auxiliary_loss_clip": 0.01401856, "auxiliary_loss_mlp": 0.01031372, "balance_loss_clip": 1.24120605, "balance_loss_mlp": 1.01204848, "epoch": 0.6781301668420262, "flos": 25387661260800.0, "grad_norm": 2.246124082331041, "language_loss": 0.70408392, "learning_rate": 9.916919032616844e-07, "loss": 0.7284162, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19335938, "step": 11279, "time_per_iteration": 2.861539602279663 }, { "auxiliary_loss_clip": 0.01416446, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.25167966, "balance_loss_mlp": 1.01421368, "epoch": 0.6781902900946941, "flos": 24029987235840.0, "grad_norm": 1.9754614764232779, "language_loss": 0.74934232, "learning_rate": 9.913555779212485e-07, "loss": 0.77385497, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20605469, "step": 11280, "time_per_iteration": 4.315319776535034 }, { "auxiliary_loss_clip": 0.01424775, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.25485694, "balance_loss_mlp": 1.01193249, "epoch": 0.6782504133473621, "flos": 19656384819840.0, "grad_norm": 1.8023470798250691, "language_loss": 0.71094191, "learning_rate": 9.910192908287104e-07, "loss": 0.73551673, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20751953, "step": 11281, "time_per_iteration": 2.882263660430908 }, { "auxiliary_loss_clip": 0.01416484, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.25445724, "balance_loss_mlp": 1.01541185, "epoch": 0.67831053660003, "flos": 24942577731840.0, "grad_norm": 1.4810746510706485, "language_loss": 0.64675957, "learning_rate": 9.906830419968217e-07, "loss": 0.67128235, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20397949, "step": 11282, "time_per_iteration": 4.3315064907073975 }, { "auxiliary_loss_clip": 0.01432217, "auxiliary_loss_mlp": 0.01040536, "balance_loss_clip": 1.26153445, "balance_loss_mlp": 1.01984131, "epoch": 0.6783706598526981, "flos": 31219870147200.0, "grad_norm": 1.7792129745308065, "language_loss": 0.75265372, "learning_rate": 9.90346831438334e-07, "loss": 0.77738118, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 1.70703125, "router_z_loss_mlp": 0.20690918, "step": 11283, "time_per_iteration": 2.901803970336914 }, { "auxiliary_loss_clip": 0.01417343, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.25205636, "balance_loss_mlp": 1.01187837, "epoch": 0.678430783105366, "flos": 35454054775680.0, "grad_norm": 1.5782463981378694, "language_loss": 0.574682, "learning_rate": 9.900106591659948e-07, "loss": 0.59916806, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19396973, "step": 11284, "time_per_iteration": 4.358064651489258 }, { "auxiliary_loss_clip": 0.01416248, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.25118554, "balance_loss_mlp": 1.01325858, "epoch": 0.678490906358034, "flos": 14436937267200.0, "grad_norm": 2.1218315328407322, "language_loss": 0.76242232, "learning_rate": 9.896745251925535e-07, "loss": 0.78691888, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20141602, "step": 11285, "time_per_iteration": 2.8184762001037598 }, { "auxiliary_loss_clip": 0.01402456, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.24122655, "balance_loss_mlp": 1.0160048, "epoch": 0.6785510296107019, "flos": 24320586458880.0, "grad_norm": 1.69964895994125, "language_loss": 0.66680771, "learning_rate": 9.893384295307557e-07, "loss": 0.6911968, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20446777, "step": 11286, "time_per_iteration": 2.874673843383789 }, { "auxiliary_loss_clip": 0.01413247, "auxiliary_loss_mlp": 0.01030331, "balance_loss_clip": 1.2466743, "balance_loss_mlp": 1.01006532, "epoch": 0.6786111528633699, "flos": 26987947758720.0, "grad_norm": 2.31654858611157, "language_loss": 0.53541523, "learning_rate": 9.890023721933447e-07, "loss": 0.55985099, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20263672, "step": 11287, "time_per_iteration": 2.8719193935394287 }, { "auxiliary_loss_clip": 0.01410818, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.24626613, "balance_loss_mlp": 1.01599073, "epoch": 0.6786712761160378, "flos": 24328549543680.0, "grad_norm": 1.4435068111251812, "language_loss": 0.776425, "learning_rate": 9.886663531930655e-07, "loss": 0.80089462, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20153809, "step": 11288, "time_per_iteration": 2.887211322784424 }, { "auxiliary_loss_clip": 0.01423731, "auxiliary_loss_mlp": 0.01041052, "balance_loss_clip": 1.25886631, "balance_loss_mlp": 1.02122712, "epoch": 0.6787313993687059, "flos": 22940896199040.0, "grad_norm": 2.6862971389687655, "language_loss": 0.74131191, "learning_rate": 9.883303725426593e-07, "loss": 0.76595974, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19824219, "step": 11289, "time_per_iteration": 2.8789894580841064 }, { "auxiliary_loss_clip": 0.01412804, "auxiliary_loss_mlp": 0.01035915, "balance_loss_clip": 1.24759984, "balance_loss_mlp": 1.01632881, "epoch": 0.6787915226213738, "flos": 26879278452480.0, "grad_norm": 1.5116264527097434, "language_loss": 0.80383253, "learning_rate": 9.879944302548682e-07, "loss": 0.82831973, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19580078, "step": 11290, "time_per_iteration": 2.867624521255493 }, { "auxiliary_loss_clip": 0.01401587, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.23988724, "balance_loss_mlp": 1.01275802, "epoch": 0.6788516458740418, "flos": 20017846679040.0, "grad_norm": 1.5201554448099261, "language_loss": 0.75504929, "learning_rate": 9.87658526342428e-07, "loss": 0.77939236, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19970703, "step": 11291, "time_per_iteration": 2.8425180912017822 }, { "auxiliary_loss_clip": 0.0141025, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.24379349, "balance_loss_mlp": 1.01453137, "epoch": 0.6789117691267098, "flos": 28737877368960.0, "grad_norm": 1.8344090428464768, "language_loss": 0.76031041, "learning_rate": 9.873226608180785e-07, "loss": 0.78476191, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20349121, "step": 11292, "time_per_iteration": 2.9313571453094482 }, { "auxiliary_loss_clip": 0.01409303, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.24418664, "balance_loss_mlp": 1.01317191, "epoch": 0.6789718923793777, "flos": 23413696807680.0, "grad_norm": 5.073850271019591, "language_loss": 0.85112441, "learning_rate": 9.869868336945556e-07, "loss": 0.87554848, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19946289, "step": 11293, "time_per_iteration": 2.9133214950561523 }, { "auxiliary_loss_clip": 0.01440673, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.26857722, "balance_loss_mlp": 1.01557612, "epoch": 0.6790320156320457, "flos": 20458541462400.0, "grad_norm": 2.6195683866197585, "language_loss": 0.80698299, "learning_rate": 9.866510449845929e-07, "loss": 0.83175099, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.20544434, "step": 11294, "time_per_iteration": 2.840186595916748 }, { "auxiliary_loss_clip": 0.01415216, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.25090504, "balance_loss_mlp": 1.01057029, "epoch": 0.6790921388847136, "flos": 24177006149760.0, "grad_norm": 1.7551487779009975, "language_loss": 0.79491174, "learning_rate": 9.86315294700924e-07, "loss": 0.81936073, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19104004, "step": 11295, "time_per_iteration": 2.8824830055236816 }, { "auxiliary_loss_clip": 0.0139699, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.23799169, "balance_loss_mlp": 1.01463664, "epoch": 0.6791522621373817, "flos": 21918098073600.0, "grad_norm": 1.7527298955205517, "language_loss": 0.71848845, "learning_rate": 9.859795828562823e-07, "loss": 0.7427938, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18896484, "step": 11296, "time_per_iteration": 2.8827760219573975 }, { "auxiliary_loss_clip": 0.01419055, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.25424206, "balance_loss_mlp": 1.01158476, "epoch": 0.6792123853900496, "flos": 24837166051200.0, "grad_norm": 4.1947304036601425, "language_loss": 0.70672762, "learning_rate": 9.856439094633949e-07, "loss": 0.73123503, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20092773, "step": 11297, "time_per_iteration": 2.88236927986145 }, { "auxiliary_loss_clip": 0.01420344, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.25125313, "balance_loss_mlp": 1.01342642, "epoch": 0.6792725086427176, "flos": 17575153649280.0, "grad_norm": 2.5828254571242244, "language_loss": 0.67163193, "learning_rate": 9.853082745349918e-07, "loss": 0.69617009, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20043945, "step": 11298, "time_per_iteration": 2.797438621520996 }, { "auxiliary_loss_clip": 0.01420188, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.25458765, "balance_loss_mlp": 1.01562512, "epoch": 0.6793326318953855, "flos": 26952810531840.0, "grad_norm": 1.6536026558085015, "language_loss": 0.72178936, "learning_rate": 9.84972678083801e-07, "loss": 0.74633586, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18847656, "step": 11299, "time_per_iteration": 2.8717901706695557 }, { "auxiliary_loss_clip": 0.01414142, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.2487762, "balance_loss_mlp": 1.01810062, "epoch": 0.6793927551480535, "flos": 24329454439680.0, "grad_norm": 1.2905766205020102, "language_loss": 0.77893174, "learning_rate": 9.846371201225488e-07, "loss": 0.80347264, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.21862793, "step": 11300, "time_per_iteration": 2.8312931060791016 }, { "auxiliary_loss_clip": 0.01408104, "auxiliary_loss_mlp": 0.01040589, "balance_loss_clip": 1.24368, "balance_loss_mlp": 1.02112222, "epoch": 0.6794528784007214, "flos": 11443884762240.0, "grad_norm": 2.290388361300239, "language_loss": 0.64418095, "learning_rate": 9.843016006639577e-07, "loss": 0.66866791, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19470215, "step": 11301, "time_per_iteration": 2.7834506034851074 }, { "auxiliary_loss_clip": 0.01400104, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.23513389, "balance_loss_mlp": 1.01909947, "epoch": 0.6795130016533895, "flos": 25240913815680.0, "grad_norm": 1.6460007763224784, "language_loss": 0.8314954, "learning_rate": 9.839661197207525e-07, "loss": 0.85589105, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20361328, "step": 11302, "time_per_iteration": 2.8881211280822754 }, { "auxiliary_loss_clip": 0.01406566, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.241009, "balance_loss_mlp": 1.0186013, "epoch": 0.6795731249060574, "flos": 18305588004480.0, "grad_norm": 2.0350837504847443, "language_loss": 0.70770973, "learning_rate": 9.83630677305654e-07, "loss": 0.73216254, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.2010498, "step": 11303, "time_per_iteration": 2.8204898834228516 }, { "auxiliary_loss_clip": 0.01429952, "auxiliary_loss_mlp": 0.01038269, "balance_loss_clip": 1.26030445, "balance_loss_mlp": 1.01828992, "epoch": 0.6796332481587254, "flos": 20309350798080.0, "grad_norm": 2.0066774561287226, "language_loss": 0.71171033, "learning_rate": 9.832952734313813e-07, "loss": 0.7363925, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.1998291, "step": 11304, "time_per_iteration": 4.337868928909302 }, { "auxiliary_loss_clip": 0.0142428, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.25822258, "balance_loss_mlp": 1.01594305, "epoch": 0.6796933714113934, "flos": 23597391271680.0, "grad_norm": 2.1984984510686476, "language_loss": 0.7349211, "learning_rate": 9.829599081106536e-07, "loss": 0.75952458, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20129395, "step": 11305, "time_per_iteration": 3.025583267211914 }, { "auxiliary_loss_clip": 0.01422252, "auxiliary_loss_mlp": 0.01037169, "balance_loss_clip": 1.25661349, "balance_loss_mlp": 1.01727271, "epoch": 0.6797534946640613, "flos": 27129808765440.0, "grad_norm": 4.476429731091663, "language_loss": 0.66965985, "learning_rate": 9.826245813561882e-07, "loss": 0.69425404, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19897461, "step": 11306, "time_per_iteration": 2.9169840812683105 }, { "auxiliary_loss_clip": 0.01418554, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.2548852, "balance_loss_mlp": 1.01732218, "epoch": 0.6798136179167293, "flos": 22137608436480.0, "grad_norm": 1.7728283200996082, "language_loss": 0.8078481, "learning_rate": 9.822892931807021e-07, "loss": 0.83240747, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20068359, "step": 11307, "time_per_iteration": 2.8640127182006836 }, { "auxiliary_loss_clip": 0.01411669, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.24837208, "balance_loss_mlp": 1.02114761, "epoch": 0.6798737411693972, "flos": 17496282683520.0, "grad_norm": 1.5135174022101265, "language_loss": 0.89179122, "learning_rate": 9.819540435969066e-07, "loss": 0.91632068, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20117188, "step": 11308, "time_per_iteration": 2.835096597671509 }, { "auxiliary_loss_clip": 0.01418785, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.25387931, "balance_loss_mlp": 1.02062416, "epoch": 0.6799338644220653, "flos": 22902275122560.0, "grad_norm": 1.8940799923383618, "language_loss": 0.72461164, "learning_rate": 9.816188326175154e-07, "loss": 0.74919826, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19250488, "step": 11309, "time_per_iteration": 2.866344928741455 }, { "auxiliary_loss_clip": 0.01413669, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.25029159, "balance_loss_mlp": 1.02519643, "epoch": 0.6799939876747332, "flos": 23189526230400.0, "grad_norm": 1.8218838367208725, "language_loss": 0.85491765, "learning_rate": 9.812836602552411e-07, "loss": 0.87950456, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19824219, "step": 11310, "time_per_iteration": 2.8506929874420166 }, { "auxiliary_loss_clip": 0.01405995, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.24520338, "balance_loss_mlp": 1.02180409, "epoch": 0.6800541109274012, "flos": 19509275416320.0, "grad_norm": 1.990362932452939, "language_loss": 0.8334223, "learning_rate": 9.80948526522792e-07, "loss": 0.85789287, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19262695, "step": 11311, "time_per_iteration": 2.9016757011413574 }, { "auxiliary_loss_clip": 0.01427994, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.25593483, "balance_loss_mlp": 1.02104735, "epoch": 0.6801142341800691, "flos": 22288970851200.0, "grad_norm": 1.5600797293940212, "language_loss": 0.7687189, "learning_rate": 9.806134314328767e-07, "loss": 0.79342639, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 1.72167969, "router_z_loss_mlp": 0.21716309, "step": 11312, "time_per_iteration": 2.879802703857422 }, { "auxiliary_loss_clip": 0.01192916, "auxiliary_loss_mlp": 0.01047497, "balance_loss_clip": 1.10115099, "balance_loss_mlp": 1.02756488, "epoch": 0.6801743574327371, "flos": 68745265464960.0, "grad_norm": 0.6716363526172427, "language_loss": 0.57251465, "learning_rate": 9.802783749982038e-07, "loss": 0.59491879, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.19921875, "step": 11313, "time_per_iteration": 3.5011777877807617 }, { "auxiliary_loss_clip": 0.01417921, "auxiliary_loss_mlp": 0.01034992, "balance_loss_clip": 1.25159705, "balance_loss_mlp": 1.01581168, "epoch": 0.680234480685405, "flos": 29472383756160.0, "grad_norm": 1.8200620466883983, "language_loss": 0.69567293, "learning_rate": 9.799433572314754e-07, "loss": 0.72020209, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19189453, "step": 11314, "time_per_iteration": 2.9185609817504883 }, { "auxiliary_loss_clip": 0.01402121, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.2393415, "balance_loss_mlp": 1.01469517, "epoch": 0.6802946039380731, "flos": 15923351306880.0, "grad_norm": 1.8537638267312848, "language_loss": 0.81952071, "learning_rate": 9.796083781453972e-07, "loss": 0.84387589, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18701172, "step": 11315, "time_per_iteration": 4.370807886123657 }, { "auxiliary_loss_clip": 0.01409394, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.24407387, "balance_loss_mlp": 1.01318073, "epoch": 0.680354727190741, "flos": 22028984375040.0, "grad_norm": 1.5204901192791644, "language_loss": 0.70376772, "learning_rate": 9.792734377526718e-07, "loss": 0.72819364, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20019531, "step": 11316, "time_per_iteration": 4.3096232414245605 }, { "auxiliary_loss_clip": 0.01417803, "auxiliary_loss_mlp": 0.01036457, "balance_loss_clip": 1.25259864, "balance_loss_mlp": 1.01701427, "epoch": 0.680414850443409, "flos": 18450480412800.0, "grad_norm": 1.9107274499295817, "language_loss": 0.67876339, "learning_rate": 9.789385360660003e-07, "loss": 0.70330596, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19445801, "step": 11317, "time_per_iteration": 2.812840700149536 }, { "auxiliary_loss_clip": 0.01423562, "auxiliary_loss_mlp": 0.0103471, "balance_loss_clip": 1.25664115, "balance_loss_mlp": 1.01599371, "epoch": 0.680474973696077, "flos": 26369576069760.0, "grad_norm": 1.5373803792298903, "language_loss": 0.75489831, "learning_rate": 9.78603673098082e-07, "loss": 0.77948105, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18713379, "step": 11318, "time_per_iteration": 2.8647563457489014 }, { "auxiliary_loss_clip": 0.01402003, "auxiliary_loss_mlp": 0.01034891, "balance_loss_clip": 1.24110436, "balance_loss_mlp": 1.01616287, "epoch": 0.6805350969487449, "flos": 18342354044160.0, "grad_norm": 1.6791396269098764, "language_loss": 0.69236344, "learning_rate": 9.782688488616143e-07, "loss": 0.71673238, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18713379, "step": 11319, "time_per_iteration": 4.2774646282196045 }, { "auxiliary_loss_clip": 0.01399259, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.23784161, "balance_loss_mlp": 1.01719356, "epoch": 0.6805952202014129, "flos": 19946893553280.0, "grad_norm": 1.7308414052713077, "language_loss": 0.77625227, "learning_rate": 9.779340633692945e-07, "loss": 0.80061281, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19592285, "step": 11320, "time_per_iteration": 2.880239486694336 }, { "auxiliary_loss_clip": 0.01417559, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.25336313, "balance_loss_mlp": 1.01290202, "epoch": 0.6806553434540809, "flos": 25234127095680.0, "grad_norm": 4.747781951337733, "language_loss": 0.75302511, "learning_rate": 9.77599316633817e-07, "loss": 0.77753437, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20446777, "step": 11321, "time_per_iteration": 2.8584327697753906 }, { "auxiliary_loss_clip": 0.01427436, "auxiliary_loss_mlp": 0.01038421, "balance_loss_clip": 1.2621448, "balance_loss_mlp": 1.01882339, "epoch": 0.6807154667067489, "flos": 17794618767360.0, "grad_norm": 2.962288114201844, "language_loss": 0.73654282, "learning_rate": 9.772646086678758e-07, "loss": 0.76120132, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19616699, "step": 11322, "time_per_iteration": 2.8288662433624268 }, { "auxiliary_loss_clip": 0.01417954, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.25216615, "balance_loss_mlp": 1.01296902, "epoch": 0.6807755899594168, "flos": 22209964151040.0, "grad_norm": 1.7395559834495082, "language_loss": 0.79751027, "learning_rate": 9.769299394841638e-07, "loss": 0.82201672, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19714355, "step": 11323, "time_per_iteration": 2.848283052444458 }, { "auxiliary_loss_clip": 0.01188626, "auxiliary_loss_mlp": 0.0102797, "balance_loss_clip": 1.09888208, "balance_loss_mlp": 1.00679862, "epoch": 0.6808357132120848, "flos": 68658702883200.0, "grad_norm": 0.7501361763494518, "language_loss": 0.57144701, "learning_rate": 9.765953090953714e-07, "loss": 0.59361291, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.21191406, "step": 11324, "time_per_iteration": 3.137296438217163 }, { "auxiliary_loss_clip": 0.01416895, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.25171161, "balance_loss_mlp": 1.01801586, "epoch": 0.6808958364647527, "flos": 23854255856640.0, "grad_norm": 1.7617262681605776, "language_loss": 0.69228351, "learning_rate": 9.76260717514186e-07, "loss": 0.71682811, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19543457, "step": 11325, "time_per_iteration": 2.880560874938965 }, { "auxiliary_loss_clip": 0.01420184, "auxiliary_loss_mlp": 0.01033362, "balance_loss_clip": 1.25124097, "balance_loss_mlp": 1.01261997, "epoch": 0.6809559597174207, "flos": 17720498505600.0, "grad_norm": 2.5098364653162775, "language_loss": 0.72076178, "learning_rate": 9.759261647532974e-07, "loss": 0.74529719, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20739746, "step": 11326, "time_per_iteration": 2.8082377910614014 }, { "auxiliary_loss_clip": 0.01413551, "auxiliary_loss_mlp": 0.01030049, "balance_loss_clip": 1.24955285, "balance_loss_mlp": 1.01159573, "epoch": 0.6810160829700886, "flos": 22502056452480.0, "grad_norm": 1.7403943873680507, "language_loss": 0.7383846, "learning_rate": 9.75591650825392e-07, "loss": 0.7628206, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18469238, "step": 11327, "time_per_iteration": 2.8523879051208496 }, { "auxiliary_loss_clip": 0.0140605, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.24355483, "balance_loss_mlp": 1.01068592, "epoch": 0.6810762062227567, "flos": 16840602017280.0, "grad_norm": 1.8352902294798372, "language_loss": 0.77808857, "learning_rate": 9.752571757431526e-07, "loss": 0.80244648, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1907959, "step": 11328, "time_per_iteration": 2.842034101486206 }, { "auxiliary_loss_clip": 0.01416045, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 1.24939597, "balance_loss_mlp": 1.0115068, "epoch": 0.6811363294754246, "flos": 12722552087040.0, "grad_norm": 2.7640090119118135, "language_loss": 0.65034783, "learning_rate": 9.74922739519265e-07, "loss": 0.67481709, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19384766, "step": 11329, "time_per_iteration": 2.8196916580200195 }, { "auxiliary_loss_clip": 0.01412156, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.24676633, "balance_loss_mlp": 1.01392508, "epoch": 0.6811964527280926, "flos": 17720815219200.0, "grad_norm": 1.8233035201920738, "language_loss": 0.79505861, "learning_rate": 9.745883421664096e-07, "loss": 0.81951767, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19812012, "step": 11330, "time_per_iteration": 2.806715965270996 }, { "auxiliary_loss_clip": 0.0141624, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.25203395, "balance_loss_mlp": 1.01193476, "epoch": 0.6812565759807605, "flos": 24874248804480.0, "grad_norm": 1.7618028653151885, "language_loss": 0.65100932, "learning_rate": 9.742539836972665e-07, "loss": 0.67548668, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19543457, "step": 11331, "time_per_iteration": 2.8617892265319824 }, { "auxiliary_loss_clip": 0.01407402, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.24339044, "balance_loss_mlp": 1.0111357, "epoch": 0.6813166992334285, "flos": 17175070713600.0, "grad_norm": 2.076479242953104, "language_loss": 0.72693253, "learning_rate": 9.739196641245148e-07, "loss": 0.75130963, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19177246, "step": 11332, "time_per_iteration": 2.8538248538970947 }, { "auxiliary_loss_clip": 0.01415031, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.24915206, "balance_loss_mlp": 1.01928163, "epoch": 0.6813768224860965, "flos": 18852825588480.0, "grad_norm": 1.8249742139039362, "language_loss": 0.76205456, "learning_rate": 9.735853834608326e-07, "loss": 0.78659356, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19580078, "step": 11333, "time_per_iteration": 2.832982301712036 }, { "auxiliary_loss_clip": 0.01433956, "auxiliary_loss_mlp": 0.01030748, "balance_loss_clip": 1.26528454, "balance_loss_mlp": 1.0113405, "epoch": 0.6814369457387645, "flos": 24542811509760.0, "grad_norm": 1.4119152260756354, "language_loss": 0.72356653, "learning_rate": 9.732511417188963e-07, "loss": 0.74821359, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19396973, "step": 11334, "time_per_iteration": 2.8665213584899902 }, { "auxiliary_loss_clip": 0.01403669, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.24317086, "balance_loss_mlp": 1.01641035, "epoch": 0.6814970689914325, "flos": 18232010680320.0, "grad_norm": 1.9416177926258882, "language_loss": 0.86827052, "learning_rate": 9.729169389113791e-07, "loss": 0.89267224, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.20092773, "step": 11335, "time_per_iteration": 2.831590414047241 }, { "auxiliary_loss_clip": 0.01393246, "auxiliary_loss_mlp": 0.01030527, "balance_loss_clip": 1.2348001, "balance_loss_mlp": 1.01207328, "epoch": 0.6815571922441004, "flos": 25239782695680.0, "grad_norm": 1.693000867777012, "language_loss": 0.82713437, "learning_rate": 9.725827750509542e-07, "loss": 0.85137206, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18444824, "step": 11336, "time_per_iteration": 2.853888750076294 }, { "auxiliary_loss_clip": 0.01397939, "auxiliary_loss_mlp": 0.01031976, "balance_loss_clip": 1.23819137, "balance_loss_mlp": 1.0129739, "epoch": 0.6816173154967684, "flos": 19463505661440.0, "grad_norm": 1.7927051071948732, "language_loss": 0.82308495, "learning_rate": 9.72248650150294e-07, "loss": 0.84738415, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19006348, "step": 11337, "time_per_iteration": 2.813971519470215 }, { "auxiliary_loss_clip": 0.01405631, "auxiliary_loss_mlp": 0.01034285, "balance_loss_clip": 1.24481392, "balance_loss_mlp": 1.014902, "epoch": 0.6816774387494363, "flos": 17940506561280.0, "grad_norm": 1.667103102443736, "language_loss": 0.73164666, "learning_rate": 9.719145642220673e-07, "loss": 0.75604582, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19372559, "step": 11338, "time_per_iteration": 2.841360330581665 }, { "auxiliary_loss_clip": 0.01407761, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.24468839, "balance_loss_mlp": 1.01623702, "epoch": 0.6817375620021043, "flos": 22242205710720.0, "grad_norm": 1.527636114026428, "language_loss": 0.77953243, "learning_rate": 9.715805172789435e-07, "loss": 0.80397415, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20153809, "step": 11339, "time_per_iteration": 4.257450580596924 }, { "auxiliary_loss_clip": 0.01415712, "auxiliary_loss_mlp": 0.01036821, "balance_loss_clip": 1.25078011, "balance_loss_mlp": 1.01781929, "epoch": 0.6817976852547722, "flos": 25385353776000.0, "grad_norm": 2.009139066165595, "language_loss": 0.71589959, "learning_rate": 9.712465093335901e-07, "loss": 0.74042493, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19006348, "step": 11340, "time_per_iteration": 2.877794027328491 }, { "auxiliary_loss_clip": 0.01419154, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.25154507, "balance_loss_mlp": 1.0154438, "epoch": 0.6818578085074403, "flos": 22273994822400.0, "grad_norm": 2.126685703879322, "language_loss": 0.8457756, "learning_rate": 9.709125403986722e-07, "loss": 0.87032032, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19873047, "step": 11341, "time_per_iteration": 2.8404483795166016 }, { "auxiliary_loss_clip": 0.01413706, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.24951577, "balance_loss_mlp": 1.01848817, "epoch": 0.6819179317601082, "flos": 19327616968320.0, "grad_norm": 1.693491141917834, "language_loss": 0.69249678, "learning_rate": 9.705786104868531e-07, "loss": 0.71701789, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19909668, "step": 11342, "time_per_iteration": 2.842381000518799 }, { "auxiliary_loss_clip": 0.01405223, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.24338627, "balance_loss_mlp": 1.01889133, "epoch": 0.6819780550127762, "flos": 21113588701440.0, "grad_norm": 3.1791654456655682, "language_loss": 0.75823903, "learning_rate": 9.702447196107963e-07, "loss": 0.7826767, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19641113, "step": 11343, "time_per_iteration": 2.8489999771118164 }, { "auxiliary_loss_clip": 0.01426484, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 1.26137269, "balance_loss_mlp": 1.01713848, "epoch": 0.6820381782654441, "flos": 29728524424320.0, "grad_norm": 1.5202200310610858, "language_loss": 0.80320418, "learning_rate": 9.699108677831639e-07, "loss": 0.82784379, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20336914, "step": 11344, "time_per_iteration": 2.9074172973632812 }, { "auxiliary_loss_clip": 0.01411322, "auxiliary_loss_mlp": 0.01042052, "balance_loss_clip": 1.24622333, "balance_loss_mlp": 1.02244198, "epoch": 0.6820983015181121, "flos": 29254140247680.0, "grad_norm": 1.7696620538954222, "language_loss": 0.67171717, "learning_rate": 9.695770550166136e-07, "loss": 0.69625092, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19616699, "step": 11345, "time_per_iteration": 2.929905891418457 }, { "auxiliary_loss_clip": 0.01424098, "auxiliary_loss_mlp": 0.01041462, "balance_loss_clip": 1.25756848, "balance_loss_mlp": 1.02024317, "epoch": 0.6821584247707801, "flos": 18878913855360.0, "grad_norm": 2.687360309479774, "language_loss": 0.66486585, "learning_rate": 9.692432813238054e-07, "loss": 0.68952149, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.21228027, "step": 11346, "time_per_iteration": 2.843095064163208 }, { "auxiliary_loss_clip": 0.01431491, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.26354384, "balance_loss_mlp": 1.01486158, "epoch": 0.6822185480234481, "flos": 21334501653120.0, "grad_norm": 2.122754981547926, "language_loss": 0.78987813, "learning_rate": 9.689095467173952e-07, "loss": 0.81453741, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19592285, "step": 11347, "time_per_iteration": 2.830878734588623 }, { "auxiliary_loss_clip": 0.01192343, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.10237622, "balance_loss_mlp": 1.03103733, "epoch": 0.6822786712761161, "flos": 63515049649920.0, "grad_norm": 0.7354952465953686, "language_loss": 0.52479404, "learning_rate": 9.685758512100378e-07, "loss": 0.54726052, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.23242188, "step": 11348, "time_per_iteration": 3.420215368270874 }, { "auxiliary_loss_clip": 0.01407983, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.24671841, "balance_loss_mlp": 1.0170027, "epoch": 0.682338794528784, "flos": 21078722943360.0, "grad_norm": 1.6981471416983351, "language_loss": 0.80386746, "learning_rate": 9.682421948143873e-07, "loss": 0.82831007, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19250488, "step": 11349, "time_per_iteration": 2.898846387863159 }, { "auxiliary_loss_clip": 0.01449503, "auxiliary_loss_mlp": 0.01037626, "balance_loss_clip": 1.27485037, "balance_loss_mlp": 1.01503563, "epoch": 0.682398917781452, "flos": 36296913755520.0, "grad_norm": 1.6327830586349878, "language_loss": 0.7441256, "learning_rate": 9.67908577543096e-07, "loss": 0.76899695, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 1.74707031, "router_z_loss_mlp": 0.22583008, "step": 11350, "time_per_iteration": 2.959620952606201 }, { "auxiliary_loss_clip": 0.01412246, "auxiliary_loss_mlp": 0.01039772, "balance_loss_clip": 1.24895024, "balance_loss_mlp": 1.01981688, "epoch": 0.6824590410341199, "flos": 24868864673280.0, "grad_norm": 1.51180371437338, "language_loss": 0.79801989, "learning_rate": 9.675749994088161e-07, "loss": 0.82254004, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19946289, "step": 11351, "time_per_iteration": 4.35706901550293 }, { "auxiliary_loss_clip": 0.01402412, "auxiliary_loss_mlp": 0.01036246, "balance_loss_clip": 1.24095845, "balance_loss_mlp": 1.01689816, "epoch": 0.6825191642867879, "flos": 22461761318400.0, "grad_norm": 1.6445584036160426, "language_loss": 0.74340034, "learning_rate": 9.672414604241954e-07, "loss": 0.76778698, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19335938, "step": 11352, "time_per_iteration": 2.864840030670166 }, { "auxiliary_loss_clip": 0.01426894, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.26002502, "balance_loss_mlp": 1.01725841, "epoch": 0.6825792875394558, "flos": 29436386878080.0, "grad_norm": 1.4722488044985225, "language_loss": 0.80755448, "learning_rate": 9.669079606018814e-07, "loss": 0.83220422, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20812988, "step": 11353, "time_per_iteration": 2.909999132156372 }, { "auxiliary_loss_clip": 0.01411443, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.247159, "balance_loss_mlp": 1.01591277, "epoch": 0.6826394107921239, "flos": 18780334139520.0, "grad_norm": 1.7851784987753812, "language_loss": 0.79011291, "learning_rate": 9.665744999545218e-07, "loss": 0.8145895, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20300293, "step": 11354, "time_per_iteration": 4.3294806480407715 }, { "auxiliary_loss_clip": 0.01411261, "auxiliary_loss_mlp": 0.01034424, "balance_loss_clip": 1.24900389, "balance_loss_mlp": 1.01484954, "epoch": 0.6826995340447918, "flos": 16626204316800.0, "grad_norm": 1.9388504804637146, "language_loss": 0.62242436, "learning_rate": 9.662410784947599e-07, "loss": 0.64688128, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19567871, "step": 11355, "time_per_iteration": 2.8175766468048096 }, { "auxiliary_loss_clip": 0.01419734, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.25528455, "balance_loss_mlp": 1.01732683, "epoch": 0.6827596572974598, "flos": 20857583767680.0, "grad_norm": 1.7827505902360983, "language_loss": 0.83182585, "learning_rate": 9.659076962352398e-07, "loss": 0.85638875, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19213867, "step": 11356, "time_per_iteration": 2.8358891010284424 }, { "auxiliary_loss_clip": 0.01435578, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.26781249, "balance_loss_mlp": 1.01500988, "epoch": 0.6828197805501277, "flos": 22758513834240.0, "grad_norm": 1.6493584496785942, "language_loss": 0.79033369, "learning_rate": 9.655743531886052e-07, "loss": 0.81504941, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20983887, "step": 11357, "time_per_iteration": 2.840134620666504 }, { "auxiliary_loss_clip": 0.0119034, "auxiliary_loss_mlp": 0.01018051, "balance_loss_clip": 1.09993482, "balance_loss_mlp": 0.99869156, "epoch": 0.6828799038027957, "flos": 71681119263360.0, "grad_norm": 0.837571503406661, "language_loss": 0.59637719, "learning_rate": 9.65241049367493e-07, "loss": 0.61846113, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.19335938, "step": 11358, "time_per_iteration": 3.3921425342559814 }, { "auxiliary_loss_clip": 0.01431307, "auxiliary_loss_mlp": 0.01039424, "balance_loss_clip": 1.26227689, "balance_loss_mlp": 1.01869345, "epoch": 0.6829400270554637, "flos": 19838812429440.0, "grad_norm": 1.9706328565152895, "language_loss": 0.80131733, "learning_rate": 9.64907784784544e-07, "loss": 0.82602465, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20739746, "step": 11359, "time_per_iteration": 2.8380579948425293 }, { "auxiliary_loss_clip": 0.01410689, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.24510074, "balance_loss_mlp": 1.0162493, "epoch": 0.6830001503081317, "flos": 21990544277760.0, "grad_norm": 1.829718318151842, "language_loss": 0.81931925, "learning_rate": 9.645745594523958e-07, "loss": 0.84378594, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.1973877, "step": 11360, "time_per_iteration": 2.867671012878418 }, { "auxiliary_loss_clip": 0.01418965, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.25239229, "balance_loss_mlp": 1.01788783, "epoch": 0.6830602735607997, "flos": 24327554158080.0, "grad_norm": 2.0633055028659455, "language_loss": 0.75336808, "learning_rate": 9.642413733836844e-07, "loss": 0.77793562, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19909668, "step": 11361, "time_per_iteration": 2.8632469177246094 }, { "auxiliary_loss_clip": 0.01194176, "auxiliary_loss_mlp": 0.01015649, "balance_loss_clip": 1.10301065, "balance_loss_mlp": 0.99466801, "epoch": 0.6831203968134676, "flos": 57716077708800.0, "grad_norm": 0.8737415376383233, "language_loss": 0.5978626, "learning_rate": 9.639082265910437e-07, "loss": 0.61996078, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.20996094, "step": 11362, "time_per_iteration": 3.394331932067871 }, { "auxiliary_loss_clip": 0.01421572, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.25545752, "balance_loss_mlp": 1.01105285, "epoch": 0.6831805200661356, "flos": 14395556257920.0, "grad_norm": 2.034658829975799, "language_loss": 0.76486272, "learning_rate": 9.635751190871074e-07, "loss": 0.78939492, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20605469, "step": 11363, "time_per_iteration": 2.8349788188934326 }, { "auxiliary_loss_clip": 0.01420069, "auxiliary_loss_mlp": 0.01038688, "balance_loss_clip": 1.25722647, "balance_loss_mlp": 1.01770711, "epoch": 0.6832406433188035, "flos": 22830552835200.0, "grad_norm": 2.158969431532805, "language_loss": 0.89530939, "learning_rate": 9.632420508845063e-07, "loss": 0.91989696, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.2097168, "step": 11364, "time_per_iteration": 2.8377461433410645 }, { "auxiliary_loss_clip": 0.01416923, "auxiliary_loss_mlp": 0.01031007, "balance_loss_clip": 1.25262904, "balance_loss_mlp": 1.01173115, "epoch": 0.6833007665714715, "flos": 17569407559680.0, "grad_norm": 2.0146292247373316, "language_loss": 0.88783824, "learning_rate": 9.629090219958697e-07, "loss": 0.91231751, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19299316, "step": 11365, "time_per_iteration": 2.856091260910034 }, { "auxiliary_loss_clip": 0.01424807, "auxiliary_loss_mlp": 0.01037834, "balance_loss_clip": 1.25606549, "balance_loss_mlp": 1.01817596, "epoch": 0.6833608898241395, "flos": 22453617254400.0, "grad_norm": 2.214664570192883, "language_loss": 0.82291067, "learning_rate": 9.625760324338272e-07, "loss": 0.8475371, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 1.68847656, "router_z_loss_mlp": 0.19665527, "step": 11366, "time_per_iteration": 2.803571939468384 }, { "auxiliary_loss_clip": 0.01420101, "auxiliary_loss_mlp": 0.01034053, "balance_loss_clip": 1.25383639, "balance_loss_mlp": 1.01409745, "epoch": 0.6834210130768075, "flos": 24545209484160.0, "grad_norm": 1.5523599795780185, "language_loss": 0.77474838, "learning_rate": 9.622430822110062e-07, "loss": 0.79928994, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19946289, "step": 11367, "time_per_iteration": 2.8568115234375 }, { "auxiliary_loss_clip": 0.01414585, "auxiliary_loss_mlp": 0.0104091, "balance_loss_clip": 1.2486347, "balance_loss_mlp": 1.01994085, "epoch": 0.6834811363294754, "flos": 20056603489920.0, "grad_norm": 2.347942801304673, "language_loss": 0.70204437, "learning_rate": 9.619101713400312e-07, "loss": 0.72659928, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20996094, "step": 11368, "time_per_iteration": 2.85105037689209 }, { "auxiliary_loss_clip": 0.01409319, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.2453804, "balance_loss_mlp": 1.01153696, "epoch": 0.6835412595821434, "flos": 24801576376320.0, "grad_norm": 1.6990733400158802, "language_loss": 0.74046516, "learning_rate": 9.615772998335261e-07, "loss": 0.76487875, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20495605, "step": 11369, "time_per_iteration": 2.8680148124694824 }, { "auxiliary_loss_clip": 0.01424637, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.25873351, "balance_loss_mlp": 1.01425743, "epoch": 0.6836013828348113, "flos": 19509818353920.0, "grad_norm": 2.0127459016453084, "language_loss": 0.80092072, "learning_rate": 9.612444677041138e-07, "loss": 0.82549679, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.18713379, "step": 11370, "time_per_iteration": 2.823411703109741 }, { "auxiliary_loss_clip": 0.01196237, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.10512924, "balance_loss_mlp": 1.01280117, "epoch": 0.6836615060874793, "flos": 58394652243840.0, "grad_norm": 0.7502959758835724, "language_loss": 0.59776866, "learning_rate": 9.609116749644162e-07, "loss": 0.62008226, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.22363281, "step": 11371, "time_per_iteration": 3.2548840045928955 }, { "auxiliary_loss_clip": 0.01401145, "auxiliary_loss_mlp": 0.010332, "balance_loss_clip": 1.24071646, "balance_loss_mlp": 1.01326835, "epoch": 0.6837216293401474, "flos": 12174500096640.0, "grad_norm": 1.5231223832905636, "language_loss": 0.64295727, "learning_rate": 9.605789216270511e-07, "loss": 0.6673007, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19909668, "step": 11372, "time_per_iteration": 2.8188834190368652 }, { "auxiliary_loss_clip": 0.01421086, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.25661433, "balance_loss_mlp": 1.01505971, "epoch": 0.6837817525928153, "flos": 22137789415680.0, "grad_norm": 1.4723562932300103, "language_loss": 0.72156835, "learning_rate": 9.602462077046375e-07, "loss": 0.74613166, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.2019043, "step": 11373, "time_per_iteration": 2.9459218978881836 }, { "auxiliary_loss_clip": 0.01193737, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.10475469, "balance_loss_mlp": 1.01738191, "epoch": 0.6838418758454833, "flos": 65038700160000.0, "grad_norm": 1.3176128894358552, "language_loss": 0.56629759, "learning_rate": 9.599135332097935e-07, "loss": 0.58862144, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.21289062, "step": 11374, "time_per_iteration": 4.770488739013672 }, { "auxiliary_loss_clip": 0.01429828, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.26313651, "balance_loss_mlp": 1.0129174, "epoch": 0.6839019990981512, "flos": 21040192356480.0, "grad_norm": 1.651389627724949, "language_loss": 0.74670857, "learning_rate": 9.595808981551312e-07, "loss": 0.77133584, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.1998291, "step": 11375, "time_per_iteration": 2.8765532970428467 }, { "auxiliary_loss_clip": 0.01413373, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.24968266, "balance_loss_mlp": 1.01619256, "epoch": 0.6839621223508192, "flos": 24946106826240.0, "grad_norm": 1.7605529255279193, "language_loss": 0.71050501, "learning_rate": 9.592483025532651e-07, "loss": 0.73499596, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19506836, "step": 11376, "time_per_iteration": 2.8973183631896973 }, { "auxiliary_loss_clip": 0.01430295, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.26129889, "balance_loss_mlp": 1.01544333, "epoch": 0.6840222456034871, "flos": 26369847538560.0, "grad_norm": 3.156834145757527, "language_loss": 0.7471242, "learning_rate": 9.58915746416808e-07, "loss": 0.77178669, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.2052002, "step": 11377, "time_per_iteration": 2.8828487396240234 }, { "auxiliary_loss_clip": 0.01192218, "auxiliary_loss_mlp": 0.01017067, "balance_loss_clip": 1.09976661, "balance_loss_mlp": 0.99570447, "epoch": 0.6840823688561551, "flos": 66020660213760.0, "grad_norm": 0.7173425014327345, "language_loss": 0.56907749, "learning_rate": 9.585832297583707e-07, "loss": 0.59117031, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.21386719, "step": 11378, "time_per_iteration": 3.3827879428863525 }, { "auxiliary_loss_clip": 0.01423493, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.25672734, "balance_loss_mlp": 1.01622057, "epoch": 0.684142492108823, "flos": 21407536039680.0, "grad_norm": 1.850259942164225, "language_loss": 0.79478884, "learning_rate": 9.58250752590561e-07, "loss": 0.81939036, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20422363, "step": 11379, "time_per_iteration": 2.829346179962158 }, { "auxiliary_loss_clip": 0.01398697, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.24173927, "balance_loss_mlp": 1.01382828, "epoch": 0.6842026153614911, "flos": 18809227584000.0, "grad_norm": 1.6704715384123423, "language_loss": 0.69692218, "learning_rate": 9.57918314925988e-07, "loss": 0.72122717, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.17980957, "step": 11380, "time_per_iteration": 2.8209009170532227 }, { "auxiliary_loss_clip": 0.01417444, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.25380528, "balance_loss_mlp": 1.01426578, "epoch": 0.684262738614159, "flos": 19655932371840.0, "grad_norm": 1.8694414179270298, "language_loss": 0.78388149, "learning_rate": 9.575859167772568e-07, "loss": 0.80840421, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20556641, "step": 11381, "time_per_iteration": 2.823035955429077 }, { "auxiliary_loss_clip": 0.011927, "auxiliary_loss_mlp": 0.01020071, "balance_loss_clip": 1.10436273, "balance_loss_mlp": 1.00309551, "epoch": 0.684322861866827, "flos": 62380188858240.0, "grad_norm": 0.8724389944568539, "language_loss": 0.67242014, "learning_rate": 9.572535581569713e-07, "loss": 0.69454789, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16992188, "step": 11382, "time_per_iteration": 3.1425552368164062 }, { "auxiliary_loss_clip": 0.01193617, "auxiliary_loss_mlp": 0.01026903, "balance_loss_clip": 1.10307646, "balance_loss_mlp": 1.006971, "epoch": 0.6843829851194949, "flos": 65837101484160.0, "grad_norm": 0.8247509223965037, "language_loss": 0.58169037, "learning_rate": 9.569212390777356e-07, "loss": 0.6038956, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.19921875, "step": 11383, "time_per_iteration": 3.3082969188690186 }, { "auxiliary_loss_clip": 0.01402596, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.24087059, "balance_loss_mlp": 1.01416576, "epoch": 0.6844431083721629, "flos": 27866667882240.0, "grad_norm": 2.1599723540735907, "language_loss": 0.80578417, "learning_rate": 9.565889595521517e-07, "loss": 0.83014488, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19287109, "step": 11384, "time_per_iteration": 2.9528353214263916 }, { "auxiliary_loss_clip": 0.01427916, "auxiliary_loss_mlp": 0.01041169, "balance_loss_clip": 1.26023901, "balance_loss_mlp": 1.02140439, "epoch": 0.684503231624831, "flos": 18262894896000.0, "grad_norm": 4.634426974526997, "language_loss": 0.78065002, "learning_rate": 9.562567195928187e-07, "loss": 0.80534089, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19775391, "step": 11385, "time_per_iteration": 4.291656970977783 }, { "auxiliary_loss_clip": 0.01438674, "auxiliary_loss_mlp": 0.0103753, "balance_loss_clip": 1.26697147, "balance_loss_mlp": 1.01565492, "epoch": 0.6845633548774989, "flos": 17648685728640.0, "grad_norm": 1.924418198835301, "language_loss": 0.85293174, "learning_rate": 9.55924519212335e-07, "loss": 0.87769377, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 1.71777344, "router_z_loss_mlp": 0.21862793, "step": 11386, "time_per_iteration": 4.2574169635772705 }, { "auxiliary_loss_clip": 0.01429882, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.26523602, "balance_loss_mlp": 1.01704454, "epoch": 0.6846234781301669, "flos": 20815750310400.0, "grad_norm": 1.9940439389694726, "language_loss": 0.82987022, "learning_rate": 9.555923584232984e-07, "loss": 0.85453129, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19189453, "step": 11387, "time_per_iteration": 2.860929012298584 }, { "auxiliary_loss_clip": 0.01416958, "auxiliary_loss_mlp": 0.01028698, "balance_loss_clip": 1.25397682, "balance_loss_mlp": 1.00980365, "epoch": 0.6846836013828348, "flos": 36114893349120.0, "grad_norm": 1.7122298095762598, "language_loss": 0.72651255, "learning_rate": 9.552602372383047e-07, "loss": 0.75096911, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18884277, "step": 11388, "time_per_iteration": 2.9708025455474854 }, { "auxiliary_loss_clip": 0.01409428, "auxiliary_loss_mlp": 0.0103089, "balance_loss_clip": 1.24758518, "balance_loss_mlp": 1.01181602, "epoch": 0.6847437246355028, "flos": 43157893080960.0, "grad_norm": 1.8848650139314924, "language_loss": 0.63578302, "learning_rate": 9.549281556699469e-07, "loss": 0.66018617, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1907959, "step": 11389, "time_per_iteration": 4.4353296756744385 }, { "auxiliary_loss_clip": 0.0119265, "auxiliary_loss_mlp": 0.01031226, "balance_loss_clip": 1.1008637, "balance_loss_mlp": 1.00910115, "epoch": 0.6848038478881707, "flos": 71693272131840.0, "grad_norm": 0.762036034021924, "language_loss": 0.56042606, "learning_rate": 9.54596113730818e-07, "loss": 0.58266485, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.22167969, "step": 11390, "time_per_iteration": 3.504220724105835 }, { "auxiliary_loss_clip": 0.01421072, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.25708961, "balance_loss_mlp": 1.01343465, "epoch": 0.6848639711408387, "flos": 19947300756480.0, "grad_norm": 2.374242102342309, "language_loss": 0.88700098, "learning_rate": 9.542641114335109e-07, "loss": 0.91154778, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20166016, "step": 11391, "time_per_iteration": 2.8390703201293945 }, { "auxiliary_loss_clip": 0.01427064, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.26075053, "balance_loss_mlp": 1.01444995, "epoch": 0.6849240943935067, "flos": 26878102087680.0, "grad_norm": 1.5077263447067288, "language_loss": 0.79306757, "learning_rate": 9.539321487906117e-07, "loss": 0.81767714, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19445801, "step": 11392, "time_per_iteration": 2.880298614501953 }, { "auxiliary_loss_clip": 0.01403029, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.24284279, "balance_loss_mlp": 1.01254869, "epoch": 0.6849842176461747, "flos": 13743268951680.0, "grad_norm": 2.2869530563890574, "language_loss": 0.71639073, "learning_rate": 9.536002258147104e-07, "loss": 0.74073923, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19274902, "step": 11393, "time_per_iteration": 2.8313510417938232 }, { "auxiliary_loss_clip": 0.01433394, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.26529694, "balance_loss_mlp": 1.01653957, "epoch": 0.6850443408988426, "flos": 24983913496320.0, "grad_norm": 1.5798956781833566, "language_loss": 0.65180409, "learning_rate": 9.532683425183936e-07, "loss": 0.67651004, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20666504, "step": 11394, "time_per_iteration": 2.8793210983276367 }, { "auxiliary_loss_clip": 0.01416699, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.25227857, "balance_loss_mlp": 1.01297438, "epoch": 0.6851044641515106, "flos": 27755329132800.0, "grad_norm": 1.582633987785243, "language_loss": 0.81388593, "learning_rate": 9.529364989142468e-07, "loss": 0.83838367, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.2010498, "step": 11395, "time_per_iteration": 2.9071643352508545 }, { "auxiliary_loss_clip": 0.01409358, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.24584436, "balance_loss_mlp": 1.01730263, "epoch": 0.6851645874041785, "flos": 24361288796160.0, "grad_norm": 1.6855074482080372, "language_loss": 0.73543203, "learning_rate": 9.526046950148527e-07, "loss": 0.75989532, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19677734, "step": 11396, "time_per_iteration": 3.0137746334075928 }, { "auxiliary_loss_clip": 0.01428098, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.25969243, "balance_loss_mlp": 1.01427484, "epoch": 0.6852247106568465, "flos": 15084157155840.0, "grad_norm": 3.4375290130100624, "language_loss": 0.79814321, "learning_rate": 9.522729308327931e-07, "loss": 0.8227669, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19995117, "step": 11397, "time_per_iteration": 2.7934505939483643 }, { "auxiliary_loss_clip": 0.01413534, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.24882579, "balance_loss_mlp": 1.01529622, "epoch": 0.6852848339095146, "flos": 18779112529920.0, "grad_norm": 2.3377544509179398, "language_loss": 0.72068691, "learning_rate": 9.519412063806493e-07, "loss": 0.74517238, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19714355, "step": 11398, "time_per_iteration": 2.809738874435425 }, { "auxiliary_loss_clip": 0.01403804, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.24252272, "balance_loss_mlp": 1.01470423, "epoch": 0.6853449571621825, "flos": 27865672496640.0, "grad_norm": 1.6707693985181886, "language_loss": 0.70993274, "learning_rate": 9.516095216709996e-07, "loss": 0.73431301, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19506836, "step": 11399, "time_per_iteration": 2.8674776554107666 }, { "auxiliary_loss_clip": 0.01421791, "auxiliary_loss_mlp": 0.01035721, "balance_loss_clip": 1.2567637, "balance_loss_mlp": 1.01433468, "epoch": 0.6854050804148505, "flos": 18159654965760.0, "grad_norm": 1.8885796057061492, "language_loss": 0.70736367, "learning_rate": 9.512778767164217e-07, "loss": 0.73193878, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.21386719, "step": 11400, "time_per_iteration": 2.818117141723633 }, { "auxiliary_loss_clip": 0.01449566, "auxiliary_loss_mlp": 0.01040821, "balance_loss_clip": 1.27371597, "balance_loss_mlp": 1.01910067, "epoch": 0.6854652036675184, "flos": 16334881176960.0, "grad_norm": 1.8558560047374948, "language_loss": 0.79465592, "learning_rate": 9.509462715294927e-07, "loss": 0.81955981, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 1.7578125, "router_z_loss_mlp": 0.21716309, "step": 11401, "time_per_iteration": 2.8080966472625732 }, { "auxiliary_loss_clip": 0.01399403, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.23841286, "balance_loss_mlp": 1.01264215, "epoch": 0.6855253269201864, "flos": 14949399582720.0, "grad_norm": 2.309034715925553, "language_loss": 0.76203853, "learning_rate": 9.50614706122786e-07, "loss": 0.78635716, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19824219, "step": 11402, "time_per_iteration": 2.8289482593536377 }, { "auxiliary_loss_clip": 0.01432232, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.26373744, "balance_loss_mlp": 1.01600683, "epoch": 0.6855854501728543, "flos": 23047665223680.0, "grad_norm": 1.9942861748283576, "language_loss": 0.73448181, "learning_rate": 9.502831805088742e-07, "loss": 0.75915611, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19189453, "step": 11403, "time_per_iteration": 2.8671300411224365 }, { "auxiliary_loss_clip": 0.01416536, "auxiliary_loss_mlp": 0.01034231, "balance_loss_clip": 1.25402367, "balance_loss_mlp": 1.01484764, "epoch": 0.6856455734255223, "flos": 13259473856640.0, "grad_norm": 1.9455043368014764, "language_loss": 0.81970185, "learning_rate": 9.499516947003294e-07, "loss": 0.84420949, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19384766, "step": 11404, "time_per_iteration": 2.8450887203216553 }, { "auxiliary_loss_clip": 0.01413536, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.2511766, "balance_loss_mlp": 1.01419425, "epoch": 0.6857056966781903, "flos": 23344462984320.0, "grad_norm": 1.9813554131380395, "language_loss": 0.78153074, "learning_rate": 9.496202487097222e-07, "loss": 0.80601418, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20605469, "step": 11405, "time_per_iteration": 2.8757519721984863 }, { "auxiliary_loss_clip": 0.01194498, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.10332489, "balance_loss_mlp": 1.01388347, "epoch": 0.6857658199308583, "flos": 61880576065920.0, "grad_norm": 0.8049479921220468, "language_loss": 0.61093795, "learning_rate": 9.492888425496199e-07, "loss": 0.63319719, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.17578125, "step": 11406, "time_per_iteration": 3.3820865154266357 }, { "auxiliary_loss_clip": 0.01419469, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.2540884, "balance_loss_mlp": 1.01622081, "epoch": 0.6858259431835262, "flos": 16663287070080.0, "grad_norm": 1.8468916723564053, "language_loss": 0.77284855, "learning_rate": 9.489574762325907e-07, "loss": 0.79740965, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.2043457, "step": 11407, "time_per_iteration": 2.8436667919158936 }, { "auxiliary_loss_clip": 0.0141712, "auxiliary_loss_mlp": 0.01039965, "balance_loss_clip": 1.25104141, "balance_loss_mlp": 1.01897264, "epoch": 0.6858860664361942, "flos": 21883322805120.0, "grad_norm": 2.2803989841797208, "language_loss": 0.71508265, "learning_rate": 9.486261497711991e-07, "loss": 0.73965353, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.21008301, "step": 11408, "time_per_iteration": 2.8680248260498047 }, { "auxiliary_loss_clip": 0.01431874, "auxiliary_loss_mlp": 0.0103232, "balance_loss_clip": 1.26326621, "balance_loss_mlp": 1.01255476, "epoch": 0.6859461896888621, "flos": 15275905194240.0, "grad_norm": 1.7861817460428806, "language_loss": 0.7110045, "learning_rate": 9.482948631780087e-07, "loss": 0.73564649, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19763184, "step": 11409, "time_per_iteration": 4.254801034927368 }, { "auxiliary_loss_clip": 0.01399181, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.24082446, "balance_loss_mlp": 1.01690459, "epoch": 0.6860063129415301, "flos": 18628383542400.0, "grad_norm": 1.5752649018181286, "language_loss": 0.78323799, "learning_rate": 9.479636164655825e-07, "loss": 0.80759323, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19458008, "step": 11410, "time_per_iteration": 2.826809883117676 }, { "auxiliary_loss_clip": 0.01429942, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.26119101, "balance_loss_mlp": 1.01781166, "epoch": 0.6860664361941982, "flos": 23961929777280.0, "grad_norm": 1.7047333789951864, "language_loss": 0.71906257, "learning_rate": 9.476324096464821e-07, "loss": 0.74375594, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.21582031, "step": 11411, "time_per_iteration": 2.879160165786743 }, { "auxiliary_loss_clip": 0.01414156, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.24912667, "balance_loss_mlp": 1.0133481, "epoch": 0.6861265594468661, "flos": 20416255557120.0, "grad_norm": 4.606155308293427, "language_loss": 0.71147579, "learning_rate": 9.473012427332654e-07, "loss": 0.73595583, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20495605, "step": 11412, "time_per_iteration": 2.827428102493286 }, { "auxiliary_loss_clip": 0.01413005, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.24969316, "balance_loss_mlp": 1.01362967, "epoch": 0.6861866826995341, "flos": 11433749927040.0, "grad_norm": 2.664258490837861, "language_loss": 0.72852969, "learning_rate": 9.469701157384919e-07, "loss": 0.75299084, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19482422, "step": 11413, "time_per_iteration": 2.7663071155548096 }, { "auxiliary_loss_clip": 0.01414454, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.2504977, "balance_loss_mlp": 1.01456308, "epoch": 0.686246805952202, "flos": 16006113325440.0, "grad_norm": 1.777551643976596, "language_loss": 0.74144149, "learning_rate": 9.466390286747164e-07, "loss": 0.7659263, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19470215, "step": 11414, "time_per_iteration": 2.8261961936950684 }, { "auxiliary_loss_clip": 0.01421569, "auxiliary_loss_mlp": 0.01033779, "balance_loss_clip": 1.25663614, "balance_loss_mlp": 1.01396632, "epoch": 0.68630692920487, "flos": 19835735783040.0, "grad_norm": 2.1798679656259847, "language_loss": 0.87367862, "learning_rate": 9.46307981554495e-07, "loss": 0.8982321, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19799805, "step": 11415, "time_per_iteration": 2.7976410388946533 }, { "auxiliary_loss_clip": 0.01420121, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.25367224, "balance_loss_mlp": 1.01716638, "epoch": 0.6863670524575379, "flos": 26297310844800.0, "grad_norm": 2.9473699952047014, "language_loss": 0.67895639, "learning_rate": 9.459769743903801e-07, "loss": 0.70352614, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19689941, "step": 11416, "time_per_iteration": 2.8731725215911865 }, { "auxiliary_loss_clip": 0.01399716, "auxiliary_loss_mlp": 0.01035065, "balance_loss_clip": 1.23839867, "balance_loss_mlp": 1.01543164, "epoch": 0.686427175710206, "flos": 19182815049600.0, "grad_norm": 1.2901647788251638, "language_loss": 0.76774132, "learning_rate": 9.456460071949237e-07, "loss": 0.7920891, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19628906, "step": 11417, "time_per_iteration": 2.835952043533325 }, { "auxiliary_loss_clip": 0.01422048, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.25745201, "balance_loss_mlp": 1.01338172, "epoch": 0.6864872989628739, "flos": 18925950464640.0, "grad_norm": 1.7845369982584198, "language_loss": 0.78678626, "learning_rate": 9.45315079980678e-07, "loss": 0.81133533, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19494629, "step": 11418, "time_per_iteration": 2.805837631225586 }, { "auxiliary_loss_clip": 0.01421798, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.25579739, "balance_loss_mlp": 1.01045418, "epoch": 0.6865474222155419, "flos": 25966778446080.0, "grad_norm": 3.327932352547946, "language_loss": 0.77192676, "learning_rate": 9.449841927601887e-07, "loss": 0.7964319, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18249512, "step": 11419, "time_per_iteration": 2.891226291656494 }, { "auxiliary_loss_clip": 0.01425333, "auxiliary_loss_mlp": 0.0103738, "balance_loss_clip": 1.26094007, "balance_loss_mlp": 1.01784205, "epoch": 0.6866075454682098, "flos": 18487110718080.0, "grad_norm": 1.715498495558561, "language_loss": 0.71800172, "learning_rate": 9.446533455460044e-07, "loss": 0.74262887, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1953125, "step": 11420, "time_per_iteration": 2.7929141521453857 }, { "auxiliary_loss_clip": 0.01416785, "auxiliary_loss_mlp": 0.01030789, "balance_loss_clip": 1.25243533, "balance_loss_mlp": 1.01159596, "epoch": 0.6866676687208778, "flos": 34253127296640.0, "grad_norm": 1.5388659707580634, "language_loss": 0.75209141, "learning_rate": 9.443225383506712e-07, "loss": 0.7765671, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19189453, "step": 11421, "time_per_iteration": 5.7138237953186035 }, { "auxiliary_loss_clip": 0.0140632, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.24578762, "balance_loss_mlp": 1.01420057, "epoch": 0.6867277919735457, "flos": 21730693536000.0, "grad_norm": 1.9214824542861868, "language_loss": 0.78082597, "learning_rate": 9.439917711867338e-07, "loss": 0.80523092, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1998291, "step": 11422, "time_per_iteration": 2.8612911701202393 }, { "auxiliary_loss_clip": 0.01406479, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.24356675, "balance_loss_mlp": 1.01479459, "epoch": 0.6867879152262137, "flos": 24108767712000.0, "grad_norm": 1.9996874949663241, "language_loss": 0.78082955, "learning_rate": 9.436610440667334e-07, "loss": 0.80524492, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20263672, "step": 11423, "time_per_iteration": 2.871936798095703 }, { "auxiliary_loss_clip": 0.01430057, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.26348114, "balance_loss_mlp": 1.01683831, "epoch": 0.6868480384788818, "flos": 21625643813760.0, "grad_norm": 1.4340012243197235, "language_loss": 0.73613453, "learning_rate": 9.433303570032129e-07, "loss": 0.76080543, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.20202637, "step": 11424, "time_per_iteration": 4.302043199539185 }, { "auxiliary_loss_clip": 0.01414965, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.24990249, "balance_loss_mlp": 1.01114917, "epoch": 0.6869081617315497, "flos": 26297220355200.0, "grad_norm": 2.3496310112638397, "language_loss": 0.6574313, "learning_rate": 9.429997100087112e-07, "loss": 0.68188715, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19458008, "step": 11425, "time_per_iteration": 2.8721606731414795 }, { "auxiliary_loss_clip": 0.01403667, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.24218416, "balance_loss_mlp": 1.01477838, "epoch": 0.6869682849842177, "flos": 21114810311040.0, "grad_norm": 1.4362502166581714, "language_loss": 0.72112399, "learning_rate": 9.426691030957657e-07, "loss": 0.74549872, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19030762, "step": 11426, "time_per_iteration": 2.848940849304199 }, { "auxiliary_loss_clip": 0.01422968, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.25669432, "balance_loss_mlp": 1.0136385, "epoch": 0.6870284082368856, "flos": 17101583879040.0, "grad_norm": 2.6090214809142624, "language_loss": 0.86032474, "learning_rate": 9.423385362769136e-07, "loss": 0.88489014, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19934082, "step": 11427, "time_per_iteration": 2.8583896160125732 }, { "auxiliary_loss_clip": 0.01414956, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.25237012, "balance_loss_mlp": 1.01413655, "epoch": 0.6870885314895536, "flos": 27319520787840.0, "grad_norm": 1.4513710202089727, "language_loss": 0.76885122, "learning_rate": 9.420080095646909e-07, "loss": 0.7933504, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20825195, "step": 11428, "time_per_iteration": 2.90556001663208 }, { "auxiliary_loss_clip": 0.01439396, "auxiliary_loss_mlp": 0.01038684, "balance_loss_clip": 1.27045488, "balance_loss_mlp": 1.01781023, "epoch": 0.6871486547422215, "flos": 20824527801600.0, "grad_norm": 1.7754507598131672, "language_loss": 0.74260628, "learning_rate": 9.4167752297163e-07, "loss": 0.76738703, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20861816, "step": 11429, "time_per_iteration": 2.8790361881256104 }, { "auxiliary_loss_clip": 0.01415402, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.24930525, "balance_loss_mlp": 1.0129149, "epoch": 0.6872087779948896, "flos": 30166323540480.0, "grad_norm": 1.9537445211606002, "language_loss": 0.83708566, "learning_rate": 9.413470765102643e-07, "loss": 0.86155975, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19104004, "step": 11430, "time_per_iteration": 2.9140877723693848 }, { "auxiliary_loss_clip": 0.01410341, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.24688721, "balance_loss_mlp": 1.01267028, "epoch": 0.6872689012475575, "flos": 20714410661760.0, "grad_norm": 1.9604863977757747, "language_loss": 0.71054196, "learning_rate": 9.410166701931225e-07, "loss": 0.73497283, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20068359, "step": 11431, "time_per_iteration": 2.8622169494628906 }, { "auxiliary_loss_clip": 0.01415542, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.25113273, "balance_loss_mlp": 1.01442444, "epoch": 0.6873290245002255, "flos": 25531965486720.0, "grad_norm": 1.894136916271802, "language_loss": 0.81182325, "learning_rate": 9.406863040327355e-07, "loss": 0.83632171, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1986084, "step": 11432, "time_per_iteration": 2.9425241947174072 }, { "auxiliary_loss_clip": 0.01408554, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.24834359, "balance_loss_mlp": 1.01753354, "epoch": 0.6873891477528934, "flos": 25202247494400.0, "grad_norm": 2.0719760836853496, "language_loss": 0.68694746, "learning_rate": 9.403559780416295e-07, "loss": 0.71140611, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19775391, "step": 11433, "time_per_iteration": 2.942978858947754 }, { "auxiliary_loss_clip": 0.01419137, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.25452626, "balance_loss_mlp": 1.01393771, "epoch": 0.6874492710055614, "flos": 35165808282240.0, "grad_norm": 1.885315769550077, "language_loss": 0.73621964, "learning_rate": 9.400256922323309e-07, "loss": 0.76075488, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20458984, "step": 11434, "time_per_iteration": 2.9723289012908936 }, { "auxiliary_loss_clip": 0.01419027, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.25523448, "balance_loss_mlp": 1.01794648, "epoch": 0.6875093942582293, "flos": 17831294317440.0, "grad_norm": 1.7365168044879218, "language_loss": 0.81087482, "learning_rate": 9.396954466173657e-07, "loss": 0.83544308, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19848633, "step": 11435, "time_per_iteration": 2.819511651992798 }, { "auxiliary_loss_clip": 0.0142133, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.25414336, "balance_loss_mlp": 1.01184297, "epoch": 0.6875695175108973, "flos": 20714365416960.0, "grad_norm": 1.9927434697875628, "language_loss": 0.81620181, "learning_rate": 9.393652412092538e-07, "loss": 0.8407535, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.21948242, "step": 11436, "time_per_iteration": 2.8404290676116943 }, { "auxiliary_loss_clip": 0.01393354, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.23554969, "balance_loss_mlp": 1.01568961, "epoch": 0.6876296407635654, "flos": 25384856083200.0, "grad_norm": 1.7862468802751503, "language_loss": 0.82633305, "learning_rate": 9.390350760205183e-07, "loss": 0.85061151, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18798828, "step": 11437, "time_per_iteration": 2.8769543170928955 }, { "auxiliary_loss_clip": 0.01458965, "auxiliary_loss_mlp": 0.01037829, "balance_loss_clip": 1.28490341, "balance_loss_mlp": 1.01740813, "epoch": 0.6876897640162333, "flos": 23232852766080.0, "grad_norm": 2.516898874743628, "language_loss": 0.78659666, "learning_rate": 9.387049510636793e-07, "loss": 0.81156456, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 1.74023438, "router_z_loss_mlp": 0.20422363, "step": 11438, "time_per_iteration": 2.837991714477539 }, { "auxiliary_loss_clip": 0.01397615, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.23858023, "balance_loss_mlp": 1.01583445, "epoch": 0.6877498872689013, "flos": 27135373875840.0, "grad_norm": 1.5443776326978516, "language_loss": 0.73385751, "learning_rate": 9.383748663512554e-07, "loss": 0.7581929, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20080566, "step": 11439, "time_per_iteration": 2.954479932785034 }, { "auxiliary_loss_clip": 0.01411794, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.24912846, "balance_loss_mlp": 1.01512516, "epoch": 0.6878100105215692, "flos": 11588731925760.0, "grad_norm": 2.1067853640506895, "language_loss": 0.76213574, "learning_rate": 9.380448218957623e-07, "loss": 0.78660882, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20385742, "step": 11440, "time_per_iteration": 2.858375072479248 }, { "auxiliary_loss_clip": 0.01405229, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.24516082, "balance_loss_mlp": 1.01767898, "epoch": 0.6878701337742372, "flos": 20312879892480.0, "grad_norm": 1.7464982199734727, "language_loss": 0.72706294, "learning_rate": 9.377148177097167e-07, "loss": 0.75149524, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.20324707, "step": 11441, "time_per_iteration": 2.844583034515381 }, { "auxiliary_loss_clip": 0.0142717, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.25780249, "balance_loss_mlp": 1.01296878, "epoch": 0.6879302570269051, "flos": 13846780350720.0, "grad_norm": 1.742699604484308, "language_loss": 0.67420954, "learning_rate": 9.373848538056317e-07, "loss": 0.69881707, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.20617676, "step": 11442, "time_per_iteration": 2.8520867824554443 }, { "auxiliary_loss_clip": 0.01421231, "auxiliary_loss_mlp": 0.01038119, "balance_loss_clip": 1.25734007, "balance_loss_mlp": 1.01874745, "epoch": 0.6879903802795732, "flos": 21334592142720.0, "grad_norm": 9.465640541212256, "language_loss": 0.7055434, "learning_rate": 9.370549301960189e-07, "loss": 0.73013699, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19360352, "step": 11443, "time_per_iteration": 2.937373399734497 }, { "auxiliary_loss_clip": 0.01422473, "auxiliary_loss_mlp": 0.01036633, "balance_loss_clip": 1.25759912, "balance_loss_mlp": 1.01572394, "epoch": 0.6880505035322411, "flos": 25162042849920.0, "grad_norm": 1.4573042767276814, "language_loss": 0.76867545, "learning_rate": 9.367250468933893e-07, "loss": 0.79326648, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20910645, "step": 11444, "time_per_iteration": 4.379446506500244 }, { "auxiliary_loss_clip": 0.01406808, "auxiliary_loss_mlp": 0.01029482, "balance_loss_clip": 1.24533784, "balance_loss_mlp": 1.01013458, "epoch": 0.6881106267849091, "flos": 23224301498880.0, "grad_norm": 2.0507362206188393, "language_loss": 0.77369654, "learning_rate": 9.363952039102536e-07, "loss": 0.79805946, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19335938, "step": 11445, "time_per_iteration": 2.8553643226623535 }, { "auxiliary_loss_clip": 0.01193045, "auxiliary_loss_mlp": 0.01031515, "balance_loss_clip": 1.09947145, "balance_loss_mlp": 1.01091564, "epoch": 0.688170750037577, "flos": 48505103245440.0, "grad_norm": 0.8256906310845094, "language_loss": 0.58371955, "learning_rate": 9.360654012591183e-07, "loss": 0.60596514, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20605469, "step": 11446, "time_per_iteration": 3.4447128772735596 }, { "auxiliary_loss_clip": 0.01432086, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.26278996, "balance_loss_mlp": 1.01614189, "epoch": 0.688230873290245, "flos": 22793832040320.0, "grad_norm": 1.480043388170362, "language_loss": 0.76028907, "learning_rate": 9.357356389524886e-07, "loss": 0.78497225, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.20080566, "step": 11447, "time_per_iteration": 2.9029901027679443 }, { "auxiliary_loss_clip": 0.01415265, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.25057161, "balance_loss_mlp": 1.01143086, "epoch": 0.6882909965429129, "flos": 22465878595200.0, "grad_norm": 1.929946783971295, "language_loss": 0.7397123, "learning_rate": 9.354059170028705e-07, "loss": 0.76417834, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19909668, "step": 11448, "time_per_iteration": 2.8681349754333496 }, { "auxiliary_loss_clip": 0.01431963, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.26095009, "balance_loss_mlp": 1.01576591, "epoch": 0.688351119795581, "flos": 26225452823040.0, "grad_norm": 1.7278142835160022, "language_loss": 0.75615418, "learning_rate": 9.350762354227673e-07, "loss": 0.78083551, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20410156, "step": 11449, "time_per_iteration": 2.9387545585632324 }, { "auxiliary_loss_clip": 0.01419625, "auxiliary_loss_mlp": 0.01038551, "balance_loss_clip": 1.25581753, "balance_loss_mlp": 1.01971626, "epoch": 0.6884112430482489, "flos": 22575543287040.0, "grad_norm": 1.8079484956595933, "language_loss": 0.7086755, "learning_rate": 9.34746594224679e-07, "loss": 0.73325729, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18847656, "step": 11450, "time_per_iteration": 2.838945150375366 }, { "auxiliary_loss_clip": 0.01434683, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.26417983, "balance_loss_mlp": 1.01766801, "epoch": 0.6884713663009169, "flos": 17348223139200.0, "grad_norm": 2.552228083450065, "language_loss": 0.77123988, "learning_rate": 9.344169934211068e-07, "loss": 0.79595852, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 1.70605469, "router_z_loss_mlp": 0.1953125, "step": 11451, "time_per_iteration": 2.855661392211914 }, { "auxiliary_loss_clip": 0.01438604, "auxiliary_loss_mlp": 0.01034105, "balance_loss_clip": 1.27227402, "balance_loss_mlp": 1.01541305, "epoch": 0.6885314895535849, "flos": 26482543632000.0, "grad_norm": 3.6074601365033874, "language_loss": 0.69334584, "learning_rate": 9.340874330245505e-07, "loss": 0.71807289, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.18688965, "step": 11452, "time_per_iteration": 2.905358076095581 }, { "auxiliary_loss_clip": 0.01417755, "auxiliary_loss_mlp": 0.0104207, "balance_loss_clip": 1.2540319, "balance_loss_mlp": 1.02048075, "epoch": 0.6885916128062528, "flos": 20531168645760.0, "grad_norm": 1.5889071208768317, "language_loss": 0.7258296, "learning_rate": 9.337579130475042e-07, "loss": 0.75042784, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.21606445, "step": 11453, "time_per_iteration": 2.895993232727051 }, { "auxiliary_loss_clip": 0.01197839, "auxiliary_loss_mlp": 0.01026535, "balance_loss_clip": 1.10306668, "balance_loss_mlp": 1.00889242, "epoch": 0.6886517360589208, "flos": 70745272940160.0, "grad_norm": 0.791370476608203, "language_loss": 0.50627553, "learning_rate": 9.334284335024644e-07, "loss": 0.52851927, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.17675781, "step": 11454, "time_per_iteration": 3.2211222648620605 }, { "auxiliary_loss_clip": 0.01401461, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.24435687, "balance_loss_mlp": 1.0174551, "epoch": 0.6887118593115887, "flos": 17902202198400.0, "grad_norm": 2.216523756908758, "language_loss": 0.76650625, "learning_rate": 9.330989944019263e-07, "loss": 0.7908932, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19775391, "step": 11455, "time_per_iteration": 4.342930793762207 }, { "auxiliary_loss_clip": 0.01424119, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.25532103, "balance_loss_mlp": 1.01175177, "epoch": 0.6887719825642568, "flos": 17460873987840.0, "grad_norm": 2.487589292932272, "language_loss": 0.74023169, "learning_rate": 9.327695957583803e-07, "loss": 0.76479363, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.20324707, "step": 11456, "time_per_iteration": 4.33600378036499 }, { "auxiliary_loss_clip": 0.01406299, "auxiliary_loss_mlp": 0.01036438, "balance_loss_clip": 1.24537146, "balance_loss_mlp": 1.0170188, "epoch": 0.6888321058169247, "flos": 23079092376960.0, "grad_norm": 1.581828812706011, "language_loss": 0.81525612, "learning_rate": 9.32440237584319e-07, "loss": 0.83968347, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1940918, "step": 11457, "time_per_iteration": 2.86034893989563 }, { "auxiliary_loss_clip": 0.01423471, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.2569356, "balance_loss_mlp": 1.01739812, "epoch": 0.6888922290695927, "flos": 23379554966400.0, "grad_norm": 3.8513499491618863, "language_loss": 0.77070963, "learning_rate": 9.321109198922301e-07, "loss": 0.79531717, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19885254, "step": 11458, "time_per_iteration": 2.88289213180542 }, { "auxiliary_loss_clip": 0.01413875, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.24993682, "balance_loss_mlp": 1.01390564, "epoch": 0.6889523523222606, "flos": 17638550893440.0, "grad_norm": 2.208074354601784, "language_loss": 0.68264824, "learning_rate": 9.31781642694603e-07, "loss": 0.70712614, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20019531, "step": 11459, "time_per_iteration": 4.207055330276489 }, { "auxiliary_loss_clip": 0.01414837, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.25095415, "balance_loss_mlp": 1.01690257, "epoch": 0.6890124755749286, "flos": 25238968289280.0, "grad_norm": 1.940103381934279, "language_loss": 0.69378811, "learning_rate": 9.314524060039221e-07, "loss": 0.71829617, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19067383, "step": 11460, "time_per_iteration": 2.886648178100586 }, { "auxiliary_loss_clip": 0.01446431, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.27329493, "balance_loss_mlp": 1.01441216, "epoch": 0.6890725988275965, "flos": 20239935995520.0, "grad_norm": 1.7365235566031958, "language_loss": 0.77682483, "learning_rate": 9.311232098326731e-07, "loss": 0.80163872, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 1.73242188, "router_z_loss_mlp": 0.20532227, "step": 11461, "time_per_iteration": 2.8734724521636963 }, { "auxiliary_loss_clip": 0.01409199, "auxiliary_loss_mlp": 0.01038741, "balance_loss_clip": 1.24572182, "balance_loss_mlp": 1.01896417, "epoch": 0.6891327220802645, "flos": 14542710906240.0, "grad_norm": 2.1344804064488514, "language_loss": 0.70971727, "learning_rate": 9.307940541933401e-07, "loss": 0.73419666, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19787598, "step": 11462, "time_per_iteration": 2.8903589248657227 }, { "auxiliary_loss_clip": 0.0142049, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.25483322, "balance_loss_mlp": 1.01482832, "epoch": 0.6891928453329325, "flos": 21148228235520.0, "grad_norm": 1.5419600521956216, "language_loss": 0.87569976, "learning_rate": 9.304649390984034e-07, "loss": 0.90025234, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19934082, "step": 11463, "time_per_iteration": 2.844409942626953 }, { "auxiliary_loss_clip": 0.01403144, "auxiliary_loss_mlp": 0.01036355, "balance_loss_clip": 1.2440865, "balance_loss_mlp": 1.01759124, "epoch": 0.6892529685856005, "flos": 17867562664320.0, "grad_norm": 1.8824791479645122, "language_loss": 0.69240832, "learning_rate": 9.301358645603428e-07, "loss": 0.71680331, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18762207, "step": 11464, "time_per_iteration": 2.852980852127075 }, { "auxiliary_loss_clip": 0.01421881, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.25749278, "balance_loss_mlp": 1.01747787, "epoch": 0.6893130918382685, "flos": 29946858422400.0, "grad_norm": 1.877675396092746, "language_loss": 0.65858805, "learning_rate": 9.298068305916373e-07, "loss": 0.68317431, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19262695, "step": 11465, "time_per_iteration": 2.9475553035736084 }, { "auxiliary_loss_clip": 0.01433739, "auxiliary_loss_mlp": 0.01035703, "balance_loss_clip": 1.26457763, "balance_loss_mlp": 1.01605725, "epoch": 0.6893732150909364, "flos": 24399004976640.0, "grad_norm": 1.381228478429248, "language_loss": 0.73352861, "learning_rate": 9.294778372047649e-07, "loss": 0.75822306, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.1965332, "step": 11466, "time_per_iteration": 2.8867552280426025 }, { "auxiliary_loss_clip": 0.01415244, "auxiliary_loss_mlp": 0.01033056, "balance_loss_clip": 1.25247812, "balance_loss_mlp": 1.01451886, "epoch": 0.6894333383436044, "flos": 16991692963200.0, "grad_norm": 1.641887869424099, "language_loss": 0.73157299, "learning_rate": 9.291488844121995e-07, "loss": 0.75605595, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18518066, "step": 11467, "time_per_iteration": 2.8365461826324463 }, { "auxiliary_loss_clip": 0.01425248, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.2566402, "balance_loss_mlp": 1.01742351, "epoch": 0.6894934615962723, "flos": 18993826944000.0, "grad_norm": 2.0850227159159282, "language_loss": 0.81967729, "learning_rate": 9.288199722264156e-07, "loss": 0.84431565, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.21154785, "step": 11468, "time_per_iteration": 2.8566792011260986 }, { "auxiliary_loss_clip": 0.01438342, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.27060807, "balance_loss_mlp": 1.01555085, "epoch": 0.6895535848489404, "flos": 34544812394880.0, "grad_norm": 1.6383637453091446, "language_loss": 0.6677351, "learning_rate": 9.284911006598875e-07, "loss": 0.69247264, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19836426, "step": 11469, "time_per_iteration": 2.9751250743865967 }, { "auxiliary_loss_clip": 0.01196261, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.10245728, "balance_loss_mlp": 1.00334585, "epoch": 0.6896137081016083, "flos": 50102828772480.0, "grad_norm": 0.8016572162458612, "language_loss": 0.55194318, "learning_rate": 9.281622697250824e-07, "loss": 0.57418245, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.24316406, "step": 11470, "time_per_iteration": 3.259658098220825 }, { "auxiliary_loss_clip": 0.01417874, "auxiliary_loss_mlp": 0.01035392, "balance_loss_clip": 1.25548768, "balance_loss_mlp": 1.0183928, "epoch": 0.6896738313542763, "flos": 19947798449280.0, "grad_norm": 1.643029044928726, "language_loss": 0.78992152, "learning_rate": 9.278334794344715e-07, "loss": 0.8144542, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.17004395, "step": 11471, "time_per_iteration": 2.8288698196411133 }, { "auxiliary_loss_clip": 0.01417747, "auxiliary_loss_mlp": 0.01034887, "balance_loss_clip": 1.25425148, "balance_loss_mlp": 1.01564622, "epoch": 0.6897339546069442, "flos": 21735398995200.0, "grad_norm": 1.8185206373165055, "language_loss": 0.79388702, "learning_rate": 9.275047298005232e-07, "loss": 0.81841338, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19250488, "step": 11472, "time_per_iteration": 2.8700990676879883 }, { "auxiliary_loss_clip": 0.01416512, "auxiliary_loss_mlp": 0.01035295, "balance_loss_clip": 1.25292611, "balance_loss_mlp": 1.01690102, "epoch": 0.6897940778596122, "flos": 19835464314240.0, "grad_norm": 1.7148096304408054, "language_loss": 0.76183617, "learning_rate": 9.271760208357024e-07, "loss": 0.78635418, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18383789, "step": 11473, "time_per_iteration": 2.886295795440674 }, { "auxiliary_loss_clip": 0.01422748, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.25565577, "balance_loss_mlp": 1.01405942, "epoch": 0.6898542011122801, "flos": 17318651022720.0, "grad_norm": 1.8900747504141944, "language_loss": 0.76621515, "learning_rate": 9.268473525524751e-07, "loss": 0.79077339, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19030762, "step": 11474, "time_per_iteration": 2.8892338275909424 }, { "auxiliary_loss_clip": 0.01425176, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.2596606, "balance_loss_mlp": 1.01667905, "epoch": 0.6899143243649482, "flos": 24764810336640.0, "grad_norm": 2.739696129090592, "language_loss": 0.75623763, "learning_rate": 9.26518724963303e-07, "loss": 0.78085744, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.2010498, "step": 11475, "time_per_iteration": 2.869035482406616 }, { "auxiliary_loss_clip": 0.01412416, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.24987221, "balance_loss_mlp": 1.01731992, "epoch": 0.6899744476176161, "flos": 17242449500160.0, "grad_norm": 2.455696143370247, "language_loss": 0.89666349, "learning_rate": 9.261901380806491e-07, "loss": 0.92116159, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20068359, "step": 11476, "time_per_iteration": 2.972435235977173 }, { "auxiliary_loss_clip": 0.01403671, "auxiliary_loss_mlp": 0.01038071, "balance_loss_clip": 1.24268508, "balance_loss_mlp": 1.01798439, "epoch": 0.6900345708702841, "flos": 25421576878080.0, "grad_norm": 1.3680420303569714, "language_loss": 0.70747238, "learning_rate": 9.258615919169724e-07, "loss": 0.73188984, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20092773, "step": 11477, "time_per_iteration": 2.9140870571136475 }, { "auxiliary_loss_clip": 0.01434833, "auxiliary_loss_mlp": 0.01036735, "balance_loss_clip": 1.26677549, "balance_loss_mlp": 1.01686287, "epoch": 0.6900946941229521, "flos": 23442952210560.0, "grad_norm": 2.604171037995578, "language_loss": 0.69121814, "learning_rate": 9.255330864847313e-07, "loss": 0.7159338, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19873047, "step": 11478, "time_per_iteration": 2.873448371887207 }, { "auxiliary_loss_clip": 0.01425507, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.26045954, "balance_loss_mlp": 1.01706982, "epoch": 0.69015481737562, "flos": 17828624874240.0, "grad_norm": 3.0850416853522753, "language_loss": 0.7688185, "learning_rate": 9.252046217963843e-07, "loss": 0.79343808, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19360352, "step": 11479, "time_per_iteration": 2.8568553924560547 }, { "auxiliary_loss_clip": 0.01425752, "auxiliary_loss_mlp": 0.0103437, "balance_loss_clip": 1.25863659, "balance_loss_mlp": 1.01486731, "epoch": 0.690214940628288, "flos": 17465262733440.0, "grad_norm": 1.6370902301624772, "language_loss": 0.79747081, "learning_rate": 9.248761978643856e-07, "loss": 0.82207209, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19506836, "step": 11480, "time_per_iteration": 4.222980976104736 }, { "auxiliary_loss_clip": 0.01411946, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.24879932, "balance_loss_mlp": 1.01251626, "epoch": 0.6902750638809559, "flos": 29577795436800.0, "grad_norm": 1.5494769463191953, "language_loss": 0.75867736, "learning_rate": 9.245478147011885e-07, "loss": 0.78311974, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19787598, "step": 11481, "time_per_iteration": 2.919602870941162 }, { "auxiliary_loss_clip": 0.01411984, "auxiliary_loss_mlp": 0.01030519, "balance_loss_clip": 1.25016999, "balance_loss_mlp": 1.01076639, "epoch": 0.690335187133624, "flos": 25568188588800.0, "grad_norm": 1.8308437893742748, "language_loss": 0.70403075, "learning_rate": 9.24219472319246e-07, "loss": 0.72845578, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19763184, "step": 11482, "time_per_iteration": 2.9909377098083496 }, { "auxiliary_loss_clip": 0.01422336, "auxiliary_loss_mlp": 0.01032855, "balance_loss_clip": 1.25682449, "balance_loss_mlp": 1.01309061, "epoch": 0.6903953103862919, "flos": 22497712951680.0, "grad_norm": 1.7183319178861016, "language_loss": 0.83213073, "learning_rate": 9.238911707310096e-07, "loss": 0.85668266, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19775391, "step": 11483, "time_per_iteration": 2.898716688156128 }, { "auxiliary_loss_clip": 0.01429648, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.26322508, "balance_loss_mlp": 1.01432502, "epoch": 0.6904554336389599, "flos": 26110720713600.0, "grad_norm": 1.827642030192868, "language_loss": 0.66744953, "learning_rate": 9.235629099489273e-07, "loss": 0.69207978, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19042969, "step": 11484, "time_per_iteration": 2.9045732021331787 }, { "auxiliary_loss_clip": 0.01413459, "auxiliary_loss_mlp": 0.01035404, "balance_loss_clip": 1.25050211, "balance_loss_mlp": 1.01512623, "epoch": 0.6905155568916278, "flos": 31183194597120.0, "grad_norm": 1.451736122530799, "language_loss": 0.74110901, "learning_rate": 9.232346899854479e-07, "loss": 0.76559758, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20288086, "step": 11485, "time_per_iteration": 2.9417450428009033 }, { "auxiliary_loss_clip": 0.01422432, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.25569272, "balance_loss_mlp": 1.01451778, "epoch": 0.6905756801442958, "flos": 17648685728640.0, "grad_norm": 1.7150928143543336, "language_loss": 0.85963172, "learning_rate": 9.22906510853017e-07, "loss": 0.88419867, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19726562, "step": 11486, "time_per_iteration": 2.8780910968780518 }, { "auxiliary_loss_clip": 0.01428404, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.26383901, "balance_loss_mlp": 1.01798117, "epoch": 0.6906358033969637, "flos": 22353227746560.0, "grad_norm": 1.4816921104649559, "language_loss": 0.73479903, "learning_rate": 9.225783725640786e-07, "loss": 0.7594642, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20129395, "step": 11487, "time_per_iteration": 2.8461549282073975 }, { "auxiliary_loss_clip": 0.01192356, "auxiliary_loss_mlp": 0.01028794, "balance_loss_clip": 1.10023618, "balance_loss_mlp": 1.01296282, "epoch": 0.6906959266496318, "flos": 69781456051200.0, "grad_norm": 0.9085412171193092, "language_loss": 0.6676228, "learning_rate": 9.222502751310759e-07, "loss": 0.68983436, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.15820312, "step": 11488, "time_per_iteration": 3.430896043777466 }, { "auxiliary_loss_clip": 0.01434725, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.2638278, "balance_loss_mlp": 1.01446021, "epoch": 0.6907560499022997, "flos": 21444256834560.0, "grad_norm": 1.7887823726769068, "language_loss": 0.75670117, "learning_rate": 9.219222185664519e-07, "loss": 0.78140533, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 1.70898438, "router_z_loss_mlp": 0.21228027, "step": 11489, "time_per_iteration": 2.8884353637695312 }, { "auxiliary_loss_clip": 0.01422959, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.25693786, "balance_loss_mlp": 1.01368487, "epoch": 0.6908161731549677, "flos": 14400306961920.0, "grad_norm": 2.4483266871402787, "language_loss": 0.63156998, "learning_rate": 9.215942028826445e-07, "loss": 0.6561389, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20251465, "step": 11490, "time_per_iteration": 2.8524436950683594 }, { "auxiliary_loss_clip": 0.01424642, "auxiliary_loss_mlp": 0.01039608, "balance_loss_clip": 1.25988007, "balance_loss_mlp": 1.02041554, "epoch": 0.6908762964076357, "flos": 20020516122240.0, "grad_norm": 1.6966380347488528, "language_loss": 0.73350829, "learning_rate": 9.212662280920937e-07, "loss": 0.75815082, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19189453, "step": 11491, "time_per_iteration": 4.285810708999634 }, { "auxiliary_loss_clip": 0.0140708, "auxiliary_loss_mlp": 0.01039282, "balance_loss_clip": 1.24444604, "balance_loss_mlp": 1.01956439, "epoch": 0.6909364196603036, "flos": 28781294394240.0, "grad_norm": 1.5839146341973016, "language_loss": 0.70703357, "learning_rate": 9.20938294207235e-07, "loss": 0.73149729, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19714355, "step": 11492, "time_per_iteration": 2.9245502948760986 }, { "auxiliary_loss_clip": 0.01439922, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.26869774, "balance_loss_mlp": 1.01469374, "epoch": 0.6909965429129716, "flos": 22538234309760.0, "grad_norm": 1.8900265630439628, "language_loss": 0.75072229, "learning_rate": 9.206104012405049e-07, "loss": 0.77547121, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20263672, "step": 11493, "time_per_iteration": 2.8609297275543213 }, { "auxiliary_loss_clip": 0.01416907, "auxiliary_loss_mlp": 0.01031095, "balance_loss_clip": 1.2539438, "balance_loss_mlp": 1.01152086, "epoch": 0.6910566661656395, "flos": 18415162206720.0, "grad_norm": 2.6639915784882002, "language_loss": 0.75337625, "learning_rate": 9.20282549204336e-07, "loss": 0.77785623, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19567871, "step": 11494, "time_per_iteration": 4.247552871704102 }, { "auxiliary_loss_clip": 0.01414452, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.25163484, "balance_loss_mlp": 1.01614082, "epoch": 0.6911167894183076, "flos": 30786233552640.0, "grad_norm": 1.7982988022462836, "language_loss": 0.6875689, "learning_rate": 9.19954738111161e-07, "loss": 0.71206689, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19226074, "step": 11495, "time_per_iteration": 2.9460017681121826 }, { "auxiliary_loss_clip": 0.01418877, "auxiliary_loss_mlp": 0.01032583, "balance_loss_clip": 1.25440109, "balance_loss_mlp": 1.01238942, "epoch": 0.6911769126709755, "flos": 13743721399680.0, "grad_norm": 1.7015774997540176, "language_loss": 0.7443217, "learning_rate": 9.196269679734119e-07, "loss": 0.76883632, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.2019043, "step": 11496, "time_per_iteration": 2.8511016368865967 }, { "auxiliary_loss_clip": 0.01411692, "auxiliary_loss_mlp": 0.01034689, "balance_loss_clip": 1.24856687, "balance_loss_mlp": 1.01522207, "epoch": 0.6912370359236435, "flos": 17575877566080.0, "grad_norm": 2.1807626533664477, "language_loss": 0.8087281, "learning_rate": 9.19299238803515e-07, "loss": 0.83319187, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19470215, "step": 11497, "time_per_iteration": 2.849093198776245 }, { "auxiliary_loss_clip": 0.01442545, "auxiliary_loss_mlp": 0.01040962, "balance_loss_clip": 1.27411556, "balance_loss_mlp": 1.0209583, "epoch": 0.6912971591763114, "flos": 22100887641600.0, "grad_norm": 1.6630973593936271, "language_loss": 0.81303293, "learning_rate": 9.189715506138993e-07, "loss": 0.83786798, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19995117, "step": 11498, "time_per_iteration": 2.858480930328369 }, { "auxiliary_loss_clip": 0.01412634, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.25133586, "balance_loss_mlp": 1.01526451, "epoch": 0.6913572824289794, "flos": 29983579217280.0, "grad_norm": 1.437489176500461, "language_loss": 0.86470455, "learning_rate": 9.186439034169915e-07, "loss": 0.88917816, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19482422, "step": 11499, "time_per_iteration": 2.9318525791168213 }, { "auxiliary_loss_clip": 0.01413351, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 1.25205779, "balance_loss_mlp": 1.01544476, "epoch": 0.6914174056816473, "flos": 20458631952000.0, "grad_norm": 1.5401437617659615, "language_loss": 0.76227963, "learning_rate": 9.183162972252145e-07, "loss": 0.78676987, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.20214844, "step": 11500, "time_per_iteration": 2.8698318004608154 }, { "auxiliary_loss_clip": 0.01410302, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.24669898, "balance_loss_mlp": 1.01264215, "epoch": 0.6914775289343154, "flos": 21290994138240.0, "grad_norm": 1.8487118461668168, "language_loss": 0.78360963, "learning_rate": 9.179887320509921e-07, "loss": 0.80803645, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19750977, "step": 11501, "time_per_iteration": 2.855605125427246 }, { "auxiliary_loss_clip": 0.01431971, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.26463759, "balance_loss_mlp": 1.01599789, "epoch": 0.6915376521869833, "flos": 23888578677120.0, "grad_norm": 1.950313524832114, "language_loss": 0.7464937, "learning_rate": 9.176612079067458e-07, "loss": 0.77117866, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20532227, "step": 11502, "time_per_iteration": 2.9430646896362305 }, { "auxiliary_loss_clip": 0.01422544, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.2562263, "balance_loss_mlp": 1.01474643, "epoch": 0.6915977754396513, "flos": 11517733555200.0, "grad_norm": 1.9267561851167225, "language_loss": 0.7478534, "learning_rate": 9.173337248048953e-07, "loss": 0.77242547, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19897461, "step": 11503, "time_per_iteration": 2.878425359725952 }, { "auxiliary_loss_clip": 0.01410026, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.2461406, "balance_loss_mlp": 1.01581931, "epoch": 0.6916578986923193, "flos": 22611449675520.0, "grad_norm": 1.665251067969717, "language_loss": 0.77711236, "learning_rate": 9.170062827578575e-07, "loss": 0.80157304, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20227051, "step": 11504, "time_per_iteration": 2.8435680866241455 }, { "auxiliary_loss_clip": 0.01427026, "auxiliary_loss_mlp": 0.01033985, "balance_loss_clip": 1.26119685, "balance_loss_mlp": 1.01393414, "epoch": 0.6917180219449872, "flos": 23487862314240.0, "grad_norm": 1.633742850568159, "language_loss": 0.74890661, "learning_rate": 9.166788817780499e-07, "loss": 0.77351671, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20068359, "step": 11505, "time_per_iteration": 2.9157793521881104 }, { "auxiliary_loss_clip": 0.01407985, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.24547231, "balance_loss_mlp": 1.01863742, "epoch": 0.6917781451976552, "flos": 23743052841600.0, "grad_norm": 1.9231582935462288, "language_loss": 0.88382256, "learning_rate": 9.163515218778886e-07, "loss": 0.90830052, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.21166992, "step": 11506, "time_per_iteration": 2.9012980461120605 }, { "auxiliary_loss_clip": 0.01423405, "auxiliary_loss_mlp": 0.01029078, "balance_loss_clip": 1.2603333, "balance_loss_mlp": 1.00982523, "epoch": 0.6918382684503231, "flos": 31479087461760.0, "grad_norm": 2.4652077092913856, "language_loss": 0.71464038, "learning_rate": 9.160242030697856e-07, "loss": 0.73916513, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19226074, "step": 11507, "time_per_iteration": 2.94651460647583 }, { "auxiliary_loss_clip": 0.01414103, "auxiliary_loss_mlp": 0.01036278, "balance_loss_clip": 1.24792552, "balance_loss_mlp": 1.01653695, "epoch": 0.6918983917029912, "flos": 21659830899840.0, "grad_norm": 1.8109641794310387, "language_loss": 0.7775116, "learning_rate": 9.156969253661538e-07, "loss": 0.80201542, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.1973877, "step": 11508, "time_per_iteration": 2.8526206016540527 }, { "auxiliary_loss_clip": 0.01402263, "auxiliary_loss_mlp": 0.0103413, "balance_loss_clip": 1.24267399, "balance_loss_mlp": 1.01554513, "epoch": 0.6919585149556591, "flos": 25559320608000.0, "grad_norm": 1.5726520508848267, "language_loss": 0.75539982, "learning_rate": 9.153696887794027e-07, "loss": 0.77976376, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18579102, "step": 11509, "time_per_iteration": 2.9178733825683594 }, { "auxiliary_loss_clip": 0.01412375, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.25117087, "balance_loss_mlp": 1.01157713, "epoch": 0.6920186382083271, "flos": 23670335168640.0, "grad_norm": 1.584852631886142, "language_loss": 0.64949286, "learning_rate": 9.150424933219425e-07, "loss": 0.67392701, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19482422, "step": 11510, "time_per_iteration": 2.8686139583587646 }, { "auxiliary_loss_clip": 0.01436568, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.26673269, "balance_loss_mlp": 1.01367903, "epoch": 0.692078761460995, "flos": 19071159586560.0, "grad_norm": 1.7155852493044226, "language_loss": 0.76367795, "learning_rate": 9.147153390061788e-07, "loss": 0.78838587, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20544434, "step": 11511, "time_per_iteration": 2.8608882427215576 }, { "auxiliary_loss_clip": 0.0141611, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.25467062, "balance_loss_mlp": 1.01679814, "epoch": 0.692138884713663, "flos": 29035263312000.0, "grad_norm": 1.460031943751676, "language_loss": 0.63010156, "learning_rate": 9.143882258445184e-07, "loss": 0.65462708, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19641113, "step": 11512, "time_per_iteration": 2.939884901046753 }, { "auxiliary_loss_clip": 0.01422611, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.25752854, "balance_loss_mlp": 1.01243877, "epoch": 0.6921990079663309, "flos": 14766248056320.0, "grad_norm": 1.7153673109183003, "language_loss": 0.83424389, "learning_rate": 9.140611538493666e-07, "loss": 0.85880697, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.21252441, "step": 11513, "time_per_iteration": 2.8554141521453857 }, { "auxiliary_loss_clip": 0.01409229, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.24781132, "balance_loss_mlp": 1.01410425, "epoch": 0.692259131218999, "flos": 23852355575040.0, "grad_norm": 1.4766165990608944, "language_loss": 0.79045165, "learning_rate": 9.137341230331233e-07, "loss": 0.81487381, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18908691, "step": 11514, "time_per_iteration": 2.9367685317993164 }, { "auxiliary_loss_clip": 0.01432773, "auxiliary_loss_mlp": 0.01035179, "balance_loss_clip": 1.26368988, "balance_loss_mlp": 1.01667798, "epoch": 0.6923192544716669, "flos": 19144329707520.0, "grad_norm": 2.4254410388687666, "language_loss": 0.76046562, "learning_rate": 9.134071334081907e-07, "loss": 0.78514516, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.18505859, "step": 11515, "time_per_iteration": 4.238208293914795 }, { "auxiliary_loss_clip": 0.01407129, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.24721622, "balance_loss_mlp": 1.01507187, "epoch": 0.6923793777243349, "flos": 28086359224320.0, "grad_norm": 3.835398272501269, "language_loss": 0.54029846, "learning_rate": 9.130801849869694e-07, "loss": 0.56471133, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1907959, "step": 11516, "time_per_iteration": 2.883131742477417 }, { "auxiliary_loss_clip": 0.0139416, "auxiliary_loss_mlp": 0.01036557, "balance_loss_clip": 1.23863995, "balance_loss_mlp": 1.01565993, "epoch": 0.6924395009770029, "flos": 16589890725120.0, "grad_norm": 1.6606665628964752, "language_loss": 0.73663795, "learning_rate": 9.127532777818557e-07, "loss": 0.76094508, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.20874023, "step": 11517, "time_per_iteration": 2.859261989593506 }, { "auxiliary_loss_clip": 0.0141775, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.25177467, "balance_loss_mlp": 1.01747918, "epoch": 0.6924996242296708, "flos": 16664237210880.0, "grad_norm": 2.220289450751826, "language_loss": 0.77055717, "learning_rate": 9.124264118052465e-07, "loss": 0.7951144, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20495605, "step": 11518, "time_per_iteration": 2.8119728565216064 }, { "auxiliary_loss_clip": 0.01419768, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.25250494, "balance_loss_mlp": 1.01488996, "epoch": 0.6925597474823388, "flos": 34768304300160.0, "grad_norm": 1.5654027942273196, "language_loss": 0.65115154, "learning_rate": 9.120995870695376e-07, "loss": 0.6757009, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.20300293, "step": 11519, "time_per_iteration": 3.1147053241729736 }, { "auxiliary_loss_clip": 0.01412039, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.24823368, "balance_loss_mlp": 1.01577032, "epoch": 0.6926198707350067, "flos": 21881739237120.0, "grad_norm": 1.8652921045561242, "language_loss": 0.63665485, "learning_rate": 9.117728035871212e-07, "loss": 0.66113025, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19714355, "step": 11520, "time_per_iteration": 2.856724739074707 }, { "auxiliary_loss_clip": 0.01428194, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.25764477, "balance_loss_mlp": 1.01925445, "epoch": 0.6926799939876748, "flos": 13014146695680.0, "grad_norm": 1.94453896916085, "language_loss": 0.78638542, "learning_rate": 9.114460613703887e-07, "loss": 0.81106347, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.20361328, "step": 11521, "time_per_iteration": 2.840867757797241 }, { "auxiliary_loss_clip": 0.01421952, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.25470459, "balance_loss_mlp": 1.01569915, "epoch": 0.6927401172403427, "flos": 16769286933120.0, "grad_norm": 2.642113157457962, "language_loss": 0.82672524, "learning_rate": 9.111193604317304e-07, "loss": 0.85130417, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20251465, "step": 11522, "time_per_iteration": 2.8892040252685547 }, { "auxiliary_loss_clip": 0.01411317, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.24941134, "balance_loss_mlp": 1.01646888, "epoch": 0.6928002404930107, "flos": 25717424497920.0, "grad_norm": 4.596860724422841, "language_loss": 0.77193308, "learning_rate": 9.107927007835361e-07, "loss": 0.79640543, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19458008, "step": 11523, "time_per_iteration": 2.9565186500549316 }, { "auxiliary_loss_clip": 0.01409531, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.24894953, "balance_loss_mlp": 1.01493716, "epoch": 0.6928603637456786, "flos": 18597499326720.0, "grad_norm": 2.0026078095730684, "language_loss": 0.69324327, "learning_rate": 9.104660824381915e-07, "loss": 0.7176826, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19482422, "step": 11524, "time_per_iteration": 2.8525989055633545 }, { "auxiliary_loss_clip": 0.01448712, "auxiliary_loss_mlp": 0.0103876, "balance_loss_clip": 1.27844167, "balance_loss_mlp": 1.01786244, "epoch": 0.6929204869983466, "flos": 22211185760640.0, "grad_norm": 1.696181760597718, "language_loss": 0.65454435, "learning_rate": 9.101395054080815e-07, "loss": 0.67941904, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.2088623, "step": 11525, "time_per_iteration": 4.511396169662476 }, { "auxiliary_loss_clip": 0.01414946, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.25217628, "balance_loss_mlp": 1.01695323, "epoch": 0.6929806102510145, "flos": 17903740521600.0, "grad_norm": 6.671254979082253, "language_loss": 0.70550275, "learning_rate": 9.098129697055907e-07, "loss": 0.73001266, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19116211, "step": 11526, "time_per_iteration": 4.2730138301849365 }, { "auxiliary_loss_clip": 0.01420849, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.25834346, "balance_loss_mlp": 1.01570511, "epoch": 0.6930407335036826, "flos": 19764556433280.0, "grad_norm": 3.6940942231272533, "language_loss": 0.77337497, "learning_rate": 9.094864753431022e-07, "loss": 0.79793197, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19116211, "step": 11527, "time_per_iteration": 2.8586692810058594 }, { "auxiliary_loss_clip": 0.01421883, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.25806952, "balance_loss_mlp": 1.01491714, "epoch": 0.6931008567563505, "flos": 21554419219200.0, "grad_norm": 1.6003826348188137, "language_loss": 0.79996765, "learning_rate": 9.091600223329952e-07, "loss": 0.82452226, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18676758, "step": 11528, "time_per_iteration": 2.8595242500305176 }, { "auxiliary_loss_clip": 0.01398813, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.24055183, "balance_loss_mlp": 1.01589036, "epoch": 0.6931609800090185, "flos": 26261178232320.0, "grad_norm": 1.4328690850478691, "language_loss": 0.7617197, "learning_rate": 9.088336106876491e-07, "loss": 0.78606105, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19458008, "step": 11529, "time_per_iteration": 4.297672510147095 }, { "auxiliary_loss_clip": 0.01406404, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.24655402, "balance_loss_mlp": 1.01562285, "epoch": 0.6932211032616865, "flos": 32356178772480.0, "grad_norm": 1.6421911408478045, "language_loss": 0.72885263, "learning_rate": 9.085072404194436e-07, "loss": 0.75326103, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18811035, "step": 11530, "time_per_iteration": 2.942888021469116 }, { "auxiliary_loss_clip": 0.0143876, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.26934695, "balance_loss_mlp": 1.01905251, "epoch": 0.6932812265143544, "flos": 22057832574720.0, "grad_norm": 1.6443777603459018, "language_loss": 0.78956223, "learning_rate": 9.081809115407513e-07, "loss": 0.81434, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.19958496, "step": 11531, "time_per_iteration": 2.859013080596924 }, { "auxiliary_loss_clip": 0.01400493, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.24056506, "balance_loss_mlp": 1.01264513, "epoch": 0.6933413497670224, "flos": 26269503275520.0, "grad_norm": 1.5171325623270502, "language_loss": 0.69988596, "learning_rate": 9.078546240639484e-07, "loss": 0.724195, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1776123, "step": 11532, "time_per_iteration": 2.9338858127593994 }, { "auxiliary_loss_clip": 0.01422148, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.25697434, "balance_loss_mlp": 1.01504004, "epoch": 0.6934014730196904, "flos": 19582400292480.0, "grad_norm": 1.7763343982458926, "language_loss": 0.67731297, "learning_rate": 9.075283780014082e-07, "loss": 0.70187485, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.18994141, "step": 11533, "time_per_iteration": 2.8527259826660156 }, { "auxiliary_loss_clip": 0.01414653, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.24988341, "balance_loss_mlp": 1.0203824, "epoch": 0.6934615962723584, "flos": 22127292622080.0, "grad_norm": 2.6071219783977666, "language_loss": 0.60166979, "learning_rate": 9.072021733655007e-07, "loss": 0.62621832, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19824219, "step": 11534, "time_per_iteration": 2.8354551792144775 }, { "auxiliary_loss_clip": 0.01416504, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.251917, "balance_loss_mlp": 1.0155828, "epoch": 0.6935217195250263, "flos": 21370679510400.0, "grad_norm": 2.0187063830774257, "language_loss": 0.72222698, "learning_rate": 9.068760101685971e-07, "loss": 0.74673617, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18811035, "step": 11535, "time_per_iteration": 2.8536369800567627 }, { "auxiliary_loss_clip": 0.01190588, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.09715915, "balance_loss_mlp": 1.00689733, "epoch": 0.6935818427776943, "flos": 64098030625920.0, "grad_norm": 0.7119678788099219, "language_loss": 0.59073544, "learning_rate": 9.065498884230638e-07, "loss": 0.61293823, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.22753906, "step": 11536, "time_per_iteration": 3.4670650959014893 }, { "auxiliary_loss_clip": 0.0143116, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.26291907, "balance_loss_mlp": 1.01903331, "epoch": 0.6936419660303622, "flos": 20312110730880.0, "grad_norm": 2.736900967720685, "language_loss": 0.73388344, "learning_rate": 9.062238081412692e-07, "loss": 0.75857955, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19421387, "step": 11537, "time_per_iteration": 2.916876792907715 }, { "auxiliary_loss_clip": 0.0119341, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.09981036, "balance_loss_mlp": 1.02343106, "epoch": 0.6937020892830302, "flos": 67212674467200.0, "grad_norm": 0.7567591928939326, "language_loss": 0.55652022, "learning_rate": 9.058977693355767e-07, "loss": 0.57888985, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.20117188, "step": 11538, "time_per_iteration": 3.306994915008545 }, { "auxiliary_loss_clip": 0.01393765, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.23800468, "balance_loss_mlp": 1.01489866, "epoch": 0.6937622125356981, "flos": 23888669166720.0, "grad_norm": 1.4150927838528042, "language_loss": 0.78072834, "learning_rate": 9.055717720183505e-07, "loss": 0.80500847, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.19372559, "step": 11539, "time_per_iteration": 2.872532844543457 }, { "auxiliary_loss_clip": 0.01399047, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.23848033, "balance_loss_mlp": 1.01656866, "epoch": 0.6938223357883662, "flos": 28742311359360.0, "grad_norm": 1.882130286265113, "language_loss": 0.65135419, "learning_rate": 9.05245816201953e-07, "loss": 0.67569494, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18469238, "step": 11540, "time_per_iteration": 2.9064559936523438 }, { "auxiliary_loss_clip": 0.01404639, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.24405408, "balance_loss_mlp": 1.01281273, "epoch": 0.6938824590410341, "flos": 28666019347200.0, "grad_norm": 1.4242389762395766, "language_loss": 0.87361914, "learning_rate": 9.049199018987437e-07, "loss": 0.89798105, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1875, "step": 11541, "time_per_iteration": 2.9272260665893555 }, { "auxiliary_loss_clip": 0.0142086, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.25544405, "balance_loss_mlp": 1.01289487, "epoch": 0.6939425822937021, "flos": 18990705052800.0, "grad_norm": 1.627054342232854, "language_loss": 0.84971368, "learning_rate": 9.04594029121081e-07, "loss": 0.87424356, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19213867, "step": 11542, "time_per_iteration": 2.8440754413604736 }, { "auxiliary_loss_clip": 0.01426519, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.25882506, "balance_loss_mlp": 1.01339006, "epoch": 0.6940027055463701, "flos": 23086195810560.0, "grad_norm": 1.7254424088509535, "language_loss": 0.75854158, "learning_rate": 9.04268197881323e-07, "loss": 0.78314441, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20373535, "step": 11543, "time_per_iteration": 2.8757340908050537 }, { "auxiliary_loss_clip": 0.0140357, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.24144828, "balance_loss_mlp": 1.01729798, "epoch": 0.694062828799038, "flos": 18195606599040.0, "grad_norm": 1.673010160996948, "language_loss": 0.77237636, "learning_rate": 9.039424081918241e-07, "loss": 0.79677337, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18823242, "step": 11544, "time_per_iteration": 2.8544371128082275 }, { "auxiliary_loss_clip": 0.01419219, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.2540338, "balance_loss_mlp": 1.01855993, "epoch": 0.694122952051706, "flos": 17830434666240.0, "grad_norm": 2.1143586300817825, "language_loss": 0.72668087, "learning_rate": 9.036166600649388e-07, "loss": 0.75125408, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.1953125, "step": 11545, "time_per_iteration": 2.8734676837921143 }, { "auxiliary_loss_clip": 0.01411196, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.2514019, "balance_loss_mlp": 1.01611304, "epoch": 0.694183075304374, "flos": 21225244164480.0, "grad_norm": 1.6609241108856527, "language_loss": 0.80157793, "learning_rate": 9.0329095351302e-07, "loss": 0.82603455, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18359375, "step": 11546, "time_per_iteration": 2.8647756576538086 }, { "auxiliary_loss_clip": 0.01421093, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.25876832, "balance_loss_mlp": 1.01512241, "epoch": 0.694243198557042, "flos": 24071006286720.0, "grad_norm": 1.3232305189538418, "language_loss": 0.79056168, "learning_rate": 9.029652885484194e-07, "loss": 0.81511611, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19226074, "step": 11547, "time_per_iteration": 2.9062814712524414 }, { "auxiliary_loss_clip": 0.01409402, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.24866664, "balance_loss_mlp": 1.01397145, "epoch": 0.6943033218097099, "flos": 21151666840320.0, "grad_norm": 1.924840607787628, "language_loss": 0.81193364, "learning_rate": 9.026396651834834e-07, "loss": 0.83636403, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19641113, "step": 11548, "time_per_iteration": 2.8612732887268066 }, { "auxiliary_loss_clip": 0.01190447, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.09759545, "balance_loss_mlp": 1.0044775, "epoch": 0.6943634450623779, "flos": 57841306611840.0, "grad_norm": 0.7191427407658184, "language_loss": 0.53762555, "learning_rate": 9.023140834305613e-07, "loss": 0.55976647, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.19140625, "step": 11549, "time_per_iteration": 3.3805699348449707 }, { "auxiliary_loss_clip": 0.01411938, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.24828565, "balance_loss_mlp": 1.01613498, "epoch": 0.6944235683150458, "flos": 30602674823040.0, "grad_norm": 1.4481334915468063, "language_loss": 0.74226582, "learning_rate": 9.01988543302e-07, "loss": 0.76673937, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19262695, "step": 11550, "time_per_iteration": 4.355464458465576 }, { "auxiliary_loss_clip": 0.01427867, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.26167488, "balance_loss_mlp": 1.0182476, "epoch": 0.6944836915677138, "flos": 19729012003200.0, "grad_norm": 1.8200297336056745, "language_loss": 0.74701703, "learning_rate": 9.016630448101425e-07, "loss": 0.77168369, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20544434, "step": 11551, "time_per_iteration": 2.8476433753967285 }, { "auxiliary_loss_clip": 0.01418893, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.25397611, "balance_loss_mlp": 1.01673007, "epoch": 0.6945438148203817, "flos": 24874384538880.0, "grad_norm": 1.491489764583699, "language_loss": 0.847417, "learning_rate": 9.01337587967333e-07, "loss": 0.87198108, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20800781, "step": 11552, "time_per_iteration": 2.887401580810547 }, { "auxiliary_loss_clip": 0.01414157, "auxiliary_loss_mlp": 0.01033419, "balance_loss_clip": 1.25196934, "balance_loss_mlp": 1.01384521, "epoch": 0.6946039380730498, "flos": 33339224701440.0, "grad_norm": 1.484045762046424, "language_loss": 0.67850244, "learning_rate": 9.010121727859117e-07, "loss": 0.70297819, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19567871, "step": 11553, "time_per_iteration": 2.9933483600616455 }, { "auxiliary_loss_clip": 0.01449058, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.27978349, "balance_loss_mlp": 1.01750541, "epoch": 0.6946640613257177, "flos": 20860931882880.0, "grad_norm": 1.501643820900227, "language_loss": 0.79716694, "learning_rate": 9.006867992782195e-07, "loss": 0.82203621, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.20361328, "step": 11554, "time_per_iteration": 2.8578603267669678 }, { "auxiliary_loss_clip": 0.01415834, "auxiliary_loss_mlp": 0.01033167, "balance_loss_clip": 1.25051248, "balance_loss_mlp": 1.01344967, "epoch": 0.6947241845783857, "flos": 19364383008000.0, "grad_norm": 1.8223108437981177, "language_loss": 0.73686749, "learning_rate": 9.003614674565934e-07, "loss": 0.76135749, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19726562, "step": 11555, "time_per_iteration": 2.8619163036346436 }, { "auxiliary_loss_clip": 0.0140834, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.24578285, "balance_loss_mlp": 1.0156405, "epoch": 0.6947843078310536, "flos": 27130215968640.0, "grad_norm": 1.7558027533526908, "language_loss": 0.78586274, "learning_rate": 9.000361773333705e-07, "loss": 0.81028807, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18566895, "step": 11556, "time_per_iteration": 2.906740188598633 }, { "auxiliary_loss_clip": 0.01413189, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.24902654, "balance_loss_mlp": 1.01823425, "epoch": 0.6948444310837216, "flos": 28596876013440.0, "grad_norm": 2.154351242295796, "language_loss": 0.6140281, "learning_rate": 8.997109289208869e-07, "loss": 0.63852859, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1862793, "step": 11557, "time_per_iteration": 2.9419431686401367 }, { "auxiliary_loss_clip": 0.01407099, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.24635673, "balance_loss_mlp": 1.01349354, "epoch": 0.6949045543363896, "flos": 15677707432320.0, "grad_norm": 1.632112169473129, "language_loss": 0.86133695, "learning_rate": 8.993857222314752e-07, "loss": 0.88574016, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19726562, "step": 11558, "time_per_iteration": 2.8617608547210693 }, { "auxiliary_loss_clip": 0.01428025, "auxiliary_loss_mlp": 0.01034854, "balance_loss_clip": 1.2601881, "balance_loss_mlp": 1.01457667, "epoch": 0.6949646775890576, "flos": 23270116498560.0, "grad_norm": 1.6732830219327683, "language_loss": 0.70763361, "learning_rate": 8.990605572774664e-07, "loss": 0.73226249, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.20251465, "step": 11559, "time_per_iteration": 2.897136688232422 }, { "auxiliary_loss_clip": 0.01422667, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.25952196, "balance_loss_mlp": 1.01322103, "epoch": 0.6950248008417256, "flos": 22392708474240.0, "grad_norm": 1.4725532868891718, "language_loss": 0.79994047, "learning_rate": 8.987354340711921e-07, "loss": 0.82448506, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18566895, "step": 11560, "time_per_iteration": 5.729364395141602 }, { "auxiliary_loss_clip": 0.01403789, "auxiliary_loss_mlp": 0.01031633, "balance_loss_clip": 1.24362016, "balance_loss_mlp": 1.01209426, "epoch": 0.6950849240943935, "flos": 23487862314240.0, "grad_norm": 2.198515972028677, "language_loss": 0.77475113, "learning_rate": 8.9841035262498e-07, "loss": 0.79910529, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19567871, "step": 11561, "time_per_iteration": 3.024867296218872 }, { "auxiliary_loss_clip": 0.01409153, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.24721599, "balance_loss_mlp": 1.01488733, "epoch": 0.6951450473470615, "flos": 17429446834560.0, "grad_norm": 3.7916738230705946, "language_loss": 0.78571546, "learning_rate": 8.980853129511577e-07, "loss": 0.8101508, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19494629, "step": 11562, "time_per_iteration": 2.884848117828369 }, { "auxiliary_loss_clip": 0.0141977, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.25421715, "balance_loss_mlp": 1.01726127, "epoch": 0.6952051705997294, "flos": 20495533726080.0, "grad_norm": 1.804438702355151, "language_loss": 0.6956681, "learning_rate": 8.977603150620515e-07, "loss": 0.72024059, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20214844, "step": 11563, "time_per_iteration": 4.334613084793091 }, { "auxiliary_loss_clip": 0.01398817, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.24095058, "balance_loss_mlp": 1.01069295, "epoch": 0.6952652938523974, "flos": 13997192624640.0, "grad_norm": 2.1098769041553496, "language_loss": 0.74289036, "learning_rate": 8.974353589699846e-07, "loss": 0.76717311, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.1875, "step": 11564, "time_per_iteration": 2.825165033340454 }, { "auxiliary_loss_clip": 0.01460631, "auxiliary_loss_mlp": 0.01044366, "balance_loss_clip": 1.28482485, "balance_loss_mlp": 1.02342081, "epoch": 0.6953254171050653, "flos": 30965629760640.0, "grad_norm": 1.7936143228763208, "language_loss": 0.72776115, "learning_rate": 8.971104446872785e-07, "loss": 0.75281107, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 1.75878906, "router_z_loss_mlp": 0.20959473, "step": 11565, "time_per_iteration": 2.9423716068267822 }, { "auxiliary_loss_clip": 0.01192125, "auxiliary_loss_mlp": 0.01021262, "balance_loss_clip": 1.09758139, "balance_loss_mlp": 0.99713439, "epoch": 0.6953855403577334, "flos": 61698148456320.0, "grad_norm": 0.9180875469951174, "language_loss": 0.58500409, "learning_rate": 8.96785572226255e-07, "loss": 0.60713798, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 0.9453125, "router_z_loss_mlp": 0.24121094, "step": 11566, "time_per_iteration": 3.2167866230010986 }, { "auxiliary_loss_clip": 0.01415894, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.24939406, "balance_loss_mlp": 1.011657, "epoch": 0.6954456636104013, "flos": 23049294036480.0, "grad_norm": 1.7981596322877396, "language_loss": 0.74527478, "learning_rate": 8.964607415992338e-07, "loss": 0.76974547, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19519043, "step": 11567, "time_per_iteration": 2.9325613975524902 }, { "auxiliary_loss_clip": 0.01406107, "auxiliary_loss_mlp": 0.01036206, "balance_loss_clip": 1.24564838, "balance_loss_mlp": 1.01604831, "epoch": 0.6955057868630693, "flos": 23930004931200.0, "grad_norm": 1.2530817917592652, "language_loss": 0.77079207, "learning_rate": 8.961359528185313e-07, "loss": 0.79521519, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20153809, "step": 11568, "time_per_iteration": 2.920728921890259 }, { "auxiliary_loss_clip": 0.01411872, "auxiliary_loss_mlp": 0.01034403, "balance_loss_clip": 1.25039351, "balance_loss_mlp": 1.01527023, "epoch": 0.6955659101157372, "flos": 22603079387520.0, "grad_norm": 21.18210745015226, "language_loss": 0.73111677, "learning_rate": 8.958112058964649e-07, "loss": 0.75557959, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19128418, "step": 11569, "time_per_iteration": 2.869680166244507 }, { "auxiliary_loss_clip": 0.01416498, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.253263, "balance_loss_mlp": 1.01356459, "epoch": 0.6956260333684052, "flos": 24583468602240.0, "grad_norm": 1.6132259343313964, "language_loss": 0.77597332, "learning_rate": 8.954865008453471e-07, "loss": 0.80047655, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.20275879, "step": 11570, "time_per_iteration": 2.89581561088562 }, { "auxiliary_loss_clip": 0.01425862, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.25917101, "balance_loss_mlp": 1.01249194, "epoch": 0.6956861566210732, "flos": 25856435082240.0, "grad_norm": 2.5290420895878842, "language_loss": 0.75267249, "learning_rate": 8.95161837677493e-07, "loss": 0.77725971, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20336914, "step": 11571, "time_per_iteration": 2.9328274726867676 }, { "auxiliary_loss_clip": 0.01400806, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.24352205, "balance_loss_mlp": 1.01185846, "epoch": 0.6957462798737412, "flos": 15308961160320.0, "grad_norm": 1.9621282340486275, "language_loss": 0.75222391, "learning_rate": 8.948372164052118e-07, "loss": 0.77654034, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18969727, "step": 11572, "time_per_iteration": 2.9461464881896973 }, { "auxiliary_loss_clip": 0.01402679, "auxiliary_loss_mlp": 0.01031332, "balance_loss_clip": 1.24054754, "balance_loss_mlp": 1.01250887, "epoch": 0.6958064031264092, "flos": 36260509674240.0, "grad_norm": 2.1107299289905868, "language_loss": 0.70815361, "learning_rate": 8.94512637040814e-07, "loss": 0.7324937, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18823242, "step": 11573, "time_per_iteration": 2.97440242767334 }, { "auxiliary_loss_clip": 0.01449149, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.27911365, "balance_loss_mlp": 1.01796281, "epoch": 0.6958665263790771, "flos": 19218178500480.0, "grad_norm": 1.7643323474626194, "language_loss": 0.75421327, "learning_rate": 8.941880995966095e-07, "loss": 0.77907586, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 1.70410156, "router_z_loss_mlp": 0.19140625, "step": 11574, "time_per_iteration": 2.8527088165283203 }, { "auxiliary_loss_clip": 0.01426993, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.25898457, "balance_loss_mlp": 1.01383209, "epoch": 0.6959266496317451, "flos": 21805085266560.0, "grad_norm": 1.8627069301052535, "language_loss": 0.7486707, "learning_rate": 8.938636040849014e-07, "loss": 0.77327561, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.1965332, "step": 11575, "time_per_iteration": 2.904594898223877 }, { "auxiliary_loss_clip": 0.01415333, "auxiliary_loss_mlp": 0.01031756, "balance_loss_clip": 1.25115585, "balance_loss_mlp": 1.01188374, "epoch": 0.695986772884413, "flos": 20567934685440.0, "grad_norm": 5.212018950050875, "language_loss": 0.80247915, "learning_rate": 8.935391505179966e-07, "loss": 0.82695001, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1986084, "step": 11576, "time_per_iteration": 2.8874452114105225 }, { "auxiliary_loss_clip": 0.01422273, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.25421858, "balance_loss_mlp": 1.0112735, "epoch": 0.696046896137081, "flos": 14943608248320.0, "grad_norm": 2.9301157685993227, "language_loss": 0.58093488, "learning_rate": 8.932147389081985e-07, "loss": 0.60547578, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20544434, "step": 11577, "time_per_iteration": 2.8688881397247314 }, { "auxiliary_loss_clip": 0.01392523, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.23465312, "balance_loss_mlp": 1.00870132, "epoch": 0.696107019389749, "flos": 30753177586560.0, "grad_norm": 1.525627461832965, "language_loss": 0.77225506, "learning_rate": 8.928903692678081e-07, "loss": 0.79645282, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18554688, "step": 11578, "time_per_iteration": 2.959563970565796 }, { "auxiliary_loss_clip": 0.01412472, "auxiliary_loss_mlp": 0.01036358, "balance_loss_clip": 1.24885392, "balance_loss_mlp": 1.01711726, "epoch": 0.696167142642417, "flos": 20786359173120.0, "grad_norm": 2.10111104071756, "language_loss": 0.80965036, "learning_rate": 8.925660416091254e-07, "loss": 0.83413863, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19226074, "step": 11579, "time_per_iteration": 2.856285333633423 }, { "auxiliary_loss_clip": 0.01404115, "auxiliary_loss_mlp": 0.01031831, "balance_loss_clip": 1.2432549, "balance_loss_mlp": 1.01177967, "epoch": 0.6962272658950849, "flos": 22575407552640.0, "grad_norm": 1.8470977396234243, "language_loss": 0.73661661, "learning_rate": 8.922417559444502e-07, "loss": 0.76097608, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20056152, "step": 11580, "time_per_iteration": 2.907959461212158 }, { "auxiliary_loss_clip": 0.0141804, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.25282085, "balance_loss_mlp": 1.01381052, "epoch": 0.6962873891477529, "flos": 22210280864640.0, "grad_norm": 2.1301365349624586, "language_loss": 0.66443908, "learning_rate": 8.919175122860787e-07, "loss": 0.68895996, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20239258, "step": 11581, "time_per_iteration": 2.8702609539031982 }, { "auxiliary_loss_clip": 0.01407929, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.24570751, "balance_loss_mlp": 1.01103461, "epoch": 0.6963475124004208, "flos": 12495938290560.0, "grad_norm": 2.1583851598034185, "language_loss": 0.77506554, "learning_rate": 8.915933106463056e-07, "loss": 0.7994523, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19726562, "step": 11582, "time_per_iteration": 2.815110206604004 }, { "auxiliary_loss_clip": 0.01399586, "auxiliary_loss_mlp": 0.01030273, "balance_loss_clip": 1.23732138, "balance_loss_mlp": 1.01170015, "epoch": 0.6964076356530888, "flos": 17173894348800.0, "grad_norm": 1.9723614810778611, "language_loss": 0.70353228, "learning_rate": 8.91269151037425e-07, "loss": 0.72783089, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18591309, "step": 11583, "time_per_iteration": 2.87984561920166 }, { "auxiliary_loss_clip": 0.01416077, "auxiliary_loss_mlp": 0.01032655, "balance_loss_clip": 1.25348353, "balance_loss_mlp": 1.01331949, "epoch": 0.6964677589057569, "flos": 19946667329280.0, "grad_norm": 2.6918858748377397, "language_loss": 0.8313309, "learning_rate": 8.909450334717301e-07, "loss": 0.85581815, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19335938, "step": 11584, "time_per_iteration": 2.886331558227539 }, { "auxiliary_loss_clip": 0.01425192, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.2594167, "balance_loss_mlp": 1.01383996, "epoch": 0.6965278821584248, "flos": 22794374977920.0, "grad_norm": 2.4428956119151994, "language_loss": 0.8055625, "learning_rate": 8.906209579615107e-07, "loss": 0.83015847, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20556641, "step": 11585, "time_per_iteration": 4.268278121948242 }, { "auxiliary_loss_clip": 0.01401509, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.24200964, "balance_loss_mlp": 1.01457882, "epoch": 0.6965880054110928, "flos": 20057146427520.0, "grad_norm": 1.54150622458124, "language_loss": 0.77927184, "learning_rate": 8.90296924519055e-07, "loss": 0.80363238, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19958496, "step": 11586, "time_per_iteration": 2.866168260574341 }, { "auxiliary_loss_clip": 0.01388448, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.23185706, "balance_loss_mlp": 1.01255548, "epoch": 0.6966481286637607, "flos": 21918279052800.0, "grad_norm": 2.378691278223498, "language_loss": 0.79677951, "learning_rate": 8.899729331566519e-07, "loss": 0.8209902, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.20031738, "step": 11587, "time_per_iteration": 2.86370587348938 }, { "auxiliary_loss_clip": 0.01392564, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.23526812, "balance_loss_mlp": 1.01361156, "epoch": 0.6967082519164287, "flos": 15641710554240.0, "grad_norm": 1.7844293310055668, "language_loss": 0.73884082, "learning_rate": 8.896489838865857e-07, "loss": 0.76309711, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19458008, "step": 11588, "time_per_iteration": 2.8338394165039062 }, { "auxiliary_loss_clip": 0.01402915, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.24036002, "balance_loss_mlp": 1.01280868, "epoch": 0.6967683751690966, "flos": 24035235632640.0, "grad_norm": 1.6276920243715087, "language_loss": 0.76201957, "learning_rate": 8.893250767211413e-07, "loss": 0.78636885, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19189453, "step": 11589, "time_per_iteration": 2.877988815307617 }, { "auxiliary_loss_clip": 0.01403306, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.24108028, "balance_loss_mlp": 1.01216412, "epoch": 0.6968284984217646, "flos": 31035813724800.0, "grad_norm": 1.7184693856345636, "language_loss": 0.64411879, "learning_rate": 8.890012116726012e-07, "loss": 0.66846985, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19628906, "step": 11590, "time_per_iteration": 2.961817502975464 }, { "auxiliary_loss_clip": 0.01197105, "auxiliary_loss_mlp": 0.01036011, "balance_loss_clip": 1.10387516, "balance_loss_mlp": 1.01407611, "epoch": 0.6968886216744326, "flos": 67653866943360.0, "grad_norm": 0.7557423256790824, "language_loss": 0.61292011, "learning_rate": 8.88677388753248e-07, "loss": 0.63525122, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.21972656, "step": 11591, "time_per_iteration": 3.438796281814575 }, { "auxiliary_loss_clip": 0.01417755, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.25490856, "balance_loss_mlp": 1.01692247, "epoch": 0.6969487449271006, "flos": 24874520273280.0, "grad_norm": 1.5822946820983526, "language_loss": 0.70120966, "learning_rate": 8.883536079753582e-07, "loss": 0.72576618, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20983887, "step": 11592, "time_per_iteration": 2.895056962966919 }, { "auxiliary_loss_clip": 0.01416798, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.25516486, "balance_loss_mlp": 1.01461887, "epoch": 0.6970088681797685, "flos": 28779756071040.0, "grad_norm": 1.5709667561399687, "language_loss": 0.63153285, "learning_rate": 8.880298693512109e-07, "loss": 0.65603912, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.1920166, "step": 11593, "time_per_iteration": 2.9703571796417236 }, { "auxiliary_loss_clip": 0.01393949, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.23544121, "balance_loss_mlp": 1.01522446, "epoch": 0.6970689914324365, "flos": 27320108970240.0, "grad_norm": 1.3746198203990576, "language_loss": 0.54402649, "learning_rate": 8.877061728930832e-07, "loss": 0.56831336, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19519043, "step": 11594, "time_per_iteration": 2.9341232776641846 }, { "auxiliary_loss_clip": 0.01415082, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.25183177, "balance_loss_mlp": 1.01388025, "epoch": 0.6971291146851044, "flos": 19145822785920.0, "grad_norm": 2.2862345065269007, "language_loss": 0.78269649, "learning_rate": 8.87382518613248e-07, "loss": 0.8071748, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18847656, "step": 11595, "time_per_iteration": 5.66045069694519 }, { "auxiliary_loss_clip": 0.01415119, "auxiliary_loss_mlp": 0.01036329, "balance_loss_clip": 1.25012541, "balance_loss_mlp": 1.01571798, "epoch": 0.6971892379377724, "flos": 14618505225600.0, "grad_norm": 2.377005486997459, "language_loss": 0.72564244, "learning_rate": 8.870589065239793e-07, "loss": 0.75015694, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20617676, "step": 11596, "time_per_iteration": 2.8214449882507324 }, { "auxiliary_loss_clip": 0.01418165, "auxiliary_loss_mlp": 0.01038049, "balance_loss_clip": 1.25483084, "balance_loss_mlp": 1.01816463, "epoch": 0.6972493611904405, "flos": 22317049889280.0, "grad_norm": 1.9424288102484915, "language_loss": 0.77003741, "learning_rate": 8.867353366375492e-07, "loss": 0.79459953, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19897461, "step": 11597, "time_per_iteration": 2.868818998336792 }, { "auxiliary_loss_clip": 0.01410358, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.24716246, "balance_loss_mlp": 1.01794577, "epoch": 0.6973094844431084, "flos": 17429220610560.0, "grad_norm": 1.9498856828740008, "language_loss": 0.75558686, "learning_rate": 8.864118089662267e-07, "loss": 0.78006876, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19885254, "step": 11598, "time_per_iteration": 4.342143297195435 }, { "auxiliary_loss_clip": 0.01415996, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.24936819, "balance_loss_mlp": 1.01701653, "epoch": 0.6973696076957764, "flos": 27246848359680.0, "grad_norm": 1.856523923701185, "language_loss": 0.90272582, "learning_rate": 8.860883235222791e-07, "loss": 0.92726606, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.21020508, "step": 11599, "time_per_iteration": 2.9140918254852295 }, { "auxiliary_loss_clip": 0.01434615, "auxiliary_loss_mlp": 0.01042479, "balance_loss_clip": 1.26459241, "balance_loss_mlp": 1.02098608, "epoch": 0.6974297309484443, "flos": 22028260458240.0, "grad_norm": 2.1045856265300253, "language_loss": 0.70103025, "learning_rate": 8.85764880317974e-07, "loss": 0.72580123, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.21508789, "step": 11600, "time_per_iteration": 2.8804945945739746 }, { "auxiliary_loss_clip": 0.01418331, "auxiliary_loss_mlp": 0.01038709, "balance_loss_clip": 1.25303447, "balance_loss_mlp": 1.01720393, "epoch": 0.6974898542011123, "flos": 28378225301760.0, "grad_norm": 3.1482055349078406, "language_loss": 0.77510166, "learning_rate": 8.854414793655771e-07, "loss": 0.79967201, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.21496582, "step": 11601, "time_per_iteration": 2.9103167057037354 }, { "auxiliary_loss_clip": 0.01393156, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.235201, "balance_loss_mlp": 1.01629889, "epoch": 0.6975499774537802, "flos": 15240541743360.0, "grad_norm": 2.075832261308193, "language_loss": 0.72846448, "learning_rate": 8.851181206773508e-07, "loss": 0.75275922, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.20007324, "step": 11602, "time_per_iteration": 2.877328634262085 }, { "auxiliary_loss_clip": 0.01416007, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.25377595, "balance_loss_mlp": 1.01597881, "epoch": 0.6976101007064482, "flos": 22165913698560.0, "grad_norm": 2.165426834815899, "language_loss": 0.77131742, "learning_rate": 8.847948042655567e-07, "loss": 0.79583549, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19836426, "step": 11603, "time_per_iteration": 2.8663036823272705 }, { "auxiliary_loss_clip": 0.01414069, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.25172579, "balance_loss_mlp": 1.01151681, "epoch": 0.6976702239591162, "flos": 22283767699200.0, "grad_norm": 1.6632891208023308, "language_loss": 0.62852293, "learning_rate": 8.844715301424557e-07, "loss": 0.65297812, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19946289, "step": 11604, "time_per_iteration": 2.8941104412078857 }, { "auxiliary_loss_clip": 0.01416865, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.25286019, "balance_loss_mlp": 1.01452613, "epoch": 0.6977303472117842, "flos": 25859647463040.0, "grad_norm": 2.084829839459537, "language_loss": 0.82437348, "learning_rate": 8.841482983203057e-07, "loss": 0.84890133, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.21411133, "step": 11605, "time_per_iteration": 2.862170696258545 }, { "auxiliary_loss_clip": 0.01408673, "auxiliary_loss_mlp": 0.01036374, "balance_loss_clip": 1.24698782, "balance_loss_mlp": 1.01589382, "epoch": 0.6977904704644521, "flos": 20969374965120.0, "grad_norm": 1.5835472155412509, "language_loss": 0.70817077, "learning_rate": 8.838251088113638e-07, "loss": 0.73262119, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.20483398, "step": 11606, "time_per_iteration": 2.849299669265747 }, { "auxiliary_loss_clip": 0.01425068, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.25779796, "balance_loss_mlp": 1.01723766, "epoch": 0.6978505937171201, "flos": 22065343211520.0, "grad_norm": 1.8984628354700894, "language_loss": 0.83273292, "learning_rate": 8.835019616278856e-07, "loss": 0.85736394, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.20776367, "step": 11607, "time_per_iteration": 2.8632678985595703 }, { "auxiliary_loss_clip": 0.01417999, "auxiliary_loss_mlp": 0.0103455, "balance_loss_clip": 1.24976802, "balance_loss_mlp": 1.01371205, "epoch": 0.697910716969788, "flos": 20052305233920.0, "grad_norm": 1.8612738692910307, "language_loss": 0.80197072, "learning_rate": 8.831788567821265e-07, "loss": 0.82649618, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20849609, "step": 11608, "time_per_iteration": 2.8244423866271973 }, { "auxiliary_loss_clip": 0.01407437, "auxiliary_loss_mlp": 0.01033212, "balance_loss_clip": 1.24325442, "balance_loss_mlp": 1.01398349, "epoch": 0.697970840222456, "flos": 15896765347200.0, "grad_norm": 1.811677087541155, "language_loss": 0.91009283, "learning_rate": 8.828557942863357e-07, "loss": 0.93449938, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19226074, "step": 11609, "time_per_iteration": 2.82832932472229 }, { "auxiliary_loss_clip": 0.01424716, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.25756359, "balance_loss_mlp": 1.02209353, "epoch": 0.698030963475124, "flos": 21225651367680.0, "grad_norm": 2.3761796428148982, "language_loss": 0.65071034, "learning_rate": 8.82532774152765e-07, "loss": 0.67537582, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.1973877, "step": 11610, "time_per_iteration": 2.912391185760498 }, { "auxiliary_loss_clip": 0.01400616, "auxiliary_loss_mlp": 0.01035589, "balance_loss_clip": 1.23968589, "balance_loss_mlp": 1.01537097, "epoch": 0.698091086727792, "flos": 33771187238400.0, "grad_norm": 1.6015528451985452, "language_loss": 0.84929907, "learning_rate": 8.822097963936643e-07, "loss": 0.8736611, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20202637, "step": 11611, "time_per_iteration": 2.945190906524658 }, { "auxiliary_loss_clip": 0.01416897, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.25239563, "balance_loss_mlp": 1.01672149, "epoch": 0.69815120998046, "flos": 15896267654400.0, "grad_norm": 7.381770290059103, "language_loss": 0.71795309, "learning_rate": 8.818868610212793e-07, "loss": 0.74248505, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19567871, "step": 11612, "time_per_iteration": 2.8099870681762695 }, { "auxiliary_loss_clip": 0.0141527, "auxiliary_loss_mlp": 0.01035446, "balance_loss_clip": 1.25457048, "balance_loss_mlp": 1.01518106, "epoch": 0.6982113332331279, "flos": 18954798664320.0, "grad_norm": 2.177734498504229, "language_loss": 0.81784254, "learning_rate": 8.815639680478573e-07, "loss": 0.84234965, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20263672, "step": 11613, "time_per_iteration": 2.884998083114624 }, { "auxiliary_loss_clip": 0.01406248, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.24493241, "balance_loss_mlp": 1.0132513, "epoch": 0.6982714564857959, "flos": 24400362320640.0, "grad_norm": 1.852322606324556, "language_loss": 0.75786591, "learning_rate": 8.812411174856411e-07, "loss": 0.78224736, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18652344, "step": 11614, "time_per_iteration": 2.8704352378845215 }, { "auxiliary_loss_clip": 0.01407096, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.24382794, "balance_loss_mlp": 1.0145936, "epoch": 0.6983315797384638, "flos": 20093233795200.0, "grad_norm": 2.003664963122446, "language_loss": 0.78541744, "learning_rate": 8.809183093468746e-07, "loss": 0.80982596, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19165039, "step": 11615, "time_per_iteration": 2.8477370738983154 }, { "auxiliary_loss_clip": 0.0139569, "auxiliary_loss_mlp": 0.01033954, "balance_loss_clip": 1.23656058, "balance_loss_mlp": 1.01412928, "epoch": 0.6983917029911318, "flos": 13519234108800.0, "grad_norm": 1.9392046652887585, "language_loss": 0.73901016, "learning_rate": 8.80595543643797e-07, "loss": 0.76330656, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19836426, "step": 11616, "time_per_iteration": 2.851538896560669 }, { "auxiliary_loss_clip": 0.01398559, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.24079967, "balance_loss_mlp": 1.01595831, "epoch": 0.6984518262437998, "flos": 22028667661440.0, "grad_norm": 1.544601557231382, "language_loss": 0.84533191, "learning_rate": 8.802728203886487e-07, "loss": 0.86967194, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19494629, "step": 11617, "time_per_iteration": 2.8751742839813232 }, { "auxiliary_loss_clip": 0.01417522, "auxiliary_loss_mlp": 0.0104024, "balance_loss_clip": 1.25104856, "balance_loss_mlp": 1.02120256, "epoch": 0.6985119494964678, "flos": 18779881691520.0, "grad_norm": 2.233722183223116, "language_loss": 0.5973202, "learning_rate": 8.799501395936682e-07, "loss": 0.62189782, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19042969, "step": 11618, "time_per_iteration": 2.847720146179199 }, { "auxiliary_loss_clip": 0.01409183, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.24663866, "balance_loss_mlp": 1.01533329, "epoch": 0.6985720727491357, "flos": 22393070432640.0, "grad_norm": 1.8109490547067972, "language_loss": 0.83839869, "learning_rate": 8.796275012710903e-07, "loss": 0.86284137, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19763184, "step": 11619, "time_per_iteration": 2.9151976108551025 }, { "auxiliary_loss_clip": 0.01399248, "auxiliary_loss_mlp": 0.01032111, "balance_loss_clip": 1.24000633, "balance_loss_mlp": 1.01313281, "epoch": 0.6986321960018037, "flos": 39582646744320.0, "grad_norm": 1.610226917748209, "language_loss": 0.68016613, "learning_rate": 8.793049054331494e-07, "loss": 0.70447969, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18994141, "step": 11620, "time_per_iteration": 4.376590728759766 }, { "auxiliary_loss_clip": 0.01415467, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.25205469, "balance_loss_mlp": 1.0163238, "epoch": 0.6986923192544716, "flos": 17976729663360.0, "grad_norm": 1.9811520462645826, "language_loss": 0.7402699, "learning_rate": 8.789823520920794e-07, "loss": 0.76479006, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20239258, "step": 11621, "time_per_iteration": 2.8261077404022217 }, { "auxiliary_loss_clip": 0.01422199, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.25561857, "balance_loss_mlp": 1.01621079, "epoch": 0.6987524425071396, "flos": 25605588055680.0, "grad_norm": 1.6537132566848827, "language_loss": 0.68947327, "learning_rate": 8.7865984126011e-07, "loss": 0.71405834, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.2010498, "step": 11622, "time_per_iteration": 2.929243803024292 }, { "auxiliary_loss_clip": 0.01401153, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.24101758, "balance_loss_mlp": 1.01361012, "epoch": 0.6988125657598077, "flos": 17539021036800.0, "grad_norm": 1.7798032163391981, "language_loss": 0.63381648, "learning_rate": 8.783373729494721e-07, "loss": 0.658162, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19787598, "step": 11623, "time_per_iteration": 2.857217788696289 }, { "auxiliary_loss_clip": 0.01423101, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.25501466, "balance_loss_mlp": 1.01508665, "epoch": 0.6988726890124756, "flos": 39180165834240.0, "grad_norm": 2.3799195819647556, "language_loss": 0.6158663, "learning_rate": 8.780149471723932e-07, "loss": 0.64044964, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20141602, "step": 11624, "time_per_iteration": 2.967169761657715 }, { "auxiliary_loss_clip": 0.01421499, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.2564944, "balance_loss_mlp": 1.01838362, "epoch": 0.6989328122651436, "flos": 20203079466240.0, "grad_norm": 1.5990677176861823, "language_loss": 0.79219019, "learning_rate": 8.776925639411017e-07, "loss": 0.81678796, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19885254, "step": 11625, "time_per_iteration": 2.8308730125427246 }, { "auxiliary_loss_clip": 0.01406658, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.24697661, "balance_loss_mlp": 1.01618862, "epoch": 0.6989929355178115, "flos": 21844792218240.0, "grad_norm": 1.9622815122803738, "language_loss": 0.66617846, "learning_rate": 8.773702232678188e-07, "loss": 0.69060767, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.20080566, "step": 11626, "time_per_iteration": 2.8384556770324707 }, { "auxiliary_loss_clip": 0.01406999, "auxiliary_loss_mlp": 0.01036948, "balance_loss_clip": 1.24447083, "balance_loss_mlp": 1.01758885, "epoch": 0.6990530587704795, "flos": 26334031639680.0, "grad_norm": 1.8010160344620985, "language_loss": 0.71419901, "learning_rate": 8.770479251647697e-07, "loss": 0.73863852, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19348145, "step": 11627, "time_per_iteration": 2.8734607696533203 }, { "auxiliary_loss_clip": 0.01392414, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.23510885, "balance_loss_mlp": 1.01827621, "epoch": 0.6991131820231474, "flos": 19838586205440.0, "grad_norm": 3.6805060860766714, "language_loss": 0.63503051, "learning_rate": 8.767256696441768e-07, "loss": 0.65933317, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19567871, "step": 11628, "time_per_iteration": 2.8216824531555176 }, { "auxiliary_loss_clip": 0.01417994, "auxiliary_loss_mlp": 0.01036911, "balance_loss_clip": 1.25343192, "balance_loss_mlp": 1.01683581, "epoch": 0.6991733052758154, "flos": 33997936769280.0, "grad_norm": 4.283192159656449, "language_loss": 0.68752337, "learning_rate": 8.764034567182581e-07, "loss": 0.71207237, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20092773, "step": 11629, "time_per_iteration": 2.9502267837524414 }, { "auxiliary_loss_clip": 0.01407847, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.24638462, "balance_loss_mlp": 1.01830363, "epoch": 0.6992334285284834, "flos": 15641665309440.0, "grad_norm": 1.5360720419650575, "language_loss": 0.72862047, "learning_rate": 8.760812863992337e-07, "loss": 0.75309694, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.21496582, "step": 11630, "time_per_iteration": 4.261090040206909 }, { "auxiliary_loss_clip": 0.01407244, "auxiliary_loss_mlp": 0.01037094, "balance_loss_clip": 1.24739695, "balance_loss_mlp": 1.01753187, "epoch": 0.6992935517811514, "flos": 21736394380800.0, "grad_norm": 1.6494894225888876, "language_loss": 0.7479291, "learning_rate": 8.757591586993196e-07, "loss": 0.77237248, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19555664, "step": 11631, "time_per_iteration": 4.259166955947876 }, { "auxiliary_loss_clip": 0.01419538, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.25327992, "balance_loss_mlp": 1.01345158, "epoch": 0.6993536750338193, "flos": 20123484583680.0, "grad_norm": 2.2520313960465175, "language_loss": 0.8997699, "learning_rate": 8.7543707363073e-07, "loss": 0.92430085, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.2010498, "step": 11632, "time_per_iteration": 2.829986095428467 }, { "auxiliary_loss_clip": 0.01421703, "auxiliary_loss_mlp": 0.01041923, "balance_loss_clip": 1.2573278, "balance_loss_mlp": 1.0217886, "epoch": 0.6994137982864873, "flos": 22018759050240.0, "grad_norm": 1.5746183664921196, "language_loss": 0.80207133, "learning_rate": 8.751150312056792e-07, "loss": 0.8267076, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20141602, "step": 11633, "time_per_iteration": 4.217574119567871 }, { "auxiliary_loss_clip": 0.01429632, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.26162481, "balance_loss_mlp": 1.01673508, "epoch": 0.6994739215391552, "flos": 25529658001920.0, "grad_norm": 1.8917895637267577, "language_loss": 0.6789993, "learning_rate": 8.747930314363794e-07, "loss": 0.70366347, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20043945, "step": 11634, "time_per_iteration": 2.876605749130249 }, { "auxiliary_loss_clip": 0.012001, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.1072793, "balance_loss_mlp": 1.01525867, "epoch": 0.6995340447918232, "flos": 59158730747520.0, "grad_norm": 1.0270482839022725, "language_loss": 0.53184134, "learning_rate": 8.744710743350412e-07, "loss": 0.55427718, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.28320312, "step": 11635, "time_per_iteration": 3.476649522781372 }, { "auxiliary_loss_clip": 0.01413378, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.25036299, "balance_loss_mlp": 1.01549864, "epoch": 0.6995941680444913, "flos": 17976865397760.0, "grad_norm": 1.601496346120854, "language_loss": 0.82677889, "learning_rate": 8.741491599138726e-07, "loss": 0.85125685, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18908691, "step": 11636, "time_per_iteration": 2.8381102085113525 }, { "auxiliary_loss_clip": 0.0141215, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.24719501, "balance_loss_mlp": 1.01263833, "epoch": 0.6996542912971592, "flos": 21989865605760.0, "grad_norm": 2.355358243778302, "language_loss": 0.83908296, "learning_rate": 8.738272881850801e-07, "loss": 0.8635335, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20263672, "step": 11637, "time_per_iteration": 2.854966402053833 }, { "auxiliary_loss_clip": 0.01410193, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.24778628, "balance_loss_mlp": 1.01905942, "epoch": 0.6997144145498272, "flos": 11691971856000.0, "grad_norm": 2.1893906400359775, "language_loss": 0.68182689, "learning_rate": 8.735054591608704e-07, "loss": 0.70632756, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20812988, "step": 11638, "time_per_iteration": 2.816181182861328 }, { "auxiliary_loss_clip": 0.01424682, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.25607443, "balance_loss_mlp": 1.01726377, "epoch": 0.6997745378024951, "flos": 29619312180480.0, "grad_norm": 2.9199563216956492, "language_loss": 0.78100514, "learning_rate": 8.731836728534459e-07, "loss": 0.80564618, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.22143555, "step": 11639, "time_per_iteration": 2.9106521606445312 }, { "auxiliary_loss_clip": 0.01419318, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.25484812, "balance_loss_mlp": 1.0176239, "epoch": 0.6998346610551631, "flos": 20896385823360.0, "grad_norm": 2.320189263007193, "language_loss": 0.83546048, "learning_rate": 8.728619292750093e-07, "loss": 0.86003196, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20214844, "step": 11640, "time_per_iteration": 2.8706612586975098 }, { "auxiliary_loss_clip": 0.01402383, "auxiliary_loss_mlp": 0.01035139, "balance_loss_clip": 1.24002838, "balance_loss_mlp": 1.01514757, "epoch": 0.699894784307831, "flos": 27174673624320.0, "grad_norm": 1.663077428157904, "language_loss": 0.76240885, "learning_rate": 8.725402284377619e-07, "loss": 0.78678405, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20007324, "step": 11641, "time_per_iteration": 2.9198415279388428 }, { "auxiliary_loss_clip": 0.01414219, "auxiliary_loss_mlp": 0.01029808, "balance_loss_clip": 1.25026226, "balance_loss_mlp": 1.0092442, "epoch": 0.699954907560499, "flos": 20933559066240.0, "grad_norm": 1.8581892058001863, "language_loss": 0.78628612, "learning_rate": 8.722185703539022e-07, "loss": 0.8107264, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20581055, "step": 11642, "time_per_iteration": 2.8708479404449463 }, { "auxiliary_loss_clip": 0.01423029, "auxiliary_loss_mlp": 0.01037299, "balance_loss_clip": 1.25544488, "balance_loss_mlp": 1.01596034, "epoch": 0.700015030813167, "flos": 28669503196800.0, "grad_norm": 3.0795421260976266, "language_loss": 0.75672638, "learning_rate": 8.718969550356266e-07, "loss": 0.78132963, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.21337891, "step": 11643, "time_per_iteration": 2.9185147285461426 }, { "auxiliary_loss_clip": 0.01417869, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.25165319, "balance_loss_mlp": 1.0132848, "epoch": 0.700075154065835, "flos": 29217193228800.0, "grad_norm": 1.433249195726307, "language_loss": 0.60669029, "learning_rate": 8.715753824951315e-07, "loss": 0.63120818, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20617676, "step": 11644, "time_per_iteration": 2.9078919887542725 }, { "auxiliary_loss_clip": 0.01409098, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.24673533, "balance_loss_mlp": 1.01625943, "epoch": 0.7001352773185029, "flos": 23122826115840.0, "grad_norm": 2.3539099497609626, "language_loss": 0.81623888, "learning_rate": 8.712538527446119e-07, "loss": 0.84070146, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20898438, "step": 11645, "time_per_iteration": 2.851039171218872 }, { "auxiliary_loss_clip": 0.01397883, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.23689103, "balance_loss_mlp": 1.01396394, "epoch": 0.7001954005711709, "flos": 21332329902720.0, "grad_norm": 1.911168273091155, "language_loss": 0.69521481, "learning_rate": 8.709323657962584e-07, "loss": 0.71953821, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20495605, "step": 11646, "time_per_iteration": 2.829630136489868 }, { "auxiliary_loss_clip": 0.01396332, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.23584652, "balance_loss_mlp": 1.01439154, "epoch": 0.7002555238238388, "flos": 24546702562560.0, "grad_norm": 1.645878462705099, "language_loss": 0.71881169, "learning_rate": 8.706109216622635e-07, "loss": 0.74312097, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.20214844, "step": 11647, "time_per_iteration": 2.9959559440612793 }, { "auxiliary_loss_clip": 0.0142113, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.25510192, "balance_loss_mlp": 1.01771832, "epoch": 0.7003156470765068, "flos": 39071044080000.0, "grad_norm": 1.6034074218353362, "language_loss": 0.72463167, "learning_rate": 8.702895203548155e-07, "loss": 0.7492274, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20727539, "step": 11648, "time_per_iteration": 3.015843391418457 }, { "auxiliary_loss_clip": 0.01403745, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.24258184, "balance_loss_mlp": 1.0177238, "epoch": 0.7003757703291749, "flos": 28815933928320.0, "grad_norm": 1.631057113467804, "language_loss": 0.77763641, "learning_rate": 8.699681618861014e-07, "loss": 0.80205727, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20605469, "step": 11649, "time_per_iteration": 2.949418306350708 }, { "auxiliary_loss_clip": 0.01401753, "auxiliary_loss_mlp": 0.01036616, "balance_loss_clip": 1.23973846, "balance_loss_mlp": 1.01596904, "epoch": 0.7004358935818428, "flos": 15960479304960.0, "grad_norm": 1.6911973467186812, "language_loss": 0.79332536, "learning_rate": 8.69646846268308e-07, "loss": 0.81770909, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20654297, "step": 11650, "time_per_iteration": 2.7975709438323975 }, { "auxiliary_loss_clip": 0.01407922, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.24522853, "balance_loss_mlp": 1.01468134, "epoch": 0.7004960168345108, "flos": 20421549198720.0, "grad_norm": 2.1309457612072773, "language_loss": 0.78996515, "learning_rate": 8.693255735136194e-07, "loss": 0.81439412, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.203125, "step": 11651, "time_per_iteration": 2.8309266567230225 }, { "auxiliary_loss_clip": 0.0142303, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.25618362, "balance_loss_mlp": 1.01304865, "epoch": 0.7005561400871787, "flos": 17356412448000.0, "grad_norm": 2.0288439151830255, "language_loss": 0.70358759, "learning_rate": 8.690043436342198e-07, "loss": 0.72815132, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20288086, "step": 11652, "time_per_iteration": 2.8250958919525146 }, { "auxiliary_loss_clip": 0.01413711, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.25041938, "balance_loss_mlp": 1.01645219, "epoch": 0.7006162633398467, "flos": 25313224285440.0, "grad_norm": 1.4662833090541683, "language_loss": 0.75230718, "learning_rate": 8.686831566422874e-07, "loss": 0.77682233, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.21337891, "step": 11653, "time_per_iteration": 3.0283145904541016 }, { "auxiliary_loss_clip": 0.01407983, "auxiliary_loss_mlp": 0.01033669, "balance_loss_clip": 1.24201953, "balance_loss_mlp": 1.01225853, "epoch": 0.7006763865925146, "flos": 20679047210880.0, "grad_norm": 2.15765240031465, "language_loss": 0.7136907, "learning_rate": 8.68362012550003e-07, "loss": 0.7381072, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.21386719, "step": 11654, "time_per_iteration": 2.89398193359375 }, { "auxiliary_loss_clip": 0.01410185, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.24557126, "balance_loss_mlp": 1.0116744, "epoch": 0.7007365098451827, "flos": 20055743838720.0, "grad_norm": 2.6906783904275984, "language_loss": 0.73850405, "learning_rate": 8.680409113695453e-07, "loss": 0.76294208, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.21948242, "step": 11655, "time_per_iteration": 4.318296909332275 }, { "auxiliary_loss_clip": 0.01434892, "auxiliary_loss_mlp": 0.01039741, "balance_loss_clip": 1.26287019, "balance_loss_mlp": 1.01773477, "epoch": 0.7007966330978506, "flos": 20787173579520.0, "grad_norm": 1.8732173570998834, "language_loss": 0.7081793, "learning_rate": 8.677198531130889e-07, "loss": 0.73292565, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 1.71875, "router_z_loss_mlp": 0.2199707, "step": 11656, "time_per_iteration": 2.859370708465576 }, { "auxiliary_loss_clip": 0.01403286, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.24236202, "balance_loss_mlp": 1.01334763, "epoch": 0.7008567563505186, "flos": 29648929541760.0, "grad_norm": 1.6541060634991553, "language_loss": 0.78577822, "learning_rate": 8.673988377928092e-07, "loss": 0.81014174, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19702148, "step": 11657, "time_per_iteration": 2.9331865310668945 }, { "auxiliary_loss_clip": 0.01422517, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.25240135, "balance_loss_mlp": 1.01808286, "epoch": 0.7009168796031865, "flos": 17100769472640.0, "grad_norm": 2.41577996669732, "language_loss": 0.79259002, "learning_rate": 8.670778654208797e-07, "loss": 0.81721246, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.21655273, "step": 11658, "time_per_iteration": 2.8667688369750977 }, { "auxiliary_loss_clip": 0.01400143, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.24096799, "balance_loss_mlp": 1.01047647, "epoch": 0.7009770028558545, "flos": 20458677196800.0, "grad_norm": 1.707573332022692, "language_loss": 0.83248377, "learning_rate": 8.667569360094713e-07, "loss": 0.85680568, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.21582031, "step": 11659, "time_per_iteration": 2.8578951358795166 }, { "auxiliary_loss_clip": 0.01397199, "auxiliary_loss_mlp": 0.0103542, "balance_loss_clip": 1.23641944, "balance_loss_mlp": 1.01406968, "epoch": 0.7010371261085224, "flos": 19254582581760.0, "grad_norm": 4.813987693673104, "language_loss": 0.70703208, "learning_rate": 8.664360495707526e-07, "loss": 0.73135829, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.21362305, "step": 11660, "time_per_iteration": 2.83294939994812 }, { "auxiliary_loss_clip": 0.01426492, "auxiliary_loss_mlp": 0.0103735, "balance_loss_clip": 1.25962543, "balance_loss_mlp": 1.01654828, "epoch": 0.7010972493611904, "flos": 22137789415680.0, "grad_norm": 1.6719302674615144, "language_loss": 0.81556308, "learning_rate": 8.661152061168924e-07, "loss": 0.8402015, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20800781, "step": 11661, "time_per_iteration": 2.878554344177246 }, { "auxiliary_loss_clip": 0.01406032, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.24309111, "balance_loss_mlp": 1.01808619, "epoch": 0.7011573726138585, "flos": 31402750204800.0, "grad_norm": 1.756705381159017, "language_loss": 0.79865682, "learning_rate": 8.657944056600579e-07, "loss": 0.82309866, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20056152, "step": 11662, "time_per_iteration": 2.9275314807891846 }, { "auxiliary_loss_clip": 0.01421803, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.2561208, "balance_loss_mlp": 1.01191545, "epoch": 0.7012174958665264, "flos": 18159745455360.0, "grad_norm": 1.7548390969245795, "language_loss": 0.84271652, "learning_rate": 8.654736482124134e-07, "loss": 0.86726642, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.21264648, "step": 11663, "time_per_iteration": 2.844663381576538 }, { "auxiliary_loss_clip": 0.01190019, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.10105753, "balance_loss_mlp": 1.00561702, "epoch": 0.7012776191191944, "flos": 60679558097280.0, "grad_norm": 0.8223345229736677, "language_loss": 0.53843594, "learning_rate": 8.651529337861209e-07, "loss": 0.56063259, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.24023438, "step": 11664, "time_per_iteration": 4.7706732749938965 }, { "auxiliary_loss_clip": 0.01415324, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.24937117, "balance_loss_mlp": 1.01305401, "epoch": 0.7013377423718623, "flos": 27209991830400.0, "grad_norm": 2.1675194380833545, "language_loss": 0.80285597, "learning_rate": 8.64832262393344e-07, "loss": 0.82733387, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.1940918, "step": 11665, "time_per_iteration": 2.919515609741211 }, { "auxiliary_loss_clip": 0.01399618, "auxiliary_loss_mlp": 0.0103369, "balance_loss_clip": 1.23837996, "balance_loss_mlp": 1.01415181, "epoch": 0.7013978656245303, "flos": 16551857831040.0, "grad_norm": 2.0369617925332704, "language_loss": 0.77442813, "learning_rate": 8.645116340462404e-07, "loss": 0.79876125, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1953125, "step": 11666, "time_per_iteration": 4.236992597579956 }, { "auxiliary_loss_clip": 0.01412076, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.24825573, "balance_loss_mlp": 1.01369143, "epoch": 0.7014579888771982, "flos": 23152850680320.0, "grad_norm": 1.7288921220098765, "language_loss": 0.8197701, "learning_rate": 8.641910487569695e-07, "loss": 0.84422612, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19836426, "step": 11667, "time_per_iteration": 2.862044095993042 }, { "auxiliary_loss_clip": 0.01408383, "auxiliary_loss_mlp": 0.01038499, "balance_loss_clip": 1.24573076, "balance_loss_mlp": 1.01789927, "epoch": 0.7015181121298663, "flos": 25092854271360.0, "grad_norm": 2.215272923488967, "language_loss": 0.65766925, "learning_rate": 8.638705065376879e-07, "loss": 0.68213809, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20593262, "step": 11668, "time_per_iteration": 4.185842275619507 }, { "auxiliary_loss_clip": 0.01413112, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.24686217, "balance_loss_mlp": 1.01107371, "epoch": 0.7015782353825342, "flos": 23337450040320.0, "grad_norm": 5.253937859876017, "language_loss": 0.77483636, "learning_rate": 8.635500074005519e-07, "loss": 0.79927695, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19885254, "step": 11669, "time_per_iteration": 2.881883144378662 }, { "auxiliary_loss_clip": 0.01193032, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.10291481, "balance_loss_mlp": 1.02251625, "epoch": 0.7016383586352022, "flos": 70429083143040.0, "grad_norm": 0.7155546492751289, "language_loss": 0.54497766, "learning_rate": 8.632295513577122e-07, "loss": 0.56733435, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20117188, "step": 11670, "time_per_iteration": 3.502450704574585 }, { "auxiliary_loss_clip": 0.01403051, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.24179316, "balance_loss_mlp": 1.01323247, "epoch": 0.7016984818878701, "flos": 19801820165760.0, "grad_norm": 1.7319518615407454, "language_loss": 0.82856905, "learning_rate": 8.629091384213218e-07, "loss": 0.85294694, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.21520996, "step": 11671, "time_per_iteration": 2.9164483547210693 }, { "auxiliary_loss_clip": 0.01431932, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.26534271, "balance_loss_mlp": 1.0161221, "epoch": 0.7017586051405381, "flos": 12904798717440.0, "grad_norm": 2.6161132259047637, "language_loss": 0.76140004, "learning_rate": 8.625887686035313e-07, "loss": 0.78609312, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.21252441, "step": 11672, "time_per_iteration": 2.866234540939331 }, { "auxiliary_loss_clip": 0.01406019, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.2431767, "balance_loss_mlp": 1.01531208, "epoch": 0.701818728393206, "flos": 18341992085760.0, "grad_norm": 1.651082831586969, "language_loss": 0.87152731, "learning_rate": 8.622684419164883e-07, "loss": 0.8959347, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1940918, "step": 11673, "time_per_iteration": 2.833664655685425 }, { "auxiliary_loss_clip": 0.01400827, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.23972988, "balance_loss_mlp": 1.017048, "epoch": 0.701878851645874, "flos": 17393133242880.0, "grad_norm": 4.248773406523166, "language_loss": 0.74119234, "learning_rate": 8.619481583723399e-07, "loss": 0.7655772, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20593262, "step": 11674, "time_per_iteration": 2.853548526763916 }, { "auxiliary_loss_clip": 0.01410705, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.2502054, "balance_loss_mlp": 1.01543605, "epoch": 0.701938974898542, "flos": 23926204368000.0, "grad_norm": 3.4524403735306817, "language_loss": 0.72892404, "learning_rate": 8.616279179832329e-07, "loss": 0.75338233, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19714355, "step": 11675, "time_per_iteration": 2.9173691272735596 }, { "auxiliary_loss_clip": 0.01420978, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.25578523, "balance_loss_mlp": 1.01376379, "epoch": 0.70199909815121, "flos": 21804768552960.0, "grad_norm": 2.241672086597585, "language_loss": 0.51960242, "learning_rate": 8.613077207613078e-07, "loss": 0.5441407, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.1907959, "step": 11676, "time_per_iteration": 2.8823370933532715 }, { "auxiliary_loss_clip": 0.01189565, "auxiliary_loss_mlp": 0.01013055, "balance_loss_clip": 1.09878421, "balance_loss_mlp": 0.99121612, "epoch": 0.702059221403878, "flos": 71748452805120.0, "grad_norm": 0.7227030089320855, "language_loss": 0.59223044, "learning_rate": 8.609875667187079e-07, "loss": 0.61425662, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.21875, "step": 11677, "time_per_iteration": 3.4079089164733887 }, { "auxiliary_loss_clip": 0.01412288, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.24859357, "balance_loss_mlp": 1.0131259, "epoch": 0.7021193446565459, "flos": 28122763305600.0, "grad_norm": 2.3892905589112168, "language_loss": 0.63287854, "learning_rate": 8.606674558675737e-07, "loss": 0.65732694, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19433594, "step": 11678, "time_per_iteration": 2.915574312210083 }, { "auxiliary_loss_clip": 0.01400816, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.24021089, "balance_loss_mlp": 1.01472354, "epoch": 0.7021794679092139, "flos": 22933928499840.0, "grad_norm": 1.6368688558288498, "language_loss": 0.79681289, "learning_rate": 8.603473882200444e-07, "loss": 0.82115865, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19055176, "step": 11679, "time_per_iteration": 2.8836517333984375 }, { "auxiliary_loss_clip": 0.01405147, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.24641526, "balance_loss_mlp": 1.01328802, "epoch": 0.7022395911618818, "flos": 18087027782400.0, "grad_norm": 2.153153118456413, "language_loss": 0.72105956, "learning_rate": 8.600273637882567e-07, "loss": 0.74544168, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19775391, "step": 11680, "time_per_iteration": 2.822939157485962 }, { "auxiliary_loss_clip": 0.01427027, "auxiliary_loss_mlp": 0.01035452, "balance_loss_clip": 1.26002657, "balance_loss_mlp": 1.01532912, "epoch": 0.7022997144145499, "flos": 16042879365120.0, "grad_norm": 1.5261468878983375, "language_loss": 0.75447381, "learning_rate": 8.597073825843446e-07, "loss": 0.77909863, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.2010498, "step": 11681, "time_per_iteration": 2.8128974437713623 }, { "auxiliary_loss_clip": 0.01403503, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.24368072, "balance_loss_mlp": 1.01532435, "epoch": 0.7023598376672178, "flos": 26479828944000.0, "grad_norm": 1.648678717570838, "language_loss": 0.77546906, "learning_rate": 8.593874446204434e-07, "loss": 0.79985136, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19421387, "step": 11682, "time_per_iteration": 2.869854211807251 }, { "auxiliary_loss_clip": 0.01427581, "auxiliary_loss_mlp": 0.01040762, "balance_loss_clip": 1.26024902, "balance_loss_mlp": 1.02142632, "epoch": 0.7024199609198858, "flos": 17064772594560.0, "grad_norm": 3.966231573072461, "language_loss": 0.74434125, "learning_rate": 8.590675499086841e-07, "loss": 0.76902473, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19335938, "step": 11683, "time_per_iteration": 2.8685128688812256 }, { "auxiliary_loss_clip": 0.0140757, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.24597049, "balance_loss_mlp": 1.01461482, "epoch": 0.7024800841725537, "flos": 25860190400640.0, "grad_norm": 1.7573304380246906, "language_loss": 0.72474766, "learning_rate": 8.587476984611976e-07, "loss": 0.74917746, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20788574, "step": 11684, "time_per_iteration": 2.8778140544891357 }, { "auxiliary_loss_clip": 0.01411621, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.24965727, "balance_loss_mlp": 1.021101, "epoch": 0.7025402074252217, "flos": 23523316254720.0, "grad_norm": 2.500963492075203, "language_loss": 0.73608088, "learning_rate": 8.584278902901128e-07, "loss": 0.76060081, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19274902, "step": 11685, "time_per_iteration": 2.8646981716156006 }, { "auxiliary_loss_clip": 0.01404218, "auxiliary_loss_mlp": 0.01033109, "balance_loss_clip": 1.24011731, "balance_loss_mlp": 1.01337957, "epoch": 0.7026003306778896, "flos": 20159029013760.0, "grad_norm": 1.6875490952471681, "language_loss": 0.85681832, "learning_rate": 8.581081254075582e-07, "loss": 0.88119161, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19726562, "step": 11686, "time_per_iteration": 2.8408091068267822 }, { "auxiliary_loss_clip": 0.01187607, "auxiliary_loss_mlp": 0.01022744, "balance_loss_clip": 1.09771466, "balance_loss_mlp": 1.00061917, "epoch": 0.7026604539305576, "flos": 64801318101120.0, "grad_norm": 0.9793892573988293, "language_loss": 0.70025963, "learning_rate": 8.577884038256566e-07, "loss": 0.72236311, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.22167969, "step": 11687, "time_per_iteration": 3.490342855453491 }, { "auxiliary_loss_clip": 0.01417553, "auxiliary_loss_mlp": 0.01039112, "balance_loss_clip": 1.25483561, "balance_loss_mlp": 1.01983595, "epoch": 0.7027205771832256, "flos": 21881422523520.0, "grad_norm": 2.1625600301802823, "language_loss": 0.77631295, "learning_rate": 8.574687255565329e-07, "loss": 0.8008796, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19287109, "step": 11688, "time_per_iteration": 2.8764820098876953 }, { "auxiliary_loss_clip": 0.01399665, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.23807633, "balance_loss_mlp": 1.01964164, "epoch": 0.7027807004358936, "flos": 23378242867200.0, "grad_norm": 2.6774911839172546, "language_loss": 0.69065332, "learning_rate": 8.571490906123107e-07, "loss": 0.71504283, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19641113, "step": 11689, "time_per_iteration": 2.8516480922698975 }, { "auxiliary_loss_clip": 0.01419094, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.25319457, "balance_loss_mlp": 1.0180707, "epoch": 0.7028408236885616, "flos": 15312580744320.0, "grad_norm": 2.4239166753422317, "language_loss": 0.80775821, "learning_rate": 8.568294990051086e-07, "loss": 0.83234006, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21044922, "step": 11690, "time_per_iteration": 4.319056749343872 }, { "auxiliary_loss_clip": 0.01406557, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.24412131, "balance_loss_mlp": 1.01483881, "epoch": 0.7029009469412295, "flos": 22028396192640.0, "grad_norm": 1.6948643174418232, "language_loss": 0.76519263, "learning_rate": 8.56509950747047e-07, "loss": 0.7896024, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19580078, "step": 11691, "time_per_iteration": 2.8429200649261475 }, { "auxiliary_loss_clip": 0.01416196, "auxiliary_loss_mlp": 0.01035585, "balance_loss_clip": 1.25401664, "balance_loss_mlp": 1.01626086, "epoch": 0.7029610701938975, "flos": 21845606624640.0, "grad_norm": 1.6535280273734518, "language_loss": 0.82285202, "learning_rate": 8.561904458502429e-07, "loss": 0.84736979, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.1932373, "step": 11692, "time_per_iteration": 2.836735725402832 }, { "auxiliary_loss_clip": 0.01411024, "auxiliary_loss_mlp": 0.01040428, "balance_loss_clip": 1.24962151, "balance_loss_mlp": 1.01939917, "epoch": 0.7030211934465654, "flos": 19145053624320.0, "grad_norm": 1.6144173124374728, "language_loss": 0.77259868, "learning_rate": 8.558709843268111e-07, "loss": 0.79711318, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.21044922, "step": 11693, "time_per_iteration": 2.9013922214508057 }, { "auxiliary_loss_clip": 0.01414215, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.25190973, "balance_loss_mlp": 1.01730514, "epoch": 0.7030813166992335, "flos": 38560436801280.0, "grad_norm": 1.3709085605453615, "language_loss": 0.68684542, "learning_rate": 8.55551566188866e-07, "loss": 0.71136117, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20056152, "step": 11694, "time_per_iteration": 2.9837522506713867 }, { "auxiliary_loss_clip": 0.01414567, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.25157261, "balance_loss_mlp": 1.01944017, "epoch": 0.7031414399519014, "flos": 14729165303040.0, "grad_norm": 2.3111271743218755, "language_loss": 0.76335579, "learning_rate": 8.552321914485203e-07, "loss": 0.78789151, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19567871, "step": 11695, "time_per_iteration": 2.8471624851226807 }, { "auxiliary_loss_clip": 0.01432059, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.26558352, "balance_loss_mlp": 1.02229059, "epoch": 0.7032015632045694, "flos": 14035225518720.0, "grad_norm": 17.38644250736391, "language_loss": 0.74414468, "learning_rate": 8.549128601178852e-07, "loss": 0.76888669, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19836426, "step": 11696, "time_per_iteration": 2.8867242336273193 }, { "auxiliary_loss_clip": 0.01413803, "auxiliary_loss_mlp": 0.01042704, "balance_loss_clip": 1.24972343, "balance_loss_mlp": 1.02248621, "epoch": 0.7032616864572373, "flos": 27648876821760.0, "grad_norm": 1.6190303156366626, "language_loss": 0.76160038, "learning_rate": 8.545935722090693e-07, "loss": 0.78616548, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20227051, "step": 11697, "time_per_iteration": 2.9000773429870605 }, { "auxiliary_loss_clip": 0.01417369, "auxiliary_loss_mlp": 0.01043241, "balance_loss_clip": 1.25195742, "balance_loss_mlp": 1.02148509, "epoch": 0.7033218097099053, "flos": 17976186725760.0, "grad_norm": 3.4961286415473505, "language_loss": 0.80649567, "learning_rate": 8.542743277341793e-07, "loss": 0.83110183, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.2175293, "step": 11698, "time_per_iteration": 2.9439756870269775 }, { "auxiliary_loss_clip": 0.01410451, "auxiliary_loss_mlp": 0.01045086, "balance_loss_clip": 1.24644291, "balance_loss_mlp": 1.02501118, "epoch": 0.7033819329625732, "flos": 19511537656320.0, "grad_norm": 4.074828101394712, "language_loss": 0.85388255, "learning_rate": 8.539551267053222e-07, "loss": 0.87843788, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20068359, "step": 11699, "time_per_iteration": 4.319744825363159 }, { "auxiliary_loss_clip": 0.01410417, "auxiliary_loss_mlp": 0.01035446, "balance_loss_clip": 1.25079608, "balance_loss_mlp": 1.01508546, "epoch": 0.7034420562152413, "flos": 23998152879360.0, "grad_norm": 2.4309047385389304, "language_loss": 0.8030771, "learning_rate": 8.53635969134601e-07, "loss": 0.82753575, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.20361328, "step": 11700, "time_per_iteration": 2.854562759399414 }, { "auxiliary_loss_clip": 0.01405744, "auxiliary_loss_mlp": 0.01037116, "balance_loss_clip": 1.24136543, "balance_loss_mlp": 1.01696944, "epoch": 0.7035021794679092, "flos": 35056957996800.0, "grad_norm": 3.494171653738354, "language_loss": 0.75031948, "learning_rate": 8.533168550341186e-07, "loss": 0.77474809, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20153809, "step": 11701, "time_per_iteration": 4.414593696594238 }, { "auxiliary_loss_clip": 0.01420456, "auxiliary_loss_mlp": 0.01038795, "balance_loss_clip": 1.2551415, "balance_loss_mlp": 1.01880336, "epoch": 0.7035623027205772, "flos": 11004230609280.0, "grad_norm": 2.1619829796653094, "language_loss": 0.85422468, "learning_rate": 8.529977844159769e-07, "loss": 0.8788172, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19970703, "step": 11702, "time_per_iteration": 2.791396379470825 }, { "auxiliary_loss_clip": 0.01420438, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.25510776, "balance_loss_mlp": 1.01823735, "epoch": 0.7036224259732452, "flos": 23634383535360.0, "grad_norm": 2.0744044274512112, "language_loss": 0.61591256, "learning_rate": 8.526787572922738e-07, "loss": 0.64049554, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19616699, "step": 11703, "time_per_iteration": 4.252660274505615 }, { "auxiliary_loss_clip": 0.01412125, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.24862981, "balance_loss_mlp": 1.02128959, "epoch": 0.7036825492259131, "flos": 31698688314240.0, "grad_norm": 1.8687060685859946, "language_loss": 0.62279016, "learning_rate": 8.523597736751067e-07, "loss": 0.64732182, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.1973877, "step": 11704, "time_per_iteration": 2.8988168239593506 }, { "auxiliary_loss_clip": 0.01403541, "auxiliary_loss_mlp": 0.01040308, "balance_loss_clip": 1.24465919, "balance_loss_mlp": 1.02084112, "epoch": 0.7037426724785811, "flos": 30205668533760.0, "grad_norm": 2.1428627602599772, "language_loss": 0.71592206, "learning_rate": 8.520408335765719e-07, "loss": 0.74036056, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19458008, "step": 11705, "time_per_iteration": 2.928816080093384 }, { "auxiliary_loss_clip": 0.01413515, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.25224042, "balance_loss_mlp": 1.0196321, "epoch": 0.703802795731249, "flos": 24320948417280.0, "grad_norm": 1.7883723472515363, "language_loss": 0.62965024, "learning_rate": 8.517219370087645e-07, "loss": 0.65418613, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20446777, "step": 11706, "time_per_iteration": 2.8349080085754395 }, { "auxiliary_loss_clip": 0.01418618, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.25459421, "balance_loss_mlp": 1.01715183, "epoch": 0.7038629189839171, "flos": 22539274940160.0, "grad_norm": 3.5377052393858017, "language_loss": 0.68881404, "learning_rate": 8.514030839837756e-07, "loss": 0.71336645, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19470215, "step": 11707, "time_per_iteration": 2.848904609680176 }, { "auxiliary_loss_clip": 0.01414101, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.25257897, "balance_loss_mlp": 1.0144155, "epoch": 0.703923042236585, "flos": 26261766414720.0, "grad_norm": 1.7081140210224062, "language_loss": 0.7699312, "learning_rate": 8.510842745136974e-07, "loss": 0.79440665, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19006348, "step": 11708, "time_per_iteration": 2.87967848777771 }, { "auxiliary_loss_clip": 0.01411475, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.2511673, "balance_loss_mlp": 1.01910698, "epoch": 0.703983165489253, "flos": 19399520234880.0, "grad_norm": 3.8356073239700734, "language_loss": 0.72602439, "learning_rate": 8.50765508610619e-07, "loss": 0.75050998, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.17993164, "step": 11709, "time_per_iteration": 2.838552951812744 }, { "auxiliary_loss_clip": 0.01413225, "auxiliary_loss_mlp": 0.01037748, "balance_loss_clip": 1.25123358, "balance_loss_mlp": 1.01844764, "epoch": 0.7040432887419209, "flos": 16690461212160.0, "grad_norm": 1.987681309692948, "language_loss": 0.79986364, "learning_rate": 8.504467862866267e-07, "loss": 0.82437336, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19311523, "step": 11710, "time_per_iteration": 2.8572070598602295 }, { "auxiliary_loss_clip": 0.01408297, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.24529731, "balance_loss_mlp": 1.01768374, "epoch": 0.7041034119945889, "flos": 21151078657920.0, "grad_norm": 1.736698838977467, "language_loss": 0.78114671, "learning_rate": 8.501281075538076e-07, "loss": 0.80560046, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19384766, "step": 11711, "time_per_iteration": 2.8947036266326904 }, { "auxiliary_loss_clip": 0.01404777, "auxiliary_loss_mlp": 0.01031534, "balance_loss_clip": 1.24361348, "balance_loss_mlp": 1.01280642, "epoch": 0.7041635352472568, "flos": 16919201514240.0, "grad_norm": 2.028345937355536, "language_loss": 0.74927068, "learning_rate": 8.498094724242457e-07, "loss": 0.77363372, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18725586, "step": 11712, "time_per_iteration": 2.880039691925049 }, { "auxiliary_loss_clip": 0.01193571, "auxiliary_loss_mlp": 0.01026956, "balance_loss_clip": 1.10209513, "balance_loss_mlp": 1.0015887, "epoch": 0.7042236584999249, "flos": 71715306349440.0, "grad_norm": 0.884630296077185, "language_loss": 0.64648455, "learning_rate": 8.494908809100247e-07, "loss": 0.66868985, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.25390625, "step": 11713, "time_per_iteration": 3.4457130432128906 }, { "auxiliary_loss_clip": 0.01403795, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.2432003, "balance_loss_mlp": 1.01383495, "epoch": 0.7042837817525928, "flos": 28670000889600.0, "grad_norm": 2.8965384301266623, "language_loss": 0.73136985, "learning_rate": 8.49172333023225e-07, "loss": 0.75573611, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18994141, "step": 11714, "time_per_iteration": 2.923923969268799 }, { "auxiliary_loss_clip": 0.01410852, "auxiliary_loss_mlp": 0.01038415, "balance_loss_clip": 1.24939799, "balance_loss_mlp": 1.01875746, "epoch": 0.7043439050052608, "flos": 19762520417280.0, "grad_norm": 1.6801676124325584, "language_loss": 0.800547, "learning_rate": 8.488538287759248e-07, "loss": 0.82503968, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1965332, "step": 11715, "time_per_iteration": 2.8640058040618896 }, { "auxiliary_loss_clip": 0.01409913, "auxiliary_loss_mlp": 0.01036048, "balance_loss_clip": 1.24860334, "balance_loss_mlp": 1.01779664, "epoch": 0.7044040282579288, "flos": 11543867066880.0, "grad_norm": 2.503575400345042, "language_loss": 0.71389914, "learning_rate": 8.485353681802037e-07, "loss": 0.73835874, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18237305, "step": 11716, "time_per_iteration": 2.8554110527038574 }, { "auxiliary_loss_clip": 0.01428531, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 1.26072347, "balance_loss_mlp": 1.01777494, "epoch": 0.7044641515105967, "flos": 33669078428160.0, "grad_norm": 3.2845259292163127, "language_loss": 0.67345643, "learning_rate": 8.482169512481358e-07, "loss": 0.69812053, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20092773, "step": 11717, "time_per_iteration": 2.9919707775115967 }, { "auxiliary_loss_clip": 0.01406047, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.2429173, "balance_loss_mlp": 1.01385736, "epoch": 0.7045242747632647, "flos": 26735019471360.0, "grad_norm": 1.405422833554187, "language_loss": 0.74788904, "learning_rate": 8.478985779917967e-07, "loss": 0.77227843, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19055176, "step": 11718, "time_per_iteration": 2.8866968154907227 }, { "auxiliary_loss_clip": 0.0141199, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.25163507, "balance_loss_mlp": 1.01466393, "epoch": 0.7045843980159326, "flos": 26809049243520.0, "grad_norm": 1.572311629146511, "language_loss": 0.80300188, "learning_rate": 8.475802484232606e-07, "loss": 0.82745272, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18444824, "step": 11719, "time_per_iteration": 2.8984062671661377 }, { "auxiliary_loss_clip": 0.01406029, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.24532437, "balance_loss_mlp": 1.01835537, "epoch": 0.7046445212686007, "flos": 41590164856320.0, "grad_norm": 1.6332817000678286, "language_loss": 0.6667093, "learning_rate": 8.472619625545951e-07, "loss": 0.691149, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19592285, "step": 11720, "time_per_iteration": 3.0373404026031494 }, { "auxiliary_loss_clip": 0.01433605, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.26589108, "balance_loss_mlp": 1.01601672, "epoch": 0.7047046445212686, "flos": 15568676167680.0, "grad_norm": 2.3595170420570684, "language_loss": 0.80970562, "learning_rate": 8.46943720397872e-07, "loss": 0.83440924, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.20739746, "step": 11721, "time_per_iteration": 2.822758674621582 }, { "auxiliary_loss_clip": 0.01186225, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.09772635, "balance_loss_mlp": 1.01478148, "epoch": 0.7047647677739366, "flos": 70445597495040.0, "grad_norm": 0.7684669782251707, "language_loss": 0.64795172, "learning_rate": 8.466255219651582e-07, "loss": 0.67015159, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.18945312, "step": 11722, "time_per_iteration": 3.481835126876831 }, { "auxiliary_loss_clip": 0.01406788, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.24529648, "balance_loss_mlp": 1.01156294, "epoch": 0.7048248910266045, "flos": 23670651882240.0, "grad_norm": 1.7148933353035734, "language_loss": 0.66130352, "learning_rate": 8.463073672685211e-07, "loss": 0.68567657, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18969727, "step": 11723, "time_per_iteration": 2.9002490043640137 }, { "auxiliary_loss_clip": 0.01415262, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.25239074, "balance_loss_mlp": 1.01341367, "epoch": 0.7048850142792725, "flos": 21406993102080.0, "grad_norm": 1.6594072748101552, "language_loss": 0.8178637, "learning_rate": 8.459892563200235e-07, "loss": 0.84235048, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19995117, "step": 11724, "time_per_iteration": 2.8698086738586426 }, { "auxiliary_loss_clip": 0.01428634, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.2638073, "balance_loss_mlp": 1.01898551, "epoch": 0.7049451375319404, "flos": 21656844743040.0, "grad_norm": 1.6869796783430473, "language_loss": 0.73291326, "learning_rate": 8.456711891317296e-07, "loss": 0.75758654, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19702148, "step": 11725, "time_per_iteration": 4.262852430343628 }, { "auxiliary_loss_clip": 0.01421264, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.25537157, "balance_loss_mlp": 1.0135386, "epoch": 0.7050052607846085, "flos": 14875098341760.0, "grad_norm": 2.105783292139515, "language_loss": 0.78501236, "learning_rate": 8.453531657156998e-07, "loss": 0.80956519, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20471191, "step": 11726, "time_per_iteration": 2.8376142978668213 }, { "auxiliary_loss_clip": 0.014071, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.24508309, "balance_loss_mlp": 1.01474953, "epoch": 0.7050653840372764, "flos": 19250329570560.0, "grad_norm": 1.8262484046073542, "language_loss": 0.71200049, "learning_rate": 8.450351860839931e-07, "loss": 0.73640454, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18554688, "step": 11727, "time_per_iteration": 2.832303524017334 }, { "auxiliary_loss_clip": 0.01383133, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.22804761, "balance_loss_mlp": 1.01303434, "epoch": 0.7051255072899444, "flos": 27791461745280.0, "grad_norm": 1.7950727569978053, "language_loss": 0.69593376, "learning_rate": 8.44717250248668e-07, "loss": 0.72008342, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18786621, "step": 11728, "time_per_iteration": 2.9273786544799805 }, { "auxiliary_loss_clip": 0.0140916, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.24708557, "balance_loss_mlp": 1.01307404, "epoch": 0.7051856305426124, "flos": 27903660145920.0, "grad_norm": 2.3055897850703673, "language_loss": 0.73758757, "learning_rate": 8.443993582217803e-07, "loss": 0.76200169, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19189453, "step": 11729, "time_per_iteration": 2.898818016052246 }, { "auxiliary_loss_clip": 0.01436788, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.26734591, "balance_loss_mlp": 1.01603317, "epoch": 0.7052457537952803, "flos": 25053780746880.0, "grad_norm": 1.7279837797542845, "language_loss": 0.78894711, "learning_rate": 8.440815100153862e-07, "loss": 0.81366903, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19372559, "step": 11730, "time_per_iteration": 2.8742361068725586 }, { "auxiliary_loss_clip": 0.01424425, "auxiliary_loss_mlp": 0.01036681, "balance_loss_clip": 1.25833583, "balance_loss_mlp": 1.01685667, "epoch": 0.7053058770479483, "flos": 21881739237120.0, "grad_norm": 3.4886426485115583, "language_loss": 0.63622737, "learning_rate": 8.437637056415359e-07, "loss": 0.66083848, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19824219, "step": 11731, "time_per_iteration": 2.860541343688965 }, { "auxiliary_loss_clip": 0.01426384, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.26062632, "balance_loss_mlp": 1.01431084, "epoch": 0.7053660003006162, "flos": 16407282136320.0, "grad_norm": 2.802976265484762, "language_loss": 0.7533263, "learning_rate": 8.434459451122815e-07, "loss": 0.77793384, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20056152, "step": 11732, "time_per_iteration": 2.9166829586029053 }, { "auxiliary_loss_clip": 0.01414149, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.25400519, "balance_loss_mlp": 1.01121449, "epoch": 0.7054261235532843, "flos": 22721974018560.0, "grad_norm": 1.3902654141309376, "language_loss": 0.71744335, "learning_rate": 8.431282284396735e-07, "loss": 0.74188799, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19104004, "step": 11733, "time_per_iteration": 2.9336273670196533 }, { "auxiliary_loss_clip": 0.01400993, "auxiliary_loss_mlp": 0.01035707, "balance_loss_clip": 1.2391181, "balance_loss_mlp": 1.01614475, "epoch": 0.7054862468059522, "flos": 13597652626560.0, "grad_norm": 1.935950732763059, "language_loss": 0.7504701, "learning_rate": 8.428105556357583e-07, "loss": 0.77483708, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19567871, "step": 11734, "time_per_iteration": 4.294881820678711 }, { "auxiliary_loss_clip": 0.0143066, "auxiliary_loss_mlp": 0.01034966, "balance_loss_clip": 1.26177359, "balance_loss_mlp": 1.01517701, "epoch": 0.7055463700586202, "flos": 15887263939200.0, "grad_norm": 2.7269571064788662, "language_loss": 0.69534481, "learning_rate": 8.424929267125829e-07, "loss": 0.72000104, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.19787598, "step": 11735, "time_per_iteration": 2.832582473754883 }, { "auxiliary_loss_clip": 0.01417112, "auxiliary_loss_mlp": 0.01037626, "balance_loss_clip": 1.25207067, "balance_loss_mlp": 1.01660919, "epoch": 0.7056064933112881, "flos": 23086603013760.0, "grad_norm": 1.9021344881974143, "language_loss": 0.73380101, "learning_rate": 8.421753416821933e-07, "loss": 0.75834835, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.21020508, "step": 11736, "time_per_iteration": 4.346101999282837 }, { "auxiliary_loss_clip": 0.01400002, "auxiliary_loss_mlp": 0.01029902, "balance_loss_clip": 1.24053478, "balance_loss_mlp": 1.01070881, "epoch": 0.7056666165639561, "flos": 24066798520320.0, "grad_norm": 2.138126957856501, "language_loss": 0.69515967, "learning_rate": 8.41857800556629e-07, "loss": 0.7194587, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19177246, "step": 11737, "time_per_iteration": 2.898561477661133 }, { "auxiliary_loss_clip": 0.01408293, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.24414933, "balance_loss_mlp": 1.01370347, "epoch": 0.705726739816624, "flos": 17502074017920.0, "grad_norm": 2.2072242051407343, "language_loss": 0.68314618, "learning_rate": 8.415403033479332e-07, "loss": 0.70757258, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20654297, "step": 11738, "time_per_iteration": 4.328527450561523 }, { "auxiliary_loss_clip": 0.01414858, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.25126946, "balance_loss_mlp": 1.01197767, "epoch": 0.7057868630692921, "flos": 51367633205760.0, "grad_norm": 1.5643705708665274, "language_loss": 0.75804138, "learning_rate": 8.41222850068145e-07, "loss": 0.78250802, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19848633, "step": 11739, "time_per_iteration": 3.124157190322876 }, { "auxiliary_loss_clip": 0.01404199, "auxiliary_loss_mlp": 0.01030516, "balance_loss_clip": 1.24482381, "balance_loss_mlp": 1.01113307, "epoch": 0.70584698632196, "flos": 26113797360000.0, "grad_norm": 2.5288766372304265, "language_loss": 0.72054297, "learning_rate": 8.409054407293032e-07, "loss": 0.74489009, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19384766, "step": 11740, "time_per_iteration": 2.8951213359832764 }, { "auxiliary_loss_clip": 0.01402454, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.24163568, "balance_loss_mlp": 1.01612949, "epoch": 0.705907109574628, "flos": 21553016630400.0, "grad_norm": 1.6552722094670702, "language_loss": 0.82697344, "learning_rate": 8.405880753434434e-07, "loss": 0.85135174, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19250488, "step": 11741, "time_per_iteration": 2.870028018951416 }, { "auxiliary_loss_clip": 0.01411901, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.24784136, "balance_loss_mlp": 1.01489282, "epoch": 0.705967232827296, "flos": 22721069122560.0, "grad_norm": 1.843197305765223, "language_loss": 0.78849417, "learning_rate": 8.402707539225993e-07, "loss": 0.81296903, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20703125, "step": 11742, "time_per_iteration": 2.8915622234344482 }, { "auxiliary_loss_clip": 0.0141321, "auxiliary_loss_mlp": 0.01029735, "balance_loss_clip": 1.24659872, "balance_loss_mlp": 1.01016116, "epoch": 0.7060273560799639, "flos": 28702151959680.0, "grad_norm": 1.647994686042008, "language_loss": 0.65101457, "learning_rate": 8.39953476478805e-07, "loss": 0.67544401, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19580078, "step": 11743, "time_per_iteration": 2.898725748062134 }, { "auxiliary_loss_clip": 0.01420202, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.25422978, "balance_loss_mlp": 1.01512587, "epoch": 0.7060874793326319, "flos": 15714744940800.0, "grad_norm": 2.03241198710867, "language_loss": 0.66191149, "learning_rate": 8.396362430240902e-07, "loss": 0.68646502, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20007324, "step": 11744, "time_per_iteration": 2.8372607231140137 }, { "auxiliary_loss_clip": 0.01401648, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.24249518, "balance_loss_mlp": 1.01320493, "epoch": 0.7061476025852998, "flos": 21516657793920.0, "grad_norm": 1.7808625434823844, "language_loss": 0.64245808, "learning_rate": 8.393190535704857e-07, "loss": 0.66680986, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.20336914, "step": 11745, "time_per_iteration": 2.8310179710388184 }, { "auxiliary_loss_clip": 0.01418098, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.25367987, "balance_loss_mlp": 1.01087427, "epoch": 0.7062077258379679, "flos": 28192992514560.0, "grad_norm": 2.9863061585485418, "language_loss": 0.72117251, "learning_rate": 8.390019081300188e-07, "loss": 0.74564803, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18591309, "step": 11746, "time_per_iteration": 2.9126250743865967 }, { "auxiliary_loss_clip": 0.01408586, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.24501097, "balance_loss_mlp": 1.01595926, "epoch": 0.7062678490906358, "flos": 27864812845440.0, "grad_norm": 1.4385372297867605, "language_loss": 0.79875076, "learning_rate": 8.386848067147175e-07, "loss": 0.82319528, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19897461, "step": 11747, "time_per_iteration": 2.929713487625122 }, { "auxiliary_loss_clip": 0.01406238, "auxiliary_loss_mlp": 0.01034273, "balance_loss_clip": 1.24664235, "balance_loss_mlp": 1.01612914, "epoch": 0.7063279723433038, "flos": 23195136585600.0, "grad_norm": 2.4654901022563904, "language_loss": 0.66212523, "learning_rate": 8.383677493366031e-07, "loss": 0.68653035, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18139648, "step": 11748, "time_per_iteration": 2.8790979385375977 }, { "auxiliary_loss_clip": 0.01404546, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.24289703, "balance_loss_mlp": 1.01745248, "epoch": 0.7063880955959717, "flos": 20197016663040.0, "grad_norm": 1.922223982414972, "language_loss": 0.80742395, "learning_rate": 8.380507360077003e-07, "loss": 0.83184361, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19958496, "step": 11749, "time_per_iteration": 2.890880823135376 }, { "auxiliary_loss_clip": 0.01183998, "auxiliary_loss_mlp": 0.01024149, "balance_loss_clip": 1.09615552, "balance_loss_mlp": 1.00335932, "epoch": 0.7064482188486397, "flos": 63694310123520.0, "grad_norm": 0.788223546880592, "language_loss": 0.53999597, "learning_rate": 8.377337667400304e-07, "loss": 0.5620774, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.20800781, "step": 11750, "time_per_iteration": 3.3300135135650635 }, { "auxiliary_loss_clip": 0.01417563, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.25427067, "balance_loss_mlp": 1.01914966, "epoch": 0.7065083421013076, "flos": 25201930780800.0, "grad_norm": 1.8281972416865868, "language_loss": 0.79724866, "learning_rate": 8.37416841545612e-07, "loss": 0.82182556, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.2097168, "step": 11751, "time_per_iteration": 2.923722743988037 }, { "auxiliary_loss_clip": 0.01402445, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.24208701, "balance_loss_mlp": 1.01277637, "epoch": 0.7065684653539757, "flos": 22904084914560.0, "grad_norm": 2.0577866301889087, "language_loss": 0.68710268, "learning_rate": 8.370999604364634e-07, "loss": 0.71146089, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20593262, "step": 11752, "time_per_iteration": 2.891237258911133 }, { "auxiliary_loss_clip": 0.01399654, "auxiliary_loss_mlp": 0.01039409, "balance_loss_clip": 1.23932624, "balance_loss_mlp": 1.01904845, "epoch": 0.7066285886066436, "flos": 23560760966400.0, "grad_norm": 2.050684827106445, "language_loss": 0.77145177, "learning_rate": 8.367831234246025e-07, "loss": 0.79584241, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.20361328, "step": 11753, "time_per_iteration": 2.8983354568481445 }, { "auxiliary_loss_clip": 0.0140446, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.24517012, "balance_loss_mlp": 1.01397562, "epoch": 0.7066887118593116, "flos": 21079175391360.0, "grad_norm": 1.5815891673636249, "language_loss": 0.71838897, "learning_rate": 8.364663305220405e-07, "loss": 0.74277276, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19934082, "step": 11754, "time_per_iteration": 2.8926970958709717 }, { "auxiliary_loss_clip": 0.01411502, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.24801362, "balance_loss_mlp": 1.01543307, "epoch": 0.7067488351119796, "flos": 21185582457600.0, "grad_norm": 1.6786550188019733, "language_loss": 0.89655709, "learning_rate": 8.361495817407919e-07, "loss": 0.92102873, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20227051, "step": 11755, "time_per_iteration": 2.844797372817993 }, { "auxiliary_loss_clip": 0.01397534, "auxiliary_loss_mlp": 0.01038477, "balance_loss_clip": 1.23670208, "balance_loss_mlp": 1.01893818, "epoch": 0.7068089583646475, "flos": 20458993910400.0, "grad_norm": 2.155656507534046, "language_loss": 0.80343235, "learning_rate": 8.358328770928678e-07, "loss": 0.82779247, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19543457, "step": 11756, "time_per_iteration": 2.934812068939209 }, { "auxiliary_loss_clip": 0.01192317, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.10141516, "balance_loss_mlp": 1.01315165, "epoch": 0.7068690816173155, "flos": 59134570024320.0, "grad_norm": 0.8268467830823323, "language_loss": 0.60444289, "learning_rate": 8.355162165902785e-07, "loss": 0.62670934, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.21191406, "step": 11757, "time_per_iteration": 3.1590664386749268 }, { "auxiliary_loss_clip": 0.0140247, "auxiliary_loss_mlp": 0.0103799, "balance_loss_clip": 1.24282217, "balance_loss_mlp": 1.01743817, "epoch": 0.7069292048699835, "flos": 16259765529600.0, "grad_norm": 1.6434649756400264, "language_loss": 0.81035513, "learning_rate": 8.351996002450307e-07, "loss": 0.83475971, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20556641, "step": 11758, "time_per_iteration": 2.821810007095337 }, { "auxiliary_loss_clip": 0.01399951, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.23936677, "balance_loss_mlp": 1.0164752, "epoch": 0.7069893281226515, "flos": 41187819680640.0, "grad_norm": 1.653778407348568, "language_loss": 0.78193629, "learning_rate": 8.348830280691304e-07, "loss": 0.80630386, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.20324707, "step": 11759, "time_per_iteration": 3.057382822036743 }, { "auxiliary_loss_clip": 0.01402874, "auxiliary_loss_mlp": 0.01035131, "balance_loss_clip": 1.24147832, "balance_loss_mlp": 1.01465034, "epoch": 0.7070494513753194, "flos": 24218025200640.0, "grad_norm": 1.6736212201766554, "language_loss": 0.68621469, "learning_rate": 8.34566500074583e-07, "loss": 0.71059477, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.20495605, "step": 11760, "time_per_iteration": 4.348189830780029 }, { "auxiliary_loss_clip": 0.01411618, "auxiliary_loss_mlp": 0.01032196, "balance_loss_clip": 1.24783456, "balance_loss_mlp": 1.01290846, "epoch": 0.7071095746279874, "flos": 20193261344640.0, "grad_norm": 1.8818082682733772, "language_loss": 0.80562747, "learning_rate": 8.342500162733899e-07, "loss": 0.83006561, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19287109, "step": 11761, "time_per_iteration": 2.838639497756958 }, { "auxiliary_loss_clip": 0.0140517, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.24359477, "balance_loss_mlp": 1.01715398, "epoch": 0.7071696978806553, "flos": 18191172608640.0, "grad_norm": 2.5281474533569, "language_loss": 0.76317644, "learning_rate": 8.33933576677553e-07, "loss": 0.78760755, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20800781, "step": 11762, "time_per_iteration": 2.854710817337036 }, { "auxiliary_loss_clip": 0.01408965, "auxiliary_loss_mlp": 0.01035156, "balance_loss_clip": 1.24877143, "balance_loss_mlp": 1.01475883, "epoch": 0.7072298211333233, "flos": 24141778433280.0, "grad_norm": 6.554440055568179, "language_loss": 0.77714741, "learning_rate": 8.336171812990724e-07, "loss": 0.80158854, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.20410156, "step": 11763, "time_per_iteration": 2.874415397644043 }, { "auxiliary_loss_clip": 0.01408824, "auxiliary_loss_mlp": 0.01041755, "balance_loss_clip": 1.24583542, "balance_loss_mlp": 1.02077377, "epoch": 0.7072899443859912, "flos": 27209765606400.0, "grad_norm": 2.275166719996912, "language_loss": 0.79459578, "learning_rate": 8.333008301499453e-07, "loss": 0.81910157, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20983887, "step": 11764, "time_per_iteration": 2.8755879402160645 }, { "auxiliary_loss_clip": 0.01425363, "auxiliary_loss_mlp": 0.01040948, "balance_loss_clip": 1.25942218, "balance_loss_mlp": 1.01950228, "epoch": 0.7073500676386593, "flos": 16444274400000.0, "grad_norm": 1.569428299007325, "language_loss": 0.80529594, "learning_rate": 8.32984523242167e-07, "loss": 0.82995903, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21435547, "step": 11765, "time_per_iteration": 2.9497225284576416 }, { "auxiliary_loss_clip": 0.01408723, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.24681878, "balance_loss_mlp": 1.0173018, "epoch": 0.7074101908913272, "flos": 27685461882240.0, "grad_norm": 1.6753842986842695, "language_loss": 0.69270205, "learning_rate": 8.326682605877324e-07, "loss": 0.71715105, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18859863, "step": 11766, "time_per_iteration": 2.915818452835083 }, { "auxiliary_loss_clip": 0.01414421, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.25077176, "balance_loss_mlp": 1.01414788, "epoch": 0.7074703141439952, "flos": 22248585227520.0, "grad_norm": 1.8922378048457638, "language_loss": 0.64852607, "learning_rate": 8.323520421986352e-07, "loss": 0.67301273, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20080566, "step": 11767, "time_per_iteration": 2.8752670288085938 }, { "auxiliary_loss_clip": 0.01413585, "auxiliary_loss_mlp": 0.01036433, "balance_loss_clip": 1.25078201, "balance_loss_mlp": 1.01591706, "epoch": 0.7075304373966632, "flos": 29655263813760.0, "grad_norm": 2.2242424684608952, "language_loss": 0.53755611, "learning_rate": 8.320358680868646e-07, "loss": 0.5620563, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20532227, "step": 11768, "time_per_iteration": 2.9267711639404297 }, { "auxiliary_loss_clip": 0.0139897, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.23894095, "balance_loss_mlp": 1.01434898, "epoch": 0.7075905606493311, "flos": 19764601678080.0, "grad_norm": 1.6240328521323, "language_loss": 0.76344121, "learning_rate": 8.317197382644119e-07, "loss": 0.78777063, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19592285, "step": 11769, "time_per_iteration": 4.284012079238892 }, { "auxiliary_loss_clip": 0.01190797, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 1.09979534, "balance_loss_mlp": 0.9978776, "epoch": 0.7076506839019991, "flos": 65744023651200.0, "grad_norm": 0.8530176786815913, "language_loss": 0.62014562, "learning_rate": 8.314036527432637e-07, "loss": 0.64228034, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.24804688, "step": 11770, "time_per_iteration": 3.282191038131714 }, { "auxiliary_loss_clip": 0.01426554, "auxiliary_loss_mlp": 0.010352, "balance_loss_clip": 1.2617445, "balance_loss_mlp": 1.01591206, "epoch": 0.707710807154667, "flos": 23774841953280.0, "grad_norm": 2.0932235567286517, "language_loss": 0.770118, "learning_rate": 8.310876115354055e-07, "loss": 0.79473555, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19274902, "step": 11771, "time_per_iteration": 4.325823545455933 }, { "auxiliary_loss_clip": 0.0140007, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.24212635, "balance_loss_mlp": 1.01286376, "epoch": 0.7077709304073351, "flos": 21261195797760.0, "grad_norm": 1.4970080731864575, "language_loss": 0.71892321, "learning_rate": 8.307716146528221e-07, "loss": 0.74325407, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.20166016, "step": 11772, "time_per_iteration": 4.419597864151001 }, { "auxiliary_loss_clip": 0.01419149, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.25296295, "balance_loss_mlp": 1.01700032, "epoch": 0.707831053660003, "flos": 20750498029440.0, "grad_norm": 1.79749341623192, "language_loss": 0.70174378, "learning_rate": 8.30455662107496e-07, "loss": 0.726309, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20373535, "step": 11773, "time_per_iteration": 2.9243147373199463 }, { "auxiliary_loss_clip": 0.01422279, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.25956452, "balance_loss_mlp": 1.01300168, "epoch": 0.707891176912671, "flos": 21990996725760.0, "grad_norm": 1.3916900871239033, "language_loss": 0.71309465, "learning_rate": 8.301397539114095e-07, "loss": 0.73763418, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18701172, "step": 11774, "time_per_iteration": 2.9795069694519043 }, { "auxiliary_loss_clip": 0.01398138, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.24090397, "balance_loss_mlp": 1.01134777, "epoch": 0.7079513001653389, "flos": 21078768188160.0, "grad_norm": 1.4642857897857584, "language_loss": 0.74824667, "learning_rate": 8.298238900765407e-07, "loss": 0.7725296, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18811035, "step": 11775, "time_per_iteration": 2.9243435859680176 }, { "auxiliary_loss_clip": 0.01423568, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.25987411, "balance_loss_mlp": 1.01336169, "epoch": 0.7080114234180069, "flos": 18049673560320.0, "grad_norm": 1.7006219682496255, "language_loss": 0.88204038, "learning_rate": 8.295080706148665e-07, "loss": 0.90660143, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19165039, "step": 11776, "time_per_iteration": 2.859626293182373 }, { "auxiliary_loss_clip": 0.0139818, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.23714292, "balance_loss_mlp": 1.01477623, "epoch": 0.7080715466706748, "flos": 15130650827520.0, "grad_norm": 1.880657216374079, "language_loss": 0.75473243, "learning_rate": 8.291922955383641e-07, "loss": 0.7790581, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19616699, "step": 11777, "time_per_iteration": 2.847273349761963 }, { "auxiliary_loss_clip": 0.01438163, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.27117062, "balance_loss_mlp": 1.01628566, "epoch": 0.7081316699233429, "flos": 14429381385600.0, "grad_norm": 2.213838816948703, "language_loss": 0.8301847, "learning_rate": 8.288765648590066e-07, "loss": 0.854936, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20678711, "step": 11778, "time_per_iteration": 2.84698486328125 }, { "auxiliary_loss_clip": 0.01392073, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.2355355, "balance_loss_mlp": 1.0166657, "epoch": 0.7081917931760108, "flos": 23232943255680.0, "grad_norm": 1.6728728316160362, "language_loss": 0.8549999, "learning_rate": 8.285608785887673e-07, "loss": 0.87928057, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.19311523, "step": 11779, "time_per_iteration": 2.894827365875244 }, { "auxiliary_loss_clip": 0.01426492, "auxiliary_loss_mlp": 0.01037757, "balance_loss_clip": 1.26249778, "balance_loss_mlp": 1.01812291, "epoch": 0.7082519164286788, "flos": 39322072085760.0, "grad_norm": 2.9668855089516324, "language_loss": 0.72078484, "learning_rate": 8.28245236739618e-07, "loss": 0.74542737, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19616699, "step": 11780, "time_per_iteration": 3.018651247024536 }, { "auxiliary_loss_clip": 0.01409564, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.24958575, "balance_loss_mlp": 1.01411963, "epoch": 0.7083120396813467, "flos": 21660690551040.0, "grad_norm": 1.7498551831105367, "language_loss": 0.73551118, "learning_rate": 8.279296393235256e-07, "loss": 0.75993335, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18530273, "step": 11781, "time_per_iteration": 2.9287118911743164 }, { "auxiliary_loss_clip": 0.01405946, "auxiliary_loss_mlp": 0.01034058, "balance_loss_clip": 1.24643672, "balance_loss_mlp": 1.01448393, "epoch": 0.7083721629340147, "flos": 17576103790080.0, "grad_norm": 1.521327501725006, "language_loss": 0.7850275, "learning_rate": 8.276140863524585e-07, "loss": 0.8094275, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19567871, "step": 11782, "time_per_iteration": 2.911020517349243 }, { "auxiliary_loss_clip": 0.01404715, "auxiliary_loss_mlp": 0.01031315, "balance_loss_clip": 1.24437165, "balance_loss_mlp": 1.01253915, "epoch": 0.7084322861866827, "flos": 29362266616320.0, "grad_norm": 1.4229243759035972, "language_loss": 0.70330679, "learning_rate": 8.272985778383828e-07, "loss": 0.72766709, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18774414, "step": 11783, "time_per_iteration": 2.9125282764434814 }, { "auxiliary_loss_clip": 0.01423866, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.25917959, "balance_loss_mlp": 1.01502299, "epoch": 0.7084924094393507, "flos": 20204120096640.0, "grad_norm": 1.6642713925502632, "language_loss": 0.7974844, "learning_rate": 8.269831137932632e-07, "loss": 0.82207233, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19897461, "step": 11784, "time_per_iteration": 2.875748634338379 }, { "auxiliary_loss_clip": 0.0141859, "auxiliary_loss_mlp": 0.01037647, "balance_loss_clip": 1.25678015, "balance_loss_mlp": 1.01812005, "epoch": 0.7085525326920187, "flos": 23487681335040.0, "grad_norm": 1.549524927161694, "language_loss": 0.77660084, "learning_rate": 8.266676942290609e-07, "loss": 0.8011632, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.1953125, "step": 11785, "time_per_iteration": 2.8858065605163574 }, { "auxiliary_loss_clip": 0.01398938, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 1.2385782, "balance_loss_mlp": 1.01307619, "epoch": 0.7086126559446866, "flos": 25970081316480.0, "grad_norm": 1.5871358556149409, "language_loss": 0.78446662, "learning_rate": 8.26352319157738e-07, "loss": 0.80878007, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1932373, "step": 11786, "time_per_iteration": 2.8944664001464844 }, { "auxiliary_loss_clip": 0.01409853, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 1.24651456, "balance_loss_mlp": 1.01506162, "epoch": 0.7086727791973546, "flos": 26736195836160.0, "grad_norm": 2.1533953575037246, "language_loss": 0.79202056, "learning_rate": 8.260369885912526e-07, "loss": 0.81645727, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18762207, "step": 11787, "time_per_iteration": 2.8816089630126953 }, { "auxiliary_loss_clip": 0.01408094, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.24683976, "balance_loss_mlp": 1.01464224, "epoch": 0.7087329024500225, "flos": 21691800990720.0, "grad_norm": 2.173076644330915, "language_loss": 0.77427161, "learning_rate": 8.257217025415615e-07, "loss": 0.79869562, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1965332, "step": 11788, "time_per_iteration": 2.8515329360961914 }, { "auxiliary_loss_clip": 0.01431719, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.26323068, "balance_loss_mlp": 1.0136342, "epoch": 0.7087930257026905, "flos": 17940008868480.0, "grad_norm": 2.065315075328281, "language_loss": 0.68838525, "learning_rate": 8.254064610206212e-07, "loss": 0.71304274, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20373535, "step": 11789, "time_per_iteration": 2.851297378540039 }, { "auxiliary_loss_clip": 0.01422854, "auxiliary_loss_mlp": 0.01032368, "balance_loss_clip": 1.25873613, "balance_loss_mlp": 1.01312721, "epoch": 0.7088531489553584, "flos": 18919525703040.0, "grad_norm": 1.5366379091233429, "language_loss": 0.78047806, "learning_rate": 8.250912640403858e-07, "loss": 0.80503023, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19238281, "step": 11790, "time_per_iteration": 2.8657472133636475 }, { "auxiliary_loss_clip": 0.01422419, "auxiliary_loss_mlp": 0.01031616, "balance_loss_clip": 1.25583386, "balance_loss_mlp": 1.01202965, "epoch": 0.7089132722080265, "flos": 27392283705600.0, "grad_norm": 1.6425898750439292, "language_loss": 0.7162562, "learning_rate": 8.247761116128085e-07, "loss": 0.74079657, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19580078, "step": 11791, "time_per_iteration": 2.9373066425323486 }, { "auxiliary_loss_clip": 0.01403646, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.24317336, "balance_loss_mlp": 1.01569855, "epoch": 0.7089733954606944, "flos": 22172836152960.0, "grad_norm": 1.4678239754939364, "language_loss": 0.82489872, "learning_rate": 8.244610037498376e-07, "loss": 0.84929252, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.20031738, "step": 11792, "time_per_iteration": 2.9464361667633057 }, { "auxiliary_loss_clip": 0.01423384, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.25747061, "balance_loss_mlp": 1.01229072, "epoch": 0.7090335187133624, "flos": 24436540177920.0, "grad_norm": 1.9648879748906931, "language_loss": 0.65740097, "learning_rate": 8.241459404634232e-07, "loss": 0.68195164, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19384766, "step": 11793, "time_per_iteration": 2.903441905975342 }, { "auxiliary_loss_clip": 0.01402949, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.24321103, "balance_loss_mlp": 1.01250768, "epoch": 0.7090936419660303, "flos": 21845244666240.0, "grad_norm": 2.2120338988846435, "language_loss": 0.71449792, "learning_rate": 8.238309217655133e-07, "loss": 0.73883474, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18225098, "step": 11794, "time_per_iteration": 4.319464683532715 }, { "auxiliary_loss_clip": 0.01401193, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.24149632, "balance_loss_mlp": 1.0149169, "epoch": 0.7091537652186983, "flos": 20091514492800.0, "grad_norm": 1.8462024111891113, "language_loss": 0.76108068, "learning_rate": 8.23515947668052e-07, "loss": 0.7854259, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18408203, "step": 11795, "time_per_iteration": 2.8404104709625244 }, { "auxiliary_loss_clip": 0.01403571, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.24279165, "balance_loss_mlp": 1.01472008, "epoch": 0.7092138884713663, "flos": 13159310572800.0, "grad_norm": 2.286084851191497, "language_loss": 0.76313311, "learning_rate": 8.232010181829838e-07, "loss": 0.78751236, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19628906, "step": 11796, "time_per_iteration": 2.8594119548797607 }, { "auxiliary_loss_clip": 0.01421877, "auxiliary_loss_mlp": 0.01036859, "balance_loss_clip": 1.25595331, "balance_loss_mlp": 1.01643801, "epoch": 0.7092740117240343, "flos": 21654175299840.0, "grad_norm": 1.8115005294003268, "language_loss": 0.75103605, "learning_rate": 8.228861333222523e-07, "loss": 0.77562344, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20422363, "step": 11797, "time_per_iteration": 2.8479998111724854 }, { "auxiliary_loss_clip": 0.01411739, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.24872947, "balance_loss_mlp": 1.01201773, "epoch": 0.7093341349767023, "flos": 21042318862080.0, "grad_norm": 1.3739851719368212, "language_loss": 0.80235708, "learning_rate": 8.225712930977953e-07, "loss": 0.82678777, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19335938, "step": 11798, "time_per_iteration": 2.872982978820801 }, { "auxiliary_loss_clip": 0.01401521, "auxiliary_loss_mlp": 0.01038058, "balance_loss_clip": 1.24083066, "balance_loss_mlp": 1.01850808, "epoch": 0.7093942582293702, "flos": 22027717520640.0, "grad_norm": 1.807580606646496, "language_loss": 0.67374372, "learning_rate": 8.222564975215529e-07, "loss": 0.69813955, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19543457, "step": 11799, "time_per_iteration": 2.9210784435272217 }, { "auxiliary_loss_clip": 0.01412672, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.25026357, "balance_loss_mlp": 1.01113486, "epoch": 0.7094543814820382, "flos": 27247119828480.0, "grad_norm": 1.6006384064006411, "language_loss": 0.82592738, "learning_rate": 8.219417466054622e-07, "loss": 0.85036391, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19848633, "step": 11800, "time_per_iteration": 2.8932902812957764 }, { "auxiliary_loss_clip": 0.01404107, "auxiliary_loss_mlp": 0.01029074, "balance_loss_clip": 1.24351132, "balance_loss_mlp": 1.0107038, "epoch": 0.7095145047347061, "flos": 12095493396480.0, "grad_norm": 1.7885518676166097, "language_loss": 0.87658906, "learning_rate": 8.21627040361459e-07, "loss": 0.90092087, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18371582, "step": 11801, "time_per_iteration": 2.809185028076172 }, { "auxiliary_loss_clip": 0.01405728, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.24366117, "balance_loss_mlp": 1.01385045, "epoch": 0.7095746279873741, "flos": 19391376170880.0, "grad_norm": 1.731042699542604, "language_loss": 0.76814914, "learning_rate": 8.213123788014758e-07, "loss": 0.79254586, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.2010498, "step": 11802, "time_per_iteration": 2.8582847118377686 }, { "auxiliary_loss_clip": 0.01399615, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.23841238, "balance_loss_mlp": 1.01736534, "epoch": 0.709634751240042, "flos": 21370408041600.0, "grad_norm": 2.176398794974906, "language_loss": 0.8272692, "learning_rate": 8.209977619374462e-07, "loss": 0.85163629, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19726562, "step": 11803, "time_per_iteration": 4.325204372406006 }, { "auxiliary_loss_clip": 0.01411566, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.24718463, "balance_loss_mlp": 1.01444674, "epoch": 0.7096948744927101, "flos": 13924791665280.0, "grad_norm": 2.674285770278976, "language_loss": 0.69000149, "learning_rate": 8.206831897812995e-07, "loss": 0.71446866, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20703125, "step": 11804, "time_per_iteration": 2.8424248695373535 }, { "auxiliary_loss_clip": 0.0139273, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 1.23674464, "balance_loss_mlp": 1.01262593, "epoch": 0.709754997745378, "flos": 30310130073600.0, "grad_norm": 1.8350930384779145, "language_loss": 0.78987736, "learning_rate": 8.203686623449637e-07, "loss": 0.81412101, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.19006348, "step": 11805, "time_per_iteration": 2.960129737854004 }, { "auxiliary_loss_clip": 0.01409011, "auxiliary_loss_mlp": 0.01029663, "balance_loss_clip": 1.24708223, "balance_loss_mlp": 1.01056552, "epoch": 0.709815120998046, "flos": 18524329205760.0, "grad_norm": 1.852044868558423, "language_loss": 0.7944535, "learning_rate": 8.200541796403667e-07, "loss": 0.81884021, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19104004, "step": 11806, "time_per_iteration": 4.2590553760528564 }, { "auxiliary_loss_clip": 0.01403555, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.24223411, "balance_loss_mlp": 1.01373315, "epoch": 0.7098752442507139, "flos": 22282591334400.0, "grad_norm": 2.116401135140007, "language_loss": 0.57561421, "learning_rate": 8.197397416794332e-07, "loss": 0.59998631, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19909668, "step": 11807, "time_per_iteration": 4.238116502761841 }, { "auxiliary_loss_clip": 0.0141841, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.2507441, "balance_loss_mlp": 1.01663876, "epoch": 0.7099353675033819, "flos": 19283295047040.0, "grad_norm": 1.9009266081524119, "language_loss": 0.6910795, "learning_rate": 8.194253484740882e-07, "loss": 0.71563411, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.20410156, "step": 11808, "time_per_iteration": 2.852538585662842 }, { "auxiliary_loss_clip": 0.01419456, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.25398815, "balance_loss_mlp": 1.01646924, "epoch": 0.70999549075605, "flos": 21918641011200.0, "grad_norm": 2.3483660339635346, "language_loss": 0.70925504, "learning_rate": 8.191110000362513e-07, "loss": 0.73379451, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.18029785, "step": 11809, "time_per_iteration": 2.8739731311798096 }, { "auxiliary_loss_clip": 0.01187624, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.10149312, "balance_loss_mlp": 1.0155791, "epoch": 0.7100556140087179, "flos": 70484336323200.0, "grad_norm": 0.7562727968825211, "language_loss": 0.59464979, "learning_rate": 8.187966963778435e-07, "loss": 0.61685824, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.17675781, "step": 11810, "time_per_iteration": 3.4314229488372803 }, { "auxiliary_loss_clip": 0.01409339, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.24863768, "balance_loss_mlp": 1.01379585, "epoch": 0.7101157372613859, "flos": 23049972708480.0, "grad_norm": 1.607523570812524, "language_loss": 0.75042045, "learning_rate": 8.18482437510784e-07, "loss": 0.77484119, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18945312, "step": 11811, "time_per_iteration": 2.888417959213257 }, { "auxiliary_loss_clip": 0.013955, "auxiliary_loss_mlp": 0.01031173, "balance_loss_clip": 1.23806787, "balance_loss_mlp": 1.01217127, "epoch": 0.7101758605140538, "flos": 23195724768000.0, "grad_norm": 1.9162205372167527, "language_loss": 0.84048653, "learning_rate": 8.181682234469882e-07, "loss": 0.86475325, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19018555, "step": 11812, "time_per_iteration": 2.9091482162475586 }, { "auxiliary_loss_clip": 0.01412247, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.24863172, "balance_loss_mlp": 1.01183152, "epoch": 0.7102359837667218, "flos": 23706648760320.0, "grad_norm": 1.4408822391913676, "language_loss": 0.70896459, "learning_rate": 8.178540541983716e-07, "loss": 0.7334013, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19592285, "step": 11813, "time_per_iteration": 2.932417154312134 }, { "auxiliary_loss_clip": 0.0139947, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.24050987, "balance_loss_mlp": 1.00978017, "epoch": 0.7102961070193897, "flos": 19400606110080.0, "grad_norm": 2.168805359282903, "language_loss": 0.82723457, "learning_rate": 8.175399297768495e-07, "loss": 0.85151494, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18786621, "step": 11814, "time_per_iteration": 2.8755788803100586 }, { "auxiliary_loss_clip": 0.01407613, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.24616921, "balance_loss_mlp": 1.0123769, "epoch": 0.7103562302720577, "flos": 21517607934720.0, "grad_norm": 1.7874839952207309, "language_loss": 0.76916826, "learning_rate": 8.172258501943301e-07, "loss": 0.79356754, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19946289, "step": 11815, "time_per_iteration": 2.8501358032226562 }, { "auxiliary_loss_clip": 0.01399472, "auxiliary_loss_mlp": 0.01033668, "balance_loss_clip": 1.24162591, "balance_loss_mlp": 1.01478493, "epoch": 0.7104163535247257, "flos": 14542394192640.0, "grad_norm": 1.6207973324635574, "language_loss": 0.7935372, "learning_rate": 8.16911815462725e-07, "loss": 0.81786859, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1887207, "step": 11816, "time_per_iteration": 2.8523595333099365 }, { "auxiliary_loss_clip": 0.01389799, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.23066807, "balance_loss_mlp": 1.01319122, "epoch": 0.7104764767773937, "flos": 11407616415360.0, "grad_norm": 1.8688220483959654, "language_loss": 0.87151116, "learning_rate": 8.165978255939426e-07, "loss": 0.89572525, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.1842041, "step": 11817, "time_per_iteration": 2.944854259490967 }, { "auxiliary_loss_clip": 0.01391731, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.23256505, "balance_loss_mlp": 1.01389039, "epoch": 0.7105366000300616, "flos": 11697355987200.0, "grad_norm": 2.1614272031044024, "language_loss": 0.85144889, "learning_rate": 8.162838805998897e-07, "loss": 0.87570298, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19775391, "step": 11818, "time_per_iteration": 2.79404354095459 }, { "auxiliary_loss_clip": 0.01406855, "auxiliary_loss_mlp": 0.01031476, "balance_loss_clip": 1.24559844, "balance_loss_mlp": 1.0119729, "epoch": 0.7105967232827296, "flos": 19363342377600.0, "grad_norm": 2.394402306909922, "language_loss": 0.77036285, "learning_rate": 8.159699804924709e-07, "loss": 0.79474616, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19494629, "step": 11819, "time_per_iteration": 2.8299973011016846 }, { "auxiliary_loss_clip": 0.014152, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.25453067, "balance_loss_mlp": 1.0146091, "epoch": 0.7106568465353975, "flos": 22940850954240.0, "grad_norm": 2.7472545006669105, "language_loss": 0.71525937, "learning_rate": 8.156561252835883e-07, "loss": 0.7397598, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.20227051, "step": 11820, "time_per_iteration": 2.8683018684387207 }, { "auxiliary_loss_clip": 0.01398024, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.23795247, "balance_loss_mlp": 1.0130347, "epoch": 0.7107169697880655, "flos": 19109192480640.0, "grad_norm": 1.7506375107820273, "language_loss": 0.76327211, "learning_rate": 8.153423149851449e-07, "loss": 0.78757954, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19665527, "step": 11821, "time_per_iteration": 2.8369550704956055 }, { "auxiliary_loss_clip": 0.01193156, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.1041801, "balance_loss_mlp": 1.00880933, "epoch": 0.7107770930407336, "flos": 63665823882240.0, "grad_norm": 0.7789398491004937, "language_loss": 0.55221355, "learning_rate": 8.150285496090388e-07, "loss": 0.57445157, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.21875, "step": 11822, "time_per_iteration": 3.3936574459075928 }, { "auxiliary_loss_clip": 0.01390486, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.23539829, "balance_loss_mlp": 1.00978374, "epoch": 0.7108372162934015, "flos": 22064619294720.0, "grad_norm": 1.9876100442588547, "language_loss": 0.61242706, "learning_rate": 8.147148291671688e-07, "loss": 0.63664711, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.21728516, "step": 11823, "time_per_iteration": 2.8972604274749756 }, { "auxiliary_loss_clip": 0.01411937, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.2513746, "balance_loss_mlp": 1.01034307, "epoch": 0.7108973395460695, "flos": 19144420197120.0, "grad_norm": 1.913044897778204, "language_loss": 0.7247231, "learning_rate": 8.144011536714322e-07, "loss": 0.74913323, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18737793, "step": 11824, "time_per_iteration": 2.9166276454925537 }, { "auxiliary_loss_clip": 0.01391368, "auxiliary_loss_mlp": 0.01028088, "balance_loss_clip": 1.23563528, "balance_loss_mlp": 1.01086199, "epoch": 0.7109574627987374, "flos": 17903333318400.0, "grad_norm": 1.7598402544006317, "language_loss": 0.7332952, "learning_rate": 8.140875231337223e-07, "loss": 0.75748974, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.17224121, "step": 11825, "time_per_iteration": 2.83345627784729 }, { "auxiliary_loss_clip": 0.01413768, "auxiliary_loss_mlp": 0.01032756, "balance_loss_clip": 1.25174618, "balance_loss_mlp": 1.01339591, "epoch": 0.7110175860514054, "flos": 28989719781120.0, "grad_norm": 1.6479384859953843, "language_loss": 0.7999922, "learning_rate": 8.137739375659321e-07, "loss": 0.82445747, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19360352, "step": 11826, "time_per_iteration": 2.9245052337646484 }, { "auxiliary_loss_clip": 0.01397812, "auxiliary_loss_mlp": 0.01034, "balance_loss_clip": 1.23958826, "balance_loss_mlp": 1.01448596, "epoch": 0.7110777093040733, "flos": 26183483631360.0, "grad_norm": 1.7744475549604568, "language_loss": 0.83987868, "learning_rate": 8.134603969799527e-07, "loss": 0.86419678, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19506836, "step": 11827, "time_per_iteration": 2.9219415187835693 }, { "auxiliary_loss_clip": 0.01415764, "auxiliary_loss_mlp": 0.01034658, "balance_loss_clip": 1.25394297, "balance_loss_mlp": 1.01488113, "epoch": 0.7111378325567413, "flos": 26881178734080.0, "grad_norm": 2.346649839393538, "language_loss": 0.62836397, "learning_rate": 8.131469013876748e-07, "loss": 0.65286815, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19787598, "step": 11828, "time_per_iteration": 2.9100098609924316 }, { "auxiliary_loss_clip": 0.01410016, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.24919558, "balance_loss_mlp": 1.01600373, "epoch": 0.7111979558094093, "flos": 27283297685760.0, "grad_norm": 1.4601626564670322, "language_loss": 0.72529471, "learning_rate": 8.128334508009846e-07, "loss": 0.74974275, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18762207, "step": 11829, "time_per_iteration": 4.338937520980835 }, { "auxiliary_loss_clip": 0.01404786, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.24493194, "balance_loss_mlp": 1.01658416, "epoch": 0.7112580790620773, "flos": 25058033758080.0, "grad_norm": 1.8193718901468916, "language_loss": 0.80793399, "learning_rate": 8.125200452317697e-07, "loss": 0.83233261, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18493652, "step": 11830, "time_per_iteration": 2.873239755630493 }, { "auxiliary_loss_clip": 0.01411435, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.24951124, "balance_loss_mlp": 1.0138042, "epoch": 0.7113182023147452, "flos": 21654989706240.0, "grad_norm": 1.8866192635889636, "language_loss": 0.84762609, "learning_rate": 8.122066846919138e-07, "loss": 0.87207377, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19519043, "step": 11831, "time_per_iteration": 2.8795948028564453 }, { "auxiliary_loss_clip": 0.01405388, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.24365842, "balance_loss_mlp": 1.01241004, "epoch": 0.7113783255674132, "flos": 21006141004800.0, "grad_norm": 1.9264576829828444, "language_loss": 0.78577507, "learning_rate": 8.118933691932985e-07, "loss": 0.81014454, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19140625, "step": 11832, "time_per_iteration": 2.924132823944092 }, { "auxiliary_loss_clip": 0.01186846, "auxiliary_loss_mlp": 0.01035383, "balance_loss_clip": 1.09856963, "balance_loss_mlp": 1.01373446, "epoch": 0.7114384488200811, "flos": 66798429909120.0, "grad_norm": 0.7558065806242333, "language_loss": 0.56696212, "learning_rate": 8.115800987478059e-07, "loss": 0.5891844, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21679688, "step": 11833, "time_per_iteration": 3.2933030128479004 }, { "auxiliary_loss_clip": 0.01407303, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.24675202, "balance_loss_mlp": 1.01757193, "epoch": 0.7114985720727491, "flos": 25021041494400.0, "grad_norm": 2.222167741792351, "language_loss": 0.71385181, "learning_rate": 8.11266873367315e-07, "loss": 0.7382859, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1854248, "step": 11834, "time_per_iteration": 2.9071874618530273 }, { "auxiliary_loss_clip": 0.01407916, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.24551654, "balance_loss_mlp": 1.01353455, "epoch": 0.7115586953254172, "flos": 21479982243840.0, "grad_norm": 1.948160052068014, "language_loss": 0.80479467, "learning_rate": 8.10953693063704e-07, "loss": 0.82921028, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20129395, "step": 11835, "time_per_iteration": 2.833233594894409 }, { "auxiliary_loss_clip": 0.0140024, "auxiliary_loss_mlp": 0.01030778, "balance_loss_clip": 1.24151754, "balance_loss_mlp": 1.01214528, "epoch": 0.7116188185780851, "flos": 28634637438720.0, "grad_norm": 1.5341531583328798, "language_loss": 0.76644242, "learning_rate": 8.10640557848848e-07, "loss": 0.79075259, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18640137, "step": 11836, "time_per_iteration": 2.946895122528076 }, { "auxiliary_loss_clip": 0.01392042, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.23338938, "balance_loss_mlp": 1.01535404, "epoch": 0.7116789418307531, "flos": 25302501267840.0, "grad_norm": 1.8194935761325404, "language_loss": 0.706725, "learning_rate": 8.103274677346208e-07, "loss": 0.73100448, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.20556641, "step": 11837, "time_per_iteration": 2.905081033706665 }, { "auxiliary_loss_clip": 0.01425406, "auxiliary_loss_mlp": 0.01038472, "balance_loss_clip": 1.25980091, "balance_loss_mlp": 1.01825428, "epoch": 0.711739065083421, "flos": 25568414812800.0, "grad_norm": 3.979016252730088, "language_loss": 0.62088335, "learning_rate": 8.100144227328958e-07, "loss": 0.64552212, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20214844, "step": 11838, "time_per_iteration": 4.289820909500122 }, { "auxiliary_loss_clip": 0.01410323, "auxiliary_loss_mlp": 0.01035454, "balance_loss_clip": 1.24958062, "balance_loss_mlp": 1.01543832, "epoch": 0.711799188336089, "flos": 26152237457280.0, "grad_norm": 3.850926765531573, "language_loss": 0.67637575, "learning_rate": 8.097014228555426e-07, "loss": 0.70083356, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20007324, "step": 11839, "time_per_iteration": 2.870102643966675 }, { "auxiliary_loss_clip": 0.01404379, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.2435813, "balance_loss_mlp": 1.01721334, "epoch": 0.7118593115887569, "flos": 21150128517120.0, "grad_norm": 3.4084073019820083, "language_loss": 0.85051638, "learning_rate": 8.093884681144305e-07, "loss": 0.87491095, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.17871094, "step": 11840, "time_per_iteration": 2.8398663997650146 }, { "auxiliary_loss_clip": 0.01412978, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.24884391, "balance_loss_mlp": 1.01572382, "epoch": 0.711919434841425, "flos": 14983858137600.0, "grad_norm": 1.9961299719453303, "language_loss": 0.77519131, "learning_rate": 8.090755585214277e-07, "loss": 0.79967666, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19836426, "step": 11841, "time_per_iteration": 4.194320201873779 }, { "auxiliary_loss_clip": 0.01408016, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.24658108, "balance_loss_mlp": 1.01966047, "epoch": 0.7119795580940929, "flos": 16517806479360.0, "grad_norm": 3.518422326737597, "language_loss": 0.75939703, "learning_rate": 8.087626940883994e-07, "loss": 0.78387177, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19799805, "step": 11842, "time_per_iteration": 4.2595531940460205 }, { "auxiliary_loss_clip": 0.01190632, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.10297632, "balance_loss_mlp": 1.00978363, "epoch": 0.7120396813467609, "flos": 66602854045440.0, "grad_norm": 0.789500535665854, "language_loss": 0.61747146, "learning_rate": 8.084498748272082e-07, "loss": 0.63966835, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.19238281, "step": 11843, "time_per_iteration": 3.3120384216308594 }, { "auxiliary_loss_clip": 0.01404889, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.24419332, "balance_loss_mlp": 1.01498497, "epoch": 0.7120998045994288, "flos": 26444013045120.0, "grad_norm": 1.5499140814693286, "language_loss": 0.80936527, "learning_rate": 8.081371007497171e-07, "loss": 0.83375114, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18725586, "step": 11844, "time_per_iteration": 2.918930768966675 }, { "auxiliary_loss_clip": 0.01407723, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.24605083, "balance_loss_mlp": 1.01474082, "epoch": 0.7121599278520968, "flos": 16434682502400.0, "grad_norm": 2.3064723080222937, "language_loss": 0.80285144, "learning_rate": 8.078243718677873e-07, "loss": 0.82727009, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19421387, "step": 11845, "time_per_iteration": 2.8584022521972656 }, { "auxiliary_loss_clip": 0.01398073, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.24062586, "balance_loss_mlp": 1.01527739, "epoch": 0.7122200511047647, "flos": 28961731232640.0, "grad_norm": 2.672711427524689, "language_loss": 0.78549361, "learning_rate": 8.075116881932762e-07, "loss": 0.80982971, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.20263672, "step": 11846, "time_per_iteration": 2.8994081020355225 }, { "auxiliary_loss_clip": 0.0140795, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.24657774, "balance_loss_mlp": 1.01809907, "epoch": 0.7122801743574327, "flos": 16480497502080.0, "grad_norm": 2.269678257843906, "language_loss": 0.58850449, "learning_rate": 8.071990497380421e-07, "loss": 0.61295676, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19177246, "step": 11847, "time_per_iteration": 2.851761817932129 }, { "auxiliary_loss_clip": 0.01397099, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.24120998, "balance_loss_mlp": 1.01426864, "epoch": 0.7123402976101008, "flos": 20640697603200.0, "grad_norm": 1.7278890015027006, "language_loss": 0.71740377, "learning_rate": 8.068864565139395e-07, "loss": 0.74171102, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.19360352, "step": 11848, "time_per_iteration": 2.9037811756134033 }, { "auxiliary_loss_clip": 0.01190016, "auxiliary_loss_mlp": 0.01023502, "balance_loss_clip": 1.10170627, "balance_loss_mlp": 1.00652623, "epoch": 0.7124004208627687, "flos": 62353467164160.0, "grad_norm": 0.8213386957329786, "language_loss": 0.63143504, "learning_rate": 8.065739085328211e-07, "loss": 0.65357023, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16992188, "step": 11849, "time_per_iteration": 3.376105546951294 }, { "auxiliary_loss_clip": 0.01408122, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.24641776, "balance_loss_mlp": 1.01318514, "epoch": 0.7124605441154367, "flos": 39690411154560.0, "grad_norm": 1.511069003726596, "language_loss": 0.64597327, "learning_rate": 8.0626140580654e-07, "loss": 0.67038041, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1940918, "step": 11850, "time_per_iteration": 3.000702142715454 }, { "auxiliary_loss_clip": 0.01411647, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.2489084, "balance_loss_mlp": 1.01222944, "epoch": 0.7125206673681046, "flos": 28193037759360.0, "grad_norm": 1.4712276327350127, "language_loss": 0.70499325, "learning_rate": 8.05948948346946e-07, "loss": 0.72942674, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19470215, "step": 11851, "time_per_iteration": 2.9218590259552 }, { "auxiliary_loss_clip": 0.01398348, "auxiliary_loss_mlp": 0.0103202, "balance_loss_clip": 1.24028885, "balance_loss_mlp": 1.01285124, "epoch": 0.7125807906207726, "flos": 26188053356160.0, "grad_norm": 1.5070641829515175, "language_loss": 0.83676845, "learning_rate": 8.056365361658882e-07, "loss": 0.86107218, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19177246, "step": 11852, "time_per_iteration": 3.062297821044922 }, { "auxiliary_loss_clip": 0.01417025, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.25301218, "balance_loss_mlp": 1.01396346, "epoch": 0.7126409138734405, "flos": 17164121472000.0, "grad_norm": 2.732363642488351, "language_loss": 0.73547226, "learning_rate": 8.053241692752126e-07, "loss": 0.75998104, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19873047, "step": 11853, "time_per_iteration": 2.8431310653686523 }, { "auxiliary_loss_clip": 0.01383354, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.22905171, "balance_loss_mlp": 1.01414311, "epoch": 0.7127010371261085, "flos": 18779022040320.0, "grad_norm": 2.253506757084599, "language_loss": 0.93205488, "learning_rate": 8.050118476867635e-07, "loss": 0.95622039, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.1907959, "step": 11854, "time_per_iteration": 2.8059003353118896 }, { "auxiliary_loss_clip": 0.0139778, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.24005544, "balance_loss_mlp": 1.0132966, "epoch": 0.7127611603787765, "flos": 20386230992640.0, "grad_norm": 1.786060219151393, "language_loss": 0.80135965, "learning_rate": 8.046995714123856e-07, "loss": 0.82567132, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.20092773, "step": 11855, "time_per_iteration": 2.8763790130615234 }, { "auxiliary_loss_clip": 0.01395153, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 1.23622298, "balance_loss_mlp": 1.01374483, "epoch": 0.7128212836314445, "flos": 20458722441600.0, "grad_norm": 2.34966046539848, "language_loss": 0.73573393, "learning_rate": 8.043873404639192e-07, "loss": 0.7600264, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.20361328, "step": 11856, "time_per_iteration": 2.839785099029541 }, { "auxiliary_loss_clip": 0.01409536, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.24723554, "balance_loss_mlp": 1.01428807, "epoch": 0.7128814068841124, "flos": 23451322498560.0, "grad_norm": 1.5141864167140535, "language_loss": 0.70929384, "learning_rate": 8.040751548532046e-07, "loss": 0.73373145, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19934082, "step": 11857, "time_per_iteration": 2.8646161556243896 }, { "auxiliary_loss_clip": 0.01398043, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.24019492, "balance_loss_mlp": 1.01194096, "epoch": 0.7129415301367804, "flos": 18231965435520.0, "grad_norm": 2.3923283681083847, "language_loss": 0.85416102, "learning_rate": 8.03763014592081e-07, "loss": 0.87846184, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.20092773, "step": 11858, "time_per_iteration": 2.8267552852630615 }, { "auxiliary_loss_clip": 0.01418148, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.25435662, "balance_loss_mlp": 1.01316392, "epoch": 0.7130016533894483, "flos": 15532453065600.0, "grad_norm": 2.0907860286477384, "language_loss": 0.80838251, "learning_rate": 8.034509196923829e-07, "loss": 0.83289874, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.203125, "step": 11859, "time_per_iteration": 2.831976890563965 }, { "auxiliary_loss_clip": 0.01398739, "auxiliary_loss_mlp": 0.01033157, "balance_loss_clip": 1.23986495, "balance_loss_mlp": 1.01363027, "epoch": 0.7130617766421163, "flos": 57134499321600.0, "grad_norm": 1.1965964578326465, "language_loss": 0.69304812, "learning_rate": 8.031388701659456e-07, "loss": 0.71736705, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19519043, "step": 11860, "time_per_iteration": 3.274263381958008 }, { "auxiliary_loss_clip": 0.01410848, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.2491014, "balance_loss_mlp": 1.01170897, "epoch": 0.7131218998947844, "flos": 19796752748160.0, "grad_norm": 2.024056274445619, "language_loss": 0.65422463, "learning_rate": 8.028268660246023e-07, "loss": 0.67866933, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.21887207, "step": 11861, "time_per_iteration": 2.871811628341675 }, { "auxiliary_loss_clip": 0.01433189, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 1.26795816, "balance_loss_mlp": 1.01510251, "epoch": 0.7131820231474523, "flos": 26663297184000.0, "grad_norm": 1.6391349259497663, "language_loss": 0.67823303, "learning_rate": 8.025149072801849e-07, "loss": 0.70291144, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19543457, "step": 11862, "time_per_iteration": 2.9294931888580322 }, { "auxiliary_loss_clip": 0.01400144, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.24153602, "balance_loss_mlp": 1.01434231, "epoch": 0.7132421464001203, "flos": 29217600432000.0, "grad_norm": 10.683300890013431, "language_loss": 0.67187977, "learning_rate": 8.022029939445214e-07, "loss": 0.69620597, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18127441, "step": 11863, "time_per_iteration": 2.9357428550720215 }, { "auxiliary_loss_clip": 0.01432506, "auxiliary_loss_mlp": 0.01032364, "balance_loss_clip": 1.2655915, "balance_loss_mlp": 1.01256323, "epoch": 0.7133022696527882, "flos": 23083390632960.0, "grad_norm": 1.9489122789512379, "language_loss": 0.66719675, "learning_rate": 8.018911260294414e-07, "loss": 0.69184542, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19812012, "step": 11864, "time_per_iteration": 2.873563289642334 }, { "auxiliary_loss_clip": 0.01419756, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.2553072, "balance_loss_mlp": 1.01974058, "epoch": 0.7133623929054562, "flos": 17466031895040.0, "grad_norm": 3.868180829100113, "language_loss": 0.87023079, "learning_rate": 8.015793035467697e-07, "loss": 0.89482808, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20239258, "step": 11865, "time_per_iteration": 4.279973268508911 }, { "auxiliary_loss_clip": 0.01404038, "auxiliary_loss_mlp": 0.01035338, "balance_loss_clip": 1.24174476, "balance_loss_mlp": 1.01520324, "epoch": 0.7134225161581241, "flos": 19545950966400.0, "grad_norm": 4.332891898787362, "language_loss": 0.75812638, "learning_rate": 8.012675265083304e-07, "loss": 0.78252017, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20141602, "step": 11866, "time_per_iteration": 2.915315866470337 }, { "auxiliary_loss_clip": 0.0141467, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.25101566, "balance_loss_mlp": 1.01753592, "epoch": 0.7134826394107922, "flos": 26261268721920.0, "grad_norm": 2.079792507830645, "language_loss": 0.71375346, "learning_rate": 8.009557949259464e-07, "loss": 0.73828608, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.21057129, "step": 11867, "time_per_iteration": 2.90033221244812 }, { "auxiliary_loss_clip": 0.01391539, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.23447824, "balance_loss_mlp": 1.01202416, "epoch": 0.7135427626634601, "flos": 15823821450240.0, "grad_norm": 2.76461652209277, "language_loss": 0.72139978, "learning_rate": 8.006441088114397e-07, "loss": 0.7456336, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19812012, "step": 11868, "time_per_iteration": 2.836662530899048 }, { "auxiliary_loss_clip": 0.01407043, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.24438369, "balance_loss_mlp": 1.01214957, "epoch": 0.7136028859161281, "flos": 18232553617920.0, "grad_norm": 3.326572851249641, "language_loss": 0.66349816, "learning_rate": 8.003324681766286e-07, "loss": 0.68788731, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19726562, "step": 11869, "time_per_iteration": 2.7960124015808105 }, { "auxiliary_loss_clip": 0.01406026, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.24264121, "balance_loss_mlp": 1.01420164, "epoch": 0.713663009168796, "flos": 24325110938880.0, "grad_norm": 1.6296236606908086, "language_loss": 0.78629661, "learning_rate": 8.000208730333298e-07, "loss": 0.81068814, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18920898, "step": 11870, "time_per_iteration": 2.883226156234741 }, { "auxiliary_loss_clip": 0.01399258, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.23968148, "balance_loss_mlp": 1.01502693, "epoch": 0.713723132421464, "flos": 26548836543360.0, "grad_norm": 1.80721373865855, "language_loss": 0.81634521, "learning_rate": 7.997093233933597e-07, "loss": 0.84069765, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.20959473, "step": 11871, "time_per_iteration": 2.981794834136963 }, { "auxiliary_loss_clip": 0.01414344, "auxiliary_loss_mlp": 0.01037023, "balance_loss_clip": 1.25026703, "balance_loss_mlp": 1.01624489, "epoch": 0.7137832556741319, "flos": 19875352245120.0, "grad_norm": 1.7133764718588806, "language_loss": 0.79838604, "learning_rate": 7.993978192685331e-07, "loss": 0.82289976, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20788574, "step": 11872, "time_per_iteration": 4.350321292877197 }, { "auxiliary_loss_clip": 0.01424136, "auxiliary_loss_mlp": 0.01035141, "balance_loss_clip": 1.25863242, "balance_loss_mlp": 1.01544738, "epoch": 0.7138433789267999, "flos": 21698813934720.0, "grad_norm": 2.335229049983905, "language_loss": 0.84478468, "learning_rate": 7.990863606706606e-07, "loss": 0.86937743, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19689941, "step": 11873, "time_per_iteration": 2.8230140209198 }, { "auxiliary_loss_clip": 0.01386963, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 1.2291398, "balance_loss_mlp": 1.0120374, "epoch": 0.713903502179468, "flos": 17611602975360.0, "grad_norm": 2.097731776972648, "language_loss": 0.86895907, "learning_rate": 7.987749476115539e-07, "loss": 0.89314324, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19421387, "step": 11874, "time_per_iteration": 2.844759702682495 }, { "auxiliary_loss_clip": 0.01410857, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.24772763, "balance_loss_mlp": 1.01066399, "epoch": 0.7139636254321359, "flos": 18049266357120.0, "grad_norm": 1.6959782848896736, "language_loss": 0.83386809, "learning_rate": 7.984635801030228e-07, "loss": 0.85828912, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20593262, "step": 11875, "time_per_iteration": 2.807542562484741 }, { "auxiliary_loss_clip": 0.01427178, "auxiliary_loss_mlp": 0.01038426, "balance_loss_clip": 1.25876153, "balance_loss_mlp": 1.01751614, "epoch": 0.7140237486848039, "flos": 23341522072320.0, "grad_norm": 1.947763786666601, "language_loss": 0.7066797, "learning_rate": 7.981522581568721e-07, "loss": 0.73133576, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20910645, "step": 11876, "time_per_iteration": 4.235002756118774 }, { "auxiliary_loss_clip": 0.01407453, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.24519241, "balance_loss_mlp": 1.01266956, "epoch": 0.7140838719374718, "flos": 16845895658880.0, "grad_norm": 1.9910910894419696, "language_loss": 0.79182315, "learning_rate": 7.978409817849079e-07, "loss": 0.81622148, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19714355, "step": 11877, "time_per_iteration": 4.257075071334839 }, { "auxiliary_loss_clip": 0.01399459, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.24019647, "balance_loss_mlp": 1.01448011, "epoch": 0.7141439951901398, "flos": 21151847819520.0, "grad_norm": 1.8978381561413629, "language_loss": 0.70500004, "learning_rate": 7.97529750998934e-07, "loss": 0.72933048, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19104004, "step": 11878, "time_per_iteration": 2.825953960418701 }, { "auxiliary_loss_clip": 0.01392228, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.23268843, "balance_loss_mlp": 1.01359463, "epoch": 0.7142041184428077, "flos": 24728496744960.0, "grad_norm": 1.784277592129552, "language_loss": 0.68398649, "learning_rate": 7.972185658107535e-07, "loss": 0.70823652, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19177246, "step": 11879, "time_per_iteration": 2.860764503479004 }, { "auxiliary_loss_clip": 0.01409106, "auxiliary_loss_mlp": 0.01032801, "balance_loss_clip": 1.24622464, "balance_loss_mlp": 1.01199913, "epoch": 0.7142642416954758, "flos": 21918233808000.0, "grad_norm": 1.539876365836101, "language_loss": 0.70437062, "learning_rate": 7.969074262321646e-07, "loss": 0.72878969, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20800781, "step": 11880, "time_per_iteration": 2.861961841583252 }, { "auxiliary_loss_clip": 0.01415807, "auxiliary_loss_mlp": 0.01032435, "balance_loss_clip": 1.25077081, "balance_loss_mlp": 1.01225281, "epoch": 0.7143243649481437, "flos": 20813307091200.0, "grad_norm": 2.886495936349849, "language_loss": 0.81842834, "learning_rate": 7.965963322749674e-07, "loss": 0.84291077, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.2019043, "step": 11881, "time_per_iteration": 2.8490889072418213 }, { "auxiliary_loss_clip": 0.01407322, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.24687004, "balance_loss_mlp": 1.01636076, "epoch": 0.7143844882008117, "flos": 27246395911680.0, "grad_norm": 1.4940297895247827, "language_loss": 0.6476984, "learning_rate": 7.962852839509579e-07, "loss": 0.67212903, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19384766, "step": 11882, "time_per_iteration": 2.920353889465332 }, { "auxiliary_loss_clip": 0.01411356, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.24781466, "balance_loss_mlp": 1.01426506, "epoch": 0.7144446114534796, "flos": 17938199076480.0, "grad_norm": 1.79216006948654, "language_loss": 0.69777381, "learning_rate": 7.959742812719304e-07, "loss": 0.7222259, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19592285, "step": 11883, "time_per_iteration": 2.87205171585083 }, { "auxiliary_loss_clip": 0.01401513, "auxiliary_loss_mlp": 0.01037488, "balance_loss_clip": 1.24269962, "balance_loss_mlp": 1.01576829, "epoch": 0.7145047347061476, "flos": 20750769498240.0, "grad_norm": 2.537575158326298, "language_loss": 0.78807414, "learning_rate": 7.956633242496788e-07, "loss": 0.81246418, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.21728516, "step": 11884, "time_per_iteration": 2.8900973796844482 }, { "auxiliary_loss_clip": 0.01421604, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.25225091, "balance_loss_mlp": 1.01353455, "epoch": 0.7145648579588155, "flos": 21188478124800.0, "grad_norm": 2.252166128658085, "language_loss": 0.74720728, "learning_rate": 7.953524128959954e-07, "loss": 0.77176309, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 1.69335938, "router_z_loss_mlp": 0.20446777, "step": 11885, "time_per_iteration": 2.8393328189849854 }, { "auxiliary_loss_clip": 0.01186402, "auxiliary_loss_mlp": 0.0103642, "balance_loss_clip": 1.09880733, "balance_loss_mlp": 1.01429498, "epoch": 0.7146249812114835, "flos": 64816610843520.0, "grad_norm": 0.903022298548721, "language_loss": 0.66419291, "learning_rate": 7.95041547222669e-07, "loss": 0.68642116, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.22167969, "step": 11886, "time_per_iteration": 3.3299286365509033 }, { "auxiliary_loss_clip": 0.01392891, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.23418307, "balance_loss_mlp": 1.01614285, "epoch": 0.7146851044641516, "flos": 18122979415680.0, "grad_norm": 1.634049749147538, "language_loss": 0.75023335, "learning_rate": 7.947307272414874e-07, "loss": 0.77451992, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19616699, "step": 11887, "time_per_iteration": 2.8364665508270264 }, { "auxiliary_loss_clip": 0.01403136, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.241817, "balance_loss_mlp": 1.01367474, "epoch": 0.7147452277168195, "flos": 19252953768960.0, "grad_norm": 1.6784574291172119, "language_loss": 0.72914195, "learning_rate": 7.944199529642372e-07, "loss": 0.7534951, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18493652, "step": 11888, "time_per_iteration": 2.8615500926971436 }, { "auxiliary_loss_clip": 0.01406495, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.24182856, "balance_loss_mlp": 1.0141573, "epoch": 0.7148053509694875, "flos": 23774118036480.0, "grad_norm": 4.224813764072091, "language_loss": 0.84830153, "learning_rate": 7.941092244027041e-07, "loss": 0.87271416, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20629883, "step": 11889, "time_per_iteration": 2.8795015811920166 }, { "auxiliary_loss_clip": 0.0141782, "auxiliary_loss_mlp": 0.01030579, "balance_loss_clip": 1.25433016, "balance_loss_mlp": 1.01080203, "epoch": 0.7148654742221554, "flos": 22493776654080.0, "grad_norm": 1.766416341720488, "language_loss": 0.76729727, "learning_rate": 7.937985415686695e-07, "loss": 0.79178119, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19750977, "step": 11890, "time_per_iteration": 2.873389482498169 }, { "auxiliary_loss_clip": 0.01405066, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.24605179, "balance_loss_mlp": 1.01394236, "epoch": 0.7149255974748234, "flos": 24689468465280.0, "grad_norm": 1.720158408868567, "language_loss": 0.74482131, "learning_rate": 7.934879044739147e-07, "loss": 0.7691986, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18725586, "step": 11891, "time_per_iteration": 2.87640380859375 }, { "auxiliary_loss_clip": 0.01410914, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.24821889, "balance_loss_mlp": 1.01746893, "epoch": 0.7149857207274913, "flos": 18414709758720.0, "grad_norm": 2.439270402080044, "language_loss": 0.68986821, "learning_rate": 7.931773131302211e-07, "loss": 0.71434855, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1965332, "step": 11892, "time_per_iteration": 2.9135901927948 }, { "auxiliary_loss_clip": 0.01419555, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.25399947, "balance_loss_mlp": 1.01375151, "epoch": 0.7150458439801594, "flos": 24979434261120.0, "grad_norm": 1.8175041671100156, "language_loss": 0.73977792, "learning_rate": 7.928667675493632e-07, "loss": 0.76430833, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19714355, "step": 11893, "time_per_iteration": 2.8651673793792725 }, { "auxiliary_loss_clip": 0.01415811, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.24989748, "balance_loss_mlp": 1.01198494, "epoch": 0.7151059672328273, "flos": 16699464927360.0, "grad_norm": 2.1910436129154345, "language_loss": 0.67871201, "learning_rate": 7.925562677431185e-07, "loss": 0.70319045, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20043945, "step": 11894, "time_per_iteration": 2.8403236865997314 }, { "auxiliary_loss_clip": 0.01414271, "auxiliary_loss_mlp": 0.01034918, "balance_loss_clip": 1.24994528, "balance_loss_mlp": 1.01562977, "epoch": 0.7151660904854953, "flos": 27283297685760.0, "grad_norm": 1.8265521324639244, "language_loss": 0.78251493, "learning_rate": 7.922458137232613e-07, "loss": 0.80700684, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19287109, "step": 11895, "time_per_iteration": 2.8903191089630127 }, { "auxiliary_loss_clip": 0.01415059, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.25162816, "balance_loss_mlp": 1.01212716, "epoch": 0.7152262137381632, "flos": 18341268168960.0, "grad_norm": 1.846639313658224, "language_loss": 0.70365179, "learning_rate": 7.919354055015643e-07, "loss": 0.72812414, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20056152, "step": 11896, "time_per_iteration": 2.8300161361694336 }, { "auxiliary_loss_clip": 0.01421131, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.25608957, "balance_loss_mlp": 1.01451457, "epoch": 0.7152863369908312, "flos": 21809428767360.0, "grad_norm": 1.6110674992239924, "language_loss": 0.87358558, "learning_rate": 7.91625043089798e-07, "loss": 0.89814472, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20251465, "step": 11897, "time_per_iteration": 2.8574132919311523 }, { "auxiliary_loss_clip": 0.01404436, "auxiliary_loss_mlp": 0.01036843, "balance_loss_clip": 1.24529409, "balance_loss_mlp": 1.01718569, "epoch": 0.7153464602434991, "flos": 22167180552960.0, "grad_norm": 1.8534004569680465, "language_loss": 0.7898466, "learning_rate": 7.913147264997304e-07, "loss": 0.81425941, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19665527, "step": 11898, "time_per_iteration": 2.8330564498901367 }, { "auxiliary_loss_clip": 0.01421493, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.25576329, "balance_loss_mlp": 1.01209617, "epoch": 0.7154065834961671, "flos": 24726415484160.0, "grad_norm": 1.8488722559881847, "language_loss": 0.73680377, "learning_rate": 7.910044557431302e-07, "loss": 0.76134038, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20068359, "step": 11899, "time_per_iteration": 2.900855541229248 }, { "auxiliary_loss_clip": 0.01408006, "auxiliary_loss_mlp": 0.0103781, "balance_loss_clip": 1.24612391, "balance_loss_mlp": 1.01661444, "epoch": 0.7154667067488351, "flos": 22611721144320.0, "grad_norm": 2.9786356583343414, "language_loss": 0.76535571, "learning_rate": 7.906942308317614e-07, "loss": 0.78981388, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.21203613, "step": 11900, "time_per_iteration": 4.311737775802612 }, { "auxiliary_loss_clip": 0.01419782, "auxiliary_loss_mlp": 0.01035938, "balance_loss_clip": 1.2564503, "balance_loss_mlp": 1.01630402, "epoch": 0.7155268300015031, "flos": 18780560363520.0, "grad_norm": 2.427425861595831, "language_loss": 0.8170265, "learning_rate": 7.903840517773886e-07, "loss": 0.84158373, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19628906, "step": 11901, "time_per_iteration": 2.837871789932251 }, { "auxiliary_loss_clip": 0.01424194, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.25726807, "balance_loss_mlp": 1.01886439, "epoch": 0.7155869532541711, "flos": 18305497514880.0, "grad_norm": 1.9337695962026336, "language_loss": 0.82645464, "learning_rate": 7.900739185917744e-07, "loss": 0.85109282, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.2076416, "step": 11902, "time_per_iteration": 2.9118950366973877 }, { "auxiliary_loss_clip": 0.01410632, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.24833226, "balance_loss_mlp": 1.01156545, "epoch": 0.715647076506839, "flos": 11987412272640.0, "grad_norm": 1.6864729313550786, "language_loss": 0.68447858, "learning_rate": 7.897638312866785e-07, "loss": 0.70888722, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18664551, "step": 11903, "time_per_iteration": 2.915066957473755 }, { "auxiliary_loss_clip": 0.01407538, "auxiliary_loss_mlp": 0.01036105, "balance_loss_clip": 1.24663043, "balance_loss_mlp": 1.01703119, "epoch": 0.715707199759507, "flos": 18960589998720.0, "grad_norm": 1.840805857497778, "language_loss": 0.76499647, "learning_rate": 7.894537898738589e-07, "loss": 0.78943288, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1907959, "step": 11904, "time_per_iteration": 2.861319065093994 }, { "auxiliary_loss_clip": 0.01414866, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.2528559, "balance_loss_mlp": 1.01463652, "epoch": 0.7157673230121749, "flos": 15312761723520.0, "grad_norm": 1.8237726050077288, "language_loss": 0.72345072, "learning_rate": 7.891437943650727e-07, "loss": 0.74795151, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20556641, "step": 11905, "time_per_iteration": 2.827944755554199 }, { "auxiliary_loss_clip": 0.01409157, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.24757349, "balance_loss_mlp": 1.01470041, "epoch": 0.715827446264843, "flos": 23232264583680.0, "grad_norm": 2.7150081064598237, "language_loss": 0.78781867, "learning_rate": 7.88833844772076e-07, "loss": 0.81225491, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19775391, "step": 11906, "time_per_iteration": 2.877495527267456 }, { "auxiliary_loss_clip": 0.01191011, "auxiliary_loss_mlp": 0.01034008, "balance_loss_clip": 1.09997535, "balance_loss_mlp": 1.01493478, "epoch": 0.7158875695175109, "flos": 61002715593600.0, "grad_norm": 0.7330394394348556, "language_loss": 0.55317008, "learning_rate": 7.885239411066205e-07, "loss": 0.57542032, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.19042969, "step": 11907, "time_per_iteration": 3.300475597381592 }, { "auxiliary_loss_clip": 0.01411785, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.24897766, "balance_loss_mlp": 1.01199019, "epoch": 0.7159476927701789, "flos": 17137626001920.0, "grad_norm": 1.878257541844724, "language_loss": 0.70335257, "learning_rate": 7.882140833804593e-07, "loss": 0.72778255, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19226074, "step": 11908, "time_per_iteration": 4.324196100234985 }, { "auxiliary_loss_clip": 0.01417564, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.25380576, "balance_loss_mlp": 1.01671135, "epoch": 0.7160078160228468, "flos": 22500427639680.0, "grad_norm": 1.6449469300210346, "language_loss": 0.72413468, "learning_rate": 7.879042716053415e-07, "loss": 0.74867463, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19714355, "step": 11909, "time_per_iteration": 2.967806816101074 }, { "auxiliary_loss_clip": 0.0141392, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.24906731, "balance_loss_mlp": 1.01428199, "epoch": 0.7160679392755148, "flos": 30602991536640.0, "grad_norm": 1.529464293499408, "language_loss": 0.75419581, "learning_rate": 7.875945057930144e-07, "loss": 0.77867842, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20056152, "step": 11910, "time_per_iteration": 2.9416685104370117 }, { "auxiliary_loss_clip": 0.01414893, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.25217104, "balance_loss_mlp": 1.01439524, "epoch": 0.7161280625281827, "flos": 21333325288320.0, "grad_norm": 1.4232623975088865, "language_loss": 0.77048266, "learning_rate": 7.872847859552251e-07, "loss": 0.79496557, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18981934, "step": 11911, "time_per_iteration": 4.314856290817261 }, { "auxiliary_loss_clip": 0.01409735, "auxiliary_loss_mlp": 0.01034828, "balance_loss_clip": 1.24570322, "balance_loss_mlp": 1.01281047, "epoch": 0.7161881857808508, "flos": 61880467593600.0, "grad_norm": 2.7533493904714916, "language_loss": 0.59954733, "learning_rate": 7.869751121037192e-07, "loss": 0.62399298, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.22021484, "step": 11912, "time_per_iteration": 4.713810443878174 }, { "auxiliary_loss_clip": 0.01416724, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.25602674, "balance_loss_mlp": 1.01508045, "epoch": 0.7162483090335187, "flos": 20821451155200.0, "grad_norm": 1.608731673717409, "language_loss": 0.7919693, "learning_rate": 7.866654842502376e-07, "loss": 0.81648397, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19665527, "step": 11913, "time_per_iteration": 2.8514459133148193 }, { "auxiliary_loss_clip": 0.01401724, "auxiliary_loss_mlp": 0.01031946, "balance_loss_clip": 1.24287868, "balance_loss_mlp": 1.01373076, "epoch": 0.7163084322861867, "flos": 24108179529600.0, "grad_norm": 1.7280935967464837, "language_loss": 0.75623542, "learning_rate": 7.863559024065234e-07, "loss": 0.78057218, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18212891, "step": 11914, "time_per_iteration": 2.8875882625579834 }, { "auxiliary_loss_clip": 0.01404715, "auxiliary_loss_mlp": 0.01038108, "balance_loss_clip": 1.2457844, "balance_loss_mlp": 1.0180335, "epoch": 0.7163685555388547, "flos": 20089659456000.0, "grad_norm": 1.5871907883057572, "language_loss": 0.74246073, "learning_rate": 7.860463665843143e-07, "loss": 0.76688892, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20056152, "step": 11915, "time_per_iteration": 2.820852756500244 }, { "auxiliary_loss_clip": 0.01404723, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.24206364, "balance_loss_mlp": 1.01458657, "epoch": 0.7164286787915226, "flos": 17465850915840.0, "grad_norm": 1.8230808300916983, "language_loss": 0.81562912, "learning_rate": 7.85736876795349e-07, "loss": 0.84001285, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19067383, "step": 11916, "time_per_iteration": 2.804250955581665 }, { "auxiliary_loss_clip": 0.0141386, "auxiliary_loss_mlp": 0.01037763, "balance_loss_clip": 1.25003958, "balance_loss_mlp": 1.01880872, "epoch": 0.7164888020441906, "flos": 19728333331200.0, "grad_norm": 1.7335578239662366, "language_loss": 0.69299877, "learning_rate": 7.854274330513626e-07, "loss": 0.71751493, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1895752, "step": 11917, "time_per_iteration": 2.829460859298706 }, { "auxiliary_loss_clip": 0.01406175, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.24465692, "balance_loss_mlp": 1.01382422, "epoch": 0.7165489252968585, "flos": 21480479936640.0, "grad_norm": 1.5469212283260947, "language_loss": 0.76277441, "learning_rate": 7.851180353640896e-07, "loss": 0.78716624, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19177246, "step": 11918, "time_per_iteration": 2.8464229106903076 }, { "auxiliary_loss_clip": 0.01183686, "auxiliary_loss_mlp": 0.01057226, "balance_loss_clip": 1.09661198, "balance_loss_mlp": 1.037485, "epoch": 0.7166090485495266, "flos": 69961214217600.0, "grad_norm": 0.6497160109527855, "language_loss": 0.54031402, "learning_rate": 7.848086837452639e-07, "loss": 0.56272316, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.19726562, "step": 11919, "time_per_iteration": 3.40771484375 }, { "auxiliary_loss_clip": 0.01423608, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.25840592, "balance_loss_mlp": 1.0142312, "epoch": 0.7166691718021945, "flos": 27355155707520.0, "grad_norm": 1.555952148634013, "language_loss": 0.69523454, "learning_rate": 7.844993782066132e-07, "loss": 0.71980196, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18908691, "step": 11920, "time_per_iteration": 2.9170873165130615 }, { "auxiliary_loss_clip": 0.01405825, "auxiliary_loss_mlp": 0.01038316, "balance_loss_clip": 1.2434814, "balance_loss_mlp": 1.01726341, "epoch": 0.7167292950548625, "flos": 30420971130240.0, "grad_norm": 3.1501183048146926, "language_loss": 0.75372148, "learning_rate": 7.841901187598678e-07, "loss": 0.77816284, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.21044922, "step": 11921, "time_per_iteration": 2.9334871768951416 }, { "auxiliary_loss_clip": 0.01424426, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.25668526, "balance_loss_mlp": 1.01532757, "epoch": 0.7167894183075304, "flos": 14578526805120.0, "grad_norm": 1.9252039936106935, "language_loss": 0.77435267, "learning_rate": 7.83880905416755e-07, "loss": 0.79896587, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.2154541, "step": 11922, "time_per_iteration": 2.8455607891082764 }, { "auxiliary_loss_clip": 0.01183262, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.09440303, "balance_loss_mlp": 1.01049519, "epoch": 0.7168495415601984, "flos": 64138126798080.0, "grad_norm": 0.7515881654348368, "language_loss": 0.55144417, "learning_rate": 7.83571738189001e-07, "loss": 0.57357049, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18847656, "step": 11923, "time_per_iteration": 3.110062837600708 }, { "auxiliary_loss_clip": 0.0141849, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.2548188, "balance_loss_mlp": 1.01520443, "epoch": 0.7169096648128663, "flos": 24692273642880.0, "grad_norm": 1.520170233637924, "language_loss": 0.77851999, "learning_rate": 7.832626170883279e-07, "loss": 0.80305231, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.1953125, "step": 11924, "time_per_iteration": 2.8854012489318848 }, { "auxiliary_loss_clip": 0.01402354, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.24308622, "balance_loss_mlp": 1.01532507, "epoch": 0.7169697880655344, "flos": 20677192174080.0, "grad_norm": 1.7793255485975457, "language_loss": 0.69067156, "learning_rate": 7.829535421264588e-07, "loss": 0.71504772, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19934082, "step": 11925, "time_per_iteration": 2.912184476852417 }, { "auxiliary_loss_clip": 0.01391624, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.23378241, "balance_loss_mlp": 1.01158273, "epoch": 0.7170299113182023, "flos": 21042545086080.0, "grad_norm": 1.8286109576521405, "language_loss": 0.78056443, "learning_rate": 7.826445133151133e-07, "loss": 0.80479521, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1986084, "step": 11926, "time_per_iteration": 2.9376730918884277 }, { "auxiliary_loss_clip": 0.01420328, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.25257766, "balance_loss_mlp": 1.01977122, "epoch": 0.7170900345708703, "flos": 22903587221760.0, "grad_norm": 2.239676771118708, "language_loss": 0.78257018, "learning_rate": 7.823355306660093e-07, "loss": 0.80716896, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19763184, "step": 11927, "time_per_iteration": 2.8626770973205566 }, { "auxiliary_loss_clip": 0.0140415, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.2444514, "balance_loss_mlp": 1.01387405, "epoch": 0.7171501578235383, "flos": 15525892569600.0, "grad_norm": 1.770847482660701, "language_loss": 0.70304894, "learning_rate": 7.820265941908642e-07, "loss": 0.72742593, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19689941, "step": 11928, "time_per_iteration": 2.859679698944092 }, { "auxiliary_loss_clip": 0.01397982, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.24009359, "balance_loss_mlp": 1.01478684, "epoch": 0.7172102810762062, "flos": 26115471417600.0, "grad_norm": 1.7849279086300927, "language_loss": 0.65657985, "learning_rate": 7.817177039013931e-07, "loss": 0.68090808, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.20068359, "step": 11929, "time_per_iteration": 2.8804585933685303 }, { "auxiliary_loss_clip": 0.01411887, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.24760294, "balance_loss_mlp": 1.01593304, "epoch": 0.7172704043288742, "flos": 21516522059520.0, "grad_norm": 2.4474650673239697, "language_loss": 0.70722395, "learning_rate": 7.81408859809308e-07, "loss": 0.73170626, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20410156, "step": 11930, "time_per_iteration": 2.8679916858673096 }, { "auxiliary_loss_clip": 0.01416221, "auxiliary_loss_mlp": 0.01031752, "balance_loss_clip": 1.25300384, "balance_loss_mlp": 1.01204658, "epoch": 0.7173305275815421, "flos": 18780288894720.0, "grad_norm": 1.8573097687497073, "language_loss": 0.81279081, "learning_rate": 7.811000619263219e-07, "loss": 0.8372705, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19702148, "step": 11931, "time_per_iteration": 2.825329303741455 }, { "auxiliary_loss_clip": 0.01402138, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.24139953, "balance_loss_mlp": 1.01397276, "epoch": 0.7173906508342102, "flos": 16188405200640.0, "grad_norm": 2.2343783371955346, "language_loss": 0.79368877, "learning_rate": 7.80791310264143e-07, "loss": 0.81805134, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.20153809, "step": 11932, "time_per_iteration": 2.8248519897460938 }, { "auxiliary_loss_clip": 0.01396911, "auxiliary_loss_mlp": 0.0103465, "balance_loss_clip": 1.23842323, "balance_loss_mlp": 1.01498055, "epoch": 0.7174507740868781, "flos": 26624857086720.0, "grad_norm": 1.4297053803581805, "language_loss": 0.75664586, "learning_rate": 7.804826048344803e-07, "loss": 0.78096151, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19665527, "step": 11933, "time_per_iteration": 2.9148833751678467 }, { "auxiliary_loss_clip": 0.01422091, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.25367987, "balance_loss_mlp": 1.01605487, "epoch": 0.7175108973395461, "flos": 18439666905600.0, "grad_norm": 2.477630120927513, "language_loss": 0.69965661, "learning_rate": 7.801739456490388e-07, "loss": 0.72424799, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20983887, "step": 11934, "time_per_iteration": 2.780611038208008 }, { "auxiliary_loss_clip": 0.01401477, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.24060798, "balance_loss_mlp": 1.01596236, "epoch": 0.717571020592214, "flos": 23925525696000.0, "grad_norm": 2.012680900594361, "language_loss": 0.86727953, "learning_rate": 7.798653327195237e-07, "loss": 0.89164329, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18920898, "step": 11935, "time_per_iteration": 4.320261240005493 }, { "auxiliary_loss_clip": 0.01407355, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.24599719, "balance_loss_mlp": 1.0154258, "epoch": 0.717631143844882, "flos": 38272009328640.0, "grad_norm": 1.484989665030357, "language_loss": 0.74825472, "learning_rate": 7.795567660576388e-07, "loss": 0.77267349, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19104004, "step": 11936, "time_per_iteration": 3.012108087539673 }, { "auxiliary_loss_clip": 0.01190263, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.09942698, "balance_loss_mlp": 1.01064801, "epoch": 0.7176912670975499, "flos": 65548583521920.0, "grad_norm": 0.7665729671020776, "language_loss": 0.56004906, "learning_rate": 7.79248245675082e-07, "loss": 0.58226418, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.20605469, "step": 11937, "time_per_iteration": 3.3353137969970703 }, { "auxiliary_loss_clip": 0.01416213, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.2520237, "balance_loss_mlp": 1.01878428, "epoch": 0.717751390350218, "flos": 31292497330560.0, "grad_norm": 2.0319996180933044, "language_loss": 0.55127305, "learning_rate": 7.789397715835542e-07, "loss": 0.57582939, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2064209, "step": 11938, "time_per_iteration": 2.9410409927368164 }, { "auxiliary_loss_clip": 0.01387362, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.22966456, "balance_loss_mlp": 1.01324713, "epoch": 0.7178115136028859, "flos": 19866981957120.0, "grad_norm": 1.5977408577492318, "language_loss": 0.7728405, "learning_rate": 7.786313437947527e-07, "loss": 0.79704297, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19641113, "step": 11939, "time_per_iteration": 2.850473403930664 }, { "auxiliary_loss_clip": 0.01188825, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.09815538, "balance_loss_mlp": 1.01240385, "epoch": 0.7178716368555539, "flos": 64381119212160.0, "grad_norm": 0.7648896488320684, "language_loss": 0.61561894, "learning_rate": 7.783229623203738e-07, "loss": 0.63783246, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.20117188, "step": 11940, "time_per_iteration": 3.2522809505462646 }, { "auxiliary_loss_clip": 0.01402972, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.24359143, "balance_loss_mlp": 1.01602077, "epoch": 0.7179317601082219, "flos": 26774364464640.0, "grad_norm": 1.4178391115991138, "language_loss": 0.59386587, "learning_rate": 7.780146271721097e-07, "loss": 0.61825383, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19812012, "step": 11941, "time_per_iteration": 2.916381359100342 }, { "auxiliary_loss_clip": 0.01405698, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.24585986, "balance_loss_mlp": 1.01814294, "epoch": 0.7179918833608898, "flos": 23524311640320.0, "grad_norm": 3.619366730279992, "language_loss": 0.80015951, "learning_rate": 7.777063383616543e-07, "loss": 0.82459295, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19494629, "step": 11942, "time_per_iteration": 4.421423673629761 }, { "auxiliary_loss_clip": 0.01403905, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.24336493, "balance_loss_mlp": 1.01791883, "epoch": 0.7180520066135578, "flos": 17174527776000.0, "grad_norm": 2.1376282864449916, "language_loss": 0.66494781, "learning_rate": 7.773980959006968e-07, "loss": 0.689367, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20117188, "step": 11943, "time_per_iteration": 2.8056344985961914 }, { "auxiliary_loss_clip": 0.01413432, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.25107765, "balance_loss_mlp": 1.01644897, "epoch": 0.7181121298662257, "flos": 17575606097280.0, "grad_norm": 1.8009987264788514, "language_loss": 0.79411644, "learning_rate": 7.770898998009254e-07, "loss": 0.81860435, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18908691, "step": 11944, "time_per_iteration": 2.856820821762085 }, { "auxiliary_loss_clip": 0.01414695, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.2493329, "balance_loss_mlp": 1.01530194, "epoch": 0.7181722531188938, "flos": 11955125468160.0, "grad_norm": 4.4741333562509915, "language_loss": 0.64307666, "learning_rate": 7.767817500740277e-07, "loss": 0.66758859, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.21203613, "step": 11945, "time_per_iteration": 2.9800782203674316 }, { "auxiliary_loss_clip": 0.01188921, "auxiliary_loss_mlp": 0.01026288, "balance_loss_clip": 1.09900093, "balance_loss_mlp": 1.00950325, "epoch": 0.7182323763715617, "flos": 65533155045120.0, "grad_norm": 0.6985574208126251, "language_loss": 0.51131666, "learning_rate": 7.76473646731689e-07, "loss": 0.53346878, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.16796875, "step": 11946, "time_per_iteration": 4.611027956008911 }, { "auxiliary_loss_clip": 0.01414503, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.25044847, "balance_loss_mlp": 1.01606989, "epoch": 0.7182924996242297, "flos": 20640561868800.0, "grad_norm": 1.8058395600311008, "language_loss": 0.75577855, "learning_rate": 7.761655897855925e-07, "loss": 0.7802881, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20373535, "step": 11947, "time_per_iteration": 4.2395124435424805 }, { "auxiliary_loss_clip": 0.01396334, "auxiliary_loss_mlp": 0.01030308, "balance_loss_clip": 1.23650312, "balance_loss_mlp": 1.0101254, "epoch": 0.7183526228768976, "flos": 16224990261120.0, "grad_norm": 1.70415399700799, "language_loss": 0.73311377, "learning_rate": 7.758575792474187e-07, "loss": 0.75738019, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20178223, "step": 11948, "time_per_iteration": 2.8384287357330322 }, { "auxiliary_loss_clip": 0.01414435, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.25273705, "balance_loss_mlp": 1.01598322, "epoch": 0.7184127461295656, "flos": 22241753262720.0, "grad_norm": 1.7335591290150743, "language_loss": 0.71859181, "learning_rate": 7.755496151288483e-07, "loss": 0.74309301, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19714355, "step": 11949, "time_per_iteration": 2.8595261573791504 }, { "auxiliary_loss_clip": 0.01403923, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.24301326, "balance_loss_mlp": 1.01717448, "epoch": 0.7184728693822335, "flos": 27355924869120.0, "grad_norm": 1.9562058886406282, "language_loss": 0.77017343, "learning_rate": 7.752416974415598e-07, "loss": 0.79458141, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19714355, "step": 11950, "time_per_iteration": 2.910169839859009 }, { "auxiliary_loss_clip": 0.01413935, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.25131297, "balance_loss_mlp": 1.01525354, "epoch": 0.7185329926349016, "flos": 16517218296960.0, "grad_norm": 4.185579789541104, "language_loss": 0.68369591, "learning_rate": 7.749338261972282e-07, "loss": 0.70818907, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20117188, "step": 11951, "time_per_iteration": 2.971205234527588 }, { "auxiliary_loss_clip": 0.01403403, "auxiliary_loss_mlp": 0.01036664, "balance_loss_clip": 1.24063289, "balance_loss_mlp": 1.0164938, "epoch": 0.7185931158875695, "flos": 23961522574080.0, "grad_norm": 1.7824014643524613, "language_loss": 0.79126966, "learning_rate": 7.746260014075286e-07, "loss": 0.81567037, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20178223, "step": 11952, "time_per_iteration": 2.8984999656677246 }, { "auxiliary_loss_clip": 0.01424631, "auxiliary_loss_mlp": 0.01036171, "balance_loss_clip": 1.25963545, "balance_loss_mlp": 1.01651323, "epoch": 0.7186532391402375, "flos": 26553270533760.0, "grad_norm": 1.7233292350133835, "language_loss": 0.7569766, "learning_rate": 7.743182230841352e-07, "loss": 0.78158462, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.1965332, "step": 11953, "time_per_iteration": 2.9015767574310303 }, { "auxiliary_loss_clip": 0.01409007, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.24630761, "balance_loss_mlp": 1.01683259, "epoch": 0.7187133623929055, "flos": 22393341901440.0, "grad_norm": 1.6223293660386822, "language_loss": 0.73666042, "learning_rate": 7.740104912387164e-07, "loss": 0.76111633, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1973877, "step": 11954, "time_per_iteration": 2.819594144821167 }, { "auxiliary_loss_clip": 0.01418447, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.25658059, "balance_loss_mlp": 1.01381707, "epoch": 0.7187734856455734, "flos": 15788277020160.0, "grad_norm": 1.6843273353271118, "language_loss": 0.75254118, "learning_rate": 7.737028058829425e-07, "loss": 0.77706254, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19885254, "step": 11955, "time_per_iteration": 2.833716869354248 }, { "auxiliary_loss_clip": 0.01416304, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.25380397, "balance_loss_mlp": 1.01397979, "epoch": 0.7188336088982414, "flos": 31772265638400.0, "grad_norm": 1.7658769922709923, "language_loss": 0.74556792, "learning_rate": 7.733951670284817e-07, "loss": 0.77006519, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19433594, "step": 11956, "time_per_iteration": 2.8969931602478027 }, { "auxiliary_loss_clip": 0.01415626, "auxiliary_loss_mlp": 0.01036029, "balance_loss_clip": 1.2512387, "balance_loss_mlp": 1.0160377, "epoch": 0.7188937321509093, "flos": 21473783706240.0, "grad_norm": 1.6232005260749167, "language_loss": 0.71731782, "learning_rate": 7.730875746869987e-07, "loss": 0.74183434, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19970703, "step": 11957, "time_per_iteration": 2.884120225906372 }, { "auxiliary_loss_clip": 0.01421283, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.25543368, "balance_loss_mlp": 1.01798761, "epoch": 0.7189538554035774, "flos": 27282890482560.0, "grad_norm": 1.8326406325553835, "language_loss": 0.73848474, "learning_rate": 7.727800288701582e-07, "loss": 0.76308769, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.21032715, "step": 11958, "time_per_iteration": 2.864164352416992 }, { "auxiliary_loss_clip": 0.01398971, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.23928618, "balance_loss_mlp": 1.01438034, "epoch": 0.7190139786562453, "flos": 21590778055680.0, "grad_norm": 3.359923473503292, "language_loss": 0.8439188, "learning_rate": 7.724725295896215e-07, "loss": 0.86825413, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.2019043, "step": 11959, "time_per_iteration": 2.843977689743042 }, { "auxiliary_loss_clip": 0.01411469, "auxiliary_loss_mlp": 0.0103419, "balance_loss_clip": 1.24691939, "balance_loss_mlp": 1.01406765, "epoch": 0.7190741019089133, "flos": 26731716600960.0, "grad_norm": 1.5488925325362566, "language_loss": 0.82252312, "learning_rate": 7.7216507685705e-07, "loss": 0.84697974, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20117188, "step": 11960, "time_per_iteration": 2.8928382396698 }, { "auxiliary_loss_clip": 0.01399797, "auxiliary_loss_mlp": 0.01038603, "balance_loss_clip": 1.23925972, "balance_loss_mlp": 1.01867175, "epoch": 0.7191342251615812, "flos": 26116059600000.0, "grad_norm": 3.339234815421593, "language_loss": 0.78627932, "learning_rate": 7.718576706841013e-07, "loss": 0.81066334, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19934082, "step": 11961, "time_per_iteration": 2.8886313438415527 }, { "auxiliary_loss_clip": 0.01397397, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.24015141, "balance_loss_mlp": 1.01674974, "epoch": 0.7191943484142492, "flos": 22977526504320.0, "grad_norm": 1.4693010261022863, "language_loss": 0.75775218, "learning_rate": 7.715503110824326e-07, "loss": 0.78208232, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.1887207, "step": 11962, "time_per_iteration": 2.862276315689087 }, { "auxiliary_loss_clip": 0.01405204, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.24363804, "balance_loss_mlp": 1.01178527, "epoch": 0.7192544716669171, "flos": 22575633776640.0, "grad_norm": 1.7206152717062972, "language_loss": 0.75658238, "learning_rate": 7.712429980637001e-07, "loss": 0.78095222, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.20007324, "step": 11963, "time_per_iteration": 2.8644957542419434 }, { "auxiliary_loss_clip": 0.01428831, "auxiliary_loss_mlp": 0.01036684, "balance_loss_clip": 1.26173472, "balance_loss_mlp": 1.01531005, "epoch": 0.7193145949195852, "flos": 18989619177600.0, "grad_norm": 5.676084484264525, "language_loss": 0.81498903, "learning_rate": 7.709357316395564e-07, "loss": 0.83964419, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.21374512, "step": 11964, "time_per_iteration": 2.83955717086792 }, { "auxiliary_loss_clip": 0.01396375, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.23724771, "balance_loss_mlp": 1.01537979, "epoch": 0.7193747181722531, "flos": 18013269479040.0, "grad_norm": 3.0495531227293853, "language_loss": 0.7539072, "learning_rate": 7.70628511821652e-07, "loss": 0.77823049, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20568848, "step": 11965, "time_per_iteration": 2.8270442485809326 }, { "auxiliary_loss_clip": 0.01421231, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.25594354, "balance_loss_mlp": 1.01271331, "epoch": 0.7194348414249211, "flos": 24400136096640.0, "grad_norm": 1.548098992307941, "language_loss": 0.77869529, "learning_rate": 7.703213386216377e-07, "loss": 0.80324638, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.21166992, "step": 11966, "time_per_iteration": 2.9676949977874756 }, { "auxiliary_loss_clip": 0.01409455, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.24638867, "balance_loss_mlp": 1.01076853, "epoch": 0.7194949646775891, "flos": 22173514824960.0, "grad_norm": 1.7279475819709809, "language_loss": 0.73880231, "learning_rate": 7.700142120511619e-07, "loss": 0.76320988, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20532227, "step": 11967, "time_per_iteration": 2.829530954360962 }, { "auxiliary_loss_clip": 0.01391086, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.23617303, "balance_loss_mlp": 1.01438808, "epoch": 0.719555087930257, "flos": 20275978118400.0, "grad_norm": 1.9138378673063465, "language_loss": 0.8255083, "learning_rate": 7.6970713212187e-07, "loss": 0.84975225, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18896484, "step": 11968, "time_per_iteration": 2.812560558319092 }, { "auxiliary_loss_clip": 0.01401217, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.2404201, "balance_loss_mlp": 1.01501048, "epoch": 0.719615211182925, "flos": 24726732197760.0, "grad_norm": 1.9204034875874365, "language_loss": 0.77093595, "learning_rate": 7.69400098845407e-07, "loss": 0.79529756, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19921875, "step": 11969, "time_per_iteration": 2.9130172729492188 }, { "auxiliary_loss_clip": 0.01406549, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.2435987, "balance_loss_mlp": 1.01084745, "epoch": 0.719675334435593, "flos": 20018842064640.0, "grad_norm": 1.4923261589684718, "language_loss": 0.71483958, "learning_rate": 7.69093112233417e-07, "loss": 0.73921841, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20495605, "step": 11970, "time_per_iteration": 4.249422550201416 }, { "auxiliary_loss_clip": 0.01187313, "auxiliary_loss_mlp": 0.01025098, "balance_loss_clip": 1.09744716, "balance_loss_mlp": 1.00373578, "epoch": 0.719735457688261, "flos": 44224850131200.0, "grad_norm": 1.0233104450737351, "language_loss": 0.60879397, "learning_rate": 7.68786172297538e-07, "loss": 0.63091803, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.21386719, "step": 11971, "time_per_iteration": 3.2314682006835938 }, { "auxiliary_loss_clip": 0.01418368, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.25016236, "balance_loss_mlp": 1.01393151, "epoch": 0.7197955809409289, "flos": 16812296755200.0, "grad_norm": 4.465347360436265, "language_loss": 0.80638188, "learning_rate": 7.684792790494105e-07, "loss": 0.83091497, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.21008301, "step": 11972, "time_per_iteration": 2.883413553237915 }, { "auxiliary_loss_clip": 0.01417634, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.25393271, "balance_loss_mlp": 1.01802373, "epoch": 0.7198557041935969, "flos": 24546159624960.0, "grad_norm": 1.5912028382832362, "language_loss": 0.76938438, "learning_rate": 7.681724325006733e-07, "loss": 0.79394758, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20654297, "step": 11973, "time_per_iteration": 2.874927520751953 }, { "auxiliary_loss_clip": 0.0119325, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 1.10216999, "balance_loss_mlp": 1.00311589, "epoch": 0.7199158274462648, "flos": 70739029157760.0, "grad_norm": 0.8654038751513043, "language_loss": 0.57360011, "learning_rate": 7.6786563266296e-07, "loss": 0.59581363, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.25, "step": 11974, "time_per_iteration": 3.129164457321167 }, { "auxiliary_loss_clip": 0.01407582, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.24397469, "balance_loss_mlp": 1.01486754, "epoch": 0.7199759506989328, "flos": 29359099480320.0, "grad_norm": 3.7020224967308764, "language_loss": 0.62099618, "learning_rate": 7.675588795479062e-07, "loss": 0.64542693, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.20617676, "step": 11975, "time_per_iteration": 2.9228219985961914 }, { "auxiliary_loss_clip": 0.01394496, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.23315108, "balance_loss_mlp": 1.01416409, "epoch": 0.7200360739516007, "flos": 24650123472000.0, "grad_norm": 2.2323457217044087, "language_loss": 0.68472576, "learning_rate": 7.672521731671425e-07, "loss": 0.70900762, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19519043, "step": 11976, "time_per_iteration": 2.9086384773254395 }, { "auxiliary_loss_clip": 0.0142428, "auxiliary_loss_mlp": 0.01043575, "balance_loss_clip": 1.26061988, "balance_loss_mlp": 1.02398932, "epoch": 0.7200961972042688, "flos": 20822401296000.0, "grad_norm": 1.7199583212527683, "language_loss": 0.67593133, "learning_rate": 7.669455135323004e-07, "loss": 0.70060992, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19592285, "step": 11977, "time_per_iteration": 2.898306369781494 }, { "auxiliary_loss_clip": 0.01421175, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.2557354, "balance_loss_mlp": 1.01818407, "epoch": 0.7201563204569367, "flos": 31257676817280.0, "grad_norm": 1.5296240665929364, "language_loss": 0.76113594, "learning_rate": 7.666389006550074e-07, "loss": 0.78574288, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.21337891, "step": 11978, "time_per_iteration": 4.454826831817627 }, { "auxiliary_loss_clip": 0.01405635, "auxiliary_loss_mlp": 0.01035, "balance_loss_clip": 1.24525166, "balance_loss_mlp": 1.01462758, "epoch": 0.7202164437096047, "flos": 26662663756800.0, "grad_norm": 1.9988329201155917, "language_loss": 0.79179418, "learning_rate": 7.663323345468908e-07, "loss": 0.81620055, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.20373535, "step": 11979, "time_per_iteration": 2.8819196224212646 }, { "auxiliary_loss_clip": 0.01404029, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.2433418, "balance_loss_mlp": 1.01633775, "epoch": 0.7202765669622727, "flos": 25971076702080.0, "grad_norm": 2.1941896831564662, "language_loss": 0.65738094, "learning_rate": 7.660258152195767e-07, "loss": 0.68178952, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20495605, "step": 11980, "time_per_iteration": 2.946620464324951 }, { "auxiliary_loss_clip": 0.01411356, "auxiliary_loss_mlp": 0.0104237, "balance_loss_clip": 1.24884248, "balance_loss_mlp": 1.02178264, "epoch": 0.7203366902149406, "flos": 28524158340480.0, "grad_norm": 1.875498176911067, "language_loss": 0.67897373, "learning_rate": 7.657193426846871e-07, "loss": 0.703511, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20581055, "step": 11981, "time_per_iteration": 4.352744817733765 }, { "auxiliary_loss_clip": 0.01410584, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.24653459, "balance_loss_mlp": 1.01855862, "epoch": 0.7203968134676086, "flos": 21115986675840.0, "grad_norm": 1.85972514342076, "language_loss": 0.74541646, "learning_rate": 7.65412916953843e-07, "loss": 0.76991051, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20251465, "step": 11982, "time_per_iteration": 4.274494886398315 }, { "auxiliary_loss_clip": 0.01407002, "auxiliary_loss_mlp": 0.01042566, "balance_loss_clip": 1.24355578, "balance_loss_mlp": 1.02295637, "epoch": 0.7204569367202766, "flos": 18341132434560.0, "grad_norm": 2.132663860023864, "language_loss": 0.6653893, "learning_rate": 7.65106538038665e-07, "loss": 0.68988496, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19616699, "step": 11983, "time_per_iteration": 2.811666250228882 }, { "auxiliary_loss_clip": 0.01400626, "auxiliary_loss_mlp": 0.01042962, "balance_loss_clip": 1.23788953, "balance_loss_mlp": 1.02120638, "epoch": 0.7205170599729446, "flos": 23264687122560.0, "grad_norm": 1.5090609352616173, "language_loss": 0.67136019, "learning_rate": 7.648002059507715e-07, "loss": 0.69579607, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.21765137, "step": 11984, "time_per_iteration": 2.849229574203491 }, { "auxiliary_loss_clip": 0.01426418, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.26206231, "balance_loss_mlp": 1.02197552, "epoch": 0.7205771832256125, "flos": 20130588017280.0, "grad_norm": 1.5254404313229588, "language_loss": 0.74783057, "learning_rate": 7.644939207017771e-07, "loss": 0.77252537, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.21105957, "step": 11985, "time_per_iteration": 2.904045581817627 }, { "auxiliary_loss_clip": 0.01414378, "auxiliary_loss_mlp": 0.01039601, "balance_loss_clip": 1.253443, "balance_loss_mlp": 1.02022958, "epoch": 0.7206373064782805, "flos": 27713178961920.0, "grad_norm": 1.7009508826955413, "language_loss": 0.6375941, "learning_rate": 7.641876823032977e-07, "loss": 0.66213393, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19372559, "step": 11986, "time_per_iteration": 2.8900091648101807 }, { "auxiliary_loss_clip": 0.01411913, "auxiliary_loss_mlp": 0.01038863, "balance_loss_clip": 1.24756312, "balance_loss_mlp": 1.01801324, "epoch": 0.7206974297309484, "flos": 17977272600960.0, "grad_norm": 1.5502530157132937, "language_loss": 0.72902298, "learning_rate": 7.638814907669455e-07, "loss": 0.75353074, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.20837402, "step": 11987, "time_per_iteration": 2.978158950805664 }, { "auxiliary_loss_clip": 0.01411927, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.24633837, "balance_loss_mlp": 1.01764274, "epoch": 0.7207575529836164, "flos": 16992054921600.0, "grad_norm": 1.7315809203291026, "language_loss": 0.79001915, "learning_rate": 7.635753461043301e-07, "loss": 0.81450725, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19238281, "step": 11988, "time_per_iteration": 2.830598831176758 }, { "auxiliary_loss_clip": 0.01412157, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.24923849, "balance_loss_mlp": 1.01557446, "epoch": 0.7208176762362843, "flos": 18735152567040.0, "grad_norm": 1.629029539285493, "language_loss": 0.78936625, "learning_rate": 7.632692483270618e-07, "loss": 0.81385994, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.21643066, "step": 11989, "time_per_iteration": 2.854494333267212 }, { "auxiliary_loss_clip": 0.01391031, "auxiliary_loss_mlp": 0.01035395, "balance_loss_clip": 1.23122692, "balance_loss_mlp": 1.01502216, "epoch": 0.7208777994889524, "flos": 18743839568640.0, "grad_norm": 1.8314500504593056, "language_loss": 0.83624172, "learning_rate": 7.629631974467481e-07, "loss": 0.86050606, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.20373535, "step": 11990, "time_per_iteration": 2.8515193462371826 }, { "auxiliary_loss_clip": 0.0140189, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.23954356, "balance_loss_mlp": 1.01718116, "epoch": 0.7209379227416203, "flos": 14801611507200.0, "grad_norm": 2.9078216756949993, "language_loss": 0.76712954, "learning_rate": 7.626571934749931e-07, "loss": 0.79151744, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19726562, "step": 11991, "time_per_iteration": 2.871149778366089 }, { "auxiliary_loss_clip": 0.01387204, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.22974777, "balance_loss_mlp": 1.01496959, "epoch": 0.7209980459942883, "flos": 29647255484160.0, "grad_norm": 1.4490511274849567, "language_loss": 0.73038077, "learning_rate": 7.623512364234022e-07, "loss": 0.75459909, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1965332, "step": 11992, "time_per_iteration": 2.9448790550231934 }, { "auxiliary_loss_clip": 0.01409751, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.24419761, "balance_loss_mlp": 1.01196074, "epoch": 0.7210581692469563, "flos": 23487636090240.0, "grad_norm": 3.115065893758335, "language_loss": 0.67493081, "learning_rate": 7.620453263035755e-07, "loss": 0.69934368, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19580078, "step": 11993, "time_per_iteration": 2.917584180831909 }, { "auxiliary_loss_clip": 0.01413524, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.25084138, "balance_loss_mlp": 1.01227176, "epoch": 0.7211182924996242, "flos": 26109861062400.0, "grad_norm": 1.7395404404584849, "language_loss": 0.67173314, "learning_rate": 7.61739463127115e-07, "loss": 0.69619608, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20495605, "step": 11994, "time_per_iteration": 2.93839955329895 }, { "auxiliary_loss_clip": 0.01414187, "auxiliary_loss_mlp": 0.01033569, "balance_loss_clip": 1.25010037, "balance_loss_mlp": 1.01226604, "epoch": 0.7211784157522922, "flos": 17720950953600.0, "grad_norm": 1.7648009416578843, "language_loss": 0.67556012, "learning_rate": 7.614336469056172e-07, "loss": 0.70003766, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.2130127, "step": 11995, "time_per_iteration": 2.803344488143921 }, { "auxiliary_loss_clip": 0.01394544, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.23583674, "balance_loss_mlp": 1.0098505, "epoch": 0.7212385390049602, "flos": 24433780245120.0, "grad_norm": 1.6905210023185984, "language_loss": 0.80081379, "learning_rate": 7.6112787765068e-07, "loss": 0.82506764, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.20983887, "step": 11996, "time_per_iteration": 2.8795478343963623 }, { "auxiliary_loss_clip": 0.01414615, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.25080538, "balance_loss_mlp": 1.01401424, "epoch": 0.7212986622576282, "flos": 28158895918080.0, "grad_norm": 2.650346266504728, "language_loss": 0.82489169, "learning_rate": 7.60822155373899e-07, "loss": 0.8493706, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19238281, "step": 11997, "time_per_iteration": 2.893066167831421 }, { "auxiliary_loss_clip": 0.01413551, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.24763346, "balance_loss_mlp": 1.0142746, "epoch": 0.7213587855102961, "flos": 21845923338240.0, "grad_norm": 2.142757407226953, "language_loss": 0.68171251, "learning_rate": 7.605164800868646e-07, "loss": 0.70620668, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21594238, "step": 11998, "time_per_iteration": 2.8460843563079834 }, { "auxiliary_loss_clip": 0.01409046, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.24684238, "balance_loss_mlp": 1.014503, "epoch": 0.7214189087629641, "flos": 14619817324800.0, "grad_norm": 2.024940766347357, "language_loss": 0.72797835, "learning_rate": 7.602108518011696e-07, "loss": 0.75240487, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19104004, "step": 11999, "time_per_iteration": 2.8100616931915283 }, { "auxiliary_loss_clip": 0.01407658, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.24523973, "balance_loss_mlp": 1.01279616, "epoch": 0.721479032015632, "flos": 19400289396480.0, "grad_norm": 5.687019692259008, "language_loss": 0.84398949, "learning_rate": 7.599052705284039e-07, "loss": 0.86839616, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20202637, "step": 12000, "time_per_iteration": 2.8108911514282227 }, { "auxiliary_loss_clip": 0.01412044, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.249228, "balance_loss_mlp": 1.01774728, "epoch": 0.7215391552683, "flos": 18521478783360.0, "grad_norm": 2.054086719660786, "language_loss": 0.78311515, "learning_rate": 7.59599736280154e-07, "loss": 0.80760592, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19287109, "step": 12001, "time_per_iteration": 2.8439884185791016 }, { "auxiliary_loss_clip": 0.01395886, "auxiliary_loss_mlp": 0.0103748, "balance_loss_clip": 1.23792863, "balance_loss_mlp": 1.01740551, "epoch": 0.721599278520968, "flos": 23269256847360.0, "grad_norm": 1.6650387846520809, "language_loss": 0.82773691, "learning_rate": 7.592942490680066e-07, "loss": 0.85207057, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.20068359, "step": 12002, "time_per_iteration": 2.9147582054138184 }, { "auxiliary_loss_clip": 0.01417614, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.25387943, "balance_loss_mlp": 1.01458466, "epoch": 0.721659401773636, "flos": 39212678862720.0, "grad_norm": 1.9342406308144207, "language_loss": 0.63340199, "learning_rate": 7.589888089035462e-07, "loss": 0.65792465, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20068359, "step": 12003, "time_per_iteration": 3.043604612350464 }, { "auxiliary_loss_clip": 0.01423032, "auxiliary_loss_mlp": 0.01037613, "balance_loss_clip": 1.25745952, "balance_loss_mlp": 1.01738334, "epoch": 0.7217195250263039, "flos": 14948268462720.0, "grad_norm": 2.4014697920967123, "language_loss": 0.69294143, "learning_rate": 7.586834157983544e-07, "loss": 0.71754789, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20239258, "step": 12004, "time_per_iteration": 2.837273359298706 }, { "auxiliary_loss_clip": 0.01188606, "auxiliary_loss_mlp": 0.01027737, "balance_loss_clip": 1.10000408, "balance_loss_mlp": 1.00103366, "epoch": 0.7217796482789719, "flos": 70900662142080.0, "grad_norm": 0.8579453607702863, "language_loss": 0.54102492, "learning_rate": 7.583780697640112e-07, "loss": 0.56318831, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.26757812, "step": 12005, "time_per_iteration": 4.741447687149048 }, { "auxiliary_loss_clip": 0.01407996, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.24528956, "balance_loss_mlp": 1.01889348, "epoch": 0.7218397715316398, "flos": 37465690164480.0, "grad_norm": 1.5219384071488482, "language_loss": 0.63641822, "learning_rate": 7.580727708120962e-07, "loss": 0.6608932, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20617676, "step": 12006, "time_per_iteration": 2.9969661235809326 }, { "auxiliary_loss_clip": 0.01425169, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.25974584, "balance_loss_mlp": 1.01769638, "epoch": 0.7218998947843078, "flos": 22720842898560.0, "grad_norm": 2.4274958122345627, "language_loss": 0.91629338, "learning_rate": 7.577675189541865e-07, "loss": 0.94091791, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19580078, "step": 12007, "time_per_iteration": 2.8227572441101074 }, { "auxiliary_loss_clip": 0.01410285, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.24690282, "balance_loss_mlp": 1.01280403, "epoch": 0.7219600180369758, "flos": 12174862055040.0, "grad_norm": 1.6699485456748728, "language_loss": 0.6443758, "learning_rate": 7.574623142018568e-07, "loss": 0.66881835, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.21166992, "step": 12008, "time_per_iteration": 2.842782497406006 }, { "auxiliary_loss_clip": 0.01416329, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.2509433, "balance_loss_mlp": 1.01067567, "epoch": 0.7220201412896438, "flos": 22605522606720.0, "grad_norm": 2.7769349905231806, "language_loss": 0.78757954, "learning_rate": 7.57157156566681e-07, "loss": 0.81204474, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19519043, "step": 12009, "time_per_iteration": 2.863978624343872 }, { "auxiliary_loss_clip": 0.01424908, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.25892437, "balance_loss_mlp": 1.0129354, "epoch": 0.7220802645423118, "flos": 26728458975360.0, "grad_norm": 4.611855756354056, "language_loss": 0.64231282, "learning_rate": 7.568520460602297e-07, "loss": 0.66689318, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.20178223, "step": 12010, "time_per_iteration": 2.8872697353363037 }, { "auxiliary_loss_clip": 0.01412831, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.2507031, "balance_loss_mlp": 1.01100266, "epoch": 0.7221403877949797, "flos": 24429889192320.0, "grad_norm": 1.8721531601217651, "language_loss": 0.78275502, "learning_rate": 7.565469826940742e-07, "loss": 0.80718935, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19592285, "step": 12011, "time_per_iteration": 2.8706040382385254 }, { "auxiliary_loss_clip": 0.01408116, "auxiliary_loss_mlp": 0.0103219, "balance_loss_clip": 1.24746251, "balance_loss_mlp": 1.01368868, "epoch": 0.7222005110476477, "flos": 23525035557120.0, "grad_norm": 1.7440084794151987, "language_loss": 0.80270994, "learning_rate": 7.56241966479781e-07, "loss": 0.82711303, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18505859, "step": 12012, "time_per_iteration": 2.9151716232299805 }, { "auxiliary_loss_clip": 0.01405224, "auxiliary_loss_mlp": 0.01031337, "balance_loss_clip": 1.24251616, "balance_loss_mlp": 1.01221561, "epoch": 0.7222606343003156, "flos": 23122961850240.0, "grad_norm": 2.79803032023028, "language_loss": 0.76562667, "learning_rate": 7.559369974289171e-07, "loss": 0.78999227, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19128418, "step": 12013, "time_per_iteration": 4.249547481536865 }, { "auxiliary_loss_clip": 0.01409403, "auxiliary_loss_mlp": 0.01031887, "balance_loss_clip": 1.24942422, "balance_loss_mlp": 1.01253963, "epoch": 0.7223207575529836, "flos": 24361424530560.0, "grad_norm": 1.5831642681304265, "language_loss": 0.76705694, "learning_rate": 7.556320755530484e-07, "loss": 0.79146981, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19372559, "step": 12014, "time_per_iteration": 2.8735532760620117 }, { "auxiliary_loss_clip": 0.01418448, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.2543242, "balance_loss_mlp": 1.01149714, "epoch": 0.7223808808056515, "flos": 28342002199680.0, "grad_norm": 1.6127434477910607, "language_loss": 0.87092865, "learning_rate": 7.553272008637346e-07, "loss": 0.89542025, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19213867, "step": 12015, "time_per_iteration": 2.911015272140503 }, { "auxiliary_loss_clip": 0.01402768, "auxiliary_loss_mlp": 0.0103022, "balance_loss_clip": 1.24239564, "balance_loss_mlp": 1.01155138, "epoch": 0.7224410040583196, "flos": 21079356370560.0, "grad_norm": 1.9720433764767953, "language_loss": 0.7864846, "learning_rate": 7.55022373372538e-07, "loss": 0.8108145, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18640137, "step": 12016, "time_per_iteration": 4.259311199188232 }, { "auxiliary_loss_clip": 0.01403797, "auxiliary_loss_mlp": 0.01036484, "balance_loss_clip": 1.24371839, "balance_loss_mlp": 1.01707625, "epoch": 0.7225011273109875, "flos": 26806244065920.0, "grad_norm": 1.4568209385192235, "language_loss": 0.78258085, "learning_rate": 7.547175930910186e-07, "loss": 0.80698371, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1940918, "step": 12017, "time_per_iteration": 4.2947492599487305 }, { "auxiliary_loss_clip": 0.0140183, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.24254203, "balance_loss_mlp": 1.01620293, "epoch": 0.7225612505636555, "flos": 23593273994880.0, "grad_norm": 1.7472856614962702, "language_loss": 0.74275994, "learning_rate": 7.54412860030732e-07, "loss": 0.7671293, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18908691, "step": 12018, "time_per_iteration": 2.8729517459869385 }, { "auxiliary_loss_clip": 0.01395998, "auxiliary_loss_mlp": 0.0103437, "balance_loss_clip": 1.23990178, "balance_loss_mlp": 1.01594067, "epoch": 0.7226213738163234, "flos": 20787490293120.0, "grad_norm": 1.7054495311733995, "language_loss": 0.78101659, "learning_rate": 7.541081742032347e-07, "loss": 0.80532032, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18408203, "step": 12019, "time_per_iteration": 2.869986057281494 }, { "auxiliary_loss_clip": 0.01402797, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.24206614, "balance_loss_mlp": 1.01062346, "epoch": 0.7226814970689914, "flos": 32648859256320.0, "grad_norm": 1.710176812401023, "language_loss": 0.74427223, "learning_rate": 7.53803535620081e-07, "loss": 0.76860738, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.2010498, "step": 12020, "time_per_iteration": 2.932291269302368 }, { "auxiliary_loss_clip": 0.01416302, "auxiliary_loss_mlp": 0.01037192, "balance_loss_clip": 1.25216436, "balance_loss_mlp": 1.0183804, "epoch": 0.7227416203216595, "flos": 22464249782400.0, "grad_norm": 1.6280409385602646, "language_loss": 0.77928722, "learning_rate": 7.534989442928219e-07, "loss": 0.8038221, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18798828, "step": 12021, "time_per_iteration": 2.950657367706299 }, { "auxiliary_loss_clip": 0.01407597, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.2469461, "balance_loss_mlp": 1.01425672, "epoch": 0.7228017435743274, "flos": 21662138384640.0, "grad_norm": 3.6048549861755186, "language_loss": 0.69408977, "learning_rate": 7.531944002330073e-07, "loss": 0.71850419, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19567871, "step": 12022, "time_per_iteration": 2.874795913696289 }, { "auxiliary_loss_clip": 0.01411404, "auxiliary_loss_mlp": 0.01032984, "balance_loss_clip": 1.24838376, "balance_loss_mlp": 1.01323056, "epoch": 0.7228618668269954, "flos": 29545418142720.0, "grad_norm": 1.6569712483134886, "language_loss": 0.69744623, "learning_rate": 7.528899034521858e-07, "loss": 0.72189009, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1973877, "step": 12023, "time_per_iteration": 2.871166467666626 }, { "auxiliary_loss_clip": 0.01397819, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.23706746, "balance_loss_mlp": 1.01367211, "epoch": 0.7229219900796633, "flos": 27465227602560.0, "grad_norm": 1.5445384399234414, "language_loss": 0.71695876, "learning_rate": 7.525854539619052e-07, "loss": 0.7412622, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18847656, "step": 12024, "time_per_iteration": 2.9208128452301025 }, { "auxiliary_loss_clip": 0.0140036, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.23931527, "balance_loss_mlp": 1.01577961, "epoch": 0.7229821133323313, "flos": 16297934158080.0, "grad_norm": 2.4517953666256416, "language_loss": 0.76369905, "learning_rate": 7.522810517737089e-07, "loss": 0.78804958, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18908691, "step": 12025, "time_per_iteration": 2.7892825603485107 }, { "auxiliary_loss_clip": 0.01386349, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.22927475, "balance_loss_mlp": 1.01307952, "epoch": 0.7230422365849992, "flos": 20422001646720.0, "grad_norm": 1.962605814506806, "language_loss": 0.76594263, "learning_rate": 7.519766968991395e-07, "loss": 0.79013157, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19470215, "step": 12026, "time_per_iteration": 2.807366132736206 }, { "auxiliary_loss_clip": 0.01413783, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.24979866, "balance_loss_mlp": 1.01408517, "epoch": 0.7231023598376672, "flos": 25604547425280.0, "grad_norm": 1.8726564749492303, "language_loss": 0.68944657, "learning_rate": 7.516723893497388e-07, "loss": 0.71391523, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19006348, "step": 12027, "time_per_iteration": 2.8627994060516357 }, { "auxiliary_loss_clip": 0.01427606, "auxiliary_loss_mlp": 0.01035391, "balance_loss_clip": 1.26254284, "balance_loss_mlp": 1.01610267, "epoch": 0.7231624830903352, "flos": 25158966203520.0, "grad_norm": 1.864985347356744, "language_loss": 0.79984581, "learning_rate": 7.513681291370469e-07, "loss": 0.82447577, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19274902, "step": 12028, "time_per_iteration": 2.8669936656951904 }, { "auxiliary_loss_clip": 0.01410981, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.24924839, "balance_loss_mlp": 1.01125801, "epoch": 0.7232226063430032, "flos": 21735715708800.0, "grad_norm": 1.7632658502273728, "language_loss": 0.83167505, "learning_rate": 7.510639162726e-07, "loss": 0.8560847, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18701172, "step": 12029, "time_per_iteration": 2.8251454830169678 }, { "auxiliary_loss_clip": 0.011813, "auxiliary_loss_mlp": 0.01022149, "balance_loss_clip": 1.09431612, "balance_loss_mlp": 1.00278938, "epoch": 0.7232827295956711, "flos": 68470773390720.0, "grad_norm": 0.8091300271856494, "language_loss": 0.61822057, "learning_rate": 7.507597507679347e-07, "loss": 0.64025509, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.19335938, "step": 12030, "time_per_iteration": 3.392035484313965 }, { "auxiliary_loss_clip": 0.01395277, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.23654819, "balance_loss_mlp": 1.0139401, "epoch": 0.7233428528483391, "flos": 20202038835840.0, "grad_norm": 1.9154288511415865, "language_loss": 0.78551698, "learning_rate": 7.504556326345859e-07, "loss": 0.8098076, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.1986084, "step": 12031, "time_per_iteration": 3.0072810649871826 }, { "auxiliary_loss_clip": 0.01415907, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.25275087, "balance_loss_mlp": 1.01585531, "epoch": 0.723402976101007, "flos": 23959531802880.0, "grad_norm": 1.7331401658647874, "language_loss": 0.81855488, "learning_rate": 7.501515618840834e-07, "loss": 0.8430655, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19311523, "step": 12032, "time_per_iteration": 2.9443275928497314 }, { "auxiliary_loss_clip": 0.01428835, "auxiliary_loss_mlp": 0.01035305, "balance_loss_clip": 1.26106203, "balance_loss_mlp": 1.01563597, "epoch": 0.723463099353675, "flos": 20823079968000.0, "grad_norm": 2.3935625001880734, "language_loss": 0.76231217, "learning_rate": 7.498475385279592e-07, "loss": 0.78695351, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19677734, "step": 12033, "time_per_iteration": 2.854008436203003 }, { "auxiliary_loss_clip": 0.01402616, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.24281251, "balance_loss_mlp": 1.01154339, "epoch": 0.723523222606343, "flos": 19107111219840.0, "grad_norm": 1.8155031183393504, "language_loss": 0.75352263, "learning_rate": 7.495435625777423e-07, "loss": 0.7778576, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1932373, "step": 12034, "time_per_iteration": 2.845275640487671 }, { "auxiliary_loss_clip": 0.0139792, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.23757076, "balance_loss_mlp": 1.01175892, "epoch": 0.723583345859011, "flos": 26518630999680.0, "grad_norm": 1.9432312333607356, "language_loss": 0.81827462, "learning_rate": 7.492396340449578e-07, "loss": 0.84255052, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.17895508, "step": 12035, "time_per_iteration": 2.865886688232422 }, { "auxiliary_loss_clip": 0.01420269, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.25588369, "balance_loss_mlp": 1.01346326, "epoch": 0.723643469111679, "flos": 16042336427520.0, "grad_norm": 2.1505970593620884, "language_loss": 0.61385995, "learning_rate": 7.489357529411326e-07, "loss": 0.63838875, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19140625, "step": 12036, "time_per_iteration": 2.8170084953308105 }, { "auxiliary_loss_clip": 0.01390756, "auxiliary_loss_mlp": 0.01034651, "balance_loss_clip": 1.23423684, "balance_loss_mlp": 1.01573217, "epoch": 0.7237035923643469, "flos": 21955814254080.0, "grad_norm": 1.5022210200663448, "language_loss": 0.68810284, "learning_rate": 7.486319192777883e-07, "loss": 0.71235693, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18920898, "step": 12037, "time_per_iteration": 2.918562650680542 }, { "auxiliary_loss_clip": 0.01398413, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.23878431, "balance_loss_mlp": 1.01316905, "epoch": 0.7237637156170149, "flos": 23592685812480.0, "grad_norm": 2.086594719906185, "language_loss": 0.72854757, "learning_rate": 7.483281330664479e-07, "loss": 0.75285828, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19482422, "step": 12038, "time_per_iteration": 2.8516435623168945 }, { "auxiliary_loss_clip": 0.0140955, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.24810529, "balance_loss_mlp": 1.01446033, "epoch": 0.7238238388696828, "flos": 20604384011520.0, "grad_norm": 1.622583060990797, "language_loss": 0.72841787, "learning_rate": 7.480243943186293e-07, "loss": 0.75286269, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.20483398, "step": 12039, "time_per_iteration": 2.837143659591675 }, { "auxiliary_loss_clip": 0.01413971, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.25184774, "balance_loss_mlp": 1.01464868, "epoch": 0.7238839621223508, "flos": 24217708487040.0, "grad_norm": 1.69866560192204, "language_loss": 0.77008325, "learning_rate": 7.477207030458513e-07, "loss": 0.79456031, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.1907959, "step": 12040, "time_per_iteration": 4.357841491699219 }, { "auxiliary_loss_clip": 0.01411384, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.24786246, "balance_loss_mlp": 1.0163815, "epoch": 0.7239440853750188, "flos": 14217788862720.0, "grad_norm": 1.693898473480938, "language_loss": 0.78029168, "learning_rate": 7.474170592596301e-07, "loss": 0.8047685, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19921875, "step": 12041, "time_per_iteration": 2.820542573928833 }, { "auxiliary_loss_clip": 0.0140452, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.24233985, "balance_loss_mlp": 1.01365137, "epoch": 0.7240042086276868, "flos": 21624376959360.0, "grad_norm": 2.2020509346659085, "language_loss": 0.64625037, "learning_rate": 7.471134629714797e-07, "loss": 0.67062271, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.1907959, "step": 12042, "time_per_iteration": 2.8402535915374756 }, { "auxiliary_loss_clip": 0.01418086, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.25254607, "balance_loss_mlp": 1.01755786, "epoch": 0.7240643318803547, "flos": 23341929275520.0, "grad_norm": 1.877949027570416, "language_loss": 0.84038532, "learning_rate": 7.468099141929116e-07, "loss": 0.86495554, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.21386719, "step": 12043, "time_per_iteration": 2.8376688957214355 }, { "auxiliary_loss_clip": 0.01408623, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.2462883, "balance_loss_mlp": 1.01548982, "epoch": 0.7241244551330227, "flos": 24035552346240.0, "grad_norm": 2.3987181235540227, "language_loss": 0.64688647, "learning_rate": 7.465064129354379e-07, "loss": 0.67131972, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1920166, "step": 12044, "time_per_iteration": 2.856257200241089 }, { "auxiliary_loss_clip": 0.01426527, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.26321828, "balance_loss_mlp": 1.0163871, "epoch": 0.7241845783856906, "flos": 18738591171840.0, "grad_norm": 1.5626507505605252, "language_loss": 0.82471317, "learning_rate": 7.462029592105658e-07, "loss": 0.84933686, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19470215, "step": 12045, "time_per_iteration": 2.8290116786956787 }, { "auxiliary_loss_clip": 0.01397085, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.23965049, "balance_loss_mlp": 1.01774478, "epoch": 0.7242447016383586, "flos": 19507691848320.0, "grad_norm": 1.8716094106937367, "language_loss": 0.72667968, "learning_rate": 7.458995530298034e-07, "loss": 0.75102055, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19262695, "step": 12046, "time_per_iteration": 2.866175651550293 }, { "auxiliary_loss_clip": 0.01415925, "auxiliary_loss_mlp": 0.01031017, "balance_loss_clip": 1.25313616, "balance_loss_mlp": 1.01210999, "epoch": 0.7243048248910267, "flos": 22173560069760.0, "grad_norm": 1.7745662094740648, "language_loss": 0.71631861, "learning_rate": 7.455961944046553e-07, "loss": 0.74078798, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18920898, "step": 12047, "time_per_iteration": 4.250116586685181 }, { "auxiliary_loss_clip": 0.014291, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.26338506, "balance_loss_mlp": 1.01978469, "epoch": 0.7243649481436946, "flos": 27684240272640.0, "grad_norm": 1.5183386016287255, "language_loss": 0.70175636, "learning_rate": 7.45292883346627e-07, "loss": 0.72643858, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1932373, "step": 12048, "time_per_iteration": 2.86251163482666 }, { "auxiliary_loss_clip": 0.01179235, "auxiliary_loss_mlp": 0.01036243, "balance_loss_clip": 1.09257674, "balance_loss_mlp": 1.00610721, "epoch": 0.7244250713963626, "flos": 63274246968960.0, "grad_norm": 0.8422359325511274, "language_loss": 0.53720915, "learning_rate": 7.449896198672168e-07, "loss": 0.55936396, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.30078125, "step": 12049, "time_per_iteration": 3.374913215637207 }, { "auxiliary_loss_clip": 0.01439719, "auxiliary_loss_mlp": 0.0103779, "balance_loss_clip": 1.26955998, "balance_loss_mlp": 1.01808429, "epoch": 0.7244851946490305, "flos": 17975960501760.0, "grad_norm": 2.280897325512806, "language_loss": 0.60312122, "learning_rate": 7.446864039779258e-07, "loss": 0.62789637, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.19714355, "step": 12050, "time_per_iteration": 2.8094446659088135 }, { "auxiliary_loss_clip": 0.01182486, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.09600496, "balance_loss_mlp": 1.00829172, "epoch": 0.7245453179016985, "flos": 70975506320640.0, "grad_norm": 0.7134482866064371, "language_loss": 0.53333354, "learning_rate": 7.443832356902528e-07, "loss": 0.55543876, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.19726562, "step": 12051, "time_per_iteration": 4.63675594329834 }, { "auxiliary_loss_clip": 0.0140225, "auxiliary_loss_mlp": 0.01040474, "balance_loss_clip": 1.24338293, "balance_loss_mlp": 1.02184165, "epoch": 0.7246054411543664, "flos": 24578355939840.0, "grad_norm": 1.455416704483892, "language_loss": 0.72455442, "learning_rate": 7.440801150156927e-07, "loss": 0.74898171, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18640137, "step": 12052, "time_per_iteration": 4.333528518676758 }, { "auxiliary_loss_clip": 0.01414557, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.25219882, "balance_loss_mlp": 1.01409626, "epoch": 0.7246655644070344, "flos": 32349799255680.0, "grad_norm": 2.1303566387877546, "language_loss": 0.74858767, "learning_rate": 7.437770419657415e-07, "loss": 0.77306324, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18920898, "step": 12053, "time_per_iteration": 2.9229955673217773 }, { "auxiliary_loss_clip": 0.01415915, "auxiliary_loss_mlp": 0.01033552, "balance_loss_clip": 1.25422001, "balance_loss_mlp": 1.01502705, "epoch": 0.7247256876597024, "flos": 21882689377920.0, "grad_norm": 2.0861571585412824, "language_loss": 0.78634149, "learning_rate": 7.434740165518898e-07, "loss": 0.81083614, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18530273, "step": 12054, "time_per_iteration": 2.8486745357513428 }, { "auxiliary_loss_clip": 0.01404524, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.24520266, "balance_loss_mlp": 1.01923764, "epoch": 0.7247858109123704, "flos": 16220782494720.0, "grad_norm": 3.8904672516423573, "language_loss": 0.6929121, "learning_rate": 7.431710387856301e-07, "loss": 0.71733183, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18212891, "step": 12055, "time_per_iteration": 2.810206413269043 }, { "auxiliary_loss_clip": 0.01411611, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.25005722, "balance_loss_mlp": 1.02000415, "epoch": 0.7248459341650383, "flos": 20860615169280.0, "grad_norm": 1.6443746038613802, "language_loss": 0.74373245, "learning_rate": 7.428681086784496e-07, "loss": 0.76823819, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18969727, "step": 12056, "time_per_iteration": 2.837925672531128 }, { "auxiliary_loss_clip": 0.01406083, "auxiliary_loss_mlp": 0.01034294, "balance_loss_clip": 1.24828172, "balance_loss_mlp": 1.01487434, "epoch": 0.7249060574177063, "flos": 25932908073600.0, "grad_norm": 1.6577191573565986, "language_loss": 0.71441722, "learning_rate": 7.425652262418368e-07, "loss": 0.73882103, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19433594, "step": 12057, "time_per_iteration": 2.893820285797119 }, { "auxiliary_loss_clip": 0.0143304, "auxiliary_loss_mlp": 0.01036588, "balance_loss_clip": 1.26708436, "balance_loss_mlp": 1.01724076, "epoch": 0.7249661806703742, "flos": 17353878739200.0, "grad_norm": 1.711166086286174, "language_loss": 0.62850428, "learning_rate": 7.42262391487277e-07, "loss": 0.65320051, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19372559, "step": 12058, "time_per_iteration": 2.8284049034118652 }, { "auxiliary_loss_clip": 0.01412743, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.25152874, "balance_loss_mlp": 1.01788259, "epoch": 0.7250263039230422, "flos": 19583848126080.0, "grad_norm": 1.8336679143849142, "language_loss": 0.75636399, "learning_rate": 7.419596044262535e-07, "loss": 0.78085804, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18786621, "step": 12059, "time_per_iteration": 2.844123125076294 }, { "auxiliary_loss_clip": 0.01396785, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.23951936, "balance_loss_mlp": 1.01910949, "epoch": 0.7250864271757103, "flos": 21985522104960.0, "grad_norm": 1.832719681295629, "language_loss": 0.79991531, "learning_rate": 7.416568650702472e-07, "loss": 0.82426649, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19226074, "step": 12060, "time_per_iteration": 2.8727757930755615 }, { "auxiliary_loss_clip": 0.01415924, "auxiliary_loss_mlp": 0.01037471, "balance_loss_clip": 1.25310087, "balance_loss_mlp": 1.01738453, "epoch": 0.7251465504283782, "flos": 25024253875200.0, "grad_norm": 1.7539549735417825, "language_loss": 0.7678085, "learning_rate": 7.413541734307393e-07, "loss": 0.79234242, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20080566, "step": 12061, "time_per_iteration": 2.8641445636749268 }, { "auxiliary_loss_clip": 0.01407065, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 1.24979484, "balance_loss_mlp": 1.01483166, "epoch": 0.7252066736810462, "flos": 16698650520960.0, "grad_norm": 1.6139273163158219, "language_loss": 0.81719947, "learning_rate": 7.410515295192068e-07, "loss": 0.84160352, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18518066, "step": 12062, "time_per_iteration": 2.9163150787353516 }, { "auxiliary_loss_clip": 0.01431512, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.26574516, "balance_loss_mlp": 1.01386058, "epoch": 0.7252667969337141, "flos": 25714121627520.0, "grad_norm": 2.5367337008644695, "language_loss": 0.70038974, "learning_rate": 7.407489333471262e-07, "loss": 0.72503585, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19238281, "step": 12063, "time_per_iteration": 2.85710072517395 }, { "auxiliary_loss_clip": 0.01399619, "auxiliary_loss_mlp": 0.01034349, "balance_loss_clip": 1.24214125, "balance_loss_mlp": 1.01589549, "epoch": 0.7253269201863821, "flos": 18269093433600.0, "grad_norm": 1.415242393153993, "language_loss": 0.70703, "learning_rate": 7.40446384925973e-07, "loss": 0.73136961, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18457031, "step": 12064, "time_per_iteration": 2.8255996704101562 }, { "auxiliary_loss_clip": 0.01410075, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.24934292, "balance_loss_mlp": 1.01405203, "epoch": 0.72538704343905, "flos": 20421322974720.0, "grad_norm": 2.022143255212664, "language_loss": 0.91515481, "learning_rate": 7.401438842672192e-07, "loss": 0.93958628, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19018555, "step": 12065, "time_per_iteration": 2.822315216064453 }, { "auxiliary_loss_clip": 0.01186727, "auxiliary_loss_mlp": 0.01023285, "balance_loss_clip": 1.09768438, "balance_loss_mlp": 1.00430667, "epoch": 0.725447166691718, "flos": 70185520529280.0, "grad_norm": 0.6503065049822433, "language_loss": 0.56102222, "learning_rate": 7.398414313823349e-07, "loss": 0.58312231, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18945312, "step": 12066, "time_per_iteration": 3.4587628841400146 }, { "auxiliary_loss_clip": 0.01404011, "auxiliary_loss_mlp": 0.01038178, "balance_loss_clip": 1.24470413, "balance_loss_mlp": 1.01997483, "epoch": 0.725507289944386, "flos": 27063606343680.0, "grad_norm": 1.643452986987224, "language_loss": 0.76999986, "learning_rate": 7.395390262827897e-07, "loss": 0.79442173, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18200684, "step": 12067, "time_per_iteration": 2.8817858695983887 }, { "auxiliary_loss_clip": 0.01180954, "auxiliary_loss_mlp": 0.01018115, "balance_loss_clip": 1.09326386, "balance_loss_mlp": 0.99970931, "epoch": 0.725567413197054, "flos": 62953034999040.0, "grad_norm": 0.7238393188290771, "language_loss": 0.57083803, "learning_rate": 7.392366689800515e-07, "loss": 0.59282869, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.18359375, "step": 12068, "time_per_iteration": 3.184457540512085 }, { "auxiliary_loss_clip": 0.01184163, "auxiliary_loss_mlp": 0.0102139, "balance_loss_clip": 1.09544706, "balance_loss_mlp": 0.99840659, "epoch": 0.7256275364497219, "flos": 60326647505280.0, "grad_norm": 0.6583360732542883, "language_loss": 0.55471879, "learning_rate": 7.389343594855848e-07, "loss": 0.57677436, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.22949219, "step": 12069, "time_per_iteration": 3.302109956741333 }, { "auxiliary_loss_clip": 0.01402023, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.24440563, "balance_loss_mlp": 1.01403689, "epoch": 0.7256876597023899, "flos": 24509167361280.0, "grad_norm": 1.7452482634964892, "language_loss": 0.80218726, "learning_rate": 7.38632097810854e-07, "loss": 0.82653052, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18261719, "step": 12070, "time_per_iteration": 2.914280414581299 }, { "auxiliary_loss_clip": 0.01397697, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.2419008, "balance_loss_mlp": 1.01335323, "epoch": 0.7257477829550578, "flos": 24363415301760.0, "grad_norm": 1.9333670830833847, "language_loss": 0.72951251, "learning_rate": 7.383298839673197e-07, "loss": 0.7538079, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18481445, "step": 12071, "time_per_iteration": 2.907585859298706 }, { "auxiliary_loss_clip": 0.01405866, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.24755287, "balance_loss_mlp": 1.02301383, "epoch": 0.7258079062077258, "flos": 17211203326080.0, "grad_norm": 1.8102499453797956, "language_loss": 0.70559937, "learning_rate": 7.380277179664436e-07, "loss": 0.73006916, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1809082, "step": 12072, "time_per_iteration": 2.882935047149658 }, { "auxiliary_loss_clip": 0.01418105, "auxiliary_loss_mlp": 0.01040688, "balance_loss_clip": 1.25266051, "balance_loss_mlp": 1.01977849, "epoch": 0.7258680294603939, "flos": 21590416097280.0, "grad_norm": 1.9230776985328575, "language_loss": 0.78880638, "learning_rate": 7.377255998196821e-07, "loss": 0.81339431, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20935059, "step": 12073, "time_per_iteration": 2.8940987586975098 }, { "auxiliary_loss_clip": 0.01408608, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.24945807, "balance_loss_mlp": 1.01315808, "epoch": 0.7259281527130618, "flos": 34867426953600.0, "grad_norm": 1.4153469374095564, "language_loss": 0.70550179, "learning_rate": 7.374235295384923e-07, "loss": 0.72990453, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18518066, "step": 12074, "time_per_iteration": 3.077580213546753 }, { "auxiliary_loss_clip": 0.01416064, "auxiliary_loss_mlp": 0.01037089, "balance_loss_clip": 1.2521708, "balance_loss_mlp": 1.01811051, "epoch": 0.7259882759657298, "flos": 25413975751680.0, "grad_norm": 2.5704217843644397, "language_loss": 0.74729222, "learning_rate": 7.371215071343302e-07, "loss": 0.77182376, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18969727, "step": 12075, "time_per_iteration": 4.337449312210083 }, { "auxiliary_loss_clip": 0.01406029, "auxiliary_loss_mlp": 0.01038863, "balance_loss_clip": 1.24400449, "balance_loss_mlp": 1.01858556, "epoch": 0.7260483992183977, "flos": 62974037865600.0, "grad_norm": 1.5067812408678491, "language_loss": 0.6439321, "learning_rate": 7.368195326186458e-07, "loss": 0.66838104, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20275879, "step": 12076, "time_per_iteration": 3.226510524749756 }, { "auxiliary_loss_clip": 0.01413699, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.25066912, "balance_loss_mlp": 1.01325345, "epoch": 0.7261085224710657, "flos": 26478381110400.0, "grad_norm": 1.7505039069783277, "language_loss": 0.79500055, "learning_rate": 7.365176060028912e-07, "loss": 0.81946558, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19555664, "step": 12077, "time_per_iteration": 2.9068408012390137 }, { "auxiliary_loss_clip": 0.01181929, "auxiliary_loss_mlp": 0.01024077, "balance_loss_clip": 1.09180522, "balance_loss_mlp": 1.00347793, "epoch": 0.7261686457237336, "flos": 66800511169920.0, "grad_norm": 0.8941141302257031, "language_loss": 0.64993656, "learning_rate": 7.362157272985163e-07, "loss": 0.67199671, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20605469, "step": 12078, "time_per_iteration": 3.337397336959839 }, { "auxiliary_loss_clip": 0.0118073, "auxiliary_loss_mlp": 0.01029967, "balance_loss_clip": 1.09211516, "balance_loss_mlp": 1.00402725, "epoch": 0.7262287689764017, "flos": 70032393567360.0, "grad_norm": 0.7065851911831613, "language_loss": 0.59259927, "learning_rate": 7.359138965169671e-07, "loss": 0.61470628, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.25976562, "step": 12079, "time_per_iteration": 3.478893995285034 }, { "auxiliary_loss_clip": 0.01406836, "auxiliary_loss_mlp": 0.01034528, "balance_loss_clip": 1.24666286, "balance_loss_mlp": 1.01566923, "epoch": 0.7262888922290696, "flos": 23815951493760.0, "grad_norm": 1.8534978875254196, "language_loss": 0.66255862, "learning_rate": 7.356121136696895e-07, "loss": 0.68697226, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18859863, "step": 12080, "time_per_iteration": 2.898808717727661 }, { "auxiliary_loss_clip": 0.01414856, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.25012696, "balance_loss_mlp": 1.01514077, "epoch": 0.7263490154817376, "flos": 19509773109120.0, "grad_norm": 3.0433775755611703, "language_loss": 0.7188797, "learning_rate": 7.35310378768128e-07, "loss": 0.74337065, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19116211, "step": 12081, "time_per_iteration": 2.849839210510254 }, { "auxiliary_loss_clip": 0.01414945, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.25033009, "balance_loss_mlp": 1.01714444, "epoch": 0.7264091387344055, "flos": 16293997860480.0, "grad_norm": 1.7181843117330031, "language_loss": 0.81794405, "learning_rate": 7.350086918237237e-07, "loss": 0.84246099, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19580078, "step": 12082, "time_per_iteration": 4.314535140991211 }, { "auxiliary_loss_clip": 0.01428349, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.25840044, "balance_loss_mlp": 1.01559043, "epoch": 0.7264692619870735, "flos": 24362510405760.0, "grad_norm": 2.385260865079037, "language_loss": 0.78007758, "learning_rate": 7.347070528479158e-07, "loss": 0.80471396, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.19689941, "step": 12083, "time_per_iteration": 2.874288320541382 }, { "auxiliary_loss_clip": 0.0142581, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.26089549, "balance_loss_mlp": 1.01323855, "epoch": 0.7265293852397414, "flos": 25130479962240.0, "grad_norm": 1.754378378815766, "language_loss": 0.73349011, "learning_rate": 7.344054618521433e-07, "loss": 0.75807357, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19311523, "step": 12084, "time_per_iteration": 2.88417387008667 }, { "auxiliary_loss_clip": 0.01416784, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.25307953, "balance_loss_mlp": 1.01823401, "epoch": 0.7265895084924094, "flos": 22648532428800.0, "grad_norm": 1.7297975480036487, "language_loss": 0.78691012, "learning_rate": 7.34103918847843e-07, "loss": 0.81144559, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18518066, "step": 12085, "time_per_iteration": 2.864600419998169 }, { "auxiliary_loss_clip": 0.01412951, "auxiliary_loss_mlp": 0.01035583, "balance_loss_clip": 1.2490859, "balance_loss_mlp": 1.01666451, "epoch": 0.7266496317450775, "flos": 23378785804800.0, "grad_norm": 2.2531652984469006, "language_loss": 0.72975284, "learning_rate": 7.338024238464493e-07, "loss": 0.75423825, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18933105, "step": 12086, "time_per_iteration": 4.280728816986084 }, { "auxiliary_loss_clip": 0.01408735, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.24794793, "balance_loss_mlp": 1.01564682, "epoch": 0.7267097549977454, "flos": 28086313979520.0, "grad_norm": 3.7160703579365384, "language_loss": 0.70417094, "learning_rate": 7.335009768593938e-07, "loss": 0.72860742, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19287109, "step": 12087, "time_per_iteration": 4.308889389038086 }, { "auxiliary_loss_clip": 0.01421186, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 1.25609303, "balance_loss_mlp": 1.01684272, "epoch": 0.7267698782504134, "flos": 22204444285440.0, "grad_norm": 2.2006842732751752, "language_loss": 0.79678547, "learning_rate": 7.331995778981088e-07, "loss": 0.82137084, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20507812, "step": 12088, "time_per_iteration": 2.8395471572875977 }, { "auxiliary_loss_clip": 0.01419101, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.2543112, "balance_loss_mlp": 1.01645911, "epoch": 0.7268300015030813, "flos": 18523967247360.0, "grad_norm": 1.709143396029944, "language_loss": 0.74509984, "learning_rate": 7.328982269740221e-07, "loss": 0.76965237, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19702148, "step": 12089, "time_per_iteration": 2.809056043624878 }, { "auxiliary_loss_clip": 0.0141726, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.25462699, "balance_loss_mlp": 1.01717412, "epoch": 0.7268901247557493, "flos": 23996116863360.0, "grad_norm": 1.723145008026815, "language_loss": 0.72060287, "learning_rate": 7.325969240985616e-07, "loss": 0.74514341, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19616699, "step": 12090, "time_per_iteration": 2.8941128253936768 }, { "auxiliary_loss_clip": 0.01421769, "auxiliary_loss_mlp": 0.01034379, "balance_loss_clip": 1.25796962, "balance_loss_mlp": 1.01484013, "epoch": 0.7269502480084172, "flos": 32100852510720.0, "grad_norm": 15.1584887312411, "language_loss": 0.77583146, "learning_rate": 7.322956692831528e-07, "loss": 0.80039299, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19519043, "step": 12091, "time_per_iteration": 3.0046520233154297 }, { "auxiliary_loss_clip": 0.01407496, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.2459836, "balance_loss_mlp": 1.01341784, "epoch": 0.7270103712610853, "flos": 19072200216960.0, "grad_norm": 1.9493405540081374, "language_loss": 0.72054982, "learning_rate": 7.319944625392205e-07, "loss": 0.7449559, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19689941, "step": 12092, "time_per_iteration": 2.832512617111206 }, { "auxiliary_loss_clip": 0.01413373, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.25040007, "balance_loss_mlp": 1.01383924, "epoch": 0.7270704945137532, "flos": 34546395962880.0, "grad_norm": 1.89889620844141, "language_loss": 0.61968446, "learning_rate": 7.31693303878184e-07, "loss": 0.64415389, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19726562, "step": 12093, "time_per_iteration": 2.9706997871398926 }, { "auxiliary_loss_clip": 0.01411273, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.25014806, "balance_loss_mlp": 1.01382422, "epoch": 0.7271306177664212, "flos": 21517698424320.0, "grad_norm": 1.5250103690673458, "language_loss": 0.7599386, "learning_rate": 7.313921933114644e-07, "loss": 0.78438747, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19799805, "step": 12094, "time_per_iteration": 2.8583364486694336 }, { "auxiliary_loss_clip": 0.01402848, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.24422359, "balance_loss_mlp": 1.01490164, "epoch": 0.7271907410190891, "flos": 22282410355200.0, "grad_norm": 1.7692817442696116, "language_loss": 0.85461414, "learning_rate": 7.310911308504808e-07, "loss": 0.8789795, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18786621, "step": 12095, "time_per_iteration": 2.8835935592651367 }, { "auxiliary_loss_clip": 0.0141097, "auxiliary_loss_mlp": 0.01036722, "balance_loss_clip": 1.2480402, "balance_loss_mlp": 1.01717186, "epoch": 0.7272508642717571, "flos": 22903360997760.0, "grad_norm": 1.9639040722498762, "language_loss": 0.78669798, "learning_rate": 7.307901165066479e-07, "loss": 0.81117487, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19555664, "step": 12096, "time_per_iteration": 2.8442108631134033 }, { "auxiliary_loss_clip": 0.01419332, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.25614762, "balance_loss_mlp": 1.01764154, "epoch": 0.727310987524425, "flos": 11663304635520.0, "grad_norm": 1.9538289875191748, "language_loss": 0.73364484, "learning_rate": 7.30489150291381e-07, "loss": 0.75819683, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18237305, "step": 12097, "time_per_iteration": 2.856088638305664 }, { "auxiliary_loss_clip": 0.01417987, "auxiliary_loss_mlp": 0.01035828, "balance_loss_clip": 1.25347376, "balance_loss_mlp": 1.01701677, "epoch": 0.727371110777093, "flos": 24546069135360.0, "grad_norm": 1.8447795624926446, "language_loss": 0.77224135, "learning_rate": 7.301882322160935e-07, "loss": 0.79677951, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18811035, "step": 12098, "time_per_iteration": 2.8744685649871826 }, { "auxiliary_loss_clip": 0.01422314, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.25561213, "balance_loss_mlp": 1.01208472, "epoch": 0.7274312340297611, "flos": 74763639296640.0, "grad_norm": 1.7198908600579652, "language_loss": 0.68032181, "learning_rate": 7.298873622921952e-07, "loss": 0.70485616, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19030762, "step": 12099, "time_per_iteration": 3.3003385066986084 }, { "auxiliary_loss_clip": 0.01425701, "auxiliary_loss_mlp": 0.01035844, "balance_loss_clip": 1.25758123, "balance_loss_mlp": 1.01507759, "epoch": 0.727491357282429, "flos": 22352187116160.0, "grad_norm": 1.5200208234118708, "language_loss": 0.73209786, "learning_rate": 7.29586540531095e-07, "loss": 0.75671327, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20776367, "step": 12100, "time_per_iteration": 2.8768422603607178 }, { "auxiliary_loss_clip": 0.0142332, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.26048779, "balance_loss_mlp": 1.02096188, "epoch": 0.727551480535097, "flos": 23308692330240.0, "grad_norm": 1.591788990465889, "language_loss": 0.7490207, "learning_rate": 7.292857669442005e-07, "loss": 0.77364886, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18554688, "step": 12101, "time_per_iteration": 2.913317918777466 }, { "auxiliary_loss_clip": 0.01405009, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.24582517, "balance_loss_mlp": 1.01561046, "epoch": 0.7276116037877649, "flos": 21480525181440.0, "grad_norm": 1.8114734862873358, "language_loss": 0.83265293, "learning_rate": 7.289850415429177e-07, "loss": 0.85703939, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18041992, "step": 12102, "time_per_iteration": 2.839599132537842 }, { "auxiliary_loss_clip": 0.01410603, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.24871075, "balance_loss_mlp": 1.01111174, "epoch": 0.7276717270404329, "flos": 21472335872640.0, "grad_norm": 2.0843500721336867, "language_loss": 0.82358956, "learning_rate": 7.286843643386495e-07, "loss": 0.84798896, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18225098, "step": 12103, "time_per_iteration": 2.8556008338928223 }, { "auxiliary_loss_clip": 0.01417471, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.25344324, "balance_loss_mlp": 1.01324105, "epoch": 0.7277318502931008, "flos": 16846076638080.0, "grad_norm": 1.7784000298165807, "language_loss": 0.67555165, "learning_rate": 7.283837353427968e-07, "loss": 0.70005274, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19396973, "step": 12104, "time_per_iteration": 2.8202319145202637 }, { "auxiliary_loss_clip": 0.01408679, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.2482059, "balance_loss_mlp": 1.01407421, "epoch": 0.7277919735457689, "flos": 33413390208000.0, "grad_norm": 3.2835331604082496, "language_loss": 0.6641196, "learning_rate": 7.280831545667611e-07, "loss": 0.68854153, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19421387, "step": 12105, "time_per_iteration": 2.998450994491577 }, { "auxiliary_loss_clip": 0.01409732, "auxiliary_loss_mlp": 0.01035398, "balance_loss_clip": 1.24873066, "balance_loss_mlp": 1.01608634, "epoch": 0.7278520967984368, "flos": 19215373322880.0, "grad_norm": 2.1219062688945454, "language_loss": 0.7602756, "learning_rate": 7.27782622021939e-07, "loss": 0.78472698, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19311523, "step": 12106, "time_per_iteration": 2.823988914489746 }, { "auxiliary_loss_clip": 0.01433028, "auxiliary_loss_mlp": 0.0103996, "balance_loss_clip": 1.26654911, "balance_loss_mlp": 1.0203501, "epoch": 0.7279122200511048, "flos": 34107918174720.0, "grad_norm": 1.9990180625704166, "language_loss": 0.70888186, "learning_rate": 7.274821377197273e-07, "loss": 0.73361164, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19604492, "step": 12107, "time_per_iteration": 2.9482038021087646 }, { "auxiliary_loss_clip": 0.01408053, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.24613404, "balance_loss_mlp": 1.01271248, "epoch": 0.7279723433037727, "flos": 54617459806080.0, "grad_norm": 1.4084801597393757, "language_loss": 0.75769401, "learning_rate": 7.271817016715205e-07, "loss": 0.78209049, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18884277, "step": 12108, "time_per_iteration": 3.154111623764038 }, { "auxiliary_loss_clip": 0.01426693, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.26113629, "balance_loss_mlp": 1.01581442, "epoch": 0.7280324665564407, "flos": 36151161696000.0, "grad_norm": 1.982993180602593, "language_loss": 0.67644757, "learning_rate": 7.268813138887124e-07, "loss": 0.70106399, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19140625, "step": 12109, "time_per_iteration": 4.440367221832275 }, { "auxiliary_loss_clip": 0.01407537, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.24718308, "balance_loss_mlp": 1.01614666, "epoch": 0.7280925898091086, "flos": 11625724189440.0, "grad_norm": 2.116810314290769, "language_loss": 0.63821995, "learning_rate": 7.265809743826912e-07, "loss": 0.66265786, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.20068359, "step": 12110, "time_per_iteration": 2.851145029067993 }, { "auxiliary_loss_clip": 0.01423477, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.25726283, "balance_loss_mlp": 1.01411653, "epoch": 0.7281527130617766, "flos": 34290391029120.0, "grad_norm": 1.9238147650186237, "language_loss": 0.59912622, "learning_rate": 7.26280683164847e-07, "loss": 0.62369859, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19628906, "step": 12111, "time_per_iteration": 3.0161428451538086 }, { "auxiliary_loss_clip": 0.01420623, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.25482738, "balance_loss_mlp": 1.01477981, "epoch": 0.7282128363144446, "flos": 13926058519680.0, "grad_norm": 2.0023844953482457, "language_loss": 0.74952918, "learning_rate": 7.259804402465677e-07, "loss": 0.77406865, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.18530273, "step": 12112, "time_per_iteration": 2.8429298400878906 }, { "auxiliary_loss_clip": 0.01404313, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.24272633, "balance_loss_mlp": 1.01401722, "epoch": 0.7282729595671126, "flos": 20787490293120.0, "grad_norm": 2.6319206592044053, "language_loss": 0.67358816, "learning_rate": 7.25680245639237e-07, "loss": 0.69795811, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18688965, "step": 12113, "time_per_iteration": 2.8669586181640625 }, { "auxiliary_loss_clip": 0.01411925, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.24713337, "balance_loss_mlp": 1.01283574, "epoch": 0.7283330828197806, "flos": 16333885791360.0, "grad_norm": 1.8998578281642216, "language_loss": 0.74103618, "learning_rate": 7.253800993542399e-07, "loss": 0.76548004, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19628906, "step": 12114, "time_per_iteration": 2.85542368888855 }, { "auxiliary_loss_clip": 0.01407939, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.246508, "balance_loss_mlp": 1.01167536, "epoch": 0.7283932060724485, "flos": 27501586439040.0, "grad_norm": 2.1578201335609597, "language_loss": 0.6893152, "learning_rate": 7.250800014029564e-07, "loss": 0.71371061, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19897461, "step": 12115, "time_per_iteration": 2.910413980484009 }, { "auxiliary_loss_clip": 0.0143702, "auxiliary_loss_mlp": 0.01035007, "balance_loss_clip": 1.26935303, "balance_loss_mlp": 1.01623154, "epoch": 0.7284533293251165, "flos": 18376676864640.0, "grad_norm": 1.7033770539496362, "language_loss": 0.60604846, "learning_rate": 7.247799517967674e-07, "loss": 0.63076866, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.18774414, "step": 12116, "time_per_iteration": 2.9810190200805664 }, { "auxiliary_loss_clip": 0.01417136, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.25482607, "balance_loss_mlp": 1.01643038, "epoch": 0.7285134525777844, "flos": 21735444240000.0, "grad_norm": 2.0422862025745525, "language_loss": 0.73812866, "learning_rate": 7.2447995054705e-07, "loss": 0.76264656, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18237305, "step": 12117, "time_per_iteration": 4.372398853302002 }, { "auxiliary_loss_clip": 0.01419597, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.25538826, "balance_loss_mlp": 1.01091027, "epoch": 0.7285735758304525, "flos": 20751267191040.0, "grad_norm": 2.1055684838085362, "language_loss": 0.71546394, "learning_rate": 7.241799976651807e-07, "loss": 0.73995841, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1895752, "step": 12118, "time_per_iteration": 2.8421473503112793 }, { "auxiliary_loss_clip": 0.01408354, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.25039721, "balance_loss_mlp": 1.01692724, "epoch": 0.7286336990831204, "flos": 17319782142720.0, "grad_norm": 1.7135764188869396, "language_loss": 0.84919155, "learning_rate": 7.238800931625346e-07, "loss": 0.87362444, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18017578, "step": 12119, "time_per_iteration": 2.816567897796631 }, { "auxiliary_loss_clip": 0.01419516, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.25578833, "balance_loss_mlp": 1.01144958, "epoch": 0.7286938223357884, "flos": 19795666872960.0, "grad_norm": 2.0797298924209744, "language_loss": 0.82270837, "learning_rate": 7.235802370504831e-07, "loss": 0.84719753, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1796875, "step": 12120, "time_per_iteration": 2.8061304092407227 }, { "auxiliary_loss_clip": 0.01425594, "auxiliary_loss_mlp": 0.01034151, "balance_loss_clip": 1.26171851, "balance_loss_mlp": 1.01535201, "epoch": 0.7287539455884563, "flos": 15349211049600.0, "grad_norm": 1.760982644813853, "language_loss": 0.79007769, "learning_rate": 7.232804293403963e-07, "loss": 0.81467521, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18811035, "step": 12121, "time_per_iteration": 4.422580718994141 }, { "auxiliary_loss_clip": 0.01424479, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.25682759, "balance_loss_mlp": 1.01423335, "epoch": 0.7288140688411243, "flos": 25203514348800.0, "grad_norm": 3.239788959764501, "language_loss": 0.696751, "learning_rate": 7.229806700436441e-07, "loss": 0.7213304, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19238281, "step": 12122, "time_per_iteration": 4.2991838455200195 }, { "auxiliary_loss_clip": 0.01399012, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.2399919, "balance_loss_mlp": 1.01414371, "epoch": 0.7288741920937922, "flos": 23993764133760.0, "grad_norm": 1.8654465831727127, "language_loss": 0.87941277, "learning_rate": 7.226809591715923e-07, "loss": 0.90372062, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17651367, "step": 12123, "time_per_iteration": 2.847494602203369 }, { "auxiliary_loss_clip": 0.01412646, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.2519753, "balance_loss_mlp": 1.01198101, "epoch": 0.7289343153464602, "flos": 22754532291840.0, "grad_norm": 1.6651048585489334, "language_loss": 0.83103585, "learning_rate": 7.223812967356065e-07, "loss": 0.85547161, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18945312, "step": 12124, "time_per_iteration": 2.8753581047058105 }, { "auxiliary_loss_clip": 0.0141429, "auxiliary_loss_mlp": 0.01032789, "balance_loss_clip": 1.25169301, "balance_loss_mlp": 1.01440692, "epoch": 0.7289944385991282, "flos": 24910833864960.0, "grad_norm": 1.7157243554116082, "language_loss": 0.6789794, "learning_rate": 7.220816827470499e-07, "loss": 0.7034502, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18395996, "step": 12125, "time_per_iteration": 2.9622833728790283 }, { "auxiliary_loss_clip": 0.01428598, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.26022947, "balance_loss_mlp": 1.0175848, "epoch": 0.7290545618517962, "flos": 22977571749120.0, "grad_norm": 1.7420205968090947, "language_loss": 0.76263785, "learning_rate": 7.217821172172855e-07, "loss": 0.78729773, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19812012, "step": 12126, "time_per_iteration": 2.869610548019409 }, { "auxiliary_loss_clip": 0.01181649, "auxiliary_loss_mlp": 0.01017469, "balance_loss_clip": 1.09454322, "balance_loss_mlp": 0.99887258, "epoch": 0.7291146851044642, "flos": 61932680092800.0, "grad_norm": 0.8233413291365862, "language_loss": 0.58688235, "learning_rate": 7.2148260015767e-07, "loss": 0.60887361, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.18554688, "step": 12127, "time_per_iteration": 3.2875189781188965 }, { "auxiliary_loss_clip": 0.01409088, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 1.24860859, "balance_loss_mlp": 1.0116775, "epoch": 0.7291748083571321, "flos": 23341295848320.0, "grad_norm": 19.484785675667176, "language_loss": 0.69433421, "learning_rate": 7.21183131579562e-07, "loss": 0.7187264, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18469238, "step": 12128, "time_per_iteration": 2.893202066421509 }, { "auxiliary_loss_clip": 0.01417356, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.25291896, "balance_loss_mlp": 1.01415348, "epoch": 0.7292349316098001, "flos": 28341775975680.0, "grad_norm": 1.797524535678285, "language_loss": 0.65705597, "learning_rate": 7.20883711494319e-07, "loss": 0.68155986, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18884277, "step": 12129, "time_per_iteration": 2.953606128692627 }, { "auxiliary_loss_clip": 0.01406248, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.24720204, "balance_loss_mlp": 1.01458251, "epoch": 0.729295054862468, "flos": 24142411860480.0, "grad_norm": 1.9549774498277668, "language_loss": 0.75055939, "learning_rate": 7.205843399132927e-07, "loss": 0.77495605, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18823242, "step": 12130, "time_per_iteration": 2.9776265621185303 }, { "auxiliary_loss_clip": 0.01416233, "auxiliary_loss_mlp": 0.01031676, "balance_loss_clip": 1.2529645, "balance_loss_mlp": 1.01291203, "epoch": 0.7293551781151361, "flos": 22825440172800.0, "grad_norm": 1.9229561389820833, "language_loss": 0.70550907, "learning_rate": 7.202850168478374e-07, "loss": 0.7299881, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18774414, "step": 12131, "time_per_iteration": 2.926750898361206 }, { "auxiliary_loss_clip": 0.01403484, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.24338436, "balance_loss_mlp": 1.0172379, "epoch": 0.729415301367804, "flos": 22136658295680.0, "grad_norm": 1.585555452488032, "language_loss": 0.78036499, "learning_rate": 7.199857423093025e-07, "loss": 0.80475509, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18286133, "step": 12132, "time_per_iteration": 2.886493444442749 }, { "auxiliary_loss_clip": 0.01413249, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.25308466, "balance_loss_mlp": 1.01508296, "epoch": 0.729475424620472, "flos": 12357334909440.0, "grad_norm": 2.070208347483394, "language_loss": 0.80141282, "learning_rate": 7.196865163090358e-07, "loss": 0.82588243, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18640137, "step": 12133, "time_per_iteration": 2.8525891304016113 }, { "auxiliary_loss_clip": 0.01404097, "auxiliary_loss_mlp": 0.01031379, "balance_loss_clip": 1.24352729, "balance_loss_mlp": 1.01297355, "epoch": 0.7295355478731399, "flos": 22204172816640.0, "grad_norm": 1.7565451632547386, "language_loss": 0.72792494, "learning_rate": 7.193873388583846e-07, "loss": 0.75227964, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1842041, "step": 12134, "time_per_iteration": 2.9668548107147217 }, { "auxiliary_loss_clip": 0.01433548, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.2691853, "balance_loss_mlp": 1.01847076, "epoch": 0.7295956711258079, "flos": 23232490807680.0, "grad_norm": 1.7177579327943748, "language_loss": 0.72156924, "learning_rate": 7.190882099686939e-07, "loss": 0.74628174, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19238281, "step": 12135, "time_per_iteration": 3.0090444087982178 }, { "auxiliary_loss_clip": 0.01423921, "auxiliary_loss_mlp": 0.01039198, "balance_loss_clip": 1.25890887, "balance_loss_mlp": 1.02054191, "epoch": 0.7296557943784758, "flos": 31881794595840.0, "grad_norm": 2.060822482533133, "language_loss": 0.63160539, "learning_rate": 7.187891296513075e-07, "loss": 0.65623659, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18664551, "step": 12136, "time_per_iteration": 3.0272276401519775 }, { "auxiliary_loss_clip": 0.0141774, "auxiliary_loss_mlp": 0.01035752, "balance_loss_clip": 1.25519276, "balance_loss_mlp": 1.01626086, "epoch": 0.7297159176311439, "flos": 26663070960000.0, "grad_norm": 1.7380209593505629, "language_loss": 0.75714183, "learning_rate": 7.184900979175654e-07, "loss": 0.78167677, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19494629, "step": 12137, "time_per_iteration": 2.953033447265625 }, { "auxiliary_loss_clip": 0.0142125, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.25770795, "balance_loss_mlp": 1.01673853, "epoch": 0.7297760408838118, "flos": 24759154736640.0, "grad_norm": 1.8072777112899314, "language_loss": 0.74538445, "learning_rate": 7.181911147788069e-07, "loss": 0.76996672, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20239258, "step": 12138, "time_per_iteration": 2.882807731628418 }, { "auxiliary_loss_clip": 0.01407229, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.24546206, "balance_loss_mlp": 1.01223886, "epoch": 0.7298361641364798, "flos": 18081915120000.0, "grad_norm": 3.920479000594158, "language_loss": 0.72374189, "learning_rate": 7.178921802463702e-07, "loss": 0.74812281, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18615723, "step": 12139, "time_per_iteration": 2.836174249649048 }, { "auxiliary_loss_clip": 0.01401452, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.2439878, "balance_loss_mlp": 1.01521599, "epoch": 0.7298962873891478, "flos": 29906925246720.0, "grad_norm": 1.5141390556737533, "language_loss": 0.73797715, "learning_rate": 7.175932943315898e-07, "loss": 0.76232386, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18017578, "step": 12140, "time_per_iteration": 2.8881592750549316 }, { "auxiliary_loss_clip": 0.0142193, "auxiliary_loss_mlp": 0.01038619, "balance_loss_clip": 1.25717688, "balance_loss_mlp": 1.0193429, "epoch": 0.7299564106418157, "flos": 32277760254720.0, "grad_norm": 1.4608758156402475, "language_loss": 0.55840242, "learning_rate": 7.172944570458003e-07, "loss": 0.58300787, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19274902, "step": 12141, "time_per_iteration": 2.9648938179016113 }, { "auxiliary_loss_clip": 0.01409371, "auxiliary_loss_mlp": 0.01029178, "balance_loss_clip": 1.25119138, "balance_loss_mlp": 1.01087952, "epoch": 0.7300165338944837, "flos": 22940715219840.0, "grad_norm": 1.602406960157579, "language_loss": 0.73512506, "learning_rate": 7.169956684003342e-07, "loss": 0.75951058, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.1829834, "step": 12142, "time_per_iteration": 2.8505566120147705 }, { "auxiliary_loss_clip": 0.01413953, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.25235128, "balance_loss_mlp": 1.0150075, "epoch": 0.7300766571471516, "flos": 19838359981440.0, "grad_norm": 2.9292595640583485, "language_loss": 0.74345177, "learning_rate": 7.16696928406521e-07, "loss": 0.76791871, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.17736816, "step": 12143, "time_per_iteration": 2.7940311431884766 }, { "auxiliary_loss_clip": 0.01420326, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.25857794, "balance_loss_mlp": 1.01352918, "epoch": 0.7301367803998197, "flos": 24357216764160.0, "grad_norm": 1.8394217001251587, "language_loss": 0.6771335, "learning_rate": 7.163982370756882e-07, "loss": 0.70166475, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19274902, "step": 12144, "time_per_iteration": 2.8618180751800537 }, { "auxiliary_loss_clip": 0.01435822, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.2707839, "balance_loss_mlp": 1.01723516, "epoch": 0.7301969036524876, "flos": 15312897457920.0, "grad_norm": 1.6799023123971732, "language_loss": 0.79869401, "learning_rate": 7.160995944191627e-07, "loss": 0.82341516, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19067383, "step": 12145, "time_per_iteration": 4.263585329055786 }, { "auxiliary_loss_clip": 0.01410192, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.24928892, "balance_loss_mlp": 1.01686215, "epoch": 0.7302570269051556, "flos": 23516077086720.0, "grad_norm": 1.7196227530170864, "language_loss": 0.92188263, "learning_rate": 7.158010004482702e-07, "loss": 0.94634664, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19360352, "step": 12146, "time_per_iteration": 2.862212896347046 }, { "auxiliary_loss_clip": 0.01409193, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.25016677, "balance_loss_mlp": 1.01747906, "epoch": 0.7303171501578235, "flos": 20533068927360.0, "grad_norm": 1.518822057580962, "language_loss": 0.62655491, "learning_rate": 7.155024551743316e-07, "loss": 0.65100288, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18115234, "step": 12147, "time_per_iteration": 2.8818328380584717 }, { "auxiliary_loss_clip": 0.01423427, "auxiliary_loss_mlp": 0.01037096, "balance_loss_clip": 1.25861001, "balance_loss_mlp": 1.01747429, "epoch": 0.7303772734104915, "flos": 18341720616960.0, "grad_norm": 2.0495099550884066, "language_loss": 0.75881886, "learning_rate": 7.152039586086693e-07, "loss": 0.78342414, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19604492, "step": 12148, "time_per_iteration": 2.846372365951538 }, { "auxiliary_loss_clip": 0.01181039, "auxiliary_loss_mlp": 0.01022885, "balance_loss_clip": 1.09290576, "balance_loss_mlp": 1.00629067, "epoch": 0.7304373966631594, "flos": 60683992087680.0, "grad_norm": 0.6928270853999937, "language_loss": 0.56796145, "learning_rate": 7.149055107626017e-07, "loss": 0.59000063, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16601562, "step": 12149, "time_per_iteration": 3.3290328979492188 }, { "auxiliary_loss_clip": 0.01434526, "auxiliary_loss_mlp": 0.01033884, "balance_loss_clip": 1.26920295, "balance_loss_mlp": 1.0140717, "epoch": 0.7304975199158275, "flos": 19837455085440.0, "grad_norm": 1.62426805894834, "language_loss": 0.74807513, "learning_rate": 7.146071116474451e-07, "loss": 0.7727592, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19799805, "step": 12150, "time_per_iteration": 2.8794026374816895 }, { "auxiliary_loss_clip": 0.01422095, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.2570622, "balance_loss_mlp": 1.01509881, "epoch": 0.7305576431684954, "flos": 13231621042560.0, "grad_norm": 1.9351863608951152, "language_loss": 0.84214652, "learning_rate": 7.143087612745158e-07, "loss": 0.86670554, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18688965, "step": 12151, "time_per_iteration": 4.4292590618133545 }, { "auxiliary_loss_clip": 0.01417669, "auxiliary_loss_mlp": 0.01037771, "balance_loss_clip": 1.25407624, "balance_loss_mlp": 1.01812577, "epoch": 0.7306177664211634, "flos": 24070961041920.0, "grad_norm": 1.9967447859671437, "language_loss": 0.78904712, "learning_rate": 7.14010459655127e-07, "loss": 0.81360149, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19641113, "step": 12152, "time_per_iteration": 2.9217722415924072 }, { "auxiliary_loss_clip": 0.01428311, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.26574302, "balance_loss_mlp": 1.01982534, "epoch": 0.7306778896738314, "flos": 27100462872960.0, "grad_norm": 1.5392357848382765, "language_loss": 0.80104268, "learning_rate": 7.137122068005919e-07, "loss": 0.82571125, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18713379, "step": 12153, "time_per_iteration": 2.9134867191314697 }, { "auxiliary_loss_clip": 0.01430217, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.26344872, "balance_loss_mlp": 1.01580751, "epoch": 0.7307380129264993, "flos": 16699329192960.0, "grad_norm": 1.6421420223235073, "language_loss": 0.68434024, "learning_rate": 7.134140027222173e-07, "loss": 0.70899516, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19482422, "step": 12154, "time_per_iteration": 2.845033884048462 }, { "auxiliary_loss_clip": 0.01420636, "auxiliary_loss_mlp": 0.01030948, "balance_loss_clip": 1.25512922, "balance_loss_mlp": 1.01213646, "epoch": 0.7307981361791673, "flos": 21735715708800.0, "grad_norm": 4.492142219477114, "language_loss": 0.66357714, "learning_rate": 7.131158474313128e-07, "loss": 0.68809301, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18811035, "step": 12155, "time_per_iteration": 2.849754810333252 }, { "auxiliary_loss_clip": 0.01396345, "auxiliary_loss_mlp": 0.01033052, "balance_loss_clip": 1.23734665, "balance_loss_mlp": 1.01478887, "epoch": 0.7308582594318352, "flos": 18049673560320.0, "grad_norm": 1.6665244227894938, "language_loss": 0.82763767, "learning_rate": 7.128177409391851e-07, "loss": 0.85193169, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18273926, "step": 12156, "time_per_iteration": 4.268205642700195 }, { "auxiliary_loss_clip": 0.01410951, "auxiliary_loss_mlp": 0.01033328, "balance_loss_clip": 1.24986553, "balance_loss_mlp": 1.01516032, "epoch": 0.7309183826845033, "flos": 13852933643520.0, "grad_norm": 3.8533759638416707, "language_loss": 0.76273942, "learning_rate": 7.125196832571367e-07, "loss": 0.78718221, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18164062, "step": 12157, "time_per_iteration": 2.8005106449127197 }, { "auxiliary_loss_clip": 0.01400901, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.24288297, "balance_loss_mlp": 1.01449132, "epoch": 0.7309785059371712, "flos": 17028006554880.0, "grad_norm": 1.9562726464332005, "language_loss": 0.73538417, "learning_rate": 7.122216743964713e-07, "loss": 0.75972795, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18994141, "step": 12158, "time_per_iteration": 4.2931084632873535 }, { "auxiliary_loss_clip": 0.01422029, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.25957143, "balance_loss_mlp": 1.01585102, "epoch": 0.7310386291898392, "flos": 26512839665280.0, "grad_norm": 1.5417582188937977, "language_loss": 0.86456656, "learning_rate": 7.119237143684896e-07, "loss": 0.88913852, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19299316, "step": 12159, "time_per_iteration": 2.883857011795044 }, { "auxiliary_loss_clip": 0.01428436, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.26024854, "balance_loss_mlp": 1.01381254, "epoch": 0.7310987524425071, "flos": 16954610209920.0, "grad_norm": 3.2173478691015887, "language_loss": 0.74050498, "learning_rate": 7.116258031844895e-07, "loss": 0.76512569, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.19836426, "step": 12160, "time_per_iteration": 2.8384621143341064 }, { "auxiliary_loss_clip": 0.01437303, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.26873279, "balance_loss_mlp": 1.01447415, "epoch": 0.7311588756951751, "flos": 13853340846720.0, "grad_norm": 2.0370377509305384, "language_loss": 0.73872417, "learning_rate": 7.113279408557675e-07, "loss": 0.76344442, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.20251465, "step": 12161, "time_per_iteration": 2.8398118019104004 }, { "auxiliary_loss_clip": 0.01452272, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.28069627, "balance_loss_mlp": 1.01271296, "epoch": 0.731218998947843, "flos": 28779801315840.0, "grad_norm": 1.990172930420111, "language_loss": 0.7049396, "learning_rate": 7.110301273936192e-07, "loss": 0.72979236, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 1.71679688, "router_z_loss_mlp": 0.20288086, "step": 12162, "time_per_iteration": 2.9155490398406982 }, { "auxiliary_loss_clip": 0.0142564, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.26104748, "balance_loss_mlp": 1.01080287, "epoch": 0.7312791222005111, "flos": 27100055669760.0, "grad_norm": 1.664013755551982, "language_loss": 0.67635882, "learning_rate": 7.107323628093382e-07, "loss": 0.70091987, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19665527, "step": 12163, "time_per_iteration": 2.8756847381591797 }, { "auxiliary_loss_clip": 0.01413078, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.250265, "balance_loss_mlp": 1.01398504, "epoch": 0.731339245453179, "flos": 20934056759040.0, "grad_norm": 1.7353501559267221, "language_loss": 0.69496351, "learning_rate": 7.104346471142153e-07, "loss": 0.71942818, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19396973, "step": 12164, "time_per_iteration": 2.940541982650757 }, { "auxiliary_loss_clip": 0.01396911, "auxiliary_loss_mlp": 0.0102904, "balance_loss_clip": 1.24070716, "balance_loss_mlp": 1.01088452, "epoch": 0.731399368705847, "flos": 23086014831360.0, "grad_norm": 1.5842497379038467, "language_loss": 0.74066412, "learning_rate": 7.101369803195391e-07, "loss": 0.76492357, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18151855, "step": 12165, "time_per_iteration": 2.9039058685302734 }, { "auxiliary_loss_clip": 0.01420772, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.25656939, "balance_loss_mlp": 1.01607335, "epoch": 0.731459491958515, "flos": 23592142874880.0, "grad_norm": 1.7521254493911536, "language_loss": 0.77537119, "learning_rate": 7.098393624365988e-07, "loss": 0.79993248, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19262695, "step": 12166, "time_per_iteration": 2.8808186054229736 }, { "auxiliary_loss_clip": 0.01409605, "auxiliary_loss_mlp": 0.01034906, "balance_loss_clip": 1.24981046, "balance_loss_mlp": 1.01655996, "epoch": 0.7315196152111829, "flos": 22388591197440.0, "grad_norm": 1.5528189795522882, "language_loss": 0.80287784, "learning_rate": 7.095417934766781e-07, "loss": 0.82732296, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18359375, "step": 12167, "time_per_iteration": 2.8568427562713623 }, { "auxiliary_loss_clip": 0.01408556, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.24723852, "balance_loss_mlp": 1.01393044, "epoch": 0.7315797384638509, "flos": 26188189090560.0, "grad_norm": 1.7444877027876589, "language_loss": 0.77444202, "learning_rate": 7.092442734510622e-07, "loss": 0.79886061, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19372559, "step": 12168, "time_per_iteration": 2.90659236907959 }, { "auxiliary_loss_clip": 0.0143656, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.27019918, "balance_loss_mlp": 1.01314545, "epoch": 0.7316398617165188, "flos": 21516114856320.0, "grad_norm": 1.4564037975460493, "language_loss": 0.82541478, "learning_rate": 7.089468023710326e-07, "loss": 0.85011536, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.20349121, "step": 12169, "time_per_iteration": 2.8382625579833984 }, { "auxiliary_loss_clip": 0.01428547, "auxiliary_loss_mlp": 0.01040736, "balance_loss_clip": 1.26373148, "balance_loss_mlp": 1.02134013, "epoch": 0.7316999849691869, "flos": 30494865168000.0, "grad_norm": 1.8102388019274824, "language_loss": 0.70870894, "learning_rate": 7.08649380247871e-07, "loss": 0.73340178, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19384766, "step": 12170, "time_per_iteration": 2.9168505668640137 }, { "auxiliary_loss_clip": 0.0141598, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.2526803, "balance_loss_mlp": 1.01076829, "epoch": 0.7317601082218548, "flos": 21553831036800.0, "grad_norm": 1.9089536986116789, "language_loss": 0.70784169, "learning_rate": 7.083520070928533e-07, "loss": 0.73231006, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20092773, "step": 12171, "time_per_iteration": 2.8841397762298584 }, { "auxiliary_loss_clip": 0.01413559, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.25127327, "balance_loss_mlp": 1.01866627, "epoch": 0.7318202314745228, "flos": 33263611361280.0, "grad_norm": 1.755393847573121, "language_loss": 0.6641283, "learning_rate": 7.080546829172564e-07, "loss": 0.68864071, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19018555, "step": 12172, "time_per_iteration": 2.9791955947875977 }, { "auxiliary_loss_clip": 0.01419949, "auxiliary_loss_mlp": 0.01028646, "balance_loss_clip": 1.2567836, "balance_loss_mlp": 1.01008463, "epoch": 0.7318803547271907, "flos": 20166675384960.0, "grad_norm": 2.961478202580039, "language_loss": 0.63451481, "learning_rate": 7.077574077323564e-07, "loss": 0.65900075, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18554688, "step": 12173, "time_per_iteration": 2.857858180999756 }, { "auxiliary_loss_clip": 0.01425286, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.2632643, "balance_loss_mlp": 1.01383281, "epoch": 0.7319404779798587, "flos": 20568296643840.0, "grad_norm": 2.7758347313704363, "language_loss": 0.74811161, "learning_rate": 7.074601815494243e-07, "loss": 0.77269053, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18774414, "step": 12174, "time_per_iteration": 2.8775856494903564 }, { "auxiliary_loss_clip": 0.01420713, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.26109838, "balance_loss_mlp": 1.0153048, "epoch": 0.7320006012325266, "flos": 28707400356480.0, "grad_norm": 3.9170360773339654, "language_loss": 0.81483513, "learning_rate": 7.071630043797317e-07, "loss": 0.83938032, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18505859, "step": 12175, "time_per_iteration": 2.907259702682495 }, { "auxiliary_loss_clip": 0.01414866, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.25256133, "balance_loss_mlp": 1.01616573, "epoch": 0.7320607244851947, "flos": 16371375747840.0, "grad_norm": 2.106907101039173, "language_loss": 0.77791011, "learning_rate": 7.068658762345488e-07, "loss": 0.8024087, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18823242, "step": 12176, "time_per_iteration": 2.8151955604553223 }, { "auxiliary_loss_clip": 0.01420999, "auxiliary_loss_mlp": 0.01033912, "balance_loss_clip": 1.25871789, "balance_loss_mlp": 1.01483858, "epoch": 0.7321208477378626, "flos": 20963719365120.0, "grad_norm": 1.5133117362291926, "language_loss": 0.77051294, "learning_rate": 7.065687971251399e-07, "loss": 0.79506207, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19067383, "step": 12177, "time_per_iteration": 2.873093366622925 }, { "auxiliary_loss_clip": 0.01403997, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.24365258, "balance_loss_mlp": 1.01927543, "epoch": 0.7321809709905306, "flos": 13853069377920.0, "grad_norm": 2.045475000563159, "language_loss": 0.75048345, "learning_rate": 7.06271767062772e-07, "loss": 0.77490115, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18505859, "step": 12178, "time_per_iteration": 2.8613827228546143 }, { "auxiliary_loss_clip": 0.0142279, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.2565943, "balance_loss_mlp": 1.01244116, "epoch": 0.7322410942431986, "flos": 26991341118720.0, "grad_norm": 2.0570167547969436, "language_loss": 0.83367783, "learning_rate": 7.059747860587084e-07, "loss": 0.85821503, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.18493652, "step": 12179, "time_per_iteration": 4.4028708934783936 }, { "auxiliary_loss_clip": 0.01402261, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.24561453, "balance_loss_mlp": 1.01705098, "epoch": 0.7323012174958665, "flos": 17648730973440.0, "grad_norm": 2.153217302235769, "language_loss": 0.75452822, "learning_rate": 7.056778541242115e-07, "loss": 0.77889371, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.17236328, "step": 12180, "time_per_iteration": 2.8169541358947754 }, { "auxiliary_loss_clip": 0.014332, "auxiliary_loss_mlp": 0.01033884, "balance_loss_clip": 1.2652601, "balance_loss_mlp": 1.01409554, "epoch": 0.7323613407485345, "flos": 32355409610880.0, "grad_norm": 2.3264812745166603, "language_loss": 0.79871941, "learning_rate": 7.053809712705396e-07, "loss": 0.82339025, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19787598, "step": 12181, "time_per_iteration": 2.934889078140259 }, { "auxiliary_loss_clip": 0.01434202, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.26843023, "balance_loss_mlp": 1.01768303, "epoch": 0.7324214640012024, "flos": 18370252103040.0, "grad_norm": 2.040871381491611, "language_loss": 0.72379398, "learning_rate": 7.050841375089506e-07, "loss": 0.74849242, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.17956543, "step": 12182, "time_per_iteration": 2.8124849796295166 }, { "auxiliary_loss_clip": 0.01426889, "auxiliary_loss_mlp": 0.01035579, "balance_loss_clip": 1.26397872, "balance_loss_mlp": 1.01694655, "epoch": 0.7324815872538705, "flos": 30825035608320.0, "grad_norm": 1.6740843411565123, "language_loss": 0.72198987, "learning_rate": 7.047873528507015e-07, "loss": 0.74661458, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.1862793, "step": 12183, "time_per_iteration": 2.9112558364868164 }, { "auxiliary_loss_clip": 0.01431883, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.26551509, "balance_loss_mlp": 1.01470518, "epoch": 0.7325417105065384, "flos": 21514531288320.0, "grad_norm": 2.668768259784539, "language_loss": 0.73316371, "learning_rate": 7.04490617307045e-07, "loss": 0.75782597, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19628906, "step": 12184, "time_per_iteration": 2.8837087154388428 }, { "auxiliary_loss_clip": 0.01183818, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.0960803, "balance_loss_mlp": 1.01391339, "epoch": 0.7326018337592064, "flos": 67288423541760.0, "grad_norm": 0.7638295535750713, "language_loss": 0.65274191, "learning_rate": 7.041939308892344e-07, "loss": 0.67489564, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.17675781, "step": 12185, "time_per_iteration": 3.3399131298065186 }, { "auxiliary_loss_clip": 0.01421726, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.25642347, "balance_loss_mlp": 1.01394737, "epoch": 0.7326619570118743, "flos": 22867409364480.0, "grad_norm": 1.8778363764887303, "language_loss": 0.81531566, "learning_rate": 7.038972936085197e-07, "loss": 0.83986723, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19470215, "step": 12186, "time_per_iteration": 2.874206066131592 }, { "auxiliary_loss_clip": 0.01421805, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.25532889, "balance_loss_mlp": 1.01296163, "epoch": 0.7327220802645423, "flos": 23336952347520.0, "grad_norm": 2.219260085424728, "language_loss": 0.74260879, "learning_rate": 7.036007054761508e-07, "loss": 0.76714885, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.19238281, "step": 12187, "time_per_iteration": 4.278978109359741 }, { "auxiliary_loss_clip": 0.01419982, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.25594997, "balance_loss_mlp": 1.0190165, "epoch": 0.7327822035172102, "flos": 23189842944000.0, "grad_norm": 1.5967643417105928, "language_loss": 0.89733565, "learning_rate": 7.033041665033716e-07, "loss": 0.9219135, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18786621, "step": 12188, "time_per_iteration": 2.8587846755981445 }, { "auxiliary_loss_clip": 0.01431644, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.2651583, "balance_loss_mlp": 1.01436424, "epoch": 0.7328423267698783, "flos": 21075782031360.0, "grad_norm": 1.9830401615224187, "language_loss": 0.75746024, "learning_rate": 7.030076767014284e-07, "loss": 0.78211343, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.1932373, "step": 12189, "time_per_iteration": 2.892582654953003 }, { "auxiliary_loss_clip": 0.01424124, "auxiliary_loss_mlp": 0.0103034, "balance_loss_clip": 1.25799358, "balance_loss_mlp": 1.0115881, "epoch": 0.7329024500225462, "flos": 21699628341120.0, "grad_norm": 1.5157209163800363, "language_loss": 0.82544619, "learning_rate": 7.027112360815648e-07, "loss": 0.84999084, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.18737793, "step": 12190, "time_per_iteration": 2.8556103706359863 }, { "auxiliary_loss_clip": 0.01428534, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.2644434, "balance_loss_mlp": 1.01666617, "epoch": 0.7329625732752142, "flos": 24173024607360.0, "grad_norm": 5.460922811155115, "language_loss": 0.7250489, "learning_rate": 7.024148446550204e-07, "loss": 0.74969739, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.1965332, "step": 12191, "time_per_iteration": 4.3539440631866455 }, { "auxiliary_loss_clip": 0.01412756, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.25056529, "balance_loss_mlp": 1.0154103, "epoch": 0.7330226965278822, "flos": 30089262366720.0, "grad_norm": 1.4700610688828695, "language_loss": 0.69313133, "learning_rate": 7.021185024330361e-07, "loss": 0.71759522, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18237305, "step": 12192, "time_per_iteration": 2.9408326148986816 }, { "auxiliary_loss_clip": 0.01417447, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.25473011, "balance_loss_mlp": 1.01618826, "epoch": 0.7330828197805501, "flos": 23378876294400.0, "grad_norm": 1.5099641975298483, "language_loss": 0.74161553, "learning_rate": 7.01822209426848e-07, "loss": 0.76614076, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18884277, "step": 12193, "time_per_iteration": 4.236151933670044 }, { "auxiliary_loss_clip": 0.01431983, "auxiliary_loss_mlp": 0.01039718, "balance_loss_clip": 1.26601362, "balance_loss_mlp": 1.01986909, "epoch": 0.7331429430332181, "flos": 21042680820480.0, "grad_norm": 3.115842133322399, "language_loss": 0.77708405, "learning_rate": 7.015259656476911e-07, "loss": 0.80180109, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19836426, "step": 12194, "time_per_iteration": 2.855551242828369 }, { "auxiliary_loss_clip": 0.01422459, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.26168334, "balance_loss_mlp": 1.01199722, "epoch": 0.733203066285886, "flos": 14656130916480.0, "grad_norm": 1.7887486590778239, "language_loss": 0.71235657, "learning_rate": 7.012297711067998e-07, "loss": 0.73688817, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18701172, "step": 12195, "time_per_iteration": 2.8642587661743164 }, { "auxiliary_loss_clip": 0.01414597, "auxiliary_loss_mlp": 0.01033619, "balance_loss_clip": 1.25147915, "balance_loss_mlp": 1.01485586, "epoch": 0.7332631895385541, "flos": 17174165817600.0, "grad_norm": 1.813232833088303, "language_loss": 0.72967637, "learning_rate": 7.009336258154057e-07, "loss": 0.7541585, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18774414, "step": 12196, "time_per_iteration": 2.8394932746887207 }, { "auxiliary_loss_clip": 0.01410346, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.24886489, "balance_loss_mlp": 1.01289451, "epoch": 0.733323312791222, "flos": 28669955644800.0, "grad_norm": 4.167459497462062, "language_loss": 0.72292292, "learning_rate": 7.006375297847394e-07, "loss": 0.74735004, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19470215, "step": 12197, "time_per_iteration": 2.897859573364258 }, { "auxiliary_loss_clip": 0.01441295, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.27169442, "balance_loss_mlp": 1.01945019, "epoch": 0.73338343604389, "flos": 16627244947200.0, "grad_norm": 3.318957471792519, "language_loss": 0.78810734, "learning_rate": 7.003414830260282e-07, "loss": 0.81290925, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19445801, "step": 12198, "time_per_iteration": 2.8272902965545654 }, { "auxiliary_loss_clip": 0.0141534, "auxiliary_loss_mlp": 0.01031844, "balance_loss_clip": 1.25269508, "balance_loss_mlp": 1.0133549, "epoch": 0.7334435592965579, "flos": 21151938309120.0, "grad_norm": 1.8188677972355827, "language_loss": 0.7538327, "learning_rate": 7.000454855504974e-07, "loss": 0.77830452, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18481445, "step": 12199, "time_per_iteration": 2.848068952560425 }, { "auxiliary_loss_clip": 0.01425221, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.25864565, "balance_loss_mlp": 1.01849341, "epoch": 0.7335036825492259, "flos": 17133282501120.0, "grad_norm": 2.326550013928157, "language_loss": 0.77063072, "learning_rate": 6.997495373693729e-07, "loss": 0.79525936, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19140625, "step": 12200, "time_per_iteration": 2.785109758377075 }, { "auxiliary_loss_clip": 0.01411451, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.24942482, "balance_loss_mlp": 1.01507354, "epoch": 0.7335638058018938, "flos": 23742147945600.0, "grad_norm": 1.5730928035021519, "language_loss": 0.6242879, "learning_rate": 6.994536384938754e-07, "loss": 0.64874351, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19030762, "step": 12201, "time_per_iteration": 2.9968905448913574 }, { "auxiliary_loss_clip": 0.01409568, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.24926198, "balance_loss_mlp": 1.00991321, "epoch": 0.7336239290545619, "flos": 34946614632960.0, "grad_norm": 1.5758784423194212, "language_loss": 0.5288465, "learning_rate": 6.991577889352264e-07, "loss": 0.55322552, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1842041, "step": 12202, "time_per_iteration": 3.031066417694092 }, { "auxiliary_loss_clip": 0.01410533, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.25046718, "balance_loss_mlp": 1.01330662, "epoch": 0.7336840523072298, "flos": 21112231357440.0, "grad_norm": 1.9040689816356613, "language_loss": 0.69198108, "learning_rate": 6.98861988704645e-07, "loss": 0.71640384, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18444824, "step": 12203, "time_per_iteration": 2.9745562076568604 }, { "auxiliary_loss_clip": 0.01444809, "auxiliary_loss_mlp": 0.0103704, "balance_loss_clip": 1.27709675, "balance_loss_mlp": 1.01865745, "epoch": 0.7337441755598978, "flos": 24035009408640.0, "grad_norm": 2.0738469384554135, "language_loss": 0.66661823, "learning_rate": 6.985662378133474e-07, "loss": 0.69143671, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.18383789, "step": 12204, "time_per_iteration": 3.015991687774658 }, { "auxiliary_loss_clip": 0.01403621, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.24504113, "balance_loss_mlp": 1.01553845, "epoch": 0.7338042988125658, "flos": 22721657304960.0, "grad_norm": 1.9622609295074442, "language_loss": 0.77987564, "learning_rate": 6.982705362725479e-07, "loss": 0.8042419, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.17480469, "step": 12205, "time_per_iteration": 2.977876663208008 }, { "auxiliary_loss_clip": 0.01408619, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.249789, "balance_loss_mlp": 1.00947785, "epoch": 0.7338644220652337, "flos": 21370905734400.0, "grad_norm": 1.5779325729850755, "language_loss": 0.80506128, "learning_rate": 6.979748840934601e-07, "loss": 0.82941848, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1763916, "step": 12206, "time_per_iteration": 2.8631017208099365 }, { "auxiliary_loss_clip": 0.01412823, "auxiliary_loss_mlp": 0.01029561, "balance_loss_clip": 1.24944293, "balance_loss_mlp": 1.01110768, "epoch": 0.7339245453179017, "flos": 30932619039360.0, "grad_norm": 1.9412489634940304, "language_loss": 0.72037524, "learning_rate": 6.976792812872958e-07, "loss": 0.74479914, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18469238, "step": 12207, "time_per_iteration": 3.004380464553833 }, { "auxiliary_loss_clip": 0.01182144, "auxiliary_loss_mlp": 0.01023238, "balance_loss_clip": 1.09413695, "balance_loss_mlp": 1.00206661, "epoch": 0.7339846685705697, "flos": 67926368246400.0, "grad_norm": 0.7977854237602594, "language_loss": 0.54840302, "learning_rate": 6.97383727865263e-07, "loss": 0.57045686, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21191406, "step": 12208, "time_per_iteration": 3.44126033782959 }, { "auxiliary_loss_clip": 0.0142219, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.25859594, "balance_loss_mlp": 1.01344717, "epoch": 0.7340447918232377, "flos": 22246911169920.0, "grad_norm": 1.5545001626144244, "language_loss": 0.8091886, "learning_rate": 6.970882238385703e-07, "loss": 0.83372295, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.17822266, "step": 12209, "time_per_iteration": 2.8994128704071045 }, { "auxiliary_loss_clip": 0.0140508, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.24523449, "balance_loss_mlp": 1.01238394, "epoch": 0.7341049150759056, "flos": 23774208526080.0, "grad_norm": 1.4988498983032987, "language_loss": 0.79433399, "learning_rate": 6.96792769218423e-07, "loss": 0.81869316, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18444824, "step": 12210, "time_per_iteration": 2.8393685817718506 }, { "auxiliary_loss_clip": 0.01408763, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.24821687, "balance_loss_mlp": 1.00983596, "epoch": 0.7341650383285736, "flos": 17244983208960.0, "grad_norm": 1.6106678190251569, "language_loss": 0.7728278, "learning_rate": 6.964973640160236e-07, "loss": 0.79720163, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18786621, "step": 12211, "time_per_iteration": 2.946786403656006 }, { "auxiliary_loss_clip": 0.01415564, "auxiliary_loss_mlp": 0.01032518, "balance_loss_clip": 1.2542367, "balance_loss_mlp": 1.01457667, "epoch": 0.7342251615812415, "flos": 23414330234880.0, "grad_norm": 1.9314416231516227, "language_loss": 0.72692382, "learning_rate": 6.962020082425748e-07, "loss": 0.75140464, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.17944336, "step": 12212, "time_per_iteration": 2.8833909034729004 }, { "auxiliary_loss_clip": 0.01418858, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.25567532, "balance_loss_mlp": 1.01568925, "epoch": 0.7342852848339095, "flos": 22757382714240.0, "grad_norm": 1.4886691017838554, "language_loss": 0.69618988, "learning_rate": 6.959067019092766e-07, "loss": 0.7207216, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18603516, "step": 12213, "time_per_iteration": 2.891525983810425 }, { "auxiliary_loss_clip": 0.0118434, "auxiliary_loss_mlp": 0.0101742, "balance_loss_clip": 1.09507346, "balance_loss_mlp": 1.00025403, "epoch": 0.7343454080865774, "flos": 53970076920960.0, "grad_norm": 0.7189568660841992, "language_loss": 0.54287273, "learning_rate": 6.956114450273276e-07, "loss": 0.56489033, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.171875, "step": 12214, "time_per_iteration": 4.682013273239136 }, { "auxiliary_loss_clip": 0.01426852, "auxiliary_loss_mlp": 0.01027795, "balance_loss_clip": 1.26112127, "balance_loss_mlp": 1.00981867, "epoch": 0.7344055313392455, "flos": 12174454851840.0, "grad_norm": 2.8370277099407204, "language_loss": 0.71373463, "learning_rate": 6.953162376079233e-07, "loss": 0.73828113, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.17980957, "step": 12215, "time_per_iteration": 2.8478610515594482 }, { "auxiliary_loss_clip": 0.01409256, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.24950206, "balance_loss_mlp": 1.01264167, "epoch": 0.7344656545919134, "flos": 18558833005440.0, "grad_norm": 1.6602652850301425, "language_loss": 0.72997713, "learning_rate": 6.950210796622573e-07, "loss": 0.75437117, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17492676, "step": 12216, "time_per_iteration": 2.888833522796631 }, { "auxiliary_loss_clip": 0.01441175, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.2710309, "balance_loss_mlp": 1.02099597, "epoch": 0.7345257778445814, "flos": 23672687898240.0, "grad_norm": 1.7651383828225713, "language_loss": 0.78770089, "learning_rate": 6.947259712015236e-07, "loss": 0.8125217, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 1.70214844, "router_z_loss_mlp": 0.19909668, "step": 12217, "time_per_iteration": 2.9105124473571777 }, { "auxiliary_loss_clip": 0.01404016, "auxiliary_loss_mlp": 0.01032018, "balance_loss_clip": 1.24358582, "balance_loss_mlp": 1.01362419, "epoch": 0.7345859010972494, "flos": 13816665296640.0, "grad_norm": 1.8904346109585348, "language_loss": 0.79478109, "learning_rate": 6.94430912236911e-07, "loss": 0.81914145, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18383789, "step": 12218, "time_per_iteration": 2.8076541423797607 }, { "auxiliary_loss_clip": 0.01408233, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.24873471, "balance_loss_mlp": 1.01695013, "epoch": 0.7346460243499173, "flos": 22282772313600.0, "grad_norm": 3.272117043162063, "language_loss": 0.73298669, "learning_rate": 6.941359027796092e-07, "loss": 0.75742722, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18859863, "step": 12219, "time_per_iteration": 2.8672537803649902 }, { "auxiliary_loss_clip": 0.0140541, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.24709904, "balance_loss_mlp": 1.01934862, "epoch": 0.7347061476025853, "flos": 23264868101760.0, "grad_norm": 2.1786606809324827, "language_loss": 0.75042439, "learning_rate": 6.938409428408061e-07, "loss": 0.77485186, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.17980957, "step": 12220, "time_per_iteration": 2.9258995056152344 }, { "auxiliary_loss_clip": 0.01422885, "auxiliary_loss_mlp": 0.01035419, "balance_loss_clip": 1.25726867, "balance_loss_mlp": 1.01633334, "epoch": 0.7347662708552533, "flos": 15275814704640.0, "grad_norm": 4.677673971840966, "language_loss": 0.6670211, "learning_rate": 6.93546032431684e-07, "loss": 0.69160414, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19091797, "step": 12221, "time_per_iteration": 2.851973056793213 }, { "auxiliary_loss_clip": 0.0142313, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.26042795, "balance_loss_mlp": 1.01871204, "epoch": 0.7348263941079213, "flos": 24869860058880.0, "grad_norm": 5.868286735877517, "language_loss": 0.70613265, "learning_rate": 6.932511715634273e-07, "loss": 0.73072469, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.17370605, "step": 12222, "time_per_iteration": 4.33062744140625 }, { "auxiliary_loss_clip": 0.01416289, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.25556409, "balance_loss_mlp": 1.01704121, "epoch": 0.7348865173605892, "flos": 24362600895360.0, "grad_norm": 1.5084475815124314, "language_loss": 0.66920483, "learning_rate": 6.92956360247217e-07, "loss": 0.69371104, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1730957, "step": 12223, "time_per_iteration": 2.9391605854034424 }, { "auxiliary_loss_clip": 0.0140984, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.24813056, "balance_loss_mlp": 1.01402521, "epoch": 0.7349466406132572, "flos": 20012507792640.0, "grad_norm": 2.0089764764572373, "language_loss": 0.73394555, "learning_rate": 6.926615984942332e-07, "loss": 0.75837028, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18603516, "step": 12224, "time_per_iteration": 2.8297414779663086 }, { "auxiliary_loss_clip": 0.01422018, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.25857091, "balance_loss_mlp": 1.01541281, "epoch": 0.7350067638659251, "flos": 29837012751360.0, "grad_norm": 1.6755698051363455, "language_loss": 0.73207664, "learning_rate": 6.92366886315652e-07, "loss": 0.75664383, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19287109, "step": 12225, "time_per_iteration": 2.9107179641723633 }, { "auxiliary_loss_clip": 0.01427414, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.25953746, "balance_loss_mlp": 1.01934457, "epoch": 0.7350668871185931, "flos": 21874726293120.0, "grad_norm": 1.6624366318660295, "language_loss": 0.76676399, "learning_rate": 6.920722237226501e-07, "loss": 0.791426, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.19433594, "step": 12226, "time_per_iteration": 4.346354246139526 }, { "auxiliary_loss_clip": 0.0141891, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.25616336, "balance_loss_mlp": 1.01309013, "epoch": 0.735127010371261, "flos": 22576629162240.0, "grad_norm": 1.6653111628269737, "language_loss": 0.67198575, "learning_rate": 6.917776107264008e-07, "loss": 0.69648784, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18200684, "step": 12227, "time_per_iteration": 4.163660526275635 }, { "auxiliary_loss_clip": 0.0143013, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.26499081, "balance_loss_mlp": 1.01538217, "epoch": 0.7351871336239291, "flos": 25895191893120.0, "grad_norm": 2.2856690738781578, "language_loss": 0.64357382, "learning_rate": 6.914830473380749e-07, "loss": 0.66821527, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.1862793, "step": 12228, "time_per_iteration": 2.9297831058502197 }, { "auxiliary_loss_clip": 0.01412152, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.24964869, "balance_loss_mlp": 1.01594281, "epoch": 0.735247256876597, "flos": 17941411457280.0, "grad_norm": 1.9135418719853745, "language_loss": 0.64123833, "learning_rate": 6.911885335688427e-07, "loss": 0.66569978, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18029785, "step": 12229, "time_per_iteration": 2.90708065032959 }, { "auxiliary_loss_clip": 0.01416365, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.25083709, "balance_loss_mlp": 1.01502705, "epoch": 0.735307380129265, "flos": 28886525095680.0, "grad_norm": 1.8734556462241183, "language_loss": 0.74193907, "learning_rate": 6.908940694298726e-07, "loss": 0.7664482, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19519043, "step": 12230, "time_per_iteration": 2.928051471710205 }, { "auxiliary_loss_clip": 0.01418229, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.2539376, "balance_loss_mlp": 1.01496375, "epoch": 0.7353675033819329, "flos": 13633468525440.0, "grad_norm": 2.1408634448657, "language_loss": 0.73646963, "learning_rate": 6.90599654932332e-07, "loss": 0.7609905, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18884277, "step": 12231, "time_per_iteration": 2.8995578289031982 }, { "auxiliary_loss_clip": 0.01426972, "auxiliary_loss_mlp": 0.01036747, "balance_loss_clip": 1.26177144, "balance_loss_mlp": 1.01732802, "epoch": 0.7354276266346009, "flos": 19472328397440.0, "grad_norm": 2.899890881830692, "language_loss": 0.65607405, "learning_rate": 6.903052900873823e-07, "loss": 0.68071127, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19396973, "step": 12232, "time_per_iteration": 2.8283371925354004 }, { "auxiliary_loss_clip": 0.01413643, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.25064731, "balance_loss_mlp": 1.01862907, "epoch": 0.735487749887269, "flos": 15778549388160.0, "grad_norm": 2.2239155700182964, "language_loss": 0.76442516, "learning_rate": 6.900109749061874e-07, "loss": 0.78893816, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19006348, "step": 12233, "time_per_iteration": 2.8176674842834473 }, { "auxiliary_loss_clip": 0.01413733, "auxiliary_loss_mlp": 0.01032655, "balance_loss_clip": 1.25017416, "balance_loss_mlp": 1.01285481, "epoch": 0.7355478731399369, "flos": 18269998329600.0, "grad_norm": 1.560039952349905, "language_loss": 0.74821484, "learning_rate": 6.897167093999079e-07, "loss": 0.77267873, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19812012, "step": 12234, "time_per_iteration": 2.837233066558838 }, { "auxiliary_loss_clip": 0.01429202, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.26345897, "balance_loss_mlp": 1.01604033, "epoch": 0.7356079963926049, "flos": 26553315778560.0, "grad_norm": 2.1742725849651414, "language_loss": 0.60806161, "learning_rate": 6.894224935797017e-07, "loss": 0.63269895, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18469238, "step": 12235, "time_per_iteration": 2.8840866088867188 }, { "auxiliary_loss_clip": 0.01402766, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.24308801, "balance_loss_mlp": 1.0152235, "epoch": 0.7356681196452728, "flos": 10785398918400.0, "grad_norm": 2.3129606938394076, "language_loss": 0.87187696, "learning_rate": 6.891283274567259e-07, "loss": 0.89624709, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19030762, "step": 12236, "time_per_iteration": 2.818617343902588 }, { "auxiliary_loss_clip": 0.01417099, "auxiliary_loss_mlp": 0.01031976, "balance_loss_clip": 1.25281239, "balance_loss_mlp": 1.01312888, "epoch": 0.7357282428979408, "flos": 19728061862400.0, "grad_norm": 1.6391001768455928, "language_loss": 0.70133388, "learning_rate": 6.888342110421364e-07, "loss": 0.72582459, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18859863, "step": 12237, "time_per_iteration": 2.8453164100646973 }, { "auxiliary_loss_clip": 0.01421794, "auxiliary_loss_mlp": 0.01034029, "balance_loss_clip": 1.25807977, "balance_loss_mlp": 1.01568282, "epoch": 0.7357883661506087, "flos": 19473504762240.0, "grad_norm": 1.9391936545276878, "language_loss": 0.73563975, "learning_rate": 6.885401443470839e-07, "loss": 0.760198, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18347168, "step": 12238, "time_per_iteration": 2.845498561859131 }, { "auxiliary_loss_clip": 0.014551, "auxiliary_loss_mlp": 0.01032736, "balance_loss_clip": 1.28323746, "balance_loss_mlp": 1.0130899, "epoch": 0.7358484894032767, "flos": 27133699818240.0, "grad_norm": 1.6910244009889452, "language_loss": 0.73260468, "learning_rate": 6.882461273827205e-07, "loss": 0.75748307, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 1.71582031, "router_z_loss_mlp": 0.1965332, "step": 12239, "time_per_iteration": 2.946596384048462 }, { "auxiliary_loss_clip": 0.01401344, "auxiliary_loss_mlp": 0.01035158, "balance_loss_clip": 1.24282598, "balance_loss_mlp": 1.01621532, "epoch": 0.7359086126559446, "flos": 24513918065280.0, "grad_norm": 1.393882199598283, "language_loss": 0.79548323, "learning_rate": 6.879521601601954e-07, "loss": 0.81984824, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18933105, "step": 12240, "time_per_iteration": 2.910717725753784 }, { "auxiliary_loss_clip": 0.01405814, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.24570513, "balance_loss_mlp": 1.0174253, "epoch": 0.7359687359086127, "flos": 23341703051520.0, "grad_norm": 1.8277993682149685, "language_loss": 0.84409958, "learning_rate": 6.876582426906565e-07, "loss": 0.86853129, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19921875, "step": 12241, "time_per_iteration": 2.865825891494751 }, { "auxiliary_loss_clip": 0.01401639, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.24251878, "balance_loss_mlp": 1.01416445, "epoch": 0.7360288591612806, "flos": 20202988976640.0, "grad_norm": 1.8617281255384062, "language_loss": 0.79468137, "learning_rate": 6.873643749852484e-07, "loss": 0.81902564, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.1862793, "step": 12242, "time_per_iteration": 2.8347976207733154 }, { "auxiliary_loss_clip": 0.01407578, "auxiliary_loss_mlp": 0.01029228, "balance_loss_clip": 1.24867463, "balance_loss_mlp": 1.01055956, "epoch": 0.7360889824139486, "flos": 24983551537920.0, "grad_norm": 1.7498059033327418, "language_loss": 0.80127871, "learning_rate": 6.870705570551145e-07, "loss": 0.82564676, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18652344, "step": 12243, "time_per_iteration": 2.9074203968048096 }, { "auxiliary_loss_clip": 0.01429183, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.26185572, "balance_loss_mlp": 1.0117141, "epoch": 0.7361491056666165, "flos": 15020533687680.0, "grad_norm": 5.777180210163534, "language_loss": 0.75494832, "learning_rate": 6.867767889113969e-07, "loss": 0.77955103, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19360352, "step": 12244, "time_per_iteration": 2.98274564743042 }, { "auxiliary_loss_clip": 0.0142676, "auxiliary_loss_mlp": 0.01035732, "balance_loss_clip": 1.26214671, "balance_loss_mlp": 1.01696849, "epoch": 0.7362092289192845, "flos": 22940850954240.0, "grad_norm": 1.7032348932661139, "language_loss": 0.69747078, "learning_rate": 6.864830705652347e-07, "loss": 0.72209573, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18762207, "step": 12245, "time_per_iteration": 2.970857620239258 }, { "auxiliary_loss_clip": 0.01402441, "auxiliary_loss_mlp": 0.0103497, "balance_loss_clip": 1.24581945, "balance_loss_mlp": 1.01539636, "epoch": 0.7362693521719526, "flos": 20712012687360.0, "grad_norm": 1.6140317245914846, "language_loss": 0.74226308, "learning_rate": 6.861894020277658e-07, "loss": 0.76663721, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19567871, "step": 12246, "time_per_iteration": 2.882201910018921 }, { "auxiliary_loss_clip": 0.01399891, "auxiliary_loss_mlp": 0.01030378, "balance_loss_clip": 1.24269986, "balance_loss_mlp": 1.01169801, "epoch": 0.7363294754246205, "flos": 13118155787520.0, "grad_norm": 1.9950178838034276, "language_loss": 0.74152613, "learning_rate": 6.858957833101266e-07, "loss": 0.76582879, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18676758, "step": 12247, "time_per_iteration": 2.884002685546875 }, { "auxiliary_loss_clip": 0.01403033, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.244524, "balance_loss_mlp": 1.01494491, "epoch": 0.7363895986772885, "flos": 14035451742720.0, "grad_norm": 1.5585204651088596, "language_loss": 0.74803984, "learning_rate": 6.856022144234526e-07, "loss": 0.77240157, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18200684, "step": 12248, "time_per_iteration": 2.854557991027832 }, { "auxiliary_loss_clip": 0.01425114, "auxiliary_loss_mlp": 0.01040083, "balance_loss_clip": 1.26047111, "balance_loss_mlp": 1.02060461, "epoch": 0.7364497219299564, "flos": 19729690675200.0, "grad_norm": 2.296185857367207, "language_loss": 0.73177695, "learning_rate": 6.853086953788727e-07, "loss": 0.75642896, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19482422, "step": 12249, "time_per_iteration": 2.827298879623413 }, { "auxiliary_loss_clip": 0.01411134, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.24877596, "balance_loss_mlp": 1.01379228, "epoch": 0.7365098451826244, "flos": 21371403427200.0, "grad_norm": 1.8170360875173526, "language_loss": 0.77639294, "learning_rate": 6.850152261875189e-07, "loss": 0.80082852, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1862793, "step": 12250, "time_per_iteration": 4.34359073638916 }, { "auxiliary_loss_clip": 0.01421135, "auxiliary_loss_mlp": 0.01038504, "balance_loss_clip": 1.25637805, "balance_loss_mlp": 1.01916838, "epoch": 0.7365699684352923, "flos": 23378785804800.0, "grad_norm": 1.5896066097467292, "language_loss": 0.7186631, "learning_rate": 6.8472180686052e-07, "loss": 0.74325949, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1932373, "step": 12251, "time_per_iteration": 2.865645408630371 }, { "auxiliary_loss_clip": 0.01409703, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.24939156, "balance_loss_mlp": 1.01346111, "epoch": 0.7366300916879603, "flos": 59544634078080.0, "grad_norm": 1.492416236789137, "language_loss": 0.65935707, "learning_rate": 6.844284374090015e-07, "loss": 0.68377566, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18688965, "step": 12252, "time_per_iteration": 3.162604331970215 }, { "auxiliary_loss_clip": 0.01433691, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.27008629, "balance_loss_mlp": 1.02037001, "epoch": 0.7366902149406283, "flos": 20932925639040.0, "grad_norm": 1.5685067232738175, "language_loss": 0.79521799, "learning_rate": 6.841351178440884e-07, "loss": 0.81994313, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18444824, "step": 12253, "time_per_iteration": 2.841643810272217 }, { "auxiliary_loss_clip": 0.0141019, "auxiliary_loss_mlp": 0.01032163, "balance_loss_clip": 1.25079918, "balance_loss_mlp": 1.01394808, "epoch": 0.7367503381932963, "flos": 17357181609600.0, "grad_norm": 3.259277797893367, "language_loss": 0.76925021, "learning_rate": 6.83841848176905e-07, "loss": 0.79367375, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18212891, "step": 12254, "time_per_iteration": 2.831362247467041 }, { "auxiliary_loss_clip": 0.01415919, "auxiliary_loss_mlp": 0.01036054, "balance_loss_clip": 1.25413477, "balance_loss_mlp": 1.01745713, "epoch": 0.7368104614459642, "flos": 17830163197440.0, "grad_norm": 2.6427606885279227, "language_loss": 0.69866812, "learning_rate": 6.835486284185692e-07, "loss": 0.7231878, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18591309, "step": 12255, "time_per_iteration": 2.841979742050171 }, { "auxiliary_loss_clip": 0.01420826, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.25621819, "balance_loss_mlp": 1.01512516, "epoch": 0.7368705846986322, "flos": 24616298344320.0, "grad_norm": 2.928222090632513, "language_loss": 0.75753665, "learning_rate": 6.832554585802012e-07, "loss": 0.7820847, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.1887207, "step": 12256, "time_per_iteration": 2.8728294372558594 }, { "auxiliary_loss_clip": 0.01419014, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.25689411, "balance_loss_mlp": 1.01575375, "epoch": 0.7369307079513001, "flos": 34983606896640.0, "grad_norm": 1.7184798826014815, "language_loss": 0.73974454, "learning_rate": 6.829623386729182e-07, "loss": 0.76428676, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19445801, "step": 12257, "time_per_iteration": 4.456803560256958 }, { "auxiliary_loss_clip": 0.01412985, "auxiliary_loss_mlp": 0.01035744, "balance_loss_clip": 1.25197005, "balance_loss_mlp": 1.01715922, "epoch": 0.7369908312039681, "flos": 21224339268480.0, "grad_norm": 1.490086311210768, "language_loss": 0.78580344, "learning_rate": 6.826692687078362e-07, "loss": 0.81029075, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18591309, "step": 12258, "time_per_iteration": 2.8747777938842773 }, { "auxiliary_loss_clip": 0.01423703, "auxiliary_loss_mlp": 0.01035552, "balance_loss_clip": 1.26033556, "balance_loss_mlp": 1.01655006, "epoch": 0.7370509544566362, "flos": 23634293045760.0, "grad_norm": 1.9948486547070594, "language_loss": 0.67184222, "learning_rate": 6.823762486960674e-07, "loss": 0.6964348, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19006348, "step": 12259, "time_per_iteration": 2.9185097217559814 }, { "auxiliary_loss_clip": 0.01410273, "auxiliary_loss_mlp": 0.01037073, "balance_loss_clip": 1.24910831, "balance_loss_mlp": 1.0191915, "epoch": 0.7371110777093041, "flos": 24838659129600.0, "grad_norm": 1.5986435996779478, "language_loss": 0.73472512, "learning_rate": 6.820832786487225e-07, "loss": 0.75919861, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.17883301, "step": 12260, "time_per_iteration": 2.880825996398926 }, { "auxiliary_loss_clip": 0.01420284, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.25789273, "balance_loss_mlp": 1.01263177, "epoch": 0.7371712009619721, "flos": 23160044603520.0, "grad_norm": 1.6405907234592003, "language_loss": 0.74108493, "learning_rate": 6.817903585769125e-07, "loss": 0.76559961, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1854248, "step": 12261, "time_per_iteration": 4.340795040130615 }, { "auxiliary_loss_clip": 0.01427477, "auxiliary_loss_mlp": 0.01035813, "balance_loss_clip": 1.26266193, "balance_loss_mlp": 1.01623905, "epoch": 0.73723132421464, "flos": 23123414298240.0, "grad_norm": 2.703160835708504, "language_loss": 0.6791203, "learning_rate": 6.814974884917438e-07, "loss": 0.70375323, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19580078, "step": 12262, "time_per_iteration": 4.216994285583496 }, { "auxiliary_loss_clip": 0.01415233, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.2510761, "balance_loss_mlp": 1.01678944, "epoch": 0.737291447467308, "flos": 19280625603840.0, "grad_norm": 1.7414839880143522, "language_loss": 0.89146221, "learning_rate": 6.81204668404322e-07, "loss": 0.91597283, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19055176, "step": 12263, "time_per_iteration": 2.815094470977783 }, { "auxiliary_loss_clip": 0.01394559, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.239375, "balance_loss_mlp": 1.01474833, "epoch": 0.7373515707199759, "flos": 25128941639040.0, "grad_norm": 2.2730082145431134, "language_loss": 0.67599541, "learning_rate": 6.809118983257522e-07, "loss": 0.70027387, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.1854248, "step": 12264, "time_per_iteration": 2.8930203914642334 }, { "auxiliary_loss_clip": 0.01396977, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.23827887, "balance_loss_mlp": 1.01529014, "epoch": 0.737411693972644, "flos": 32419259303040.0, "grad_norm": 2.273165584209643, "language_loss": 0.80861163, "learning_rate": 6.806191782671356e-07, "loss": 0.83292711, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19287109, "step": 12265, "time_per_iteration": 2.974125862121582 }, { "auxiliary_loss_clip": 0.01427105, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.25790954, "balance_loss_mlp": 1.02071667, "epoch": 0.7374718172253119, "flos": 24326332548480.0, "grad_norm": 1.6793279427237544, "language_loss": 0.75292754, "learning_rate": 6.803265082395711e-07, "loss": 0.77760178, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19604492, "step": 12266, "time_per_iteration": 2.9120073318481445 }, { "auxiliary_loss_clip": 0.01402888, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.24123251, "balance_loss_mlp": 1.0190779, "epoch": 0.7375319404779799, "flos": 27165941377920.0, "grad_norm": 1.6811440159969364, "language_loss": 0.73718643, "learning_rate": 6.800338882541576e-07, "loss": 0.76159465, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18859863, "step": 12267, "time_per_iteration": 2.9189398288726807 }, { "auxiliary_loss_clip": 0.01408236, "auxiliary_loss_mlp": 0.0103282, "balance_loss_clip": 1.24686205, "balance_loss_mlp": 1.01510525, "epoch": 0.7375920637306478, "flos": 18889274914560.0, "grad_norm": 1.9449508796864363, "language_loss": 0.83921498, "learning_rate": 6.797413183219923e-07, "loss": 0.86362553, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.17712402, "step": 12268, "time_per_iteration": 2.8338892459869385 }, { "auxiliary_loss_clip": 0.01411975, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.2516439, "balance_loss_mlp": 1.02007663, "epoch": 0.7376521869833158, "flos": 15678340859520.0, "grad_norm": 1.8040314810680593, "language_loss": 0.73939347, "learning_rate": 6.794487984541677e-07, "loss": 0.76391268, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19848633, "step": 12269, "time_per_iteration": 2.837789535522461 }, { "auxiliary_loss_clip": 0.01429826, "auxiliary_loss_mlp": 0.01033796, "balance_loss_clip": 1.26405501, "balance_loss_mlp": 1.01456714, "epoch": 0.7377123102359837, "flos": 36984655002240.0, "grad_norm": 1.8224886734937134, "language_loss": 0.71189225, "learning_rate": 6.791563286617776e-07, "loss": 0.7365284, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19226074, "step": 12270, "time_per_iteration": 2.95424222946167 }, { "auxiliary_loss_clip": 0.01418725, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.25648284, "balance_loss_mlp": 1.01584625, "epoch": 0.7377724334886517, "flos": 24506362183680.0, "grad_norm": 1.7578735608230298, "language_loss": 0.70206964, "learning_rate": 6.788639089559119e-07, "loss": 0.72659594, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18066406, "step": 12271, "time_per_iteration": 2.9211912155151367 }, { "auxiliary_loss_clip": 0.01420443, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.25699341, "balance_loss_mlp": 1.01369238, "epoch": 0.7378325567413198, "flos": 24400905258240.0, "grad_norm": 1.966691295126329, "language_loss": 0.68347317, "learning_rate": 6.785715393476586e-07, "loss": 0.70800084, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18652344, "step": 12272, "time_per_iteration": 2.8777613639831543 }, { "auxiliary_loss_clip": 0.01404961, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.24741316, "balance_loss_mlp": 1.01452959, "epoch": 0.7378926799939877, "flos": 17424198437760.0, "grad_norm": 1.7362303338535952, "language_loss": 0.79002523, "learning_rate": 6.782792198481049e-07, "loss": 0.81440127, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18127441, "step": 12273, "time_per_iteration": 2.8311896324157715 }, { "auxiliary_loss_clip": 0.01411538, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.25106096, "balance_loss_mlp": 1.01159406, "epoch": 0.7379528032466557, "flos": 18482224279680.0, "grad_norm": 2.428615854670239, "language_loss": 0.83942449, "learning_rate": 6.779869504683355e-07, "loss": 0.86383915, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18334961, "step": 12274, "time_per_iteration": 2.8109068870544434 }, { "auxiliary_loss_clip": 0.01441764, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.27394307, "balance_loss_mlp": 1.0145638, "epoch": 0.7380129264993236, "flos": 17831022848640.0, "grad_norm": 1.9828585061869195, "language_loss": 0.74530923, "learning_rate": 6.776947312194341e-07, "loss": 0.77006924, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19702148, "step": 12275, "time_per_iteration": 2.82187557220459 }, { "auxiliary_loss_clip": 0.01431633, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.26488137, "balance_loss_mlp": 1.01563704, "epoch": 0.7380730497519916, "flos": 23006284214400.0, "grad_norm": 2.132395001244171, "language_loss": 0.743047, "learning_rate": 6.774025621124813e-07, "loss": 0.76771486, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19519043, "step": 12276, "time_per_iteration": 2.8650801181793213 }, { "auxiliary_loss_clip": 0.01419768, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.25607252, "balance_loss_mlp": 1.0120734, "epoch": 0.7381331730046595, "flos": 20275842384000.0, "grad_norm": 18.377763326026084, "language_loss": 0.78679311, "learning_rate": 6.771104431585551e-07, "loss": 0.81129408, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18249512, "step": 12277, "time_per_iteration": 2.847001314163208 }, { "auxiliary_loss_clip": 0.01408918, "auxiliary_loss_mlp": 0.01041135, "balance_loss_clip": 1.24966884, "balance_loss_mlp": 1.02108395, "epoch": 0.7381932962573275, "flos": 19763742026880.0, "grad_norm": 1.9725636820814512, "language_loss": 0.79761916, "learning_rate": 6.768183743687338e-07, "loss": 0.82211965, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.20031738, "step": 12278, "time_per_iteration": 2.841799736022949 }, { "auxiliary_loss_clip": 0.0142724, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.26233554, "balance_loss_mlp": 1.0121969, "epoch": 0.7382534195099955, "flos": 17312678709120.0, "grad_norm": 2.0142004637053885, "language_loss": 0.72689509, "learning_rate": 6.765263557540921e-07, "loss": 0.75147367, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1842041, "step": 12279, "time_per_iteration": 2.8217525482177734 }, { "auxiliary_loss_clip": 0.01409451, "auxiliary_loss_mlp": 0.01036835, "balance_loss_clip": 1.24528933, "balance_loss_mlp": 1.01811874, "epoch": 0.7383135427626635, "flos": 18706530591360.0, "grad_norm": 2.1790011736892634, "language_loss": 0.86340767, "learning_rate": 6.762343873257034e-07, "loss": 0.88787055, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18701172, "step": 12280, "time_per_iteration": 2.813356399536133 }, { "auxiliary_loss_clip": 0.01416476, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.25304699, "balance_loss_mlp": 1.01380467, "epoch": 0.7383736660153314, "flos": 20889689592960.0, "grad_norm": 1.9826232726380792, "language_loss": 0.72732401, "learning_rate": 6.759424690946408e-07, "loss": 0.75180703, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18029785, "step": 12281, "time_per_iteration": 2.8505444526672363 }, { "auxiliary_loss_clip": 0.01420477, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.25624442, "balance_loss_mlp": 1.01327443, "epoch": 0.7384337892679994, "flos": 20671762798080.0, "grad_norm": 1.8463024864431228, "language_loss": 0.61857188, "learning_rate": 6.756506010719711e-07, "loss": 0.64310426, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19458008, "step": 12282, "time_per_iteration": 2.861659049987793 }, { "auxiliary_loss_clip": 0.0144043, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.27408004, "balance_loss_mlp": 1.0161221, "epoch": 0.7384939125206673, "flos": 29181739288320.0, "grad_norm": 1.6179466868841768, "language_loss": 0.68873805, "learning_rate": 6.753587832687632e-07, "loss": 0.71349752, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19396973, "step": 12283, "time_per_iteration": 2.8977510929107666 }, { "auxiliary_loss_clip": 0.01403993, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.24346673, "balance_loss_mlp": 1.01330709, "epoch": 0.7385540357733353, "flos": 36324721324800.0, "grad_norm": 1.619596029707286, "language_loss": 0.76425427, "learning_rate": 6.750670156960832e-07, "loss": 0.78861117, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18395996, "step": 12284, "time_per_iteration": 2.9703495502471924 }, { "auxiliary_loss_clip": 0.01413967, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.25028634, "balance_loss_mlp": 1.01147437, "epoch": 0.7386141590260034, "flos": 20312155975680.0, "grad_norm": 1.9356438299836443, "language_loss": 0.700683, "learning_rate": 6.747752983649954e-07, "loss": 0.72513115, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19384766, "step": 12285, "time_per_iteration": 4.349000692367554 }, { "auxiliary_loss_clip": 0.01425344, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.25770271, "balance_loss_mlp": 1.01345086, "epoch": 0.7386742822786713, "flos": 25494746999040.0, "grad_norm": 1.8097841715885963, "language_loss": 0.80465549, "learning_rate": 6.744836312865602e-07, "loss": 0.82923508, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19165039, "step": 12286, "time_per_iteration": 2.8831522464752197 }, { "auxiliary_loss_clip": 0.0141212, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.25099111, "balance_loss_mlp": 1.01233876, "epoch": 0.7387344055313393, "flos": 13779899256960.0, "grad_norm": 2.016514368881021, "language_loss": 0.66385293, "learning_rate": 6.741920144718396e-07, "loss": 0.68828738, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18994141, "step": 12287, "time_per_iteration": 2.9350907802581787 }, { "auxiliary_loss_clip": 0.01412014, "auxiliary_loss_mlp": 0.01028403, "balance_loss_clip": 1.25275576, "balance_loss_mlp": 1.01083159, "epoch": 0.7387945287840072, "flos": 27866713127040.0, "grad_norm": 1.8706210737274642, "language_loss": 0.77518797, "learning_rate": 6.739004479318903e-07, "loss": 0.7995922, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17578125, "step": 12288, "time_per_iteration": 2.9579830169677734 }, { "auxiliary_loss_clip": 0.01436682, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.26872134, "balance_loss_mlp": 1.01246095, "epoch": 0.7388546520366752, "flos": 44247708034560.0, "grad_norm": 2.2231821292804836, "language_loss": 0.58918333, "learning_rate": 6.736089316777684e-07, "loss": 0.61387384, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19909668, "step": 12289, "time_per_iteration": 3.059521436691284 }, { "auxiliary_loss_clip": 0.011846, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.09366012, "balance_loss_mlp": 1.00706124, "epoch": 0.7389147752893431, "flos": 70710769140480.0, "grad_norm": 0.6527582765421391, "language_loss": 0.49295568, "learning_rate": 6.733174657205287e-07, "loss": 0.51506782, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.1953125, "step": 12290, "time_per_iteration": 3.4660584926605225 }, { "auxiliary_loss_clip": 0.01418417, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.25576758, "balance_loss_mlp": 1.01460397, "epoch": 0.7389748985420111, "flos": 26006168684160.0, "grad_norm": 1.7022549012832766, "language_loss": 0.68287933, "learning_rate": 6.730260500712237e-07, "loss": 0.70740736, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19787598, "step": 12291, "time_per_iteration": 2.9051895141601562 }, { "auxiliary_loss_clip": 0.01187271, "auxiliary_loss_mlp": 0.01028818, "balance_loss_clip": 1.09477425, "balance_loss_mlp": 1.00592947, "epoch": 0.7390350217946791, "flos": 54428127724800.0, "grad_norm": 0.9818288655487112, "language_loss": 0.60856783, "learning_rate": 6.727346847409052e-07, "loss": 0.63072872, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.22851562, "step": 12292, "time_per_iteration": 4.397026062011719 }, { "auxiliary_loss_clip": 0.01410937, "auxiliary_loss_mlp": 0.01034826, "balance_loss_clip": 1.24969029, "balance_loss_mlp": 1.01619315, "epoch": 0.7390951450473471, "flos": 32209612306560.0, "grad_norm": 4.572650551942778, "language_loss": 0.6798774, "learning_rate": 6.724433697406191e-07, "loss": 0.70433497, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1862793, "step": 12293, "time_per_iteration": 2.922081470489502 }, { "auxiliary_loss_clip": 0.0141659, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.25561142, "balance_loss_mlp": 1.01506734, "epoch": 0.739155268300015, "flos": 16691230373760.0, "grad_norm": 2.015856095135241, "language_loss": 0.84392464, "learning_rate": 6.721521050814134e-07, "loss": 0.86842668, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18566895, "step": 12294, "time_per_iteration": 2.831770658493042 }, { "auxiliary_loss_clip": 0.01396171, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.23779655, "balance_loss_mlp": 1.01616299, "epoch": 0.739215391552683, "flos": 31662284232960.0, "grad_norm": 1.521367400222795, "language_loss": 0.7352947, "learning_rate": 6.718608907743337e-07, "loss": 0.75959826, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18017578, "step": 12295, "time_per_iteration": 2.9653072357177734 }, { "auxiliary_loss_clip": 0.0139818, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.24053049, "balance_loss_mlp": 1.01381636, "epoch": 0.7392755148053509, "flos": 29731193867520.0, "grad_norm": 1.7043833177612684, "language_loss": 0.78650451, "learning_rate": 6.715697268304215e-07, "loss": 0.81080163, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17724609, "step": 12296, "time_per_iteration": 4.393824577331543 }, { "auxiliary_loss_clip": 0.01405057, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.24434376, "balance_loss_mlp": 1.01240373, "epoch": 0.7393356380580189, "flos": 37064611843200.0, "grad_norm": 2.19197187306483, "language_loss": 0.66925871, "learning_rate": 6.712786132607182e-07, "loss": 0.69363666, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20336914, "step": 12297, "time_per_iteration": 4.379637956619263 }, { "auxiliary_loss_clip": 0.01413138, "auxiliary_loss_mlp": 0.01040769, "balance_loss_clip": 1.25110602, "balance_loss_mlp": 1.02061105, "epoch": 0.739395761310687, "flos": 19729238227200.0, "grad_norm": 1.6462966092568079, "language_loss": 0.6920808, "learning_rate": 6.709875500762645e-07, "loss": 0.71661985, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20153809, "step": 12298, "time_per_iteration": 2.8607101440429688 }, { "auxiliary_loss_clip": 0.01400032, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.23823452, "balance_loss_mlp": 1.01289308, "epoch": 0.7394558845633549, "flos": 11808875715840.0, "grad_norm": 2.054188490422839, "language_loss": 0.75022185, "learning_rate": 6.706965372880946e-07, "loss": 0.77454448, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19311523, "step": 12299, "time_per_iteration": 2.8427700996398926 }, { "auxiliary_loss_clip": 0.01182417, "auxiliary_loss_mlp": 0.01020248, "balance_loss_clip": 1.09152699, "balance_loss_mlp": 1.00107944, "epoch": 0.7395160078160229, "flos": 66225239792640.0, "grad_norm": 0.7254375466448695, "language_loss": 0.60897899, "learning_rate": 6.704055749072455e-07, "loss": 0.63100564, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.19140625, "step": 12300, "time_per_iteration": 3.4057888984680176 }, { "auxiliary_loss_clip": 0.0142756, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.26423287, "balance_loss_mlp": 1.01791763, "epoch": 0.7395761310686908, "flos": 21259114536960.0, "grad_norm": 2.2543270365074157, "language_loss": 0.80989438, "learning_rate": 6.7011466294475e-07, "loss": 0.83453596, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18688965, "step": 12301, "time_per_iteration": 2.919325590133667 }, { "auxiliary_loss_clip": 0.01409491, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.24898767, "balance_loss_mlp": 1.01765943, "epoch": 0.7396362543213588, "flos": 25965647326080.0, "grad_norm": 1.8059820533148325, "language_loss": 0.73761201, "learning_rate": 6.698238014116406e-07, "loss": 0.76206374, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18029785, "step": 12302, "time_per_iteration": 2.996809959411621 }, { "auxiliary_loss_clip": 0.01431176, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.26675153, "balance_loss_mlp": 1.01429033, "epoch": 0.7396963775740267, "flos": 27388573632000.0, "grad_norm": 1.772804015936782, "language_loss": 0.74811864, "learning_rate": 6.695329903189451e-07, "loss": 0.77275908, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18579102, "step": 12303, "time_per_iteration": 2.997849941253662 }, { "auxiliary_loss_clip": 0.01398254, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.23931646, "balance_loss_mlp": 1.01760674, "epoch": 0.7397565008266948, "flos": 25531060590720.0, "grad_norm": 1.7584017220154173, "language_loss": 0.5465228, "learning_rate": 6.692422296776927e-07, "loss": 0.57086915, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18786621, "step": 12304, "time_per_iteration": 2.9331576824188232 }, { "auxiliary_loss_clip": 0.01413915, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.25161481, "balance_loss_mlp": 1.01515031, "epoch": 0.7398166240793627, "flos": 23737035283200.0, "grad_norm": 2.7564766129847404, "language_loss": 0.8501901, "learning_rate": 6.689515194989084e-07, "loss": 0.87466955, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1887207, "step": 12305, "time_per_iteration": 2.8845834732055664 }, { "auxiliary_loss_clip": 0.01183945, "auxiliary_loss_mlp": 0.0103553, "balance_loss_clip": 1.09274328, "balance_loss_mlp": 1.01426268, "epoch": 0.7398767473320307, "flos": 67300820616960.0, "grad_norm": 0.8730437643560194, "language_loss": 0.5767504, "learning_rate": 6.68660859793615e-07, "loss": 0.59894514, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.21289062, "step": 12306, "time_per_iteration": 3.3787713050842285 }, { "auxiliary_loss_clip": 0.01425999, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.26222086, "balance_loss_mlp": 1.01631987, "epoch": 0.7399368705846986, "flos": 22029029619840.0, "grad_norm": 4.111389201480146, "language_loss": 0.82069016, "learning_rate": 6.683702505728355e-07, "loss": 0.8453058, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19238281, "step": 12307, "time_per_iteration": 2.8456714153289795 }, { "auxiliary_loss_clip": 0.01389714, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.23355806, "balance_loss_mlp": 1.01437044, "epoch": 0.7399969938373666, "flos": 14182063453440.0, "grad_norm": 2.0290937653841263, "language_loss": 0.70641738, "learning_rate": 6.680796918475893e-07, "loss": 0.73064804, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18981934, "step": 12308, "time_per_iteration": 2.7859859466552734 }, { "auxiliary_loss_clip": 0.01397576, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.23913252, "balance_loss_mlp": 1.01511168, "epoch": 0.7400571170900345, "flos": 25312409879040.0, "grad_norm": 1.746643621004893, "language_loss": 0.82020968, "learning_rate": 6.67789183628896e-07, "loss": 0.84452385, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18725586, "step": 12309, "time_per_iteration": 2.887418746948242 }, { "auxiliary_loss_clip": 0.01413956, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.24812531, "balance_loss_mlp": 1.01558208, "epoch": 0.7401172403427025, "flos": 22721702549760.0, "grad_norm": 1.7347838967002225, "language_loss": 0.73469567, "learning_rate": 6.674987259277692e-07, "loss": 0.75918406, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19287109, "step": 12310, "time_per_iteration": 2.8542439937591553 }, { "auxiliary_loss_clip": 0.01410552, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.24765444, "balance_loss_mlp": 1.01692569, "epoch": 0.7401773635953706, "flos": 18073952035200.0, "grad_norm": 2.8220062772374583, "language_loss": 0.89167738, "learning_rate": 6.672083187552239e-07, "loss": 0.91615009, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19787598, "step": 12311, "time_per_iteration": 2.847985029220581 }, { "auxiliary_loss_clip": 0.01418992, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.25623202, "balance_loss_mlp": 1.01458895, "epoch": 0.7402374868480385, "flos": 22722697935360.0, "grad_norm": 5.703740698690624, "language_loss": 0.80499482, "learning_rate": 6.669179621222738e-07, "loss": 0.82951772, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18701172, "step": 12312, "time_per_iteration": 2.88779354095459 }, { "auxiliary_loss_clip": 0.01415479, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.25583017, "balance_loss_mlp": 1.01890278, "epoch": 0.7402976101007065, "flos": 22866911671680.0, "grad_norm": 3.5478286811357926, "language_loss": 0.79215813, "learning_rate": 6.666276560399273e-07, "loss": 0.8166821, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18029785, "step": 12313, "time_per_iteration": 2.8872907161712646 }, { "auxiliary_loss_clip": 0.01423825, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.25863004, "balance_loss_mlp": 1.01754713, "epoch": 0.7403577333533744, "flos": 12352538960640.0, "grad_norm": 4.705946349383261, "language_loss": 0.7955879, "learning_rate": 6.663374005191937e-07, "loss": 0.82019424, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19250488, "step": 12314, "time_per_iteration": 2.796251058578491 }, { "auxiliary_loss_clip": 0.01188084, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.09516883, "balance_loss_mlp": 1.01461434, "epoch": 0.7404178566060424, "flos": 60356626824960.0, "grad_norm": 0.8442621537771297, "language_loss": 0.55187768, "learning_rate": 6.660471955710809e-07, "loss": 0.57413357, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.22851562, "step": 12315, "time_per_iteration": 3.3425121307373047 }, { "auxiliary_loss_clip": 0.01401938, "auxiliary_loss_mlp": 0.0103449, "balance_loss_clip": 1.24462521, "balance_loss_mlp": 1.01594114, "epoch": 0.7404779798587103, "flos": 32027275186560.0, "grad_norm": 6.959521302000672, "language_loss": 0.80163866, "learning_rate": 6.65757041206591e-07, "loss": 0.82600296, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18554688, "step": 12316, "time_per_iteration": 2.994468927383423 }, { "auxiliary_loss_clip": 0.01408668, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.24676311, "balance_loss_mlp": 1.01656723, "epoch": 0.7405381031113784, "flos": 12895704512640.0, "grad_norm": 1.6659485955295927, "language_loss": 0.75577164, "learning_rate": 6.654669374367275e-07, "loss": 0.78021562, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19165039, "step": 12317, "time_per_iteration": 2.8533871173858643 }, { "auxiliary_loss_clip": 0.01393314, "auxiliary_loss_mlp": 0.01034524, "balance_loss_clip": 1.23784077, "balance_loss_mlp": 1.01584339, "epoch": 0.7405982263640463, "flos": 20238533406720.0, "grad_norm": 1.6414440496231775, "language_loss": 0.81714094, "learning_rate": 6.651768842724917e-07, "loss": 0.84141928, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18676758, "step": 12318, "time_per_iteration": 2.849893569946289 }, { "auxiliary_loss_clip": 0.01429775, "auxiliary_loss_mlp": 0.01034211, "balance_loss_clip": 1.263906, "balance_loss_mlp": 1.01532841, "epoch": 0.7406583496167143, "flos": 17576330014080.0, "grad_norm": 1.9090661659578239, "language_loss": 0.77303183, "learning_rate": 6.648868817248827e-07, "loss": 0.79767168, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1887207, "step": 12319, "time_per_iteration": 2.8331501483917236 }, { "auxiliary_loss_clip": 0.01417886, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.25721192, "balance_loss_mlp": 1.01985741, "epoch": 0.7407184728693822, "flos": 18304728353280.0, "grad_norm": 1.9450824980585146, "language_loss": 0.64778107, "learning_rate": 6.64596929804897e-07, "loss": 0.67233694, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.17858887, "step": 12320, "time_per_iteration": 4.27356481552124 }, { "auxiliary_loss_clip": 0.0143047, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.26300597, "balance_loss_mlp": 1.02090955, "epoch": 0.7407785961220502, "flos": 16699193458560.0, "grad_norm": 2.5668337041249893, "language_loss": 0.83266634, "learning_rate": 6.643070285235288e-07, "loss": 0.85737133, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.19116211, "step": 12321, "time_per_iteration": 2.8514344692230225 }, { "auxiliary_loss_clip": 0.01438891, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.27027154, "balance_loss_mlp": 1.02370441, "epoch": 0.7408387193747181, "flos": 22097810995200.0, "grad_norm": 2.863712750778219, "language_loss": 0.72625357, "learning_rate": 6.640171778917727e-07, "loss": 0.75108582, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.20617676, "step": 12322, "time_per_iteration": 2.853593349456787 }, { "auxiliary_loss_clip": 0.01430742, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 1.26744723, "balance_loss_mlp": 1.01755238, "epoch": 0.7408988426273861, "flos": 24245651790720.0, "grad_norm": 1.6678358556559554, "language_loss": 0.64957619, "learning_rate": 6.637273779206183e-07, "loss": 0.67425108, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19213867, "step": 12323, "time_per_iteration": 2.8764781951904297 }, { "auxiliary_loss_clip": 0.01423444, "auxiliary_loss_mlp": 0.01037501, "balance_loss_clip": 1.25933659, "balance_loss_mlp": 1.01817703, "epoch": 0.7409589658800542, "flos": 29034901353600.0, "grad_norm": 2.999977588561279, "language_loss": 0.76648688, "learning_rate": 6.634376286210559e-07, "loss": 0.79109633, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.1932373, "step": 12324, "time_per_iteration": 2.9832944869995117 }, { "auxiliary_loss_clip": 0.01404371, "auxiliary_loss_mlp": 0.01029692, "balance_loss_clip": 1.24275231, "balance_loss_mlp": 1.01165533, "epoch": 0.7410190891327221, "flos": 19359994262400.0, "grad_norm": 1.7831133196316835, "language_loss": 0.74866199, "learning_rate": 6.63147930004073e-07, "loss": 0.77300262, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18029785, "step": 12325, "time_per_iteration": 2.868947982788086 }, { "auxiliary_loss_clip": 0.01439465, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.27175105, "balance_loss_mlp": 1.0144279, "epoch": 0.7410792123853901, "flos": 22757970896640.0, "grad_norm": 2.308260222666618, "language_loss": 0.68497086, "learning_rate": 6.628582820806545e-07, "loss": 0.70970356, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19384766, "step": 12326, "time_per_iteration": 2.8803138732910156 }, { "auxiliary_loss_clip": 0.01411352, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.24878025, "balance_loss_mlp": 1.01510382, "epoch": 0.741139335638058, "flos": 25382865312000.0, "grad_norm": 1.6693700574285433, "language_loss": 0.89859152, "learning_rate": 6.625686848617835e-07, "loss": 0.92304897, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19287109, "step": 12327, "time_per_iteration": 4.4053733348846436 }, { "auxiliary_loss_clip": 0.01418345, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.25640631, "balance_loss_mlp": 1.01563716, "epoch": 0.741199458890726, "flos": 18594196456320.0, "grad_norm": 2.0697500109566396, "language_loss": 0.86342466, "learning_rate": 6.62279138358442e-07, "loss": 0.88795429, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18969727, "step": 12328, "time_per_iteration": 2.794187307357788 }, { "auxiliary_loss_clip": 0.01410329, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.25216913, "balance_loss_mlp": 1.01212287, "epoch": 0.7412595821433939, "flos": 22137155988480.0, "grad_norm": 1.8631287896488757, "language_loss": 0.67365092, "learning_rate": 6.619896425816103e-07, "loss": 0.69806349, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18811035, "step": 12329, "time_per_iteration": 2.898951768875122 }, { "auxiliary_loss_clip": 0.01440159, "auxiliary_loss_mlp": 0.01037498, "balance_loss_clip": 1.27233911, "balance_loss_mlp": 1.01902092, "epoch": 0.741319705396062, "flos": 29181829777920.0, "grad_norm": 1.848621068870848, "language_loss": 0.67618108, "learning_rate": 6.617001975422647e-07, "loss": 0.70095766, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.18469238, "step": 12330, "time_per_iteration": 2.920916795730591 }, { "auxiliary_loss_clip": 0.01439149, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.27095985, "balance_loss_mlp": 1.01275659, "epoch": 0.7413798286487299, "flos": 20677146929280.0, "grad_norm": 1.9197923745612766, "language_loss": 0.86196762, "learning_rate": 6.614108032513823e-07, "loss": 0.88668716, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20043945, "step": 12331, "time_per_iteration": 2.84601092338562 }, { "auxiliary_loss_clip": 0.01422403, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.25925469, "balance_loss_mlp": 1.01436591, "epoch": 0.7414399519013979, "flos": 16407508360320.0, "grad_norm": 1.8523985164435879, "language_loss": 0.70802605, "learning_rate": 6.611214597199364e-07, "loss": 0.73258317, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18933105, "step": 12332, "time_per_iteration": 4.2664573192596436 }, { "auxiliary_loss_clip": 0.01417633, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.25454223, "balance_loss_mlp": 1.01492012, "epoch": 0.7415000751540658, "flos": 25641358709760.0, "grad_norm": 1.9480122837622453, "language_loss": 0.64148915, "learning_rate": 6.608321669588984e-07, "loss": 0.66600752, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19274902, "step": 12333, "time_per_iteration": 2.8744170665740967 }, { "auxiliary_loss_clip": 0.01400912, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.24421024, "balance_loss_mlp": 1.01679993, "epoch": 0.7415601984067338, "flos": 24510705684480.0, "grad_norm": 1.8845113706596734, "language_loss": 0.72070193, "learning_rate": 6.605429249792387e-07, "loss": 0.74506032, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18139648, "step": 12334, "time_per_iteration": 2.9569578170776367 }, { "auxiliary_loss_clip": 0.01414539, "auxiliary_loss_mlp": 0.01032151, "balance_loss_clip": 1.25235093, "balance_loss_mlp": 1.01295829, "epoch": 0.7416203216594017, "flos": 20897064495360.0, "grad_norm": 1.698431540795265, "language_loss": 0.83077323, "learning_rate": 6.602537337919257e-07, "loss": 0.85524017, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19189453, "step": 12335, "time_per_iteration": 2.856623411178589 }, { "auxiliary_loss_clip": 0.01412101, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.24959731, "balance_loss_mlp": 1.0162555, "epoch": 0.7416804449120697, "flos": 15630489843840.0, "grad_norm": 2.325837658113184, "language_loss": 0.7553941, "learning_rate": 6.599645934079259e-07, "loss": 0.77987123, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19360352, "step": 12336, "time_per_iteration": 2.8298897743225098 }, { "auxiliary_loss_clip": 0.01429851, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.26488042, "balance_loss_mlp": 1.01405609, "epoch": 0.7417405681647377, "flos": 17127174453120.0, "grad_norm": 2.4183565263123565, "language_loss": 0.74393857, "learning_rate": 6.596755038382029e-07, "loss": 0.76857042, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19299316, "step": 12337, "time_per_iteration": 2.914642333984375 }, { "auxiliary_loss_clip": 0.01421162, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.26070869, "balance_loss_mlp": 1.01688731, "epoch": 0.7418006914174057, "flos": 18889682117760.0, "grad_norm": 1.8389820607296536, "language_loss": 0.77357626, "learning_rate": 6.593864650937186e-07, "loss": 0.79815364, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19702148, "step": 12338, "time_per_iteration": 2.843045949935913 }, { "auxiliary_loss_clip": 0.0140247, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.24413705, "balance_loss_mlp": 1.01504827, "epoch": 0.7418608146700737, "flos": 21590913790080.0, "grad_norm": 1.7601472700520027, "language_loss": 0.73132598, "learning_rate": 6.590974771854345e-07, "loss": 0.75569248, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19140625, "step": 12339, "time_per_iteration": 2.8659756183624268 }, { "auxiliary_loss_clip": 0.01410714, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.25006342, "balance_loss_mlp": 1.01169133, "epoch": 0.7419209379227416, "flos": 22349155714560.0, "grad_norm": 2.2148277440735042, "language_loss": 0.8066709, "learning_rate": 6.588085401243077e-07, "loss": 0.83108521, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19030762, "step": 12340, "time_per_iteration": 2.8562278747558594 }, { "auxiliary_loss_clip": 0.01417169, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.25523257, "balance_loss_mlp": 1.01724362, "epoch": 0.7419810611754096, "flos": 16770599032320.0, "grad_norm": 1.6191961816927123, "language_loss": 0.76110655, "learning_rate": 6.585196539212958e-07, "loss": 0.78563946, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18884277, "step": 12341, "time_per_iteration": 2.9123871326446533 }, { "auxiliary_loss_clip": 0.01388035, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.2353611, "balance_loss_mlp": 1.01415956, "epoch": 0.7420411844280775, "flos": 26223959744640.0, "grad_norm": 1.3842235463918187, "language_loss": 0.80918121, "learning_rate": 6.582308185873535e-07, "loss": 0.83339161, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 1.52929688, "router_z_loss_mlp": 0.18859863, "step": 12342, "time_per_iteration": 2.8887994289398193 }, { "auxiliary_loss_clip": 0.01420557, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.25908589, "balance_loss_mlp": 1.01440084, "epoch": 0.7421013076807456, "flos": 68549517901440.0, "grad_norm": 1.6692967429468768, "language_loss": 0.77735126, "learning_rate": 6.57942034133433e-07, "loss": 0.80188966, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18896484, "step": 12343, "time_per_iteration": 3.2489981651306152 }, { "auxiliary_loss_clip": 0.01423122, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.26092124, "balance_loss_mlp": 1.01871252, "epoch": 0.7421614309334135, "flos": 24435906750720.0, "grad_norm": 1.6838364016589684, "language_loss": 0.68467498, "learning_rate": 6.576533005704843e-07, "loss": 0.70928347, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19018555, "step": 12344, "time_per_iteration": 2.90736722946167 }, { "auxiliary_loss_clip": 0.01417635, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.2537508, "balance_loss_mlp": 1.01531553, "epoch": 0.7422215541860815, "flos": 12318261384960.0, "grad_norm": 2.1467725525525005, "language_loss": 0.822716, "learning_rate": 6.573646179094572e-07, "loss": 0.84724861, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20324707, "step": 12345, "time_per_iteration": 2.835594654083252 }, { "auxiliary_loss_clip": 0.01415312, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.25179958, "balance_loss_mlp": 1.01579094, "epoch": 0.7422816774387494, "flos": 19654891741440.0, "grad_norm": 1.872232406259468, "language_loss": 0.7158764, "learning_rate": 6.570759861612988e-07, "loss": 0.74037969, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19226074, "step": 12346, "time_per_iteration": 2.869725465774536 }, { "auxiliary_loss_clip": 0.01414257, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.25189555, "balance_loss_mlp": 1.01854718, "epoch": 0.7423418006914174, "flos": 32028949244160.0, "grad_norm": 1.650952445424816, "language_loss": 0.74303687, "learning_rate": 6.56787405336953e-07, "loss": 0.76754916, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1842041, "step": 12347, "time_per_iteration": 2.919619560241699 }, { "auxiliary_loss_clip": 0.01436178, "auxiliary_loss_mlp": 0.01034419, "balance_loss_clip": 1.26925254, "balance_loss_mlp": 1.01569128, "epoch": 0.7424019239440853, "flos": 18926357667840.0, "grad_norm": 2.509920738737155, "language_loss": 0.81810284, "learning_rate": 6.564988754473642e-07, "loss": 0.84280884, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18737793, "step": 12348, "time_per_iteration": 2.863982915878296 }, { "auxiliary_loss_clip": 0.01412118, "auxiliary_loss_mlp": 0.01037672, "balance_loss_clip": 1.25234926, "balance_loss_mlp": 1.01745379, "epoch": 0.7424620471967533, "flos": 35888686755840.0, "grad_norm": 2.180804768214526, "language_loss": 0.72664416, "learning_rate": 6.562103965034724e-07, "loss": 0.75114214, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.20227051, "step": 12349, "time_per_iteration": 2.9970381259918213 }, { "auxiliary_loss_clip": 0.0143521, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.2669363, "balance_loss_mlp": 1.01823783, "epoch": 0.7425221704494213, "flos": 27028061913600.0, "grad_norm": 3.727844880351325, "language_loss": 0.80231309, "learning_rate": 6.559219685162165e-07, "loss": 0.82704198, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19445801, "step": 12350, "time_per_iteration": 2.8791427612304688 }, { "auxiliary_loss_clip": 0.01414114, "auxiliary_loss_mlp": 0.01040334, "balance_loss_clip": 1.25197041, "balance_loss_mlp": 1.02130866, "epoch": 0.7425822937020893, "flos": 34180273889280.0, "grad_norm": 2.2393207188878925, "language_loss": 0.76136231, "learning_rate": 6.556335914965343e-07, "loss": 0.78590685, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19042969, "step": 12351, "time_per_iteration": 2.972874641418457 }, { "auxiliary_loss_clip": 0.0141083, "auxiliary_loss_mlp": 0.01032387, "balance_loss_clip": 1.24891758, "balance_loss_mlp": 1.01402879, "epoch": 0.7426424169547573, "flos": 21292306237440.0, "grad_norm": 2.0974388389177725, "language_loss": 0.82288557, "learning_rate": 6.553452654553611e-07, "loss": 0.84731776, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18347168, "step": 12352, "time_per_iteration": 2.878365993499756 }, { "auxiliary_loss_clip": 0.01424678, "auxiliary_loss_mlp": 0.01042932, "balance_loss_clip": 1.26173162, "balance_loss_mlp": 1.02362061, "epoch": 0.7427025402074252, "flos": 22456784390400.0, "grad_norm": 1.8453228719656483, "language_loss": 0.72051215, "learning_rate": 6.550569904036307e-07, "loss": 0.74518824, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1932373, "step": 12353, "time_per_iteration": 2.8827974796295166 }, { "auxiliary_loss_clip": 0.01430082, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.26807785, "balance_loss_mlp": 1.02425396, "epoch": 0.7427626634600932, "flos": 22533800319360.0, "grad_norm": 1.7755614579392558, "language_loss": 0.72725743, "learning_rate": 6.547687663522739e-07, "loss": 0.7519871, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18652344, "step": 12354, "time_per_iteration": 2.8784947395324707 }, { "auxiliary_loss_clip": 0.01192199, "auxiliary_loss_mlp": 0.01046777, "balance_loss_clip": 1.09828961, "balance_loss_mlp": 1.02179086, "epoch": 0.7428227867127611, "flos": 67237088676480.0, "grad_norm": 0.7022558752732885, "language_loss": 0.59568858, "learning_rate": 6.544805933122199e-07, "loss": 0.61807835, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 0.25, "step": 12355, "time_per_iteration": 4.897478818893433 }, { "auxiliary_loss_clip": 0.01412087, "auxiliary_loss_mlp": 0.01034138, "balance_loss_clip": 1.24912906, "balance_loss_mlp": 1.01598203, "epoch": 0.7428829099654292, "flos": 14729572506240.0, "grad_norm": 1.7611123231224195, "language_loss": 0.68806618, "learning_rate": 6.541924712943971e-07, "loss": 0.71252847, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18151855, "step": 12356, "time_per_iteration": 2.9124960899353027 }, { "auxiliary_loss_clip": 0.01427537, "auxiliary_loss_mlp": 0.01039373, "balance_loss_clip": 1.262465, "balance_loss_mlp": 1.02084804, "epoch": 0.7429430332180971, "flos": 48661967543040.0, "grad_norm": 1.6117739783379834, "language_loss": 0.7217496, "learning_rate": 6.539044003097301e-07, "loss": 0.74641871, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18530273, "step": 12357, "time_per_iteration": 3.084669589996338 }, { "auxiliary_loss_clip": 0.01407128, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.24932694, "balance_loss_mlp": 1.01476574, "epoch": 0.7430031564707651, "flos": 16772861272320.0, "grad_norm": 1.9563614403971479, "language_loss": 0.66162258, "learning_rate": 6.53616380369143e-07, "loss": 0.6860382, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19665527, "step": 12358, "time_per_iteration": 2.9129788875579834 }, { "auxiliary_loss_clip": 0.01421337, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.25559676, "balance_loss_mlp": 1.01598263, "epoch": 0.743063279723433, "flos": 23879077269120.0, "grad_norm": 1.6697154887376184, "language_loss": 0.81314027, "learning_rate": 6.533284114835591e-07, "loss": 0.83770525, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19189453, "step": 12359, "time_per_iteration": 2.890619993209839 }, { "auxiliary_loss_clip": 0.01413339, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.25071132, "balance_loss_mlp": 1.01566327, "epoch": 0.743123402976101, "flos": 14399673534720.0, "grad_norm": 2.0201834929421167, "language_loss": 0.69287097, "learning_rate": 6.530404936638956e-07, "loss": 0.71735269, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19165039, "step": 12360, "time_per_iteration": 2.825568199157715 }, { "auxiliary_loss_clip": 0.01399944, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.23969972, "balance_loss_mlp": 1.01615334, "epoch": 0.7431835262287689, "flos": 27465861029760.0, "grad_norm": 1.5775027276536586, "language_loss": 0.73396301, "learning_rate": 6.527526269210715e-07, "loss": 0.75832325, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19934082, "step": 12361, "time_per_iteration": 2.921623945236206 }, { "auxiliary_loss_clip": 0.01428053, "auxiliary_loss_mlp": 0.01039474, "balance_loss_clip": 1.26282525, "balance_loss_mlp": 1.02023363, "epoch": 0.743243649481437, "flos": 20969284475520.0, "grad_norm": 2.472894507033422, "language_loss": 0.56504977, "learning_rate": 6.524648112660027e-07, "loss": 0.58972502, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19250488, "step": 12362, "time_per_iteration": 4.260235071182251 }, { "auxiliary_loss_clip": 0.01408017, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.24571538, "balance_loss_mlp": 1.01843631, "epoch": 0.7433037727341049, "flos": 22793243857920.0, "grad_norm": 2.426340331258866, "language_loss": 0.78387481, "learning_rate": 6.521770467096039e-07, "loss": 0.80833066, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19128418, "step": 12363, "time_per_iteration": 2.840024948120117 }, { "auxiliary_loss_clip": 0.01415923, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.25406098, "balance_loss_mlp": 1.01784241, "epoch": 0.7433638959867729, "flos": 22205937363840.0, "grad_norm": 3.6329263873769566, "language_loss": 0.7876094, "learning_rate": 6.518893332627862e-07, "loss": 0.81214482, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19775391, "step": 12364, "time_per_iteration": 2.8636791706085205 }, { "auxiliary_loss_clip": 0.01414459, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.25142562, "balance_loss_mlp": 1.01468241, "epoch": 0.7434240192394409, "flos": 23307696944640.0, "grad_norm": 1.6610116981770697, "language_loss": 0.78937995, "learning_rate": 6.516016709364604e-07, "loss": 0.81385189, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18054199, "step": 12365, "time_per_iteration": 2.9113433361053467 }, { "auxiliary_loss_clip": 0.01424816, "auxiliary_loss_mlp": 0.01035583, "balance_loss_clip": 1.25938153, "balance_loss_mlp": 1.01692653, "epoch": 0.7434841424921088, "flos": 54026352748800.0, "grad_norm": 1.6727700641878358, "language_loss": 0.7762441, "learning_rate": 6.513140597415346e-07, "loss": 0.80084807, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.18652344, "step": 12366, "time_per_iteration": 3.1428966522216797 }, { "auxiliary_loss_clip": 0.01407859, "auxiliary_loss_mlp": 0.01036999, "balance_loss_clip": 1.24995518, "balance_loss_mlp": 1.0184381, "epoch": 0.7435442657447768, "flos": 21443894876160.0, "grad_norm": 1.3571170789658358, "language_loss": 0.71712971, "learning_rate": 6.510264996889141e-07, "loss": 0.74157834, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18566895, "step": 12367, "time_per_iteration": 4.29633903503418 }, { "auxiliary_loss_clip": 0.01428456, "auxiliary_loss_mlp": 0.01036011, "balance_loss_clip": 1.26103115, "balance_loss_mlp": 1.01669884, "epoch": 0.7436043889974447, "flos": 24510298481280.0, "grad_norm": 2.685428334170787, "language_loss": 0.74950349, "learning_rate": 6.507389907895038e-07, "loss": 0.77414811, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19311523, "step": 12368, "time_per_iteration": 2.8901779651641846 }, { "auxiliary_loss_clip": 0.01416159, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.25665188, "balance_loss_mlp": 1.01574659, "epoch": 0.7436645122501128, "flos": 40713299769600.0, "grad_norm": 1.6780653907453773, "language_loss": 0.70147812, "learning_rate": 6.50451533054207e-07, "loss": 0.72599, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19287109, "step": 12369, "time_per_iteration": 3.003420352935791 }, { "auxiliary_loss_clip": 0.01423082, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.25953209, "balance_loss_mlp": 1.01654387, "epoch": 0.7437246355027807, "flos": 18916132343040.0, "grad_norm": 1.7543962371186423, "language_loss": 0.7617029, "learning_rate": 6.501641264939233e-07, "loss": 0.786277, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.17785645, "step": 12370, "time_per_iteration": 2.8291265964508057 }, { "auxiliary_loss_clip": 0.01405053, "auxiliary_loss_mlp": 0.01042296, "balance_loss_clip": 1.24549818, "balance_loss_mlp": 1.02297199, "epoch": 0.7437847587554487, "flos": 21553876281600.0, "grad_norm": 1.4398465071214595, "language_loss": 0.79356098, "learning_rate": 6.498767711195503e-07, "loss": 0.81803453, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1932373, "step": 12371, "time_per_iteration": 2.87307071685791 }, { "auxiliary_loss_clip": 0.014063, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 1.24434066, "balance_loss_mlp": 1.01313329, "epoch": 0.7438448820081166, "flos": 27794040698880.0, "grad_norm": 1.6463991832597595, "language_loss": 0.7018463, "learning_rate": 6.495894669419857e-07, "loss": 0.72623378, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.1932373, "step": 12372, "time_per_iteration": 2.986769437789917 }, { "auxiliary_loss_clip": 0.01415131, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 1.25368142, "balance_loss_mlp": 1.01504683, "epoch": 0.7439050052607846, "flos": 17976865397760.0, "grad_norm": 16.3557695350612, "language_loss": 0.7599203, "learning_rate": 6.493022139721245e-07, "loss": 0.78441775, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19567871, "step": 12373, "time_per_iteration": 2.858896255493164 }, { "auxiliary_loss_clip": 0.01418629, "auxiliary_loss_mlp": 0.01035444, "balance_loss_clip": 1.25416684, "balance_loss_mlp": 1.01561952, "epoch": 0.7439651285134525, "flos": 22967029710720.0, "grad_norm": 2.7296032138113984, "language_loss": 0.77884924, "learning_rate": 6.49015012220858e-07, "loss": 0.80339003, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19824219, "step": 12374, "time_per_iteration": 2.848813056945801 }, { "auxiliary_loss_clip": 0.0141966, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.25618505, "balance_loss_mlp": 1.01116991, "epoch": 0.7440252517661206, "flos": 18815969059200.0, "grad_norm": 2.027007177767614, "language_loss": 0.77115446, "learning_rate": 6.487278616990774e-07, "loss": 0.79565042, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.1875, "step": 12375, "time_per_iteration": 2.864499568939209 }, { "auxiliary_loss_clip": 0.01406757, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.24755228, "balance_loss_mlp": 1.01187599, "epoch": 0.7440853750187885, "flos": 20275887628800.0, "grad_norm": 1.9214664695588923, "language_loss": 0.77738023, "learning_rate": 6.484407624176733e-07, "loss": 0.80174202, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17541504, "step": 12376, "time_per_iteration": 2.852778434753418 }, { "auxiliary_loss_clip": 0.01414107, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.25209737, "balance_loss_mlp": 1.01187539, "epoch": 0.7441454982714565, "flos": 25348044798720.0, "grad_norm": 1.702528103193998, "language_loss": 0.80324715, "learning_rate": 6.481537143875296e-07, "loss": 0.82769299, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18603516, "step": 12377, "time_per_iteration": 2.88028883934021 }, { "auxiliary_loss_clip": 0.01409442, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.24575043, "balance_loss_mlp": 1.01202238, "epoch": 0.7442056215241245, "flos": 64500385080960.0, "grad_norm": 1.9033425628688072, "language_loss": 0.67023903, "learning_rate": 6.478667176195322e-07, "loss": 0.69464338, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18969727, "step": 12378, "time_per_iteration": 3.258007287979126 }, { "auxiliary_loss_clip": 0.01413974, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.25086832, "balance_loss_mlp": 1.01255453, "epoch": 0.7442657447767924, "flos": 31297926706560.0, "grad_norm": 1.706474175499525, "language_loss": 0.72443175, "learning_rate": 6.475797721245648e-07, "loss": 0.74889135, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19433594, "step": 12379, "time_per_iteration": 3.0003767013549805 }, { "auxiliary_loss_clip": 0.01411358, "auxiliary_loss_mlp": 0.01041926, "balance_loss_clip": 1.24922895, "balance_loss_mlp": 1.02229261, "epoch": 0.7443258680294604, "flos": 20816157513600.0, "grad_norm": 2.105034497859544, "language_loss": 0.65577471, "learning_rate": 6.472928779135085e-07, "loss": 0.68030757, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19641113, "step": 12380, "time_per_iteration": 2.84525465965271 }, { "auxiliary_loss_clip": 0.01425183, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.26089191, "balance_loss_mlp": 1.01407337, "epoch": 0.7443859912821283, "flos": 22209873661440.0, "grad_norm": 2.119726839706107, "language_loss": 0.79809523, "learning_rate": 6.470060349972411e-07, "loss": 0.82267672, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18884277, "step": 12381, "time_per_iteration": 2.849461317062378 }, { "auxiliary_loss_clip": 0.01419912, "auxiliary_loss_mlp": 0.01035147, "balance_loss_clip": 1.25469899, "balance_loss_mlp": 1.01584673, "epoch": 0.7444461145347964, "flos": 22027943744640.0, "grad_norm": 1.8886249265056847, "language_loss": 0.73398781, "learning_rate": 6.467192433866411e-07, "loss": 0.75853837, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1932373, "step": 12382, "time_per_iteration": 2.877836227416992 }, { "auxiliary_loss_clip": 0.01187381, "auxiliary_loss_mlp": 0.01023044, "balance_loss_clip": 1.09712005, "balance_loss_mlp": 1.00034618, "epoch": 0.7445062377874643, "flos": 70595150117760.0, "grad_norm": 0.6525247743979238, "language_loss": 0.5463044, "learning_rate": 6.464325030925831e-07, "loss": 0.56840861, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.2265625, "step": 12383, "time_per_iteration": 3.506392240524292 }, { "auxiliary_loss_clip": 0.01405987, "auxiliary_loss_mlp": 0.01031124, "balance_loss_clip": 1.24368739, "balance_loss_mlp": 1.01178885, "epoch": 0.7445663610401323, "flos": 22174736434560.0, "grad_norm": 1.9919169845610232, "language_loss": 0.77191985, "learning_rate": 6.461458141259395e-07, "loss": 0.79629099, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19311523, "step": 12384, "time_per_iteration": 2.891589403152466 }, { "auxiliary_loss_clip": 0.01409938, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.24840021, "balance_loss_mlp": 1.01280951, "epoch": 0.7446264842928002, "flos": 24180851957760.0, "grad_norm": 2.561943740346893, "language_loss": 0.7986424, "learning_rate": 6.458591764975823e-07, "loss": 0.82306015, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19030762, "step": 12385, "time_per_iteration": 2.8676953315734863 }, { "auxiliary_loss_clip": 0.01434053, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.26795053, "balance_loss_mlp": 1.01677048, "epoch": 0.7446866075454682, "flos": 24145352772480.0, "grad_norm": 1.5433496581272748, "language_loss": 0.82270491, "learning_rate": 6.455725902183813e-07, "loss": 0.84740919, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19604492, "step": 12386, "time_per_iteration": 2.8980581760406494 }, { "auxiliary_loss_clip": 0.01402242, "auxiliary_loss_mlp": 0.01035311, "balance_loss_clip": 1.2436223, "balance_loss_mlp": 1.01651204, "epoch": 0.7447467307981361, "flos": 23558001033600.0, "grad_norm": 1.640762722585074, "language_loss": 0.71532476, "learning_rate": 6.452860552992037e-07, "loss": 0.73970032, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18798828, "step": 12387, "time_per_iteration": 2.8678295612335205 }, { "auxiliary_loss_clip": 0.01409945, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.24831963, "balance_loss_mlp": 1.01507282, "epoch": 0.7448068540508042, "flos": 19575885041280.0, "grad_norm": 1.9846426790703335, "language_loss": 0.71165502, "learning_rate": 6.449995717509138e-07, "loss": 0.73609519, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18994141, "step": 12388, "time_per_iteration": 2.825624942779541 }, { "auxiliary_loss_clip": 0.01422212, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.25961232, "balance_loss_mlp": 1.01629019, "epoch": 0.7448669773034721, "flos": 21850900266240.0, "grad_norm": 1.5568825969977613, "language_loss": 0.85709751, "learning_rate": 6.447131395843761e-07, "loss": 0.88166559, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1829834, "step": 12389, "time_per_iteration": 2.900312662124634 }, { "auxiliary_loss_clip": 0.01422447, "auxiliary_loss_mlp": 0.01036242, "balance_loss_clip": 1.25814331, "balance_loss_mlp": 1.01801538, "epoch": 0.7449271005561401, "flos": 25166250616320.0, "grad_norm": 1.848644425089653, "language_loss": 0.79472899, "learning_rate": 6.444267588104526e-07, "loss": 0.81931585, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18200684, "step": 12390, "time_per_iteration": 4.25848126411438 }, { "auxiliary_loss_clip": 0.01423208, "auxiliary_loss_mlp": 0.01031613, "balance_loss_clip": 1.25896335, "balance_loss_mlp": 1.01188397, "epoch": 0.7449872238088081, "flos": 22283541475200.0, "grad_norm": 1.8842490975296704, "language_loss": 0.85842049, "learning_rate": 6.441404294400014e-07, "loss": 0.88296866, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19702148, "step": 12391, "time_per_iteration": 2.843517303466797 }, { "auxiliary_loss_clip": 0.01413806, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.25145555, "balance_loss_mlp": 1.01540041, "epoch": 0.745047347061476, "flos": 20604519745920.0, "grad_norm": 1.739986805176527, "language_loss": 0.74665713, "learning_rate": 6.438541514838811e-07, "loss": 0.77113104, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18188477, "step": 12392, "time_per_iteration": 2.8697316646575928 }, { "auxiliary_loss_clip": 0.01393598, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.23508334, "balance_loss_mlp": 1.01719952, "epoch": 0.745107470314144, "flos": 22137744170880.0, "grad_norm": 1.7308083067822082, "language_loss": 0.77594614, "learning_rate": 6.435679249529487e-07, "loss": 0.80024904, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19506836, "step": 12393, "time_per_iteration": 2.8552327156066895 }, { "auxiliary_loss_clip": 0.01408701, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.24812794, "balance_loss_mlp": 1.01476526, "epoch": 0.745167593566812, "flos": 22246594456320.0, "grad_norm": 1.7856018795121198, "language_loss": 0.7294488, "learning_rate": 6.432817498580552e-07, "loss": 0.75388265, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19921875, "step": 12394, "time_per_iteration": 2.878673791885376 }, { "auxiliary_loss_clip": 0.01415026, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.25303447, "balance_loss_mlp": 1.01653039, "epoch": 0.74522771681948, "flos": 20675563361280.0, "grad_norm": 1.7321572282548086, "language_loss": 0.82443249, "learning_rate": 6.429956262100535e-07, "loss": 0.84895062, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.20251465, "step": 12395, "time_per_iteration": 2.862405776977539 }, { "auxiliary_loss_clip": 0.01424372, "auxiliary_loss_mlp": 0.0104087, "balance_loss_clip": 1.25900412, "balance_loss_mlp": 1.02134299, "epoch": 0.7452878400721479, "flos": 21117208285440.0, "grad_norm": 2.0151656426585096, "language_loss": 0.71648419, "learning_rate": 6.427095540197937e-07, "loss": 0.74113655, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19519043, "step": 12396, "time_per_iteration": 2.862217426300049 }, { "auxiliary_loss_clip": 0.0142175, "auxiliary_loss_mlp": 0.01036111, "balance_loss_clip": 1.25728858, "balance_loss_mlp": 1.01811051, "epoch": 0.7453479633248159, "flos": 26699384551680.0, "grad_norm": 21.648990472832654, "language_loss": 0.6931594, "learning_rate": 6.424235332981245e-07, "loss": 0.71773803, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18005371, "step": 12397, "time_per_iteration": 4.316742420196533 }, { "auxiliary_loss_clip": 0.01400273, "auxiliary_loss_mlp": 0.01038909, "balance_loss_clip": 1.2407918, "balance_loss_mlp": 1.02043152, "epoch": 0.7454080865774838, "flos": 17024025012480.0, "grad_norm": 1.8599539699797933, "language_loss": 0.77836478, "learning_rate": 6.421375640558908e-07, "loss": 0.80275655, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18469238, "step": 12398, "time_per_iteration": 2.8079679012298584 }, { "auxiliary_loss_clip": 0.01401558, "auxiliary_loss_mlp": 0.01031673, "balance_loss_clip": 1.24294901, "balance_loss_mlp": 1.0133028, "epoch": 0.7454682098301518, "flos": 21333642001920.0, "grad_norm": 2.137313638042295, "language_loss": 0.7874065, "learning_rate": 6.418516463039363e-07, "loss": 0.81173879, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18359375, "step": 12399, "time_per_iteration": 2.827789306640625 }, { "auxiliary_loss_clip": 0.01400515, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.24381447, "balance_loss_mlp": 1.01849365, "epoch": 0.7455283330828197, "flos": 17867336440320.0, "grad_norm": 1.8894970603406782, "language_loss": 0.75089407, "learning_rate": 6.415657800531038e-07, "loss": 0.77526915, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18505859, "step": 12400, "time_per_iteration": 2.833164930343628 }, { "auxiliary_loss_clip": 0.01413932, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.25294542, "balance_loss_mlp": 1.01567924, "epoch": 0.7455884563354878, "flos": 30786595511040.0, "grad_norm": 2.1793405617033264, "language_loss": 0.83034569, "learning_rate": 6.412799653142327e-07, "loss": 0.85482335, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1817627, "step": 12401, "time_per_iteration": 2.9080002307891846 }, { "auxiliary_loss_clip": 0.01417335, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.25496268, "balance_loss_mlp": 1.01888359, "epoch": 0.7456485795881557, "flos": 23195996236800.0, "grad_norm": 2.1770639125542415, "language_loss": 0.65878344, "learning_rate": 6.409942020981611e-07, "loss": 0.68333244, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18688965, "step": 12402, "time_per_iteration": 5.671504974365234 }, { "auxiliary_loss_clip": 0.01397322, "auxiliary_loss_mlp": 0.01036035, "balance_loss_clip": 1.23875809, "balance_loss_mlp": 1.01832032, "epoch": 0.7457087028408237, "flos": 38741280842880.0, "grad_norm": 1.828119628898687, "language_loss": 0.73615944, "learning_rate": 6.407084904157265e-07, "loss": 0.76049304, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17712402, "step": 12403, "time_per_iteration": 2.9931159019470215 }, { "auxiliary_loss_clip": 0.0118575, "auxiliary_loss_mlp": 0.0101928, "balance_loss_clip": 1.09561729, "balance_loss_mlp": 0.99848998, "epoch": 0.7457688260934917, "flos": 56068166419200.0, "grad_norm": 0.8300283445998068, "language_loss": 0.58841026, "learning_rate": 6.404228302777621e-07, "loss": 0.61046058, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20800781, "step": 12404, "time_per_iteration": 3.1960744857788086 }, { "auxiliary_loss_clip": 0.01414699, "auxiliary_loss_mlp": 0.01036429, "balance_loss_clip": 1.2522999, "balance_loss_mlp": 1.01834524, "epoch": 0.7458289493461596, "flos": 20124615703680.0, "grad_norm": 2.1684712575019, "language_loss": 0.78246772, "learning_rate": 6.401372216950995e-07, "loss": 0.80697906, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18103027, "step": 12405, "time_per_iteration": 2.8594865798950195 }, { "auxiliary_loss_clip": 0.01403994, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.24613237, "balance_loss_mlp": 1.01819932, "epoch": 0.7458890725988276, "flos": 20202581773440.0, "grad_norm": 1.632367130969013, "language_loss": 0.69074482, "learning_rate": 6.398516646785698e-07, "loss": 0.71514332, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.1763916, "step": 12406, "time_per_iteration": 2.850078582763672 }, { "auxiliary_loss_clip": 0.0143367, "auxiliary_loss_mlp": 0.01037783, "balance_loss_clip": 1.26566553, "balance_loss_mlp": 1.01848328, "epoch": 0.7459491958514956, "flos": 17027418372480.0, "grad_norm": 1.736225851077095, "language_loss": 0.65576166, "learning_rate": 6.39566159239002e-07, "loss": 0.68047619, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19299316, "step": 12407, "time_per_iteration": 2.823737144470215 }, { "auxiliary_loss_clip": 0.01421171, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.25609052, "balance_loss_mlp": 1.01739657, "epoch": 0.7460093191041636, "flos": 25088601260160.0, "grad_norm": 1.6771873185307231, "language_loss": 0.72925317, "learning_rate": 6.392807053872212e-07, "loss": 0.75381577, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.17687988, "step": 12408, "time_per_iteration": 2.857102870941162 }, { "auxiliary_loss_clip": 0.01428728, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.26149833, "balance_loss_mlp": 1.01316535, "epoch": 0.7460694423568315, "flos": 21918641011200.0, "grad_norm": 2.8389834353835957, "language_loss": 0.73848861, "learning_rate": 6.38995303134053e-07, "loss": 0.76309437, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.18688965, "step": 12409, "time_per_iteration": 2.812295436859131 }, { "auxiliary_loss_clip": 0.01388776, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.23271835, "balance_loss_mlp": 1.01707554, "epoch": 0.7461295656094995, "flos": 21225787102080.0, "grad_norm": 1.7243309574980383, "language_loss": 0.66266942, "learning_rate": 6.38709952490319e-07, "loss": 0.68690729, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.17944336, "step": 12410, "time_per_iteration": 2.854914665222168 }, { "auxiliary_loss_clip": 0.01400741, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.24225402, "balance_loss_mlp": 1.01488519, "epoch": 0.7461896888621674, "flos": 22357209288960.0, "grad_norm": 1.9873608715473716, "language_loss": 0.85184252, "learning_rate": 6.384246534668396e-07, "loss": 0.8761785, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17980957, "step": 12411, "time_per_iteration": 2.8333091735839844 }, { "auxiliary_loss_clip": 0.01422786, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.26006007, "balance_loss_mlp": 1.01565766, "epoch": 0.7462498121148354, "flos": 25493661123840.0, "grad_norm": 1.5098641832571382, "language_loss": 0.78844297, "learning_rate": 6.381394060744339e-07, "loss": 0.81301188, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18457031, "step": 12412, "time_per_iteration": 2.885653495788574 }, { "auxiliary_loss_clip": 0.01416886, "auxiliary_loss_mlp": 0.0103777, "balance_loss_clip": 1.25365114, "balance_loss_mlp": 1.01895916, "epoch": 0.7463099353675033, "flos": 33960944505600.0, "grad_norm": 1.9682233779148723, "language_loss": 0.63739884, "learning_rate": 6.378542103239188e-07, "loss": 0.66194546, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18823242, "step": 12413, "time_per_iteration": 2.981565237045288 }, { "auxiliary_loss_clip": 0.0118709, "auxiliary_loss_mlp": 0.01050313, "balance_loss_clip": 1.09891713, "balance_loss_mlp": 1.03095293, "epoch": 0.7463700586201714, "flos": 62796650411520.0, "grad_norm": 0.7319914071522214, "language_loss": 0.54885775, "learning_rate": 6.375690662261082e-07, "loss": 0.57123178, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19335938, "step": 12414, "time_per_iteration": 3.376025438308716 }, { "auxiliary_loss_clip": 0.01414079, "auxiliary_loss_mlp": 0.01037885, "balance_loss_clip": 1.25108087, "balance_loss_mlp": 1.01848948, "epoch": 0.7464301818728393, "flos": 33444364913280.0, "grad_norm": 2.2259934739346146, "language_loss": 0.55266225, "learning_rate": 6.372839737918154e-07, "loss": 0.57718194, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19396973, "step": 12415, "time_per_iteration": 3.049652099609375 }, { "auxiliary_loss_clip": 0.01411719, "auxiliary_loss_mlp": 0.01030955, "balance_loss_clip": 1.25036025, "balance_loss_mlp": 1.01234663, "epoch": 0.7464903051255073, "flos": 26881088244480.0, "grad_norm": 1.7984070905232146, "language_loss": 0.75314361, "learning_rate": 6.369989330318506e-07, "loss": 0.77757037, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18615723, "step": 12416, "time_per_iteration": 2.920586585998535 }, { "auxiliary_loss_clip": 0.01413942, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.25119388, "balance_loss_mlp": 1.01787412, "epoch": 0.7465504283781753, "flos": 44101548771840.0, "grad_norm": 1.5758525380809827, "language_loss": 0.6967513, "learning_rate": 6.367139439570233e-07, "loss": 0.72126186, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19238281, "step": 12417, "time_per_iteration": 3.02443528175354 }, { "auxiliary_loss_clip": 0.01437034, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.27149677, "balance_loss_mlp": 1.0179528, "epoch": 0.7466105516308432, "flos": 19683875675520.0, "grad_norm": 1.9309556353316306, "language_loss": 0.74573654, "learning_rate": 6.364290065781392e-07, "loss": 0.77048051, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19396973, "step": 12418, "time_per_iteration": 2.857792854309082 }, { "auxiliary_loss_clip": 0.01410325, "auxiliary_loss_mlp": 0.0103053, "balance_loss_clip": 1.24816966, "balance_loss_mlp": 1.01196885, "epoch": 0.7466706748835112, "flos": 20530128015360.0, "grad_norm": 1.764446461653002, "language_loss": 0.70307153, "learning_rate": 6.361441209060039e-07, "loss": 0.72748005, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18554688, "step": 12419, "time_per_iteration": 2.9096386432647705 }, { "auxiliary_loss_clip": 0.01402567, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.24591088, "balance_loss_mlp": 1.01327562, "epoch": 0.7467307981361792, "flos": 21700487992320.0, "grad_norm": 2.2235562490390572, "language_loss": 0.75670457, "learning_rate": 6.358592869514216e-07, "loss": 0.7810508, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18786621, "step": 12420, "time_per_iteration": 2.8544178009033203 }, { "auxiliary_loss_clip": 0.01420522, "auxiliary_loss_mlp": 0.01032393, "balance_loss_clip": 1.25684428, "balance_loss_mlp": 1.0144403, "epoch": 0.7467909213888472, "flos": 19583259943680.0, "grad_norm": 1.853853336872252, "language_loss": 0.68170172, "learning_rate": 6.355745047251904e-07, "loss": 0.70623082, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.17944336, "step": 12421, "time_per_iteration": 2.9223642349243164 }, { "auxiliary_loss_clip": 0.0141924, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.25375295, "balance_loss_mlp": 1.01397181, "epoch": 0.7468510446415151, "flos": 23705517640320.0, "grad_norm": 7.435867422069504, "language_loss": 0.73099977, "learning_rate": 6.352897742381107e-07, "loss": 0.75552499, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19311523, "step": 12422, "time_per_iteration": 2.9388985633850098 }, { "auxiliary_loss_clip": 0.01404551, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.24514616, "balance_loss_mlp": 1.01404977, "epoch": 0.7469111678941831, "flos": 29327989040640.0, "grad_norm": 1.855367404192207, "language_loss": 0.7547822, "learning_rate": 6.350050955009796e-07, "loss": 0.77915668, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18847656, "step": 12423, "time_per_iteration": 2.9306602478027344 }, { "auxiliary_loss_clip": 0.01399715, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.24092412, "balance_loss_mlp": 1.012012, "epoch": 0.746971291146851, "flos": 21808614360960.0, "grad_norm": 1.2706046628813827, "language_loss": 0.68110698, "learning_rate": 6.347204685245929e-07, "loss": 0.70541203, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18774414, "step": 12424, "time_per_iteration": 2.9095659255981445 }, { "auxiliary_loss_clip": 0.01413785, "auxiliary_loss_mlp": 0.01038851, "balance_loss_clip": 1.24972582, "balance_loss_mlp": 1.0190624, "epoch": 0.747031414399519, "flos": 36258292679040.0, "grad_norm": 1.9628583166154159, "language_loss": 0.7546314, "learning_rate": 6.344358933197418e-07, "loss": 0.77915776, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19799805, "step": 12425, "time_per_iteration": 4.485883712768555 }, { "auxiliary_loss_clip": 0.01420271, "auxiliary_loss_mlp": 0.01034299, "balance_loss_clip": 1.25596809, "balance_loss_mlp": 1.01546431, "epoch": 0.7470915376521869, "flos": 19984564488960.0, "grad_norm": 3.1666902296725707, "language_loss": 0.70832789, "learning_rate": 6.341513698972194e-07, "loss": 0.73287356, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18847656, "step": 12426, "time_per_iteration": 2.849729537963867 }, { "auxiliary_loss_clip": 0.01406485, "auxiliary_loss_mlp": 0.01034815, "balance_loss_clip": 1.24765337, "balance_loss_mlp": 1.01655173, "epoch": 0.747151660904855, "flos": 20093957712000.0, "grad_norm": 1.5438295780642828, "language_loss": 0.65624946, "learning_rate": 6.338668982678139e-07, "loss": 0.68066251, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18249512, "step": 12427, "time_per_iteration": 2.8377249240875244 }, { "auxiliary_loss_clip": 0.01417989, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.25456917, "balance_loss_mlp": 1.01161098, "epoch": 0.7472117841575229, "flos": 16299201012480.0, "grad_norm": 1.6622124188583554, "language_loss": 0.75165904, "learning_rate": 6.335824784423118e-07, "loss": 0.77614599, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19091797, "step": 12428, "time_per_iteration": 2.8006911277770996 }, { "auxiliary_loss_clip": 0.01425474, "auxiliary_loss_mlp": 0.0103416, "balance_loss_clip": 1.25749207, "balance_loss_mlp": 1.01459742, "epoch": 0.7472719074101909, "flos": 21397898897280.0, "grad_norm": 1.9599299798523406, "language_loss": 0.58765745, "learning_rate": 6.33298110431499e-07, "loss": 0.61225373, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 1.6796875, "router_z_loss_mlp": 0.19543457, "step": 12429, "time_per_iteration": 2.8095595836639404 }, { "auxiliary_loss_clip": 0.0142623, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.26104736, "balance_loss_mlp": 1.01763082, "epoch": 0.7473320306628589, "flos": 29655354303360.0, "grad_norm": 1.8103043213659085, "language_loss": 0.62075961, "learning_rate": 6.330137942461595e-07, "loss": 0.64538538, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18737793, "step": 12430, "time_per_iteration": 2.9137394428253174 }, { "auxiliary_loss_clip": 0.01401762, "auxiliary_loss_mlp": 0.01033666, "balance_loss_clip": 1.24218225, "balance_loss_mlp": 1.0141629, "epoch": 0.7473921539155268, "flos": 24147298298880.0, "grad_norm": 1.4739379564644057, "language_loss": 0.76552403, "learning_rate": 6.327295298970734e-07, "loss": 0.78987825, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19494629, "step": 12431, "time_per_iteration": 2.881517171859741 }, { "auxiliary_loss_clip": 0.01412408, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.24947333, "balance_loss_mlp": 1.01704478, "epoch": 0.7474522771681948, "flos": 17495423032320.0, "grad_norm": 1.6993995715478654, "language_loss": 0.76391143, "learning_rate": 6.32445317395021e-07, "loss": 0.7883842, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.17834473, "step": 12432, "time_per_iteration": 4.340936899185181 }, { "auxiliary_loss_clip": 0.01423771, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.25619495, "balance_loss_mlp": 1.01269436, "epoch": 0.7475124004208628, "flos": 16736004743040.0, "grad_norm": 2.136116144373634, "language_loss": 0.70875347, "learning_rate": 6.321611567507787e-07, "loss": 0.73331785, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.1998291, "step": 12433, "time_per_iteration": 2.7950756549835205 }, { "auxiliary_loss_clip": 0.01419201, "auxiliary_loss_mlp": 0.01030465, "balance_loss_clip": 1.25547147, "balance_loss_mlp": 1.01097405, "epoch": 0.7475725236735308, "flos": 19730007388800.0, "grad_norm": 1.9679652025593597, "language_loss": 0.68080568, "learning_rate": 6.318770479751232e-07, "loss": 0.70530236, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19506836, "step": 12434, "time_per_iteration": 2.879580020904541 }, { "auxiliary_loss_clip": 0.01390043, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.23540735, "balance_loss_mlp": 1.01400208, "epoch": 0.7476326469261987, "flos": 26297039376000.0, "grad_norm": 1.4300783671726351, "language_loss": 0.80337906, "learning_rate": 6.315929910788263e-07, "loss": 0.82760239, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.18286133, "step": 12435, "time_per_iteration": 2.8806560039520264 }, { "auxiliary_loss_clip": 0.0141638, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.25047469, "balance_loss_mlp": 1.01007354, "epoch": 0.7476927701788667, "flos": 31844304639360.0, "grad_norm": 2.0165663518604773, "language_loss": 0.68123138, "learning_rate": 6.313089860726604e-07, "loss": 0.70568597, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19006348, "step": 12436, "time_per_iteration": 2.9192166328430176 }, { "auxiliary_loss_clip": 0.01423814, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.25691223, "balance_loss_mlp": 1.01623452, "epoch": 0.7477528934315346, "flos": 31807991047680.0, "grad_norm": 1.7405568607138195, "language_loss": 0.71402609, "learning_rate": 6.31025032967396e-07, "loss": 0.73861462, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18823242, "step": 12437, "time_per_iteration": 5.654440879821777 }, { "auxiliary_loss_clip": 0.01394719, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.23681843, "balance_loss_mlp": 1.01537991, "epoch": 0.7478130166842026, "flos": 20380620637440.0, "grad_norm": 9.056087163221031, "language_loss": 0.68013227, "learning_rate": 6.307411317737986e-07, "loss": 0.70441258, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.17932129, "step": 12438, "time_per_iteration": 2.8613014221191406 }, { "auxiliary_loss_clip": 0.01407596, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.24610472, "balance_loss_mlp": 1.01769221, "epoch": 0.7478731399368705, "flos": 18157664194560.0, "grad_norm": 1.8752179293142412, "language_loss": 0.81450802, "learning_rate": 6.304572825026344e-07, "loss": 0.83895499, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19396973, "step": 12439, "time_per_iteration": 2.8371241092681885 }, { "auxiliary_loss_clip": 0.01405286, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.24535048, "balance_loss_mlp": 1.01540196, "epoch": 0.7479332631895386, "flos": 15276221907840.0, "grad_norm": 1.9973296407947154, "language_loss": 0.72161031, "learning_rate": 6.301734851646674e-07, "loss": 0.74600017, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18286133, "step": 12440, "time_per_iteration": 2.8294785022735596 }, { "auxiliary_loss_clip": 0.01407069, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.2479012, "balance_loss_mlp": 1.01268518, "epoch": 0.7479933864422065, "flos": 21152481246720.0, "grad_norm": 1.6475253314475296, "language_loss": 0.74829066, "learning_rate": 6.298897397706597e-07, "loss": 0.77267051, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18225098, "step": 12441, "time_per_iteration": 2.898759603500366 }, { "auxiliary_loss_clip": 0.01414562, "auxiliary_loss_mlp": 0.01036717, "balance_loss_clip": 1.25026703, "balance_loss_mlp": 1.01614189, "epoch": 0.7480535096948745, "flos": 14400080737920.0, "grad_norm": 2.370559785513862, "language_loss": 0.83114088, "learning_rate": 6.296060463313698e-07, "loss": 0.85565364, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20568848, "step": 12442, "time_per_iteration": 2.8207619190216064 }, { "auxiliary_loss_clip": 0.01434538, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.26800227, "balance_loss_mlp": 1.01866364, "epoch": 0.7481136329475425, "flos": 27356151093120.0, "grad_norm": 2.046024760819203, "language_loss": 0.64662063, "learning_rate": 6.293224048575565e-07, "loss": 0.67134911, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.1965332, "step": 12443, "time_per_iteration": 2.9087905883789062 }, { "auxiliary_loss_clip": 0.01401251, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.24161208, "balance_loss_mlp": 1.00947905, "epoch": 0.7481737562002104, "flos": 19539526204800.0, "grad_norm": 2.080946538257685, "language_loss": 0.72449958, "learning_rate": 6.29038815359975e-07, "loss": 0.74878699, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18017578, "step": 12444, "time_per_iteration": 2.8500118255615234 }, { "auxiliary_loss_clip": 0.01413554, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.25098252, "balance_loss_mlp": 1.01483321, "epoch": 0.7482338794528784, "flos": 21769450346880.0, "grad_norm": 1.3767583578565141, "language_loss": 0.69907916, "learning_rate": 6.287552778493786e-07, "loss": 0.72355282, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18981934, "step": 12445, "time_per_iteration": 2.8494691848754883 }, { "auxiliary_loss_clip": 0.0140363, "auxiliary_loss_mlp": 0.0102928, "balance_loss_clip": 1.24384379, "balance_loss_mlp": 1.01092184, "epoch": 0.7482940027055464, "flos": 18706530591360.0, "grad_norm": 1.6055861593137224, "language_loss": 0.74861014, "learning_rate": 6.28471792336519e-07, "loss": 0.77293921, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18383789, "step": 12446, "time_per_iteration": 2.820281982421875 }, { "auxiliary_loss_clip": 0.01419575, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.25488079, "balance_loss_mlp": 1.01136041, "epoch": 0.7483541259582144, "flos": 16006656263040.0, "grad_norm": 1.9523759639281335, "language_loss": 0.73769248, "learning_rate": 6.281883588321475e-07, "loss": 0.76219422, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19250488, "step": 12447, "time_per_iteration": 2.8095130920410156 }, { "auxiliary_loss_clip": 0.01400243, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.23899543, "balance_loss_mlp": 1.01393259, "epoch": 0.7484142492108823, "flos": 25567147958400.0, "grad_norm": 2.6412419409881323, "language_loss": 0.72665811, "learning_rate": 6.279049773470109e-07, "loss": 0.75097716, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.17736816, "step": 12448, "time_per_iteration": 2.8531925678253174 }, { "auxiliary_loss_clip": 0.01412069, "auxiliary_loss_mlp": 0.01034951, "balance_loss_clip": 1.24785984, "balance_loss_mlp": 1.01685452, "epoch": 0.7484743724635503, "flos": 22896574277760.0, "grad_norm": 2.0624012992412735, "language_loss": 0.73983645, "learning_rate": 6.276216478918543e-07, "loss": 0.76430666, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18103027, "step": 12449, "time_per_iteration": 2.845534086227417 }, { "auxiliary_loss_clip": 0.014295, "auxiliary_loss_mlp": 0.01035281, "balance_loss_clip": 1.26185703, "balance_loss_mlp": 1.01561117, "epoch": 0.7485344957162182, "flos": 25310916800640.0, "grad_norm": 2.17326506757167, "language_loss": 0.6204015, "learning_rate": 6.273383704774225e-07, "loss": 0.64504933, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19677734, "step": 12450, "time_per_iteration": 2.8838274478912354 }, { "auxiliary_loss_clip": 0.01404035, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.24711871, "balance_loss_mlp": 1.01331925, "epoch": 0.7485946189688862, "flos": 27064465994880.0, "grad_norm": 2.073065104731006, "language_loss": 0.71523398, "learning_rate": 6.270551451144577e-07, "loss": 0.73958683, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17944336, "step": 12451, "time_per_iteration": 2.8766801357269287 }, { "auxiliary_loss_clip": 0.01430645, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.26162481, "balance_loss_mlp": 1.02014112, "epoch": 0.7486547422215541, "flos": 26918080508160.0, "grad_norm": 2.177203382172859, "language_loss": 0.81084663, "learning_rate": 6.267719718136988e-07, "loss": 0.83553547, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 1.68945312, "router_z_loss_mlp": 0.1809082, "step": 12452, "time_per_iteration": 2.908031463623047 }, { "auxiliary_loss_clip": 0.01443904, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.27540493, "balance_loss_mlp": 1.01404786, "epoch": 0.7487148654742222, "flos": 22356485372160.0, "grad_norm": 2.2577179174956994, "language_loss": 0.72265023, "learning_rate": 6.264888505858843e-07, "loss": 0.74741894, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.18933105, "step": 12453, "time_per_iteration": 2.8831496238708496 }, { "auxiliary_loss_clip": 0.01421519, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.2584517, "balance_loss_mlp": 1.01409125, "epoch": 0.7487749887268901, "flos": 23049022567680.0, "grad_norm": 1.6193462415975595, "language_loss": 0.74787021, "learning_rate": 6.262057814417517e-07, "loss": 0.7724117, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1854248, "step": 12454, "time_per_iteration": 2.906676769256592 }, { "auxiliary_loss_clip": 0.01181161, "auxiliary_loss_mlp": 0.01022783, "balance_loss_clip": 1.09303331, "balance_loss_mlp": 1.00218344, "epoch": 0.7488351119795581, "flos": 71556750011520.0, "grad_norm": 0.7365398134440438, "language_loss": 0.59396625, "learning_rate": 6.259227643920322e-07, "loss": 0.61600566, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20605469, "step": 12455, "time_per_iteration": 3.5278584957122803 }, { "auxiliary_loss_clip": 0.01412237, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.25330496, "balance_loss_mlp": 1.01463771, "epoch": 0.748895235232226, "flos": 17203692689280.0, "grad_norm": 1.5847272867481101, "language_loss": 0.80740643, "learning_rate": 6.256397994474592e-07, "loss": 0.83186573, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19067383, "step": 12456, "time_per_iteration": 2.8248488903045654 }, { "auxiliary_loss_clip": 0.01183989, "auxiliary_loss_mlp": 0.01036206, "balance_loss_clip": 1.09545088, "balance_loss_mlp": 1.01961231, "epoch": 0.748955358484894, "flos": 59008770921600.0, "grad_norm": 0.8359277753263036, "language_loss": 0.61452705, "learning_rate": 6.25356886618763e-07, "loss": 0.636729, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.16601562, "step": 12457, "time_per_iteration": 3.2244930267333984 }, { "auxiliary_loss_clip": 0.01417941, "auxiliary_loss_mlp": 0.01040594, "balance_loss_clip": 1.25357568, "balance_loss_mlp": 1.02110362, "epoch": 0.749015481737562, "flos": 11366823588480.0, "grad_norm": 4.7665354542432254, "language_loss": 0.67612457, "learning_rate": 6.250740259166711e-07, "loss": 0.70070994, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19494629, "step": 12458, "time_per_iteration": 2.8123645782470703 }, { "auxiliary_loss_clip": 0.01404671, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.2448318, "balance_loss_mlp": 1.01613414, "epoch": 0.74907560499023, "flos": 21116574858240.0, "grad_norm": 1.7365549635788728, "language_loss": 0.80780137, "learning_rate": 6.247912173519106e-07, "loss": 0.83220065, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19128418, "step": 12459, "time_per_iteration": 3.03592848777771 }, { "auxiliary_loss_clip": 0.01417162, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.25575912, "balance_loss_mlp": 1.01629424, "epoch": 0.749135728242898, "flos": 22276845244800.0, "grad_norm": 1.7394107446926732, "language_loss": 0.81163335, "learning_rate": 6.245084609352043e-07, "loss": 0.83615601, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18798828, "step": 12460, "time_per_iteration": 4.249496698379517 }, { "auxiliary_loss_clip": 0.01408095, "auxiliary_loss_mlp": 0.0103228, "balance_loss_clip": 1.2466855, "balance_loss_mlp": 1.0129087, "epoch": 0.7491958514955659, "flos": 24067250968320.0, "grad_norm": 2.0175209933907445, "language_loss": 0.86343521, "learning_rate": 6.242257566772755e-07, "loss": 0.88783896, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19384766, "step": 12461, "time_per_iteration": 2.889167308807373 }, { "auxiliary_loss_clip": 0.01402459, "auxiliary_loss_mlp": 0.01034891, "balance_loss_clip": 1.24443913, "balance_loss_mlp": 1.01667607, "epoch": 0.7492559747482339, "flos": 24501113786880.0, "grad_norm": 1.749516115506894, "language_loss": 0.70605892, "learning_rate": 6.239431045888435e-07, "loss": 0.73043239, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18212891, "step": 12462, "time_per_iteration": 2.9687438011169434 }, { "auxiliary_loss_clip": 0.01410557, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.24848032, "balance_loss_mlp": 1.01477075, "epoch": 0.7493160980009018, "flos": 27756731721600.0, "grad_norm": 1.8041319594063903, "language_loss": 0.71324146, "learning_rate": 6.236605046806267e-07, "loss": 0.737688, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19311523, "step": 12463, "time_per_iteration": 2.9328815937042236 }, { "auxiliary_loss_clip": 0.01420127, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.25823998, "balance_loss_mlp": 1.01845407, "epoch": 0.7493762212535698, "flos": 30238181562240.0, "grad_norm": 1.9868968205806885, "language_loss": 0.78278756, "learning_rate": 6.233779569633419e-07, "loss": 0.80736077, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18737793, "step": 12464, "time_per_iteration": 2.942558765411377 }, { "auxiliary_loss_clip": 0.01404969, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.24323022, "balance_loss_mlp": 1.01821542, "epoch": 0.7494363445062378, "flos": 21954456910080.0, "grad_norm": 1.7153416336742429, "language_loss": 0.7946583, "learning_rate": 6.230954614477034e-07, "loss": 0.8190822, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19189453, "step": 12465, "time_per_iteration": 2.8788678646087646 }, { "auxiliary_loss_clip": 0.01433178, "auxiliary_loss_mlp": 0.01038882, "balance_loss_clip": 1.26351845, "balance_loss_mlp": 1.01865244, "epoch": 0.7494964677589058, "flos": 12496752696960.0, "grad_norm": 2.276132614050069, "language_loss": 0.75717998, "learning_rate": 6.22813018144422e-07, "loss": 0.78190053, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.20239258, "step": 12466, "time_per_iteration": 2.806399345397949 }, { "auxiliary_loss_clip": 0.01430047, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.26632547, "balance_loss_mlp": 1.01791489, "epoch": 0.7495565910115737, "flos": 21663088525440.0, "grad_norm": 5.872717285421449, "language_loss": 0.67423278, "learning_rate": 6.22530627064209e-07, "loss": 0.69889319, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18078613, "step": 12467, "time_per_iteration": 4.319119691848755 }, { "auxiliary_loss_clip": 0.01424094, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.25921059, "balance_loss_mlp": 1.01356375, "epoch": 0.7496167142642417, "flos": 15277036314240.0, "grad_norm": 4.558684364239522, "language_loss": 0.77088392, "learning_rate": 6.222482882177735e-07, "loss": 0.79544222, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1817627, "step": 12468, "time_per_iteration": 2.802034616470337 }, { "auxiliary_loss_clip": 0.01408395, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.24729466, "balance_loss_mlp": 1.01560271, "epoch": 0.7496768375169096, "flos": 22065207477120.0, "grad_norm": 2.1606248057545914, "language_loss": 0.70485413, "learning_rate": 6.219660016158201e-07, "loss": 0.72929484, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20080566, "step": 12469, "time_per_iteration": 2.8313517570495605 }, { "auxiliary_loss_clip": 0.01421744, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.25757396, "balance_loss_mlp": 1.01672351, "epoch": 0.7497369607695776, "flos": 19064961048960.0, "grad_norm": 1.8523547203377615, "language_loss": 0.6943329, "learning_rate": 6.216837672690543e-07, "loss": 0.71890926, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19165039, "step": 12470, "time_per_iteration": 2.826503038406372 }, { "auxiliary_loss_clip": 0.01433175, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.26328349, "balance_loss_mlp": 1.01647806, "epoch": 0.7497970840222457, "flos": 21627679829760.0, "grad_norm": 1.9883853060365262, "language_loss": 0.75543773, "learning_rate": 6.214015851881793e-07, "loss": 0.78013653, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.20214844, "step": 12471, "time_per_iteration": 2.8243792057037354 }, { "auxiliary_loss_clip": 0.01406215, "auxiliary_loss_mlp": 0.01031249, "balance_loss_clip": 1.24384594, "balance_loss_mlp": 1.01262891, "epoch": 0.7498572072749136, "flos": 13743540420480.0, "grad_norm": 2.2023509364873792, "language_loss": 0.7826823, "learning_rate": 6.211194553838929e-07, "loss": 0.8070569, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18615723, "step": 12472, "time_per_iteration": 5.638099908828735 }, { "auxiliary_loss_clip": 0.01409006, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.24921799, "balance_loss_mlp": 1.01364541, "epoch": 0.7499173305275816, "flos": 22976893077120.0, "grad_norm": 1.5428894868137881, "language_loss": 0.84730339, "learning_rate": 6.208373778668951e-07, "loss": 0.87172079, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1907959, "step": 12473, "time_per_iteration": 2.8864071369171143 }, { "auxiliary_loss_clip": 0.01427815, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.26152599, "balance_loss_mlp": 1.01767087, "epoch": 0.7499774537802495, "flos": 22748921936640.0, "grad_norm": 2.3523608007598207, "language_loss": 0.75137985, "learning_rate": 6.205553526478829e-07, "loss": 0.77602553, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.19091797, "step": 12474, "time_per_iteration": 2.8588924407958984 }, { "auxiliary_loss_clip": 0.01443316, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.27391481, "balance_loss_mlp": 1.01697969, "epoch": 0.7500375770329175, "flos": 18305768983680.0, "grad_norm": 2.98924882898071, "language_loss": 0.75214255, "learning_rate": 6.202733797375492e-07, "loss": 0.77693373, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.18823242, "step": 12475, "time_per_iteration": 2.8488049507141113 }, { "auxiliary_loss_clip": 0.01436151, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.26526606, "balance_loss_mlp": 1.01688433, "epoch": 0.7500977002855854, "flos": 19178833507200.0, "grad_norm": 2.6536037971998057, "language_loss": 0.8059541, "learning_rate": 6.199914591465878e-07, "loss": 0.83068335, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 1.70996094, "router_z_loss_mlp": 0.19885254, "step": 12476, "time_per_iteration": 2.868831157684326 }, { "auxiliary_loss_clip": 0.01413067, "auxiliary_loss_mlp": 0.01035379, "balance_loss_clip": 1.24987328, "balance_loss_mlp": 1.01682997, "epoch": 0.7501578235382534, "flos": 22174057762560.0, "grad_norm": 1.8075033734966004, "language_loss": 0.78118849, "learning_rate": 6.19709590885688e-07, "loss": 0.80567294, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.1854248, "step": 12477, "time_per_iteration": 2.8602864742279053 }, { "auxiliary_loss_clip": 0.01181937, "auxiliary_loss_mlp": 0.01018882, "balance_loss_clip": 1.09420013, "balance_loss_mlp": 0.99542159, "epoch": 0.7502179467909214, "flos": 64489861025280.0, "grad_norm": 0.8036009265908917, "language_loss": 0.54449284, "learning_rate": 6.194277749655394e-07, "loss": 0.56650102, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.234375, "step": 12478, "time_per_iteration": 3.3315060138702393 }, { "auxiliary_loss_clip": 0.01404429, "auxiliary_loss_mlp": 0.01033422, "balance_loss_clip": 1.24420953, "balance_loss_mlp": 1.01474166, "epoch": 0.7502780700435894, "flos": 20486122807680.0, "grad_norm": 1.8850877143869944, "language_loss": 0.80480963, "learning_rate": 6.191460113968272e-07, "loss": 0.82918817, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18676758, "step": 12479, "time_per_iteration": 2.901224374771118 }, { "auxiliary_loss_clip": 0.01433399, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.26472259, "balance_loss_mlp": 1.01580131, "epoch": 0.7503381932962573, "flos": 20454016982400.0, "grad_norm": 2.285845219170473, "language_loss": 0.64078653, "learning_rate": 6.188643001902369e-07, "loss": 0.66546786, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 1.6875, "router_z_loss_mlp": 0.18945312, "step": 12480, "time_per_iteration": 2.858152389526367 }, { "auxiliary_loss_clip": 0.01404371, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.24722457, "balance_loss_mlp": 1.02149498, "epoch": 0.7503983165489253, "flos": 22391939312640.0, "grad_norm": 1.6800236434957445, "language_loss": 0.78439772, "learning_rate": 6.185826413564512e-07, "loss": 0.80883777, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18139648, "step": 12481, "time_per_iteration": 2.939526081085205 }, { "auxiliary_loss_clip": 0.01407523, "auxiliary_loss_mlp": 0.0103748, "balance_loss_clip": 1.24410844, "balance_loss_mlp": 1.01794147, "epoch": 0.7504584398015932, "flos": 24910109948160.0, "grad_norm": 1.663056736199344, "language_loss": 0.72127694, "learning_rate": 6.183010349061501e-07, "loss": 0.74572694, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19555664, "step": 12482, "time_per_iteration": 2.924591064453125 }, { "auxiliary_loss_clip": 0.01410539, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.24814248, "balance_loss_mlp": 1.01841068, "epoch": 0.7505185630542612, "flos": 25896323013120.0, "grad_norm": 1.673825100827398, "language_loss": 0.70297801, "learning_rate": 6.180194808500118e-07, "loss": 0.72745895, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19128418, "step": 12483, "time_per_iteration": 2.8641340732574463 }, { "auxiliary_loss_clip": 0.01418913, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.25615156, "balance_loss_mlp": 1.01337838, "epoch": 0.7505786863069293, "flos": 23152941169920.0, "grad_norm": 1.869204626047582, "language_loss": 0.75522268, "learning_rate": 6.177379791987131e-07, "loss": 0.77972847, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18286133, "step": 12484, "time_per_iteration": 2.8730931282043457 }, { "auxiliary_loss_clip": 0.01406213, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.24513841, "balance_loss_mlp": 1.01572776, "epoch": 0.7506388095595972, "flos": 16992326390400.0, "grad_norm": 1.8574209210712411, "language_loss": 0.85759228, "learning_rate": 6.174565299629295e-07, "loss": 0.88200879, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19726562, "step": 12485, "time_per_iteration": 2.82473087310791 }, { "auxiliary_loss_clip": 0.01421938, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.26175427, "balance_loss_mlp": 1.01632428, "epoch": 0.7506989328122652, "flos": 22354856559360.0, "grad_norm": 1.5313888610017428, "language_loss": 0.78860283, "learning_rate": 6.171751331533323e-07, "loss": 0.81317472, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18933105, "step": 12486, "time_per_iteration": 2.888272523880005 }, { "auxiliary_loss_clip": 0.01419554, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.25517821, "balance_loss_mlp": 1.01455426, "epoch": 0.7507590560649331, "flos": 25787201258880.0, "grad_norm": 2.2463821732140676, "language_loss": 0.73488593, "learning_rate": 6.168937887805932e-07, "loss": 0.75941443, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18737793, "step": 12487, "time_per_iteration": 2.889967679977417 }, { "auxiliary_loss_clip": 0.01424755, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.26058578, "balance_loss_mlp": 1.01527143, "epoch": 0.7508191793176011, "flos": 24289973712000.0, "grad_norm": 2.7295280587991484, "language_loss": 0.68765944, "learning_rate": 6.166124968553801e-07, "loss": 0.71224517, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18566895, "step": 12488, "time_per_iteration": 2.9213786125183105 }, { "auxiliary_loss_clip": 0.01422514, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.25930071, "balance_loss_mlp": 1.01780844, "epoch": 0.750879302570269, "flos": 19908589190400.0, "grad_norm": 19.09172723996267, "language_loss": 0.7771998, "learning_rate": 6.163312573883592e-07, "loss": 0.80178541, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18249512, "step": 12489, "time_per_iteration": 2.810661554336548 }, { "auxiliary_loss_clip": 0.01407254, "auxiliary_loss_mlp": 0.01040023, "balance_loss_clip": 1.24829388, "balance_loss_mlp": 1.02180779, "epoch": 0.750939425822937, "flos": 29217690921600.0, "grad_norm": 2.446715817527074, "language_loss": 0.75747764, "learning_rate": 6.160500703901956e-07, "loss": 0.78195041, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18212891, "step": 12490, "time_per_iteration": 2.8904542922973633 }, { "auxiliary_loss_clip": 0.01408268, "auxiliary_loss_mlp": 0.01036466, "balance_loss_clip": 1.24685693, "balance_loss_mlp": 1.01874006, "epoch": 0.750999549075605, "flos": 21152209777920.0, "grad_norm": 1.4919482846402805, "language_loss": 0.79025286, "learning_rate": 6.157689358715527e-07, "loss": 0.81470025, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.17724609, "step": 12491, "time_per_iteration": 2.8387835025787354 }, { "auxiliary_loss_clip": 0.01400794, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.24235666, "balance_loss_mlp": 1.01869023, "epoch": 0.751059672328273, "flos": 23557593830400.0, "grad_norm": 1.9104272516959286, "language_loss": 0.76931787, "learning_rate": 6.154878538430899e-07, "loss": 0.79369414, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18151855, "step": 12492, "time_per_iteration": 2.934488296508789 }, { "auxiliary_loss_clip": 0.01405663, "auxiliary_loss_mlp": 0.01032985, "balance_loss_clip": 1.24428499, "balance_loss_mlp": 1.01496065, "epoch": 0.7511197955809409, "flos": 18999120585600.0, "grad_norm": 1.8108941772907208, "language_loss": 0.71372783, "learning_rate": 6.152068243154671e-07, "loss": 0.73811424, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18029785, "step": 12493, "time_per_iteration": 2.864041805267334 }, { "auxiliary_loss_clip": 0.01410937, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.24829936, "balance_loss_mlp": 1.01520801, "epoch": 0.7511799188336089, "flos": 22055163131520.0, "grad_norm": 1.9020213367678784, "language_loss": 0.80652827, "learning_rate": 6.149258472993395e-07, "loss": 0.83098346, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19372559, "step": 12494, "time_per_iteration": 2.869568347930908 }, { "auxiliary_loss_clip": 0.0141618, "auxiliary_loss_mlp": 0.01034394, "balance_loss_clip": 1.25413525, "balance_loss_mlp": 1.01560616, "epoch": 0.7512400420862768, "flos": 16474525188480.0, "grad_norm": 2.0697022343619693, "language_loss": 0.79491669, "learning_rate": 6.146449228053634e-07, "loss": 0.81942242, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18798828, "step": 12495, "time_per_iteration": 4.344980955123901 }, { "auxiliary_loss_clip": 0.01402283, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.24266148, "balance_loss_mlp": 1.01668978, "epoch": 0.7513001653389448, "flos": 20457862790400.0, "grad_norm": 1.8181371972359652, "language_loss": 0.71882701, "learning_rate": 6.143640508441898e-07, "loss": 0.74320501, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18835449, "step": 12496, "time_per_iteration": 2.8645927906036377 }, { "auxiliary_loss_clip": 0.01415671, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.25391245, "balance_loss_mlp": 1.01634836, "epoch": 0.7513602885916129, "flos": 23487183642240.0, "grad_norm": 1.6829236524801938, "language_loss": 0.78587139, "learning_rate": 6.140832314264705e-07, "loss": 0.8103807, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18896484, "step": 12497, "time_per_iteration": 2.861813545227051 }, { "auxiliary_loss_clip": 0.01406156, "auxiliary_loss_mlp": 0.01034992, "balance_loss_clip": 1.24427915, "balance_loss_mlp": 1.01591873, "epoch": 0.7514204118442808, "flos": 26808506305920.0, "grad_norm": 1.5058327432914824, "language_loss": 0.77313495, "learning_rate": 6.13802464562855e-07, "loss": 0.79754639, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.1907959, "step": 12498, "time_per_iteration": 2.9465792179107666 }, { "auxiliary_loss_clip": 0.0141105, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.25241709, "balance_loss_mlp": 1.01862574, "epoch": 0.7514805350969488, "flos": 19875307000320.0, "grad_norm": 1.8126640823116762, "language_loss": 0.74799573, "learning_rate": 6.135217502639878e-07, "loss": 0.77247143, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.17907715, "step": 12499, "time_per_iteration": 2.9235429763793945 }, { "auxiliary_loss_clip": 0.01407623, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.24798286, "balance_loss_mlp": 1.01563096, "epoch": 0.7515406583496167, "flos": 24582246992640.0, "grad_norm": 1.8435611398797505, "language_loss": 0.79755712, "learning_rate": 6.132410885405148e-07, "loss": 0.82196772, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.17810059, "step": 12500, "time_per_iteration": 2.9319345951080322 }, { "auxiliary_loss_clip": 0.01442549, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.27135444, "balance_loss_mlp": 1.01518822, "epoch": 0.7516007816022847, "flos": 20129773610880.0, "grad_norm": 2.2486830955849, "language_loss": 0.74562824, "learning_rate": 6.129604794030794e-07, "loss": 0.77040005, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 0.19433594, "step": 12501, "time_per_iteration": 2.8921523094177246 }, { "auxiliary_loss_clip": 0.01402263, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.24165356, "balance_loss_mlp": 1.01514173, "epoch": 0.7516609048549526, "flos": 22795098894720.0, "grad_norm": 16.350561167693222, "language_loss": 0.7968235, "learning_rate": 6.126799228623207e-07, "loss": 0.82117939, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18188477, "step": 12502, "time_per_iteration": 4.372798442840576 }, { "auxiliary_loss_clip": 0.01422679, "auxiliary_loss_mlp": 0.01034136, "balance_loss_clip": 1.25887966, "balance_loss_mlp": 1.01589704, "epoch": 0.7517210281076206, "flos": 10641094692480.0, "grad_norm": 2.062031127974115, "language_loss": 0.71065092, "learning_rate": 6.123994189288786e-07, "loss": 0.73521906, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18237305, "step": 12503, "time_per_iteration": 2.795586109161377 }, { "auxiliary_loss_clip": 0.01182892, "auxiliary_loss_mlp": 0.01016496, "balance_loss_clip": 1.09642649, "balance_loss_mlp": 0.99541944, "epoch": 0.7517811513602886, "flos": 66082202421120.0, "grad_norm": 0.9770093400966515, "language_loss": 0.64069164, "learning_rate": 6.121189676133903e-07, "loss": 0.66268551, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.2109375, "step": 12504, "time_per_iteration": 3.258028745651245 }, { "auxiliary_loss_clip": 0.01399552, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.24218059, "balance_loss_mlp": 1.01492715, "epoch": 0.7518412746129566, "flos": 37282267169280.0, "grad_norm": 2.646396450129587, "language_loss": 0.69074917, "learning_rate": 6.118385689264896e-07, "loss": 0.71507251, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17858887, "step": 12505, "time_per_iteration": 3.008297920227051 }, { "auxiliary_loss_clip": 0.0118732, "auxiliary_loss_mlp": 0.01022075, "balance_loss_clip": 1.09891701, "balance_loss_mlp": 1.00204754, "epoch": 0.7519013978656245, "flos": 60550275144960.0, "grad_norm": 1.05462818756991, "language_loss": 0.55179429, "learning_rate": 6.11558222878809e-07, "loss": 0.5738883, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20019531, "step": 12506, "time_per_iteration": 4.853604316711426 }, { "auxiliary_loss_clip": 0.01425654, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.26159692, "balance_loss_mlp": 1.01532364, "epoch": 0.7519615211182925, "flos": 18816104793600.0, "grad_norm": 2.1468944798636262, "language_loss": 0.79460728, "learning_rate": 6.112779294809796e-07, "loss": 0.81920302, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18603516, "step": 12507, "time_per_iteration": 4.247362852096558 }, { "auxiliary_loss_clip": 0.01407648, "auxiliary_loss_mlp": 0.0103308, "balance_loss_clip": 1.2488935, "balance_loss_mlp": 1.01471031, "epoch": 0.7520216443709604, "flos": 14583548977920.0, "grad_norm": 3.8521031066597944, "language_loss": 0.71968645, "learning_rate": 6.10997688743631e-07, "loss": 0.74409372, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18383789, "step": 12508, "time_per_iteration": 2.826523542404175 }, { "auxiliary_loss_clip": 0.01413725, "auxiliary_loss_mlp": 0.01032, "balance_loss_clip": 1.25178623, "balance_loss_mlp": 1.01295018, "epoch": 0.7520817676236284, "flos": 17065632245760.0, "grad_norm": 1.5966007704382137, "language_loss": 0.72159648, "learning_rate": 6.107175006773885e-07, "loss": 0.7460537, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19030762, "step": 12509, "time_per_iteration": 2.8108487129211426 }, { "auxiliary_loss_clip": 0.01424354, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.25738788, "balance_loss_mlp": 1.01981044, "epoch": 0.7521418908762965, "flos": 25677446077440.0, "grad_norm": 2.0924255195168864, "language_loss": 0.6277138, "learning_rate": 6.104373652928785e-07, "loss": 0.65234613, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19067383, "step": 12510, "time_per_iteration": 2.8786427974700928 }, { "auxiliary_loss_clip": 0.01412679, "auxiliary_loss_mlp": 0.0103822, "balance_loss_clip": 1.25413978, "balance_loss_mlp": 1.01911092, "epoch": 0.7522020141289644, "flos": 20896566802560.0, "grad_norm": 1.7097508539425792, "language_loss": 0.82820797, "learning_rate": 6.10157282600722e-07, "loss": 0.85271698, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19104004, "step": 12511, "time_per_iteration": 2.8683831691741943 }, { "auxiliary_loss_clip": 0.01420667, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.2552228, "balance_loss_mlp": 1.01535869, "epoch": 0.7522621373816324, "flos": 12647798398080.0, "grad_norm": 1.9365320376541595, "language_loss": 0.76910347, "learning_rate": 6.098772526115412e-07, "loss": 0.79365873, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19506836, "step": 12512, "time_per_iteration": 2.8434855937957764 }, { "auxiliary_loss_clip": 0.01389766, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.2345593, "balance_loss_mlp": 1.01573634, "epoch": 0.7523222606343003, "flos": 25636200802560.0, "grad_norm": 2.107700431577854, "language_loss": 0.82717276, "learning_rate": 6.095972753359537e-07, "loss": 0.85140753, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.17993164, "step": 12513, "time_per_iteration": 2.893735408782959 }, { "auxiliary_loss_clip": 0.01425641, "auxiliary_loss_mlp": 0.01036922, "balance_loss_clip": 1.26160192, "balance_loss_mlp": 1.01762211, "epoch": 0.7523823838869683, "flos": 20458586707200.0, "grad_norm": 1.687500583729265, "language_loss": 0.75861132, "learning_rate": 6.093173507845771e-07, "loss": 0.78323698, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19311523, "step": 12514, "time_per_iteration": 2.836536407470703 }, { "auxiliary_loss_clip": 0.01395902, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.24012041, "balance_loss_mlp": 1.01275277, "epoch": 0.7524425071396362, "flos": 14728803344640.0, "grad_norm": 1.9857301811744628, "language_loss": 0.68684578, "learning_rate": 6.090374789680271e-07, "loss": 0.7111159, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18371582, "step": 12515, "time_per_iteration": 2.817331075668335 }, { "auxiliary_loss_clip": 0.01415851, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.25352597, "balance_loss_mlp": 1.01383877, "epoch": 0.7525026303923043, "flos": 30604394125440.0, "grad_norm": 2.10976560304191, "language_loss": 0.70994449, "learning_rate": 6.087576598969137e-07, "loss": 0.73441851, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.17712402, "step": 12516, "time_per_iteration": 2.98014235496521 }, { "auxiliary_loss_clip": 0.01411969, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.25277019, "balance_loss_mlp": 1.01608574, "epoch": 0.7525627536449722, "flos": 24802843230720.0, "grad_norm": 1.6851932802490854, "language_loss": 0.89709735, "learning_rate": 6.084778935818495e-07, "loss": 0.92156506, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18713379, "step": 12517, "time_per_iteration": 2.865992546081543 }, { "auxiliary_loss_clip": 0.01436706, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.27056086, "balance_loss_mlp": 1.02026176, "epoch": 0.7526228768976402, "flos": 20790204981120.0, "grad_norm": 1.701226716471453, "language_loss": 0.75208861, "learning_rate": 6.081981800334437e-07, "loss": 0.77683926, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.18103027, "step": 12518, "time_per_iteration": 2.8460352420806885 }, { "auxiliary_loss_clip": 0.01181817, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.09595895, "balance_loss_mlp": 1.01510429, "epoch": 0.7526830001503081, "flos": 66588375709440.0, "grad_norm": 0.7125369424839948, "language_loss": 0.55726403, "learning_rate": 6.079185192623017e-07, "loss": 0.57940966, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.17675781, "step": 12519, "time_per_iteration": 3.415013074874878 }, { "auxiliary_loss_clip": 0.01413972, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.25207484, "balance_loss_mlp": 1.01464975, "epoch": 0.7527431234029761, "flos": 23488540986240.0, "grad_norm": 1.4694622336932446, "language_loss": 0.77733541, "learning_rate": 6.07638911279029e-07, "loss": 0.80179954, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.17797852, "step": 12520, "time_per_iteration": 2.9047603607177734 }, { "auxiliary_loss_clip": 0.01404201, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.24467158, "balance_loss_mlp": 1.02138722, "epoch": 0.752803246655644, "flos": 22058420757120.0, "grad_norm": 2.082117532418783, "language_loss": 0.74630105, "learning_rate": 6.07359356094229e-07, "loss": 0.77073717, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18029785, "step": 12521, "time_per_iteration": 2.8940083980560303 }, { "auxiliary_loss_clip": 0.01430344, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.26285565, "balance_loss_mlp": 1.01268613, "epoch": 0.752863369908312, "flos": 30165916337280.0, "grad_norm": 8.102361819746235, "language_loss": 0.67711568, "learning_rate": 6.070798537185016e-07, "loss": 0.70173836, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19238281, "step": 12522, "time_per_iteration": 2.9213716983795166 }, { "auxiliary_loss_clip": 0.01432435, "auxiliary_loss_mlp": 0.01041932, "balance_loss_clip": 1.26638496, "balance_loss_mlp": 1.02320385, "epoch": 0.7529234931609801, "flos": 24577315309440.0, "grad_norm": 1.558275238503345, "language_loss": 0.79491937, "learning_rate": 6.068004041624453e-07, "loss": 0.81966305, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.1875, "step": 12523, "time_per_iteration": 2.8782238960266113 }, { "auxiliary_loss_clip": 0.01404199, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.2454524, "balance_loss_mlp": 1.01288319, "epoch": 0.752983616413648, "flos": 23122780871040.0, "grad_norm": 1.8380797841665848, "language_loss": 0.81043601, "learning_rate": 6.065210074366571e-07, "loss": 0.83479112, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18432617, "step": 12524, "time_per_iteration": 2.8407185077667236 }, { "auxiliary_loss_clip": 0.01398818, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.24028397, "balance_loss_mlp": 1.01304257, "epoch": 0.753043739666316, "flos": 24327780382080.0, "grad_norm": 1.8163588976383853, "language_loss": 0.74491155, "learning_rate": 6.062416635517326e-07, "loss": 0.76921797, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18774414, "step": 12525, "time_per_iteration": 2.874983310699463 }, { "auxiliary_loss_clip": 0.01400925, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.24067724, "balance_loss_mlp": 1.01641726, "epoch": 0.7531038629189839, "flos": 24253931589120.0, "grad_norm": 1.7696786625543217, "language_loss": 0.7282328, "learning_rate": 6.059623725182641e-07, "loss": 0.75260365, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19750977, "step": 12526, "time_per_iteration": 2.8781533241271973 }, { "auxiliary_loss_clip": 0.01405112, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.24510753, "balance_loss_mlp": 1.01422548, "epoch": 0.7531639861716519, "flos": 30200555871360.0, "grad_norm": 1.9582785262952493, "language_loss": 0.72628415, "learning_rate": 6.056831343468414e-07, "loss": 0.75065374, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17614746, "step": 12527, "time_per_iteration": 2.907971143722534 }, { "auxiliary_loss_clip": 0.01413627, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.25331557, "balance_loss_mlp": 1.01088166, "epoch": 0.7532241094243198, "flos": 18232282149120.0, "grad_norm": 1.8334075499491955, "language_loss": 0.8164748, "learning_rate": 6.054039490480539e-07, "loss": 0.84089994, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.17993164, "step": 12528, "time_per_iteration": 2.805462121963501 }, { "auxiliary_loss_clip": 0.01411241, "auxiliary_loss_mlp": 0.01033664, "balance_loss_clip": 1.24841654, "balance_loss_mlp": 1.01525807, "epoch": 0.7532842326769879, "flos": 20889372879360.0, "grad_norm": 1.8088842202638404, "language_loss": 0.85692602, "learning_rate": 6.051248166324892e-07, "loss": 0.88137507, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18408203, "step": 12529, "time_per_iteration": 2.8273956775665283 }, { "auxiliary_loss_clip": 0.01439548, "auxiliary_loss_mlp": 0.01035293, "balance_loss_clip": 1.27128124, "balance_loss_mlp": 1.01587391, "epoch": 0.7533443559296558, "flos": 18088113657600.0, "grad_norm": 1.7717014266723807, "language_loss": 0.75264949, "learning_rate": 6.048457371107303e-07, "loss": 0.77739787, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.1940918, "step": 12530, "time_per_iteration": 4.264735460281372 }, { "auxiliary_loss_clip": 0.01182346, "auxiliary_loss_mlp": 0.01034903, "balance_loss_clip": 1.0975858, "balance_loss_mlp": 1.01678336, "epoch": 0.7534044791823238, "flos": 50280958126080.0, "grad_norm": 0.8366184450679832, "language_loss": 0.63655579, "learning_rate": 6.045667104933612e-07, "loss": 0.6587283, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 0.18164062, "step": 12531, "time_per_iteration": 3.2316484451293945 }, { "auxiliary_loss_clip": 0.0142052, "auxiliary_loss_mlp": 0.01030067, "balance_loss_clip": 1.25538111, "balance_loss_mlp": 1.01137519, "epoch": 0.7534646024349917, "flos": 20860072231680.0, "grad_norm": 1.8568019660964725, "language_loss": 0.70818043, "learning_rate": 6.042877367909633e-07, "loss": 0.73268628, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18688965, "step": 12532, "time_per_iteration": 2.8829541206359863 }, { "auxiliary_loss_clip": 0.01393005, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.23567224, "balance_loss_mlp": 1.01627898, "epoch": 0.7535247256876597, "flos": 23081083148160.0, "grad_norm": 1.689841383636595, "language_loss": 0.78231835, "learning_rate": 6.040088160141132e-07, "loss": 0.80658138, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17041016, "step": 12533, "time_per_iteration": 2.863807439804077 }, { "auxiliary_loss_clip": 0.01183801, "auxiliary_loss_mlp": 0.01017532, "balance_loss_clip": 1.09644008, "balance_loss_mlp": 1.00007951, "epoch": 0.7535848489403276, "flos": 58655181657600.0, "grad_norm": 0.787424033101597, "language_loss": 0.57396382, "learning_rate": 6.037299481733886e-07, "loss": 0.59597719, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.17480469, "step": 12534, "time_per_iteration": 3.331066131591797 }, { "auxiliary_loss_clip": 0.0140619, "auxiliary_loss_mlp": 0.0103194, "balance_loss_clip": 1.24534941, "balance_loss_mlp": 1.0131768, "epoch": 0.7536449721929956, "flos": 26588995943040.0, "grad_norm": 1.6432545054121002, "language_loss": 0.71813965, "learning_rate": 6.03451133279365e-07, "loss": 0.74252093, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18762207, "step": 12535, "time_per_iteration": 2.9111478328704834 }, { "auxiliary_loss_clip": 0.01418365, "auxiliary_loss_mlp": 0.01034669, "balance_loss_clip": 1.25318992, "balance_loss_mlp": 1.01516604, "epoch": 0.7537050954456637, "flos": 25746182208000.0, "grad_norm": 1.6705233688828167, "language_loss": 0.81315279, "learning_rate": 6.031723713426135e-07, "loss": 0.83768314, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19482422, "step": 12536, "time_per_iteration": 2.9483072757720947 }, { "auxiliary_loss_clip": 0.01409988, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.25033593, "balance_loss_mlp": 1.01395607, "epoch": 0.7537652186983316, "flos": 30235964567040.0, "grad_norm": 1.8477353485007515, "language_loss": 0.7576167, "learning_rate": 6.028936623737067e-07, "loss": 0.7820313, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17492676, "step": 12537, "time_per_iteration": 4.42828631401062 }, { "auxiliary_loss_clip": 0.01422663, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.25891995, "balance_loss_mlp": 1.0171001, "epoch": 0.7538253419509996, "flos": 12648974762880.0, "grad_norm": 1.7579458617504538, "language_loss": 0.75034875, "learning_rate": 6.026150063832111e-07, "loss": 0.77492857, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18225098, "step": 12538, "time_per_iteration": 2.8212201595306396 }, { "auxiliary_loss_clip": 0.01419065, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.25491667, "balance_loss_mlp": 1.01501703, "epoch": 0.7538854652036675, "flos": 23196539174400.0, "grad_norm": 1.4749150972183194, "language_loss": 0.68112135, "learning_rate": 6.023364033816956e-07, "loss": 0.70565176, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18945312, "step": 12539, "time_per_iteration": 2.8677146434783936 }, { "auxiliary_loss_clip": 0.01405761, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.24667215, "balance_loss_mlp": 1.01723433, "epoch": 0.7539455884563355, "flos": 23196855888000.0, "grad_norm": 1.605872604071508, "language_loss": 0.75441939, "learning_rate": 6.020578533797229e-07, "loss": 0.77884257, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.1932373, "step": 12540, "time_per_iteration": 2.855470657348633 }, { "auxiliary_loss_clip": 0.01412358, "auxiliary_loss_mlp": 0.01035154, "balance_loss_clip": 1.2488296, "balance_loss_mlp": 1.01666439, "epoch": 0.7540057117090034, "flos": 13187118142080.0, "grad_norm": 2.2496803400617136, "language_loss": 0.73728019, "learning_rate": 6.017793563878566e-07, "loss": 0.76175529, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18493652, "step": 12541, "time_per_iteration": 4.260338068008423 }, { "auxiliary_loss_clip": 0.01407454, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.24615538, "balance_loss_mlp": 1.01254725, "epoch": 0.7540658349616715, "flos": 45494314778880.0, "grad_norm": 1.779009623857626, "language_loss": 0.7288987, "learning_rate": 6.015009124166576e-07, "loss": 0.75328767, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18884277, "step": 12542, "time_per_iteration": 4.425582647323608 }, { "auxiliary_loss_clip": 0.01403163, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.24333334, "balance_loss_mlp": 1.00958872, "epoch": 0.7541259582143394, "flos": 19938478020480.0, "grad_norm": 1.7828380713955987, "language_loss": 0.85685062, "learning_rate": 6.012225214766844e-07, "loss": 0.88116038, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18225098, "step": 12543, "time_per_iteration": 2.9303622245788574 }, { "auxiliary_loss_clip": 0.01405055, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.24627209, "balance_loss_mlp": 1.01405692, "epoch": 0.7541860814670074, "flos": 27209539382400.0, "grad_norm": 2.0552790162767716, "language_loss": 0.74510562, "learning_rate": 6.009441835784927e-07, "loss": 0.76947534, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.17858887, "step": 12544, "time_per_iteration": 2.912008047103882 }, { "auxiliary_loss_clip": 0.0139537, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.23677099, "balance_loss_mlp": 1.01291299, "epoch": 0.7542462047196753, "flos": 21333958715520.0, "grad_norm": 1.8254743662878836, "language_loss": 0.69305813, "learning_rate": 6.006658987326383e-07, "loss": 0.71731067, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1697998, "step": 12545, "time_per_iteration": 2.832503318786621 }, { "auxiliary_loss_clip": 0.01413448, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.25073099, "balance_loss_mlp": 1.01188982, "epoch": 0.7543063279723433, "flos": 11946664690560.0, "grad_norm": 1.7722481917770914, "language_loss": 0.70085895, "learning_rate": 6.003876669496728e-07, "loss": 0.72529399, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1817627, "step": 12546, "time_per_iteration": 2.836237668991089 }, { "auxiliary_loss_clip": 0.01419487, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.25458789, "balance_loss_mlp": 1.01422048, "epoch": 0.7543664512250112, "flos": 22830145632000.0, "grad_norm": 6.860032061353637, "language_loss": 0.74859416, "learning_rate": 6.00109488240147e-07, "loss": 0.77311504, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18395996, "step": 12547, "time_per_iteration": 2.84859561920166 }, { "auxiliary_loss_clip": 0.01398604, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.23734343, "balance_loss_mlp": 1.0168153, "epoch": 0.7544265744776792, "flos": 20933830535040.0, "grad_norm": 1.756312767150609, "language_loss": 0.68202412, "learning_rate": 5.998313626146099e-07, "loss": 0.70635951, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18115234, "step": 12548, "time_per_iteration": 2.939211368560791 }, { "auxiliary_loss_clip": 0.01418706, "auxiliary_loss_mlp": 0.0103652, "balance_loss_clip": 1.25360286, "balance_loss_mlp": 1.01844811, "epoch": 0.7544866977303473, "flos": 15203956682880.0, "grad_norm": 1.7149477740098071, "language_loss": 0.87481385, "learning_rate": 5.995532900836088e-07, "loss": 0.89936614, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.18066406, "step": 12549, "time_per_iteration": 2.8127150535583496 }, { "auxiliary_loss_clip": 0.01386178, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.23091137, "balance_loss_mlp": 1.01566768, "epoch": 0.7545468209830152, "flos": 27093630908160.0, "grad_norm": 1.9616979264033383, "language_loss": 0.78106558, "learning_rate": 5.992752706576865e-07, "loss": 0.80526531, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18139648, "step": 12550, "time_per_iteration": 2.8859975337982178 }, { "auxiliary_loss_clip": 0.01423197, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.26005888, "balance_loss_mlp": 1.01938546, "epoch": 0.7546069442356832, "flos": 26883395729280.0, "grad_norm": 1.499304719982298, "language_loss": 0.70070052, "learning_rate": 5.98997304347386e-07, "loss": 0.72530437, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.17797852, "step": 12551, "time_per_iteration": 2.933253765106201 }, { "auxiliary_loss_clip": 0.01412982, "auxiliary_loss_mlp": 0.01031324, "balance_loss_clip": 1.25181699, "balance_loss_mlp": 1.01301301, "epoch": 0.7546670674883511, "flos": 15751827694080.0, "grad_norm": 1.9570946640751115, "language_loss": 0.87056875, "learning_rate": 5.987193911632487e-07, "loss": 0.8950119, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18310547, "step": 12552, "time_per_iteration": 2.8008408546447754 }, { "auxiliary_loss_clip": 0.01415135, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.25214171, "balance_loss_mlp": 1.01639557, "epoch": 0.7547271907410191, "flos": 23488314762240.0, "grad_norm": 1.769343980059863, "language_loss": 0.79857737, "learning_rate": 5.98441531115812e-07, "loss": 0.82307315, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18041992, "step": 12553, "time_per_iteration": 2.8795533180236816 }, { "auxiliary_loss_clip": 0.01410233, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.24926329, "balance_loss_mlp": 1.01166844, "epoch": 0.754787313993687, "flos": 31735363864320.0, "grad_norm": 2.4478203707279675, "language_loss": 0.63670516, "learning_rate": 5.981637242156135e-07, "loss": 0.66111183, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18762207, "step": 12554, "time_per_iteration": 2.920714855194092 }, { "auxiliary_loss_clip": 0.01406221, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.2438519, "balance_loss_mlp": 1.01854157, "epoch": 0.7548474372463551, "flos": 27574439846400.0, "grad_norm": 1.6098107064623632, "language_loss": 0.73874444, "learning_rate": 5.978859704731864e-07, "loss": 0.76318491, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19299316, "step": 12555, "time_per_iteration": 2.921966314315796 }, { "auxiliary_loss_clip": 0.01420432, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.25776148, "balance_loss_mlp": 1.01377678, "epoch": 0.754907560499023, "flos": 19328159905920.0, "grad_norm": 1.8633113786365774, "language_loss": 0.79764259, "learning_rate": 5.976082698990645e-07, "loss": 0.82216859, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18383789, "step": 12556, "time_per_iteration": 2.908280372619629 }, { "auxiliary_loss_clip": 0.01183451, "auxiliary_loss_mlp": 0.01019499, "balance_loss_clip": 1.09563875, "balance_loss_mlp": 0.99947149, "epoch": 0.754967683751691, "flos": 69777474508800.0, "grad_norm": 0.7044004611801478, "language_loss": 0.50420934, "learning_rate": 5.973306225037769e-07, "loss": 0.5262388, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.20019531, "step": 12557, "time_per_iteration": 3.336458921432495 }, { "auxiliary_loss_clip": 0.01423004, "auxiliary_loss_mlp": 0.01038098, "balance_loss_clip": 1.25887918, "balance_loss_mlp": 1.01867855, "epoch": 0.7550278070043589, "flos": 24431608494720.0, "grad_norm": 3.3870597843389367, "language_loss": 0.72497433, "learning_rate": 5.970530282978525e-07, "loss": 0.74958539, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19421387, "step": 12558, "time_per_iteration": 2.926224946975708 }, { "auxiliary_loss_clip": 0.01404222, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.24366164, "balance_loss_mlp": 1.01625299, "epoch": 0.7550879302570269, "flos": 32647320933120.0, "grad_norm": 1.8054958584933598, "language_loss": 0.81105626, "learning_rate": 5.967754872918187e-07, "loss": 0.83544183, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18078613, "step": 12559, "time_per_iteration": 2.952582597732544 }, { "auxiliary_loss_clip": 0.01425587, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.25949979, "balance_loss_mlp": 1.01204348, "epoch": 0.7551480535096948, "flos": 21804723308160.0, "grad_norm": 1.6990951724146872, "language_loss": 0.7919628, "learning_rate": 5.96497999496199e-07, "loss": 0.81652939, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19018555, "step": 12560, "time_per_iteration": 2.8748464584350586 }, { "auxiliary_loss_clip": 0.01406611, "auxiliary_loss_mlp": 0.0103191, "balance_loss_clip": 1.24733305, "balance_loss_mlp": 1.0142312, "epoch": 0.7552081767623628, "flos": 18524283960960.0, "grad_norm": 1.5450851279325317, "language_loss": 0.71421862, "learning_rate": 5.96220564921515e-07, "loss": 0.73860383, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.17675781, "step": 12561, "time_per_iteration": 2.8078396320343018 }, { "auxiliary_loss_clip": 0.01419741, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.25612235, "balance_loss_mlp": 1.01393569, "epoch": 0.7552683000150308, "flos": 27645890664960.0, "grad_norm": 4.560237584765932, "language_loss": 0.76271147, "learning_rate": 5.959431835782889e-07, "loss": 0.78723359, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18530273, "step": 12562, "time_per_iteration": 2.9052159786224365 }, { "auxiliary_loss_clip": 0.01408703, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.2476213, "balance_loss_mlp": 1.0153302, "epoch": 0.7553284232676988, "flos": 20312563178880.0, "grad_norm": 1.988043047010246, "language_loss": 0.76611686, "learning_rate": 5.956658554770371e-07, "loss": 0.79055786, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20068359, "step": 12563, "time_per_iteration": 2.8394618034362793 }, { "auxiliary_loss_clip": 0.01456183, "auxiliary_loss_mlp": 0.01038823, "balance_loss_clip": 1.28453994, "balance_loss_mlp": 1.01904631, "epoch": 0.7553885465203668, "flos": 33268497799680.0, "grad_norm": 2.7248944395883465, "language_loss": 0.67545807, "learning_rate": 5.953885806282768e-07, "loss": 0.70040816, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 1.71972656, "router_z_loss_mlp": 0.19775391, "step": 12564, "time_per_iteration": 2.950690746307373 }, { "auxiliary_loss_clip": 0.01422344, "auxiliary_loss_mlp": 0.01037049, "balance_loss_clip": 1.25567698, "balance_loss_mlp": 1.0180949, "epoch": 0.7554486697730347, "flos": 21626186751360.0, "grad_norm": 2.9496874089931846, "language_loss": 0.68906271, "learning_rate": 5.951113590425228e-07, "loss": 0.71365666, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.18933105, "step": 12565, "time_per_iteration": 4.282850742340088 }, { "auxiliary_loss_clip": 0.01423468, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.25501037, "balance_loss_mlp": 1.01646423, "epoch": 0.7555087930257027, "flos": 27643583180160.0, "grad_norm": 1.4972850710576684, "language_loss": 0.75458127, "learning_rate": 5.94834190730287e-07, "loss": 0.77917093, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19030762, "step": 12566, "time_per_iteration": 2.8908095359802246 }, { "auxiliary_loss_clip": 0.01431671, "auxiliary_loss_mlp": 0.01039042, "balance_loss_clip": 1.2639221, "balance_loss_mlp": 1.02030241, "epoch": 0.7555689162783706, "flos": 23631759336960.0, "grad_norm": 3.1021557168173026, "language_loss": 0.74362117, "learning_rate": 5.945570757020789e-07, "loss": 0.76832831, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 1.67480469, "router_z_loss_mlp": 0.18737793, "step": 12567, "time_per_iteration": 2.8888962268829346 }, { "auxiliary_loss_clip": 0.0141983, "auxiliary_loss_mlp": 0.01030968, "balance_loss_clip": 1.25693011, "balance_loss_mlp": 1.01363468, "epoch": 0.7556290395310387, "flos": 24873751111680.0, "grad_norm": 1.8628773628798716, "language_loss": 0.63698506, "learning_rate": 5.942800139684073e-07, "loss": 0.66149306, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.17346191, "step": 12568, "time_per_iteration": 2.882469892501831 }, { "auxiliary_loss_clip": 0.01415473, "auxiliary_loss_mlp": 0.01035311, "balance_loss_clip": 1.25450218, "balance_loss_mlp": 1.01704764, "epoch": 0.7556891627837066, "flos": 43559288115840.0, "grad_norm": 2.0132734911539387, "language_loss": 0.67123801, "learning_rate": 5.940030055397789e-07, "loss": 0.69574583, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18273926, "step": 12569, "time_per_iteration": 3.054701566696167 }, { "auxiliary_loss_clip": 0.01421676, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.25353074, "balance_loss_mlp": 1.01631784, "epoch": 0.7557492860363746, "flos": 26662075574400.0, "grad_norm": 1.628883311413023, "language_loss": 0.67600596, "learning_rate": 5.93726050426697e-07, "loss": 0.70057762, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.19189453, "step": 12570, "time_per_iteration": 2.8827171325683594 }, { "auxiliary_loss_clip": 0.01422827, "auxiliary_loss_mlp": 0.01034087, "balance_loss_clip": 1.25829399, "balance_loss_mlp": 1.01531136, "epoch": 0.7558094092890425, "flos": 55201463429760.0, "grad_norm": 1.9508263863980602, "language_loss": 0.72497034, "learning_rate": 5.934491486396647e-07, "loss": 0.74953943, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18786621, "step": 12571, "time_per_iteration": 3.119129180908203 }, { "auxiliary_loss_clip": 0.01421011, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.25363505, "balance_loss_mlp": 1.01798844, "epoch": 0.7558695325417105, "flos": 23998967285760.0, "grad_norm": 2.247437391273483, "language_loss": 0.7429074, "learning_rate": 5.931723001891811e-07, "loss": 0.76748025, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.18286133, "step": 12572, "time_per_iteration": 2.883322238922119 }, { "auxiliary_loss_clip": 0.01435482, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.26962304, "balance_loss_mlp": 1.01961029, "epoch": 0.7559296557943784, "flos": 14619455366400.0, "grad_norm": 1.970319854285585, "language_loss": 0.76883429, "learning_rate": 5.928955050857456e-07, "loss": 0.79357189, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.18664551, "step": 12573, "time_per_iteration": 4.197461128234863 }, { "auxiliary_loss_clip": 0.0141905, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.25229669, "balance_loss_mlp": 1.01655114, "epoch": 0.7559897790470465, "flos": 18559375943040.0, "grad_norm": 1.4974926185495152, "language_loss": 0.69936991, "learning_rate": 5.926187633398527e-07, "loss": 0.72391075, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.18481445, "step": 12574, "time_per_iteration": 2.835620403289795 }, { "auxiliary_loss_clip": 0.01413554, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.25167799, "balance_loss_mlp": 1.01124048, "epoch": 0.7560499022997144, "flos": 17976910642560.0, "grad_norm": 2.0954321921720487, "language_loss": 0.72516572, "learning_rate": 5.923420749619974e-07, "loss": 0.74960023, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18652344, "step": 12575, "time_per_iteration": 2.8251450061798096 }, { "auxiliary_loss_clip": 0.01402177, "auxiliary_loss_mlp": 0.01033884, "balance_loss_clip": 1.24033666, "balance_loss_mlp": 1.01596737, "epoch": 0.7561100255523824, "flos": 15745131463680.0, "grad_norm": 2.583739146535471, "language_loss": 0.72361529, "learning_rate": 5.92065439962673e-07, "loss": 0.74797583, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.17919922, "step": 12576, "time_per_iteration": 2.8344364166259766 }, { "auxiliary_loss_clip": 0.01412906, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.2518152, "balance_loss_mlp": 1.01309586, "epoch": 0.7561701488050504, "flos": 15896584368000.0, "grad_norm": 1.90847509990466, "language_loss": 0.67739773, "learning_rate": 5.917888583523669e-07, "loss": 0.70184278, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18518066, "step": 12577, "time_per_iteration": 5.604647874832153 }, { "auxiliary_loss_clip": 0.01401605, "auxiliary_loss_mlp": 0.01035288, "balance_loss_clip": 1.24193382, "balance_loss_mlp": 1.01657236, "epoch": 0.7562302720577183, "flos": 20348741036160.0, "grad_norm": 1.8557066963813003, "language_loss": 0.78660214, "learning_rate": 5.915123301415685e-07, "loss": 0.81097108, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18737793, "step": 12578, "time_per_iteration": 2.834484577178955 }, { "auxiliary_loss_clip": 0.01409822, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.24605799, "balance_loss_mlp": 1.014552, "epoch": 0.7562903953103863, "flos": 20821586889600.0, "grad_norm": 1.6037588359154342, "language_loss": 0.76298243, "learning_rate": 5.912358553407641e-07, "loss": 0.78741241, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.1862793, "step": 12579, "time_per_iteration": 2.860870838165283 }, { "auxiliary_loss_clip": 0.01428766, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.25942397, "balance_loss_mlp": 1.01569629, "epoch": 0.7563505185630542, "flos": 37611080265600.0, "grad_norm": 1.9590962283197713, "language_loss": 0.62993252, "learning_rate": 5.90959433960437e-07, "loss": 0.65457296, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 1.6953125, "router_z_loss_mlp": 0.19592285, "step": 12580, "time_per_iteration": 2.974163055419922 }, { "auxiliary_loss_clip": 0.01411968, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.25016284, "balance_loss_mlp": 1.01233268, "epoch": 0.7564106418157223, "flos": 20240886136320.0, "grad_norm": 1.595368022370854, "language_loss": 0.75741661, "learning_rate": 5.906830660110691e-07, "loss": 0.7818498, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19018555, "step": 12581, "time_per_iteration": 2.874105930328369 }, { "auxiliary_loss_clip": 0.01423397, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.25801659, "balance_loss_mlp": 1.01528573, "epoch": 0.7564707650683902, "flos": 24765850967040.0, "grad_norm": 1.782824686918283, "language_loss": 0.6349957, "learning_rate": 5.904067515031412e-07, "loss": 0.65957558, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19299316, "step": 12582, "time_per_iteration": 2.8719866275787354 }, { "auxiliary_loss_clip": 0.01188436, "auxiliary_loss_mlp": 0.0101452, "balance_loss_clip": 1.10093331, "balance_loss_mlp": 0.9977358, "epoch": 0.7565308883210582, "flos": 48553180485120.0, "grad_norm": 0.9470573987589211, "language_loss": 0.60659432, "learning_rate": 5.901304904471307e-07, "loss": 0.62862384, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.16796875, "step": 12583, "time_per_iteration": 3.091330051422119 }, { "auxiliary_loss_clip": 0.01418013, "auxiliary_loss_mlp": 0.01034682, "balance_loss_clip": 1.25577903, "balance_loss_mlp": 1.01594198, "epoch": 0.7565910115737261, "flos": 12502725010560.0, "grad_norm": 2.158671006752946, "language_loss": 0.80159295, "learning_rate": 5.898542828535125e-07, "loss": 0.8261199, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18737793, "step": 12584, "time_per_iteration": 2.8028736114501953 }, { "auxiliary_loss_clip": 0.01411763, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.2528497, "balance_loss_mlp": 1.01347625, "epoch": 0.7566511348263941, "flos": 21181419936000.0, "grad_norm": 1.8055809347775813, "language_loss": 0.78339148, "learning_rate": 5.895781287327612e-07, "loss": 0.8078258, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18188477, "step": 12585, "time_per_iteration": 2.8593153953552246 }, { "auxiliary_loss_clip": 0.01433128, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.26749468, "balance_loss_mlp": 1.01939285, "epoch": 0.756711258079062, "flos": 21762889850880.0, "grad_norm": 1.835518427174297, "language_loss": 0.84419775, "learning_rate": 5.893020280953493e-07, "loss": 0.86892056, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19750977, "step": 12586, "time_per_iteration": 2.8272526264190674 }, { "auxiliary_loss_clip": 0.01420079, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.25383985, "balance_loss_mlp": 1.0150975, "epoch": 0.75677138133173, "flos": 22393160922240.0, "grad_norm": 2.062512645665064, "language_loss": 0.84396672, "learning_rate": 5.890259809517459e-07, "loss": 0.86849916, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.18066406, "step": 12587, "time_per_iteration": 2.8536458015441895 }, { "auxiliary_loss_clip": 0.01407588, "auxiliary_loss_mlp": 0.0103027, "balance_loss_clip": 1.2455579, "balance_loss_mlp": 1.0115304, "epoch": 0.756831504584398, "flos": 22718716392960.0, "grad_norm": 1.603572541410736, "language_loss": 0.71718216, "learning_rate": 5.88749987312418e-07, "loss": 0.74156082, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18737793, "step": 12588, "time_per_iteration": 2.88269305229187 }, { "auxiliary_loss_clip": 0.01431615, "auxiliary_loss_mlp": 0.0103603, "balance_loss_clip": 1.26513934, "balance_loss_mlp": 1.01605082, "epoch": 0.756891627837066, "flos": 24109220160000.0, "grad_norm": 1.8144591871861817, "language_loss": 0.69924521, "learning_rate": 5.884740471878327e-07, "loss": 0.72392166, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1998291, "step": 12589, "time_per_iteration": 2.854109764099121 }, { "auxiliary_loss_clip": 0.01405298, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.24426627, "balance_loss_mlp": 1.01182008, "epoch": 0.756951751089734, "flos": 19756774327680.0, "grad_norm": 1.7334953193335612, "language_loss": 0.92790139, "learning_rate": 5.881981605884522e-07, "loss": 0.95226872, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19616699, "step": 12590, "time_per_iteration": 2.8259172439575195 }, { "auxiliary_loss_clip": 0.01408252, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.24747944, "balance_loss_mlp": 1.0117631, "epoch": 0.7570118743424019, "flos": 35092638161280.0, "grad_norm": 1.832671500446489, "language_loss": 0.66461241, "learning_rate": 5.879223275247391e-07, "loss": 0.6889981, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18554688, "step": 12591, "time_per_iteration": 3.029036045074463 }, { "auxiliary_loss_clip": 0.01415874, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 1.25588071, "balance_loss_mlp": 1.01155531, "epoch": 0.7570719975950699, "flos": 25605859524480.0, "grad_norm": 1.4576266971863459, "language_loss": 0.74319935, "learning_rate": 5.876465480071528e-07, "loss": 0.76764458, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17077637, "step": 12592, "time_per_iteration": 2.953808307647705 }, { "auxiliary_loss_clip": 0.01418678, "auxiliary_loss_mlp": 0.01036869, "balance_loss_clip": 1.25456643, "balance_loss_mlp": 1.01891637, "epoch": 0.7571321208477378, "flos": 10823296078080.0, "grad_norm": 2.8871062127715357, "language_loss": 0.72134143, "learning_rate": 5.873708220461522e-07, "loss": 0.74589688, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.17956543, "step": 12593, "time_per_iteration": 2.8308708667755127 }, { "auxiliary_loss_clip": 0.01431902, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.26540351, "balance_loss_mlp": 1.01315904, "epoch": 0.7571922441004059, "flos": 18269319657600.0, "grad_norm": 15.552558455580714, "language_loss": 0.67777169, "learning_rate": 5.870951496521903e-07, "loss": 0.70240819, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18603516, "step": 12594, "time_per_iteration": 2.847644567489624 }, { "auxiliary_loss_clip": 0.01433819, "auxiliary_loss_mlp": 0.01039057, "balance_loss_clip": 1.26753092, "balance_loss_mlp": 1.02057922, "epoch": 0.7572523673530738, "flos": 22900058127360.0, "grad_norm": 1.6079319046225837, "language_loss": 0.81031311, "learning_rate": 5.86819530835722e-07, "loss": 0.83504188, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.18481445, "step": 12595, "time_per_iteration": 2.905236005783081 }, { "auxiliary_loss_clip": 0.01405531, "auxiliary_loss_mlp": 0.01034436, "balance_loss_clip": 1.24433386, "balance_loss_mlp": 1.01569593, "epoch": 0.7573124906057418, "flos": 21006186249600.0, "grad_norm": 6.383171596471471, "language_loss": 0.72473657, "learning_rate": 5.865439656071993e-07, "loss": 0.74913621, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18737793, "step": 12596, "time_per_iteration": 2.8789401054382324 }, { "auxiliary_loss_clip": 0.01396539, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.23849642, "balance_loss_mlp": 1.0126183, "epoch": 0.7573726138584097, "flos": 20895978620160.0, "grad_norm": 1.5238765871205333, "language_loss": 0.81258017, "learning_rate": 5.862684539770706e-07, "loss": 0.83684862, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17700195, "step": 12597, "time_per_iteration": 2.8578648567199707 }, { "auxiliary_loss_clip": 0.01434192, "auxiliary_loss_mlp": 0.01036017, "balance_loss_clip": 1.2681551, "balance_loss_mlp": 1.01712215, "epoch": 0.7574327371110777, "flos": 24540006332160.0, "grad_norm": 9.088442408206694, "language_loss": 0.83413863, "learning_rate": 5.859929959557835e-07, "loss": 0.85884082, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18908691, "step": 12598, "time_per_iteration": 2.8461194038391113 }, { "auxiliary_loss_clip": 0.01408118, "auxiliary_loss_mlp": 0.0103183, "balance_loss_clip": 1.24752402, "balance_loss_mlp": 1.01372206, "epoch": 0.7574928603637456, "flos": 23374125590400.0, "grad_norm": 1.6021123016321979, "language_loss": 0.63609302, "learning_rate": 5.857175915537845e-07, "loss": 0.66049254, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18115234, "step": 12599, "time_per_iteration": 2.8466861248016357 }, { "auxiliary_loss_clip": 0.01423093, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.25511897, "balance_loss_mlp": 1.01471579, "epoch": 0.7575529836164137, "flos": 13524075302400.0, "grad_norm": 3.5232209373469736, "language_loss": 0.64941937, "learning_rate": 5.854422407815161e-07, "loss": 0.67399615, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.19885254, "step": 12600, "time_per_iteration": 4.217883586883545 }, { "auxiliary_loss_clip": 0.01404703, "auxiliary_loss_mlp": 0.01035342, "balance_loss_clip": 1.2460382, "balance_loss_mlp": 1.01684093, "epoch": 0.7576131068690816, "flos": 19656158595840.0, "grad_norm": 1.6662831224253276, "language_loss": 0.66432536, "learning_rate": 5.851669436494191e-07, "loss": 0.68872577, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18505859, "step": 12601, "time_per_iteration": 2.839223861694336 }, { "auxiliary_loss_clip": 0.01407604, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.2476294, "balance_loss_mlp": 1.01407146, "epoch": 0.7576732301217496, "flos": 20058006078720.0, "grad_norm": 1.884894320662196, "language_loss": 0.6865803, "learning_rate": 5.848917001679335e-07, "loss": 0.7109772, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18017578, "step": 12602, "time_per_iteration": 2.8498661518096924 }, { "auxiliary_loss_clip": 0.01406306, "auxiliary_loss_mlp": 0.01030001, "balance_loss_clip": 1.244802, "balance_loss_mlp": 1.0111537, "epoch": 0.7577333533744176, "flos": 15385388906880.0, "grad_norm": 2.1628673986665565, "language_loss": 0.67812526, "learning_rate": 5.846165103474967e-07, "loss": 0.70248842, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18847656, "step": 12603, "time_per_iteration": 2.7887380123138428 }, { "auxiliary_loss_clip": 0.01393825, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.23481131, "balance_loss_mlp": 1.01177382, "epoch": 0.7577934766270855, "flos": 17903876256000.0, "grad_norm": 2.1908115006354505, "language_loss": 0.62859905, "learning_rate": 5.843413741985439e-07, "loss": 0.65282202, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.16687012, "step": 12604, "time_per_iteration": 2.830157995223999 }, { "auxiliary_loss_clip": 0.01411384, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.25124002, "balance_loss_mlp": 1.01738763, "epoch": 0.7578535998797535, "flos": 21623064860160.0, "grad_norm": 2.243013084775463, "language_loss": 0.80394733, "learning_rate": 5.840662917315076e-07, "loss": 0.8284266, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19165039, "step": 12605, "time_per_iteration": 2.872981309890747 }, { "auxiliary_loss_clip": 0.01418847, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.25395513, "balance_loss_mlp": 1.01290107, "epoch": 0.7579137231324214, "flos": 18487472676480.0, "grad_norm": 3.620458141928992, "language_loss": 0.8128804, "learning_rate": 5.837912629568198e-07, "loss": 0.83739173, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19384766, "step": 12606, "time_per_iteration": 2.7820231914520264 }, { "auxiliary_loss_clip": 0.01389476, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.23383343, "balance_loss_mlp": 1.01312637, "epoch": 0.7579738463850895, "flos": 23264596632960.0, "grad_norm": 1.33561557151199, "language_loss": 0.73275709, "learning_rate": 5.835162878849087e-07, "loss": 0.75695181, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.16870117, "step": 12607, "time_per_iteration": 4.290786266326904 }, { "auxiliary_loss_clip": 0.01423061, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.25587595, "balance_loss_mlp": 1.01484084, "epoch": 0.7580339696377574, "flos": 14034954049920.0, "grad_norm": 2.183159638557979, "language_loss": 0.75489652, "learning_rate": 5.83241366526202e-07, "loss": 0.7794596, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.18395996, "step": 12608, "time_per_iteration": 2.8395910263061523 }, { "auxiliary_loss_clip": 0.01401465, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.24032211, "balance_loss_mlp": 1.01495647, "epoch": 0.7580940928904254, "flos": 25093713922560.0, "grad_norm": 1.5391926009607988, "language_loss": 0.72265172, "learning_rate": 5.829664988911245e-07, "loss": 0.7470001, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18432617, "step": 12609, "time_per_iteration": 2.865010976791382 }, { "auxiliary_loss_clip": 0.01410967, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.2490077, "balance_loss_mlp": 1.01206255, "epoch": 0.7581542161430933, "flos": 23845523610240.0, "grad_norm": 1.6925968741414323, "language_loss": 0.81943715, "learning_rate": 5.826916849901007e-07, "loss": 0.84386426, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19665527, "step": 12610, "time_per_iteration": 2.8378775119781494 }, { "auxiliary_loss_clip": 0.01435611, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.26855922, "balance_loss_mlp": 1.01517797, "epoch": 0.7582143393957613, "flos": 22247227883520.0, "grad_norm": 2.031526260121046, "language_loss": 0.71286619, "learning_rate": 5.824169248335488e-07, "loss": 0.73755789, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.18371582, "step": 12611, "time_per_iteration": 2.819511890411377 }, { "auxiliary_loss_clip": 0.0140783, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.24705386, "balance_loss_mlp": 1.01015806, "epoch": 0.7582744626484292, "flos": 21116439123840.0, "grad_norm": 1.5687099746473676, "language_loss": 0.71315581, "learning_rate": 5.821422184318893e-07, "loss": 0.73752439, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1887207, "step": 12612, "time_per_iteration": 5.584877014160156 }, { "auxiliary_loss_clip": 0.01413673, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.24919796, "balance_loss_mlp": 1.01549625, "epoch": 0.7583345859010973, "flos": 24614624286720.0, "grad_norm": 1.5038917366981355, "language_loss": 0.6041292, "learning_rate": 5.818675657955397e-07, "loss": 0.62860781, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18688965, "step": 12613, "time_per_iteration": 2.8911592960357666 }, { "auxiliary_loss_clip": 0.01417145, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.25414324, "balance_loss_mlp": 1.0178535, "epoch": 0.7583947091537652, "flos": 33559775694720.0, "grad_norm": 1.4761006788361861, "language_loss": 0.60985392, "learning_rate": 5.815929669349135e-07, "loss": 0.63438344, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.17956543, "step": 12614, "time_per_iteration": 2.9609375 }, { "auxiliary_loss_clip": 0.01423706, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.25789571, "balance_loss_mlp": 1.01470721, "epoch": 0.7584548324064332, "flos": 20130723751680.0, "grad_norm": 4.767740383882963, "language_loss": 0.73572576, "learning_rate": 5.813184218604246e-07, "loss": 0.7602911, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18127441, "step": 12615, "time_per_iteration": 2.8483150005340576 }, { "auxiliary_loss_clip": 0.01194399, "auxiliary_loss_mlp": 0.01024854, "balance_loss_clip": 1.10613525, "balance_loss_mlp": 1.00272834, "epoch": 0.7585149556591012, "flos": 70435915107840.0, "grad_norm": 0.8077764123816247, "language_loss": 0.67803764, "learning_rate": 5.810439305824828e-07, "loss": 0.70023012, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22167969, "step": 12616, "time_per_iteration": 3.4209842681884766 }, { "auxiliary_loss_clip": 0.01425909, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.26133502, "balance_loss_mlp": 1.01252961, "epoch": 0.7585750789117691, "flos": 16152408322560.0, "grad_norm": 2.0661337867857292, "language_loss": 0.84972751, "learning_rate": 5.807694931114979e-07, "loss": 0.87429976, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18786621, "step": 12617, "time_per_iteration": 2.803428888320923 }, { "auxiliary_loss_clip": 0.01411987, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.24891305, "balance_loss_mlp": 1.01297665, "epoch": 0.7586352021644371, "flos": 17501983528320.0, "grad_norm": 2.275970480764086, "language_loss": 0.75665677, "learning_rate": 5.804951094578757e-07, "loss": 0.78108627, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.17993164, "step": 12618, "time_per_iteration": 2.829324245452881 }, { "auxiliary_loss_clip": 0.01435735, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.26617646, "balance_loss_mlp": 1.01200879, "epoch": 0.758695325417105, "flos": 17284192467840.0, "grad_norm": 2.1944487957927, "language_loss": 0.77965081, "learning_rate": 5.802207796320209e-07, "loss": 0.80432022, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.19189453, "step": 12619, "time_per_iteration": 2.899750232696533 }, { "auxiliary_loss_clip": 0.01398546, "auxiliary_loss_mlp": 0.01035439, "balance_loss_clip": 1.23962569, "balance_loss_mlp": 1.01734316, "epoch": 0.7587554486697731, "flos": 29507249514240.0, "grad_norm": 3.2161316892126184, "language_loss": 0.83359545, "learning_rate": 5.79946503644337e-07, "loss": 0.85793531, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18103027, "step": 12620, "time_per_iteration": 2.9023165702819824 }, { "auxiliary_loss_clip": 0.01417457, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.25093079, "balance_loss_mlp": 1.01468611, "epoch": 0.758815571922441, "flos": 16107724442880.0, "grad_norm": 2.0221723286571134, "language_loss": 0.83216965, "learning_rate": 5.796722815052242e-07, "loss": 0.85668874, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19775391, "step": 12621, "time_per_iteration": 2.8102781772613525 }, { "auxiliary_loss_clip": 0.01403691, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.24350095, "balance_loss_mlp": 1.01625848, "epoch": 0.758875695175109, "flos": 16152317832960.0, "grad_norm": 2.292377578030205, "language_loss": 0.74763656, "learning_rate": 5.7939811322508e-07, "loss": 0.77202421, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18823242, "step": 12622, "time_per_iteration": 2.883671998977661 }, { "auxiliary_loss_clip": 0.01191752, "auxiliary_loss_mlp": 0.01023299, "balance_loss_clip": 1.1036787, "balance_loss_mlp": 0.99831259, "epoch": 0.7589358184277769, "flos": 68493061094400.0, "grad_norm": 0.8138841223924056, "language_loss": 0.60898542, "learning_rate": 5.791239988143024e-07, "loss": 0.63113594, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.25, "step": 12623, "time_per_iteration": 3.3693575859069824 }, { "auxiliary_loss_clip": 0.01408603, "auxiliary_loss_mlp": 0.01035191, "balance_loss_clip": 1.24982953, "balance_loss_mlp": 1.01641536, "epoch": 0.7589959416804449, "flos": 20056829713920.0, "grad_norm": 2.952302849139921, "language_loss": 0.68223, "learning_rate": 5.788499382832847e-07, "loss": 0.70666796, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18774414, "step": 12624, "time_per_iteration": 2.8182222843170166 }, { "auxiliary_loss_clip": 0.01399432, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.23936534, "balance_loss_mlp": 1.00872445, "epoch": 0.7590560649331128, "flos": 18781374769920.0, "grad_norm": 1.847920696469423, "language_loss": 0.76990879, "learning_rate": 5.785759316424196e-07, "loss": 0.79418558, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1953125, "step": 12625, "time_per_iteration": 2.8462681770324707 }, { "auxiliary_loss_clip": 0.01402313, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.24349689, "balance_loss_mlp": 1.01770568, "epoch": 0.7591161881857809, "flos": 29837284220160.0, "grad_norm": 2.5943295346450657, "language_loss": 0.64041746, "learning_rate": 5.783019789020977e-07, "loss": 0.66480321, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18554688, "step": 12626, "time_per_iteration": 2.9676458835601807 }, { "auxiliary_loss_clip": 0.01417785, "auxiliary_loss_mlp": 0.01040903, "balance_loss_clip": 1.25352311, "balance_loss_mlp": 1.02118576, "epoch": 0.7591763114384488, "flos": 20312291710080.0, "grad_norm": 1.8999620641309276, "language_loss": 0.74419069, "learning_rate": 5.780280800727084e-07, "loss": 0.76877755, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19726562, "step": 12627, "time_per_iteration": 2.849762439727783 }, { "auxiliary_loss_clip": 0.01416165, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.25345445, "balance_loss_mlp": 1.0203799, "epoch": 0.7592364346911168, "flos": 20823351436800.0, "grad_norm": 6.467921756175341, "language_loss": 0.69990492, "learning_rate": 5.777542351646356e-07, "loss": 0.7244609, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19030762, "step": 12628, "time_per_iteration": 2.8572897911071777 }, { "auxiliary_loss_clip": 0.01439298, "auxiliary_loss_mlp": 0.01034994, "balance_loss_clip": 1.26936245, "balance_loss_mlp": 1.01488304, "epoch": 0.7592965579437848, "flos": 21261376776960.0, "grad_norm": 1.9266052835957244, "language_loss": 0.64209324, "learning_rate": 5.774804441882648e-07, "loss": 0.66683614, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 1.69921875, "router_z_loss_mlp": 0.2010498, "step": 12629, "time_per_iteration": 2.8586268424987793 }, { "auxiliary_loss_clip": 0.01398083, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.23957062, "balance_loss_mlp": 1.01138914, "epoch": 0.7593566811964527, "flos": 26224774151040.0, "grad_norm": 1.5249017991487936, "language_loss": 0.78680682, "learning_rate": 5.772067071539786e-07, "loss": 0.81108171, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18017578, "step": 12630, "time_per_iteration": 2.893017053604126 }, { "auxiliary_loss_clip": 0.0119059, "auxiliary_loss_mlp": 0.01039483, "balance_loss_clip": 1.10210133, "balance_loss_mlp": 1.01592731, "epoch": 0.7594168044491207, "flos": 71269634638080.0, "grad_norm": 0.8240228899201695, "language_loss": 0.61506993, "learning_rate": 5.769330240721562e-07, "loss": 0.63737065, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.23535156, "step": 12631, "time_per_iteration": 3.3978114128112793 }, { "auxiliary_loss_clip": 0.01434373, "auxiliary_loss_mlp": 0.01035427, "balance_loss_clip": 1.26620519, "balance_loss_mlp": 1.01549518, "epoch": 0.7594769277017887, "flos": 26624178414720.0, "grad_norm": 1.71696188408971, "language_loss": 0.74681705, "learning_rate": 5.766593949531767e-07, "loss": 0.77151507, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.19946289, "step": 12632, "time_per_iteration": 2.931518077850342 }, { "auxiliary_loss_clip": 0.01408864, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.24658537, "balance_loss_mlp": 1.01443255, "epoch": 0.7595370509544567, "flos": 17603051708160.0, "grad_norm": 1.8888811388528501, "language_loss": 0.75602293, "learning_rate": 5.763858198074154e-07, "loss": 0.78043664, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18078613, "step": 12633, "time_per_iteration": 2.8170325756073 }, { "auxiliary_loss_clip": 0.01416247, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.25455427, "balance_loss_mlp": 1.01459813, "epoch": 0.7595971742071246, "flos": 18011640666240.0, "grad_norm": 1.9993429851470306, "language_loss": 0.74354339, "learning_rate": 5.76112298645246e-07, "loss": 0.76803029, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.17858887, "step": 12634, "time_per_iteration": 2.8799753189086914 }, { "auxiliary_loss_clip": 0.01417038, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.25525141, "balance_loss_mlp": 1.01743126, "epoch": 0.7596572974597926, "flos": 28852111785600.0, "grad_norm": 1.6470954075719269, "language_loss": 0.65076387, "learning_rate": 5.758388314770408e-07, "loss": 0.67529356, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18505859, "step": 12635, "time_per_iteration": 4.389769792556763 }, { "auxiliary_loss_clip": 0.01412397, "auxiliary_loss_mlp": 0.01031045, "balance_loss_clip": 1.24756336, "balance_loss_mlp": 1.01198387, "epoch": 0.7597174207124605, "flos": 14290913738880.0, "grad_norm": 1.9645075640377239, "language_loss": 0.69538093, "learning_rate": 5.7556541831317e-07, "loss": 0.71981537, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19067383, "step": 12636, "time_per_iteration": 2.8139450550079346 }, { "auxiliary_loss_clip": 0.01427563, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.26139307, "balance_loss_mlp": 1.01466405, "epoch": 0.7597775439651285, "flos": 21698813934720.0, "grad_norm": 1.937469722712686, "language_loss": 0.8213262, "learning_rate": 5.752920591640018e-07, "loss": 0.84593928, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.1907959, "step": 12637, "time_per_iteration": 2.837063789367676 }, { "auxiliary_loss_clip": 0.01407154, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.24376845, "balance_loss_mlp": 1.01322818, "epoch": 0.7598376672177964, "flos": 36114983838720.0, "grad_norm": 1.6908693189912851, "language_loss": 0.67408586, "learning_rate": 5.750187540399017e-07, "loss": 0.69847846, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18884277, "step": 12638, "time_per_iteration": 2.9468774795532227 }, { "auxiliary_loss_clip": 0.01409595, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.24662983, "balance_loss_mlp": 1.0143671, "epoch": 0.7598977904704645, "flos": 18341358658560.0, "grad_norm": 2.030081164266426, "language_loss": 0.66438717, "learning_rate": 5.747455029512323e-07, "loss": 0.688824, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19714355, "step": 12639, "time_per_iteration": 2.8283374309539795 }, { "auxiliary_loss_clip": 0.0140942, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.24897373, "balance_loss_mlp": 1.01313806, "epoch": 0.7599579137231324, "flos": 20202038835840.0, "grad_norm": 2.0819240337110556, "language_loss": 0.71192986, "learning_rate": 5.744723059083572e-07, "loss": 0.73633683, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18139648, "step": 12640, "time_per_iteration": 2.839646816253662 }, { "auxiliary_loss_clip": 0.01426385, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.26039171, "balance_loss_mlp": 1.01282978, "epoch": 0.7600180369758004, "flos": 24035552346240.0, "grad_norm": 1.7040892854093315, "language_loss": 0.67131305, "learning_rate": 5.741991629216343e-07, "loss": 0.69589764, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.19262695, "step": 12641, "time_per_iteration": 2.8556411266326904 }, { "auxiliary_loss_clip": 0.01429756, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.26390696, "balance_loss_mlp": 1.01500738, "epoch": 0.7600781602284684, "flos": 18998849116800.0, "grad_norm": 2.788778984661137, "language_loss": 0.68056262, "learning_rate": 5.73926074001422e-07, "loss": 0.70519859, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18847656, "step": 12642, "time_per_iteration": 4.244689226150513 }, { "auxiliary_loss_clip": 0.01404659, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.24614942, "balance_loss_mlp": 1.01297462, "epoch": 0.7601382834811363, "flos": 26078614888320.0, "grad_norm": 1.9190858013371213, "language_loss": 0.76856124, "learning_rate": 5.736530391580765e-07, "loss": 0.79291958, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18200684, "step": 12643, "time_per_iteration": 2.8637728691101074 }, { "auxiliary_loss_clip": 0.01415975, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.25047255, "balance_loss_mlp": 1.01551664, "epoch": 0.7601984067338043, "flos": 18853685239680.0, "grad_norm": 1.7150142868682605, "language_loss": 0.79246938, "learning_rate": 5.733800584019508e-07, "loss": 0.81697899, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19445801, "step": 12644, "time_per_iteration": 2.8438901901245117 }, { "auxiliary_loss_clip": 0.01415067, "auxiliary_loss_mlp": 0.0103122, "balance_loss_clip": 1.25175428, "balance_loss_mlp": 1.01397085, "epoch": 0.7602585299864723, "flos": 24657588864000.0, "grad_norm": 1.4730743984121433, "language_loss": 0.8119247, "learning_rate": 5.731071317433957e-07, "loss": 0.83638757, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.17260742, "step": 12645, "time_per_iteration": 2.8678648471832275 }, { "auxiliary_loss_clip": 0.01419995, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.25514841, "balance_loss_mlp": 1.01594853, "epoch": 0.7603186532391403, "flos": 23852672288640.0, "grad_norm": 1.8397127178949644, "language_loss": 0.73452801, "learning_rate": 5.728342591927611e-07, "loss": 0.75908339, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19580078, "step": 12646, "time_per_iteration": 2.872244119644165 }, { "auxiliary_loss_clip": 0.01404413, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.24367237, "balance_loss_mlp": 1.01247001, "epoch": 0.7603787764918082, "flos": 22209964151040.0, "grad_norm": 2.580906585253498, "language_loss": 0.67885613, "learning_rate": 5.725614407603949e-07, "loss": 0.70321119, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18615723, "step": 12647, "time_per_iteration": 4.375565767288208 }, { "auxiliary_loss_clip": 0.01189928, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.1019448, "balance_loss_mlp": 1.0090332, "epoch": 0.7604388997444762, "flos": 54114290657280.0, "grad_norm": 0.6747503241843853, "language_loss": 0.48971501, "learning_rate": 5.722886764566415e-07, "loss": 0.51194304, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.23828125, "step": 12648, "time_per_iteration": 3.329648971557617 }, { "auxiliary_loss_clip": 0.01403949, "auxiliary_loss_mlp": 0.01032079, "balance_loss_clip": 1.24527597, "balance_loss_mlp": 1.01373243, "epoch": 0.7604990229971441, "flos": 19691205333120.0, "grad_norm": 1.6894041442041148, "language_loss": 0.77208334, "learning_rate": 5.720159662918451e-07, "loss": 0.79644364, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18359375, "step": 12649, "time_per_iteration": 2.988821506500244 }, { "auxiliary_loss_clip": 0.01414204, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.2527324, "balance_loss_mlp": 1.01374364, "epoch": 0.7605591462498121, "flos": 25238832554880.0, "grad_norm": 1.513830844266189, "language_loss": 0.69108212, "learning_rate": 5.717433102763462e-07, "loss": 0.71555042, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1887207, "step": 12650, "time_per_iteration": 2.936736583709717 }, { "auxiliary_loss_clip": 0.01190687, "auxiliary_loss_mlp": 0.01031157, "balance_loss_clip": 1.10297883, "balance_loss_mlp": 1.00636184, "epoch": 0.76061926950248, "flos": 66814401323520.0, "grad_norm": 0.752988453502928, "language_loss": 0.62708414, "learning_rate": 5.714707084204838e-07, "loss": 0.64930254, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.24804688, "step": 12651, "time_per_iteration": 3.3325462341308594 }, { "auxiliary_loss_clip": 0.01401934, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.24241209, "balance_loss_mlp": 1.01201367, "epoch": 0.7606793927551481, "flos": 25349402142720.0, "grad_norm": 4.328515868522785, "language_loss": 0.71997809, "learning_rate": 5.711981607345951e-07, "loss": 0.74429864, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1809082, "step": 12652, "time_per_iteration": 2.9037299156188965 }, { "auxiliary_loss_clip": 0.01410814, "auxiliary_loss_mlp": 0.01038879, "balance_loss_clip": 1.24767518, "balance_loss_mlp": 1.0191617, "epoch": 0.760739516007816, "flos": 18232825086720.0, "grad_norm": 2.2887124960910747, "language_loss": 0.80273175, "learning_rate": 5.709256672290152e-07, "loss": 0.82722867, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19714355, "step": 12653, "time_per_iteration": 2.821322441101074 }, { "auxiliary_loss_clip": 0.01422185, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.25606608, "balance_loss_mlp": 1.01500106, "epoch": 0.760799639260484, "flos": 22567806426240.0, "grad_norm": 1.5416680078023959, "language_loss": 0.80905485, "learning_rate": 5.706532279140785e-07, "loss": 0.83361292, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.1862793, "step": 12654, "time_per_iteration": 2.9069652557373047 }, { "auxiliary_loss_clip": 0.01414048, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.25030494, "balance_loss_mlp": 1.01436675, "epoch": 0.760859762513152, "flos": 22319402618880.0, "grad_norm": 2.1456540154096038, "language_loss": 0.7971471, "learning_rate": 5.703808428001136e-07, "loss": 0.82161945, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18823242, "step": 12655, "time_per_iteration": 2.8112499713897705 }, { "auxiliary_loss_clip": 0.01403209, "auxiliary_loss_mlp": 0.01028773, "balance_loss_clip": 1.2440002, "balance_loss_mlp": 1.01120138, "epoch": 0.7609198857658199, "flos": 24874791742080.0, "grad_norm": 1.908968951095025, "language_loss": 0.6924051, "learning_rate": 5.701085118974505e-07, "loss": 0.71672487, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17565918, "step": 12656, "time_per_iteration": 2.91989803314209 }, { "auxiliary_loss_clip": 0.01426977, "auxiliary_loss_mlp": 0.01030034, "balance_loss_clip": 1.25993109, "balance_loss_mlp": 1.01111519, "epoch": 0.760980009018488, "flos": 16845533700480.0, "grad_norm": 2.3775659643914793, "language_loss": 0.743855, "learning_rate": 5.698362352164164e-07, "loss": 0.76842511, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.18933105, "step": 12657, "time_per_iteration": 2.840183973312378 }, { "auxiliary_loss_clip": 0.01190766, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.10218787, "balance_loss_mlp": 1.0114634, "epoch": 0.7610401322711559, "flos": 61257906120960.0, "grad_norm": 0.8609407686553608, "language_loss": 0.64843702, "learning_rate": 5.695640127673347e-07, "loss": 0.67071676, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.2578125, "step": 12658, "time_per_iteration": 3.283670663833618 }, { "auxiliary_loss_clip": 0.01396216, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.23841739, "balance_loss_mlp": 1.01337254, "epoch": 0.7611002555238239, "flos": 19648783693440.0, "grad_norm": 1.8732084668414295, "language_loss": 0.80090892, "learning_rate": 5.692918445605293e-07, "loss": 0.82519829, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19335938, "step": 12659, "time_per_iteration": 2.852208137512207 }, { "auxiliary_loss_clip": 0.01407401, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.24584961, "balance_loss_mlp": 1.01307094, "epoch": 0.7611603787764918, "flos": 26884029156480.0, "grad_norm": 1.5014736384359049, "language_loss": 0.69320959, "learning_rate": 5.690197306063209e-07, "loss": 0.71760035, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18603516, "step": 12660, "time_per_iteration": 2.8886780738830566 }, { "auxiliary_loss_clip": 0.014118, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.24881268, "balance_loss_mlp": 1.01480997, "epoch": 0.7612205020291598, "flos": 27355608155520.0, "grad_norm": 1.599430104310088, "language_loss": 0.7083993, "learning_rate": 5.687476709150281e-07, "loss": 0.73285049, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18518066, "step": 12661, "time_per_iteration": 2.9272115230560303 }, { "auxiliary_loss_clip": 0.01408814, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.24676466, "balance_loss_mlp": 1.01443624, "epoch": 0.7612806252818277, "flos": 29326495962240.0, "grad_norm": 1.6645355091139196, "language_loss": 0.84363729, "learning_rate": 5.68475665496966e-07, "loss": 0.86805236, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18237305, "step": 12662, "time_per_iteration": 2.9490137100219727 }, { "auxiliary_loss_clip": 0.01412647, "auxiliary_loss_mlp": 0.01039091, "balance_loss_clip": 1.24981081, "balance_loss_mlp": 1.02087557, "epoch": 0.7613407485344957, "flos": 19035388932480.0, "grad_norm": 1.612728730211622, "language_loss": 0.69444978, "learning_rate": 5.682037143624505e-07, "loss": 0.7189672, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18212891, "step": 12663, "time_per_iteration": 2.8309402465820312 }, { "auxiliary_loss_clip": 0.0140537, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.24534678, "balance_loss_mlp": 1.01158428, "epoch": 0.7614008717871636, "flos": 23265863487360.0, "grad_norm": 1.5169621458826605, "language_loss": 0.70432025, "learning_rate": 5.67931817521794e-07, "loss": 0.72867072, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1809082, "step": 12664, "time_per_iteration": 2.839362144470215 }, { "auxiliary_loss_clip": 0.01428921, "auxiliary_loss_mlp": 0.01041855, "balance_loss_clip": 1.26227736, "balance_loss_mlp": 1.02259052, "epoch": 0.7614609950398317, "flos": 21589873159680.0, "grad_norm": 2.3163050921061616, "language_loss": 0.79863906, "learning_rate": 5.676599749853066e-07, "loss": 0.82334685, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.19274902, "step": 12665, "time_per_iteration": 2.8316965103149414 }, { "auxiliary_loss_clip": 0.01408739, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.24912846, "balance_loss_mlp": 1.01719642, "epoch": 0.7615211182924996, "flos": 29289729922560.0, "grad_norm": 1.5987782550868137, "language_loss": 0.88682783, "learning_rate": 5.673881867632959e-07, "loss": 0.91127425, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18688965, "step": 12666, "time_per_iteration": 2.9033985137939453 }, { "auxiliary_loss_clip": 0.01417777, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.25522661, "balance_loss_mlp": 1.01351476, "epoch": 0.7615812415451676, "flos": 13268930019840.0, "grad_norm": 2.442720645794966, "language_loss": 0.84415805, "learning_rate": 5.671164528660693e-07, "loss": 0.8686595, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1887207, "step": 12667, "time_per_iteration": 2.80924391746521 }, { "auxiliary_loss_clip": 0.01397138, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.2399857, "balance_loss_mlp": 1.01345146, "epoch": 0.7616413647978356, "flos": 18593065336320.0, "grad_norm": 1.7642901655144163, "language_loss": 0.7934407, "learning_rate": 5.668447733039296e-07, "loss": 0.81771696, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17041016, "step": 12668, "time_per_iteration": 2.8206419944763184 }, { "auxiliary_loss_clip": 0.01403202, "auxiliary_loss_mlp": 0.01032952, "balance_loss_clip": 1.24262404, "balance_loss_mlp": 1.01422405, "epoch": 0.7617014880505035, "flos": 18525641304960.0, "grad_norm": 1.8031309787516752, "language_loss": 0.6475516, "learning_rate": 5.6657314808718e-07, "loss": 0.67191315, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18725586, "step": 12669, "time_per_iteration": 2.8475418090820312 }, { "auxiliary_loss_clip": 0.0141927, "auxiliary_loss_mlp": 0.01038286, "balance_loss_clip": 1.2544744, "balance_loss_mlp": 1.01886702, "epoch": 0.7617616113031715, "flos": 24984049230720.0, "grad_norm": 1.8533959803170799, "language_loss": 0.66995114, "learning_rate": 5.663015772261202e-07, "loss": 0.69452667, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19421387, "step": 12670, "time_per_iteration": 2.872419834136963 }, { "auxiliary_loss_clip": 0.01422903, "auxiliary_loss_mlp": 0.01034896, "balance_loss_clip": 1.25678051, "balance_loss_mlp": 1.01447594, "epoch": 0.7618217345558395, "flos": 23305525194240.0, "grad_norm": 2.87357185274531, "language_loss": 0.73461282, "learning_rate": 5.660300607310493e-07, "loss": 0.75919092, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.20410156, "step": 12671, "time_per_iteration": 4.285600900650024 }, { "auxiliary_loss_clip": 0.01401917, "auxiliary_loss_mlp": 0.01033809, "balance_loss_clip": 1.24187565, "balance_loss_mlp": 1.01592731, "epoch": 0.7618818578085075, "flos": 25493615879040.0, "grad_norm": 1.991022301907854, "language_loss": 0.73671198, "learning_rate": 5.657585986122613e-07, "loss": 0.76106924, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17871094, "step": 12672, "time_per_iteration": 2.9119551181793213 }, { "auxiliary_loss_clip": 0.01186617, "auxiliary_loss_mlp": 0.01024904, "balance_loss_clip": 1.09929812, "balance_loss_mlp": 1.00335121, "epoch": 0.7619419810611754, "flos": 61177813545600.0, "grad_norm": 0.764569102270039, "language_loss": 0.56760049, "learning_rate": 5.654871908800506e-07, "loss": 0.58971566, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.21582031, "step": 12673, "time_per_iteration": 3.310997724533081 }, { "auxiliary_loss_clip": 0.01399427, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.23829269, "balance_loss_mlp": 1.01673818, "epoch": 0.7620021043138434, "flos": 23269166357760.0, "grad_norm": 2.0041517898950514, "language_loss": 0.75374138, "learning_rate": 5.652158375447102e-07, "loss": 0.7780925, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18945312, "step": 12674, "time_per_iteration": 2.884979248046875 }, { "auxiliary_loss_clip": 0.01401975, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.24338019, "balance_loss_mlp": 1.01526022, "epoch": 0.7620622275665113, "flos": 25093351964160.0, "grad_norm": 2.262420639250545, "language_loss": 0.73137534, "learning_rate": 5.649445386165286e-07, "loss": 0.75573361, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18591309, "step": 12675, "time_per_iteration": 2.885685443878174 }, { "auxiliary_loss_clip": 0.01413241, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.25309765, "balance_loss_mlp": 1.01271129, "epoch": 0.7621223508191793, "flos": 20164096431360.0, "grad_norm": 2.1045791581715676, "language_loss": 0.72916526, "learning_rate": 5.646732941057936e-07, "loss": 0.75361013, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1854248, "step": 12676, "time_per_iteration": 2.849888324737549 }, { "auxiliary_loss_clip": 0.01426572, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.25770068, "balance_loss_mlp": 1.01830602, "epoch": 0.7621824740718472, "flos": 18008156816640.0, "grad_norm": 2.8232089680317025, "language_loss": 0.54947722, "learning_rate": 5.644021040227927e-07, "loss": 0.5741117, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.18554688, "step": 12677, "time_per_iteration": 4.306567430496216 }, { "auxiliary_loss_clip": 0.01411749, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.24972188, "balance_loss_mlp": 1.01551402, "epoch": 0.7622425973245153, "flos": 21735625219200.0, "grad_norm": 2.1486598294880905, "language_loss": 0.7967158, "learning_rate": 5.641309683778064e-07, "loss": 0.82117736, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18884277, "step": 12678, "time_per_iteration": 2.8339550495147705 }, { "auxiliary_loss_clip": 0.01406955, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.24427974, "balance_loss_mlp": 1.01582944, "epoch": 0.7623027205771832, "flos": 19727880883200.0, "grad_norm": 1.7605249262247913, "language_loss": 0.78073406, "learning_rate": 5.638598871811175e-07, "loss": 0.80515087, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18896484, "step": 12679, "time_per_iteration": 2.832075834274292 }, { "auxiliary_loss_clip": 0.0140663, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.24339938, "balance_loss_mlp": 1.01251233, "epoch": 0.7623628438298512, "flos": 23999645957760.0, "grad_norm": 1.4562442673763532, "language_loss": 0.80524266, "learning_rate": 5.635888604430059e-07, "loss": 0.82961524, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18115234, "step": 12680, "time_per_iteration": 2.87117862701416 }, { "auxiliary_loss_clip": 0.01413849, "auxiliary_loss_mlp": 0.01032432, "balance_loss_clip": 1.25283051, "balance_loss_mlp": 1.01323938, "epoch": 0.7624229670825191, "flos": 22355761455360.0, "grad_norm": 2.4494817700981923, "language_loss": 0.6389221, "learning_rate": 5.633178881737493e-07, "loss": 0.66338491, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19189453, "step": 12681, "time_per_iteration": 2.821491241455078 }, { "auxiliary_loss_clip": 0.01399796, "auxiliary_loss_mlp": 0.01035028, "balance_loss_clip": 1.24050367, "balance_loss_mlp": 1.01314092, "epoch": 0.7624830903351871, "flos": 22722245487360.0, "grad_norm": 2.565114427231154, "language_loss": 0.77015972, "learning_rate": 5.63046970383622e-07, "loss": 0.79450798, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.21887207, "step": 12682, "time_per_iteration": 4.3697521686553955 }, { "auxiliary_loss_clip": 0.01399986, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.24117351, "balance_loss_mlp": 1.01249135, "epoch": 0.7625432135878552, "flos": 25604818894080.0, "grad_norm": 1.54903505855677, "language_loss": 0.68538773, "learning_rate": 5.627761070828974e-07, "loss": 0.70969129, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17895508, "step": 12683, "time_per_iteration": 2.8910129070281982 }, { "auxiliary_loss_clip": 0.01400332, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.23935294, "balance_loss_mlp": 1.01692784, "epoch": 0.7626033368405231, "flos": 23998152879360.0, "grad_norm": 2.3070600189162485, "language_loss": 0.83782005, "learning_rate": 5.625052982818472e-07, "loss": 0.8621856, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19287109, "step": 12684, "time_per_iteration": 2.8531558513641357 }, { "auxiliary_loss_clip": 0.01419215, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.25698888, "balance_loss_mlp": 1.01429725, "epoch": 0.7626634600931911, "flos": 12605376758400.0, "grad_norm": 2.2752651794423095, "language_loss": 0.83569348, "learning_rate": 5.622345439907396e-07, "loss": 0.86022335, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19470215, "step": 12685, "time_per_iteration": 2.818706750869751 }, { "auxiliary_loss_clip": 0.01415935, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.25334954, "balance_loss_mlp": 1.011271, "epoch": 0.762723583345859, "flos": 26333669681280.0, "grad_norm": 3.0395347593132294, "language_loss": 0.77791262, "learning_rate": 5.619638442198422e-07, "loss": 0.80237067, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18603516, "step": 12686, "time_per_iteration": 2.8823633193969727 }, { "auxiliary_loss_clip": 0.01408762, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.24430323, "balance_loss_mlp": 1.01633716, "epoch": 0.762783706598527, "flos": 21916785974400.0, "grad_norm": 1.689747716230049, "language_loss": 0.7321105, "learning_rate": 5.616931989794198e-07, "loss": 0.75654948, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18786621, "step": 12687, "time_per_iteration": 2.879981517791748 }, { "auxiliary_loss_clip": 0.01411284, "auxiliary_loss_mlp": 0.01034147, "balance_loss_clip": 1.25065935, "balance_loss_mlp": 1.01481104, "epoch": 0.7628438298511949, "flos": 15347853705600.0, "grad_norm": 1.7555698991841302, "language_loss": 0.65299356, "learning_rate": 5.614226082797369e-07, "loss": 0.67744792, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19335938, "step": 12688, "time_per_iteration": 2.8217406272888184 }, { "auxiliary_loss_clip": 0.01400132, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.2415632, "balance_loss_mlp": 1.01081514, "epoch": 0.7629039531038629, "flos": 13014191940480.0, "grad_norm": 1.8453518394461212, "language_loss": 0.71569264, "learning_rate": 5.611520721310515e-07, "loss": 0.73998749, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18530273, "step": 12689, "time_per_iteration": 2.8758504390716553 }, { "auxiliary_loss_clip": 0.01428654, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.26175499, "balance_loss_mlp": 1.01376617, "epoch": 0.7629640763565309, "flos": 26181402370560.0, "grad_norm": 1.7041280321196504, "language_loss": 0.7103675, "learning_rate": 5.608815905436238e-07, "loss": 0.73497903, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.18737793, "step": 12690, "time_per_iteration": 2.9213674068450928 }, { "auxiliary_loss_clip": 0.01406999, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.24590039, "balance_loss_mlp": 1.01239157, "epoch": 0.7630241996091989, "flos": 36807430544640.0, "grad_norm": 1.4828477984733357, "language_loss": 0.70401013, "learning_rate": 5.606111635277109e-07, "loss": 0.72838092, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.17675781, "step": 12691, "time_per_iteration": 2.9760804176330566 }, { "auxiliary_loss_clip": 0.01411939, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.25025344, "balance_loss_mlp": 1.01283431, "epoch": 0.7630843228618668, "flos": 21845154176640.0, "grad_norm": 1.669100928945534, "language_loss": 0.828336, "learning_rate": 5.603407910935662e-07, "loss": 0.85275793, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.17431641, "step": 12692, "time_per_iteration": 2.873231887817383 }, { "auxiliary_loss_clip": 0.01418776, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.25605297, "balance_loss_mlp": 1.01745796, "epoch": 0.7631444461145348, "flos": 12648069866880.0, "grad_norm": 5.7568399503158485, "language_loss": 0.78128642, "learning_rate": 5.600704732514438e-07, "loss": 0.80583066, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18188477, "step": 12693, "time_per_iteration": 2.9675512313842773 }, { "auxiliary_loss_clip": 0.01427037, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.26184702, "balance_loss_mlp": 1.01698744, "epoch": 0.7632045693672027, "flos": 16845307476480.0, "grad_norm": 2.2406780430554982, "language_loss": 0.73821074, "learning_rate": 5.598002100115933e-07, "loss": 0.76285112, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20007324, "step": 12694, "time_per_iteration": 2.8677423000335693 }, { "auxiliary_loss_clip": 0.01413751, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.25238204, "balance_loss_mlp": 1.01084971, "epoch": 0.7632646926198707, "flos": 22027310317440.0, "grad_norm": 2.666657533146488, "language_loss": 0.70972693, "learning_rate": 5.595300013842625e-07, "loss": 0.73415631, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18334961, "step": 12695, "time_per_iteration": 2.839151620864868 }, { "auxiliary_loss_clip": 0.01406595, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.24585319, "balance_loss_mlp": 1.01208818, "epoch": 0.7633248158725388, "flos": 23124816887040.0, "grad_norm": 2.445171762730224, "language_loss": 0.73227024, "learning_rate": 5.592598473796985e-07, "loss": 0.75664002, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18273926, "step": 12696, "time_per_iteration": 2.880070447921753 }, { "auxiliary_loss_clip": 0.0141556, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.25192583, "balance_loss_mlp": 1.01192498, "epoch": 0.7633849391252067, "flos": 10897099626240.0, "grad_norm": 2.0647433863501523, "language_loss": 0.72334319, "learning_rate": 5.589897480081453e-07, "loss": 0.74780977, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19165039, "step": 12697, "time_per_iteration": 2.8222146034240723 }, { "auxiliary_loss_clip": 0.0140338, "auxiliary_loss_mlp": 0.01030609, "balance_loss_clip": 1.24505877, "balance_loss_mlp": 1.01160693, "epoch": 0.7634450623778747, "flos": 21003471561600.0, "grad_norm": 2.3135528913712426, "language_loss": 0.67914712, "learning_rate": 5.587197032798461e-07, "loss": 0.70348704, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18994141, "step": 12698, "time_per_iteration": 2.8458311557769775 }, { "auxiliary_loss_clip": 0.01403996, "auxiliary_loss_mlp": 0.01030764, "balance_loss_clip": 1.24168444, "balance_loss_mlp": 1.01128483, "epoch": 0.7635051856305426, "flos": 18891853868160.0, "grad_norm": 2.104630723177676, "language_loss": 0.73216397, "learning_rate": 5.5844971320504e-07, "loss": 0.75651157, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19470215, "step": 12699, "time_per_iteration": 2.8765902519226074 }, { "auxiliary_loss_clip": 0.01392429, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.23492348, "balance_loss_mlp": 1.01440716, "epoch": 0.7635653088832106, "flos": 34800183901440.0, "grad_norm": 2.138823316965592, "language_loss": 0.74145281, "learning_rate": 5.581797777939648e-07, "loss": 0.76570344, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18249512, "step": 12700, "time_per_iteration": 2.9625439643859863 }, { "auxiliary_loss_clip": 0.01407248, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.24517226, "balance_loss_mlp": 1.01621389, "epoch": 0.7636254321358785, "flos": 23187037766400.0, "grad_norm": 2.601566684221268, "language_loss": 0.69795811, "learning_rate": 5.579098970568574e-07, "loss": 0.72237903, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18640137, "step": 12701, "time_per_iteration": 2.8201279640197754 }, { "auxiliary_loss_clip": 0.01403601, "auxiliary_loss_mlp": 0.01037622, "balance_loss_clip": 1.24260402, "balance_loss_mlp": 1.01907349, "epoch": 0.7636855553885465, "flos": 21335316059520.0, "grad_norm": 1.923229713149261, "language_loss": 0.65403557, "learning_rate": 5.576400710039508e-07, "loss": 0.67844784, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18530273, "step": 12702, "time_per_iteration": 2.849137544631958 }, { "auxiliary_loss_clip": 0.01414866, "auxiliary_loss_mlp": 0.01035776, "balance_loss_clip": 1.2514112, "balance_loss_mlp": 1.01647592, "epoch": 0.7637456786412145, "flos": 28669865155200.0, "grad_norm": 1.9799337398349477, "language_loss": 0.66483706, "learning_rate": 5.57370299645477e-07, "loss": 0.68934345, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19299316, "step": 12703, "time_per_iteration": 2.9443461894989014 }, { "auxiliary_loss_clip": 0.01408644, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.24712467, "balance_loss_mlp": 1.01039577, "epoch": 0.7638058018938825, "flos": 21917057443200.0, "grad_norm": 1.9344879909988062, "language_loss": 0.84562081, "learning_rate": 5.571005829916668e-07, "loss": 0.86999154, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18017578, "step": 12704, "time_per_iteration": 2.845228910446167 }, { "auxiliary_loss_clip": 0.01404236, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.24338686, "balance_loss_mlp": 1.01488793, "epoch": 0.7638659251465504, "flos": 29656123464960.0, "grad_norm": 1.7550061312769576, "language_loss": 0.68241131, "learning_rate": 5.568309210527469e-07, "loss": 0.70678914, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18652344, "step": 12705, "time_per_iteration": 2.880814790725708 }, { "auxiliary_loss_clip": 0.01400261, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.24115944, "balance_loss_mlp": 1.015113, "epoch": 0.7639260483992184, "flos": 26152463681280.0, "grad_norm": 1.6324087042250581, "language_loss": 0.75231093, "learning_rate": 5.565613138389427e-07, "loss": 0.77665132, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18688965, "step": 12706, "time_per_iteration": 4.278339147567749 }, { "auxiliary_loss_clip": 0.01398908, "auxiliary_loss_mlp": 0.01032039, "balance_loss_clip": 1.23965526, "balance_loss_mlp": 1.01338291, "epoch": 0.7639861716518863, "flos": 20166222936960.0, "grad_norm": 6.341129685971686, "language_loss": 0.79136121, "learning_rate": 5.562917613604781e-07, "loss": 0.81567067, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18664551, "step": 12707, "time_per_iteration": 2.845256805419922 }, { "auxiliary_loss_clip": 0.01413424, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.25081158, "balance_loss_mlp": 1.01060188, "epoch": 0.7640462949045543, "flos": 18591029320320.0, "grad_norm": 1.8003022386835217, "language_loss": 0.80592871, "learning_rate": 5.560222636275751e-07, "loss": 0.83035445, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1854248, "step": 12708, "time_per_iteration": 2.92230224609375 }, { "auxiliary_loss_clip": 0.01186316, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.09819865, "balance_loss_mlp": 1.01104677, "epoch": 0.7641064181572224, "flos": 68354548202880.0, "grad_norm": 0.8218417430016498, "language_loss": 0.56588143, "learning_rate": 5.557528206504521e-07, "loss": 0.588063, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20800781, "step": 12709, "time_per_iteration": 3.3683218955993652 }, { "auxiliary_loss_clip": 0.01399386, "auxiliary_loss_mlp": 0.01039148, "balance_loss_clip": 1.23798096, "balance_loss_mlp": 1.01878691, "epoch": 0.7641665414098903, "flos": 17978313231360.0, "grad_norm": 1.738426294155675, "language_loss": 0.641559, "learning_rate": 5.554834324393271e-07, "loss": 0.6659444, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20361328, "step": 12710, "time_per_iteration": 2.8442046642303467 }, { "auxiliary_loss_clip": 0.01416728, "auxiliary_loss_mlp": 0.01032732, "balance_loss_clip": 1.25178456, "balance_loss_mlp": 1.01325357, "epoch": 0.7642266646625583, "flos": 21262100693760.0, "grad_norm": 2.0511692054306203, "language_loss": 0.66001225, "learning_rate": 5.552140990044154e-07, "loss": 0.68450689, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19482422, "step": 12711, "time_per_iteration": 2.916053056716919 }, { "auxiliary_loss_clip": 0.0140631, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.24452579, "balance_loss_mlp": 1.01054049, "epoch": 0.7642867879152262, "flos": 22758151875840.0, "grad_norm": 1.4699619172574667, "language_loss": 0.73591125, "learning_rate": 5.549448203559293e-07, "loss": 0.76026165, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18188477, "step": 12712, "time_per_iteration": 4.277302503585815 }, { "auxiliary_loss_clip": 0.01398545, "auxiliary_loss_mlp": 0.01030222, "balance_loss_clip": 1.24002886, "balance_loss_mlp": 1.01218593, "epoch": 0.7643469111678942, "flos": 23342788926720.0, "grad_norm": 2.392841579516698, "language_loss": 0.81207693, "learning_rate": 5.546755965040804e-07, "loss": 0.83636463, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18029785, "step": 12713, "time_per_iteration": 2.904714822769165 }, { "auxiliary_loss_clip": 0.01419658, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.25574982, "balance_loss_mlp": 1.01648951, "epoch": 0.7644070344205621, "flos": 19864538737920.0, "grad_norm": 7.4534062194144886, "language_loss": 0.84001911, "learning_rate": 5.544064274590776e-07, "loss": 0.86457139, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19091797, "step": 12714, "time_per_iteration": 2.8254282474517822 }, { "auxiliary_loss_clip": 0.01405858, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.24312687, "balance_loss_mlp": 1.01553619, "epoch": 0.7644671576732301, "flos": 22100706662400.0, "grad_norm": 1.6251951420537383, "language_loss": 0.73442614, "learning_rate": 5.541373132311287e-07, "loss": 0.75883317, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1932373, "step": 12715, "time_per_iteration": 2.8599483966827393 }, { "auxiliary_loss_clip": 0.01400242, "auxiliary_loss_mlp": 0.01033889, "balance_loss_clip": 1.23996735, "balance_loss_mlp": 1.01431453, "epoch": 0.7645272809258981, "flos": 25491760842240.0, "grad_norm": 1.6315110861018363, "language_loss": 0.63892943, "learning_rate": 5.538682538304376e-07, "loss": 0.66327077, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19604492, "step": 12716, "time_per_iteration": 4.282771110534668 }, { "auxiliary_loss_clip": 0.01423141, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.2574017, "balance_loss_mlp": 1.01307869, "epoch": 0.7645874041785661, "flos": 21551433062400.0, "grad_norm": 2.006292824749454, "language_loss": 0.80173397, "learning_rate": 5.535992492672068e-07, "loss": 0.82629097, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19482422, "step": 12717, "time_per_iteration": 4.252044677734375 }, { "auxiliary_loss_clip": 0.01394237, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.23620737, "balance_loss_mlp": 1.01865721, "epoch": 0.764647527431234, "flos": 20640516624000.0, "grad_norm": 2.5220871710642623, "language_loss": 0.67484957, "learning_rate": 5.53330299551638e-07, "loss": 0.69917214, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19348145, "step": 12718, "time_per_iteration": 2.8416671752929688 }, { "auxiliary_loss_clip": 0.01401097, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.24179149, "balance_loss_mlp": 1.01335287, "epoch": 0.764707650683902, "flos": 21444030610560.0, "grad_norm": 1.9131418645788234, "language_loss": 0.78200352, "learning_rate": 5.530614046939286e-07, "loss": 0.80632961, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18139648, "step": 12719, "time_per_iteration": 2.8364920616149902 }, { "auxiliary_loss_clip": 0.01409417, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.24718368, "balance_loss_mlp": 1.01237535, "epoch": 0.7647677739365699, "flos": 22721521570560.0, "grad_norm": 2.092901508736322, "language_loss": 0.70849037, "learning_rate": 5.527925647042754e-07, "loss": 0.73289442, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18603516, "step": 12720, "time_per_iteration": 2.9127211570739746 }, { "auxiliary_loss_clip": 0.01412858, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.25101924, "balance_loss_mlp": 1.01348901, "epoch": 0.7648278971892379, "flos": 21333913470720.0, "grad_norm": 1.956286699537998, "language_loss": 0.74803376, "learning_rate": 5.52523779592875e-07, "loss": 0.77248538, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18811035, "step": 12721, "time_per_iteration": 2.9930717945098877 }, { "auxiliary_loss_clip": 0.01406509, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.24449444, "balance_loss_mlp": 1.01283288, "epoch": 0.764888020441906, "flos": 20676965950080.0, "grad_norm": 1.797970698467843, "language_loss": 0.74265468, "learning_rate": 5.522550493699163e-07, "loss": 0.76703173, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18359375, "step": 12722, "time_per_iteration": 2.832766056060791 }, { "auxiliary_loss_clip": 0.01392635, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.23388958, "balance_loss_mlp": 1.01606059, "epoch": 0.7649481436945739, "flos": 25093397208960.0, "grad_norm": 2.636789129093383, "language_loss": 0.74439037, "learning_rate": 5.519863740455912e-07, "loss": 0.76866525, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18786621, "step": 12723, "time_per_iteration": 2.8806347846984863 }, { "auxiliary_loss_clip": 0.01410106, "auxiliary_loss_mlp": 0.01031546, "balance_loss_clip": 1.24525011, "balance_loss_mlp": 1.01299715, "epoch": 0.7650082669472419, "flos": 24911919740160.0, "grad_norm": 2.186625810914325, "language_loss": 0.7425586, "learning_rate": 5.517177536300881e-07, "loss": 0.7669751, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18566895, "step": 12724, "time_per_iteration": 2.8547439575195312 }, { "auxiliary_loss_clip": 0.01402226, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.24327266, "balance_loss_mlp": 1.01281047, "epoch": 0.7650683901999098, "flos": 14655497489280.0, "grad_norm": 1.9218402187409904, "language_loss": 0.85044861, "learning_rate": 5.514491881335935e-07, "loss": 0.87478715, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18798828, "step": 12725, "time_per_iteration": 2.821133852005005 }, { "auxiliary_loss_clip": 0.01392504, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.23403811, "balance_loss_mlp": 1.01299334, "epoch": 0.7651285134525778, "flos": 26361793964160.0, "grad_norm": 1.760049644319288, "language_loss": 0.78347474, "learning_rate": 5.511806775662901e-07, "loss": 0.80771774, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18798828, "step": 12726, "time_per_iteration": 2.8860785961151123 }, { "auxiliary_loss_clip": 0.0141282, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.25137281, "balance_loss_mlp": 1.01498997, "epoch": 0.7651886367052457, "flos": 26656962912000.0, "grad_norm": 2.037810448808766, "language_loss": 0.71051371, "learning_rate": 5.509122219383615e-07, "loss": 0.7349844, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19250488, "step": 12727, "time_per_iteration": 2.9650168418884277 }, { "auxiliary_loss_clip": 0.01391564, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.2342937, "balance_loss_mlp": 1.01463306, "epoch": 0.7652487599579137, "flos": 25713895403520.0, "grad_norm": 1.698395938408037, "language_loss": 0.80450881, "learning_rate": 5.506438212599864e-07, "loss": 0.82876617, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19519043, "step": 12728, "time_per_iteration": 2.895752191543579 }, { "auxiliary_loss_clip": 0.01417207, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.25388956, "balance_loss_mlp": 1.01830316, "epoch": 0.7653088832105817, "flos": 28597464195840.0, "grad_norm": 1.7209301654881508, "language_loss": 0.57152545, "learning_rate": 5.503754755413424e-07, "loss": 0.59607244, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19177246, "step": 12729, "time_per_iteration": 2.9119129180908203 }, { "auxiliary_loss_clip": 0.01402366, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.24242258, "balance_loss_mlp": 1.01717496, "epoch": 0.7653690064632497, "flos": 23376930768000.0, "grad_norm": 2.3451645255769384, "language_loss": 0.78386605, "learning_rate": 5.501071847926055e-07, "loss": 0.80825031, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1887207, "step": 12730, "time_per_iteration": 2.909264087677002 }, { "auxiliary_loss_clip": 0.01412304, "auxiliary_loss_mlp": 0.01039651, "balance_loss_clip": 1.24985492, "balance_loss_mlp": 1.02049375, "epoch": 0.7654291297159176, "flos": 15781128341760.0, "grad_norm": 1.991223923347525, "language_loss": 0.70263696, "learning_rate": 5.498389490239495e-07, "loss": 0.72715652, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19177246, "step": 12731, "time_per_iteration": 2.811901092529297 }, { "auxiliary_loss_clip": 0.01407151, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.24575996, "balance_loss_mlp": 1.01730847, "epoch": 0.7654892529685856, "flos": 18041031803520.0, "grad_norm": 2.752650430306556, "language_loss": 0.70863175, "learning_rate": 5.495707682455471e-07, "loss": 0.73307008, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19384766, "step": 12732, "time_per_iteration": 2.9116008281707764 }, { "auxiliary_loss_clip": 0.01404961, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.24213231, "balance_loss_mlp": 1.01211238, "epoch": 0.7655493762212535, "flos": 27247662766080.0, "grad_norm": 1.4746180353195881, "language_loss": 0.78681993, "learning_rate": 5.493026424675653e-07, "loss": 0.81117773, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18701172, "step": 12733, "time_per_iteration": 2.928375720977783 }, { "auxiliary_loss_clip": 0.01399683, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.24224973, "balance_loss_mlp": 1.0174036, "epoch": 0.7656094994739215, "flos": 20782875323520.0, "grad_norm": 1.6268419670035998, "language_loss": 0.77862525, "learning_rate": 5.490345717001726e-07, "loss": 0.80297554, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.17944336, "step": 12734, "time_per_iteration": 2.8362410068511963 }, { "auxiliary_loss_clip": 0.01416571, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.2516166, "balance_loss_mlp": 1.01711178, "epoch": 0.7656696227265896, "flos": 23049565505280.0, "grad_norm": 1.8853887936743, "language_loss": 0.73838872, "learning_rate": 5.48766555953535e-07, "loss": 0.7629261, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20056152, "step": 12735, "time_per_iteration": 2.8671698570251465 }, { "auxiliary_loss_clip": 0.01404244, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.24263322, "balance_loss_mlp": 1.01641226, "epoch": 0.7657297459792575, "flos": 27536768910720.0, "grad_norm": 1.397290564991527, "language_loss": 0.73208427, "learning_rate": 5.484985952378145e-07, "loss": 0.75647998, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18896484, "step": 12736, "time_per_iteration": 2.9463255405426025 }, { "auxiliary_loss_clip": 0.01419235, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.25605154, "balance_loss_mlp": 1.01951921, "epoch": 0.7657898692319255, "flos": 17137218798720.0, "grad_norm": 2.093482346143955, "language_loss": 0.77698588, "learning_rate": 5.482306895631728e-07, "loss": 0.80157149, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19824219, "step": 12737, "time_per_iteration": 2.8669357299804688 }, { "auxiliary_loss_clip": 0.01397208, "auxiliary_loss_mlp": 0.01037166, "balance_loss_clip": 1.23660326, "balance_loss_mlp": 1.01828313, "epoch": 0.7658499924845934, "flos": 21474462378240.0, "grad_norm": 1.6361366748726105, "language_loss": 0.77768862, "learning_rate": 5.479628389397699e-07, "loss": 0.80203235, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18884277, "step": 12738, "time_per_iteration": 2.838177442550659 }, { "auxiliary_loss_clip": 0.01413176, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.24858868, "balance_loss_mlp": 1.01627922, "epoch": 0.7659101157372614, "flos": 29508516368640.0, "grad_norm": 2.506770874554801, "language_loss": 0.63570589, "learning_rate": 5.476950433777603e-07, "loss": 0.66018701, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18652344, "step": 12739, "time_per_iteration": 2.9059417247772217 }, { "auxiliary_loss_clip": 0.01397954, "auxiliary_loss_mlp": 0.0103874, "balance_loss_clip": 1.23703063, "balance_loss_mlp": 1.01965415, "epoch": 0.7659702389899293, "flos": 18561050000640.0, "grad_norm": 1.9034500342387892, "language_loss": 0.80358171, "learning_rate": 5.474273028873004e-07, "loss": 0.82794869, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1907959, "step": 12740, "time_per_iteration": 2.803060293197632 }, { "auxiliary_loss_clip": 0.01399915, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.23991966, "balance_loss_mlp": 1.01535058, "epoch": 0.7660303622425974, "flos": 23559403622400.0, "grad_norm": 1.6838082912296088, "language_loss": 0.66214216, "learning_rate": 5.471596174785429e-07, "loss": 0.68647993, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18530273, "step": 12741, "time_per_iteration": 4.263515949249268 }, { "auxiliary_loss_clip": 0.01404374, "auxiliary_loss_mlp": 0.01032101, "balance_loss_clip": 1.24490142, "balance_loss_mlp": 1.01202607, "epoch": 0.7660904854952653, "flos": 18926086199040.0, "grad_norm": 1.6371855362171621, "language_loss": 0.76640928, "learning_rate": 5.468919871616386e-07, "loss": 0.79077399, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.20080566, "step": 12742, "time_per_iteration": 2.877699136734009 }, { "auxiliary_loss_clip": 0.01392951, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.23649693, "balance_loss_mlp": 1.01365256, "epoch": 0.7661506087479333, "flos": 23157556139520.0, "grad_norm": 1.3748534017207428, "language_loss": 0.77090824, "learning_rate": 5.46624411946736e-07, "loss": 0.795156, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.1817627, "step": 12743, "time_per_iteration": 2.967599868774414 }, { "auxiliary_loss_clip": 0.01401871, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.24114227, "balance_loss_mlp": 1.01558113, "epoch": 0.7662107320006012, "flos": 17574520222080.0, "grad_norm": 1.997736621863005, "language_loss": 0.75412136, "learning_rate": 5.463568918439805e-07, "loss": 0.77848321, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18713379, "step": 12744, "time_per_iteration": 2.8230140209198 }, { "auxiliary_loss_clip": 0.01411315, "auxiliary_loss_mlp": 0.01037156, "balance_loss_clip": 1.2483356, "balance_loss_mlp": 1.01748669, "epoch": 0.7662708552532692, "flos": 22311530023680.0, "grad_norm": 2.437989169501319, "language_loss": 0.72114837, "learning_rate": 5.460894268635181e-07, "loss": 0.74563307, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19665527, "step": 12745, "time_per_iteration": 2.835333824157715 }, { "auxiliary_loss_clip": 0.01401782, "auxiliary_loss_mlp": 0.01039867, "balance_loss_clip": 1.24045241, "balance_loss_mlp": 1.02093601, "epoch": 0.7663309785059371, "flos": 15750470350080.0, "grad_norm": 2.4880099141697842, "language_loss": 0.78274566, "learning_rate": 5.458220170154896e-07, "loss": 0.80716211, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18933105, "step": 12746, "time_per_iteration": 2.8525919914245605 }, { "auxiliary_loss_clip": 0.01188714, "auxiliary_loss_mlp": 0.01050977, "balance_loss_clip": 1.10334241, "balance_loss_mlp": 1.02761197, "epoch": 0.7663911017586051, "flos": 62196195663360.0, "grad_norm": 0.6795747674224933, "language_loss": 0.56796539, "learning_rate": 5.455546623100362e-07, "loss": 0.59036231, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.23339844, "step": 12747, "time_per_iteration": 4.801854372024536 }, { "auxiliary_loss_clip": 0.0140251, "auxiliary_loss_mlp": 0.01036958, "balance_loss_clip": 1.24372649, "balance_loss_mlp": 1.01974463, "epoch": 0.7664512250112732, "flos": 26517409390080.0, "grad_norm": 2.425647172387313, "language_loss": 0.72614336, "learning_rate": 5.452873627572956e-07, "loss": 0.75053805, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17211914, "step": 12748, "time_per_iteration": 2.8943965435028076 }, { "auxiliary_loss_clip": 0.01398833, "auxiliary_loss_mlp": 0.01027221, "balance_loss_clip": 1.23826551, "balance_loss_mlp": 1.00827861, "epoch": 0.7665113482639411, "flos": 16257231820800.0, "grad_norm": 2.0161611956465455, "language_loss": 0.70450675, "learning_rate": 5.450201183674052e-07, "loss": 0.72876728, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18933105, "step": 12749, "time_per_iteration": 2.8052446842193604 }, { "auxiliary_loss_clip": 0.01403074, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.2405175, "balance_loss_mlp": 1.01481605, "epoch": 0.7665714715166091, "flos": 27209086934400.0, "grad_norm": 1.7040117287892105, "language_loss": 0.74020934, "learning_rate": 5.447529291504967e-07, "loss": 0.76458269, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19458008, "step": 12750, "time_per_iteration": 2.927014112472534 }, { "auxiliary_loss_clip": 0.01388173, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.23168039, "balance_loss_mlp": 1.01214623, "epoch": 0.766631594769277, "flos": 21077275109760.0, "grad_norm": 1.9516509449801305, "language_loss": 0.76692468, "learning_rate": 5.444857951167026e-07, "loss": 0.79110837, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18054199, "step": 12751, "time_per_iteration": 4.208454847335815 }, { "auxiliary_loss_clip": 0.01391472, "auxiliary_loss_mlp": 0.01033743, "balance_loss_clip": 1.23362184, "balance_loss_mlp": 1.01459837, "epoch": 0.766691718021945, "flos": 24108722467200.0, "grad_norm": 1.8310705094854403, "language_loss": 0.6252712, "learning_rate": 5.442187162761537e-07, "loss": 0.64952332, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19140625, "step": 12752, "time_per_iteration": 4.2787926197052 }, { "auxiliary_loss_clip": 0.01414863, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.25264764, "balance_loss_mlp": 1.0136795, "epoch": 0.7667518412746129, "flos": 23451096274560.0, "grad_norm": 2.080690145097475, "language_loss": 0.70161468, "learning_rate": 5.439516926389767e-07, "loss": 0.72609568, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19567871, "step": 12753, "time_per_iteration": 2.8480403423309326 }, { "auxiliary_loss_clip": 0.01407329, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.24695945, "balance_loss_mlp": 1.0163486, "epoch": 0.766811964527281, "flos": 18157483215360.0, "grad_norm": 2.385092777182043, "language_loss": 0.62560874, "learning_rate": 5.436847242152971e-07, "loss": 0.65002704, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1817627, "step": 12754, "time_per_iteration": 2.821772336959839 }, { "auxiliary_loss_clip": 0.01404177, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.24551606, "balance_loss_mlp": 1.01032555, "epoch": 0.7668720877799489, "flos": 19545317539200.0, "grad_norm": 2.0865983866385815, "language_loss": 0.80607361, "learning_rate": 5.434178110152401e-07, "loss": 0.83041114, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19238281, "step": 12755, "time_per_iteration": 2.8071374893188477 }, { "auxiliary_loss_clip": 0.01404143, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.24462748, "balance_loss_mlp": 1.01098967, "epoch": 0.7669322110326169, "flos": 22684529306880.0, "grad_norm": 1.823294851921207, "language_loss": 0.71481425, "learning_rate": 5.431509530489242e-07, "loss": 0.73915488, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18920898, "step": 12756, "time_per_iteration": 2.8444745540618896 }, { "auxiliary_loss_clip": 0.01410849, "auxiliary_loss_mlp": 0.01034229, "balance_loss_clip": 1.25036216, "balance_loss_mlp": 1.01596642, "epoch": 0.7669923342852848, "flos": 26480733840000.0, "grad_norm": 1.5143147931872047, "language_loss": 0.70239735, "learning_rate": 5.428841503264706e-07, "loss": 0.72684813, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18261719, "step": 12757, "time_per_iteration": 2.95697283744812 }, { "auxiliary_loss_clip": 0.01395915, "auxiliary_loss_mlp": 0.01041896, "balance_loss_clip": 1.23653436, "balance_loss_mlp": 1.02127337, "epoch": 0.7670524575379528, "flos": 22866232999680.0, "grad_norm": 1.9663767677185664, "language_loss": 0.77446169, "learning_rate": 5.426174028579955e-07, "loss": 0.79883981, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.20629883, "step": 12758, "time_per_iteration": 2.8309836387634277 }, { "auxiliary_loss_clip": 0.01395682, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.23903072, "balance_loss_mlp": 1.01662493, "epoch": 0.7671125807906207, "flos": 22461444604800.0, "grad_norm": 1.608406565505334, "language_loss": 0.76974517, "learning_rate": 5.423507106536156e-07, "loss": 0.79404628, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17797852, "step": 12759, "time_per_iteration": 2.8280529975891113 }, { "auxiliary_loss_clip": 0.01410795, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.24836969, "balance_loss_mlp": 1.01331472, "epoch": 0.7671727040432887, "flos": 35385092421120.0, "grad_norm": 2.3245940972762957, "language_loss": 0.68531024, "learning_rate": 5.420840737234425e-07, "loss": 0.70973253, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18139648, "step": 12760, "time_per_iteration": 2.9355599880218506 }, { "auxiliary_loss_clip": 0.01410036, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.24744272, "balance_loss_mlp": 1.01386178, "epoch": 0.7672328272959568, "flos": 22505902260480.0, "grad_norm": 1.4626931825978964, "language_loss": 0.79777592, "learning_rate": 5.418174920775871e-07, "loss": 0.82221532, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20031738, "step": 12761, "time_per_iteration": 2.884333372116089 }, { "auxiliary_loss_clip": 0.01386912, "auxiliary_loss_mlp": 0.01029813, "balance_loss_clip": 1.23005366, "balance_loss_mlp": 1.01150274, "epoch": 0.7672929505486247, "flos": 22824580521600.0, "grad_norm": 2.8363527759582365, "language_loss": 0.66501915, "learning_rate": 5.415509657261589e-07, "loss": 0.68918645, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1829834, "step": 12762, "time_per_iteration": 2.832902431488037 }, { "auxiliary_loss_clip": 0.01410623, "auxiliary_loss_mlp": 0.01036982, "balance_loss_clip": 1.2480278, "balance_loss_mlp": 1.0156672, "epoch": 0.7673530738012927, "flos": 20348333832960.0, "grad_norm": 1.7223475564353614, "language_loss": 0.74805903, "learning_rate": 5.412844946792639e-07, "loss": 0.77253509, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.21313477, "step": 12763, "time_per_iteration": 2.9580090045928955 }, { "auxiliary_loss_clip": 0.01405924, "auxiliary_loss_mlp": 0.01035692, "balance_loss_clip": 1.24618328, "balance_loss_mlp": 1.01688039, "epoch": 0.7674131970539606, "flos": 34946976591360.0, "grad_norm": 1.442533821166884, "language_loss": 0.71563649, "learning_rate": 5.410180789470067e-07, "loss": 0.74005264, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18811035, "step": 12764, "time_per_iteration": 2.989431858062744 }, { "auxiliary_loss_clip": 0.01399538, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.24027145, "balance_loss_mlp": 1.01769626, "epoch": 0.7674733203066286, "flos": 28340328142080.0, "grad_norm": 1.425184812545941, "language_loss": 0.6997205, "learning_rate": 5.40751718539491e-07, "loss": 0.72407335, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18054199, "step": 12765, "time_per_iteration": 2.892854928970337 }, { "auxiliary_loss_clip": 0.0139858, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.24086189, "balance_loss_mlp": 1.01336098, "epoch": 0.7675334435592965, "flos": 16298386606080.0, "grad_norm": 2.148916602136365, "language_loss": 0.61058736, "learning_rate": 5.404854134668162e-07, "loss": 0.63488221, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.17529297, "step": 12766, "time_per_iteration": 2.8286032676696777 }, { "auxiliary_loss_clip": 0.01192818, "auxiliary_loss_mlp": 0.01018511, "balance_loss_clip": 1.1035428, "balance_loss_mlp": 0.99810201, "epoch": 0.7675935668119646, "flos": 64859665910400.0, "grad_norm": 0.7349148886378912, "language_loss": 0.60805774, "learning_rate": 5.402191637390803e-07, "loss": 0.63017106, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20410156, "step": 12767, "time_per_iteration": 3.460118532180786 }, { "auxiliary_loss_clip": 0.01400919, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.24242663, "balance_loss_mlp": 1.01434875, "epoch": 0.7676536900646325, "flos": 22685886650880.0, "grad_norm": 1.68323071124163, "language_loss": 0.70196617, "learning_rate": 5.399529693663801e-07, "loss": 0.72629541, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17651367, "step": 12768, "time_per_iteration": 2.886561870574951 }, { "auxiliary_loss_clip": 0.01434743, "auxiliary_loss_mlp": 0.01038404, "balance_loss_clip": 1.26934481, "balance_loss_mlp": 1.01967573, "epoch": 0.7677138133173005, "flos": 26950095843840.0, "grad_norm": 2.223792538464322, "language_loss": 0.71228409, "learning_rate": 5.3968683035881e-07, "loss": 0.7370156, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.18725586, "step": 12769, "time_per_iteration": 2.932403802871704 }, { "auxiliary_loss_clip": 0.01418011, "auxiliary_loss_mlp": 0.0103401, "balance_loss_clip": 1.25383496, "balance_loss_mlp": 1.01518655, "epoch": 0.7677739365699684, "flos": 23808531346560.0, "grad_norm": 2.1169885356919242, "language_loss": 0.81303322, "learning_rate": 5.394207467264611e-07, "loss": 0.8375535, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18835449, "step": 12770, "time_per_iteration": 2.9230520725250244 }, { "auxiliary_loss_clip": 0.01408723, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.25135541, "balance_loss_mlp": 1.01400805, "epoch": 0.7678340598226364, "flos": 34467796465920.0, "grad_norm": 1.6782993195981655, "language_loss": 0.78907233, "learning_rate": 5.391547184794245e-07, "loss": 0.81348193, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18212891, "step": 12771, "time_per_iteration": 2.9584872722625732 }, { "auxiliary_loss_clip": 0.01400064, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.23923707, "balance_loss_mlp": 1.01874411, "epoch": 0.7678941830753043, "flos": 23852219840640.0, "grad_norm": 1.450060724155299, "language_loss": 0.68706989, "learning_rate": 5.388887456277876e-07, "loss": 0.71144986, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19189453, "step": 12772, "time_per_iteration": 2.8646507263183594 }, { "auxiliary_loss_clip": 0.01389494, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.23501587, "balance_loss_mlp": 1.01128626, "epoch": 0.7679543063279723, "flos": 25421893591680.0, "grad_norm": 1.6044283510639992, "language_loss": 0.73979634, "learning_rate": 5.386228281816349e-07, "loss": 0.76397902, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.17492676, "step": 12773, "time_per_iteration": 2.9028186798095703 }, { "auxiliary_loss_clip": 0.01397247, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.24160707, "balance_loss_mlp": 1.01323211, "epoch": 0.7680144295806404, "flos": 27973346417280.0, "grad_norm": 1.9490750044625822, "language_loss": 0.82098019, "learning_rate": 5.383569661510512e-07, "loss": 0.84526306, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.17810059, "step": 12774, "time_per_iteration": 2.9146907329559326 }, { "auxiliary_loss_clip": 0.01393823, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.23658514, "balance_loss_mlp": 1.0110836, "epoch": 0.7680745528333083, "flos": 20422589829120.0, "grad_norm": 1.609649060764939, "language_loss": 0.71059978, "learning_rate": 5.380911595461177e-07, "loss": 0.73482543, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.17651367, "step": 12775, "time_per_iteration": 4.322916507720947 }, { "auxiliary_loss_clip": 0.01191205, "auxiliary_loss_mlp": 0.0103823, "balance_loss_clip": 1.10229623, "balance_loss_mlp": 1.01505554, "epoch": 0.7681346760859763, "flos": 68435274205440.0, "grad_norm": 0.6978242550233952, "language_loss": 0.56920683, "learning_rate": 5.378254083769147e-07, "loss": 0.59150118, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.23144531, "step": 12776, "time_per_iteration": 3.38327956199646 }, { "auxiliary_loss_clip": 0.01402492, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.24438405, "balance_loss_mlp": 1.01698911, "epoch": 0.7681947993386442, "flos": 21261331532160.0, "grad_norm": 1.7299012545211407, "language_loss": 0.74467599, "learning_rate": 5.375597126535188e-07, "loss": 0.76905179, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1809082, "step": 12777, "time_per_iteration": 2.8337197303771973 }, { "auxiliary_loss_clip": 0.01417176, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.2567457, "balance_loss_mlp": 1.0212723, "epoch": 0.7682549225913122, "flos": 21407671774080.0, "grad_norm": 2.1193447219060615, "language_loss": 0.71785462, "learning_rate": 5.372940723860043e-07, "loss": 0.7424255, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18640137, "step": 12778, "time_per_iteration": 2.8349545001983643 }, { "auxiliary_loss_clip": 0.0140062, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.24113727, "balance_loss_mlp": 1.01218009, "epoch": 0.7683150458439801, "flos": 23049158302080.0, "grad_norm": 1.7730406764156998, "language_loss": 0.71230704, "learning_rate": 5.37028487584446e-07, "loss": 0.73660898, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.1739502, "step": 12779, "time_per_iteration": 2.85711669921875 }, { "auxiliary_loss_clip": 0.01409353, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.24896002, "balance_loss_mlp": 1.01244593, "epoch": 0.7683751690966482, "flos": 67354065043200.0, "grad_norm": 1.7973171249310764, "language_loss": 0.59459984, "learning_rate": 5.367629582589133e-07, "loss": 0.61900187, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18395996, "step": 12780, "time_per_iteration": 3.243476390838623 }, { "auxiliary_loss_clip": 0.01418238, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.25418174, "balance_loss_mlp": 1.01410592, "epoch": 0.7684352923493161, "flos": 21808931074560.0, "grad_norm": 1.8979079757864064, "language_loss": 0.68658757, "learning_rate": 5.364974844194759e-07, "loss": 0.71110857, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19763184, "step": 12781, "time_per_iteration": 2.8873727321624756 }, { "auxiliary_loss_clip": 0.01416209, "auxiliary_loss_mlp": 0.01031781, "balance_loss_clip": 1.25452757, "balance_loss_mlp": 1.0129106, "epoch": 0.7684954156019841, "flos": 25858380608640.0, "grad_norm": 1.6249555011065762, "language_loss": 0.8020525, "learning_rate": 5.362320660762016e-07, "loss": 0.82653248, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18884277, "step": 12782, "time_per_iteration": 4.345340013504028 }, { "auxiliary_loss_clip": 0.0140794, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.24556243, "balance_loss_mlp": 1.01242566, "epoch": 0.768555538854652, "flos": 25458342917760.0, "grad_norm": 1.9343739365142263, "language_loss": 0.67821908, "learning_rate": 5.35966703239153e-07, "loss": 0.70261872, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19616699, "step": 12783, "time_per_iteration": 2.9172325134277344 }, { "auxiliary_loss_clip": 0.01410305, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.24937844, "balance_loss_mlp": 1.01235032, "epoch": 0.76861566210732, "flos": 19655887127040.0, "grad_norm": 2.4194890133328197, "language_loss": 0.69978172, "learning_rate": 5.357013959183938e-07, "loss": 0.72419786, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18969727, "step": 12784, "time_per_iteration": 2.84894061088562 }, { "auxiliary_loss_clip": 0.01404988, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.24538469, "balance_loss_mlp": 1.0171535, "epoch": 0.7686757853599879, "flos": 22429203045120.0, "grad_norm": 1.647415591246203, "language_loss": 0.81241667, "learning_rate": 5.354361441239843e-07, "loss": 0.83681524, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.17687988, "step": 12785, "time_per_iteration": 2.860949993133545 }, { "auxiliary_loss_clip": 0.01406846, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.24572659, "balance_loss_mlp": 1.01210523, "epoch": 0.768735908612656, "flos": 47790396097920.0, "grad_norm": 1.5316550827449233, "language_loss": 0.78156984, "learning_rate": 5.351709478659836e-07, "loss": 0.80595165, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19238281, "step": 12786, "time_per_iteration": 4.514078378677368 }, { "auxiliary_loss_clip": 0.01396396, "auxiliary_loss_mlp": 0.01031644, "balance_loss_clip": 1.23692441, "balance_loss_mlp": 1.01364374, "epoch": 0.7687960318653239, "flos": 30275309560320.0, "grad_norm": 1.9887703443100189, "language_loss": 0.59352839, "learning_rate": 5.349058071544468e-07, "loss": 0.61780876, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18017578, "step": 12787, "time_per_iteration": 4.139708042144775 }, { "auxiliary_loss_clip": 0.01401725, "auxiliary_loss_mlp": 0.01031968, "balance_loss_clip": 1.24319553, "balance_loss_mlp": 1.01383615, "epoch": 0.7688561551179919, "flos": 19582943230080.0, "grad_norm": 1.732209617760251, "language_loss": 0.76478291, "learning_rate": 5.346407219994292e-07, "loss": 0.78911984, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18115234, "step": 12788, "time_per_iteration": 2.850006580352783 }, { "auxiliary_loss_clip": 0.01405862, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.24546814, "balance_loss_mlp": 1.01631379, "epoch": 0.7689162783706599, "flos": 22794013019520.0, "grad_norm": 1.7137851900376513, "language_loss": 0.67545211, "learning_rate": 5.343756924109821e-07, "loss": 0.69985539, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18164062, "step": 12789, "time_per_iteration": 2.879929780960083 }, { "auxiliary_loss_clip": 0.01412704, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.25058854, "balance_loss_mlp": 1.01192153, "epoch": 0.7689764016233278, "flos": 34217492376960.0, "grad_norm": 1.7058385445896214, "language_loss": 0.69403952, "learning_rate": 5.341107183991553e-07, "loss": 0.71847808, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19238281, "step": 12790, "time_per_iteration": 2.9817512035369873 }, { "auxiliary_loss_clip": 0.01401355, "auxiliary_loss_mlp": 0.01030561, "balance_loss_clip": 1.2420373, "balance_loss_mlp": 1.01219094, "epoch": 0.7690365248759958, "flos": 17283287571840.0, "grad_norm": 1.9637571035898815, "language_loss": 0.69847864, "learning_rate": 5.338457999739969e-07, "loss": 0.72279775, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18395996, "step": 12791, "time_per_iteration": 2.825559139251709 }, { "auxiliary_loss_clip": 0.01408235, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.24923992, "balance_loss_mlp": 1.01641929, "epoch": 0.7690966481286637, "flos": 18232236904320.0, "grad_norm": 1.740819684190571, "language_loss": 0.80600226, "learning_rate": 5.335809371455526e-07, "loss": 0.83043247, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18371582, "step": 12792, "time_per_iteration": 2.858492851257324 }, { "auxiliary_loss_clip": 0.01415337, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.25005054, "balance_loss_mlp": 1.01244736, "epoch": 0.7691567713813318, "flos": 21546003686400.0, "grad_norm": 1.7734934749712326, "language_loss": 0.73866391, "learning_rate": 5.333161299238673e-07, "loss": 0.76312864, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.18676758, "step": 12793, "time_per_iteration": 2.860342025756836 }, { "auxiliary_loss_clip": 0.0141416, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.25117731, "balance_loss_mlp": 1.01375794, "epoch": 0.7692168946339997, "flos": 39393296680320.0, "grad_norm": 1.7431414748901366, "language_loss": 0.6437006, "learning_rate": 5.330513783189803e-07, "loss": 0.66816616, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1862793, "step": 12794, "time_per_iteration": 2.986795663833618 }, { "auxiliary_loss_clip": 0.01417163, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.25392795, "balance_loss_mlp": 1.01750028, "epoch": 0.7692770178866677, "flos": 25020905760000.0, "grad_norm": 1.6315849625207577, "language_loss": 0.76469994, "learning_rate": 5.327866823409319e-07, "loss": 0.7892375, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.1907959, "step": 12795, "time_per_iteration": 2.8748109340667725 }, { "auxiliary_loss_clip": 0.01413662, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.25205684, "balance_loss_mlp": 1.01166725, "epoch": 0.7693371411393356, "flos": 24726686952960.0, "grad_norm": 1.6206897875977369, "language_loss": 0.72078347, "learning_rate": 5.325220419997601e-07, "loss": 0.74522352, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18664551, "step": 12796, "time_per_iteration": 2.8698031902313232 }, { "auxiliary_loss_clip": 0.0141114, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.24997342, "balance_loss_mlp": 1.01039124, "epoch": 0.7693972643920036, "flos": 15933350407680.0, "grad_norm": 1.825093296871905, "language_loss": 0.66065049, "learning_rate": 5.32257457305499e-07, "loss": 0.68504441, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1784668, "step": 12797, "time_per_iteration": 2.842780828475952 }, { "auxiliary_loss_clip": 0.01408112, "auxiliary_loss_mlp": 0.01038502, "balance_loss_clip": 1.24592865, "balance_loss_mlp": 1.01791477, "epoch": 0.7694573876446715, "flos": 25415559319680.0, "grad_norm": 1.7350722714510556, "language_loss": 0.91854507, "learning_rate": 5.319929282681823e-07, "loss": 0.94301128, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20593262, "step": 12798, "time_per_iteration": 2.8760392665863037 }, { "auxiliary_loss_clip": 0.01413149, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.25164962, "balance_loss_mlp": 1.01225936, "epoch": 0.7695175108973396, "flos": 16662879866880.0, "grad_norm": 1.8395056853499427, "language_loss": 0.82906508, "learning_rate": 5.317284548978418e-07, "loss": 0.85350406, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18481445, "step": 12799, "time_per_iteration": 2.790595769882202 }, { "auxiliary_loss_clip": 0.01423398, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.2596941, "balance_loss_mlp": 1.01214182, "epoch": 0.7695776341500075, "flos": 13634554400640.0, "grad_norm": 2.601764090360966, "language_loss": 0.78460765, "learning_rate": 5.314640372045045e-07, "loss": 0.80914903, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18591309, "step": 12800, "time_per_iteration": 2.8040971755981445 }, { "auxiliary_loss_clip": 0.01428761, "auxiliary_loss_mlp": 0.01030947, "balance_loss_clip": 1.26064312, "balance_loss_mlp": 1.01059818, "epoch": 0.7696377574026755, "flos": 24286535107200.0, "grad_norm": 1.9057333586769767, "language_loss": 0.84238899, "learning_rate": 5.31199675198198e-07, "loss": 0.8669861, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20361328, "step": 12801, "time_per_iteration": 2.8361923694610596 }, { "auxiliary_loss_clip": 0.01404531, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 1.24355912, "balance_loss_mlp": 1.01464391, "epoch": 0.7696978806553435, "flos": 20932925639040.0, "grad_norm": 1.852748684212862, "language_loss": 0.72558135, "learning_rate": 5.30935368888947e-07, "loss": 0.74995756, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18432617, "step": 12802, "time_per_iteration": 2.8337087631225586 }, { "auxiliary_loss_clip": 0.01383355, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.2273941, "balance_loss_mlp": 1.01063108, "epoch": 0.7697580039080114, "flos": 22939855568640.0, "grad_norm": 2.4271224356371106, "language_loss": 0.77266896, "learning_rate": 5.306711182867747e-07, "loss": 0.79679692, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18798828, "step": 12803, "time_per_iteration": 2.9112470149993896 }, { "auxiliary_loss_clip": 0.01191323, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.1029954, "balance_loss_mlp": 1.00494528, "epoch": 0.7698181271606794, "flos": 68748839804160.0, "grad_norm": 0.7396126573789003, "language_loss": 0.55878556, "learning_rate": 5.304069234017001e-07, "loss": 0.5809772, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22851562, "step": 12804, "time_per_iteration": 3.367241382598877 }, { "auxiliary_loss_clip": 0.01194097, "auxiliary_loss_mlp": 0.0103242, "balance_loss_clip": 1.1049726, "balance_loss_mlp": 1.01039016, "epoch": 0.7698782504133473, "flos": 67442002951680.0, "grad_norm": 0.7368160992069855, "language_loss": 0.54101014, "learning_rate": 5.301427842437429e-07, "loss": 0.56327534, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.22070312, "step": 12805, "time_per_iteration": 3.4386563301086426 }, { "auxiliary_loss_clip": 0.01404919, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.2464087, "balance_loss_mlp": 1.0150125, "epoch": 0.7699383736660154, "flos": 22498210644480.0, "grad_norm": 5.022265834698802, "language_loss": 0.73916399, "learning_rate": 5.298787008229187e-07, "loss": 0.76356113, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19799805, "step": 12806, "time_per_iteration": 2.8956682682037354 }, { "auxiliary_loss_clip": 0.01398942, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.23946881, "balance_loss_mlp": 1.01494956, "epoch": 0.7699984969186833, "flos": 21548718374400.0, "grad_norm": 1.9152635676784635, "language_loss": 0.75444597, "learning_rate": 5.296146731492408e-07, "loss": 0.77877396, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18896484, "step": 12807, "time_per_iteration": 2.846027135848999 }, { "auxiliary_loss_clip": 0.01420067, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.25600529, "balance_loss_mlp": 1.01621246, "epoch": 0.7700586201713513, "flos": 21727345420800.0, "grad_norm": 2.961570059474174, "language_loss": 0.81495607, "learning_rate": 5.293507012327218e-07, "loss": 0.83951056, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19165039, "step": 12808, "time_per_iteration": 2.829636573791504 }, { "auxiliary_loss_clip": 0.014155, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.25082159, "balance_loss_mlp": 1.01838541, "epoch": 0.7701187434240192, "flos": 27867753757440.0, "grad_norm": 2.657967422883669, "language_loss": 0.80193138, "learning_rate": 5.290867850833718e-07, "loss": 0.82646304, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19274902, "step": 12809, "time_per_iteration": 2.8995964527130127 }, { "auxiliary_loss_clip": 0.01392872, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.23448944, "balance_loss_mlp": 1.01132965, "epoch": 0.7701788666766872, "flos": 28633008625920.0, "grad_norm": 1.5632715822604455, "language_loss": 0.70570064, "learning_rate": 5.288229247111993e-07, "loss": 0.72991872, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.17614746, "step": 12810, "time_per_iteration": 4.401693820953369 }, { "auxiliary_loss_clip": 0.0140972, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.24627531, "balance_loss_mlp": 1.0138464, "epoch": 0.7702389899293551, "flos": 14254554902400.0, "grad_norm": 2.4259814534654076, "language_loss": 0.79507029, "learning_rate": 5.285591201262079e-07, "loss": 0.81950939, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20349121, "step": 12811, "time_per_iteration": 2.8016722202301025 }, { "auxiliary_loss_clip": 0.01191011, "auxiliary_loss_mlp": 0.01024106, "balance_loss_clip": 1.10287786, "balance_loss_mlp": 1.00465095, "epoch": 0.7702991131820232, "flos": 70604678787840.0, "grad_norm": 0.8155455763012609, "language_loss": 0.56720567, "learning_rate": 5.28295371338402e-07, "loss": 0.58935678, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19433594, "step": 12812, "time_per_iteration": 3.3664188385009766 }, { "auxiliary_loss_clip": 0.01411281, "auxiliary_loss_mlp": 0.01031571, "balance_loss_clip": 1.2488817, "balance_loss_mlp": 1.01242566, "epoch": 0.7703592364346911, "flos": 25489996295040.0, "grad_norm": 1.582022072044726, "language_loss": 0.72502553, "learning_rate": 5.280316783577836e-07, "loss": 0.74945402, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19140625, "step": 12813, "time_per_iteration": 2.8728551864624023 }, { "auxiliary_loss_clip": 0.01406276, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.24533987, "balance_loss_mlp": 1.01296639, "epoch": 0.7704193596873591, "flos": 19290172256640.0, "grad_norm": 1.6547571617312808, "language_loss": 0.67868578, "learning_rate": 5.27768041194351e-07, "loss": 0.70306683, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18847656, "step": 12814, "time_per_iteration": 2.834472179412842 }, { "auxiliary_loss_clip": 0.0140127, "auxiliary_loss_mlp": 0.01033636, "balance_loss_clip": 1.24149752, "balance_loss_mlp": 1.01502776, "epoch": 0.7704794829400271, "flos": 23668661111040.0, "grad_norm": 3.1529219819671446, "language_loss": 0.66793787, "learning_rate": 5.275044598581018e-07, "loss": 0.69228691, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18615723, "step": 12815, "time_per_iteration": 2.894047260284424 }, { "auxiliary_loss_clip": 0.01402334, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.24069047, "balance_loss_mlp": 1.01477432, "epoch": 0.770539606192695, "flos": 18998577648000.0, "grad_norm": 2.4803119383066385, "language_loss": 0.66539001, "learning_rate": 5.272409343590322e-07, "loss": 0.6897577, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19677734, "step": 12816, "time_per_iteration": 2.8270721435546875 }, { "auxiliary_loss_clip": 0.01422185, "auxiliary_loss_mlp": 0.01034651, "balance_loss_clip": 1.25859022, "balance_loss_mlp": 1.01604211, "epoch": 0.770599729445363, "flos": 11835506920320.0, "grad_norm": 2.6438352732834707, "language_loss": 0.73354208, "learning_rate": 5.26977464707133e-07, "loss": 0.7581104, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18615723, "step": 12817, "time_per_iteration": 2.7835729122161865 }, { "auxiliary_loss_clip": 0.01405531, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.24444675, "balance_loss_mlp": 1.01383519, "epoch": 0.770659852698031, "flos": 17831792010240.0, "grad_norm": 4.43763307180003, "language_loss": 0.62213111, "learning_rate": 5.267140509123957e-07, "loss": 0.64650607, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18139648, "step": 12818, "time_per_iteration": 4.276930093765259 }, { "auxiliary_loss_clip": 0.0140831, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.2513411, "balance_loss_mlp": 1.00998497, "epoch": 0.770719975950699, "flos": 21882055950720.0, "grad_norm": 1.7265032021681979, "language_loss": 0.68078458, "learning_rate": 5.264506929848093e-07, "loss": 0.70514578, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17834473, "step": 12819, "time_per_iteration": 2.83488392829895 }, { "auxiliary_loss_clip": 0.01410425, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.24768114, "balance_loss_mlp": 1.01293325, "epoch": 0.7707800992033669, "flos": 21335089835520.0, "grad_norm": 1.6130608419056032, "language_loss": 0.58157355, "learning_rate": 5.261873909343608e-07, "loss": 0.60599315, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18603516, "step": 12820, "time_per_iteration": 2.8096301555633545 }, { "auxiliary_loss_clip": 0.01405881, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.245309, "balance_loss_mlp": 1.010674, "epoch": 0.7708402224560349, "flos": 28189961112960.0, "grad_norm": 2.4298513974448617, "language_loss": 0.81753182, "learning_rate": 5.259241447710343e-07, "loss": 0.84188455, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18701172, "step": 12821, "time_per_iteration": 2.9492671489715576 }, { "auxiliary_loss_clip": 0.01408424, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.24817467, "balance_loss_mlp": 1.01396906, "epoch": 0.7709003457087028, "flos": 15385343662080.0, "grad_norm": 2.1026370360533093, "language_loss": 0.6941129, "learning_rate": 5.256609545048114e-07, "loss": 0.71853554, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1986084, "step": 12822, "time_per_iteration": 5.659339904785156 }, { "auxiliary_loss_clip": 0.01392994, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.23607969, "balance_loss_mlp": 1.0169692, "epoch": 0.7709604689613708, "flos": 30632473163520.0, "grad_norm": 1.6665474498593613, "language_loss": 0.72808671, "learning_rate": 5.253978201456733e-07, "loss": 0.75237292, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18664551, "step": 12823, "time_per_iteration": 2.8972065448760986 }, { "auxiliary_loss_clip": 0.01418669, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.25296569, "balance_loss_mlp": 1.01533389, "epoch": 0.7710205922140387, "flos": 20310481918080.0, "grad_norm": 2.0015474202086456, "language_loss": 0.76665854, "learning_rate": 5.251347417035969e-07, "loss": 0.79119325, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19482422, "step": 12824, "time_per_iteration": 2.8395004272460938 }, { "auxiliary_loss_clip": 0.01407299, "auxiliary_loss_mlp": 0.01030569, "balance_loss_clip": 1.24720407, "balance_loss_mlp": 1.01237786, "epoch": 0.7710807154667068, "flos": 19653172439040.0, "grad_norm": 2.29079044249999, "language_loss": 0.73371273, "learning_rate": 5.248717191885592e-07, "loss": 0.75809145, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18188477, "step": 12825, "time_per_iteration": 2.853884220123291 }, { "auxiliary_loss_clip": 0.01381758, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.22751594, "balance_loss_mlp": 1.0172925, "epoch": 0.7711408387193747, "flos": 20014679543040.0, "grad_norm": 1.3557029017062532, "language_loss": 0.74471474, "learning_rate": 5.246087526105343e-07, "loss": 0.76887619, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.17102051, "step": 12826, "time_per_iteration": 2.814441204071045 }, { "auxiliary_loss_clip": 0.01400668, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.23765802, "balance_loss_mlp": 1.01425982, "epoch": 0.7712009619720427, "flos": 24981470277120.0, "grad_norm": 1.58223404994639, "language_loss": 0.8224957, "learning_rate": 5.243458419794933e-07, "loss": 0.84683847, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19348145, "step": 12827, "time_per_iteration": 2.882451057434082 }, { "auxiliary_loss_clip": 0.01194414, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.10544038, "balance_loss_mlp": 1.00704408, "epoch": 0.7712610852247107, "flos": 63280716975360.0, "grad_norm": 0.8582474124169457, "language_loss": 0.55192542, "learning_rate": 5.240829873054051e-07, "loss": 0.57413739, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.19726562, "step": 12828, "time_per_iteration": 3.504800796508789 }, { "auxiliary_loss_clip": 0.01386234, "auxiliary_loss_mlp": 0.01032854, "balance_loss_clip": 1.22937012, "balance_loss_mlp": 1.01417363, "epoch": 0.7713212084773786, "flos": 18707164018560.0, "grad_norm": 5.330907224468873, "language_loss": 0.7072047, "learning_rate": 5.23820188598238e-07, "loss": 0.73139554, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18676758, "step": 12829, "time_per_iteration": 2.8252997398376465 }, { "auxiliary_loss_clip": 0.01411577, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.24775314, "balance_loss_mlp": 1.01404977, "epoch": 0.7713813317300466, "flos": 14181430026240.0, "grad_norm": 2.6473450501136577, "language_loss": 0.80394006, "learning_rate": 5.235574458679579e-07, "loss": 0.82838869, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19238281, "step": 12830, "time_per_iteration": 2.821965456008911 }, { "auxiliary_loss_clip": 0.01400921, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 1.23694754, "balance_loss_mlp": 1.01590741, "epoch": 0.7714414549827145, "flos": 25715297992320.0, "grad_norm": 1.812954315076191, "language_loss": 0.78421295, "learning_rate": 5.232947591245269e-07, "loss": 0.80857265, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19152832, "step": 12831, "time_per_iteration": 2.863431215286255 }, { "auxiliary_loss_clip": 0.01402147, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.24187994, "balance_loss_mlp": 1.01333189, "epoch": 0.7715015782353826, "flos": 30567582840960.0, "grad_norm": 2.265148412997826, "language_loss": 0.61269808, "learning_rate": 5.230321283779071e-07, "loss": 0.63704169, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1887207, "step": 12832, "time_per_iteration": 2.901512384414673 }, { "auxiliary_loss_clip": 0.01416509, "auxiliary_loss_mlp": 0.01033934, "balance_loss_clip": 1.25405037, "balance_loss_mlp": 1.01470602, "epoch": 0.7715617014880505, "flos": 20238804875520.0, "grad_norm": 1.705116077266607, "language_loss": 0.79660481, "learning_rate": 5.227695536380572e-07, "loss": 0.82110929, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19226074, "step": 12833, "time_per_iteration": 2.8835997581481934 }, { "auxiliary_loss_clip": 0.0118329, "auxiliary_loss_mlp": 0.01027781, "balance_loss_clip": 1.09530413, "balance_loss_mlp": 1.00727749, "epoch": 0.7716218247407185, "flos": 63690011867520.0, "grad_norm": 0.8607344608980417, "language_loss": 0.55551767, "learning_rate": 5.22507034914933e-07, "loss": 0.57762837, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20507812, "step": 12834, "time_per_iteration": 3.3282384872436523 }, { "auxiliary_loss_clip": 0.01407871, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.24567378, "balance_loss_mlp": 1.01224685, "epoch": 0.7716819479933864, "flos": 19801231983360.0, "grad_norm": 1.980396611448822, "language_loss": 0.73991501, "learning_rate": 5.222445722184903e-07, "loss": 0.76429522, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.17907715, "step": 12835, "time_per_iteration": 2.8459134101867676 }, { "auxiliary_loss_clip": 0.01415156, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.25190246, "balance_loss_mlp": 1.01886189, "epoch": 0.7717420712460544, "flos": 18451340064000.0, "grad_norm": 1.8718804345924962, "language_loss": 0.71372497, "learning_rate": 5.219821655586814e-07, "loss": 0.73825991, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19482422, "step": 12836, "time_per_iteration": 2.834233283996582 }, { "auxiliary_loss_clip": 0.01398257, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.24053526, "balance_loss_mlp": 1.01588213, "epoch": 0.7718021944987223, "flos": 35203072014720.0, "grad_norm": 2.635408883332644, "language_loss": 0.60660714, "learning_rate": 5.217198149454575e-07, "loss": 0.63094103, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19250488, "step": 12837, "time_per_iteration": 2.987755060195923 }, { "auxiliary_loss_clip": 0.01187782, "auxiliary_loss_mlp": 0.01040288, "balance_loss_clip": 1.09966731, "balance_loss_mlp": 1.01615953, "epoch": 0.7718623177513904, "flos": 67956275059200.0, "grad_norm": 0.8538491014193986, "language_loss": 0.55818582, "learning_rate": 5.214575203887666e-07, "loss": 0.58046651, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.24121094, "step": 12838, "time_per_iteration": 3.242119312286377 }, { "auxiliary_loss_clip": 0.01406267, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.2469883, "balance_loss_mlp": 1.01482606, "epoch": 0.7719224410040583, "flos": 18588857569920.0, "grad_norm": 3.0293397793300985, "language_loss": 0.6994105, "learning_rate": 5.211952818985538e-07, "loss": 0.72380894, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18762207, "step": 12839, "time_per_iteration": 2.824303388595581 }, { "auxiliary_loss_clip": 0.01403817, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.24482298, "balance_loss_mlp": 1.01682961, "epoch": 0.7719825642567263, "flos": 23086150565760.0, "grad_norm": 1.8188304737358343, "language_loss": 0.81145251, "learning_rate": 5.209330994847647e-07, "loss": 0.83584547, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18640137, "step": 12840, "time_per_iteration": 2.842425584793091 }, { "auxiliary_loss_clip": 0.01410858, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.24964416, "balance_loss_mlp": 1.01526022, "epoch": 0.7720426875093943, "flos": 20348650546560.0, "grad_norm": 1.671582064062619, "language_loss": 0.80051064, "learning_rate": 5.206709731573402e-07, "loss": 0.82495821, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18640137, "step": 12841, "time_per_iteration": 2.853870153427124 }, { "auxiliary_loss_clip": 0.01406611, "auxiliary_loss_mlp": 0.01033789, "balance_loss_clip": 1.24525023, "balance_loss_mlp": 1.015836, "epoch": 0.7721028107620622, "flos": 23891836302720.0, "grad_norm": 1.3840247019846976, "language_loss": 0.76970279, "learning_rate": 5.204089029262208e-07, "loss": 0.79410672, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1796875, "step": 12842, "time_per_iteration": 2.899282455444336 }, { "auxiliary_loss_clip": 0.01414974, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.25206411, "balance_loss_mlp": 1.01523256, "epoch": 0.7721629340147302, "flos": 26662889980800.0, "grad_norm": 1.8691851572291214, "language_loss": 0.69549954, "learning_rate": 5.201468888013445e-07, "loss": 0.71998471, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1829834, "step": 12843, "time_per_iteration": 2.898672580718994 }, { "auxiliary_loss_clip": 0.01421102, "auxiliary_loss_mlp": 0.01030881, "balance_loss_clip": 1.25505328, "balance_loss_mlp": 1.0118196, "epoch": 0.7722230572673981, "flos": 21188794838400.0, "grad_norm": 2.2500207691125302, "language_loss": 0.74403131, "learning_rate": 5.198849307926465e-07, "loss": 0.76855111, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19067383, "step": 12844, "time_per_iteration": 2.8157546520233154 }, { "auxiliary_loss_clip": 0.01399684, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.2416997, "balance_loss_mlp": 1.0159483, "epoch": 0.7722831805200662, "flos": 27976196839680.0, "grad_norm": 1.6461528938584453, "language_loss": 0.7220279, "learning_rate": 5.196230289100596e-07, "loss": 0.74636912, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18493652, "step": 12845, "time_per_iteration": 2.8905839920043945 }, { "auxiliary_loss_clip": 0.01396909, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.23910141, "balance_loss_mlp": 1.0170691, "epoch": 0.7723433037727341, "flos": 33888724525440.0, "grad_norm": 1.7125732304986534, "language_loss": 0.65468359, "learning_rate": 5.193611831635159e-07, "loss": 0.67900813, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18469238, "step": 12846, "time_per_iteration": 4.394457101821899 }, { "auxiliary_loss_clip": 0.01191548, "auxiliary_loss_mlp": 0.01023154, "balance_loss_clip": 1.10113072, "balance_loss_mlp": 1.00446177, "epoch": 0.7724034270254021, "flos": 62879186206080.0, "grad_norm": 0.7890366956393436, "language_loss": 0.61851889, "learning_rate": 5.19099393562945e-07, "loss": 0.64066589, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.18652344, "step": 12847, "time_per_iteration": 3.318760395050049 }, { "auxiliary_loss_clip": 0.01402479, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.24201107, "balance_loss_mlp": 1.01451027, "epoch": 0.77246355027807, "flos": 23306068131840.0, "grad_norm": 1.636822252845186, "language_loss": 0.80271322, "learning_rate": 5.188376601182732e-07, "loss": 0.82706714, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18395996, "step": 12848, "time_per_iteration": 2.8582839965820312 }, { "auxiliary_loss_clip": 0.01415863, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.25074041, "balance_loss_mlp": 1.01485884, "epoch": 0.772523673530738, "flos": 20131085710080.0, "grad_norm": 1.528400076222253, "language_loss": 0.72917598, "learning_rate": 5.185759828394261e-07, "loss": 0.75366223, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.17895508, "step": 12849, "time_per_iteration": 2.880981206893921 }, { "auxiliary_loss_clip": 0.01411911, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.25156248, "balance_loss_mlp": 1.01473999, "epoch": 0.7725837967834059, "flos": 17828624874240.0, "grad_norm": 1.8509400998446042, "language_loss": 0.78776729, "learning_rate": 5.183143617363261e-07, "loss": 0.8122105, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.17675781, "step": 12850, "time_per_iteration": 2.865903615951538 }, { "auxiliary_loss_clip": 0.0141755, "auxiliary_loss_mlp": 0.01033151, "balance_loss_clip": 1.25367367, "balance_loss_mlp": 1.0149473, "epoch": 0.772643920036074, "flos": 27210399033600.0, "grad_norm": 1.5332649998210048, "language_loss": 0.80468416, "learning_rate": 5.180527968188935e-07, "loss": 0.82919115, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18212891, "step": 12851, "time_per_iteration": 2.8891830444335938 }, { "auxiliary_loss_clip": 0.01397737, "auxiliary_loss_mlp": 0.01031241, "balance_loss_clip": 1.24008679, "balance_loss_mlp": 1.01312101, "epoch": 0.7727040432887419, "flos": 21589375466880.0, "grad_norm": 1.4253847718918558, "language_loss": 0.74217695, "learning_rate": 5.177912880970474e-07, "loss": 0.76646674, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18139648, "step": 12852, "time_per_iteration": 2.83357572555542 }, { "auxiliary_loss_clip": 0.01391553, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.23369789, "balance_loss_mlp": 1.01377344, "epoch": 0.7727641665414099, "flos": 22246775435520.0, "grad_norm": 1.9314652279984856, "language_loss": 0.82790244, "learning_rate": 5.17529835580704e-07, "loss": 0.85214448, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.1887207, "step": 12853, "time_per_iteration": 4.344544172286987 }, { "auxiliary_loss_clip": 0.01186047, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.0985899, "balance_loss_mlp": 1.00982654, "epoch": 0.7728242897940779, "flos": 54863863562880.0, "grad_norm": 0.8239972945649137, "language_loss": 0.54649103, "learning_rate": 5.172684392797786e-07, "loss": 0.56862622, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.17675781, "step": 12854, "time_per_iteration": 3.408353805541992 }, { "auxiliary_loss_clip": 0.01420935, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.25696588, "balance_loss_mlp": 1.0175401, "epoch": 0.7728844130467458, "flos": 34475940529920.0, "grad_norm": 1.5948628609225892, "language_loss": 0.72890079, "learning_rate": 5.170070992041826e-07, "loss": 0.75348425, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19873047, "step": 12855, "time_per_iteration": 2.9575035572052 }, { "auxiliary_loss_clip": 0.01411467, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.25110567, "balance_loss_mlp": 1.01288295, "epoch": 0.7729445362994138, "flos": 18925588506240.0, "grad_norm": 2.453061147726039, "language_loss": 0.69114137, "learning_rate": 5.167458153638254e-07, "loss": 0.71556914, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18432617, "step": 12856, "time_per_iteration": 2.805394411087036 }, { "auxiliary_loss_clip": 0.01402437, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.24208808, "balance_loss_mlp": 1.00983036, "epoch": 0.7730046595520818, "flos": 22210145130240.0, "grad_norm": 1.8924085117425742, "language_loss": 0.80150884, "learning_rate": 5.164845877686162e-07, "loss": 0.82581866, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18713379, "step": 12857, "time_per_iteration": 4.295732736587524 }, { "auxiliary_loss_clip": 0.0139525, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.23886681, "balance_loss_mlp": 1.01533866, "epoch": 0.7730647828047498, "flos": 13560026935680.0, "grad_norm": 2.452016914246388, "language_loss": 0.78909129, "learning_rate": 5.162234164284591e-07, "loss": 0.81338984, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.19274902, "step": 12858, "time_per_iteration": 2.8240108489990234 }, { "auxiliary_loss_clip": 0.01407886, "auxiliary_loss_mlp": 0.01032705, "balance_loss_clip": 1.24599791, "balance_loss_mlp": 1.01451421, "epoch": 0.7731249060574177, "flos": 21984752943360.0, "grad_norm": 1.7847964278596005, "language_loss": 0.77638829, "learning_rate": 5.159623013532591e-07, "loss": 0.80079424, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18200684, "step": 12859, "time_per_iteration": 2.826036214828491 }, { "auxiliary_loss_clip": 0.01403587, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.24727583, "balance_loss_mlp": 1.01485109, "epoch": 0.7731850293100857, "flos": 22612128347520.0, "grad_norm": 1.4274511925819988, "language_loss": 0.68554693, "learning_rate": 5.157012425529186e-07, "loss": 0.7099154, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18395996, "step": 12860, "time_per_iteration": 2.883443832397461 }, { "auxiliary_loss_clip": 0.01418261, "auxiliary_loss_mlp": 0.01036986, "balance_loss_clip": 1.25267243, "balance_loss_mlp": 1.01833022, "epoch": 0.7732451525627536, "flos": 14105952420480.0, "grad_norm": 2.338921187850327, "language_loss": 0.76285827, "learning_rate": 5.154402400373343e-07, "loss": 0.7874108, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18664551, "step": 12861, "time_per_iteration": 2.857778787612915 }, { "auxiliary_loss_clip": 0.01420045, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.25649738, "balance_loss_mlp": 1.01252246, "epoch": 0.7733052758154216, "flos": 21479846509440.0, "grad_norm": 1.570606637619816, "language_loss": 0.7554003, "learning_rate": 5.15179293816405e-07, "loss": 0.77990568, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.1796875, "step": 12862, "time_per_iteration": 2.909231185913086 }, { "auxiliary_loss_clip": 0.01396884, "auxiliary_loss_mlp": 0.0103267, "balance_loss_clip": 1.23909283, "balance_loss_mlp": 1.01394212, "epoch": 0.7733653990680895, "flos": 21403373518080.0, "grad_norm": 1.5202912941199171, "language_loss": 0.83944178, "learning_rate": 5.149184039000256e-07, "loss": 0.86373734, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18737793, "step": 12863, "time_per_iteration": 2.9152114391326904 }, { "auxiliary_loss_clip": 0.01401871, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.24260831, "balance_loss_mlp": 1.01240349, "epoch": 0.7734255223207576, "flos": 17685044565120.0, "grad_norm": 1.9051990193551385, "language_loss": 0.74207211, "learning_rate": 5.146575702980898e-07, "loss": 0.76639307, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17822266, "step": 12864, "time_per_iteration": 2.9077816009521484 }, { "auxiliary_loss_clip": 0.01408091, "auxiliary_loss_mlp": 0.0103057, "balance_loss_clip": 1.2476697, "balance_loss_mlp": 1.01343989, "epoch": 0.7734856455734255, "flos": 25242361649280.0, "grad_norm": 1.7492293814113908, "language_loss": 0.8309375, "learning_rate": 5.143967930204871e-07, "loss": 0.85532415, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.17114258, "step": 12865, "time_per_iteration": 2.897696018218994 }, { "auxiliary_loss_clip": 0.01422875, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.25755203, "balance_loss_mlp": 1.0137043, "epoch": 0.7735457688260935, "flos": 23441594866560.0, "grad_norm": 2.0882221303424138, "language_loss": 0.72544312, "learning_rate": 5.141360720771077e-07, "loss": 0.75000536, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19628906, "step": 12866, "time_per_iteration": 2.8843281269073486 }, { "auxiliary_loss_clip": 0.01412908, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.25153685, "balance_loss_mlp": 1.01211178, "epoch": 0.7736058920787615, "flos": 18737233827840.0, "grad_norm": 2.3386266845781343, "language_loss": 0.65964878, "learning_rate": 5.138754074778371e-07, "loss": 0.68407989, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18103027, "step": 12867, "time_per_iteration": 2.7909903526306152 }, { "auxiliary_loss_clip": 0.01394424, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.23578787, "balance_loss_mlp": 1.01609325, "epoch": 0.7736660153314294, "flos": 22903587221760.0, "grad_norm": 1.5177113890457223, "language_loss": 0.7123813, "learning_rate": 5.136147992325595e-07, "loss": 0.73667163, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18518066, "step": 12868, "time_per_iteration": 2.842097759246826 }, { "auxiliary_loss_clip": 0.0141998, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.25731146, "balance_loss_mlp": 1.01230156, "epoch": 0.7737261385840974, "flos": 13806892419840.0, "grad_norm": 1.9626503488805822, "language_loss": 0.78537834, "learning_rate": 5.133542473511578e-07, "loss": 0.80989236, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19116211, "step": 12869, "time_per_iteration": 2.8037359714508057 }, { "auxiliary_loss_clip": 0.01400459, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.24254501, "balance_loss_mlp": 1.01498246, "epoch": 0.7737862618367654, "flos": 28742447093760.0, "grad_norm": 1.5303188553265503, "language_loss": 0.74835062, "learning_rate": 5.130937518435124e-07, "loss": 0.77269685, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19165039, "step": 12870, "time_per_iteration": 2.942793130874634 }, { "auxiliary_loss_clip": 0.01415355, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.25303686, "balance_loss_mlp": 1.01335311, "epoch": 0.7738463850894334, "flos": 17027192148480.0, "grad_norm": 2.472342749312631, "language_loss": 0.76694071, "learning_rate": 5.12833312719501e-07, "loss": 0.79141855, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19091797, "step": 12871, "time_per_iteration": 2.793790578842163 }, { "auxiliary_loss_clip": 0.01400639, "auxiliary_loss_mlp": 0.01034133, "balance_loss_clip": 1.24076998, "balance_loss_mlp": 1.01501179, "epoch": 0.7739065083421013, "flos": 20713686744960.0, "grad_norm": 1.574473737219808, "language_loss": 0.6963681, "learning_rate": 5.12572929988999e-07, "loss": 0.72071576, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19116211, "step": 12872, "time_per_iteration": 2.824270486831665 }, { "auxiliary_loss_clip": 0.01409717, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.24749494, "balance_loss_mlp": 1.01439297, "epoch": 0.7739666315947693, "flos": 20705135477760.0, "grad_norm": 1.9495597432679974, "language_loss": 0.85515273, "learning_rate": 5.123126036618804e-07, "loss": 0.87959158, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19763184, "step": 12873, "time_per_iteration": 2.8269100189208984 }, { "auxiliary_loss_clip": 0.01406359, "auxiliary_loss_mlp": 0.01037054, "balance_loss_clip": 1.24660957, "balance_loss_mlp": 1.01852906, "epoch": 0.7740267548474372, "flos": 29582817609600.0, "grad_norm": 2.589736076076167, "language_loss": 0.66572565, "learning_rate": 5.120523337480174e-07, "loss": 0.6901598, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18518066, "step": 12874, "time_per_iteration": 2.922607660293579 }, { "auxiliary_loss_clip": 0.01406759, "auxiliary_loss_mlp": 0.01031479, "balance_loss_clip": 1.24698579, "balance_loss_mlp": 1.01340723, "epoch": 0.7740868781001052, "flos": 23669475517440.0, "grad_norm": 1.705741267761658, "language_loss": 0.63018864, "learning_rate": 5.117921202572785e-07, "loss": 0.65457106, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18078613, "step": 12875, "time_per_iteration": 2.8653311729431152 }, { "auxiliary_loss_clip": 0.01406907, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.24439144, "balance_loss_mlp": 1.01476264, "epoch": 0.7741470013527731, "flos": 24727999052160.0, "grad_norm": 1.9197820823430194, "language_loss": 0.65760988, "learning_rate": 5.115319631995318e-07, "loss": 0.68201172, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18505859, "step": 12876, "time_per_iteration": 2.8641226291656494 }, { "auxiliary_loss_clip": 0.01398711, "auxiliary_loss_mlp": 0.01035493, "balance_loss_clip": 1.24095464, "balance_loss_mlp": 1.01744461, "epoch": 0.7742071246054412, "flos": 21881648747520.0, "grad_norm": 10.772510389288765, "language_loss": 0.72669393, "learning_rate": 5.112718625846433e-07, "loss": 0.75103593, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18054199, "step": 12877, "time_per_iteration": 2.8400442600250244 }, { "auxiliary_loss_clip": 0.01416097, "auxiliary_loss_mlp": 0.01034391, "balance_loss_clip": 1.25183821, "balance_loss_mlp": 1.01574719, "epoch": 0.7742672478581091, "flos": 22684438817280.0, "grad_norm": 2.0635081434209788, "language_loss": 0.83884025, "learning_rate": 5.110118184224736e-07, "loss": 0.86334509, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18652344, "step": 12878, "time_per_iteration": 2.829723596572876 }, { "auxiliary_loss_clip": 0.01403167, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.24294448, "balance_loss_mlp": 1.01477146, "epoch": 0.7743273711107771, "flos": 18849477473280.0, "grad_norm": 1.7196884352332735, "language_loss": 0.7386834, "learning_rate": 5.10751830722885e-07, "loss": 0.76304233, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.17956543, "step": 12879, "time_per_iteration": 2.8248355388641357 }, { "auxiliary_loss_clip": 0.01397095, "auxiliary_loss_mlp": 0.01030791, "balance_loss_clip": 1.24001825, "balance_loss_mlp": 1.01233697, "epoch": 0.7743874943634451, "flos": 28740682546560.0, "grad_norm": 1.635856356933866, "language_loss": 0.80459201, "learning_rate": 5.104918994957364e-07, "loss": 0.82887089, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18457031, "step": 12880, "time_per_iteration": 2.8781039714813232 }, { "auxiliary_loss_clip": 0.01398448, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.23990774, "balance_loss_mlp": 1.0152092, "epoch": 0.774447617616113, "flos": 21920043600000.0, "grad_norm": 1.4731922130262374, "language_loss": 0.70960265, "learning_rate": 5.102320247508847e-07, "loss": 0.73392546, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18640137, "step": 12881, "time_per_iteration": 4.263557434082031 }, { "auxiliary_loss_clip": 0.01424369, "auxiliary_loss_mlp": 0.01040946, "balance_loss_clip": 1.25917172, "balance_loss_mlp": 1.0202992, "epoch": 0.774507740868781, "flos": 19510180312320.0, "grad_norm": 1.9324012921377238, "language_loss": 0.85210252, "learning_rate": 5.099722064981832e-07, "loss": 0.87675571, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20629883, "step": 12882, "time_per_iteration": 2.8392956256866455 }, { "auxiliary_loss_clip": 0.01182439, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.09456658, "balance_loss_mlp": 1.00362444, "epoch": 0.774567864121449, "flos": 59458650399360.0, "grad_norm": 0.7902750420997748, "language_loss": 0.60439098, "learning_rate": 5.097124447474858e-07, "loss": 0.62645185, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.20019531, "step": 12883, "time_per_iteration": 3.306915760040283 }, { "auxiliary_loss_clip": 0.01407711, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.24720836, "balance_loss_mlp": 1.01514626, "epoch": 0.774627987374117, "flos": 13233295100160.0, "grad_norm": 1.648033509030687, "language_loss": 0.7360152, "learning_rate": 5.094527395086416e-07, "loss": 0.76043522, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19140625, "step": 12884, "time_per_iteration": 2.818967342376709 }, { "auxiliary_loss_clip": 0.01399085, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.24254286, "balance_loss_mlp": 1.01468003, "epoch": 0.7746881106267849, "flos": 21403056804480.0, "grad_norm": 1.6017123188197557, "language_loss": 0.81852895, "learning_rate": 5.091930907914986e-07, "loss": 0.84284818, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18164062, "step": 12885, "time_per_iteration": 2.8797390460968018 }, { "auxiliary_loss_clip": 0.01387622, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.23049092, "balance_loss_mlp": 1.01277351, "epoch": 0.7747482338794529, "flos": 25640227589760.0, "grad_norm": 1.710636094915488, "language_loss": 0.64842707, "learning_rate": 5.089334986059029e-07, "loss": 0.67260194, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.17089844, "step": 12886, "time_per_iteration": 2.9133424758911133 }, { "auxiliary_loss_clip": 0.01403154, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.2408886, "balance_loss_mlp": 1.01495147, "epoch": 0.7748083571321208, "flos": 11554454350080.0, "grad_norm": 2.093812379848307, "language_loss": 0.70476377, "learning_rate": 5.086739629616987e-07, "loss": 0.72911429, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.16955566, "step": 12887, "time_per_iteration": 2.8770742416381836 }, { "auxiliary_loss_clip": 0.01384822, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.22812843, "balance_loss_mlp": 1.01157284, "epoch": 0.7748684803847888, "flos": 19071747768960.0, "grad_norm": 1.653601347565621, "language_loss": 0.7120955, "learning_rate": 5.084144838687275e-07, "loss": 0.73623478, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17541504, "step": 12888, "time_per_iteration": 4.24017596244812 }, { "auxiliary_loss_clip": 0.01409166, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.24681699, "balance_loss_mlp": 1.01270974, "epoch": 0.7749286036374567, "flos": 22283315251200.0, "grad_norm": 1.5595073930722814, "language_loss": 0.82871187, "learning_rate": 5.081550613368279e-07, "loss": 0.8531217, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19116211, "step": 12889, "time_per_iteration": 2.8465945720672607 }, { "auxiliary_loss_clip": 0.01404989, "auxiliary_loss_mlp": 0.01033516, "balance_loss_clip": 1.24442959, "balance_loss_mlp": 1.01511014, "epoch": 0.7749887268901248, "flos": 20201812611840.0, "grad_norm": 1.9255146811125796, "language_loss": 0.80247045, "learning_rate": 5.07895695375838e-07, "loss": 0.82685554, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18395996, "step": 12890, "time_per_iteration": 2.8774359226226807 }, { "auxiliary_loss_clip": 0.014116, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.25043631, "balance_loss_mlp": 1.01483011, "epoch": 0.7750488501427927, "flos": 20346705020160.0, "grad_norm": 1.6611897094374393, "language_loss": 0.67177778, "learning_rate": 5.076363859955932e-07, "loss": 0.69623303, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1907959, "step": 12891, "time_per_iteration": 2.8410561084747314 }, { "auxiliary_loss_clip": 0.0142054, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.2580502, "balance_loss_mlp": 1.01174533, "epoch": 0.7751089733954607, "flos": 28375193900160.0, "grad_norm": 1.4249943814396275, "language_loss": 0.79196048, "learning_rate": 5.073771332059257e-07, "loss": 0.81646979, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.1862793, "step": 12892, "time_per_iteration": 5.785657644271851 }, { "auxiliary_loss_clip": 0.01416146, "auxiliary_loss_mlp": 0.01030276, "balance_loss_clip": 1.25153244, "balance_loss_mlp": 1.01092863, "epoch": 0.7751690966481286, "flos": 16951940766720.0, "grad_norm": 1.9238127274112011, "language_loss": 0.6787461, "learning_rate": 5.071179370166669e-07, "loss": 0.70321041, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19335938, "step": 12893, "time_per_iteration": 2.847093343734741 }, { "auxiliary_loss_clip": 0.01180029, "auxiliary_loss_mlp": 0.01019842, "balance_loss_clip": 1.09286535, "balance_loss_mlp": 1.00200808, "epoch": 0.7752292199007966, "flos": 65702389155840.0, "grad_norm": 0.8115400954314361, "language_loss": 0.58626282, "learning_rate": 5.068587974376468e-07, "loss": 0.60826153, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.17871094, "step": 12894, "time_per_iteration": 3.3785808086395264 }, { "auxiliary_loss_clip": 0.01411436, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.24907136, "balance_loss_mlp": 1.01657593, "epoch": 0.7752893431534646, "flos": 20604338766720.0, "grad_norm": 2.9036921514873475, "language_loss": 0.78758126, "learning_rate": 5.065997144786895e-07, "loss": 0.81204683, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1854248, "step": 12895, "time_per_iteration": 2.82365083694458 }, { "auxiliary_loss_clip": 0.01404765, "auxiliary_loss_mlp": 0.01033832, "balance_loss_clip": 1.24524379, "balance_loss_mlp": 1.01475835, "epoch": 0.7753494664061326, "flos": 20495036033280.0, "grad_norm": 1.7159670496360433, "language_loss": 0.68481636, "learning_rate": 5.063406881496209e-07, "loss": 0.70920229, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19067383, "step": 12896, "time_per_iteration": 2.866715669631958 }, { "auxiliary_loss_clip": 0.01414593, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.25368404, "balance_loss_mlp": 1.01475775, "epoch": 0.7754095896588006, "flos": 20275389936000.0, "grad_norm": 2.1715762160471135, "language_loss": 0.69713795, "learning_rate": 5.060817184602629e-07, "loss": 0.72161967, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18823242, "step": 12897, "time_per_iteration": 2.894561767578125 }, { "auxiliary_loss_clip": 0.01405168, "auxiliary_loss_mlp": 0.01034799, "balance_loss_clip": 1.24291015, "balance_loss_mlp": 1.01610732, "epoch": 0.7754697129114685, "flos": 23341476827520.0, "grad_norm": 1.7155598508936065, "language_loss": 0.75748509, "learning_rate": 5.058228054204364e-07, "loss": 0.78188479, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18701172, "step": 12898, "time_per_iteration": 2.8963704109191895 }, { "auxiliary_loss_clip": 0.01408941, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.24713421, "balance_loss_mlp": 1.01377547, "epoch": 0.7755298361641365, "flos": 17356502937600.0, "grad_norm": 1.9640894237361135, "language_loss": 0.70487875, "learning_rate": 5.055639490399588e-07, "loss": 0.72929668, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19067383, "step": 12899, "time_per_iteration": 2.8076064586639404 }, { "auxiliary_loss_clip": 0.01410123, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.24884725, "balance_loss_mlp": 1.01614261, "epoch": 0.7755899594168044, "flos": 19655253699840.0, "grad_norm": 1.7865752410334947, "language_loss": 0.75947958, "learning_rate": 5.053051493286453e-07, "loss": 0.78394055, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19824219, "step": 12900, "time_per_iteration": 2.842414617538452 }, { "auxiliary_loss_clip": 0.01393273, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.23631525, "balance_loss_mlp": 1.01482034, "epoch": 0.7756500826694724, "flos": 27425113447680.0, "grad_norm": 1.5490326160186982, "language_loss": 0.7846278, "learning_rate": 5.050464062963113e-07, "loss": 0.80888623, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17749023, "step": 12901, "time_per_iteration": 2.9282703399658203 }, { "auxiliary_loss_clip": 0.0140564, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.246948, "balance_loss_mlp": 1.01330352, "epoch": 0.7757102059221404, "flos": 28742175624960.0, "grad_norm": 1.6164464857059373, "language_loss": 0.77544475, "learning_rate": 5.047877199527666e-07, "loss": 0.79982167, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18737793, "step": 12902, "time_per_iteration": 2.888806104660034 }, { "auxiliary_loss_clip": 0.01401961, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.24158609, "balance_loss_mlp": 1.01185608, "epoch": 0.7757703291748084, "flos": 22495631690880.0, "grad_norm": 1.7845514698238587, "language_loss": 0.74036175, "learning_rate": 5.045290903078215e-07, "loss": 0.76468825, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18823242, "step": 12903, "time_per_iteration": 2.9202637672424316 }, { "auxiliary_loss_clip": 0.01397406, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.23860967, "balance_loss_mlp": 1.01024699, "epoch": 0.7758304524274763, "flos": 21439098927360.0, "grad_norm": 3.515074531139347, "language_loss": 0.76816487, "learning_rate": 5.042705173712835e-07, "loss": 0.79243016, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18884277, "step": 12904, "time_per_iteration": 2.8438947200775146 }, { "auxiliary_loss_clip": 0.01387382, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 1.23108673, "balance_loss_mlp": 1.00989437, "epoch": 0.7758905756801443, "flos": 23669656496640.0, "grad_norm": 2.444856560142635, "language_loss": 0.69312501, "learning_rate": 5.040120011529576e-07, "loss": 0.71728009, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18237305, "step": 12905, "time_per_iteration": 2.9289638996124268 }, { "auxiliary_loss_clip": 0.01394269, "auxiliary_loss_mlp": 0.01029395, "balance_loss_clip": 1.23834491, "balance_loss_mlp": 1.01063156, "epoch": 0.7759506989328122, "flos": 28377003692160.0, "grad_norm": 1.831198117330667, "language_loss": 0.67560363, "learning_rate": 5.037535416626459e-07, "loss": 0.69984019, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18762207, "step": 12906, "time_per_iteration": 2.909403085708618 }, { "auxiliary_loss_clip": 0.01399577, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.23958075, "balance_loss_mlp": 1.01030803, "epoch": 0.7760108221854802, "flos": 14910959485440.0, "grad_norm": 2.271311031902538, "language_loss": 0.81858087, "learning_rate": 5.034951389101498e-07, "loss": 0.84286571, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18603516, "step": 12907, "time_per_iteration": 2.904597759246826 }, { "auxiliary_loss_clip": 0.01391529, "auxiliary_loss_mlp": 0.01033654, "balance_loss_clip": 1.23651075, "balance_loss_mlp": 1.01444948, "epoch": 0.7760709454381483, "flos": 14800435142400.0, "grad_norm": 1.9781690676265724, "language_loss": 0.67480612, "learning_rate": 5.032367929052685e-07, "loss": 0.69905794, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.1920166, "step": 12908, "time_per_iteration": 2.819546699523926 }, { "auxiliary_loss_clip": 0.01399182, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.23831058, "balance_loss_mlp": 1.01401782, "epoch": 0.7761310686908162, "flos": 17387613377280.0, "grad_norm": 1.660729500401547, "language_loss": 0.70856047, "learning_rate": 5.029785036577976e-07, "loss": 0.73287892, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18652344, "step": 12909, "time_per_iteration": 2.871462821960449 }, { "auxiliary_loss_clip": 0.01390466, "auxiliary_loss_mlp": 0.01039097, "balance_loss_clip": 1.2337296, "balance_loss_mlp": 1.02051258, "epoch": 0.7761911919434842, "flos": 25567464672000.0, "grad_norm": 1.7011111376816004, "language_loss": 0.68184507, "learning_rate": 5.027202711775324e-07, "loss": 0.70614076, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18591309, "step": 12910, "time_per_iteration": 2.9592645168304443 }, { "auxiliary_loss_clip": 0.01407324, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.24603701, "balance_loss_mlp": 1.01582265, "epoch": 0.7762513151961521, "flos": 23188711824000.0, "grad_norm": 1.6409684090076275, "language_loss": 0.72162968, "learning_rate": 5.024620954742646e-07, "loss": 0.74604332, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18225098, "step": 12911, "time_per_iteration": 2.8720638751983643 }, { "auxiliary_loss_clip": 0.01414866, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.25287592, "balance_loss_mlp": 1.01221645, "epoch": 0.7763114384488201, "flos": 21699673585920.0, "grad_norm": 2.4355548093365713, "language_loss": 0.64253592, "learning_rate": 5.022039765577836e-07, "loss": 0.6670019, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19494629, "step": 12912, "time_per_iteration": 2.8300845623016357 }, { "auxiliary_loss_clip": 0.01186086, "auxiliary_loss_mlp": 0.01021148, "balance_loss_clip": 1.09646487, "balance_loss_mlp": 1.00178862, "epoch": 0.776371561701488, "flos": 69060975552000.0, "grad_norm": 0.7691832355903192, "language_loss": 0.53233808, "learning_rate": 5.019459144378779e-07, "loss": 0.5544104, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19335938, "step": 12913, "time_per_iteration": 3.420860767364502 }, { "auxiliary_loss_clip": 0.01399962, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.24038529, "balance_loss_mlp": 1.01254129, "epoch": 0.776431684954156, "flos": 22904356383360.0, "grad_norm": 8.809323581750837, "language_loss": 0.62608898, "learning_rate": 5.016879091243338e-07, "loss": 0.65040857, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19458008, "step": 12914, "time_per_iteration": 2.847818374633789 }, { "auxiliary_loss_clip": 0.01395522, "auxiliary_loss_mlp": 0.01033794, "balance_loss_clip": 1.2362144, "balance_loss_mlp": 1.01442289, "epoch": 0.776491808206824, "flos": 20269915315200.0, "grad_norm": 2.1197376985735747, "language_loss": 0.82912111, "learning_rate": 5.014299606269339e-07, "loss": 0.8534143, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19348145, "step": 12915, "time_per_iteration": 4.3609397411346436 }, { "auxiliary_loss_clip": 0.01407485, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.243343, "balance_loss_mlp": 1.01645947, "epoch": 0.776551931459492, "flos": 26769885229440.0, "grad_norm": 3.88367252532672, "language_loss": 0.75204778, "learning_rate": 5.011720689554603e-07, "loss": 0.77648592, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19873047, "step": 12916, "time_per_iteration": 2.8761372566223145 }, { "auxiliary_loss_clip": 0.01404914, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.24298143, "balance_loss_mlp": 1.01521134, "epoch": 0.7766120547121599, "flos": 52682930835840.0, "grad_norm": 1.4204809177215751, "language_loss": 0.65994239, "learning_rate": 5.009142341196919e-07, "loss": 0.68433303, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1895752, "step": 12917, "time_per_iteration": 3.153193712234497 }, { "auxiliary_loss_clip": 0.01401549, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.24165678, "balance_loss_mlp": 1.01564169, "epoch": 0.7766721779648279, "flos": 25167291246720.0, "grad_norm": 1.4470181056708658, "language_loss": 0.64872062, "learning_rate": 5.006564561294065e-07, "loss": 0.67308342, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19116211, "step": 12918, "time_per_iteration": 2.93001651763916 }, { "auxiliary_loss_clip": 0.01399099, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 1.24001288, "balance_loss_mlp": 1.0092988, "epoch": 0.7767323012174958, "flos": 23769141108480.0, "grad_norm": 2.0456692122967, "language_loss": 0.73853737, "learning_rate": 5.003987349943777e-07, "loss": 0.76280761, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1862793, "step": 12919, "time_per_iteration": 2.8447794914245605 }, { "auxiliary_loss_clip": 0.01412471, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.24922848, "balance_loss_mlp": 1.01239014, "epoch": 0.7767924244701638, "flos": 22095684489600.0, "grad_norm": 1.7103816970787662, "language_loss": 0.79628801, "learning_rate": 5.001410707243792e-07, "loss": 0.82073087, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19421387, "step": 12920, "time_per_iteration": 2.863222360610962 }, { "auxiliary_loss_clip": 0.01403743, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.24236584, "balance_loss_mlp": 1.01365542, "epoch": 0.7768525477228319, "flos": 21991811132160.0, "grad_norm": 1.5147490201452205, "language_loss": 0.71430427, "learning_rate": 4.998834633291829e-07, "loss": 0.7386685, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19018555, "step": 12921, "time_per_iteration": 2.8834228515625 }, { "auxiliary_loss_clip": 0.01411289, "auxiliary_loss_mlp": 0.01033825, "balance_loss_clip": 1.24811506, "balance_loss_mlp": 1.01425111, "epoch": 0.7769126709754998, "flos": 21803501698560.0, "grad_norm": 1.6706662816536082, "language_loss": 0.7694155, "learning_rate": 4.996259128185547e-07, "loss": 0.79386663, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19580078, "step": 12922, "time_per_iteration": 2.86824369430542 }, { "auxiliary_loss_clip": 0.01399293, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.24095106, "balance_loss_mlp": 1.01519096, "epoch": 0.7769727942281678, "flos": 20057689365120.0, "grad_norm": 1.7932728023572104, "language_loss": 0.81364036, "learning_rate": 4.993684192022625e-07, "loss": 0.83797121, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18591309, "step": 12923, "time_per_iteration": 2.8691940307617188 }, { "auxiliary_loss_clip": 0.01398589, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.24010944, "balance_loss_mlp": 1.01466453, "epoch": 0.7770329174808357, "flos": 21696099246720.0, "grad_norm": 1.9396210118800121, "language_loss": 0.92447531, "learning_rate": 4.991109824900699e-07, "loss": 0.94878137, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.17358398, "step": 12924, "time_per_iteration": 4.334291219711304 }, { "auxiliary_loss_clip": 0.01399095, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.23842335, "balance_loss_mlp": 1.01041198, "epoch": 0.7770930407335037, "flos": 25860461869440.0, "grad_norm": 1.9375200640683818, "language_loss": 0.66922653, "learning_rate": 4.988536026917401e-07, "loss": 0.69350624, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18469238, "step": 12925, "time_per_iteration": 2.8780159950256348 }, { "auxiliary_loss_clip": 0.01413769, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.24999297, "balance_loss_mlp": 1.01480806, "epoch": 0.7771531639861716, "flos": 24357262008960.0, "grad_norm": 1.9302850055585379, "language_loss": 0.72299391, "learning_rate": 4.985962798170314e-07, "loss": 0.74746281, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18310547, "step": 12926, "time_per_iteration": 2.9007248878479004 }, { "auxiliary_loss_clip": 0.0140978, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.24740434, "balance_loss_mlp": 1.00990427, "epoch": 0.7772132872388396, "flos": 25640996751360.0, "grad_norm": 2.7102063772777587, "language_loss": 0.66797996, "learning_rate": 4.983390138757027e-07, "loss": 0.69236153, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18481445, "step": 12927, "time_per_iteration": 5.581921815872192 }, { "auxiliary_loss_clip": 0.01399915, "auxiliary_loss_mlp": 0.01040382, "balance_loss_clip": 1.23862267, "balance_loss_mlp": 1.0192343, "epoch": 0.7772734104915076, "flos": 26078479153920.0, "grad_norm": 1.7177301025951173, "language_loss": 0.72823524, "learning_rate": 4.980818048775093e-07, "loss": 0.75263822, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.21142578, "step": 12928, "time_per_iteration": 2.88481068611145 }, { "auxiliary_loss_clip": 0.01386972, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 1.22883511, "balance_loss_mlp": 1.01254702, "epoch": 0.7773335337441756, "flos": 22934109479040.0, "grad_norm": 1.7365717101075782, "language_loss": 0.74834478, "learning_rate": 4.978246528322036e-07, "loss": 0.77251792, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.17810059, "step": 12929, "time_per_iteration": 2.8613927364349365 }, { "auxiliary_loss_clip": 0.01403881, "auxiliary_loss_mlp": 0.01032503, "balance_loss_clip": 1.24206829, "balance_loss_mlp": 1.0137279, "epoch": 0.7773936569968435, "flos": 20786404417920.0, "grad_norm": 2.1459134323343565, "language_loss": 0.78053153, "learning_rate": 4.975675577495377e-07, "loss": 0.8048954, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18774414, "step": 12930, "time_per_iteration": 2.841214418411255 }, { "auxiliary_loss_clip": 0.01411855, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.25208688, "balance_loss_mlp": 1.01397562, "epoch": 0.7774537802495115, "flos": 20380982595840.0, "grad_norm": 1.707436002694152, "language_loss": 0.80083668, "learning_rate": 4.973105196392613e-07, "loss": 0.82530153, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20654297, "step": 12931, "time_per_iteration": 2.8196892738342285 }, { "auxiliary_loss_clip": 0.01187964, "auxiliary_loss_mlp": 0.01047396, "balance_loss_clip": 1.09740114, "balance_loss_mlp": 1.02860916, "epoch": 0.7775139035021794, "flos": 53941409700480.0, "grad_norm": 0.8130677247362975, "language_loss": 0.59784985, "learning_rate": 4.970535385111199e-07, "loss": 0.6202035, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.1875, "step": 12932, "time_per_iteration": 3.305485725402832 }, { "auxiliary_loss_clip": 0.0140387, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.24128795, "balance_loss_mlp": 1.01503217, "epoch": 0.7775740267548474, "flos": 28854826473600.0, "grad_norm": 2.085943463338824, "language_loss": 0.76518631, "learning_rate": 4.967966143748595e-07, "loss": 0.78956276, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1875, "step": 12933, "time_per_iteration": 2.9381344318389893 }, { "auxiliary_loss_clip": 0.01393511, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.23433685, "balance_loss_mlp": 1.01111913, "epoch": 0.7776341500075155, "flos": 21882689377920.0, "grad_norm": 2.6204597020687426, "language_loss": 0.74065655, "learning_rate": 4.965397472402215e-07, "loss": 0.76489681, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19396973, "step": 12934, "time_per_iteration": 2.9044363498687744 }, { "auxiliary_loss_clip": 0.01407446, "auxiliary_loss_mlp": 0.01032343, "balance_loss_clip": 1.24561453, "balance_loss_mlp": 1.01241159, "epoch": 0.7776942732601834, "flos": 20239438302720.0, "grad_norm": 2.3637344195400773, "language_loss": 0.70880842, "learning_rate": 4.962829371169475e-07, "loss": 0.73320627, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19934082, "step": 12935, "time_per_iteration": 2.863309621810913 }, { "auxiliary_loss_clip": 0.01408723, "auxiliary_loss_mlp": 0.01034041, "balance_loss_clip": 1.24709415, "balance_loss_mlp": 1.01471734, "epoch": 0.7777543965128514, "flos": 22240712632320.0, "grad_norm": 1.655110887743921, "language_loss": 0.84023738, "learning_rate": 4.960261840147746e-07, "loss": 0.86466497, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19335938, "step": 12936, "time_per_iteration": 2.849276304244995 }, { "auxiliary_loss_clip": 0.01408111, "auxiliary_loss_mlp": 0.01027521, "balance_loss_clip": 1.24424469, "balance_loss_mlp": 1.00928164, "epoch": 0.7778145197655193, "flos": 14510152632960.0, "grad_norm": 2.053210204800827, "language_loss": 0.69038153, "learning_rate": 4.957694879434397e-07, "loss": 0.71473783, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18249512, "step": 12937, "time_per_iteration": 2.8591666221618652 }, { "auxiliary_loss_clip": 0.01409464, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.24660182, "balance_loss_mlp": 1.01134872, "epoch": 0.7778746430181873, "flos": 21149676069120.0, "grad_norm": 1.4955594043905462, "language_loss": 0.87539577, "learning_rate": 4.955128489126777e-07, "loss": 0.89979511, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19128418, "step": 12938, "time_per_iteration": 2.859717845916748 }, { "auxiliary_loss_clip": 0.0139337, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.23325539, "balance_loss_mlp": 1.01231384, "epoch": 0.7779347662708552, "flos": 20275978118400.0, "grad_norm": 2.063790364266617, "language_loss": 0.86109728, "learning_rate": 4.95256266932218e-07, "loss": 0.8853451, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19104004, "step": 12939, "time_per_iteration": 2.827425479888916 }, { "auxiliary_loss_clip": 0.01403351, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.2458595, "balance_loss_mlp": 1.00938725, "epoch": 0.7779948895235232, "flos": 19218540458880.0, "grad_norm": 1.6384197250057164, "language_loss": 0.70029455, "learning_rate": 4.949997420117915e-07, "loss": 0.72461772, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19580078, "step": 12940, "time_per_iteration": 2.831784963607788 }, { "auxiliary_loss_clip": 0.01408528, "auxiliary_loss_mlp": 0.01033473, "balance_loss_clip": 1.24606466, "balance_loss_mlp": 1.01484108, "epoch": 0.7780550127761912, "flos": 23925208982400.0, "grad_norm": 1.533306348141608, "language_loss": 0.78259188, "learning_rate": 4.947432741611255e-07, "loss": 0.8070119, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18640137, "step": 12941, "time_per_iteration": 2.8709566593170166 }, { "auxiliary_loss_clip": 0.01407493, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.24196196, "balance_loss_mlp": 1.01773989, "epoch": 0.7781151360288592, "flos": 32428670221440.0, "grad_norm": 2.4282514478124124, "language_loss": 0.74448013, "learning_rate": 4.944868633899462e-07, "loss": 0.76894474, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.21240234, "step": 12942, "time_per_iteration": 2.9307773113250732 }, { "auxiliary_loss_clip": 0.01385679, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.22930598, "balance_loss_mlp": 1.01434016, "epoch": 0.7781752592815271, "flos": 22356621106560.0, "grad_norm": 1.8541942870245243, "language_loss": 0.67977583, "learning_rate": 4.942305097079751e-07, "loss": 0.70396638, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19042969, "step": 12943, "time_per_iteration": 2.8550002574920654 }, { "auxiliary_loss_clip": 0.01192954, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.09987271, "balance_loss_mlp": 1.0073905, "epoch": 0.7782353825341951, "flos": 70489539475200.0, "grad_norm": 0.78305372188722, "language_loss": 0.58594441, "learning_rate": 4.939742131249347e-07, "loss": 0.60817766, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.22949219, "step": 12944, "time_per_iteration": 3.5229294300079346 }, { "auxiliary_loss_clip": 0.01402089, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.24046481, "balance_loss_mlp": 1.01230288, "epoch": 0.778295505786863, "flos": 19071928748160.0, "grad_norm": 13.09127850311997, "language_loss": 0.68005621, "learning_rate": 4.937179736505428e-07, "loss": 0.7043997, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19958496, "step": 12945, "time_per_iteration": 2.8445849418640137 }, { "auxiliary_loss_clip": 0.01399312, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.23858118, "balance_loss_mlp": 1.016258, "epoch": 0.778355629039531, "flos": 21010484505600.0, "grad_norm": 2.6435631212804602, "language_loss": 0.69987553, "learning_rate": 4.93461791294516e-07, "loss": 0.72422504, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19384766, "step": 12946, "time_per_iteration": 2.86156964302063 }, { "auxiliary_loss_clip": 0.01414371, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.25164771, "balance_loss_mlp": 1.01487184, "epoch": 0.7784157522921991, "flos": 21408169466880.0, "grad_norm": 2.5102933487474703, "language_loss": 0.66183341, "learning_rate": 4.932056660665689e-07, "loss": 0.68632096, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19506836, "step": 12947, "time_per_iteration": 2.8440563678741455 }, { "auxiliary_loss_clip": 0.013984, "auxiliary_loss_mlp": 0.01032418, "balance_loss_clip": 1.23947525, "balance_loss_mlp": 1.01324916, "epoch": 0.778475875544867, "flos": 20823668150400.0, "grad_norm": 2.2010231519347347, "language_loss": 0.66081274, "learning_rate": 4.929495979764147e-07, "loss": 0.68512094, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19177246, "step": 12948, "time_per_iteration": 2.829538583755493 }, { "auxiliary_loss_clip": 0.01395583, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.23671436, "balance_loss_mlp": 1.01188302, "epoch": 0.778535998797535, "flos": 14363359943040.0, "grad_norm": 1.7954011018244624, "language_loss": 0.76123285, "learning_rate": 4.926935870337625e-07, "loss": 0.78549665, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18896484, "step": 12949, "time_per_iteration": 2.8459153175354004 }, { "auxiliary_loss_clip": 0.0142143, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.25796533, "balance_loss_mlp": 1.01505184, "epoch": 0.7785961220502029, "flos": 19219219130880.0, "grad_norm": 1.433289863327755, "language_loss": 0.69351196, "learning_rate": 4.924376332483202e-07, "loss": 0.71807122, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19458008, "step": 12950, "time_per_iteration": 2.9117207527160645 }, { "auxiliary_loss_clip": 0.01418873, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.25449967, "balance_loss_mlp": 1.01395464, "epoch": 0.7786562453028709, "flos": 25749666057600.0, "grad_norm": 1.6224947516160673, "language_loss": 0.72752893, "learning_rate": 4.921817366297938e-07, "loss": 0.75204456, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.18762207, "step": 12951, "time_per_iteration": 4.233428955078125 }, { "auxiliary_loss_clip": 0.01402635, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.24416804, "balance_loss_mlp": 1.01406157, "epoch": 0.7787163685555388, "flos": 25750525708800.0, "grad_norm": 1.6032221459014897, "language_loss": 0.66260016, "learning_rate": 4.919258971878877e-07, "loss": 0.68695247, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1854248, "step": 12952, "time_per_iteration": 2.862560987472534 }, { "auxiliary_loss_clip": 0.01378662, "auxiliary_loss_mlp": 0.01033863, "balance_loss_clip": 1.22561955, "balance_loss_mlp": 1.01537347, "epoch": 0.7787764918082068, "flos": 22758061386240.0, "grad_norm": 1.5432141301126079, "language_loss": 0.81803787, "learning_rate": 4.916701149323022e-07, "loss": 0.84216309, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.18493652, "step": 12953, "time_per_iteration": 2.880923271179199 }, { "auxiliary_loss_clip": 0.01407447, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.24497068, "balance_loss_mlp": 1.01257372, "epoch": 0.7788366150608748, "flos": 15198120103680.0, "grad_norm": 3.3145636123312854, "language_loss": 0.77881086, "learning_rate": 4.91414389872737e-07, "loss": 0.80320239, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19152832, "step": 12954, "time_per_iteration": 2.843950033187866 }, { "auxiliary_loss_clip": 0.01416348, "auxiliary_loss_mlp": 0.01028465, "balance_loss_clip": 1.25120449, "balance_loss_mlp": 1.00997615, "epoch": 0.7788967383135428, "flos": 21218864647680.0, "grad_norm": 1.545202496930838, "language_loss": 0.73318964, "learning_rate": 4.911587220188905e-07, "loss": 0.7576378, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18481445, "step": 12955, "time_per_iteration": 2.813735008239746 }, { "auxiliary_loss_clip": 0.01404127, "auxiliary_loss_mlp": 0.01032744, "balance_loss_clip": 1.24253106, "balance_loss_mlp": 1.01290751, "epoch": 0.7789568615662107, "flos": 21691303297920.0, "grad_norm": 1.6548950002426, "language_loss": 0.69277, "learning_rate": 4.909031113804551e-07, "loss": 0.71713865, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19824219, "step": 12956, "time_per_iteration": 2.8721396923065186 }, { "auxiliary_loss_clip": 0.01403684, "auxiliary_loss_mlp": 0.01033778, "balance_loss_clip": 1.24297619, "balance_loss_mlp": 1.01536012, "epoch": 0.7790169848188787, "flos": 26371793064960.0, "grad_norm": 1.53842506214568, "language_loss": 0.76611096, "learning_rate": 4.906475579671252e-07, "loss": 0.79048556, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18432617, "step": 12957, "time_per_iteration": 2.9441137313842773 }, { "auxiliary_loss_clip": 0.01397612, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.23756099, "balance_loss_mlp": 1.0126555, "epoch": 0.7790771080715466, "flos": 25526083662720.0, "grad_norm": 1.8625265923912484, "language_loss": 0.78205442, "learning_rate": 4.903920617885917e-07, "loss": 0.80634075, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18383789, "step": 12958, "time_per_iteration": 4.347756624221802 }, { "auxiliary_loss_clip": 0.01399235, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.23855996, "balance_loss_mlp": 1.0132463, "epoch": 0.7791372313242146, "flos": 16042743630720.0, "grad_norm": 2.220206722293401, "language_loss": 0.72488797, "learning_rate": 4.901366228545418e-07, "loss": 0.74920774, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19506836, "step": 12959, "time_per_iteration": 2.8351247310638428 }, { "auxiliary_loss_clip": 0.01400103, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.23984671, "balance_loss_mlp": 1.01712012, "epoch": 0.7791973545768827, "flos": 23852808023040.0, "grad_norm": 1.6383521964250731, "language_loss": 0.78242111, "learning_rate": 4.898812411746632e-07, "loss": 0.80678552, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19213867, "step": 12960, "time_per_iteration": 2.8474748134613037 }, { "auxiliary_loss_clip": 0.01413326, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.2514745, "balance_loss_mlp": 1.01618087, "epoch": 0.7792574778295506, "flos": 24178499228160.0, "grad_norm": 1.8550942028120934, "language_loss": 0.76104397, "learning_rate": 4.896259167586385e-07, "loss": 0.78552806, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18920898, "step": 12961, "time_per_iteration": 2.834251880645752 }, { "auxiliary_loss_clip": 0.0138865, "auxiliary_loss_mlp": 0.01037461, "balance_loss_clip": 1.23371911, "balance_loss_mlp": 1.01882839, "epoch": 0.7793176010822186, "flos": 21473421747840.0, "grad_norm": 1.5323738741869304, "language_loss": 0.73918855, "learning_rate": 4.893706496161511e-07, "loss": 0.76344967, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18640137, "step": 12962, "time_per_iteration": 5.5876686573028564 }, { "auxiliary_loss_clip": 0.01390097, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.2317785, "balance_loss_mlp": 1.01305556, "epoch": 0.7793777243348865, "flos": 20676151543680.0, "grad_norm": 1.8368777317580933, "language_loss": 0.70875698, "learning_rate": 4.891154397568795e-07, "loss": 0.73297524, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18676758, "step": 12963, "time_per_iteration": 2.8474674224853516 }, { "auxiliary_loss_clip": 0.01402998, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.24561644, "balance_loss_mlp": 1.01504564, "epoch": 0.7794378475875545, "flos": 27137183667840.0, "grad_norm": 2.411470811887559, "language_loss": 0.64249909, "learning_rate": 4.888602871905019e-07, "loss": 0.66687781, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19836426, "step": 12964, "time_per_iteration": 2.872852325439453 }, { "auxiliary_loss_clip": 0.01405611, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.24306488, "balance_loss_mlp": 1.01664305, "epoch": 0.7794979708402224, "flos": 28085725797120.0, "grad_norm": 1.5361110320124767, "language_loss": 0.76802999, "learning_rate": 4.88605191926694e-07, "loss": 0.792436, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18347168, "step": 12965, "time_per_iteration": 2.8604252338409424 }, { "auxiliary_loss_clip": 0.01382127, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.22839141, "balance_loss_mlp": 1.01716566, "epoch": 0.7795580940928905, "flos": 26880681041280.0, "grad_norm": 3.1888611739312784, "language_loss": 0.73282957, "learning_rate": 4.883501539751289e-07, "loss": 0.75701338, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.1907959, "step": 12966, "time_per_iteration": 2.912429094314575 }, { "auxiliary_loss_clip": 0.01393035, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.23828745, "balance_loss_mlp": 1.01135206, "epoch": 0.7796182173455584, "flos": 23844211511040.0, "grad_norm": 1.5776844912275862, "language_loss": 0.74684435, "learning_rate": 4.880951733454768e-07, "loss": 0.77106309, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.17492676, "step": 12967, "time_per_iteration": 2.849928140640259 }, { "auxiliary_loss_clip": 0.0139797, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.23745298, "balance_loss_mlp": 1.01257157, "epoch": 0.7796783405982264, "flos": 19801774920960.0, "grad_norm": 2.2392398712681008, "language_loss": 0.73602057, "learning_rate": 4.878402500474073e-07, "loss": 0.76031089, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18481445, "step": 12968, "time_per_iteration": 2.862865924835205 }, { "auxiliary_loss_clip": 0.01396531, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 1.23833764, "balance_loss_mlp": 1.01477194, "epoch": 0.7797384638508943, "flos": 15458559027840.0, "grad_norm": 1.817578737414817, "language_loss": 0.62199211, "learning_rate": 4.875853840905874e-07, "loss": 0.64629197, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18676758, "step": 12969, "time_per_iteration": 2.8153748512268066 }, { "auxiliary_loss_clip": 0.01375265, "auxiliary_loss_mlp": 0.01031271, "balance_loss_clip": 1.22070956, "balance_loss_mlp": 1.01328206, "epoch": 0.7797985871035623, "flos": 20932427946240.0, "grad_norm": 2.1063751260244343, "language_loss": 0.71360373, "learning_rate": 4.873305754846811e-07, "loss": 0.73766911, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.17993164, "step": 12970, "time_per_iteration": 2.8249034881591797 }, { "auxiliary_loss_clip": 0.01401899, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.24266505, "balance_loss_mlp": 1.01576185, "epoch": 0.7798587103562302, "flos": 36950332181760.0, "grad_norm": 1.7603112424268743, "language_loss": 0.72598779, "learning_rate": 4.870758242393507e-07, "loss": 0.75036234, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19775391, "step": 12971, "time_per_iteration": 2.9629738330841064 }, { "auxiliary_loss_clip": 0.01413932, "auxiliary_loss_mlp": 0.0103439, "balance_loss_clip": 1.24824786, "balance_loss_mlp": 1.0152812, "epoch": 0.7799188336088982, "flos": 22429519758720.0, "grad_norm": 3.8867979568180018, "language_loss": 0.75040442, "learning_rate": 4.868211303642578e-07, "loss": 0.77488768, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19091797, "step": 12972, "time_per_iteration": 2.86275053024292 }, { "auxiliary_loss_clip": 0.01405619, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.24479675, "balance_loss_mlp": 1.01187181, "epoch": 0.7799789568615663, "flos": 18889863096960.0, "grad_norm": 1.751668256941028, "language_loss": 0.72695756, "learning_rate": 4.865664938690584e-07, "loss": 0.75131947, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18713379, "step": 12973, "time_per_iteration": 2.8891513347625732 }, { "auxiliary_loss_clip": 0.01395706, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.23709476, "balance_loss_mlp": 1.0125109, "epoch": 0.7800390801142342, "flos": 20270820211200.0, "grad_norm": 1.7938447069518013, "language_loss": 0.78306472, "learning_rate": 4.863119147634089e-07, "loss": 0.80733186, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18493652, "step": 12974, "time_per_iteration": 2.8275368213653564 }, { "auxiliary_loss_clip": 0.01401878, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.24311996, "balance_loss_mlp": 1.01277232, "epoch": 0.7800992033669022, "flos": 16698876744960.0, "grad_norm": 1.6201087187802843, "language_loss": 0.69994426, "learning_rate": 4.86057393056964e-07, "loss": 0.72428024, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18969727, "step": 12975, "time_per_iteration": 2.824751615524292 }, { "auxiliary_loss_clip": 0.01395474, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.23736954, "balance_loss_mlp": 1.01392031, "epoch": 0.7801593266195701, "flos": 18593653518720.0, "grad_norm": 1.8371547915935698, "language_loss": 0.82985628, "learning_rate": 4.858029287593739e-07, "loss": 0.85412776, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1776123, "step": 12976, "time_per_iteration": 2.843679666519165 }, { "auxiliary_loss_clip": 0.01403505, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.24148643, "balance_loss_mlp": 1.01362848, "epoch": 0.7802194498722381, "flos": 25496194832640.0, "grad_norm": 1.4074638019666617, "language_loss": 0.66324306, "learning_rate": 4.85548521880289e-07, "loss": 0.68760455, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19030762, "step": 12977, "time_per_iteration": 2.944409132003784 }, { "auxiliary_loss_clip": 0.01398068, "auxiliary_loss_mlp": 0.01032742, "balance_loss_clip": 1.23974121, "balance_loss_mlp": 1.01414573, "epoch": 0.780279573124906, "flos": 31188940686720.0, "grad_norm": 1.3753715698584794, "language_loss": 0.75424886, "learning_rate": 4.852941724293554e-07, "loss": 0.77855694, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18615723, "step": 12978, "time_per_iteration": 2.9356861114501953 }, { "auxiliary_loss_clip": 0.01414495, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.24973762, "balance_loss_mlp": 1.0162735, "epoch": 0.780339696377574, "flos": 26955570464640.0, "grad_norm": 4.014597087676522, "language_loss": 0.61995471, "learning_rate": 4.85039880416219e-07, "loss": 0.64445448, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.1920166, "step": 12979, "time_per_iteration": 2.893747568130493 }, { "auxiliary_loss_clip": 0.01396486, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.23687923, "balance_loss_mlp": 1.01148605, "epoch": 0.780399819630242, "flos": 27967102634880.0, "grad_norm": 2.381779203198935, "language_loss": 0.78247046, "learning_rate": 4.847856458505217e-07, "loss": 0.80673552, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1854248, "step": 12980, "time_per_iteration": 2.917649507522583 }, { "auxiliary_loss_clip": 0.01410458, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 1.24794722, "balance_loss_mlp": 1.01786029, "epoch": 0.78045994288291, "flos": 22495631690880.0, "grad_norm": 1.9981213780639615, "language_loss": 0.77858138, "learning_rate": 4.845314687419046e-07, "loss": 0.80306363, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19897461, "step": 12981, "time_per_iteration": 2.9370310306549072 }, { "auxiliary_loss_clip": 0.01394291, "auxiliary_loss_mlp": 0.01033498, "balance_loss_clip": 1.23474479, "balance_loss_mlp": 1.01434159, "epoch": 0.7805200661355779, "flos": 20860796148480.0, "grad_norm": 1.9711577020420064, "language_loss": 0.74030173, "learning_rate": 4.842773491000067e-07, "loss": 0.76457965, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19152832, "step": 12982, "time_per_iteration": 2.910907745361328 }, { "auxiliary_loss_clip": 0.01400368, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.24050689, "balance_loss_mlp": 1.01414323, "epoch": 0.7805801893882459, "flos": 25676903139840.0, "grad_norm": 1.9527304102802039, "language_loss": 0.73905575, "learning_rate": 4.840232869344636e-07, "loss": 0.76337808, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.17724609, "step": 12983, "time_per_iteration": 2.8859875202178955 }, { "auxiliary_loss_clip": 0.01405111, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.2457881, "balance_loss_mlp": 1.01211262, "epoch": 0.7806403126409138, "flos": 11335215456000.0, "grad_norm": 1.9913788675929787, "language_loss": 0.75110692, "learning_rate": 4.837692822549086e-07, "loss": 0.77546477, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18566895, "step": 12984, "time_per_iteration": 2.8306057453155518 }, { "auxiliary_loss_clip": 0.01398076, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.23818421, "balance_loss_mlp": 1.01323295, "epoch": 0.7807004358935818, "flos": 19582852740480.0, "grad_norm": 1.8753534586343803, "language_loss": 0.82323694, "learning_rate": 4.835153350709746e-07, "loss": 0.84753144, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18139648, "step": 12985, "time_per_iteration": 2.8380794525146484 }, { "auxiliary_loss_clip": 0.01394676, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.23551536, "balance_loss_mlp": 1.01547968, "epoch": 0.7807605591462499, "flos": 19145279848320.0, "grad_norm": 2.6271716510012415, "language_loss": 0.77886826, "learning_rate": 4.832614453922915e-07, "loss": 0.80316341, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19372559, "step": 12986, "time_per_iteration": 4.284211158752441 }, { "auxiliary_loss_clip": 0.0139503, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.23478603, "balance_loss_mlp": 1.01075411, "epoch": 0.7808206823989178, "flos": 32386112847360.0, "grad_norm": 1.5963310433853592, "language_loss": 0.75058842, "learning_rate": 4.830076132284859e-07, "loss": 0.77483344, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18725586, "step": 12987, "time_per_iteration": 2.9402637481689453 }, { "auxiliary_loss_clip": 0.0118706, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.09814739, "balance_loss_mlp": 1.01059747, "epoch": 0.7808808056515858, "flos": 55081699868160.0, "grad_norm": 0.7283969219330716, "language_loss": 0.55091417, "learning_rate": 4.82753838589184e-07, "loss": 0.57309961, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20898438, "step": 12988, "time_per_iteration": 3.362290859222412 }, { "auxiliary_loss_clip": 0.01383568, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.22782731, "balance_loss_mlp": 1.01432562, "epoch": 0.7809409289042537, "flos": 12867082536960.0, "grad_norm": 3.1877027573436476, "language_loss": 0.81424177, "learning_rate": 4.82500121484009e-07, "loss": 0.83839786, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.17700195, "step": 12989, "time_per_iteration": 2.793565034866333 }, { "auxiliary_loss_clip": 0.01385369, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.22976828, "balance_loss_mlp": 1.01260829, "epoch": 0.7810010521569217, "flos": 21696687429120.0, "grad_norm": 1.4934553134017414, "language_loss": 0.70886892, "learning_rate": 4.822464619225806e-07, "loss": 0.73303962, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.1907959, "step": 12990, "time_per_iteration": 2.9377901554107666 }, { "auxiliary_loss_clip": 0.01398851, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.23883104, "balance_loss_mlp": 1.01356614, "epoch": 0.7810611754095896, "flos": 16764038536320.0, "grad_norm": 2.972776726195489, "language_loss": 0.7879836, "learning_rate": 4.819928599145184e-07, "loss": 0.81230968, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.20202637, "step": 12991, "time_per_iteration": 2.8548505306243896 }, { "auxiliary_loss_clip": 0.01399475, "auxiliary_loss_mlp": 0.01037227, "balance_loss_clip": 1.2380898, "balance_loss_mlp": 1.01783168, "epoch": 0.7811212986622577, "flos": 43523019768960.0, "grad_norm": 1.7988634875313585, "language_loss": 0.66935432, "learning_rate": 4.817393154694398e-07, "loss": 0.69372129, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19396973, "step": 12992, "time_per_iteration": 3.0471010208129883 }, { "auxiliary_loss_clip": 0.01405538, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.24336326, "balance_loss_mlp": 1.01383686, "epoch": 0.7811814219149256, "flos": 21766509434880.0, "grad_norm": 1.79085967750718, "language_loss": 0.61930048, "learning_rate": 4.814858285969578e-07, "loss": 0.64367706, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18273926, "step": 12993, "time_per_iteration": 4.325740098953247 }, { "auxiliary_loss_clip": 0.01392028, "auxiliary_loss_mlp": 0.01030398, "balance_loss_clip": 1.23493564, "balance_loss_mlp": 1.01200414, "epoch": 0.7812415451675936, "flos": 24072273141120.0, "grad_norm": 1.5457258450376856, "language_loss": 0.69388568, "learning_rate": 4.812323993066862e-07, "loss": 0.71810997, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18395996, "step": 12994, "time_per_iteration": 2.8494036197662354 }, { "auxiliary_loss_clip": 0.01395785, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.23669672, "balance_loss_mlp": 1.01480854, "epoch": 0.7813016684202615, "flos": 18999075340800.0, "grad_norm": 2.1098154628228367, "language_loss": 0.70483458, "learning_rate": 4.809790276082335e-07, "loss": 0.72912621, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18579102, "step": 12995, "time_per_iteration": 2.8093745708465576 }, { "auxiliary_loss_clip": 0.01371715, "auxiliary_loss_mlp": 0.0103059, "balance_loss_clip": 1.21897769, "balance_loss_mlp": 1.01304245, "epoch": 0.7813617916729295, "flos": 25270621666560.0, "grad_norm": 1.6194757677352019, "language_loss": 0.75604224, "learning_rate": 4.807257135112088e-07, "loss": 0.7800653, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.17565918, "step": 12996, "time_per_iteration": 2.8690731525421143 }, { "auxiliary_loss_clip": 0.01416956, "auxiliary_loss_mlp": 0.01030072, "balance_loss_clip": 1.25253415, "balance_loss_mlp": 1.01053321, "epoch": 0.7814219149255974, "flos": 17974603157760.0, "grad_norm": 2.8501421898219523, "language_loss": 0.69705385, "learning_rate": 4.804724570252167e-07, "loss": 0.72152418, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1953125, "step": 12997, "time_per_iteration": 5.647717714309692 }, { "auxiliary_loss_clip": 0.01411277, "auxiliary_loss_mlp": 0.01033515, "balance_loss_clip": 1.24642229, "balance_loss_mlp": 1.01444173, "epoch": 0.7814820381782654, "flos": 25787336993280.0, "grad_norm": 1.7618086406920521, "language_loss": 0.83031404, "learning_rate": 4.802192581598614e-07, "loss": 0.85476196, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19067383, "step": 12998, "time_per_iteration": 2.881556749343872 }, { "auxiliary_loss_clip": 0.01404129, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.24162447, "balance_loss_mlp": 1.01312971, "epoch": 0.7815421614309335, "flos": 20528680181760.0, "grad_norm": 2.1304975275250118, "language_loss": 0.75691235, "learning_rate": 4.799661169247453e-07, "loss": 0.78128266, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19775391, "step": 12999, "time_per_iteration": 2.8413801193237305 }, { "auxiliary_loss_clip": 0.01408344, "auxiliary_loss_mlp": 0.01036262, "balance_loss_clip": 1.24572933, "balance_loss_mlp": 1.01704502, "epoch": 0.7816022846836014, "flos": 21297464144640.0, "grad_norm": 1.4709608665299407, "language_loss": 0.85027063, "learning_rate": 4.797130333294652e-07, "loss": 0.8747167, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1920166, "step": 13000, "time_per_iteration": 2.851088762283325 }, { "auxiliary_loss_clip": 0.01407115, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.24545383, "balance_loss_mlp": 1.01213837, "epoch": 0.7816624079362694, "flos": 19217771297280.0, "grad_norm": 2.10151247921773, "language_loss": 0.66874242, "learning_rate": 4.794600073836192e-07, "loss": 0.69313258, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19775391, "step": 13001, "time_per_iteration": 2.785374641418457 }, { "auxiliary_loss_clip": 0.01409438, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.24666584, "balance_loss_mlp": 1.01562381, "epoch": 0.7817225311889373, "flos": 26115969110400.0, "grad_norm": 1.521587902980314, "language_loss": 0.6775111, "learning_rate": 4.792070390968027e-07, "loss": 0.70194387, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18225098, "step": 13002, "time_per_iteration": 2.8679866790771484 }, { "auxiliary_loss_clip": 0.01408384, "auxiliary_loss_mlp": 0.01038007, "balance_loss_clip": 1.24580884, "balance_loss_mlp": 1.01842046, "epoch": 0.7817826544416053, "flos": 21260652860160.0, "grad_norm": 2.148328675645518, "language_loss": 0.74268848, "learning_rate": 4.78954128478607e-07, "loss": 0.76715243, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19555664, "step": 13003, "time_per_iteration": 2.8665497303009033 }, { "auxiliary_loss_clip": 0.01389252, "auxiliary_loss_mlp": 0.010376, "balance_loss_clip": 1.23072743, "balance_loss_mlp": 1.01803756, "epoch": 0.7818427776942732, "flos": 19940197322880.0, "grad_norm": 3.5561849624486754, "language_loss": 0.62639415, "learning_rate": 4.787012755386233e-07, "loss": 0.65066266, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19555664, "step": 13004, "time_per_iteration": 2.8020849227905273 }, { "auxiliary_loss_clip": 0.01375966, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.22148705, "balance_loss_mlp": 1.01290345, "epoch": 0.7819029009469413, "flos": 11371031354880.0, "grad_norm": 4.260272313095162, "language_loss": 0.83851421, "learning_rate": 4.784484802864403e-07, "loss": 0.86258328, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.18041992, "step": 13005, "time_per_iteration": 2.803419828414917 }, { "auxiliary_loss_clip": 0.01397976, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.237921, "balance_loss_mlp": 1.01691413, "epoch": 0.7819630241996092, "flos": 24289792732800.0, "grad_norm": 2.7749163928906517, "language_loss": 0.73403215, "learning_rate": 4.781957427316432e-07, "loss": 0.75836658, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18566895, "step": 13006, "time_per_iteration": 2.866915225982666 }, { "auxiliary_loss_clip": 0.01404715, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.24203992, "balance_loss_mlp": 1.01379168, "epoch": 0.7820231474522772, "flos": 22718625903360.0, "grad_norm": 1.558104661022574, "language_loss": 0.72805524, "learning_rate": 4.779430628838157e-07, "loss": 0.7524246, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18432617, "step": 13007, "time_per_iteration": 2.8378140926361084 }, { "auxiliary_loss_clip": 0.01401215, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.23960781, "balance_loss_mlp": 1.01192307, "epoch": 0.7820832707049451, "flos": 20056965448320.0, "grad_norm": 2.170121632712835, "language_loss": 0.70121992, "learning_rate": 4.776904407525397e-07, "loss": 0.72554159, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19018555, "step": 13008, "time_per_iteration": 2.8372271060943604 }, { "auxiliary_loss_clip": 0.01411852, "auxiliary_loss_mlp": 0.01032787, "balance_loss_clip": 1.25099647, "balance_loss_mlp": 1.0137012, "epoch": 0.7821433939576131, "flos": 27174356910720.0, "grad_norm": 3.7471443092366563, "language_loss": 0.7061227, "learning_rate": 4.774378763473954e-07, "loss": 0.73056906, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19091797, "step": 13009, "time_per_iteration": 2.885308265686035 }, { "auxiliary_loss_clip": 0.01392274, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.23347044, "balance_loss_mlp": 1.01469088, "epoch": 0.782203517210281, "flos": 22612445061120.0, "grad_norm": 1.77202396738767, "language_loss": 0.82575548, "learning_rate": 4.771853696779586e-07, "loss": 0.85002172, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1965332, "step": 13010, "time_per_iteration": 2.8283424377441406 }, { "auxiliary_loss_clip": 0.01397207, "auxiliary_loss_mlp": 0.01032522, "balance_loss_clip": 1.23939121, "balance_loss_mlp": 1.01383042, "epoch": 0.782263640462949, "flos": 29071803127680.0, "grad_norm": 1.4449851263979883, "language_loss": 0.63146597, "learning_rate": 4.76932920753806e-07, "loss": 0.65576327, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18676758, "step": 13011, "time_per_iteration": 2.9336981773376465 }, { "auxiliary_loss_clip": 0.01403595, "auxiliary_loss_mlp": 0.01029083, "balance_loss_clip": 1.2452898, "balance_loss_mlp": 1.01138091, "epoch": 0.782323763715617, "flos": 25309921415040.0, "grad_norm": 1.6994300097498223, "language_loss": 0.70273566, "learning_rate": 4.7668052958450913e-07, "loss": 0.7270624, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.17712402, "step": 13012, "time_per_iteration": 2.910227060317993 }, { "auxiliary_loss_clip": 0.01188184, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.09841585, "balance_loss_mlp": 1.00784063, "epoch": 0.782383886968285, "flos": 65228774140800.0, "grad_norm": 0.7057369109183197, "language_loss": 0.55032551, "learning_rate": 4.764281961796395e-07, "loss": 0.57246125, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.17578125, "step": 13013, "time_per_iteration": 3.445112943649292 }, { "auxiliary_loss_clip": 0.01413286, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.25016546, "balance_loss_mlp": 1.01416552, "epoch": 0.782444010220953, "flos": 18414528779520.0, "grad_norm": 1.9350609671886692, "language_loss": 0.66058284, "learning_rate": 4.76175920548765e-07, "loss": 0.68504441, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18688965, "step": 13014, "time_per_iteration": 2.8648433685302734 }, { "auxiliary_loss_clip": 0.01187829, "auxiliary_loss_mlp": 0.01026315, "balance_loss_clip": 1.09857166, "balance_loss_mlp": 1.00810003, "epoch": 0.7825041334736209, "flos": 63989361319680.0, "grad_norm": 0.7216210942972047, "language_loss": 0.5847705, "learning_rate": 4.759237027014524e-07, "loss": 0.60691196, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18261719, "step": 13015, "time_per_iteration": 3.3883609771728516 }, { "auxiliary_loss_clip": 0.01394898, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.23588991, "balance_loss_mlp": 1.01632619, "epoch": 0.7825642567262889, "flos": 20349012504960.0, "grad_norm": 1.665599260359687, "language_loss": 0.75652218, "learning_rate": 4.756715426472666e-07, "loss": 0.78082299, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18847656, "step": 13016, "time_per_iteration": 2.83048415184021 }, { "auxiliary_loss_clip": 0.01401928, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.24022222, "balance_loss_mlp": 1.01321018, "epoch": 0.7826243799789568, "flos": 20271679862400.0, "grad_norm": 1.89620983116403, "language_loss": 0.75568002, "learning_rate": 4.7541944039576766e-07, "loss": 0.78002918, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19775391, "step": 13017, "time_per_iteration": 2.8532748222351074 }, { "auxiliary_loss_clip": 0.01404532, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.2421999, "balance_loss_mlp": 1.01539922, "epoch": 0.7826845032316249, "flos": 21140400885120.0, "grad_norm": 1.9854552585107719, "language_loss": 0.76432729, "learning_rate": 4.7516739595651636e-07, "loss": 0.78872454, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19763184, "step": 13018, "time_per_iteration": 2.8410983085632324 }, { "auxiliary_loss_clip": 0.01401185, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.24076712, "balance_loss_mlp": 1.01308477, "epoch": 0.7827446264842928, "flos": 22502508900480.0, "grad_norm": 1.7139443538740933, "language_loss": 0.77774203, "learning_rate": 4.749154093390708e-07, "loss": 0.80207515, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19030762, "step": 13019, "time_per_iteration": 2.9361085891723633 }, { "auxiliary_loss_clip": 0.01391661, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.23201227, "balance_loss_mlp": 1.01201153, "epoch": 0.7828047497369608, "flos": 28852518988800.0, "grad_norm": 1.4017092587619229, "language_loss": 0.6789971, "learning_rate": 4.746634805529852e-07, "loss": 0.70322847, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19482422, "step": 13020, "time_per_iteration": 2.9403278827667236 }, { "auxiliary_loss_clip": 0.0141425, "auxiliary_loss_mlp": 0.01035069, "balance_loss_clip": 1.25452352, "balance_loss_mlp": 1.01615095, "epoch": 0.7828648729896287, "flos": 23267944748160.0, "grad_norm": 1.9142440350307939, "language_loss": 0.6351428, "learning_rate": 4.7441160960781325e-07, "loss": 0.6596359, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18945312, "step": 13021, "time_per_iteration": 4.308583736419678 }, { "auxiliary_loss_clip": 0.01402818, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.24253356, "balance_loss_mlp": 1.01833558, "epoch": 0.7829249962422967, "flos": 25276774959360.0, "grad_norm": 1.8221266527442146, "language_loss": 0.69956446, "learning_rate": 4.7415979651310636e-07, "loss": 0.72397202, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19580078, "step": 13022, "time_per_iteration": 2.8561339378356934 }, { "auxiliary_loss_clip": 0.01189093, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.10087299, "balance_loss_mlp": 1.01266384, "epoch": 0.7829851194949646, "flos": 70753145535360.0, "grad_norm": 0.9201245487460189, "language_loss": 0.56132513, "learning_rate": 4.739080412784131e-07, "loss": 0.5835411, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19824219, "step": 13023, "time_per_iteration": 3.4867708683013916 }, { "auxiliary_loss_clip": 0.01382799, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.22768664, "balance_loss_mlp": 1.0148654, "epoch": 0.7830452427476327, "flos": 25670387888640.0, "grad_norm": 1.6293329751410373, "language_loss": 0.67138052, "learning_rate": 4.736563439132792e-07, "loss": 0.69553208, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.17504883, "step": 13024, "time_per_iteration": 2.8926727771759033 }, { "auxiliary_loss_clip": 0.01412354, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.24921727, "balance_loss_mlp": 1.01301646, "epoch": 0.7831053660003006, "flos": 22794691691520.0, "grad_norm": 1.7695032788532197, "language_loss": 0.77935326, "learning_rate": 4.734047044272498e-07, "loss": 0.80380332, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19628906, "step": 13025, "time_per_iteration": 2.899077892303467 }, { "auxiliary_loss_clip": 0.01394259, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.23616874, "balance_loss_mlp": 1.01297259, "epoch": 0.7831654892529686, "flos": 25823333871360.0, "grad_norm": 1.7283572912796994, "language_loss": 0.79432505, "learning_rate": 4.731531228298673e-07, "loss": 0.81858194, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18457031, "step": 13026, "time_per_iteration": 2.890242576599121 }, { "auxiliary_loss_clip": 0.01397609, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.23795974, "balance_loss_mlp": 1.01126337, "epoch": 0.7832256125056366, "flos": 20779843921920.0, "grad_norm": 1.9161527871519994, "language_loss": 0.76276159, "learning_rate": 4.729015991306715e-07, "loss": 0.78703439, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18408203, "step": 13027, "time_per_iteration": 2.829946279525757 }, { "auxiliary_loss_clip": 0.01383651, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.22722173, "balance_loss_mlp": 1.01246309, "epoch": 0.7832857357583045, "flos": 21516386325120.0, "grad_norm": 1.775939269878815, "language_loss": 0.71456814, "learning_rate": 4.726501333391997e-07, "loss": 0.73871529, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1862793, "step": 13028, "time_per_iteration": 4.281008720397949 }, { "auxiliary_loss_clip": 0.0140888, "auxiliary_loss_mlp": 0.01034941, "balance_loss_clip": 1.24558187, "balance_loss_mlp": 1.01531923, "epoch": 0.7833458590109725, "flos": 18086982537600.0, "grad_norm": 2.3574613039435888, "language_loss": 0.69524658, "learning_rate": 4.7239872546498774e-07, "loss": 0.71968472, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19604492, "step": 13029, "time_per_iteration": 2.8206748962402344 }, { "auxiliary_loss_clip": 0.01408725, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.24500763, "balance_loss_mlp": 1.01498985, "epoch": 0.7834059822636404, "flos": 28299263846400.0, "grad_norm": 1.7009069705774704, "language_loss": 0.81088704, "learning_rate": 4.721473755175698e-07, "loss": 0.83531833, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1940918, "step": 13030, "time_per_iteration": 2.898679733276367 }, { "auxiliary_loss_clip": 0.01414065, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.2493906, "balance_loss_mlp": 1.00970936, "epoch": 0.7834661055163085, "flos": 31698733559040.0, "grad_norm": 1.6134550990267615, "language_loss": 0.71673876, "learning_rate": 4.71896083506476e-07, "loss": 0.74117279, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19641113, "step": 13031, "time_per_iteration": 2.920527696609497 }, { "auxiliary_loss_clip": 0.01407041, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.24472368, "balance_loss_mlp": 1.01493382, "epoch": 0.7835262287689764, "flos": 12941021819520.0, "grad_norm": 2.25309099742245, "language_loss": 0.79505622, "learning_rate": 4.7164484944123574e-07, "loss": 0.81946015, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18432617, "step": 13032, "time_per_iteration": 4.302996873855591 }, { "auxiliary_loss_clip": 0.0141984, "auxiliary_loss_mlp": 0.01036373, "balance_loss_clip": 1.2566843, "balance_loss_mlp": 1.01638126, "epoch": 0.7835863520216444, "flos": 16151684405760.0, "grad_norm": 4.473564124739625, "language_loss": 0.63702285, "learning_rate": 4.7139367333137726e-07, "loss": 0.66158497, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19995117, "step": 13033, "time_per_iteration": 2.8294851779937744 }, { "auxiliary_loss_clip": 0.01402582, "auxiliary_loss_mlp": 0.01033552, "balance_loss_clip": 1.24131632, "balance_loss_mlp": 1.012321, "epoch": 0.7836464752743123, "flos": 11517281107200.0, "grad_norm": 1.5936208561337228, "language_loss": 0.7254467, "learning_rate": 4.7114255518642255e-07, "loss": 0.74980807, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.21240234, "step": 13034, "time_per_iteration": 2.831049919128418 }, { "auxiliary_loss_clip": 0.01411032, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.24850523, "balance_loss_mlp": 1.01445532, "epoch": 0.7837065985269803, "flos": 18232915576320.0, "grad_norm": 1.7518187347945404, "language_loss": 0.72614825, "learning_rate": 4.7089149501589555e-07, "loss": 0.75059998, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19702148, "step": 13035, "time_per_iteration": 2.969496250152588 }, { "auxiliary_loss_clip": 0.01409551, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.24790406, "balance_loss_mlp": 1.01510501, "epoch": 0.7837667217796482, "flos": 24765262784640.0, "grad_norm": 2.8138327502982383, "language_loss": 0.66971612, "learning_rate": 4.7064049282931664e-07, "loss": 0.69416022, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19750977, "step": 13036, "time_per_iteration": 2.8700625896453857 }, { "auxiliary_loss_clip": 0.01428801, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.26164758, "balance_loss_mlp": 1.01214504, "epoch": 0.7838268450323163, "flos": 22393432391040.0, "grad_norm": 2.412661510933924, "language_loss": 0.73452461, "learning_rate": 4.703895486362031e-07, "loss": 0.75913048, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19628906, "step": 13037, "time_per_iteration": 2.8770968914031982 }, { "auxiliary_loss_clip": 0.01403867, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.24175763, "balance_loss_mlp": 1.01527798, "epoch": 0.7838869682849842, "flos": 19509727864320.0, "grad_norm": 2.0718239100503633, "language_loss": 0.61009365, "learning_rate": 4.701386624460717e-07, "loss": 0.6344732, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18823242, "step": 13038, "time_per_iteration": 2.875861406326294 }, { "auxiliary_loss_clip": 0.01398795, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.23929453, "balance_loss_mlp": 1.01205325, "epoch": 0.7839470915376522, "flos": 32906357268480.0, "grad_norm": 2.8675947189716857, "language_loss": 0.69062352, "learning_rate": 4.698878342684349e-07, "loss": 0.7149145, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18273926, "step": 13039, "time_per_iteration": 2.969456911087036 }, { "auxiliary_loss_clip": 0.01386743, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.22880268, "balance_loss_mlp": 1.01515365, "epoch": 0.7840072147903202, "flos": 29687098170240.0, "grad_norm": 2.0604243724699125, "language_loss": 0.69867575, "learning_rate": 4.6963706411280537e-07, "loss": 0.72286713, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.17236328, "step": 13040, "time_per_iteration": 2.975078821182251 }, { "auxiliary_loss_clip": 0.01403668, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.24003434, "balance_loss_mlp": 1.01355839, "epoch": 0.7840673380429881, "flos": 18195832823040.0, "grad_norm": 1.5399012042993476, "language_loss": 0.68511176, "learning_rate": 4.6938635198869116e-07, "loss": 0.70947963, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19567871, "step": 13041, "time_per_iteration": 2.8367114067077637 }, { "auxiliary_loss_clip": 0.01184433, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.09728742, "balance_loss_mlp": 1.0136528, "epoch": 0.7841274612956561, "flos": 66376556962560.0, "grad_norm": 0.6738887200192273, "language_loss": 0.57451963, "learning_rate": 4.691356979055998e-07, "loss": 0.59668458, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.18359375, "step": 13042, "time_per_iteration": 3.318455696105957 }, { "auxiliary_loss_clip": 0.01405228, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.24361157, "balance_loss_mlp": 1.01282167, "epoch": 0.784187584548324, "flos": 26658998928000.0, "grad_norm": 3.3301492896633484, "language_loss": 0.84695703, "learning_rate": 4.688851018730369e-07, "loss": 0.87132347, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18591309, "step": 13043, "time_per_iteration": 2.8806488513946533 }, { "auxiliary_loss_clip": 0.01398449, "auxiliary_loss_mlp": 0.01032219, "balance_loss_clip": 1.2403301, "balance_loss_mlp": 1.01425385, "epoch": 0.7842477078009921, "flos": 25751430604800.0, "grad_norm": 1.403675687803688, "language_loss": 0.88573456, "learning_rate": 4.6863456390050425e-07, "loss": 0.91004121, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1796875, "step": 13044, "time_per_iteration": 2.899879217147827 }, { "auxiliary_loss_clip": 0.01432976, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.26501918, "balance_loss_mlp": 1.01357937, "epoch": 0.78430783105366, "flos": 21990951480960.0, "grad_norm": 1.6464969611380285, "language_loss": 0.79901791, "learning_rate": 4.6838408399750195e-07, "loss": 0.82367337, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.18994141, "step": 13045, "time_per_iteration": 2.8622372150421143 }, { "auxiliary_loss_clip": 0.01407077, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.24754608, "balance_loss_mlp": 1.0142653, "epoch": 0.784367954306328, "flos": 23852898512640.0, "grad_norm": 1.6663942649260395, "language_loss": 0.73004389, "learning_rate": 4.6813366217352925e-07, "loss": 0.75444448, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18713379, "step": 13046, "time_per_iteration": 2.8690426349639893 }, { "auxiliary_loss_clip": 0.01386438, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.2293644, "balance_loss_mlp": 1.01575923, "epoch": 0.7844280775589959, "flos": 24837347030400.0, "grad_norm": 1.605668286714682, "language_loss": 0.63567019, "learning_rate": 4.678832984380809e-07, "loss": 0.65988958, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19763184, "step": 13047, "time_per_iteration": 2.8654730319976807 }, { "auxiliary_loss_clip": 0.01382317, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.22654295, "balance_loss_mlp": 1.01584435, "epoch": 0.7844882008116639, "flos": 22465878595200.0, "grad_norm": 1.4418588881948526, "language_loss": 0.73733646, "learning_rate": 4.676329928006515e-07, "loss": 0.76150203, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18383789, "step": 13048, "time_per_iteration": 2.8380441665649414 }, { "auxiliary_loss_clip": 0.01424723, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.26026511, "balance_loss_mlp": 1.01289487, "epoch": 0.7845483240643318, "flos": 26115154704000.0, "grad_norm": 1.6811489555815828, "language_loss": 0.7584542, "learning_rate": 4.6738274527073243e-07, "loss": 0.78301984, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18945312, "step": 13049, "time_per_iteration": 2.907701253890991 }, { "auxiliary_loss_clip": 0.01419098, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.25229669, "balance_loss_mlp": 1.01640534, "epoch": 0.7846084473169999, "flos": 19363704336000.0, "grad_norm": 2.5697764316261487, "language_loss": 0.73680598, "learning_rate": 4.6713255585781454e-07, "loss": 0.7613647, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20373535, "step": 13050, "time_per_iteration": 2.8363893032073975 }, { "auxiliary_loss_clip": 0.01396841, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.23773789, "balance_loss_mlp": 1.01516294, "epoch": 0.7846685705696678, "flos": 23334825841920.0, "grad_norm": 3.388389562284319, "language_loss": 0.74128026, "learning_rate": 4.668824245713825e-07, "loss": 0.76558828, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18786621, "step": 13051, "time_per_iteration": 2.865374803543091 }, { "auxiliary_loss_clip": 0.01405715, "auxiliary_loss_mlp": 0.01035715, "balance_loss_clip": 1.24363518, "balance_loss_mlp": 1.01603317, "epoch": 0.7847286938223358, "flos": 35823253495680.0, "grad_norm": 1.7512106253690365, "language_loss": 0.73840845, "learning_rate": 4.666323514209227e-07, "loss": 0.76282275, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19677734, "step": 13052, "time_per_iteration": 2.9775054454803467 }, { "auxiliary_loss_clip": 0.01386784, "auxiliary_loss_mlp": 0.01030939, "balance_loss_clip": 1.23174763, "balance_loss_mlp": 1.0134151, "epoch": 0.7847888170750038, "flos": 18487201207680.0, "grad_norm": 1.794222135962841, "language_loss": 0.70359498, "learning_rate": 4.663823364159183e-07, "loss": 0.72777224, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.1751709, "step": 13053, "time_per_iteration": 2.826277732849121 }, { "auxiliary_loss_clip": 0.01398459, "auxiliary_loss_mlp": 0.01029984, "balance_loss_clip": 1.2396102, "balance_loss_mlp": 1.01277018, "epoch": 0.7848489403276717, "flos": 25130072759040.0, "grad_norm": 2.0448443434757313, "language_loss": 0.71073198, "learning_rate": 4.6613237956584893e-07, "loss": 0.73501641, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17224121, "step": 13054, "time_per_iteration": 2.8972856998443604 }, { "auxiliary_loss_clip": 0.01420279, "auxiliary_loss_mlp": 0.01033939, "balance_loss_clip": 1.25541186, "balance_loss_mlp": 1.01493692, "epoch": 0.7849090635803397, "flos": 26512884910080.0, "grad_norm": 1.6419634943706152, "language_loss": 0.76551646, "learning_rate": 4.658824808801938e-07, "loss": 0.79005867, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19006348, "step": 13055, "time_per_iteration": 2.8905715942382812 }, { "auxiliary_loss_clip": 0.01408879, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.24388599, "balance_loss_mlp": 1.01345444, "epoch": 0.7849691868330076, "flos": 20969465454720.0, "grad_norm": 1.656475693331985, "language_loss": 0.75763351, "learning_rate": 4.656326403684283e-07, "loss": 0.78205311, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19628906, "step": 13056, "time_per_iteration": 4.253546237945557 }, { "auxiliary_loss_clip": 0.01406331, "auxiliary_loss_mlp": 0.01031326, "balance_loss_clip": 1.24618399, "balance_loss_mlp": 1.01250243, "epoch": 0.7850293100856757, "flos": 26078569643520.0, "grad_norm": 1.6879934161918249, "language_loss": 0.704813, "learning_rate": 4.6538285804002744e-07, "loss": 0.72918952, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18823242, "step": 13057, "time_per_iteration": 2.9047679901123047 }, { "auxiliary_loss_clip": 0.01408453, "auxiliary_loss_mlp": 0.01030628, "balance_loss_clip": 1.24576151, "balance_loss_mlp": 1.01138759, "epoch": 0.7850894333383436, "flos": 22502056452480.0, "grad_norm": 2.5217608712053177, "language_loss": 0.77057707, "learning_rate": 4.6513313390446175e-07, "loss": 0.79496789, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19226074, "step": 13058, "time_per_iteration": 2.850862979888916 }, { "auxiliary_loss_clip": 0.01402839, "auxiliary_loss_mlp": 0.01035585, "balance_loss_clip": 1.24262595, "balance_loss_mlp": 1.01690507, "epoch": 0.7851495565910116, "flos": 20568432378240.0, "grad_norm": 2.1789705463054316, "language_loss": 0.71500045, "learning_rate": 4.6488346797120146e-07, "loss": 0.73938465, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18688965, "step": 13059, "time_per_iteration": 2.788881778717041 }, { "auxiliary_loss_clip": 0.0142461, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.25557542, "balance_loss_mlp": 1.01890993, "epoch": 0.7852096798436795, "flos": 15933395652480.0, "grad_norm": 1.9498435699233132, "language_loss": 0.77097058, "learning_rate": 4.646338602497144e-07, "loss": 0.79560524, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 1.69140625, "router_z_loss_mlp": 0.19934082, "step": 13060, "time_per_iteration": 2.8147923946380615 }, { "auxiliary_loss_clip": 0.01397575, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.23653913, "balance_loss_mlp": 1.01653135, "epoch": 0.7852698030963475, "flos": 19071566789760.0, "grad_norm": 3.96583942742496, "language_loss": 0.77340651, "learning_rate": 4.643843107494654e-07, "loss": 0.79773748, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18981934, "step": 13061, "time_per_iteration": 2.8011252880096436 }, { "auxiliary_loss_clip": 0.0139907, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.23739719, "balance_loss_mlp": 1.0175792, "epoch": 0.7853299263490154, "flos": 24655055155200.0, "grad_norm": 1.9189128849551842, "language_loss": 0.75165868, "learning_rate": 4.641348194799164e-07, "loss": 0.77601087, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18566895, "step": 13062, "time_per_iteration": 2.893202781677246 }, { "auxiliary_loss_clip": 0.01394754, "auxiliary_loss_mlp": 0.01035173, "balance_loss_clip": 1.23618174, "balance_loss_mlp": 1.01463366, "epoch": 0.7853900496016835, "flos": 22028124723840.0, "grad_norm": 1.6606820501597521, "language_loss": 0.69770932, "learning_rate": 4.638853864505297e-07, "loss": 0.72200853, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.20556641, "step": 13063, "time_per_iteration": 4.311232328414917 }, { "auxiliary_loss_clip": 0.01400353, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.24291229, "balance_loss_mlp": 1.01476359, "epoch": 0.7854501728543514, "flos": 30239719885440.0, "grad_norm": 2.103788531023688, "language_loss": 0.74097103, "learning_rate": 4.636360116707625e-07, "loss": 0.76532048, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19812012, "step": 13064, "time_per_iteration": 3.012084722518921 }, { "auxiliary_loss_clip": 0.01403289, "auxiliary_loss_mlp": 0.0103604, "balance_loss_clip": 1.24060977, "balance_loss_mlp": 1.01660895, "epoch": 0.7855102961070194, "flos": 18853006567680.0, "grad_norm": 1.7592552135700514, "language_loss": 0.68472248, "learning_rate": 4.633866951500718e-07, "loss": 0.70911574, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19421387, "step": 13065, "time_per_iteration": 2.8086533546447754 }, { "auxiliary_loss_clip": 0.01402434, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.24323988, "balance_loss_mlp": 1.01707721, "epoch": 0.7855704193596874, "flos": 22320217025280.0, "grad_norm": 3.080608069282332, "language_loss": 0.77254295, "learning_rate": 4.6313743689791196e-07, "loss": 0.79692125, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18322754, "step": 13066, "time_per_iteration": 2.867990732192993 }, { "auxiliary_loss_clip": 0.01184875, "auxiliary_loss_mlp": 0.01029396, "balance_loss_clip": 1.09634936, "balance_loss_mlp": 1.00889194, "epoch": 0.7856305426123553, "flos": 60035143386240.0, "grad_norm": 0.7077892430351624, "language_loss": 0.53409147, "learning_rate": 4.628882369237346e-07, "loss": 0.55623418, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20507812, "step": 13067, "time_per_iteration": 4.882134437561035 }, { "auxiliary_loss_clip": 0.01400917, "auxiliary_loss_mlp": 0.01032932, "balance_loss_clip": 1.23863149, "balance_loss_mlp": 1.01427579, "epoch": 0.7856906658650233, "flos": 21877802939520.0, "grad_norm": 1.6778242546905209, "language_loss": 0.68169677, "learning_rate": 4.62639095236989e-07, "loss": 0.70603526, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18652344, "step": 13068, "time_per_iteration": 2.859661340713501 }, { "auxiliary_loss_clip": 0.0139457, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.23609638, "balance_loss_mlp": 1.01454949, "epoch": 0.7857507891176913, "flos": 23633388149760.0, "grad_norm": 1.9383701587717874, "language_loss": 0.68951464, "learning_rate": 4.6239001184712267e-07, "loss": 0.71379852, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19250488, "step": 13069, "time_per_iteration": 2.866055965423584 }, { "auxiliary_loss_clip": 0.01400808, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.23929238, "balance_loss_mlp": 1.01485932, "epoch": 0.7858109123703593, "flos": 25530970101120.0, "grad_norm": 1.4688811108640505, "language_loss": 0.77373844, "learning_rate": 4.6214098676358195e-07, "loss": 0.79807651, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18139648, "step": 13070, "time_per_iteration": 2.896535634994507 }, { "auxiliary_loss_clip": 0.01388255, "auxiliary_loss_mlp": 0.01032927, "balance_loss_clip": 1.23127556, "balance_loss_mlp": 1.01415193, "epoch": 0.7858710356230272, "flos": 17466393853440.0, "grad_norm": 1.788181622523246, "language_loss": 0.66943705, "learning_rate": 4.618920199958083e-07, "loss": 0.69364882, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18786621, "step": 13071, "time_per_iteration": 2.8158957958221436 }, { "auxiliary_loss_clip": 0.01415319, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.25089049, "balance_loss_mlp": 1.01339293, "epoch": 0.7859311588756952, "flos": 24690056647680.0, "grad_norm": 1.6242134549229874, "language_loss": 0.74359584, "learning_rate": 4.616431115532442e-07, "loss": 0.76807117, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18811035, "step": 13072, "time_per_iteration": 2.863069772720337 }, { "auxiliary_loss_clip": 0.01414685, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.25091362, "balance_loss_mlp": 1.01330018, "epoch": 0.7859912821283631, "flos": 21809247788160.0, "grad_norm": 1.6748413796446775, "language_loss": 0.71869552, "learning_rate": 4.613942614453268e-07, "loss": 0.74318254, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20739746, "step": 13073, "time_per_iteration": 2.8655266761779785 }, { "auxiliary_loss_clip": 0.01400071, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.2383852, "balance_loss_mlp": 1.01490891, "epoch": 0.7860514053810311, "flos": 20856316913280.0, "grad_norm": 1.985768155984696, "language_loss": 0.7745887, "learning_rate": 4.611454696814938e-07, "loss": 0.79893363, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.1953125, "step": 13074, "time_per_iteration": 2.8562986850738525 }, { "auxiliary_loss_clip": 0.01393024, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.23570776, "balance_loss_mlp": 1.01335979, "epoch": 0.786111528633699, "flos": 24326196814080.0, "grad_norm": 1.7162570867431068, "language_loss": 0.75493431, "learning_rate": 4.608967362711782e-07, "loss": 0.77918363, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18554688, "step": 13075, "time_per_iteration": 2.85612416267395 }, { "auxiliary_loss_clip": 0.01413589, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.25087941, "balance_loss_mlp": 1.0142808, "epoch": 0.7861716518863671, "flos": 24363958239360.0, "grad_norm": 1.5892777648182057, "language_loss": 0.69289535, "learning_rate": 4.6064806122381283e-07, "loss": 0.71735591, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1817627, "step": 13076, "time_per_iteration": 2.872288703918457 }, { "auxiliary_loss_clip": 0.01390455, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.23190761, "balance_loss_mlp": 1.01935458, "epoch": 0.786231775139035, "flos": 14029117470720.0, "grad_norm": 2.127097694202951, "language_loss": 0.80834281, "learning_rate": 4.603994445488282e-07, "loss": 0.83262384, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18273926, "step": 13077, "time_per_iteration": 2.799344539642334 }, { "auxiliary_loss_clip": 0.01400855, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.24138165, "balance_loss_mlp": 1.01520455, "epoch": 0.786291898391703, "flos": 33735959521920.0, "grad_norm": 6.254359433186231, "language_loss": 0.71338087, "learning_rate": 4.6015088625564956e-07, "loss": 0.73773992, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19848633, "step": 13078, "time_per_iteration": 2.9313602447509766 }, { "auxiliary_loss_clip": 0.01392739, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.23523808, "balance_loss_mlp": 1.01432514, "epoch": 0.786352021644371, "flos": 25822338485760.0, "grad_norm": 1.4650103859296424, "language_loss": 0.82143408, "learning_rate": 4.599023863537039e-07, "loss": 0.84569603, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19140625, "step": 13079, "time_per_iteration": 2.9809300899505615 }, { "auxiliary_loss_clip": 0.01386161, "auxiliary_loss_mlp": 0.01029304, "balance_loss_clip": 1.23141932, "balance_loss_mlp": 1.01080251, "epoch": 0.7864121448970389, "flos": 28921979036160.0, "grad_norm": 1.483621496917262, "language_loss": 0.68893397, "learning_rate": 4.596539448524146e-07, "loss": 0.71308857, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.18493652, "step": 13080, "time_per_iteration": 2.9410922527313232 }, { "auxiliary_loss_clip": 0.01406732, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.24535275, "balance_loss_mlp": 1.01698077, "epoch": 0.7864722681497069, "flos": 19218314234880.0, "grad_norm": 2.121933409377488, "language_loss": 0.70145488, "learning_rate": 4.594055617612016e-07, "loss": 0.72588599, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19396973, "step": 13081, "time_per_iteration": 2.8512074947357178 }, { "auxiliary_loss_clip": 0.01408318, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.24693096, "balance_loss_mlp": 1.01439118, "epoch": 0.7865323914023749, "flos": 21881558257920.0, "grad_norm": 1.5175661549401527, "language_loss": 0.69180393, "learning_rate": 4.591572370894838e-07, "loss": 0.71621984, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18896484, "step": 13082, "time_per_iteration": 2.849046230316162 }, { "auxiliary_loss_clip": 0.01392288, "auxiliary_loss_mlp": 0.01028237, "balance_loss_clip": 1.23461747, "balance_loss_mlp": 1.00986743, "epoch": 0.7865925146550429, "flos": 25531286814720.0, "grad_norm": 3.187113904525805, "language_loss": 0.66893369, "learning_rate": 4.589089708466789e-07, "loss": 0.69313896, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18371582, "step": 13083, "time_per_iteration": 2.8761391639709473 }, { "auxiliary_loss_clip": 0.01412692, "auxiliary_loss_mlp": 0.01035481, "balance_loss_clip": 1.24761415, "balance_loss_mlp": 1.01566815, "epoch": 0.7866526379077108, "flos": 19106296813440.0, "grad_norm": 2.0643848136477256, "language_loss": 0.7601493, "learning_rate": 4.5866076304220015e-07, "loss": 0.78463101, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19799805, "step": 13084, "time_per_iteration": 2.8028526306152344 }, { "auxiliary_loss_clip": 0.01395379, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.2374624, "balance_loss_mlp": 1.01347899, "epoch": 0.7867127611603788, "flos": 16180713584640.0, "grad_norm": 1.9698061447107538, "language_loss": 0.70769948, "learning_rate": 4.584126136854591e-07, "loss": 0.7319715, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18371582, "step": 13085, "time_per_iteration": 2.7927777767181396 }, { "auxiliary_loss_clip": 0.01417847, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.25138068, "balance_loss_mlp": 1.01629806, "epoch": 0.7867728844130467, "flos": 20782468120320.0, "grad_norm": 3.4155729288041896, "language_loss": 0.73158485, "learning_rate": 4.5816452278586617e-07, "loss": 0.75611997, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19372559, "step": 13086, "time_per_iteration": 2.838545799255371 }, { "auxiliary_loss_clip": 0.01401976, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.24236131, "balance_loss_mlp": 1.01190186, "epoch": 0.7868330076657147, "flos": 21769540836480.0, "grad_norm": 1.6063665131774179, "language_loss": 0.74972248, "learning_rate": 4.5791649035282965e-07, "loss": 0.77404487, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18371582, "step": 13087, "time_per_iteration": 2.8449931144714355 }, { "auxiliary_loss_clip": 0.01394097, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.23562491, "balance_loss_mlp": 1.01346946, "epoch": 0.7868931309183826, "flos": 25711180715520.0, "grad_norm": 1.5678719455008807, "language_loss": 0.7197752, "learning_rate": 4.5766851639575456e-07, "loss": 0.74403727, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18652344, "step": 13088, "time_per_iteration": 2.8436849117279053 }, { "auxiliary_loss_clip": 0.01181542, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.0937407, "balance_loss_mlp": 1.00363278, "epoch": 0.7869532541710507, "flos": 64678550400000.0, "grad_norm": 0.6751855620452716, "language_loss": 0.55575746, "learning_rate": 4.574206009240431e-07, "loss": 0.57785243, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.24316406, "step": 13089, "time_per_iteration": 3.48530650138855 }, { "auxiliary_loss_clip": 0.01181507, "auxiliary_loss_mlp": 0.01018094, "balance_loss_clip": 1.09483123, "balance_loss_mlp": 0.99739915, "epoch": 0.7870133774237186, "flos": 67487817951360.0, "grad_norm": 0.7217355566410324, "language_loss": 0.50065136, "learning_rate": 4.571727439470976e-07, "loss": 0.52264738, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.20703125, "step": 13090, "time_per_iteration": 3.3805007934570312 }, { "auxiliary_loss_clip": 0.01390389, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.23374987, "balance_loss_mlp": 1.01216257, "epoch": 0.7870735006763866, "flos": 26079610273920.0, "grad_norm": 1.4921356497422589, "language_loss": 0.8393603, "learning_rate": 4.5692494547431583e-07, "loss": 0.86356521, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17956543, "step": 13091, "time_per_iteration": 4.403404474258423 }, { "auxiliary_loss_clip": 0.01187454, "auxiliary_loss_mlp": 0.01024258, "balance_loss_clip": 1.09759355, "balance_loss_mlp": 1.00442147, "epoch": 0.7871336239290546, "flos": 70321952160000.0, "grad_norm": 0.7520196810134033, "language_loss": 0.64070886, "learning_rate": 4.566772055150947e-07, "loss": 0.662826, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19824219, "step": 13092, "time_per_iteration": 3.3163554668426514 }, { "auxiliary_loss_clip": 0.01403762, "auxiliary_loss_mlp": 0.01034015, "balance_loss_clip": 1.24259901, "balance_loss_mlp": 1.01539421, "epoch": 0.7871937471817225, "flos": 15787010165760.0, "grad_norm": 1.939181008655669, "language_loss": 0.79935181, "learning_rate": 4.564295240788285e-07, "loss": 0.82372952, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18615723, "step": 13093, "time_per_iteration": 2.834933280944824 }, { "auxiliary_loss_clip": 0.01395844, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.23792887, "balance_loss_mlp": 1.01217842, "epoch": 0.7872538704343905, "flos": 20494900298880.0, "grad_norm": 1.8817966158010948, "language_loss": 0.76454651, "learning_rate": 4.561819011749106e-07, "loss": 0.78881496, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18811035, "step": 13094, "time_per_iteration": 2.8988912105560303 }, { "auxiliary_loss_clip": 0.01409796, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.24651718, "balance_loss_mlp": 1.01496387, "epoch": 0.7873139936870585, "flos": 25093578188160.0, "grad_norm": 1.6197846547070032, "language_loss": 0.7989338, "learning_rate": 4.5593433681272884e-07, "loss": 0.82337248, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19104004, "step": 13095, "time_per_iteration": 2.926340341567993 }, { "auxiliary_loss_clip": 0.01417056, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.25155473, "balance_loss_mlp": 1.01127374, "epoch": 0.7873741169397265, "flos": 30895400551680.0, "grad_norm": 3.0243484426360236, "language_loss": 0.68693143, "learning_rate": 4.556868310016715e-07, "loss": 0.71140093, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.1862793, "step": 13096, "time_per_iteration": 2.977992057800293 }, { "auxiliary_loss_clip": 0.01382064, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.22714734, "balance_loss_mlp": 1.01024508, "epoch": 0.7874342401923944, "flos": 46808571778560.0, "grad_norm": 1.9263557640852986, "language_loss": 0.71184075, "learning_rate": 4.55439383751125e-07, "loss": 0.7359367, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.17285156, "step": 13097, "time_per_iteration": 3.134122133255005 }, { "auxiliary_loss_clip": 0.01419948, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.2558012, "balance_loss_mlp": 1.01418006, "epoch": 0.7874943634450624, "flos": 23594495604480.0, "grad_norm": 3.0813082465038497, "language_loss": 0.80984259, "learning_rate": 4.5519199507047126e-07, "loss": 0.83436877, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18493652, "step": 13098, "time_per_iteration": 4.278926610946655 }, { "auxiliary_loss_clip": 0.01405798, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.24676895, "balance_loss_mlp": 1.0125308, "epoch": 0.7875544866977303, "flos": 20200183799040.0, "grad_norm": 1.7960888943148634, "language_loss": 0.74524385, "learning_rate": 4.5494466496909177e-07, "loss": 0.76961815, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19104004, "step": 13099, "time_per_iteration": 2.855069875717163 }, { "auxiliary_loss_clip": 0.01402784, "auxiliary_loss_mlp": 0.01031274, "balance_loss_clip": 1.24291217, "balance_loss_mlp": 1.01259398, "epoch": 0.7876146099503983, "flos": 22613078488320.0, "grad_norm": 1.575075372643208, "language_loss": 0.78598326, "learning_rate": 4.5469739345636603e-07, "loss": 0.81032383, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18676758, "step": 13100, "time_per_iteration": 2.8431742191314697 }, { "auxiliary_loss_clip": 0.01417921, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.25002408, "balance_loss_mlp": 1.0117805, "epoch": 0.7876747332030662, "flos": 10712636000640.0, "grad_norm": 2.4086443667305133, "language_loss": 0.67027497, "learning_rate": 4.5445018054167007e-07, "loss": 0.69476408, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19189453, "step": 13101, "time_per_iteration": 2.7791900634765625 }, { "auxiliary_loss_clip": 0.01403883, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.24338758, "balance_loss_mlp": 1.01231444, "epoch": 0.7877348564557343, "flos": 38413508376960.0, "grad_norm": 1.5930540822563457, "language_loss": 0.78630197, "learning_rate": 4.5420302623437745e-07, "loss": 0.81064951, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18566895, "step": 13102, "time_per_iteration": 4.4363086223602295 }, { "auxiliary_loss_clip": 0.01407143, "auxiliary_loss_mlp": 0.01034371, "balance_loss_clip": 1.24549472, "balance_loss_mlp": 1.01579869, "epoch": 0.7877949797084022, "flos": 18337829564160.0, "grad_norm": 1.9713081738052707, "language_loss": 0.82978702, "learning_rate": 4.5395593054386093e-07, "loss": 0.85420215, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18579102, "step": 13103, "time_per_iteration": 2.8238611221313477 }, { "auxiliary_loss_clip": 0.01418275, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.25459528, "balance_loss_mlp": 1.01266885, "epoch": 0.7878551029610702, "flos": 25816366172160.0, "grad_norm": 1.8704237441301041, "language_loss": 0.81146783, "learning_rate": 4.537088934794913e-07, "loss": 0.83596456, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18737793, "step": 13104, "time_per_iteration": 2.903510570526123 }, { "auxiliary_loss_clip": 0.01408668, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.24754274, "balance_loss_mlp": 1.01455712, "epoch": 0.7879152262137382, "flos": 22351960892160.0, "grad_norm": 1.744454142235192, "language_loss": 0.74557817, "learning_rate": 4.5346191505063515e-07, "loss": 0.77000248, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19189453, "step": 13105, "time_per_iteration": 2.8472750186920166 }, { "auxiliary_loss_clip": 0.01407141, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.24304557, "balance_loss_mlp": 1.016433, "epoch": 0.7879753494664061, "flos": 24794744411520.0, "grad_norm": 2.218426667926628, "language_loss": 0.76321751, "learning_rate": 4.5321499526665776e-07, "loss": 0.78763491, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1817627, "step": 13106, "time_per_iteration": 3.0162904262542725 }, { "auxiliary_loss_clip": 0.0140882, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.24734163, "balance_loss_mlp": 1.01347733, "epoch": 0.7880354727190741, "flos": 16917663191040.0, "grad_norm": 2.9886489591470133, "language_loss": 0.7395466, "learning_rate": 4.5296813413692337e-07, "loss": 0.76396108, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19140625, "step": 13107, "time_per_iteration": 2.8826143741607666 }, { "auxiliary_loss_clip": 0.01395267, "auxiliary_loss_mlp": 0.01034481, "balance_loss_clip": 1.23766005, "balance_loss_mlp": 1.01482368, "epoch": 0.7880955959717421, "flos": 22239038574720.0, "grad_norm": 1.5431232318482726, "language_loss": 0.73333645, "learning_rate": 4.5272133167079165e-07, "loss": 0.75763392, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1965332, "step": 13108, "time_per_iteration": 2.8822319507598877 }, { "auxiliary_loss_clip": 0.01184848, "auxiliary_loss_mlp": 0.01028541, "balance_loss_clip": 1.09639215, "balance_loss_mlp": 1.00183856, "epoch": 0.7881557192244101, "flos": 69212229494400.0, "grad_norm": 0.8855643149798529, "language_loss": 0.60293037, "learning_rate": 4.5247458787762216e-07, "loss": 0.62506431, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.26757812, "step": 13109, "time_per_iteration": 3.3394501209259033 }, { "auxiliary_loss_clip": 0.01405864, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.24818766, "balance_loss_mlp": 1.01776385, "epoch": 0.788215842477078, "flos": 24945971091840.0, "grad_norm": 1.7970172585502624, "language_loss": 0.73347068, "learning_rate": 4.5222790276677126e-07, "loss": 0.75789505, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18798828, "step": 13110, "time_per_iteration": 2.8715012073516846 }, { "auxiliary_loss_clip": 0.01392286, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.23588586, "balance_loss_mlp": 1.01443744, "epoch": 0.788275965729746, "flos": 26118231350400.0, "grad_norm": 1.7989786419543992, "language_loss": 0.75328046, "learning_rate": 4.5198127634759455e-07, "loss": 0.77753794, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19030762, "step": 13111, "time_per_iteration": 2.9268946647644043 }, { "auxiliary_loss_clip": 0.01398857, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.2387197, "balance_loss_mlp": 1.01292861, "epoch": 0.7883360889824139, "flos": 21224610737280.0, "grad_norm": 2.0195089402529303, "language_loss": 0.62934369, "learning_rate": 4.5173470862944206e-07, "loss": 0.65365303, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19152832, "step": 13112, "time_per_iteration": 2.8421576023101807 }, { "auxiliary_loss_clip": 0.01397307, "auxiliary_loss_mlp": 0.0103088, "balance_loss_clip": 1.2367866, "balance_loss_mlp": 1.0117588, "epoch": 0.7883962122350819, "flos": 21152526491520.0, "grad_norm": 1.583519804162614, "language_loss": 0.68047154, "learning_rate": 4.514881996216644e-07, "loss": 0.7047534, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19128418, "step": 13113, "time_per_iteration": 2.832814931869507 }, { "auxiliary_loss_clip": 0.01397917, "auxiliary_loss_mlp": 0.0103087, "balance_loss_clip": 1.2395916, "balance_loss_mlp": 1.0121783, "epoch": 0.7884563354877498, "flos": 15310861441920.0, "grad_norm": 4.208869199240977, "language_loss": 0.59512091, "learning_rate": 4.5124174933361e-07, "loss": 0.61940885, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18701172, "step": 13114, "time_per_iteration": 2.8384997844696045 }, { "auxiliary_loss_clip": 0.01412641, "auxiliary_loss_mlp": 0.01033966, "balance_loss_clip": 1.24961758, "balance_loss_mlp": 1.01483262, "epoch": 0.7885164587404179, "flos": 24398733507840.0, "grad_norm": 1.663883766622936, "language_loss": 0.67282438, "learning_rate": 4.5099535777462306e-07, "loss": 0.69729042, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19116211, "step": 13115, "time_per_iteration": 2.916074752807617 }, { "auxiliary_loss_clip": 0.01401339, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.24017358, "balance_loss_mlp": 1.01239216, "epoch": 0.7885765819930858, "flos": 14392343877120.0, "grad_norm": 2.1095630452075493, "language_loss": 0.88872176, "learning_rate": 4.50749024954048e-07, "loss": 0.91305494, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19580078, "step": 13116, "time_per_iteration": 2.81485915184021 }, { "auxiliary_loss_clip": 0.0143052, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.26200366, "balance_loss_mlp": 1.01773429, "epoch": 0.7886367052457538, "flos": 18269093433600.0, "grad_norm": 1.9617815308627462, "language_loss": 0.73685533, "learning_rate": 4.505027508812245e-07, "loss": 0.76152885, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 1.68554688, "router_z_loss_mlp": 0.19091797, "step": 13117, "time_per_iteration": 2.876619577407837 }, { "auxiliary_loss_clip": 0.01394531, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 1.23755896, "balance_loss_mlp": 1.01183236, "epoch": 0.7886968284984217, "flos": 15313621374720.0, "grad_norm": 1.4983411461230447, "language_loss": 0.81000149, "learning_rate": 4.502565355654926e-07, "loss": 0.8342464, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18139648, "step": 13118, "time_per_iteration": 2.878735065460205 }, { "auxiliary_loss_clip": 0.01407737, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.24851215, "balance_loss_mlp": 1.0116415, "epoch": 0.7887569517510897, "flos": 21225425143680.0, "grad_norm": 1.9753034576956758, "language_loss": 0.7397542, "learning_rate": 4.500103790161878e-07, "loss": 0.7641409, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19299316, "step": 13119, "time_per_iteration": 2.859299898147583 }, { "auxiliary_loss_clip": 0.01403499, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.24210012, "balance_loss_mlp": 1.01246846, "epoch": 0.7888170750037578, "flos": 22721566815360.0, "grad_norm": 1.9493997572483794, "language_loss": 0.7310946, "learning_rate": 4.4976428124264454e-07, "loss": 0.75544536, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19104004, "step": 13120, "time_per_iteration": 2.882517099380493 }, { "auxiliary_loss_clip": 0.01409898, "auxiliary_loss_mlp": 0.01036451, "balance_loss_clip": 1.24904084, "balance_loss_mlp": 1.01713943, "epoch": 0.7888771982564257, "flos": 36442846794240.0, "grad_norm": 1.5329161960345008, "language_loss": 0.79328454, "learning_rate": 4.4951824225419564e-07, "loss": 0.81774801, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19311523, "step": 13121, "time_per_iteration": 2.9987456798553467 }, { "auxiliary_loss_clip": 0.0139102, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.23349535, "balance_loss_mlp": 1.01289988, "epoch": 0.7889373215090937, "flos": 27321918762240.0, "grad_norm": 1.3902463288001532, "language_loss": 0.80629271, "learning_rate": 4.4927226206017057e-07, "loss": 0.8305198, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18774414, "step": 13122, "time_per_iteration": 2.9769225120544434 }, { "auxiliary_loss_clip": 0.01413012, "auxiliary_loss_mlp": 0.01028139, "balance_loss_clip": 1.24985075, "balance_loss_mlp": 1.00991225, "epoch": 0.7889974447617616, "flos": 19838631450240.0, "grad_norm": 2.5935306288793583, "language_loss": 0.79072869, "learning_rate": 4.4902634066989597e-07, "loss": 0.81514019, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18237305, "step": 13123, "time_per_iteration": 2.8771793842315674 }, { "auxiliary_loss_clip": 0.01407222, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.24439275, "balance_loss_mlp": 1.01355982, "epoch": 0.7890575680144296, "flos": 17279170295040.0, "grad_norm": 2.0654404777079205, "language_loss": 0.6775887, "learning_rate": 4.487804780926985e-07, "loss": 0.70198309, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18652344, "step": 13124, "time_per_iteration": 2.8167896270751953 }, { "auxiliary_loss_clip": 0.01420293, "auxiliary_loss_mlp": 0.01033748, "balance_loss_clip": 1.25524139, "balance_loss_mlp": 1.01538992, "epoch": 0.7891176912670975, "flos": 27611703578880.0, "grad_norm": 2.4496303760940434, "language_loss": 0.7376523, "learning_rate": 4.4853467433790036e-07, "loss": 0.76219273, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.18383789, "step": 13125, "time_per_iteration": 2.9604265689849854 }, { "auxiliary_loss_clip": 0.01405206, "auxiliary_loss_mlp": 0.01032488, "balance_loss_clip": 1.24139702, "balance_loss_mlp": 1.01355767, "epoch": 0.7891778145197655, "flos": 22722109752960.0, "grad_norm": 2.1977709569571147, "language_loss": 0.73349631, "learning_rate": 4.4828892941482267e-07, "loss": 0.75787318, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18920898, "step": 13126, "time_per_iteration": 4.367655515670776 }, { "auxiliary_loss_clip": 0.01409055, "auxiliary_loss_mlp": 0.01030547, "balance_loss_clip": 1.24649835, "balance_loss_mlp": 1.0114851, "epoch": 0.7892379377724335, "flos": 17319555918720.0, "grad_norm": 2.223271789224908, "language_loss": 0.77672559, "learning_rate": 4.480432433327845e-07, "loss": 0.80112159, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19067383, "step": 13127, "time_per_iteration": 2.8758678436279297 }, { "auxiliary_loss_clip": 0.01395145, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.23907828, "balance_loss_mlp": 1.01709878, "epoch": 0.7892980610251015, "flos": 25786703566080.0, "grad_norm": 3.832518336375528, "language_loss": 0.85890484, "learning_rate": 4.47797616101103e-07, "loss": 0.88321376, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18615723, "step": 13128, "time_per_iteration": 2.8777527809143066 }, { "auxiliary_loss_clip": 0.01405736, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.246171, "balance_loss_mlp": 1.01908827, "epoch": 0.7893581842777694, "flos": 21589918404480.0, "grad_norm": 1.9144395087568669, "language_loss": 0.70285076, "learning_rate": 4.475520477290904e-07, "loss": 0.72728771, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18859863, "step": 13129, "time_per_iteration": 2.862499475479126 }, { "auxiliary_loss_clip": 0.01183807, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.0916419, "balance_loss_mlp": 0.99970025, "epoch": 0.7894183075304374, "flos": 69049528617600.0, "grad_norm": 0.7194667768663862, "language_loss": 0.61630964, "learning_rate": 4.473065382260597e-07, "loss": 0.63841558, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.27148438, "step": 13130, "time_per_iteration": 3.3502655029296875 }, { "auxiliary_loss_clip": 0.01408183, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.24634767, "balance_loss_mlp": 1.01093817, "epoch": 0.7894784307831053, "flos": 24253886344320.0, "grad_norm": 4.821647133248566, "language_loss": 0.74241775, "learning_rate": 4.4706108760132124e-07, "loss": 0.76679945, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19055176, "step": 13131, "time_per_iteration": 2.9093801975250244 }, { "auxiliary_loss_clip": 0.01446175, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.27344298, "balance_loss_mlp": 1.01665068, "epoch": 0.7895385540357733, "flos": 20276204342400.0, "grad_norm": 2.6704034172239477, "language_loss": 0.70941299, "learning_rate": 4.4681569586418153e-07, "loss": 0.73423147, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 1.72558594, "router_z_loss_mlp": 0.19030762, "step": 13132, "time_per_iteration": 2.8548989295959473 }, { "auxiliary_loss_clip": 0.01405836, "auxiliary_loss_mlp": 0.01039112, "balance_loss_clip": 1.24321532, "balance_loss_mlp": 1.01851273, "epoch": 0.7895986772884414, "flos": 21006502963200.0, "grad_norm": 2.053478760963436, "language_loss": 0.63007629, "learning_rate": 4.465703630239468e-07, "loss": 0.6545257, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20581055, "step": 13133, "time_per_iteration": 4.370396375656128 }, { "auxiliary_loss_clip": 0.01420051, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.25486302, "balance_loss_mlp": 1.01533008, "epoch": 0.7896588005411093, "flos": 18666416436480.0, "grad_norm": 2.299738570329337, "language_loss": 0.80583155, "learning_rate": 4.463250890899195e-07, "loss": 0.83039373, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20837402, "step": 13134, "time_per_iteration": 2.860684633255005 }, { "auxiliary_loss_clip": 0.01409755, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.24739587, "balance_loss_mlp": 1.0158608, "epoch": 0.7897189237937773, "flos": 18415116961920.0, "grad_norm": 1.84131816972958, "language_loss": 0.80728316, "learning_rate": 4.460798740713998e-07, "loss": 0.83172166, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18237305, "step": 13135, "time_per_iteration": 2.8761332035064697 }, { "auxiliary_loss_clip": 0.01396051, "auxiliary_loss_mlp": 0.0103313, "balance_loss_clip": 1.23638904, "balance_loss_mlp": 1.01379466, "epoch": 0.7897790470464452, "flos": 23741876476800.0, "grad_norm": 1.5044080583111246, "language_loss": 0.72690171, "learning_rate": 4.4583471797768733e-07, "loss": 0.75119352, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19335938, "step": 13136, "time_per_iteration": 2.8541975021362305 }, { "auxiliary_loss_clip": 0.01421436, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.25418782, "balance_loss_mlp": 1.01539075, "epoch": 0.7898391702991132, "flos": 15925975505280.0, "grad_norm": 2.011809003530409, "language_loss": 0.71973455, "learning_rate": 4.455896208180778e-07, "loss": 0.74429643, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19348145, "step": 13137, "time_per_iteration": 5.683560132980347 }, { "auxiliary_loss_clip": 0.0139441, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.23619914, "balance_loss_mlp": 1.01956296, "epoch": 0.7898992935517811, "flos": 19838676695040.0, "grad_norm": 1.6990068612306333, "language_loss": 0.74920261, "learning_rate": 4.4534458260186645e-07, "loss": 0.77353448, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19238281, "step": 13138, "time_per_iteration": 2.8559727668762207 }, { "auxiliary_loss_clip": 0.01400315, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.24054599, "balance_loss_mlp": 1.01327229, "epoch": 0.7899594168044491, "flos": 16224764037120.0, "grad_norm": 2.9882428534371495, "language_loss": 0.69143927, "learning_rate": 4.4509960333834426e-07, "loss": 0.71576136, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18615723, "step": 13139, "time_per_iteration": 2.790332317352295 }, { "auxiliary_loss_clip": 0.01181143, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.09111667, "balance_loss_mlp": 1.01306152, "epoch": 0.790019540057117, "flos": 68365904647680.0, "grad_norm": 0.8436019008232135, "language_loss": 0.60291696, "learning_rate": 4.448546830368003e-07, "loss": 0.62509358, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.234375, "step": 13140, "time_per_iteration": 3.4267210960388184 }, { "auxiliary_loss_clip": 0.01417216, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.2556777, "balance_loss_mlp": 1.01637959, "epoch": 0.7900796633097851, "flos": 30344045690880.0, "grad_norm": 1.51829752592844, "language_loss": 0.770459, "learning_rate": 4.4460982170652304e-07, "loss": 0.79498649, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19152832, "step": 13141, "time_per_iteration": 2.9079947471618652 }, { "auxiliary_loss_clip": 0.01411276, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.24683428, "balance_loss_mlp": 1.01817441, "epoch": 0.790139786562453, "flos": 22136794030080.0, "grad_norm": 2.0135132679004792, "language_loss": 0.68994081, "learning_rate": 4.4436501935679694e-07, "loss": 0.71443439, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19909668, "step": 13142, "time_per_iteration": 2.8320469856262207 }, { "auxiliary_loss_clip": 0.01180452, "auxiliary_loss_mlp": 0.01032645, "balance_loss_clip": 1.09011459, "balance_loss_mlp": 1.01032948, "epoch": 0.790199909815121, "flos": 58235100520320.0, "grad_norm": 1.2373978371991885, "language_loss": 0.60003251, "learning_rate": 4.441202759969049e-07, "loss": 0.62216347, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.22363281, "step": 13143, "time_per_iteration": 3.0765933990478516 }, { "auxiliary_loss_clip": 0.01416476, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.25278449, "balance_loss_mlp": 1.01365948, "epoch": 0.7902600330677889, "flos": 34546893655680.0, "grad_norm": 1.4596886939115634, "language_loss": 0.74733305, "learning_rate": 4.4387559163612875e-07, "loss": 0.77182293, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18859863, "step": 13144, "time_per_iteration": 3.0011978149414062 }, { "auxiliary_loss_clip": 0.01408943, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.24678051, "balance_loss_mlp": 1.01593614, "epoch": 0.7903201563204569, "flos": 22356621106560.0, "grad_norm": 3.321935167621783, "language_loss": 0.84239352, "learning_rate": 4.4363096628374605e-07, "loss": 0.86683106, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1887207, "step": 13145, "time_per_iteration": 2.9080007076263428 }, { "auxiliary_loss_clip": 0.01384292, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.22722793, "balance_loss_mlp": 1.01453066, "epoch": 0.790380279573125, "flos": 22063442929920.0, "grad_norm": 2.8307370625736734, "language_loss": 0.7358157, "learning_rate": 4.4338639994903235e-07, "loss": 0.75997669, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17272949, "step": 13146, "time_per_iteration": 2.973395347595215 }, { "auxiliary_loss_clip": 0.01409847, "auxiliary_loss_mlp": 0.01037364, "balance_loss_clip": 1.24598622, "balance_loss_mlp": 1.01808763, "epoch": 0.7904404028257929, "flos": 20312155975680.0, "grad_norm": 2.124659270269472, "language_loss": 0.76710016, "learning_rate": 4.4314189264126246e-07, "loss": 0.79157227, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19299316, "step": 13147, "time_per_iteration": 2.8957834243774414 }, { "auxiliary_loss_clip": 0.01396334, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.23676586, "balance_loss_mlp": 1.0173887, "epoch": 0.7905005260784609, "flos": 20017937168640.0, "grad_norm": 2.3954272797524236, "language_loss": 0.72488105, "learning_rate": 4.428974443697087e-07, "loss": 0.74920678, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18847656, "step": 13148, "time_per_iteration": 2.8731532096862793 }, { "auxiliary_loss_clip": 0.01412503, "auxiliary_loss_mlp": 0.01036442, "balance_loss_clip": 1.24941218, "balance_loss_mlp": 1.01705861, "epoch": 0.7905606493311288, "flos": 26917039877760.0, "grad_norm": 1.906188222676574, "language_loss": 0.71959454, "learning_rate": 4.4265305514363913e-07, "loss": 0.744084, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19384766, "step": 13149, "time_per_iteration": 2.9207262992858887 }, { "auxiliary_loss_clip": 0.01409236, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 1.24598837, "balance_loss_mlp": 1.01610255, "epoch": 0.7906207725837968, "flos": 23706784494720.0, "grad_norm": 1.9337503226556743, "language_loss": 0.65471315, "learning_rate": 4.424087249723225e-07, "loss": 0.67917091, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.2043457, "step": 13150, "time_per_iteration": 2.8461062908172607 }, { "auxiliary_loss_clip": 0.01411307, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.24906743, "balance_loss_mlp": 1.01425338, "epoch": 0.7906808958364647, "flos": 20858262439680.0, "grad_norm": 1.9555739119724718, "language_loss": 0.70785117, "learning_rate": 4.421644538650231e-07, "loss": 0.73230058, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19396973, "step": 13151, "time_per_iteration": 2.839892864227295 }, { "auxiliary_loss_clip": 0.01418297, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.25471997, "balance_loss_mlp": 1.01595616, "epoch": 0.7907410190891327, "flos": 40750201543680.0, "grad_norm": 1.3722799824834713, "language_loss": 0.70692933, "learning_rate": 4.4192024183100306e-07, "loss": 0.73146445, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19262695, "step": 13152, "time_per_iteration": 3.0044891834259033 }, { "auxiliary_loss_clip": 0.01410187, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.24863911, "balance_loss_mlp": 1.01585031, "epoch": 0.7908011423418007, "flos": 13268975264640.0, "grad_norm": 2.05821725722843, "language_loss": 0.73932326, "learning_rate": 4.4167608887952367e-07, "loss": 0.76377261, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18896484, "step": 13153, "time_per_iteration": 2.8173136711120605 }, { "auxiliary_loss_clip": 0.01410963, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.250283, "balance_loss_mlp": 1.01326489, "epoch": 0.7908612655944687, "flos": 19764013495680.0, "grad_norm": 1.7237798766394241, "language_loss": 0.79206467, "learning_rate": 4.4143199501984306e-07, "loss": 0.81649595, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18908691, "step": 13154, "time_per_iteration": 2.8330025672912598 }, { "auxiliary_loss_clip": 0.01425989, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.25730419, "balance_loss_mlp": 1.01405025, "epoch": 0.7909213888471366, "flos": 21297554634240.0, "grad_norm": 4.448787687740204, "language_loss": 0.70871878, "learning_rate": 4.411879602612185e-07, "loss": 0.73331499, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.19555664, "step": 13155, "time_per_iteration": 2.8646368980407715 }, { "auxiliary_loss_clip": 0.01419962, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.25636077, "balance_loss_mlp": 1.01533818, "epoch": 0.7909815120998046, "flos": 22539184450560.0, "grad_norm": 1.6651271864144412, "language_loss": 0.77502751, "learning_rate": 4.4094398461290174e-07, "loss": 0.79957181, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19140625, "step": 13156, "time_per_iteration": 2.825650930404663 }, { "auxiliary_loss_clip": 0.01402693, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.24104881, "balance_loss_mlp": 1.01375306, "epoch": 0.7910416353524725, "flos": 26739860664960.0, "grad_norm": 2.0404942755576925, "language_loss": 0.66375816, "learning_rate": 4.4070006808414526e-07, "loss": 0.68811262, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19006348, "step": 13157, "time_per_iteration": 2.8835389614105225 }, { "auxiliary_loss_clip": 0.01416466, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.25177956, "balance_loss_mlp": 1.01379132, "epoch": 0.7911017586051405, "flos": 24655643337600.0, "grad_norm": 2.793494836542453, "language_loss": 0.74993968, "learning_rate": 4.4045621068419894e-07, "loss": 0.77444398, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20178223, "step": 13158, "time_per_iteration": 2.8694803714752197 }, { "auxiliary_loss_clip": 0.01392419, "auxiliary_loss_mlp": 0.01035435, "balance_loss_clip": 1.23459029, "balance_loss_mlp": 1.01702857, "epoch": 0.7911618818578086, "flos": 17574158263680.0, "grad_norm": 2.576151492569059, "language_loss": 0.68523359, "learning_rate": 4.40212412422309e-07, "loss": 0.70951211, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18395996, "step": 13159, "time_per_iteration": 2.828059434890747 }, { "auxiliary_loss_clip": 0.01398619, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.23818159, "balance_loss_mlp": 1.01380706, "epoch": 0.7912220051104765, "flos": 16728901309440.0, "grad_norm": 1.9181800479059365, "language_loss": 0.67978263, "learning_rate": 4.399686733077206e-07, "loss": 0.70409459, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18762207, "step": 13160, "time_per_iteration": 4.280477523803711 }, { "auxiliary_loss_clip": 0.013879, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.23261452, "balance_loss_mlp": 1.01340914, "epoch": 0.7912821283631445, "flos": 13706593401600.0, "grad_norm": 2.1502519696768574, "language_loss": 0.73218083, "learning_rate": 4.3972499334967694e-07, "loss": 0.75636953, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.17565918, "step": 13161, "time_per_iteration": 2.8082079887390137 }, { "auxiliary_loss_clip": 0.01395333, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.23659706, "balance_loss_mlp": 1.01215959, "epoch": 0.7913422516158124, "flos": 23779547412480.0, "grad_norm": 1.8162508539690134, "language_loss": 0.7410934, "learning_rate": 4.39481372557418e-07, "loss": 0.76535952, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19128418, "step": 13162, "time_per_iteration": 2.8707776069641113 }, { "auxiliary_loss_clip": 0.01421036, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.25523901, "balance_loss_mlp": 1.01274443, "epoch": 0.7914023748684804, "flos": 19947843694080.0, "grad_norm": 1.7038615507579982, "language_loss": 0.72782254, "learning_rate": 4.392378109401811e-07, "loss": 0.75234759, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.18725586, "step": 13163, "time_per_iteration": 2.796349287033081 }, { "auxiliary_loss_clip": 0.01400566, "auxiliary_loss_mlp": 0.01031049, "balance_loss_clip": 1.24086487, "balance_loss_mlp": 1.01115251, "epoch": 0.7914624981211483, "flos": 20604519745920.0, "grad_norm": 1.9181859606666187, "language_loss": 0.70454097, "learning_rate": 4.3899430850720296e-07, "loss": 0.72885716, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19885254, "step": 13164, "time_per_iteration": 2.8394362926483154 }, { "auxiliary_loss_clip": 0.01400034, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.24113321, "balance_loss_mlp": 1.01544833, "epoch": 0.7915226213738163, "flos": 21809654991360.0, "grad_norm": 1.7922628726138288, "language_loss": 0.67050624, "learning_rate": 4.387508652677177e-07, "loss": 0.694857, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19604492, "step": 13165, "time_per_iteration": 2.954888105392456 }, { "auxiliary_loss_clip": 0.01388108, "auxiliary_loss_mlp": 0.01031358, "balance_loss_clip": 1.23080039, "balance_loss_mlp": 1.01286817, "epoch": 0.7915827446264843, "flos": 16296667303680.0, "grad_norm": 2.2926026842653866, "language_loss": 0.73131597, "learning_rate": 4.385074812309557e-07, "loss": 0.75551069, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18493652, "step": 13166, "time_per_iteration": 2.813096284866333 }, { "auxiliary_loss_clip": 0.01397987, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.23840797, "balance_loss_mlp": 1.01428521, "epoch": 0.7916428678791523, "flos": 25713578689920.0, "grad_norm": 1.5996349383257311, "language_loss": 0.78094697, "learning_rate": 4.382641564061462e-07, "loss": 0.80527401, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.2043457, "step": 13167, "time_per_iteration": 2.9601874351501465 }, { "auxiliary_loss_clip": 0.01392816, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.23496222, "balance_loss_mlp": 1.01404262, "epoch": 0.7917029911318202, "flos": 23889076369920.0, "grad_norm": 1.6243061132833714, "language_loss": 0.84426802, "learning_rate": 4.3802089080251713e-07, "loss": 0.86851901, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18249512, "step": 13168, "time_per_iteration": 4.280240297317505 }, { "auxiliary_loss_clip": 0.01404729, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.2432971, "balance_loss_mlp": 1.01758838, "epoch": 0.7917631143844882, "flos": 21654944461440.0, "grad_norm": 1.77450893405643, "language_loss": 0.73309529, "learning_rate": 4.3777768442929155e-07, "loss": 0.75751358, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19506836, "step": 13169, "time_per_iteration": 2.8296756744384766 }, { "auxiliary_loss_clip": 0.01420191, "auxiliary_loss_mlp": 0.01041806, "balance_loss_clip": 1.25557947, "balance_loss_mlp": 1.02217197, "epoch": 0.7918232376371561, "flos": 38888661715200.0, "grad_norm": 1.6275570288010899, "language_loss": 0.67600018, "learning_rate": 4.3753453729569287e-07, "loss": 0.70062006, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19616699, "step": 13170, "time_per_iteration": 3.0673794746398926 }, { "auxiliary_loss_clip": 0.01400194, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.23907876, "balance_loss_mlp": 1.01365578, "epoch": 0.7918833608898241, "flos": 20784866094720.0, "grad_norm": 1.6055142056580742, "language_loss": 0.71968305, "learning_rate": 4.372914494109412e-07, "loss": 0.74400115, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.17956543, "step": 13171, "time_per_iteration": 2.842642307281494 }, { "auxiliary_loss_clip": 0.01402512, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.24136353, "balance_loss_mlp": 1.01516199, "epoch": 0.7919434841424922, "flos": 33922775877120.0, "grad_norm": 1.8860067673785397, "language_loss": 0.68087256, "learning_rate": 4.370484207842553e-07, "loss": 0.70523143, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18212891, "step": 13172, "time_per_iteration": 4.533926725387573 }, { "auxiliary_loss_clip": 0.01397557, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.23638105, "balance_loss_mlp": 1.01458585, "epoch": 0.7920036073951601, "flos": 21073338812160.0, "grad_norm": 2.090175880612048, "language_loss": 0.79872406, "learning_rate": 4.3680545142484893e-07, "loss": 0.82303888, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19335938, "step": 13173, "time_per_iteration": 2.897432327270508 }, { "auxiliary_loss_clip": 0.01404878, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.24240565, "balance_loss_mlp": 1.01510859, "epoch": 0.7920637306478281, "flos": 23665539219840.0, "grad_norm": 2.0252304779881922, "language_loss": 0.77689534, "learning_rate": 4.365625413419365e-07, "loss": 0.80127859, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18334961, "step": 13174, "time_per_iteration": 2.88288950920105 }, { "auxiliary_loss_clip": 0.01394412, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.23533607, "balance_loss_mlp": 1.01611161, "epoch": 0.792123853900496, "flos": 27206010288000.0, "grad_norm": 1.8321112930921333, "language_loss": 0.72266459, "learning_rate": 4.363196905447297e-07, "loss": 0.74695635, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18652344, "step": 13175, "time_per_iteration": 2.8836381435394287 }, { "auxiliary_loss_clip": 0.01407785, "auxiliary_loss_mlp": 0.01034528, "balance_loss_clip": 1.2461524, "balance_loss_mlp": 1.01501369, "epoch": 0.792183977153164, "flos": 19107925626240.0, "grad_norm": 2.0657255494935116, "language_loss": 0.60511816, "learning_rate": 4.360768990424364e-07, "loss": 0.62954128, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19506836, "step": 13176, "time_per_iteration": 2.8110954761505127 }, { "auxiliary_loss_clip": 0.01392103, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.23574853, "balance_loss_mlp": 1.01554251, "epoch": 0.7922441004058319, "flos": 17137716491520.0, "grad_norm": 1.8596129923946556, "language_loss": 0.74512452, "learning_rate": 4.3583416684426376e-07, "loss": 0.76939881, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19775391, "step": 13177, "time_per_iteration": 2.827244281768799 }, { "auxiliary_loss_clip": 0.01404377, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.2458086, "balance_loss_mlp": 1.01376271, "epoch": 0.7923042236585, "flos": 17830570400640.0, "grad_norm": 1.8281009577180232, "language_loss": 0.64802253, "learning_rate": 4.355914939594174e-07, "loss": 0.67239356, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18969727, "step": 13178, "time_per_iteration": 2.8260326385498047 }, { "auxiliary_loss_clip": 0.01403499, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.24205625, "balance_loss_mlp": 1.01429009, "epoch": 0.7923643469111679, "flos": 29947356115200.0, "grad_norm": 2.362481135009082, "language_loss": 0.69361758, "learning_rate": 4.3534888039709726e-07, "loss": 0.7179786, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18322754, "step": 13179, "time_per_iteration": 2.9049313068389893 }, { "auxiliary_loss_clip": 0.01393639, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.23342586, "balance_loss_mlp": 1.01345503, "epoch": 0.7924244701638359, "flos": 22685026999680.0, "grad_norm": 2.0149103280749907, "language_loss": 0.74820715, "learning_rate": 4.3510632616650444e-07, "loss": 0.77246547, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1875, "step": 13180, "time_per_iteration": 2.854595422744751 }, { "auxiliary_loss_clip": 0.01408467, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.24522781, "balance_loss_mlp": 1.01422977, "epoch": 0.7924845934165038, "flos": 17977272600960.0, "grad_norm": 2.294296105556513, "language_loss": 0.82487768, "learning_rate": 4.3486383127683646e-07, "loss": 0.84930062, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19616699, "step": 13181, "time_per_iteration": 2.814661741256714 }, { "auxiliary_loss_clip": 0.01388243, "auxiliary_loss_mlp": 0.01036224, "balance_loss_clip": 1.22960806, "balance_loss_mlp": 1.01636398, "epoch": 0.7925447166691718, "flos": 23487364621440.0, "grad_norm": 1.8611054589738132, "language_loss": 0.78680831, "learning_rate": 4.346213957372895e-07, "loss": 0.81105304, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.1986084, "step": 13182, "time_per_iteration": 2.849520444869995 }, { "auxiliary_loss_clip": 0.01417278, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.25096703, "balance_loss_mlp": 1.01864672, "epoch": 0.7926048399218397, "flos": 20456912649600.0, "grad_norm": 2.221762866295825, "language_loss": 0.75491172, "learning_rate": 4.34379019557056e-07, "loss": 0.77946168, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.1907959, "step": 13183, "time_per_iteration": 2.8210959434509277 }, { "auxiliary_loss_clip": 0.01398339, "auxiliary_loss_mlp": 0.01032323, "balance_loss_clip": 1.23877013, "balance_loss_mlp": 1.01352382, "epoch": 0.7926649631745077, "flos": 37174231290240.0, "grad_norm": 1.7071848944929464, "language_loss": 0.68921286, "learning_rate": 4.341367027453264e-07, "loss": 0.71351945, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18811035, "step": 13184, "time_per_iteration": 2.9569835662841797 }, { "auxiliary_loss_clip": 0.01413681, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.25013804, "balance_loss_mlp": 1.01264024, "epoch": 0.7927250864271758, "flos": 17027237393280.0, "grad_norm": 1.9679779153109054, "language_loss": 0.71353328, "learning_rate": 4.338944453112907e-07, "loss": 0.73798758, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19104004, "step": 13185, "time_per_iteration": 2.8615000247955322 }, { "auxiliary_loss_clip": 0.01400075, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.23778105, "balance_loss_mlp": 1.01341498, "epoch": 0.7927852096798437, "flos": 17758259930880.0, "grad_norm": 10.31688541981603, "language_loss": 0.66104364, "learning_rate": 4.3365224726413375e-07, "loss": 0.68536037, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18188477, "step": 13186, "time_per_iteration": 2.9377281665802 }, { "auxiliary_loss_clip": 0.013876, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.2290777, "balance_loss_mlp": 1.01542068, "epoch": 0.7928453329325117, "flos": 23848057319040.0, "grad_norm": 1.4825015731640327, "language_loss": 0.77625597, "learning_rate": 4.334101086130408e-07, "loss": 0.80048048, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19445801, "step": 13187, "time_per_iteration": 2.886777639389038 }, { "auxiliary_loss_clip": 0.01400319, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.24077559, "balance_loss_mlp": 1.01496112, "epoch": 0.7929054561851796, "flos": 17463271962240.0, "grad_norm": 1.999573854177103, "language_loss": 0.7290529, "learning_rate": 4.3316802936719334e-07, "loss": 0.75339228, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18664551, "step": 13188, "time_per_iteration": 2.8092637062072754 }, { "auxiliary_loss_clip": 0.01407193, "auxiliary_loss_mlp": 0.01039315, "balance_loss_clip": 1.24319744, "balance_loss_mlp": 1.01914477, "epoch": 0.7929655794378476, "flos": 21991358684160.0, "grad_norm": 2.1513763927081686, "language_loss": 0.64211547, "learning_rate": 4.329260095357725e-07, "loss": 0.66658056, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.20178223, "step": 13189, "time_per_iteration": 2.8658435344696045 }, { "auxiliary_loss_clip": 0.01397687, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.23750854, "balance_loss_mlp": 1.01042593, "epoch": 0.7930257026905155, "flos": 17282608899840.0, "grad_norm": 1.8065356601772888, "language_loss": 0.73053253, "learning_rate": 4.3268404912795307e-07, "loss": 0.75480527, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19140625, "step": 13190, "time_per_iteration": 2.8388171195983887 }, { "auxiliary_loss_clip": 0.01396891, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.23919296, "balance_loss_mlp": 1.0142467, "epoch": 0.7930858259431836, "flos": 27310064624640.0, "grad_norm": 2.7094537506629135, "language_loss": 0.73473883, "learning_rate": 4.3244214815291166e-07, "loss": 0.75902981, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.17956543, "step": 13191, "time_per_iteration": 2.8924832344055176 }, { "auxiliary_loss_clip": 0.01401804, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.24108315, "balance_loss_mlp": 1.01940405, "epoch": 0.7931459491958515, "flos": 19872909025920.0, "grad_norm": 1.7392862851351796, "language_loss": 0.69570041, "learning_rate": 4.322003066198219e-07, "loss": 0.72010541, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19274902, "step": 13192, "time_per_iteration": 2.8502604961395264 }, { "auxiliary_loss_clip": 0.01414783, "auxiliary_loss_mlp": 0.01035304, "balance_loss_clip": 1.25192881, "balance_loss_mlp": 1.01631439, "epoch": 0.7932060724485195, "flos": 23157058446720.0, "grad_norm": 1.9322576501059043, "language_loss": 0.7556476, "learning_rate": 4.3195852453785274e-07, "loss": 0.78014845, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18994141, "step": 13193, "time_per_iteration": 2.939967393875122 }, { "auxiliary_loss_clip": 0.01393939, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.2340486, "balance_loss_mlp": 1.01543915, "epoch": 0.7932661957011874, "flos": 29946903667200.0, "grad_norm": 1.644386691094131, "language_loss": 0.72673452, "learning_rate": 4.317168019161741e-07, "loss": 0.75102925, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.20092773, "step": 13194, "time_per_iteration": 2.8973097801208496 }, { "auxiliary_loss_clip": 0.01414286, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.24880528, "balance_loss_mlp": 1.01381063, "epoch": 0.7933263189538554, "flos": 22567806426240.0, "grad_norm": 2.5712499962936546, "language_loss": 0.70860493, "learning_rate": 4.314751387639517e-07, "loss": 0.73308367, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19787598, "step": 13195, "time_per_iteration": 2.846968412399292 }, { "auxiliary_loss_clip": 0.01394902, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.23511231, "balance_loss_mlp": 1.01311159, "epoch": 0.7933864422065233, "flos": 25488774685440.0, "grad_norm": 1.4688099150118168, "language_loss": 0.78165507, "learning_rate": 4.3123353509034844e-07, "loss": 0.80592626, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19104004, "step": 13196, "time_per_iteration": 4.371555805206299 }, { "auxiliary_loss_clip": 0.01417405, "auxiliary_loss_mlp": 0.01037769, "balance_loss_clip": 1.25412297, "balance_loss_mlp": 1.01852846, "epoch": 0.7934465654591913, "flos": 33596089286400.0, "grad_norm": 1.9101247984088534, "language_loss": 0.69080579, "learning_rate": 4.309919909045268e-07, "loss": 0.71535754, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19250488, "step": 13197, "time_per_iteration": 2.9264822006225586 }, { "auxiliary_loss_clip": 0.01397776, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.23803759, "balance_loss_mlp": 1.0112468, "epoch": 0.7935066887118594, "flos": 31445443313280.0, "grad_norm": 1.7005880860943656, "language_loss": 0.66152132, "learning_rate": 4.30750506215646e-07, "loss": 0.68580383, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19226074, "step": 13198, "time_per_iteration": 2.9265859127044678 }, { "auxiliary_loss_clip": 0.01400807, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.23846543, "balance_loss_mlp": 1.01669741, "epoch": 0.7935668119645273, "flos": 14690408492160.0, "grad_norm": 2.039838745056827, "language_loss": 0.74258578, "learning_rate": 4.30509081032864e-07, "loss": 0.76696837, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.2076416, "step": 13199, "time_per_iteration": 2.827786684036255 }, { "auxiliary_loss_clip": 0.01412847, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.25139618, "balance_loss_mlp": 1.01610398, "epoch": 0.7936269352171953, "flos": 18013269479040.0, "grad_norm": 1.841506396335372, "language_loss": 0.81233013, "learning_rate": 4.302677153653349e-07, "loss": 0.83680964, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19006348, "step": 13200, "time_per_iteration": 2.825831174850464 }, { "auxiliary_loss_clip": 0.01390297, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.23537064, "balance_loss_mlp": 1.01422834, "epoch": 0.7936870584698632, "flos": 18889546383360.0, "grad_norm": 1.6423996170618573, "language_loss": 0.78093421, "learning_rate": 4.3002640922221077e-07, "loss": 0.80517387, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.19445801, "step": 13201, "time_per_iteration": 2.834867477416992 }, { "auxiliary_loss_clip": 0.01395213, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.23586833, "balance_loss_mlp": 1.01820087, "epoch": 0.7937471817225312, "flos": 23377247481600.0, "grad_norm": 1.6072687672762493, "language_loss": 0.67429101, "learning_rate": 4.2978516261264296e-07, "loss": 0.69861782, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19262695, "step": 13202, "time_per_iteration": 2.8812999725341797 }, { "auxiliary_loss_clip": 0.01402189, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.24027252, "balance_loss_mlp": 1.01507354, "epoch": 0.7938073049751991, "flos": 22684665041280.0, "grad_norm": 1.9616246797932737, "language_loss": 0.75297928, "learning_rate": 4.2954397554577884e-07, "loss": 0.77733982, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18798828, "step": 13203, "time_per_iteration": 4.264508247375488 }, { "auxiliary_loss_clip": 0.01407366, "auxiliary_loss_mlp": 0.01034564, "balance_loss_clip": 1.24497986, "balance_loss_mlp": 1.0155977, "epoch": 0.7938674282278672, "flos": 22860305930880.0, "grad_norm": 1.9545093505133189, "language_loss": 0.6720528, "learning_rate": 4.293028480307643e-07, "loss": 0.69647205, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18981934, "step": 13204, "time_per_iteration": 2.8519256114959717 }, { "auxiliary_loss_clip": 0.01390921, "auxiliary_loss_mlp": 0.0103024, "balance_loss_clip": 1.23128951, "balance_loss_mlp": 1.01202476, "epoch": 0.7939275514805351, "flos": 27022904006400.0, "grad_norm": 1.3627280460936382, "language_loss": 0.79843754, "learning_rate": 4.290617800767438e-07, "loss": 0.82264912, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18225098, "step": 13205, "time_per_iteration": 2.9623944759368896 }, { "auxiliary_loss_clip": 0.01396951, "auxiliary_loss_mlp": 0.01035479, "balance_loss_clip": 1.23822927, "balance_loss_mlp": 1.01557064, "epoch": 0.7939876747332031, "flos": 21152888449920.0, "grad_norm": 1.779260183489354, "language_loss": 0.78298801, "learning_rate": 4.28820771692858e-07, "loss": 0.80731231, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19897461, "step": 13206, "time_per_iteration": 2.945782423019409 }, { "auxiliary_loss_clip": 0.01416442, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.25192726, "balance_loss_mlp": 1.01253843, "epoch": 0.794047797985871, "flos": 23298014557440.0, "grad_norm": 2.1726778832580966, "language_loss": 0.79911202, "learning_rate": 4.285798228882456e-07, "loss": 0.82360923, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.20715332, "step": 13207, "time_per_iteration": 4.3649115562438965 }, { "auxiliary_loss_clip": 0.014017, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.24150646, "balance_loss_mlp": 1.02017951, "epoch": 0.794107921238539, "flos": 24618786808320.0, "grad_norm": 1.7916487385926858, "language_loss": 0.84398955, "learning_rate": 4.2833893367204375e-07, "loss": 0.86839926, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19104004, "step": 13208, "time_per_iteration": 2.9645464420318604 }, { "auxiliary_loss_clip": 0.01184335, "auxiliary_loss_mlp": 0.01026061, "balance_loss_clip": 1.09130323, "balance_loss_mlp": 1.00355399, "epoch": 0.7941680444912069, "flos": 64126200153600.0, "grad_norm": 0.7225737517523693, "language_loss": 0.58323455, "learning_rate": 4.280981040533875e-07, "loss": 0.60533845, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 0.9296875, "router_z_loss_mlp": 0.22460938, "step": 13209, "time_per_iteration": 3.454373359680176 }, { "auxiliary_loss_clip": 0.01420409, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.25391078, "balance_loss_mlp": 1.01656592, "epoch": 0.794228167743875, "flos": 24399412179840.0, "grad_norm": 2.2981187421525173, "language_loss": 0.64016068, "learning_rate": 4.2785733404140825e-07, "loss": 0.66473413, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20385742, "step": 13210, "time_per_iteration": 2.8909783363342285 }, { "auxiliary_loss_clip": 0.01401812, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.24311686, "balance_loss_mlp": 1.01520348, "epoch": 0.794288290996543, "flos": 28524339319680.0, "grad_norm": 1.701178463083485, "language_loss": 0.69494903, "learning_rate": 4.2761662364523676e-07, "loss": 0.71931159, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19226074, "step": 13211, "time_per_iteration": 2.925614833831787 }, { "auxiliary_loss_clip": 0.01406839, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.24249315, "balance_loss_mlp": 1.01669097, "epoch": 0.7943484142492109, "flos": 25933315276800.0, "grad_norm": 2.052002291558341, "language_loss": 0.73279762, "learning_rate": 4.2737597287400074e-07, "loss": 0.75723624, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.20336914, "step": 13212, "time_per_iteration": 2.9228127002716064 }, { "auxiliary_loss_clip": 0.01394043, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 1.23897028, "balance_loss_mlp": 1.01096082, "epoch": 0.7944085375018789, "flos": 23925932899200.0, "grad_norm": 1.6840048480512937, "language_loss": 0.81189346, "learning_rate": 4.271353817368246e-07, "loss": 0.83613485, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.19128418, "step": 13213, "time_per_iteration": 2.866795301437378 }, { "auxiliary_loss_clip": 0.01412572, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.24875236, "balance_loss_mlp": 1.01656902, "epoch": 0.7944686607545468, "flos": 20239574037120.0, "grad_norm": 2.0142548273966536, "language_loss": 0.67880762, "learning_rate": 4.268948502428327e-07, "loss": 0.70329714, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19799805, "step": 13214, "time_per_iteration": 2.828404426574707 }, { "auxiliary_loss_clip": 0.01388405, "auxiliary_loss_mlp": 0.01032573, "balance_loss_clip": 1.23167729, "balance_loss_mlp": 1.01441693, "epoch": 0.7945287840072148, "flos": 21990996725760.0, "grad_norm": 1.7219054150412112, "language_loss": 0.73300099, "learning_rate": 4.2665437840114535e-07, "loss": 0.75721073, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18151855, "step": 13215, "time_per_iteration": 2.8291025161743164 }, { "auxiliary_loss_clip": 0.01398246, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.24046552, "balance_loss_mlp": 1.01218057, "epoch": 0.7945889072598827, "flos": 26409373511040.0, "grad_norm": 1.5518921628164828, "language_loss": 0.79656738, "learning_rate": 4.2641396622088253e-07, "loss": 0.820867, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.1953125, "step": 13216, "time_per_iteration": 2.9038772583007812 }, { "auxiliary_loss_clip": 0.01410338, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.24876857, "balance_loss_mlp": 1.01261973, "epoch": 0.7946490305125508, "flos": 25820800162560.0, "grad_norm": 1.7092256504923542, "language_loss": 0.74444008, "learning_rate": 4.261736137111598e-07, "loss": 0.76885748, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18798828, "step": 13217, "time_per_iteration": 2.8707785606384277 }, { "auxiliary_loss_clip": 0.01395524, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.238675, "balance_loss_mlp": 1.01654935, "epoch": 0.7947091537652187, "flos": 15969437775360.0, "grad_norm": 1.7249471626209976, "language_loss": 0.74181449, "learning_rate": 4.259333208810907e-07, "loss": 0.76612568, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19055176, "step": 13218, "time_per_iteration": 2.809525728225708 }, { "auxiliary_loss_clip": 0.01418024, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.2531594, "balance_loss_mlp": 1.01473355, "epoch": 0.7947692770178867, "flos": 18597001633920.0, "grad_norm": 1.955040649114494, "language_loss": 0.84067714, "learning_rate": 4.2569308773978817e-07, "loss": 0.86519861, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19384766, "step": 13219, "time_per_iteration": 2.8192687034606934 }, { "auxiliary_loss_clip": 0.01421688, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.25615549, "balance_loss_mlp": 1.01447845, "epoch": 0.7948294002705546, "flos": 20450171174400.0, "grad_norm": 1.7637282539829946, "language_loss": 0.76430631, "learning_rate": 4.2545291429636123e-07, "loss": 0.78886431, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19628906, "step": 13220, "time_per_iteration": 2.8452131748199463 }, { "auxiliary_loss_clip": 0.01421163, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.25567639, "balance_loss_mlp": 1.01840794, "epoch": 0.7948895235232226, "flos": 38195536337280.0, "grad_norm": 1.6277391725459955, "language_loss": 0.72934854, "learning_rate": 4.252128005599176e-07, "loss": 0.75393581, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19165039, "step": 13221, "time_per_iteration": 3.0420186519622803 }, { "auxiliary_loss_clip": 0.01402991, "auxiliary_loss_mlp": 0.01033673, "balance_loss_clip": 1.24548435, "balance_loss_mlp": 1.0146358, "epoch": 0.7949496467758905, "flos": 15568223719680.0, "grad_norm": 2.0207509239734303, "language_loss": 0.76240504, "learning_rate": 4.249727465395634e-07, "loss": 0.78677171, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19042969, "step": 13222, "time_per_iteration": 2.802612781524658 }, { "auxiliary_loss_clip": 0.01182224, "auxiliary_loss_mlp": 0.01022005, "balance_loss_clip": 1.09122252, "balance_loss_mlp": 0.9984495, "epoch": 0.7950097700285585, "flos": 70926252716160.0, "grad_norm": 0.7728219635981586, "language_loss": 0.67168772, "learning_rate": 4.247327522443993e-07, "loss": 0.69373, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.23535156, "step": 13223, "time_per_iteration": 3.1903345584869385 }, { "auxiliary_loss_clip": 0.01397485, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.23614466, "balance_loss_mlp": 1.01689315, "epoch": 0.7950698932812266, "flos": 23962246490880.0, "grad_norm": 1.6633361261182191, "language_loss": 0.72039998, "learning_rate": 4.2449281768352717e-07, "loss": 0.74473643, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19250488, "step": 13224, "time_per_iteration": 2.898922920227051 }, { "auxiliary_loss_clip": 0.01186334, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.09301329, "balance_loss_mlp": 1.0039115, "epoch": 0.7951300165338945, "flos": 60309771194880.0, "grad_norm": 0.6702092418642943, "language_loss": 0.55040693, "learning_rate": 4.2425294286604527e-07, "loss": 0.57252586, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 0.93359375, "router_z_loss_mlp": 0.21679688, "step": 13225, "time_per_iteration": 3.3522980213165283 }, { "auxiliary_loss_clip": 0.0138591, "auxiliary_loss_mlp": 0.01029846, "balance_loss_clip": 1.22908032, "balance_loss_mlp": 1.01135683, "epoch": 0.7951901397865625, "flos": 22828924022400.0, "grad_norm": 1.8804770480151354, "language_loss": 0.65793741, "learning_rate": 4.2401312780105034e-07, "loss": 0.68209499, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18493652, "step": 13226, "time_per_iteration": 2.9450154304504395 }, { "auxiliary_loss_clip": 0.0141149, "auxiliary_loss_mlp": 0.01039701, "balance_loss_clip": 1.24934304, "balance_loss_mlp": 1.02037668, "epoch": 0.7952502630392304, "flos": 35708204672640.0, "grad_norm": 2.188620469654787, "language_loss": 0.70580125, "learning_rate": 4.237733724976349e-07, "loss": 0.73031318, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19311523, "step": 13227, "time_per_iteration": 2.961012601852417 }, { "auxiliary_loss_clip": 0.01396341, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.23811364, "balance_loss_mlp": 1.01648211, "epoch": 0.7953103862918984, "flos": 25640861016960.0, "grad_norm": 2.024395469589045, "language_loss": 0.70730519, "learning_rate": 4.2353367696489184e-07, "loss": 0.7316227, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18933105, "step": 13228, "time_per_iteration": 2.9083805084228516 }, { "auxiliary_loss_clip": 0.01411004, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.24759865, "balance_loss_mlp": 1.01774132, "epoch": 0.7953705095445663, "flos": 40566733303680.0, "grad_norm": 1.4588173432157578, "language_loss": 0.7151854, "learning_rate": 4.232940412119095e-07, "loss": 0.73966277, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18981934, "step": 13229, "time_per_iteration": 3.0630245208740234 }, { "auxiliary_loss_clip": 0.01422885, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.25694394, "balance_loss_mlp": 1.0155592, "epoch": 0.7954306327972344, "flos": 27648243394560.0, "grad_norm": 2.0472774328758256, "language_loss": 0.72387516, "learning_rate": 4.2305446524777457e-07, "loss": 0.74844426, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18481445, "step": 13230, "time_per_iteration": 2.9199440479278564 }, { "auxiliary_loss_clip": 0.01185271, "auxiliary_loss_mlp": 0.01018789, "balance_loss_clip": 1.09497464, "balance_loss_mlp": 0.99818945, "epoch": 0.7954907560499023, "flos": 59537340385920.0, "grad_norm": 0.8918167198737608, "language_loss": 0.63586503, "learning_rate": 4.2281494908157247e-07, "loss": 0.65790558, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.20605469, "step": 13231, "time_per_iteration": 4.875218152999878 }, { "auxiliary_loss_clip": 0.01398469, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.23853719, "balance_loss_mlp": 1.01590061, "epoch": 0.7955508793025703, "flos": 20130135569280.0, "grad_norm": 1.5484346043210993, "language_loss": 0.69868237, "learning_rate": 4.2257549272238566e-07, "loss": 0.72301567, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1895752, "step": 13232, "time_per_iteration": 2.8007094860076904 }, { "auxiliary_loss_clip": 0.01396758, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.23744047, "balance_loss_mlp": 1.01046824, "epoch": 0.7956110025552382, "flos": 26516594983680.0, "grad_norm": 1.5926178486358917, "language_loss": 0.78725058, "learning_rate": 4.223360961792952e-07, "loss": 0.81150609, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18334961, "step": 13233, "time_per_iteration": 2.8794307708740234 }, { "auxiliary_loss_clip": 0.01413425, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.25088644, "balance_loss_mlp": 1.01685286, "epoch": 0.7956711258079062, "flos": 22575814755840.0, "grad_norm": 1.870837900095762, "language_loss": 0.7901178, "learning_rate": 4.220967594613769e-07, "loss": 0.81461436, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19384766, "step": 13234, "time_per_iteration": 2.8657515048980713 }, { "auxiliary_loss_clip": 0.01395766, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.23649096, "balance_loss_mlp": 1.01226902, "epoch": 0.7957312490605741, "flos": 17386572746880.0, "grad_norm": 1.8780614939952491, "language_loss": 0.71234792, "learning_rate": 4.218574825777077e-07, "loss": 0.736606, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17773438, "step": 13235, "time_per_iteration": 2.8679332733154297 }, { "auxiliary_loss_clip": 0.01405271, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.2432363, "balance_loss_mlp": 1.01259565, "epoch": 0.7957913723132422, "flos": 22501468270080.0, "grad_norm": 1.7680316176325024, "language_loss": 0.68436146, "learning_rate": 4.2161826553736145e-07, "loss": 0.70874196, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20178223, "step": 13236, "time_per_iteration": 2.913508415222168 }, { "auxiliary_loss_clip": 0.01393211, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.23367381, "balance_loss_mlp": 1.01159573, "epoch": 0.7958514955659101, "flos": 22648351449600.0, "grad_norm": 1.7280168756465975, "language_loss": 0.7573992, "learning_rate": 4.2137910834940826e-07, "loss": 0.78163278, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18554688, "step": 13237, "time_per_iteration": 2.849341630935669 }, { "auxiliary_loss_clip": 0.01407541, "auxiliary_loss_mlp": 0.01038355, "balance_loss_clip": 1.24599612, "balance_loss_mlp": 1.0184356, "epoch": 0.7959116188185781, "flos": 20713958213760.0, "grad_norm": 1.9014453575501817, "language_loss": 0.72672468, "learning_rate": 4.211400110229175e-07, "loss": 0.75118363, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19909668, "step": 13238, "time_per_iteration": 4.229424238204956 }, { "auxiliary_loss_clip": 0.01409904, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.24572587, "balance_loss_mlp": 1.01168537, "epoch": 0.7959717420712461, "flos": 19033669630080.0, "grad_norm": 2.0546732464113133, "language_loss": 0.74735898, "learning_rate": 4.2090097356695684e-07, "loss": 0.77174985, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.17504883, "step": 13239, "time_per_iteration": 2.808567762374878 }, { "auxiliary_loss_clip": 0.01405062, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.24230301, "balance_loss_mlp": 1.01465046, "epoch": 0.796031865323914, "flos": 26367087605760.0, "grad_norm": 3.221169611220296, "language_loss": 0.70325577, "learning_rate": 4.2066199599058814e-07, "loss": 0.72764504, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.1920166, "step": 13240, "time_per_iteration": 2.8738486766815186 }, { "auxiliary_loss_clip": 0.01187707, "auxiliary_loss_mlp": 0.01017627, "balance_loss_clip": 1.09629679, "balance_loss_mlp": 0.9950245, "epoch": 0.796091988576582, "flos": 62096874048000.0, "grad_norm": 0.8885041855905748, "language_loss": 0.58782405, "learning_rate": 4.2042307830287526e-07, "loss": 0.60987735, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.22558594, "step": 13241, "time_per_iteration": 3.1371231079101562 }, { "auxiliary_loss_clip": 0.01411643, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.25100815, "balance_loss_mlp": 1.01712775, "epoch": 0.7961521118292499, "flos": 39034594753920.0, "grad_norm": 4.131947185764265, "language_loss": 0.65623599, "learning_rate": 4.201842205128772e-07, "loss": 0.68070567, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18188477, "step": 13242, "time_per_iteration": 3.0218987464904785 }, { "auxiliary_loss_clip": 0.0140678, "auxiliary_loss_mlp": 0.01037504, "balance_loss_clip": 1.2445246, "balance_loss_mlp": 1.01839447, "epoch": 0.796212235081918, "flos": 21772979441280.0, "grad_norm": 1.7332283403050783, "language_loss": 0.76824605, "learning_rate": 4.199454226296526e-07, "loss": 0.79268891, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19116211, "step": 13243, "time_per_iteration": 4.4210405349731445 }, { "auxiliary_loss_clip": 0.01402089, "auxiliary_loss_mlp": 0.01034296, "balance_loss_clip": 1.23929465, "balance_loss_mlp": 1.01392341, "epoch": 0.7962723583345859, "flos": 21188794838400.0, "grad_norm": 2.291292089819298, "language_loss": 0.7970767, "learning_rate": 4.1970668466225565e-07, "loss": 0.82144058, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20373535, "step": 13244, "time_per_iteration": 2.8988592624664307 }, { "auxiliary_loss_clip": 0.01404974, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.24122405, "balance_loss_mlp": 1.01469398, "epoch": 0.7963324815872539, "flos": 17137445022720.0, "grad_norm": 2.417821148948565, "language_loss": 0.6985274, "learning_rate": 4.1946800661973934e-07, "loss": 0.72290981, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18591309, "step": 13245, "time_per_iteration": 2.874469041824341 }, { "auxiliary_loss_clip": 0.01405238, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.24390292, "balance_loss_mlp": 1.0187937, "epoch": 0.7963926048399218, "flos": 21407400305280.0, "grad_norm": 1.391528315711335, "language_loss": 0.79606742, "learning_rate": 4.192293885111549e-07, "loss": 0.8204962, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18847656, "step": 13246, "time_per_iteration": 2.8558104038238525 }, { "auxiliary_loss_clip": 0.01412459, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.24821126, "balance_loss_mlp": 1.01987052, "epoch": 0.7964527280925898, "flos": 25193062800000.0, "grad_norm": 1.9368216300400465, "language_loss": 0.66860163, "learning_rate": 4.1899083034555007e-07, "loss": 0.69311631, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19128418, "step": 13247, "time_per_iteration": 2.864593267440796 }, { "auxiliary_loss_clip": 0.01401426, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.24315238, "balance_loss_mlp": 1.01453292, "epoch": 0.7965128513452577, "flos": 27027157017600.0, "grad_norm": 1.9813205772606521, "language_loss": 0.72694278, "learning_rate": 4.1875233213197123e-07, "loss": 0.75128812, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18579102, "step": 13248, "time_per_iteration": 2.9105472564697266 }, { "auxiliary_loss_clip": 0.01412747, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.24760914, "balance_loss_mlp": 1.01468325, "epoch": 0.7965729745979258, "flos": 24428667582720.0, "grad_norm": 2.0671284721643066, "language_loss": 0.7641809, "learning_rate": 4.1851389387946255e-07, "loss": 0.78865188, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19677734, "step": 13249, "time_per_iteration": 2.842820882797241 }, { "auxiliary_loss_clip": 0.01398635, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.23889685, "balance_loss_mlp": 1.01322269, "epoch": 0.7966330978505937, "flos": 18848844046080.0, "grad_norm": 3.1168721366569816, "language_loss": 0.62194723, "learning_rate": 4.1827551559706674e-07, "loss": 0.64625061, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18493652, "step": 13250, "time_per_iteration": 2.862680435180664 }, { "auxiliary_loss_clip": 0.01397947, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.23784089, "balance_loss_mlp": 1.01095164, "epoch": 0.7966932211032617, "flos": 13160260713600.0, "grad_norm": 5.950341315173901, "language_loss": 0.73937345, "learning_rate": 4.180371972938206e-07, "loss": 0.76365435, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19189453, "step": 13251, "time_per_iteration": 2.94284987449646 }, { "auxiliary_loss_clip": 0.0141653, "auxiliary_loss_mlp": 0.01037786, "balance_loss_clip": 1.25211668, "balance_loss_mlp": 1.01622128, "epoch": 0.7967533443559297, "flos": 23959893761280.0, "grad_norm": 1.7336423675468802, "language_loss": 0.73754013, "learning_rate": 4.177989389787624e-07, "loss": 0.76208329, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.21569824, "step": 13252, "time_per_iteration": 2.849231481552124 }, { "auxiliary_loss_clip": 0.0139357, "auxiliary_loss_mlp": 0.01035141, "balance_loss_clip": 1.23594499, "balance_loss_mlp": 1.01598418, "epoch": 0.7968134676085976, "flos": 30379952079360.0, "grad_norm": 1.7973329785108136, "language_loss": 0.67353249, "learning_rate": 4.175607406609278e-07, "loss": 0.69781959, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19152832, "step": 13253, "time_per_iteration": 2.9045536518096924 }, { "auxiliary_loss_clip": 0.01411506, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.24890065, "balance_loss_mlp": 1.01540637, "epoch": 0.7968735908612656, "flos": 23085200424960.0, "grad_norm": 1.5761818291393073, "language_loss": 0.68036878, "learning_rate": 4.1732260234934767e-07, "loss": 0.70483196, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19421387, "step": 13254, "time_per_iteration": 2.8448596000671387 }, { "auxiliary_loss_clip": 0.01418212, "auxiliary_loss_mlp": 0.0103998, "balance_loss_clip": 1.2564069, "balance_loss_mlp": 1.02128768, "epoch": 0.7969337141139335, "flos": 23591599937280.0, "grad_norm": 1.800961943067236, "language_loss": 0.70118439, "learning_rate": 4.1708452405305314e-07, "loss": 0.7257663, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18688965, "step": 13255, "time_per_iteration": 2.897002696990967 }, { "auxiliary_loss_clip": 0.01399022, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.24006164, "balance_loss_mlp": 1.01514125, "epoch": 0.7969938373666016, "flos": 19765416084480.0, "grad_norm": 1.8944943288054252, "language_loss": 0.79627419, "learning_rate": 4.168465057810733e-07, "loss": 0.8206045, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18884277, "step": 13256, "time_per_iteration": 2.969757318496704 }, { "auxiliary_loss_clip": 0.01408854, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.24612904, "balance_loss_mlp": 1.01261735, "epoch": 0.7970539606192695, "flos": 24144764590080.0, "grad_norm": 1.598665652955987, "language_loss": 0.66440415, "learning_rate": 4.166085475424315e-07, "loss": 0.68880951, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19067383, "step": 13257, "time_per_iteration": 2.8490302562713623 }, { "auxiliary_loss_clip": 0.01434841, "auxiliary_loss_mlp": 0.01035336, "balance_loss_clip": 1.26830828, "balance_loss_mlp": 1.01689422, "epoch": 0.7971140838719375, "flos": 17977860783360.0, "grad_norm": 3.236489136504159, "language_loss": 0.73731267, "learning_rate": 4.163706493461523e-07, "loss": 0.76201439, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18457031, "step": 13258, "time_per_iteration": 2.8159732818603516 }, { "auxiliary_loss_clip": 0.01404991, "auxiliary_loss_mlp": 0.01033192, "balance_loss_clip": 1.24158025, "balance_loss_mlp": 1.01414251, "epoch": 0.7971742071246054, "flos": 19178290569600.0, "grad_norm": 1.7377220730474514, "language_loss": 0.69556797, "learning_rate": 4.1613281120125655e-07, "loss": 0.71994984, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19042969, "step": 13259, "time_per_iteration": 2.8283636569976807 }, { "auxiliary_loss_clip": 0.01399275, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.2403295, "balance_loss_mlp": 1.01282096, "epoch": 0.7972343303772734, "flos": 27137409891840.0, "grad_norm": 1.622953234984002, "language_loss": 0.74181032, "learning_rate": 4.158950331167641e-07, "loss": 0.76611519, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18395996, "step": 13260, "time_per_iteration": 2.8672029972076416 }, { "auxiliary_loss_clip": 0.01390697, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.23162365, "balance_loss_mlp": 1.01400137, "epoch": 0.7972944536299413, "flos": 21006593452800.0, "grad_norm": 3.133663783291427, "language_loss": 0.78948921, "learning_rate": 4.1565731510169065e-07, "loss": 0.81371838, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18200684, "step": 13261, "time_per_iteration": 2.873385190963745 }, { "auxiliary_loss_clip": 0.01376204, "auxiliary_loss_mlp": 0.01029857, "balance_loss_clip": 1.22323489, "balance_loss_mlp": 1.01235712, "epoch": 0.7973545768826094, "flos": 21590008894080.0, "grad_norm": 1.5419340529151206, "language_loss": 0.7637623, "learning_rate": 4.154196571650501e-07, "loss": 0.7878229, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.17504883, "step": 13262, "time_per_iteration": 2.854612112045288 }, { "auxiliary_loss_clip": 0.01416179, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.24953008, "balance_loss_mlp": 1.0133419, "epoch": 0.7974147001352773, "flos": 20568296643840.0, "grad_norm": 2.27672191806721, "language_loss": 0.71654081, "learning_rate": 4.1518205931585524e-07, "loss": 0.74104285, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.20678711, "step": 13263, "time_per_iteration": 2.8338117599487305 }, { "auxiliary_loss_clip": 0.01426213, "auxiliary_loss_mlp": 0.01038194, "balance_loss_clip": 1.25859606, "balance_loss_mlp": 1.01860762, "epoch": 0.7974748233879453, "flos": 21006774432000.0, "grad_norm": 1.8106810051212017, "language_loss": 0.72356218, "learning_rate": 4.149445215631153e-07, "loss": 0.74820626, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 1.67578125, "router_z_loss_mlp": 0.19592285, "step": 13264, "time_per_iteration": 2.8476834297180176 }, { "auxiliary_loss_clip": 0.01388235, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.2312746, "balance_loss_mlp": 1.01557755, "epoch": 0.7975349466406133, "flos": 22575452797440.0, "grad_norm": 2.421825716571987, "language_loss": 0.77565849, "learning_rate": 4.1470704391583776e-07, "loss": 0.79987478, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.17822266, "step": 13265, "time_per_iteration": 2.8510687351226807 }, { "auxiliary_loss_clip": 0.01419756, "auxiliary_loss_mlp": 0.01036552, "balance_loss_clip": 1.25434637, "balance_loss_mlp": 1.01737118, "epoch": 0.7975950698932812, "flos": 21699583096320.0, "grad_norm": 1.7862720124012819, "language_loss": 0.75793487, "learning_rate": 4.144696263830285e-07, "loss": 0.78249788, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19165039, "step": 13266, "time_per_iteration": 4.291946887969971 }, { "auxiliary_loss_clip": 0.01402497, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.24305773, "balance_loss_mlp": 1.01365888, "epoch": 0.7976551931459492, "flos": 19612696325760.0, "grad_norm": 4.168652815933306, "language_loss": 0.8449589, "learning_rate": 4.1423226897369015e-07, "loss": 0.86930686, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18664551, "step": 13267, "time_per_iteration": 2.8800418376922607 }, { "auxiliary_loss_clip": 0.01399145, "auxiliary_loss_mlp": 0.01032265, "balance_loss_clip": 1.2401967, "balance_loss_mlp": 1.01340628, "epoch": 0.7977153163986171, "flos": 21697139877120.0, "grad_norm": 1.601632516319317, "language_loss": 0.77319175, "learning_rate": 4.139949716968223e-07, "loss": 0.79750586, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18835449, "step": 13268, "time_per_iteration": 2.8882741928100586 }, { "auxiliary_loss_clip": 0.01407597, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.24764872, "balance_loss_mlp": 1.01638532, "epoch": 0.7977754396512852, "flos": 23487138397440.0, "grad_norm": 1.6707782944322322, "language_loss": 0.78623521, "learning_rate": 4.1375773456142403e-07, "loss": 0.81066298, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18811035, "step": 13269, "time_per_iteration": 2.885432004928589 }, { "auxiliary_loss_clip": 0.01394269, "auxiliary_loss_mlp": 0.0103313, "balance_loss_clip": 1.23748016, "balance_loss_mlp": 1.01527214, "epoch": 0.7978355629039531, "flos": 22392301271040.0, "grad_norm": 2.2996677297106265, "language_loss": 0.82722688, "learning_rate": 4.135205575764922e-07, "loss": 0.85150087, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17871094, "step": 13270, "time_per_iteration": 2.865199565887451 }, { "auxiliary_loss_clip": 0.01398148, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.2376045, "balance_loss_mlp": 1.0149219, "epoch": 0.7978956861566211, "flos": 20275932873600.0, "grad_norm": 1.7828878033480218, "language_loss": 0.60792458, "learning_rate": 4.1328344075101905e-07, "loss": 0.63224477, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.1895752, "step": 13271, "time_per_iteration": 2.8499033451080322 }, { "auxiliary_loss_clip": 0.0142633, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.26023734, "balance_loss_mlp": 1.0120635, "epoch": 0.797955809409289, "flos": 28124482608000.0, "grad_norm": 1.536301934465478, "language_loss": 0.74077737, "learning_rate": 4.130463840939975e-07, "loss": 0.76535457, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19311523, "step": 13272, "time_per_iteration": 4.34929895401001 }, { "auxiliary_loss_clip": 0.01402225, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.2432462, "balance_loss_mlp": 1.01676154, "epoch": 0.798015932661957, "flos": 15567952250880.0, "grad_norm": 1.8591890248152174, "language_loss": 0.72395533, "learning_rate": 4.128093876144161e-07, "loss": 0.74833524, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18981934, "step": 13273, "time_per_iteration": 2.7964675426483154 }, { "auxiliary_loss_clip": 0.01415621, "auxiliary_loss_mlp": 0.01032908, "balance_loss_clip": 1.25213075, "balance_loss_mlp": 1.01334584, "epoch": 0.7980760559146249, "flos": 23961703553280.0, "grad_norm": 1.851210603569782, "language_loss": 0.76823199, "learning_rate": 4.1257245132126117e-07, "loss": 0.79271734, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19567871, "step": 13274, "time_per_iteration": 2.8522191047668457 }, { "auxiliary_loss_clip": 0.01388233, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.23365355, "balance_loss_mlp": 1.01368916, "epoch": 0.798136179167293, "flos": 28049774163840.0, "grad_norm": 1.395743994326899, "language_loss": 0.78209221, "learning_rate": 4.12335575223518e-07, "loss": 0.80628681, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.17553711, "step": 13275, "time_per_iteration": 2.914494037628174 }, { "auxiliary_loss_clip": 0.01404373, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.24106503, "balance_loss_mlp": 1.01949692, "epoch": 0.7981963024199609, "flos": 35996360676480.0, "grad_norm": 1.9699397959520755, "language_loss": 0.63994145, "learning_rate": 4.1209875933016877e-07, "loss": 0.66437519, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19519043, "step": 13276, "time_per_iteration": 2.936234951019287 }, { "auxiliary_loss_clip": 0.0139197, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.23520362, "balance_loss_mlp": 1.0133388, "epoch": 0.7982564256726289, "flos": 25895915809920.0, "grad_norm": 2.2587716022534106, "language_loss": 0.61781532, "learning_rate": 4.118620036501945e-07, "loss": 0.64205116, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18273926, "step": 13277, "time_per_iteration": 4.273302316665649 }, { "auxiliary_loss_clip": 0.01421657, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.25654674, "balance_loss_mlp": 1.01858723, "epoch": 0.7983165489252969, "flos": 25750163750400.0, "grad_norm": 2.0819229394265997, "language_loss": 0.80471265, "learning_rate": 4.1162530819257227e-07, "loss": 0.82929623, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18103027, "step": 13278, "time_per_iteration": 4.296995162963867 }, { "auxiliary_loss_clip": 0.01401165, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.23896527, "balance_loss_mlp": 1.01441932, "epoch": 0.7983766721779648, "flos": 21918233808000.0, "grad_norm": 1.8176660452942959, "language_loss": 0.63939524, "learning_rate": 4.113886729662768e-07, "loss": 0.66374964, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19873047, "step": 13279, "time_per_iteration": 2.8215928077697754 }, { "auxiliary_loss_clip": 0.01376156, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.22301793, "balance_loss_mlp": 1.01301718, "epoch": 0.7984367954306328, "flos": 29358375563520.0, "grad_norm": 2.321491824805067, "language_loss": 0.71855992, "learning_rate": 4.111520979802825e-07, "loss": 0.74263203, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.18029785, "step": 13280, "time_per_iteration": 2.9326117038726807 }, { "auxiliary_loss_clip": 0.01413956, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.25072145, "balance_loss_mlp": 1.01613545, "epoch": 0.7984969186833007, "flos": 31370191931520.0, "grad_norm": 1.7489052342858649, "language_loss": 0.63653111, "learning_rate": 4.1091558324355955e-07, "loss": 0.66102791, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19592285, "step": 13281, "time_per_iteration": 2.957246780395508 }, { "auxiliary_loss_clip": 0.01423877, "auxiliary_loss_mlp": 0.01033817, "balance_loss_clip": 1.25820422, "balance_loss_mlp": 1.01479137, "epoch": 0.7985570419359688, "flos": 24322712964480.0, "grad_norm": 1.7332388445531814, "language_loss": 0.81166023, "learning_rate": 4.1067912876507683e-07, "loss": 0.83623719, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19030762, "step": 13282, "time_per_iteration": 2.870013952255249 }, { "auxiliary_loss_clip": 0.0141107, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.24618638, "balance_loss_mlp": 1.01185, "epoch": 0.7986171651886367, "flos": 15750696574080.0, "grad_norm": 1.8152278234385615, "language_loss": 0.72231758, "learning_rate": 4.10442734553802e-07, "loss": 0.74673498, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18835449, "step": 13283, "time_per_iteration": 2.823507785797119 }, { "auxiliary_loss_clip": 0.01400339, "auxiliary_loss_mlp": 0.01035929, "balance_loss_clip": 1.24028969, "balance_loss_mlp": 1.01715374, "epoch": 0.7986772884413047, "flos": 11626810064640.0, "grad_norm": 3.1678750335664905, "language_loss": 0.74192721, "learning_rate": 4.102064006186967e-07, "loss": 0.76628989, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18786621, "step": 13284, "time_per_iteration": 2.8379576206207275 }, { "auxiliary_loss_clip": 0.01396599, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.23708129, "balance_loss_mlp": 1.01319408, "epoch": 0.7987374116939726, "flos": 22101249600000.0, "grad_norm": 1.7289609691828889, "language_loss": 0.71047521, "learning_rate": 4.0997012696872415e-07, "loss": 0.73474991, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.17687988, "step": 13285, "time_per_iteration": 2.8509156703948975 }, { "auxiliary_loss_clip": 0.01411738, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.24990129, "balance_loss_mlp": 1.01467252, "epoch": 0.7987975349466406, "flos": 17898627859200.0, "grad_norm": 2.086115844629727, "language_loss": 0.74683487, "learning_rate": 4.097339136128437e-07, "loss": 0.77129364, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19470215, "step": 13286, "time_per_iteration": 2.8563501834869385 }, { "auxiliary_loss_clip": 0.01396194, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.23619795, "balance_loss_mlp": 1.01755834, "epoch": 0.7988576581993085, "flos": 19728740534400.0, "grad_norm": 1.9104079453622531, "language_loss": 0.75835037, "learning_rate": 4.0949776056001296e-07, "loss": 0.78267503, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18725586, "step": 13287, "time_per_iteration": 2.850722312927246 }, { "auxiliary_loss_clip": 0.01401793, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.24214661, "balance_loss_mlp": 1.01520038, "epoch": 0.7989177814519766, "flos": 28047602413440.0, "grad_norm": 1.666730307914582, "language_loss": 0.62692153, "learning_rate": 4.092616678191863e-07, "loss": 0.65126991, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.17834473, "step": 13288, "time_per_iteration": 2.9073915481567383 }, { "auxiliary_loss_clip": 0.01396084, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.23836446, "balance_loss_mlp": 1.01268768, "epoch": 0.7989779047046445, "flos": 28881321943680.0, "grad_norm": 1.87429497291728, "language_loss": 0.71692508, "learning_rate": 4.090256353993169e-07, "loss": 0.74119449, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18151855, "step": 13289, "time_per_iteration": 2.9358832836151123 }, { "auxiliary_loss_clip": 0.01387696, "auxiliary_loss_mlp": 0.0103451, "balance_loss_clip": 1.23375213, "balance_loss_mlp": 1.0150311, "epoch": 0.7990380279573125, "flos": 18196013802240.0, "grad_norm": 2.0150737019581975, "language_loss": 0.63507438, "learning_rate": 4.0878966330935506e-07, "loss": 0.65929639, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.19470215, "step": 13290, "time_per_iteration": 2.835797071456909 }, { "auxiliary_loss_clip": 0.01404, "auxiliary_loss_mlp": 0.0103398, "balance_loss_clip": 1.24208045, "balance_loss_mlp": 1.01469159, "epoch": 0.7990981512099805, "flos": 20887970290560.0, "grad_norm": 2.1633976861846715, "language_loss": 0.71976525, "learning_rate": 4.08553751558248e-07, "loss": 0.74414504, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19287109, "step": 13291, "time_per_iteration": 2.8636746406555176 }, { "auxiliary_loss_clip": 0.01385762, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.22900915, "balance_loss_mlp": 1.01439178, "epoch": 0.7991582744626484, "flos": 26109951552000.0, "grad_norm": 1.4615128070866787, "language_loss": 0.64702326, "learning_rate": 4.083179001549422e-07, "loss": 0.67121553, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1907959, "step": 13292, "time_per_iteration": 2.8756368160247803 }, { "auxiliary_loss_clip": 0.01392818, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.23469555, "balance_loss_mlp": 1.01342797, "epoch": 0.7992183977153164, "flos": 35308664674560.0, "grad_norm": 1.678089109899059, "language_loss": 0.56733555, "learning_rate": 4.0808210910838105e-07, "loss": 0.59158254, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18457031, "step": 13293, "time_per_iteration": 2.971869707107544 }, { "auxiliary_loss_clip": 0.01404835, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.24594545, "balance_loss_mlp": 1.01574683, "epoch": 0.7992785209679844, "flos": 51868739076480.0, "grad_norm": 4.444168723012482, "language_loss": 0.72471726, "learning_rate": 4.0784637842750704e-07, "loss": 0.74910843, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18530273, "step": 13294, "time_per_iteration": 3.219226121902466 }, { "auxiliary_loss_clip": 0.01400193, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.23936558, "balance_loss_mlp": 1.01618338, "epoch": 0.7993386442206524, "flos": 22575362307840.0, "grad_norm": 1.7500496273240513, "language_loss": 0.73398054, "learning_rate": 4.0761070812125675e-07, "loss": 0.75832868, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18432617, "step": 13295, "time_per_iteration": 2.8873276710510254 }, { "auxiliary_loss_clip": 0.01396698, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.23978031, "balance_loss_mlp": 1.01709211, "epoch": 0.7993987674733203, "flos": 18807779750400.0, "grad_norm": 2.036066491082595, "language_loss": 0.77066594, "learning_rate": 4.0737509819856797e-07, "loss": 0.79498482, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.1809082, "step": 13296, "time_per_iteration": 2.838067054748535 }, { "auxiliary_loss_clip": 0.0118438, "auxiliary_loss_mlp": 0.01032535, "balance_loss_clip": 1.09535599, "balance_loss_mlp": 1.01174533, "epoch": 0.7994588907259883, "flos": 69455900580480.0, "grad_norm": 0.7017637639938828, "language_loss": 0.60867286, "learning_rate": 4.0713954866837573e-07, "loss": 0.63084203, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20800781, "step": 13297, "time_per_iteration": 3.4176766872406006 }, { "auxiliary_loss_clip": 0.01394317, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.23679876, "balance_loss_mlp": 1.01198936, "epoch": 0.7995190139786562, "flos": 13488259403520.0, "grad_norm": 1.8885496590076198, "language_loss": 0.71579051, "learning_rate": 4.0690405953961073e-07, "loss": 0.74003756, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1842041, "step": 13298, "time_per_iteration": 2.8164517879486084 }, { "auxiliary_loss_clip": 0.01421931, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.25796485, "balance_loss_mlp": 1.01103246, "epoch": 0.7995791372313242, "flos": 21662817056640.0, "grad_norm": 2.186709837120161, "language_loss": 0.76327157, "learning_rate": 4.066686308212037e-07, "loss": 0.78779685, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19555664, "step": 13299, "time_per_iteration": 2.8223607540130615 }, { "auxiliary_loss_clip": 0.01394258, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.23843789, "balance_loss_mlp": 1.01889849, "epoch": 0.7996392604839921, "flos": 26079384049920.0, "grad_norm": 1.6596092438204544, "language_loss": 0.78017485, "learning_rate": 4.064332625220828e-07, "loss": 0.80449224, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18579102, "step": 13300, "time_per_iteration": 2.876464605331421 }, { "auxiliary_loss_clip": 0.01413821, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.25062513, "balance_loss_mlp": 1.01514268, "epoch": 0.7996993837366602, "flos": 24617293729920.0, "grad_norm": 1.7198083623228242, "language_loss": 0.64811099, "learning_rate": 4.0619795465117115e-07, "loss": 0.67258602, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.1854248, "step": 13301, "time_per_iteration": 4.368839979171753 }, { "auxiliary_loss_clip": 0.01402931, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 1.24505448, "balance_loss_mlp": 1.01444244, "epoch": 0.7997595069893281, "flos": 21000802118400.0, "grad_norm": 1.8998907969499954, "language_loss": 0.72645801, "learning_rate": 4.059627072173928e-07, "loss": 0.75080919, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.17724609, "step": 13302, "time_per_iteration": 2.860013246536255 }, { "auxiliary_loss_clip": 0.01417624, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.25416493, "balance_loss_mlp": 1.01526725, "epoch": 0.7998196302419961, "flos": 24437399829120.0, "grad_norm": 1.7562027035667913, "language_loss": 0.84599954, "learning_rate": 4.057275202296684e-07, "loss": 0.87051535, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18664551, "step": 13303, "time_per_iteration": 2.9080238342285156 }, { "auxiliary_loss_clip": 0.0138479, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.22932541, "balance_loss_mlp": 1.0161202, "epoch": 0.7998797534946641, "flos": 30277164597120.0, "grad_norm": 2.041327873573778, "language_loss": 0.59849191, "learning_rate": 4.054923936969166e-07, "loss": 0.62268674, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18579102, "step": 13304, "time_per_iteration": 2.912205696105957 }, { "auxiliary_loss_clip": 0.013987, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.23696947, "balance_loss_mlp": 1.01341391, "epoch": 0.799939876747332, "flos": 23524537864320.0, "grad_norm": 2.01576990073222, "language_loss": 0.70049548, "learning_rate": 4.0525732762805265e-07, "loss": 0.7248081, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19140625, "step": 13305, "time_per_iteration": 2.86551570892334 }, { "auxiliary_loss_clip": 0.01398361, "auxiliary_loss_mlp": 0.01033327, "balance_loss_clip": 1.24065924, "balance_loss_mlp": 1.01468253, "epoch": 0.8, "flos": 19327571723520.0, "grad_norm": 1.7454988102765079, "language_loss": 0.69893146, "learning_rate": 4.0502232203199107e-07, "loss": 0.72324836, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18640137, "step": 13306, "time_per_iteration": 2.838120937347412 }, { "auxiliary_loss_clip": 0.01408175, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.24775124, "balance_loss_mlp": 1.01516438, "epoch": 0.800060123252668, "flos": 32424236231040.0, "grad_norm": 1.4449921462211532, "language_loss": 0.70533663, "learning_rate": 4.0478737691764286e-07, "loss": 0.72975302, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1829834, "step": 13307, "time_per_iteration": 2.99957013130188 }, { "auxiliary_loss_clip": 0.01407771, "auxiliary_loss_mlp": 0.01036395, "balance_loss_clip": 1.24719274, "balance_loss_mlp": 1.01736903, "epoch": 0.800120246505336, "flos": 20020063674240.0, "grad_norm": 3.784966767959506, "language_loss": 0.77890271, "learning_rate": 4.0455249229391677e-07, "loss": 0.80334437, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19030762, "step": 13308, "time_per_iteration": 4.374410390853882 }, { "auxiliary_loss_clip": 0.01407826, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.24431705, "balance_loss_mlp": 1.01024508, "epoch": 0.8001803697580039, "flos": 31880753965440.0, "grad_norm": 2.226683186507593, "language_loss": 0.78958583, "learning_rate": 4.0431766816972e-07, "loss": 0.81395566, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18920898, "step": 13309, "time_per_iteration": 2.928068161010742 }, { "auxiliary_loss_clip": 0.01183171, "auxiliary_loss_mlp": 0.01026653, "balance_loss_clip": 1.0938518, "balance_loss_mlp": 1.0046227, "epoch": 0.8002404930106719, "flos": 63422216023680.0, "grad_norm": 0.8749271372195145, "language_loss": 0.64609575, "learning_rate": 4.040829045539571e-07, "loss": 0.668194, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.22070312, "step": 13310, "time_per_iteration": 3.267462968826294 }, { "auxiliary_loss_clip": 0.01403327, "auxiliary_loss_mlp": 0.01035966, "balance_loss_clip": 1.24301207, "balance_loss_mlp": 1.01772642, "epoch": 0.8003006162633398, "flos": 27867075085440.0, "grad_norm": 1.9751929962755699, "language_loss": 0.83766818, "learning_rate": 4.0384820145553156e-07, "loss": 0.86206114, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18249512, "step": 13311, "time_per_iteration": 2.895260810852051 }, { "auxiliary_loss_clip": 0.0140173, "auxiliary_loss_mlp": 0.01030494, "balance_loss_clip": 1.24203825, "balance_loss_mlp": 1.01184988, "epoch": 0.8003607395160078, "flos": 18232463128320.0, "grad_norm": 1.9382538817917305, "language_loss": 0.67085201, "learning_rate": 4.0361355888334116e-07, "loss": 0.69517422, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18640137, "step": 13312, "time_per_iteration": 4.316673517227173 }, { "auxiliary_loss_clip": 0.01407897, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.2458632, "balance_loss_mlp": 1.01129985, "epoch": 0.8004208627686757, "flos": 20896838271360.0, "grad_norm": 1.7393213547943387, "language_loss": 0.75894678, "learning_rate": 4.033789768462843e-07, "loss": 0.78333461, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19580078, "step": 13313, "time_per_iteration": 2.9634180068969727 }, { "auxiliary_loss_clip": 0.01405378, "auxiliary_loss_mlp": 0.01036747, "balance_loss_clip": 1.2447902, "balance_loss_mlp": 1.01834083, "epoch": 0.8004809860213438, "flos": 26448311301120.0, "grad_norm": 2.024263548702081, "language_loss": 0.76427019, "learning_rate": 4.031444553532575e-07, "loss": 0.7886914, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18408203, "step": 13314, "time_per_iteration": 3.0608139038085938 }, { "auxiliary_loss_clip": 0.01179191, "auxiliary_loss_mlp": 0.01011983, "balance_loss_clip": 1.09272552, "balance_loss_mlp": 0.99472106, "epoch": 0.8005411092740117, "flos": 63679125853440.0, "grad_norm": 0.8021283165920722, "language_loss": 0.53799558, "learning_rate": 4.029099944131522e-07, "loss": 0.55990732, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.17285156, "step": 13315, "time_per_iteration": 3.290274143218994 }, { "auxiliary_loss_clip": 0.01393643, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.23478246, "balance_loss_mlp": 1.01782012, "epoch": 0.8006012325266797, "flos": 36151885612800.0, "grad_norm": 2.344575574171086, "language_loss": 0.72153997, "learning_rate": 4.026755940348603e-07, "loss": 0.74584508, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1907959, "step": 13316, "time_per_iteration": 3.0587940216064453 }, { "auxiliary_loss_clip": 0.0141602, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.25119805, "balance_loss_mlp": 1.01713777, "epoch": 0.8006613557793477, "flos": 33852048975360.0, "grad_norm": 1.8151006819756503, "language_loss": 0.65632206, "learning_rate": 4.024412542272706e-07, "loss": 0.68085629, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20275879, "step": 13317, "time_per_iteration": 2.9783949851989746 }, { "auxiliary_loss_clip": 0.0118188, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.09422231, "balance_loss_mlp": 1.01017618, "epoch": 0.8007214790320156, "flos": 67383582635520.0, "grad_norm": 0.7905015223025839, "language_loss": 0.59083366, "learning_rate": 4.0220697499926783e-07, "loss": 0.61295933, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.20507812, "step": 13318, "time_per_iteration": 3.3819329738616943 }, { "auxiliary_loss_clip": 0.01396477, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.23671484, "balance_loss_mlp": 1.01166189, "epoch": 0.8007816022846836, "flos": 23196131971200.0, "grad_norm": 1.5968409563819024, "language_loss": 0.66664571, "learning_rate": 4.019727563597366e-07, "loss": 0.69091713, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18994141, "step": 13319, "time_per_iteration": 2.8640754222869873 }, { "auxiliary_loss_clip": 0.01410417, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.24702477, "balance_loss_mlp": 1.01492834, "epoch": 0.8008417255373516, "flos": 21991494418560.0, "grad_norm": 1.6950110563242653, "language_loss": 0.74484235, "learning_rate": 4.0173859831755873e-07, "loss": 0.76929855, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20263672, "step": 13320, "time_per_iteration": 2.8719913959503174 }, { "auxiliary_loss_clip": 0.01401467, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.24016058, "balance_loss_mlp": 1.01243651, "epoch": 0.8009018487900196, "flos": 16736049987840.0, "grad_norm": 1.9969630608322348, "language_loss": 0.8143028, "learning_rate": 4.015045008816138e-07, "loss": 0.83863884, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19702148, "step": 13321, "time_per_iteration": 2.8701283931732178 }, { "auxiliary_loss_clip": 0.01387901, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.23184276, "balance_loss_mlp": 1.0137893, "epoch": 0.8009619720426875, "flos": 20823351436800.0, "grad_norm": 1.6626634863518122, "language_loss": 0.66346359, "learning_rate": 4.0127046406077825e-07, "loss": 0.68766564, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18505859, "step": 13322, "time_per_iteration": 2.8731205463409424 }, { "auxiliary_loss_clip": 0.0140787, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.2469269, "balance_loss_mlp": 1.01330316, "epoch": 0.8010220952953555, "flos": 17940054113280.0, "grad_norm": 3.724672340431362, "language_loss": 0.78425062, "learning_rate": 4.010364878639265e-07, "loss": 0.80864841, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18591309, "step": 13323, "time_per_iteration": 2.8070011138916016 }, { "auxiliary_loss_clip": 0.01410205, "auxiliary_loss_mlp": 0.01031981, "balance_loss_clip": 1.24658406, "balance_loss_mlp": 1.01311004, "epoch": 0.8010822185480234, "flos": 24583106643840.0, "grad_norm": 2.8363813073089803, "language_loss": 0.72595429, "learning_rate": 4.00802572299932e-07, "loss": 0.75037616, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.1887207, "step": 13324, "time_per_iteration": 2.8942506313323975 }, { "auxiliary_loss_clip": 0.01406229, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.24246788, "balance_loss_mlp": 1.01033771, "epoch": 0.8011423418006914, "flos": 21838638925440.0, "grad_norm": 1.9085114351444477, "language_loss": 0.76782846, "learning_rate": 4.005687173776635e-07, "loss": 0.79218912, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19506836, "step": 13325, "time_per_iteration": 2.8620030879974365 }, { "auxiliary_loss_clip": 0.01378213, "auxiliary_loss_mlp": 0.01031473, "balance_loss_clip": 1.22391212, "balance_loss_mlp": 1.01297176, "epoch": 0.8012024650533593, "flos": 23925797164800.0, "grad_norm": 1.5414783718703742, "language_loss": 0.7995562, "learning_rate": 4.003349231059898e-07, "loss": 0.8236531, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18505859, "step": 13326, "time_per_iteration": 2.8625690937042236 }, { "auxiliary_loss_clip": 0.01388224, "auxiliary_loss_mlp": 0.01032069, "balance_loss_clip": 1.23289514, "balance_loss_mlp": 1.01332974, "epoch": 0.8012625883060274, "flos": 23597662740480.0, "grad_norm": 2.33854399127897, "language_loss": 0.67225844, "learning_rate": 4.001011894937765e-07, "loss": 0.69646138, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.1875, "step": 13327, "time_per_iteration": 2.871870756149292 }, { "auxiliary_loss_clip": 0.01381455, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.22755504, "balance_loss_mlp": 1.01573324, "epoch": 0.8013227115586953, "flos": 20823758640000.0, "grad_norm": 1.5921773694600445, "language_loss": 0.74410141, "learning_rate": 3.9986751654988636e-07, "loss": 0.76825547, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18212891, "step": 13328, "time_per_iteration": 2.8560938835144043 }, { "auxiliary_loss_clip": 0.01408069, "auxiliary_loss_mlp": 0.01035181, "balance_loss_clip": 1.2446444, "balance_loss_mlp": 1.01610708, "epoch": 0.8013828348113633, "flos": 15896855836800.0, "grad_norm": 4.8184963041625, "language_loss": 0.75032544, "learning_rate": 3.996339042831798e-07, "loss": 0.77475786, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19067383, "step": 13329, "time_per_iteration": 2.871370553970337 }, { "auxiliary_loss_clip": 0.01181576, "auxiliary_loss_mlp": 0.01018336, "balance_loss_clip": 1.0924747, "balance_loss_mlp": 0.99582928, "epoch": 0.8014429580640313, "flos": 71097432353280.0, "grad_norm": 0.7153730252649813, "language_loss": 0.53056759, "learning_rate": 3.9940035270251605e-07, "loss": 0.55256671, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.22460938, "step": 13330, "time_per_iteration": 3.4606757164001465 }, { "auxiliary_loss_clip": 0.01412876, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.24903917, "balance_loss_mlp": 1.01433611, "epoch": 0.8015030813166992, "flos": 23086829237760.0, "grad_norm": 2.1635703215190034, "language_loss": 0.73551559, "learning_rate": 3.991668618167519e-07, "loss": 0.75999117, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20336914, "step": 13331, "time_per_iteration": 2.8754663467407227 }, { "auxiliary_loss_clip": 0.01403448, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.24385619, "balance_loss_mlp": 1.01195419, "epoch": 0.8015632045693672, "flos": 21882508398720.0, "grad_norm": 2.0673101500122457, "language_loss": 0.78023982, "learning_rate": 3.989334316347401e-07, "loss": 0.80458128, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1875, "step": 13332, "time_per_iteration": 2.9269278049468994 }, { "auxiliary_loss_clip": 0.01413133, "auxiliary_loss_mlp": 0.01031908, "balance_loss_clip": 1.24985361, "balance_loss_mlp": 1.01309717, "epoch": 0.8016233278220352, "flos": 23666489360640.0, "grad_norm": 2.1745263091696927, "language_loss": 0.83880568, "learning_rate": 3.987000621653338e-07, "loss": 0.8632561, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18811035, "step": 13333, "time_per_iteration": 2.872791290283203 }, { "auxiliary_loss_clip": 0.01408549, "auxiliary_loss_mlp": 0.01028267, "balance_loss_clip": 1.2464354, "balance_loss_mlp": 1.00881195, "epoch": 0.8016834510747032, "flos": 16261801545600.0, "grad_norm": 1.5992027164871274, "language_loss": 0.74431789, "learning_rate": 3.9846675341738133e-07, "loss": 0.76868606, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19458008, "step": 13334, "time_per_iteration": 2.933253765106201 }, { "auxiliary_loss_clip": 0.01394653, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.23831701, "balance_loss_mlp": 1.01210022, "epoch": 0.8017435743273711, "flos": 12283576606080.0, "grad_norm": 2.0958048110053693, "language_loss": 0.75370705, "learning_rate": 3.9823350539972967e-07, "loss": 0.77796209, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.1875, "step": 13335, "time_per_iteration": 2.997249126434326 }, { "auxiliary_loss_clip": 0.01393926, "auxiliary_loss_mlp": 0.01032674, "balance_loss_clip": 1.2350527, "balance_loss_mlp": 1.01329112, "epoch": 0.8018036975800391, "flos": 17203511710080.0, "grad_norm": 1.7343200601894442, "language_loss": 0.76203632, "learning_rate": 3.9800031812122416e-07, "loss": 0.78630233, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19372559, "step": 13336, "time_per_iteration": 4.448650360107422 }, { "auxiliary_loss_clip": 0.0142825, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.26044357, "balance_loss_mlp": 1.01596701, "epoch": 0.801863820832707, "flos": 20641738233600.0, "grad_norm": 2.348112691545349, "language_loss": 0.75608218, "learning_rate": 3.977671915907068e-07, "loss": 0.78072083, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.19641113, "step": 13337, "time_per_iteration": 3.005729913711548 }, { "auxiliary_loss_clip": 0.01419074, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.25424063, "balance_loss_mlp": 1.01583362, "epoch": 0.801923944085375, "flos": 30457737169920.0, "grad_norm": 1.6151915566018284, "language_loss": 0.8077603, "learning_rate": 3.9753412581701883e-07, "loss": 0.83231795, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20849609, "step": 13338, "time_per_iteration": 2.909785747528076 }, { "auxiliary_loss_clip": 0.01411257, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.24632215, "balance_loss_mlp": 1.01633406, "epoch": 0.801984067338043, "flos": 20020018429440.0, "grad_norm": 2.440966384649834, "language_loss": 0.74669373, "learning_rate": 3.9730112080899733e-07, "loss": 0.77117264, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20300293, "step": 13339, "time_per_iteration": 2.831796169281006 }, { "auxiliary_loss_clip": 0.01393431, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.23631573, "balance_loss_mlp": 1.01236415, "epoch": 0.802044190590711, "flos": 22794329733120.0, "grad_norm": 1.569885241709506, "language_loss": 0.80079502, "learning_rate": 3.970681765754775e-07, "loss": 0.82504141, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18859863, "step": 13340, "time_per_iteration": 2.8791189193725586 }, { "auxiliary_loss_clip": 0.01409918, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.24893641, "balance_loss_mlp": 1.01490879, "epoch": 0.8021043138433789, "flos": 27611658334080.0, "grad_norm": 1.6570462058200652, "language_loss": 0.68713456, "learning_rate": 3.968352931252936e-07, "loss": 0.71156579, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1829834, "step": 13341, "time_per_iteration": 2.904867649078369 }, { "auxiliary_loss_clip": 0.01180193, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.09204245, "balance_loss_mlp": 1.00324047, "epoch": 0.8021644370960469, "flos": 62089680124800.0, "grad_norm": 0.8059924242146608, "language_loss": 0.61814427, "learning_rate": 3.9660247046727547e-07, "loss": 0.64019799, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21972656, "step": 13342, "time_per_iteration": 4.70370888710022 }, { "auxiliary_loss_clip": 0.01400198, "auxiliary_loss_mlp": 0.01038023, "balance_loss_clip": 1.23883688, "balance_loss_mlp": 1.01791191, "epoch": 0.8022245603487148, "flos": 23370686985600.0, "grad_norm": 1.7655239161903353, "language_loss": 0.64092684, "learning_rate": 3.963697086102522e-07, "loss": 0.66530907, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20129395, "step": 13343, "time_per_iteration": 2.8668668270111084 }, { "auxiliary_loss_clip": 0.01378764, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.2258904, "balance_loss_mlp": 1.01294732, "epoch": 0.8022846836013828, "flos": 10860605055360.0, "grad_norm": 1.9682487662073866, "language_loss": 0.69904774, "learning_rate": 3.96137007563051e-07, "loss": 0.72314924, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.18444824, "step": 13344, "time_per_iteration": 2.8238441944122314 }, { "auxiliary_loss_clip": 0.01410126, "auxiliary_loss_mlp": 0.01029293, "balance_loss_clip": 1.24877131, "balance_loss_mlp": 1.0107801, "epoch": 0.8023448068540509, "flos": 29252013742080.0, "grad_norm": 1.4506718320760645, "language_loss": 0.70702064, "learning_rate": 3.9590436733449506e-07, "loss": 0.73141491, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18518066, "step": 13345, "time_per_iteration": 2.9619622230529785 }, { "auxiliary_loss_clip": 0.0118192, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.09240556, "balance_loss_mlp": 1.01091623, "epoch": 0.8024049301067188, "flos": 64182720188160.0, "grad_norm": 0.8698532431680145, "language_loss": 0.63026547, "learning_rate": 3.956717879334059e-07, "loss": 0.65242368, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.22949219, "step": 13346, "time_per_iteration": 4.80489182472229 }, { "auxiliary_loss_clip": 0.01394854, "auxiliary_loss_mlp": 0.01032865, "balance_loss_clip": 1.23882937, "balance_loss_mlp": 1.01391053, "epoch": 0.8024650533593868, "flos": 28597192727040.0, "grad_norm": 1.452428863659513, "language_loss": 0.73034203, "learning_rate": 3.9543926936860327e-07, "loss": 0.75461918, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18969727, "step": 13347, "time_per_iteration": 4.368834972381592 }, { "auxiliary_loss_clip": 0.01410289, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.24692655, "balance_loss_mlp": 1.01192868, "epoch": 0.8025251766120547, "flos": 16990923801600.0, "grad_norm": 2.0851838048172584, "language_loss": 0.74169374, "learning_rate": 3.9520681164890493e-07, "loss": 0.76610082, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18481445, "step": 13348, "time_per_iteration": 2.839907646179199 }, { "auxiliary_loss_clip": 0.01409438, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.24862599, "balance_loss_mlp": 1.01191854, "epoch": 0.8025852998647227, "flos": 22173786293760.0, "grad_norm": 1.8729169133572126, "language_loss": 0.77139854, "learning_rate": 3.9497441478312444e-07, "loss": 0.79579949, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18737793, "step": 13349, "time_per_iteration": 2.8709583282470703 }, { "auxiliary_loss_clip": 0.01406627, "auxiliary_loss_mlp": 0.01033297, "balance_loss_clip": 1.2463845, "balance_loss_mlp": 1.01511788, "epoch": 0.8026454231173906, "flos": 22026903114240.0, "grad_norm": 1.9305067292656592, "language_loss": 0.84559262, "learning_rate": 3.947420787800755e-07, "loss": 0.8699919, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18200684, "step": 13350, "time_per_iteration": 2.8675475120544434 }, { "auxiliary_loss_clip": 0.01403036, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.24237621, "balance_loss_mlp": 1.01594651, "epoch": 0.8027055463700586, "flos": 22501332535680.0, "grad_norm": 1.8557291929072517, "language_loss": 0.7174865, "learning_rate": 3.945098036485679e-07, "loss": 0.74186099, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18444824, "step": 13351, "time_per_iteration": 2.8649048805236816 }, { "auxiliary_loss_clip": 0.01399525, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.24127555, "balance_loss_mlp": 1.01439667, "epoch": 0.8027656696227266, "flos": 28924603234560.0, "grad_norm": 1.84848062896164, "language_loss": 0.62185168, "learning_rate": 3.9427758939740885e-07, "loss": 0.64618874, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19787598, "step": 13352, "time_per_iteration": 2.981959581375122 }, { "auxiliary_loss_clip": 0.01405343, "auxiliary_loss_mlp": 0.01040151, "balance_loss_clip": 1.24642456, "balance_loss_mlp": 1.02150643, "epoch": 0.8028257928753946, "flos": 18598901915520.0, "grad_norm": 2.5197190629107244, "language_loss": 0.77759337, "learning_rate": 3.940454360354046e-07, "loss": 0.80204833, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18640137, "step": 13353, "time_per_iteration": 2.8468422889709473 }, { "auxiliary_loss_clip": 0.01437039, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.26907778, "balance_loss_mlp": 1.01776195, "epoch": 0.8028859161280625, "flos": 19137950190720.0, "grad_norm": 2.1501713607611195, "language_loss": 0.73810446, "learning_rate": 3.938133435713582e-07, "loss": 0.76284695, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.19445801, "step": 13354, "time_per_iteration": 2.892155885696411 }, { "auxiliary_loss_clip": 0.01409714, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.24654627, "balance_loss_mlp": 1.01441717, "epoch": 0.8029460393807305, "flos": 20239483547520.0, "grad_norm": 1.9218560480353146, "language_loss": 0.66441488, "learning_rate": 3.935813120140714e-07, "loss": 0.68884099, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18481445, "step": 13355, "time_per_iteration": 2.866694211959839 }, { "auxiliary_loss_clip": 0.0141864, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.25425148, "balance_loss_mlp": 1.01343441, "epoch": 0.8030061626333984, "flos": 49800583653120.0, "grad_norm": 2.1045918623817155, "language_loss": 0.70108789, "learning_rate": 3.9334934137234235e-07, "loss": 0.72560674, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19824219, "step": 13356, "time_per_iteration": 3.144876480102539 }, { "auxiliary_loss_clip": 0.01396133, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.23636615, "balance_loss_mlp": 1.01814675, "epoch": 0.8030662858860664, "flos": 21624919896960.0, "grad_norm": 1.5179408840296515, "language_loss": 0.77886069, "learning_rate": 3.931174316549666e-07, "loss": 0.80319762, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19421387, "step": 13357, "time_per_iteration": 2.864150285720825 }, { "auxiliary_loss_clip": 0.01413859, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.24884462, "balance_loss_mlp": 1.01446986, "epoch": 0.8031264091387345, "flos": 25641132485760.0, "grad_norm": 2.791562979941978, "language_loss": 0.77858865, "learning_rate": 3.9288558287073937e-07, "loss": 0.80306423, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19213867, "step": 13358, "time_per_iteration": 2.904538154602051 }, { "auxiliary_loss_clip": 0.01407522, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.24705625, "balance_loss_mlp": 1.01476693, "epoch": 0.8031865323914024, "flos": 19655706147840.0, "grad_norm": 6.785732844229658, "language_loss": 0.84930623, "learning_rate": 3.9265379502845143e-07, "loss": 0.87372613, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19726562, "step": 13359, "time_per_iteration": 2.8707432746887207 }, { "auxiliary_loss_clip": 0.01397784, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.23972654, "balance_loss_mlp": 1.01734114, "epoch": 0.8032466556440704, "flos": 26178189989760.0, "grad_norm": 2.097757955264157, "language_loss": 0.74238986, "learning_rate": 3.924220681368928e-07, "loss": 0.766729, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18786621, "step": 13360, "time_per_iteration": 2.868642568588257 }, { "auxiliary_loss_clip": 0.01400121, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.23841929, "balance_loss_mlp": 1.01592064, "epoch": 0.8033067788967383, "flos": 25530924856320.0, "grad_norm": 1.8195244037567007, "language_loss": 0.7042132, "learning_rate": 3.921904022048512e-07, "loss": 0.72855419, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18078613, "step": 13361, "time_per_iteration": 2.8701629638671875 }, { "auxiliary_loss_clip": 0.0142019, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.25511992, "balance_loss_mlp": 1.02109241, "epoch": 0.8033669021494063, "flos": 24034828429440.0, "grad_norm": 1.9306990159389623, "language_loss": 0.70559216, "learning_rate": 3.919587972411098e-07, "loss": 0.7302075, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20251465, "step": 13362, "time_per_iteration": 2.8885715007781982 }, { "auxiliary_loss_clip": 0.01435, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.26640141, "balance_loss_mlp": 1.02402854, "epoch": 0.8034270254020742, "flos": 13595571365760.0, "grad_norm": 2.273211962661656, "language_loss": 0.80425048, "learning_rate": 3.91727253254452e-07, "loss": 0.82905066, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 1.68359375, "router_z_loss_mlp": 0.20983887, "step": 13363, "time_per_iteration": 2.8379573822021484 }, { "auxiliary_loss_clip": 0.01407458, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.24526346, "balance_loss_mlp": 1.01166821, "epoch": 0.8034871486547422, "flos": 27422851207680.0, "grad_norm": 2.09731247627203, "language_loss": 0.75518692, "learning_rate": 3.9149577025365787e-07, "loss": 0.77957392, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19567871, "step": 13364, "time_per_iteration": 2.9101834297180176 }, { "auxiliary_loss_clip": 0.01402736, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.24424291, "balance_loss_mlp": 1.01319742, "epoch": 0.8035472719074102, "flos": 32611414544640.0, "grad_norm": 2.411511613457792, "language_loss": 0.61450535, "learning_rate": 3.9126434824750596e-07, "loss": 0.63886482, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.20019531, "step": 13365, "time_per_iteration": 2.919126033782959 }, { "auxiliary_loss_clip": 0.0141585, "auxiliary_loss_mlp": 0.0103769, "balance_loss_clip": 1.25221515, "balance_loss_mlp": 1.01809168, "epoch": 0.8036073951600782, "flos": 21297871347840.0, "grad_norm": 1.7043073744871837, "language_loss": 0.66899055, "learning_rate": 3.910329872447706e-07, "loss": 0.69352603, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19592285, "step": 13366, "time_per_iteration": 2.895183801651001 }, { "auxiliary_loss_clip": 0.01392792, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.23471987, "balance_loss_mlp": 1.01560879, "epoch": 0.8036675184127461, "flos": 18122934170880.0, "grad_norm": 2.0232066243531284, "language_loss": 0.75347459, "learning_rate": 3.908016872542259e-07, "loss": 0.77773875, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18005371, "step": 13367, "time_per_iteration": 2.838040590286255 }, { "auxiliary_loss_clip": 0.01396053, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.2357235, "balance_loss_mlp": 1.01367247, "epoch": 0.8037276416654141, "flos": 26041170176640.0, "grad_norm": 1.6785514424084573, "language_loss": 0.74849415, "learning_rate": 3.905704482846428e-07, "loss": 0.77278501, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19348145, "step": 13368, "time_per_iteration": 2.9056050777435303 }, { "auxiliary_loss_clip": 0.01420491, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.25623417, "balance_loss_mlp": 1.01153433, "epoch": 0.803787764918082, "flos": 18810177724800.0, "grad_norm": 1.84992754034051, "language_loss": 0.71005833, "learning_rate": 3.90339270344789e-07, "loss": 0.73457015, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19152832, "step": 13369, "time_per_iteration": 2.870034694671631 }, { "auxiliary_loss_clip": 0.0139655, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.23738241, "balance_loss_mlp": 1.01980495, "epoch": 0.80384788817075, "flos": 20234325640320.0, "grad_norm": 2.6113379166807786, "language_loss": 0.74184191, "learning_rate": 3.901081534434312e-07, "loss": 0.76619947, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19396973, "step": 13370, "time_per_iteration": 2.8375585079193115 }, { "auxiliary_loss_clip": 0.01419329, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.2539084, "balance_loss_mlp": 1.01672029, "epoch": 0.8039080114234181, "flos": 18524510184960.0, "grad_norm": 4.004737498883886, "language_loss": 0.88359576, "learning_rate": 3.898770975893342e-07, "loss": 0.90814072, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.18432617, "step": 13371, "time_per_iteration": 2.86247181892395 }, { "auxiliary_loss_clip": 0.01410706, "auxiliary_loss_mlp": 0.01032768, "balance_loss_clip": 1.24493933, "balance_loss_mlp": 1.01379001, "epoch": 0.803968134676086, "flos": 22392663229440.0, "grad_norm": 2.561506122708692, "language_loss": 0.75775659, "learning_rate": 3.89646102791259e-07, "loss": 0.7821914, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18994141, "step": 13372, "time_per_iteration": 4.3639185428619385 }, { "auxiliary_loss_clip": 0.01402667, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 1.24137187, "balance_loss_mlp": 1.01461315, "epoch": 0.804028257928754, "flos": 23853260471040.0, "grad_norm": 2.1775810626621945, "language_loss": 0.80139089, "learning_rate": 3.894151690579646e-07, "loss": 0.82575816, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19445801, "step": 13373, "time_per_iteration": 2.8709256649017334 }, { "auxiliary_loss_clip": 0.01396602, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.23903036, "balance_loss_mlp": 1.01748013, "epoch": 0.8040883811814219, "flos": 23561349148800.0, "grad_norm": 1.4792355393855843, "language_loss": 0.75224125, "learning_rate": 3.8918429639820815e-07, "loss": 0.77656257, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18054199, "step": 13374, "time_per_iteration": 2.895918846130371 }, { "auxiliary_loss_clip": 0.01413674, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.248752, "balance_loss_mlp": 1.01750886, "epoch": 0.8041485044340899, "flos": 19035660401280.0, "grad_norm": 3.145790817163915, "language_loss": 0.70417738, "learning_rate": 3.889534848207452e-07, "loss": 0.72867978, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19067383, "step": 13375, "time_per_iteration": 2.8317067623138428 }, { "auxiliary_loss_clip": 0.01180618, "auxiliary_loss_mlp": 0.01025227, "balance_loss_clip": 1.09132934, "balance_loss_mlp": 1.00205266, "epoch": 0.8042086276867578, "flos": 70040039938560.0, "grad_norm": 0.7248596454600553, "language_loss": 0.55704474, "learning_rate": 3.887227343343271e-07, "loss": 0.57910323, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.23144531, "step": 13376, "time_per_iteration": 3.401216506958008 }, { "auxiliary_loss_clip": 0.01414418, "auxiliary_loss_mlp": 0.01030204, "balance_loss_clip": 1.25132394, "balance_loss_mlp": 1.01126146, "epoch": 0.8042687509394258, "flos": 21882644133120.0, "grad_norm": 1.6362383386345543, "language_loss": 0.74345529, "learning_rate": 3.8849204494770425e-07, "loss": 0.76790154, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18933105, "step": 13377, "time_per_iteration": 4.335128545761108 }, { "auxiliary_loss_clip": 0.01403893, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.24230123, "balance_loss_mlp": 1.01428533, "epoch": 0.8043288741920938, "flos": 26626123941120.0, "grad_norm": 1.824744961102246, "language_loss": 0.70923626, "learning_rate": 3.8826141666962567e-07, "loss": 0.73360646, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18847656, "step": 13378, "time_per_iteration": 2.9427356719970703 }, { "auxiliary_loss_clip": 0.01412892, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.25102913, "balance_loss_mlp": 1.01460564, "epoch": 0.8043889974447618, "flos": 33416964547200.0, "grad_norm": 1.5260763012474514, "language_loss": 0.69611555, "learning_rate": 3.880308495088347e-07, "loss": 0.72058737, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19689941, "step": 13379, "time_per_iteration": 2.9899532794952393 }, { "auxiliary_loss_clip": 0.0141677, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 1.24986553, "balance_loss_mlp": 1.01643848, "epoch": 0.8044491206974297, "flos": 20385642810240.0, "grad_norm": 1.6541637886710243, "language_loss": 0.76768619, "learning_rate": 3.8780034347407533e-07, "loss": 0.79221535, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 0.19702148, "step": 13380, "time_per_iteration": 2.8687806129455566 }, { "auxiliary_loss_clip": 0.01401888, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.24181104, "balance_loss_mlp": 1.01444054, "epoch": 0.8045092439500977, "flos": 23414149255680.0, "grad_norm": 1.8580536363382052, "language_loss": 0.70144057, "learning_rate": 3.875698985740887e-07, "loss": 0.72578353, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.17980957, "step": 13381, "time_per_iteration": 4.260072231292725 }, { "auxiliary_loss_clip": 0.01399153, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.23931193, "balance_loss_mlp": 1.0159682, "epoch": 0.8045693672027656, "flos": 24107817571200.0, "grad_norm": 2.2076316758976007, "language_loss": 0.64842212, "learning_rate": 3.873395148176135e-07, "loss": 0.67275834, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18493652, "step": 13382, "time_per_iteration": 4.318398714065552 }, { "auxiliary_loss_clip": 0.01397266, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.23773038, "balance_loss_mlp": 1.01495266, "epoch": 0.8046294904554336, "flos": 27718110645120.0, "grad_norm": 2.1800360630137163, "language_loss": 0.76733893, "learning_rate": 3.8710919221338487e-07, "loss": 0.79164571, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18444824, "step": 13383, "time_per_iteration": 2.9065141677856445 }, { "auxiliary_loss_clip": 0.01403413, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.24282432, "balance_loss_mlp": 1.01678848, "epoch": 0.8046896137081017, "flos": 24983823006720.0, "grad_norm": 1.699060748201446, "language_loss": 0.70425606, "learning_rate": 3.868789307701381e-07, "loss": 0.72864074, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18273926, "step": 13384, "time_per_iteration": 2.895906448364258 }, { "auxiliary_loss_clip": 0.01413816, "auxiliary_loss_mlp": 0.01032732, "balance_loss_clip": 1.24940586, "balance_loss_mlp": 1.01345611, "epoch": 0.8047497369607696, "flos": 17684320648320.0, "grad_norm": 2.0454217516507676, "language_loss": 0.80326629, "learning_rate": 3.8664873049660375e-07, "loss": 0.82773185, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19287109, "step": 13385, "time_per_iteration": 2.867671012878418 }, { "auxiliary_loss_clip": 0.01413446, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.25112367, "balance_loss_mlp": 1.01872492, "epoch": 0.8048098602134376, "flos": 22392120291840.0, "grad_norm": 1.6124151516120915, "language_loss": 0.72837007, "learning_rate": 3.864185914015108e-07, "loss": 0.75289428, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.20251465, "step": 13386, "time_per_iteration": 2.8528923988342285 }, { "auxiliary_loss_clip": 0.01177859, "auxiliary_loss_mlp": 0.01020416, "balance_loss_clip": 1.09060335, "balance_loss_mlp": 1.0009613, "epoch": 0.8048699834661055, "flos": 71233366291200.0, "grad_norm": 0.660111408875946, "language_loss": 0.51274914, "learning_rate": 3.861885134935865e-07, "loss": 0.53473186, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.19433594, "step": 13387, "time_per_iteration": 3.4094936847686768 }, { "auxiliary_loss_clip": 0.01405939, "auxiliary_loss_mlp": 0.01035011, "balance_loss_clip": 1.24468327, "balance_loss_mlp": 1.01525784, "epoch": 0.8049301067187735, "flos": 23671013840640.0, "grad_norm": 1.7715273981272661, "language_loss": 0.74534595, "learning_rate": 3.859584967815559e-07, "loss": 0.76975548, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19750977, "step": 13388, "time_per_iteration": 2.8850924968719482 }, { "auxiliary_loss_clip": 0.01406912, "auxiliary_loss_mlp": 0.01030998, "balance_loss_clip": 1.24756408, "balance_loss_mlp": 1.01223445, "epoch": 0.8049902299714414, "flos": 24437173605120.0, "grad_norm": 1.5394425217330887, "language_loss": 0.72088742, "learning_rate": 3.857285412741411e-07, "loss": 0.7452665, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18737793, "step": 13389, "time_per_iteration": 2.9136300086975098 }, { "auxiliary_loss_clip": 0.01398889, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.24092698, "balance_loss_mlp": 1.01997852, "epoch": 0.8050503532241094, "flos": 17501304856320.0, "grad_norm": 2.742761592439407, "language_loss": 0.83371258, "learning_rate": 3.8549864698006097e-07, "loss": 0.8580876, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1862793, "step": 13390, "time_per_iteration": 2.8573920726776123 }, { "auxiliary_loss_clip": 0.01177475, "auxiliary_loss_mlp": 0.01022807, "balance_loss_clip": 1.08930671, "balance_loss_mlp": 0.99896479, "epoch": 0.8051104764767774, "flos": 57685193493120.0, "grad_norm": 0.7870348976964566, "language_loss": 0.55487627, "learning_rate": 3.8526881390803424e-07, "loss": 0.57687902, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.23828125, "step": 13391, "time_per_iteration": 3.2774362564086914 }, { "auxiliary_loss_clip": 0.01391105, "auxiliary_loss_mlp": 0.010337, "balance_loss_clip": 1.23493397, "balance_loss_mlp": 1.01465011, "epoch": 0.8051705997294454, "flos": 18012274093440.0, "grad_norm": 1.4839752610738173, "language_loss": 0.85194135, "learning_rate": 3.850390420667762e-07, "loss": 0.87618941, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.19042969, "step": 13392, "time_per_iteration": 2.8301596641540527 }, { "auxiliary_loss_clip": 0.0139934, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.2377255, "balance_loss_mlp": 1.01678348, "epoch": 0.8052307229821133, "flos": 26409418755840.0, "grad_norm": 1.376581924699406, "language_loss": 0.71049362, "learning_rate": 3.8480933146499914e-07, "loss": 0.73483384, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.17883301, "step": 13393, "time_per_iteration": 2.996227502822876 }, { "auxiliary_loss_clip": 0.01413413, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.25075722, "balance_loss_mlp": 1.01450419, "epoch": 0.8052908462347813, "flos": 21765649783680.0, "grad_norm": 2.102848144448321, "language_loss": 0.77329767, "learning_rate": 3.84579682111414e-07, "loss": 0.79777873, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20202637, "step": 13394, "time_per_iteration": 2.8571460247039795 }, { "auxiliary_loss_clip": 0.01399391, "auxiliary_loss_mlp": 0.01035775, "balance_loss_clip": 1.23942518, "balance_loss_mlp": 1.01696396, "epoch": 0.8053509694874492, "flos": 25451918156160.0, "grad_norm": 1.6990338924013488, "language_loss": 0.65545225, "learning_rate": 3.843500940147304e-07, "loss": 0.67980391, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18811035, "step": 13395, "time_per_iteration": 3.0366392135620117 }, { "auxiliary_loss_clip": 0.01178067, "auxiliary_loss_mlp": 0.01016177, "balance_loss_clip": 1.08974624, "balance_loss_mlp": 0.99824774, "epoch": 0.8054110927401172, "flos": 57697907281920.0, "grad_norm": 0.7573514763798864, "language_loss": 0.57400811, "learning_rate": 3.8412056718365206e-07, "loss": 0.59595048, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.1796875, "step": 13396, "time_per_iteration": 3.5205607414245605 }, { "auxiliary_loss_clip": 0.0140063, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.24034929, "balance_loss_mlp": 1.01807928, "epoch": 0.8054712159927853, "flos": 19284245187840.0, "grad_norm": 1.6523474499059487, "language_loss": 0.78283644, "learning_rate": 3.8389110162688353e-07, "loss": 0.8072176, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19396973, "step": 13397, "time_per_iteration": 2.8544516563415527 }, { "auxiliary_loss_clip": 0.01406363, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.24623597, "balance_loss_mlp": 1.01268756, "epoch": 0.8055313392454532, "flos": 17976593928960.0, "grad_norm": 1.5269162296436827, "language_loss": 0.71035254, "learning_rate": 3.836616973531266e-07, "loss": 0.73472345, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18041992, "step": 13398, "time_per_iteration": 2.874073028564453 }, { "auxiliary_loss_clip": 0.01402514, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.24234724, "balance_loss_mlp": 1.01258111, "epoch": 0.8055914624981212, "flos": 13485770939520.0, "grad_norm": 4.277179931618462, "language_loss": 0.70165575, "learning_rate": 3.834323543710805e-07, "loss": 0.72599733, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19067383, "step": 13399, "time_per_iteration": 2.8311643600463867 }, { "auxiliary_loss_clip": 0.01405009, "auxiliary_loss_mlp": 0.01032616, "balance_loss_clip": 1.24476457, "balance_loss_mlp": 1.01409113, "epoch": 0.8056515857507891, "flos": 13232616428160.0, "grad_norm": 2.388711289797942, "language_loss": 0.73357832, "learning_rate": 3.8320307268944153e-07, "loss": 0.7579546, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18518066, "step": 13400, "time_per_iteration": 2.8717217445373535 }, { "auxiliary_loss_clip": 0.0139456, "auxiliary_loss_mlp": 0.010313, "balance_loss_clip": 1.2356416, "balance_loss_mlp": 1.01272678, "epoch": 0.8057117090034571, "flos": 23888669166720.0, "grad_norm": 1.6750622398553707, "language_loss": 0.6437602, "learning_rate": 3.829738523169037e-07, "loss": 0.66801876, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18579102, "step": 13401, "time_per_iteration": 2.903275489807129 }, { "auxiliary_loss_clip": 0.01401045, "auxiliary_loss_mlp": 0.0103563, "balance_loss_clip": 1.24025702, "balance_loss_mlp": 1.01766479, "epoch": 0.805771832256125, "flos": 21223977310080.0, "grad_norm": 2.0537003144785007, "language_loss": 0.85153913, "learning_rate": 3.8274469326215985e-07, "loss": 0.87590593, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1796875, "step": 13402, "time_per_iteration": 2.8236422538757324 }, { "auxiliary_loss_clip": 0.01411687, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.25045991, "balance_loss_mlp": 1.01458752, "epoch": 0.805831955508793, "flos": 17575832321280.0, "grad_norm": 1.974165997438179, "language_loss": 0.68876833, "learning_rate": 3.8251559553389876e-07, "loss": 0.71322548, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19433594, "step": 13403, "time_per_iteration": 2.813140869140625 }, { "auxiliary_loss_clip": 0.01394621, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.23779941, "balance_loss_mlp": 1.01739371, "epoch": 0.805892078761461, "flos": 26919075893760.0, "grad_norm": 1.908368806174867, "language_loss": 0.85583323, "learning_rate": 3.822865591408084e-07, "loss": 0.88014162, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18823242, "step": 13404, "time_per_iteration": 2.8958840370178223 }, { "auxiliary_loss_clip": 0.01386982, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.23119926, "balance_loss_mlp": 1.01335764, "epoch": 0.805952202014129, "flos": 31519608819840.0, "grad_norm": 3.766745832848424, "language_loss": 0.71002877, "learning_rate": 3.820575840915743e-07, "loss": 0.73421705, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18469238, "step": 13405, "time_per_iteration": 3.001819372177124 }, { "auxiliary_loss_clip": 0.01395809, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.23836029, "balance_loss_mlp": 1.01561356, "epoch": 0.8060123252667969, "flos": 24400814768640.0, "grad_norm": 2.32728104341965, "language_loss": 0.76381093, "learning_rate": 3.818286703948788e-07, "loss": 0.78811073, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18566895, "step": 13406, "time_per_iteration": 2.8827598094940186 }, { "auxiliary_loss_clip": 0.01404163, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.2430861, "balance_loss_mlp": 1.01238704, "epoch": 0.8060724485194649, "flos": 23491119939840.0, "grad_norm": 1.4433025573017388, "language_loss": 0.76715004, "learning_rate": 3.815998180594018e-07, "loss": 0.79150343, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18798828, "step": 13407, "time_per_iteration": 4.370076894760132 }, { "auxiliary_loss_clip": 0.01396391, "auxiliary_loss_mlp": 0.01035529, "balance_loss_clip": 1.23763514, "balance_loss_mlp": 1.01608622, "epoch": 0.8061325717721328, "flos": 18633677184000.0, "grad_norm": 2.0721268400864585, "language_loss": 0.74708676, "learning_rate": 3.81371027093822e-07, "loss": 0.77140594, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19433594, "step": 13408, "time_per_iteration": 2.857653856277466 }, { "auxiliary_loss_clip": 0.01391995, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 1.23388374, "balance_loss_mlp": 1.01248431, "epoch": 0.8061926950248008, "flos": 23592233364480.0, "grad_norm": 2.2115415629621915, "language_loss": 0.71501279, "learning_rate": 3.8114229750681523e-07, "loss": 0.73925042, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19299316, "step": 13409, "time_per_iteration": 2.8886425495147705 }, { "auxiliary_loss_clip": 0.01403514, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.24171436, "balance_loss_mlp": 1.01261663, "epoch": 0.8062528182774689, "flos": 11150163648000.0, "grad_norm": 2.420248317528739, "language_loss": 0.77676463, "learning_rate": 3.809136293070545e-07, "loss": 0.80111969, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19372559, "step": 13410, "time_per_iteration": 2.8205883502960205 }, { "auxiliary_loss_clip": 0.0140103, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.24196076, "balance_loss_mlp": 1.01374602, "epoch": 0.8063129415301368, "flos": 22357164044160.0, "grad_norm": 1.7771576195938623, "language_loss": 0.6957072, "learning_rate": 3.806850225032117e-07, "loss": 0.72005826, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.20324707, "step": 13411, "time_per_iteration": 2.909684181213379 }, { "auxiliary_loss_clip": 0.01394575, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.23658526, "balance_loss_mlp": 1.01101506, "epoch": 0.8063730647828048, "flos": 23999103020160.0, "grad_norm": 1.8186997707670824, "language_loss": 0.68837917, "learning_rate": 3.804564771039551e-07, "loss": 0.7126267, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19140625, "step": 13412, "time_per_iteration": 4.444105386734009 }, { "auxiliary_loss_clip": 0.01411578, "auxiliary_loss_mlp": 0.01035383, "balance_loss_clip": 1.24694204, "balance_loss_mlp": 1.01557016, "epoch": 0.8064331880354727, "flos": 21330610600320.0, "grad_norm": 1.6652889251444154, "language_loss": 0.82359982, "learning_rate": 3.8022799311795064e-07, "loss": 0.84806943, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19787598, "step": 13413, "time_per_iteration": 2.838923454284668 }, { "auxiliary_loss_clip": 0.01413195, "auxiliary_loss_mlp": 0.01030124, "balance_loss_clip": 1.25296795, "balance_loss_mlp": 1.01195621, "epoch": 0.8064933112881407, "flos": 19692426942720.0, "grad_norm": 1.7864748797440038, "language_loss": 0.85732174, "learning_rate": 3.7999957055386303e-07, "loss": 0.88175499, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1817627, "step": 13414, "time_per_iteration": 2.835331678390503 }, { "auxiliary_loss_clip": 0.01400054, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.24170423, "balance_loss_mlp": 1.01214647, "epoch": 0.8065534345408086, "flos": 19288679178240.0, "grad_norm": 1.856555948343962, "language_loss": 0.68126941, "learning_rate": 3.7977120942035467e-07, "loss": 0.70557392, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18249512, "step": 13415, "time_per_iteration": 2.846843957901001 }, { "auxiliary_loss_clip": 0.01387345, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.23092055, "balance_loss_mlp": 1.01019323, "epoch": 0.8066135577934767, "flos": 19685911691520.0, "grad_norm": 1.5312430613754078, "language_loss": 0.7666108, "learning_rate": 3.7954290972608383e-07, "loss": 0.79076672, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18041992, "step": 13416, "time_per_iteration": 4.20928430557251 }, { "auxiliary_loss_clip": 0.01410127, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.24603999, "balance_loss_mlp": 1.0117321, "epoch": 0.8066736810461446, "flos": 21153838590720.0, "grad_norm": 1.4524632766069496, "language_loss": 0.659881, "learning_rate": 3.793146714797086e-07, "loss": 0.68428689, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18725586, "step": 13417, "time_per_iteration": 2.867288112640381 }, { "auxiliary_loss_clip": 0.01405622, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.24253702, "balance_loss_mlp": 1.01152968, "epoch": 0.8067338042988126, "flos": 22607649112320.0, "grad_norm": 2.0606198133882385, "language_loss": 0.81405467, "learning_rate": 3.7908649468988306e-07, "loss": 0.8384093, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18322754, "step": 13418, "time_per_iteration": 4.231289386749268 }, { "auxiliary_loss_clip": 0.01412803, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.25128007, "balance_loss_mlp": 1.01063967, "epoch": 0.8067939275514805, "flos": 16517082562560.0, "grad_norm": 1.4988214139260125, "language_loss": 0.84761739, "learning_rate": 3.7885837936526066e-07, "loss": 0.87204462, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19262695, "step": 13419, "time_per_iteration": 2.8087000846862793 }, { "auxiliary_loss_clip": 0.0140044, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.23788905, "balance_loss_mlp": 1.01190722, "epoch": 0.8068540508041485, "flos": 28551830175360.0, "grad_norm": 1.5717635802358858, "language_loss": 0.76504129, "learning_rate": 3.7863032551449047e-07, "loss": 0.78934944, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18444824, "step": 13420, "time_per_iteration": 2.917249917984009 }, { "auxiliary_loss_clip": 0.01396025, "auxiliary_loss_mlp": 0.01030779, "balance_loss_clip": 1.23767257, "balance_loss_mlp": 1.01245666, "epoch": 0.8069141740568164, "flos": 21662364608640.0, "grad_norm": 1.7092244522952564, "language_loss": 0.78796673, "learning_rate": 3.784023331462207e-07, "loss": 0.81223476, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18322754, "step": 13421, "time_per_iteration": 2.855011224746704 }, { "auxiliary_loss_clip": 0.01412617, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.25177979, "balance_loss_mlp": 1.01333582, "epoch": 0.8069742973094844, "flos": 17538070896000.0, "grad_norm": 2.0495907132919964, "language_loss": 0.8040309, "learning_rate": 3.78174402269098e-07, "loss": 0.8284806, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19006348, "step": 13422, "time_per_iteration": 2.852229356765747 }, { "auxiliary_loss_clip": 0.01395802, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.2371943, "balance_loss_mlp": 1.01395929, "epoch": 0.8070344205621525, "flos": 23377292726400.0, "grad_norm": 1.7468343822093018, "language_loss": 0.6895225, "learning_rate": 3.7794653289176347e-07, "loss": 0.71380544, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.1854248, "step": 13423, "time_per_iteration": 2.840623140335083 }, { "auxiliary_loss_clip": 0.01419077, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.25435531, "balance_loss_mlp": 1.01881194, "epoch": 0.8070945438148204, "flos": 22940579485440.0, "grad_norm": 1.762256392215455, "language_loss": 0.8105579, "learning_rate": 3.7771872502285904e-07, "loss": 0.83513713, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20043945, "step": 13424, "time_per_iteration": 2.8937370777130127 }, { "auxiliary_loss_clip": 0.01412924, "auxiliary_loss_mlp": 0.01028897, "balance_loss_clip": 1.24881172, "balance_loss_mlp": 1.00999045, "epoch": 0.8071546670674884, "flos": 25311414493440.0, "grad_norm": 2.321315802714643, "language_loss": 0.79427475, "learning_rate": 3.774909786710232e-07, "loss": 0.81869292, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18908691, "step": 13425, "time_per_iteration": 2.9255423545837402 }, { "auxiliary_loss_clip": 0.01403706, "auxiliary_loss_mlp": 0.01034302, "balance_loss_clip": 1.24339509, "balance_loss_mlp": 1.0156343, "epoch": 0.8072147903201563, "flos": 18122753191680.0, "grad_norm": 2.0971848216966538, "language_loss": 0.76181436, "learning_rate": 3.772632938448923e-07, "loss": 0.78619444, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18676758, "step": 13426, "time_per_iteration": 2.800926446914673 }, { "auxiliary_loss_clip": 0.01403854, "auxiliary_loss_mlp": 0.01032693, "balance_loss_clip": 1.24378252, "balance_loss_mlp": 1.01422751, "epoch": 0.8072749135728243, "flos": 26699520286080.0, "grad_norm": 1.7238513280618004, "language_loss": 0.73577142, "learning_rate": 3.770356705530997e-07, "loss": 0.7601369, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18469238, "step": 13427, "time_per_iteration": 2.9195337295532227 }, { "auxiliary_loss_clip": 0.01399103, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.23940539, "balance_loss_mlp": 1.01887488, "epoch": 0.8073350368254922, "flos": 19248474533760.0, "grad_norm": 2.5037620072777624, "language_loss": 0.70854712, "learning_rate": 3.768081088042774e-07, "loss": 0.73292267, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19580078, "step": 13428, "time_per_iteration": 2.827843427658081 }, { "auxiliary_loss_clip": 0.01402142, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.24028349, "balance_loss_mlp": 1.01230669, "epoch": 0.8073951600781603, "flos": 13342462099200.0, "grad_norm": 2.063948696597186, "language_loss": 0.75970745, "learning_rate": 3.765806086070544e-07, "loss": 0.78403044, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.17858887, "step": 13429, "time_per_iteration": 2.8270816802978516 }, { "auxiliary_loss_clip": 0.01379577, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.22508872, "balance_loss_mlp": 1.01476634, "epoch": 0.8074552833308282, "flos": 22862522926080.0, "grad_norm": 3.0863117260565063, "language_loss": 0.68205303, "learning_rate": 3.763531699700568e-07, "loss": 0.70617515, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.17883301, "step": 13430, "time_per_iteration": 2.861515998840332 }, { "auxiliary_loss_clip": 0.01406064, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.24630642, "balance_loss_mlp": 1.01395106, "epoch": 0.8075154065834962, "flos": 20349057749760.0, "grad_norm": 1.8807955256633995, "language_loss": 0.80627155, "learning_rate": 3.7612579290190994e-07, "loss": 0.83066094, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18920898, "step": 13431, "time_per_iteration": 2.8602702617645264 }, { "auxiliary_loss_clip": 0.01390081, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.2339052, "balance_loss_mlp": 1.01022744, "epoch": 0.8075755298361641, "flos": 21918052828800.0, "grad_norm": 2.031051061640121, "language_loss": 0.81108963, "learning_rate": 3.7589847741123593e-07, "loss": 0.83528656, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19384766, "step": 13432, "time_per_iteration": 2.849001169204712 }, { "auxiliary_loss_clip": 0.01422394, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.2576108, "balance_loss_mlp": 1.01475358, "epoch": 0.8076356530888321, "flos": 15677752677120.0, "grad_norm": 2.24558034939697, "language_loss": 0.71517682, "learning_rate": 3.7567122350665415e-07, "loss": 0.73974097, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19287109, "step": 13433, "time_per_iteration": 2.852055788040161 }, { "auxiliary_loss_clip": 0.01398316, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.23847163, "balance_loss_mlp": 1.01363671, "epoch": 0.8076957763415, "flos": 37791019411200.0, "grad_norm": 1.3435480779108975, "language_loss": 0.72543967, "learning_rate": 3.754440311967828e-07, "loss": 0.7497499, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19055176, "step": 13434, "time_per_iteration": 2.9942433834075928 }, { "auxiliary_loss_clip": 0.01410553, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.25133061, "balance_loss_mlp": 1.0135448, "epoch": 0.807755899594168, "flos": 19620523676160.0, "grad_norm": 2.184675900733435, "language_loss": 0.68845904, "learning_rate": 3.752169004902361e-07, "loss": 0.71288216, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18225098, "step": 13435, "time_per_iteration": 2.8255913257598877 }, { "auxiliary_loss_clip": 0.01416443, "auxiliary_loss_mlp": 0.01034263, "balance_loss_clip": 1.25368381, "balance_loss_mlp": 1.01390171, "epoch": 0.8078160228468361, "flos": 23305344215040.0, "grad_norm": 1.5233189888650691, "language_loss": 0.7551465, "learning_rate": 3.749898313956279e-07, "loss": 0.77965355, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20349121, "step": 13436, "time_per_iteration": 2.858335494995117 }, { "auxiliary_loss_clip": 0.01382798, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.22650027, "balance_loss_mlp": 1.0140698, "epoch": 0.807876146099504, "flos": 27174175931520.0, "grad_norm": 1.6432180504901805, "language_loss": 0.71060187, "learning_rate": 3.747628239215674e-07, "loss": 0.73475164, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18103027, "step": 13437, "time_per_iteration": 2.88662052154541 }, { "auxiliary_loss_clip": 0.01384997, "auxiliary_loss_mlp": 0.01028084, "balance_loss_clip": 1.22900522, "balance_loss_mlp": 1.01005995, "epoch": 0.807936269352172, "flos": 27170873061120.0, "grad_norm": 1.5604907904453087, "language_loss": 0.73257667, "learning_rate": 3.745358780766636e-07, "loss": 0.75670755, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18017578, "step": 13438, "time_per_iteration": 2.9677536487579346 }, { "auxiliary_loss_clip": 0.01399877, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.24069858, "balance_loss_mlp": 1.01134086, "epoch": 0.8079963926048399, "flos": 20750090826240.0, "grad_norm": 1.8005601387861576, "language_loss": 0.77489525, "learning_rate": 3.7430899386952344e-07, "loss": 0.79920167, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1940918, "step": 13439, "time_per_iteration": 2.8827760219573975 }, { "auxiliary_loss_clip": 0.01392876, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 1.23657906, "balance_loss_mlp": 1.01210344, "epoch": 0.8080565158575079, "flos": 25020543801600.0, "grad_norm": 1.7471501629102566, "language_loss": 0.7868281, "learning_rate": 3.7408217130874786e-07, "loss": 0.81106341, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.1854248, "step": 13440, "time_per_iteration": 2.9174726009368896 }, { "auxiliary_loss_clip": 0.0139757, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.23653221, "balance_loss_mlp": 1.01216996, "epoch": 0.8081166391101758, "flos": 18707435487360.0, "grad_norm": 1.6725201238059553, "language_loss": 0.60161972, "learning_rate": 3.7385541040293946e-07, "loss": 0.62590694, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18981934, "step": 13441, "time_per_iteration": 2.845165491104126 }, { "auxiliary_loss_clip": 0.01396581, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.23917246, "balance_loss_mlp": 1.01580548, "epoch": 0.8081767623628439, "flos": 19838179002240.0, "grad_norm": 2.4444683439216446, "language_loss": 0.7675755, "learning_rate": 3.7362871116069684e-07, "loss": 0.79189801, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19873047, "step": 13442, "time_per_iteration": 4.256820440292358 }, { "auxiliary_loss_clip": 0.01391403, "auxiliary_loss_mlp": 0.01031045, "balance_loss_clip": 1.23285604, "balance_loss_mlp": 1.01274657, "epoch": 0.8082368856155118, "flos": 35786623190400.0, "grad_norm": 1.5338835421849233, "language_loss": 0.71498811, "learning_rate": 3.734020735906169e-07, "loss": 0.73921257, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1829834, "step": 13443, "time_per_iteration": 2.9580256938934326 }, { "auxiliary_loss_clip": 0.01391541, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.23460186, "balance_loss_mlp": 1.01431108, "epoch": 0.8082970088681798, "flos": 17205909684480.0, "grad_norm": 1.8009008886486892, "language_loss": 0.82978928, "learning_rate": 3.7317549770129286e-07, "loss": 0.85402864, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18078613, "step": 13444, "time_per_iteration": 2.865795850753784 }, { "auxiliary_loss_clip": 0.01174137, "auxiliary_loss_mlp": 0.01019104, "balance_loss_clip": 1.08760107, "balance_loss_mlp": 0.99850464, "epoch": 0.8083571321208477, "flos": 63580392420480.0, "grad_norm": 0.8272516304923463, "language_loss": 0.53665751, "learning_rate": 3.7294898350131754e-07, "loss": 0.55858994, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.20605469, "step": 13445, "time_per_iteration": 3.2157814502716064 }, { "auxiliary_loss_clip": 0.01391211, "auxiliary_loss_mlp": 0.01034058, "balance_loss_clip": 1.23282659, "balance_loss_mlp": 1.01463938, "epoch": 0.8084172553735157, "flos": 17939556420480.0, "grad_norm": 2.0178203616534214, "language_loss": 0.72440219, "learning_rate": 3.7272253099927964e-07, "loss": 0.7486549, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1940918, "step": 13446, "time_per_iteration": 4.304142713546753 }, { "auxiliary_loss_clip": 0.01407064, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.24586511, "balance_loss_mlp": 1.01547408, "epoch": 0.8084773786261836, "flos": 24108767712000.0, "grad_norm": 1.7112118015712188, "language_loss": 0.72015703, "learning_rate": 3.7249614020376606e-07, "loss": 0.74457252, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18994141, "step": 13447, "time_per_iteration": 2.8976590633392334 }, { "auxiliary_loss_clip": 0.01410539, "auxiliary_loss_mlp": 0.01035327, "balance_loss_clip": 1.24756515, "balance_loss_mlp": 1.01644468, "epoch": 0.8085375018788516, "flos": 15594538210560.0, "grad_norm": 2.183405769692324, "language_loss": 0.75872105, "learning_rate": 3.7226981112336197e-07, "loss": 0.78317976, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.1887207, "step": 13448, "time_per_iteration": 2.8139891624450684 }, { "auxiliary_loss_clip": 0.0117512, "auxiliary_loss_mlp": 0.01022667, "balance_loss_clip": 1.08901834, "balance_loss_mlp": 1.0035938, "epoch": 0.8085976251315197, "flos": 67595084668800.0, "grad_norm": 0.7410067512851463, "language_loss": 0.6386224, "learning_rate": 3.7204354376665024e-07, "loss": 0.6606003, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.19042969, "step": 13449, "time_per_iteration": 3.3320038318634033 }, { "auxiliary_loss_clip": 0.01394795, "auxiliary_loss_mlp": 0.01031452, "balance_loss_clip": 1.23682225, "balance_loss_mlp": 1.01229537, "epoch": 0.8086577483841876, "flos": 22570747338240.0, "grad_norm": 1.6531788752029546, "language_loss": 0.74663514, "learning_rate": 3.718173381422105e-07, "loss": 0.77089763, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19152832, "step": 13450, "time_per_iteration": 2.9263105392456055 }, { "auxiliary_loss_clip": 0.01403752, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.24338615, "balance_loss_mlp": 1.01256454, "epoch": 0.8087178716368556, "flos": 17977182111360.0, "grad_norm": 1.5437321048113068, "language_loss": 0.74898309, "learning_rate": 3.7159119425861986e-07, "loss": 0.77333713, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19091797, "step": 13451, "time_per_iteration": 4.340465545654297 }, { "auxiliary_loss_clip": 0.01406669, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.2430979, "balance_loss_mlp": 1.01401758, "epoch": 0.8087779948895235, "flos": 21727888358400.0, "grad_norm": 1.7114667337020155, "language_loss": 0.80888969, "learning_rate": 3.713651121244543e-07, "loss": 0.83329558, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19921875, "step": 13452, "time_per_iteration": 4.305746793746948 }, { "auxiliary_loss_clip": 0.01407692, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.24642503, "balance_loss_mlp": 1.01656127, "epoch": 0.8088381181421915, "flos": 29103999442560.0, "grad_norm": 1.6044304996884662, "language_loss": 0.78824681, "learning_rate": 3.711390917482875e-07, "loss": 0.8126719, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18249512, "step": 13453, "time_per_iteration": 2.9207146167755127 }, { "auxiliary_loss_clip": 0.01402337, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.24172354, "balance_loss_mlp": 1.01487637, "epoch": 0.8088982413948594, "flos": 22208109114240.0, "grad_norm": 2.0147043983107427, "language_loss": 0.77580142, "learning_rate": 3.709131331386892e-07, "loss": 0.8001644, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19091797, "step": 13454, "time_per_iteration": 2.8465993404388428 }, { "auxiliary_loss_clip": 0.0139017, "auxiliary_loss_mlp": 0.01032808, "balance_loss_clip": 1.23253608, "balance_loss_mlp": 1.01423478, "epoch": 0.8089583646475275, "flos": 28048054861440.0, "grad_norm": 2.5628215284992946, "language_loss": 0.77683169, "learning_rate": 3.7068723630422795e-07, "loss": 0.80106145, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18591309, "step": 13455, "time_per_iteration": 2.983485460281372 }, { "auxiliary_loss_clip": 0.01402375, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.24126875, "balance_loss_mlp": 1.01271582, "epoch": 0.8090184879001954, "flos": 16626385296000.0, "grad_norm": 1.7265328103719009, "language_loss": 0.78946733, "learning_rate": 3.70461401253471e-07, "loss": 0.8138079, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1895752, "step": 13456, "time_per_iteration": 2.936126470565796 }, { "auxiliary_loss_clip": 0.01396775, "auxiliary_loss_mlp": 0.01036171, "balance_loss_clip": 1.2388258, "balance_loss_mlp": 1.01703823, "epoch": 0.8090786111528634, "flos": 27351264654720.0, "grad_norm": 2.6741286677832736, "language_loss": 0.72651482, "learning_rate": 3.702356279949801e-07, "loss": 0.75084424, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19152832, "step": 13457, "time_per_iteration": 2.9458792209625244 }, { "auxiliary_loss_clip": 0.01396255, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.23786497, "balance_loss_mlp": 1.0149343, "epoch": 0.8091387344055313, "flos": 21115624717440.0, "grad_norm": 1.7721667203755127, "language_loss": 0.73588699, "learning_rate": 3.700099165373176e-07, "loss": 0.76018488, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18603516, "step": 13458, "time_per_iteration": 2.8698205947875977 }, { "auxiliary_loss_clip": 0.01397105, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.23778677, "balance_loss_mlp": 1.0123415, "epoch": 0.8091988576581993, "flos": 11662264005120.0, "grad_norm": 2.1287545182733334, "language_loss": 0.80517602, "learning_rate": 3.6978426688904275e-07, "loss": 0.82946622, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19567871, "step": 13459, "time_per_iteration": 2.8145265579223633 }, { "auxiliary_loss_clip": 0.01409241, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 1.24548149, "balance_loss_mlp": 1.01453614, "epoch": 0.8092589809108672, "flos": 22972866289920.0, "grad_norm": 2.06712138115123, "language_loss": 0.80803663, "learning_rate": 3.695586790587113e-07, "loss": 0.83246642, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1920166, "step": 13460, "time_per_iteration": 2.843167781829834 }, { "auxiliary_loss_clip": 0.01400691, "auxiliary_loss_mlp": 0.01033471, "balance_loss_clip": 1.23851919, "balance_loss_mlp": 1.01429009, "epoch": 0.8093191041635353, "flos": 13268749040640.0, "grad_norm": 1.630399147939918, "language_loss": 0.84659678, "learning_rate": 3.693331530548789e-07, "loss": 0.87093842, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19152832, "step": 13461, "time_per_iteration": 2.8288347721099854 }, { "auxiliary_loss_clip": 0.01427082, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.26469254, "balance_loss_mlp": 1.01404786, "epoch": 0.8093792274162032, "flos": 25525857438720.0, "grad_norm": 2.3343039637622183, "language_loss": 0.76328546, "learning_rate": 3.69107688886096e-07, "loss": 0.7878902, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19348145, "step": 13462, "time_per_iteration": 2.8809814453125 }, { "auxiliary_loss_clip": 0.01402101, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.23977041, "balance_loss_mlp": 1.01029837, "epoch": 0.8094393506688712, "flos": 23556010262400.0, "grad_norm": 1.8638709452131261, "language_loss": 0.83789152, "learning_rate": 3.6888228656091357e-07, "loss": 0.86221051, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19494629, "step": 13463, "time_per_iteration": 2.9326932430267334 }, { "auxiliary_loss_clip": 0.01399908, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.24146128, "balance_loss_mlp": 1.01553977, "epoch": 0.8094994739215392, "flos": 17064772594560.0, "grad_norm": 1.7739793667129924, "language_loss": 0.63016355, "learning_rate": 3.686569460878779e-07, "loss": 0.65450633, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18859863, "step": 13464, "time_per_iteration": 2.907449245452881 }, { "auxiliary_loss_clip": 0.01392434, "auxiliary_loss_mlp": 0.01029867, "balance_loss_clip": 1.23524594, "balance_loss_mlp": 1.01154506, "epoch": 0.8095595971742071, "flos": 23561982576000.0, "grad_norm": 1.44915529881324, "language_loss": 0.62461627, "learning_rate": 3.684316674755341e-07, "loss": 0.64883929, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18322754, "step": 13465, "time_per_iteration": 2.954393148422241 }, { "auxiliary_loss_clip": 0.0140304, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.24598396, "balance_loss_mlp": 1.01724279, "epoch": 0.8096197204268751, "flos": 20382158960640.0, "grad_norm": 1.8480124554444806, "language_loss": 0.83076537, "learning_rate": 3.682064507324256e-07, "loss": 0.85515791, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18969727, "step": 13466, "time_per_iteration": 2.8842945098876953 }, { "auxiliary_loss_clip": 0.01410914, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.24905515, "balance_loss_mlp": 1.01602888, "epoch": 0.809679843679543, "flos": 27830309045760.0, "grad_norm": 1.7689382894199852, "language_loss": 0.76922786, "learning_rate": 3.6798129586709204e-07, "loss": 0.79368341, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18615723, "step": 13467, "time_per_iteration": 2.9189577102661133 }, { "auxiliary_loss_clip": 0.01399078, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.2388072, "balance_loss_mlp": 1.01372886, "epoch": 0.8097399669322111, "flos": 22023238285440.0, "grad_norm": 2.443888589279125, "language_loss": 0.80130643, "learning_rate": 3.6775620288807073e-07, "loss": 0.82562387, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18933105, "step": 13468, "time_per_iteration": 2.8364083766937256 }, { "auxiliary_loss_clip": 0.01397534, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.24015021, "balance_loss_mlp": 1.01430726, "epoch": 0.809800090184879, "flos": 18998170444800.0, "grad_norm": 1.7580657093807892, "language_loss": 0.68790483, "learning_rate": 3.675311718038978e-07, "loss": 0.71220434, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18115234, "step": 13469, "time_per_iteration": 2.8162729740142822 }, { "auxiliary_loss_clip": 0.01178165, "auxiliary_loss_mlp": 0.01023253, "balance_loss_clip": 1.09203196, "balance_loss_mlp": 1.00122309, "epoch": 0.809860213437547, "flos": 66132315676800.0, "grad_norm": 0.6967584942257878, "language_loss": 0.5469597, "learning_rate": 3.6730620262310683e-07, "loss": 0.56897384, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.22070312, "step": 13470, "time_per_iteration": 3.4479353427886963 }, { "auxiliary_loss_clip": 0.01394841, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.23712695, "balance_loss_mlp": 1.01683855, "epoch": 0.8099203366902149, "flos": 20891182671360.0, "grad_norm": 1.7057923027401618, "language_loss": 0.69693375, "learning_rate": 3.670812953542279e-07, "loss": 0.72122902, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17871094, "step": 13471, "time_per_iteration": 2.892167806625366 }, { "auxiliary_loss_clip": 0.01402354, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 1.24107051, "balance_loss_mlp": 1.01231182, "epoch": 0.8099804599428829, "flos": 26041984583040.0, "grad_norm": 2.390512875219101, "language_loss": 0.80031836, "learning_rate": 3.6685645000579003e-07, "loss": 0.82465345, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18847656, "step": 13472, "time_per_iteration": 2.9150197505950928 }, { "auxiliary_loss_clip": 0.011809, "auxiliary_loss_mlp": 0.01025477, "balance_loss_clip": 1.09441757, "balance_loss_mlp": 1.00201643, "epoch": 0.8100405831955508, "flos": 69335467626240.0, "grad_norm": 0.7484884530121954, "language_loss": 0.57821077, "learning_rate": 3.666316665863201e-07, "loss": 0.6002745, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.234375, "step": 13473, "time_per_iteration": 3.2223007678985596 }, { "auxiliary_loss_clip": 0.01410316, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.24806726, "balance_loss_mlp": 1.0100646, "epoch": 0.8101007064482189, "flos": 15020759911680.0, "grad_norm": 1.778275432865064, "language_loss": 0.75583076, "learning_rate": 3.664069451043399e-07, "loss": 0.780222, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1875, "step": 13474, "time_per_iteration": 2.8070240020751953 }, { "auxiliary_loss_clip": 0.0141126, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.24860883, "balance_loss_mlp": 1.01903987, "epoch": 0.8101608297008868, "flos": 21076551192960.0, "grad_norm": 1.9408270906159197, "language_loss": 0.79078841, "learning_rate": 3.661822855683723e-07, "loss": 0.8152796, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18823242, "step": 13475, "time_per_iteration": 2.8977417945861816 }, { "auxiliary_loss_clip": 0.01389287, "auxiliary_loss_mlp": 0.01032614, "balance_loss_clip": 1.23308361, "balance_loss_mlp": 1.01389766, "epoch": 0.8102209529535548, "flos": 23741469273600.0, "grad_norm": 1.7942690540795605, "language_loss": 0.75947475, "learning_rate": 3.659576879869364e-07, "loss": 0.78369373, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18713379, "step": 13476, "time_per_iteration": 2.869100332260132 }, { "auxiliary_loss_clip": 0.01414755, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.24997663, "balance_loss_mlp": 1.01991653, "epoch": 0.8102810762062228, "flos": 10961356521600.0, "grad_norm": 7.7721040218832265, "language_loss": 0.75003046, "learning_rate": 3.657331523685485e-07, "loss": 0.77457112, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19384766, "step": 13477, "time_per_iteration": 4.328181266784668 }, { "auxiliary_loss_clip": 0.01393006, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.23454666, "balance_loss_mlp": 1.01401854, "epoch": 0.8103411994588907, "flos": 14657307281280.0, "grad_norm": 2.11863210499417, "language_loss": 0.70381033, "learning_rate": 3.6550867872172365e-07, "loss": 0.72806448, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18395996, "step": 13478, "time_per_iteration": 2.860466241836548 }, { "auxiliary_loss_clip": 0.0118039, "auxiliary_loss_mlp": 0.01022524, "balance_loss_clip": 1.09372914, "balance_loss_mlp": 1.00173366, "epoch": 0.8104013227115587, "flos": 59180701737600.0, "grad_norm": 0.6836770516720002, "language_loss": 0.52149808, "learning_rate": 3.6528426705497293e-07, "loss": 0.54352725, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.20800781, "step": 13479, "time_per_iteration": 3.291825294494629 }, { "auxiliary_loss_clip": 0.01408573, "auxiliary_loss_mlp": 0.01033011, "balance_loss_clip": 1.24887002, "balance_loss_mlp": 1.01417565, "epoch": 0.8104614459642266, "flos": 19838224247040.0, "grad_norm": 1.5617914474173218, "language_loss": 0.71860874, "learning_rate": 3.650599173768072e-07, "loss": 0.74302459, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18847656, "step": 13480, "time_per_iteration": 2.905086040496826 }, { "auxiliary_loss_clip": 0.01411177, "auxiliary_loss_mlp": 0.01034661, "balance_loss_clip": 1.24888122, "balance_loss_mlp": 1.01561129, "epoch": 0.8105215692168947, "flos": 25385172796800.0, "grad_norm": 1.8330618593781554, "language_loss": 0.80463707, "learning_rate": 3.648356296957327e-07, "loss": 0.82909548, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19067383, "step": 13481, "time_per_iteration": 4.388688325881958 }, { "auxiliary_loss_clip": 0.01397843, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.23775244, "balance_loss_mlp": 1.01163137, "epoch": 0.8105816924695626, "flos": 20490466308480.0, "grad_norm": 2.163503746868634, "language_loss": 0.74123478, "learning_rate": 3.646114040202548e-07, "loss": 0.76552331, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19360352, "step": 13482, "time_per_iteration": 2.875131368637085 }, { "auxiliary_loss_clip": 0.01389642, "auxiliary_loss_mlp": 0.0103049, "balance_loss_clip": 1.22962558, "balance_loss_mlp": 1.01135719, "epoch": 0.8106418157222306, "flos": 14546963917440.0, "grad_norm": 2.1120403161941885, "language_loss": 0.65736389, "learning_rate": 3.6438724035887705e-07, "loss": 0.68156523, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19128418, "step": 13483, "time_per_iteration": 2.8489186763763428 }, { "auxiliary_loss_clip": 0.01387617, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.23010445, "balance_loss_mlp": 1.01284313, "epoch": 0.8107019389748985, "flos": 22574547901440.0, "grad_norm": 2.0622138686317077, "language_loss": 0.77287006, "learning_rate": 3.641631387200992e-07, "loss": 0.79707164, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19702148, "step": 13484, "time_per_iteration": 2.8588461875915527 }, { "auxiliary_loss_clip": 0.01431082, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.2646066, "balance_loss_mlp": 1.0146246, "epoch": 0.8107620622275665, "flos": 19618985352960.0, "grad_norm": 1.6977798272517879, "language_loss": 0.72568345, "learning_rate": 3.639390991124183e-07, "loss": 0.75033516, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.19470215, "step": 13485, "time_per_iteration": 2.927285671234131 }, { "auxiliary_loss_clip": 0.01386486, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.23151326, "balance_loss_mlp": 1.01040721, "epoch": 0.8108221854802344, "flos": 16152046364160.0, "grad_norm": 2.33374652513425, "language_loss": 0.76655138, "learning_rate": 3.637151215443308e-07, "loss": 0.79070425, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18383789, "step": 13486, "time_per_iteration": 4.2570788860321045 }, { "auxiliary_loss_clip": 0.01426312, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.26047564, "balance_loss_mlp": 1.01218843, "epoch": 0.8108823087329025, "flos": 21116212899840.0, "grad_norm": 2.8913080062013714, "language_loss": 0.72489178, "learning_rate": 3.6349120602433045e-07, "loss": 0.74946564, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.18884277, "step": 13487, "time_per_iteration": 4.246339559555054 }, { "auxiliary_loss_clip": 0.01396277, "auxiliary_loss_mlp": 0.01033745, "balance_loss_clip": 1.24049568, "balance_loss_mlp": 1.01487374, "epoch": 0.8109424319855704, "flos": 29210089795200.0, "grad_norm": 1.947805547147092, "language_loss": 0.84972107, "learning_rate": 3.6326735256090715e-07, "loss": 0.87402129, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18884277, "step": 13488, "time_per_iteration": 2.899254322052002 }, { "auxiliary_loss_clip": 0.01406675, "auxiliary_loss_mlp": 0.01030756, "balance_loss_clip": 1.24568844, "balance_loss_mlp": 1.01239824, "epoch": 0.8110025552382384, "flos": 23122102199040.0, "grad_norm": 1.7468792126379293, "language_loss": 0.75008273, "learning_rate": 3.630435611625502e-07, "loss": 0.77445704, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18359375, "step": 13489, "time_per_iteration": 2.847289562225342 }, { "auxiliary_loss_clip": 0.01388591, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.23300743, "balance_loss_mlp": 1.0167594, "epoch": 0.8110626784909064, "flos": 22389450848640.0, "grad_norm": 1.8397913676858275, "language_loss": 0.72508311, "learning_rate": 3.628198318377453e-07, "loss": 0.74932557, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18896484, "step": 13490, "time_per_iteration": 2.897650957107544 }, { "auxiliary_loss_clip": 0.01415898, "auxiliary_loss_mlp": 0.01040963, "balance_loss_clip": 1.25347757, "balance_loss_mlp": 1.02074552, "epoch": 0.8111228017435743, "flos": 23378559580800.0, "grad_norm": 2.192671840505119, "language_loss": 0.72671551, "learning_rate": 3.625961645949762e-07, "loss": 0.75128412, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20214844, "step": 13491, "time_per_iteration": 2.8672280311584473 }, { "auxiliary_loss_clip": 0.01392519, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 1.23382807, "balance_loss_mlp": 1.01360464, "epoch": 0.8111829249962423, "flos": 21296061555840.0, "grad_norm": 1.3452202847585073, "language_loss": 0.6821835, "learning_rate": 3.623725594427245e-07, "loss": 0.7064358, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19104004, "step": 13492, "time_per_iteration": 2.9359896183013916 }, { "auxiliary_loss_clip": 0.01397894, "auxiliary_loss_mlp": 0.01029345, "balance_loss_clip": 1.2372793, "balance_loss_mlp": 1.01000905, "epoch": 0.8112430482489102, "flos": 22355580476160.0, "grad_norm": 1.678052942179456, "language_loss": 0.72720635, "learning_rate": 3.6214901638947006e-07, "loss": 0.75147867, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.1932373, "step": 13493, "time_per_iteration": 2.8839704990386963 }, { "auxiliary_loss_clip": 0.01403098, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.24274445, "balance_loss_mlp": 1.01539505, "epoch": 0.8113031715015783, "flos": 31150545834240.0, "grad_norm": 1.7168049747418779, "language_loss": 0.71684796, "learning_rate": 3.619255354436885e-07, "loss": 0.74122596, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19299316, "step": 13494, "time_per_iteration": 2.9632768630981445 }, { "auxiliary_loss_clip": 0.01413005, "auxiliary_loss_mlp": 0.01038539, "balance_loss_clip": 1.24896169, "balance_loss_mlp": 1.01828527, "epoch": 0.8113632947542462, "flos": 25345737313920.0, "grad_norm": 1.9418700433829768, "language_loss": 0.76929957, "learning_rate": 3.6170211661385543e-07, "loss": 0.79381502, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20263672, "step": 13495, "time_per_iteration": 2.8555891513824463 }, { "auxiliary_loss_clip": 0.01408341, "auxiliary_loss_mlp": 0.01037582, "balance_loss_clip": 1.24595952, "balance_loss_mlp": 1.01816297, "epoch": 0.8114234180069142, "flos": 28450897729920.0, "grad_norm": 2.2689920798579335, "language_loss": 0.80666351, "learning_rate": 3.614787599084417e-07, "loss": 0.83112282, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19433594, "step": 13496, "time_per_iteration": 2.881586790084839 }, { "auxiliary_loss_clip": 0.01405028, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.24503183, "balance_loss_mlp": 1.01046252, "epoch": 0.8114835412595821, "flos": 20348379077760.0, "grad_norm": 1.525812396407154, "language_loss": 0.7189908, "learning_rate": 3.6125546533591787e-07, "loss": 0.74333709, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19140625, "step": 13497, "time_per_iteration": 2.8347280025482178 }, { "auxiliary_loss_clip": 0.01416153, "auxiliary_loss_mlp": 0.01035095, "balance_loss_clip": 1.25307035, "balance_loss_mlp": 1.0164628, "epoch": 0.8115436645122501, "flos": 22500789598080.0, "grad_norm": 1.5048499581551866, "language_loss": 0.77281737, "learning_rate": 3.610322329047508e-07, "loss": 0.7973299, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.1862793, "step": 13498, "time_per_iteration": 2.9008307456970215 }, { "auxiliary_loss_clip": 0.01393321, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.2333827, "balance_loss_mlp": 1.01637816, "epoch": 0.811603787764918, "flos": 13853340846720.0, "grad_norm": 1.811891277669646, "language_loss": 0.84690154, "learning_rate": 3.608090626234055e-07, "loss": 0.87118906, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19042969, "step": 13499, "time_per_iteration": 2.827768087387085 }, { "auxiliary_loss_clip": 0.01399868, "auxiliary_loss_mlp": 0.0103214, "balance_loss_clip": 1.24043703, "balance_loss_mlp": 1.01220846, "epoch": 0.8116639110175861, "flos": 21624150735360.0, "grad_norm": 1.3912354954799457, "language_loss": 0.76919818, "learning_rate": 3.6058595450034603e-07, "loss": 0.79351819, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19934082, "step": 13500, "time_per_iteration": 2.825660228729248 }, { "auxiliary_loss_clip": 0.01178126, "auxiliary_loss_mlp": 0.01019484, "balance_loss_clip": 1.09238482, "balance_loss_mlp": 0.99993384, "epoch": 0.811724034270254, "flos": 64492729430400.0, "grad_norm": 0.8026169566292045, "language_loss": 0.5999583, "learning_rate": 3.603629085440303e-07, "loss": 0.62193441, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.1953125, "step": 13501, "time_per_iteration": 3.4107296466827393 }, { "auxiliary_loss_clip": 0.01382712, "auxiliary_loss_mlp": 0.01029999, "balance_loss_clip": 1.22846603, "balance_loss_mlp": 1.01141465, "epoch": 0.811784157522922, "flos": 24764900826240.0, "grad_norm": 1.6743180323355924, "language_loss": 0.79802608, "learning_rate": 3.6013992476291753e-07, "loss": 0.82215315, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18579102, "step": 13502, "time_per_iteration": 2.9191954135894775 }, { "auxiliary_loss_clip": 0.01391643, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.23425317, "balance_loss_mlp": 1.01605332, "epoch": 0.81184428077559, "flos": 12174816810240.0, "grad_norm": 1.7881579827195597, "language_loss": 0.72144854, "learning_rate": 3.599170031654635e-07, "loss": 0.74571073, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18518066, "step": 13503, "time_per_iteration": 2.8148205280303955 }, { "auxiliary_loss_clip": 0.01393295, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.23299289, "balance_loss_mlp": 1.01054764, "epoch": 0.8119044040282579, "flos": 44438460687360.0, "grad_norm": 1.4717057571244003, "language_loss": 0.68397236, "learning_rate": 3.5969414376012065e-07, "loss": 0.70820349, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19299316, "step": 13504, "time_per_iteration": 3.081800699234009 }, { "auxiliary_loss_clip": 0.01399839, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.23771286, "balance_loss_mlp": 1.01270294, "epoch": 0.8119645272809259, "flos": 52173545166720.0, "grad_norm": 2.9173700377815783, "language_loss": 0.74783909, "learning_rate": 3.594713465553403e-07, "loss": 0.77215803, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19360352, "step": 13505, "time_per_iteration": 3.146167755126953 }, { "auxiliary_loss_clip": 0.01402245, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.24235678, "balance_loss_mlp": 1.01225662, "epoch": 0.8120246505335939, "flos": 30246732829440.0, "grad_norm": 2.7580390856095347, "language_loss": 0.73336411, "learning_rate": 3.5924861155957123e-07, "loss": 0.75770116, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19213867, "step": 13506, "time_per_iteration": 3.004589080810547 }, { "auxiliary_loss_clip": 0.01417786, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.25182283, "balance_loss_mlp": 1.01774395, "epoch": 0.8120847737862619, "flos": 22137879905280.0, "grad_norm": 2.4767184757549647, "language_loss": 0.77827179, "learning_rate": 3.590259387812593e-07, "loss": 0.80281949, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19238281, "step": 13507, "time_per_iteration": 2.8584558963775635 }, { "auxiliary_loss_clip": 0.01418503, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.25210333, "balance_loss_mlp": 1.01113486, "epoch": 0.8121448970389298, "flos": 23305706173440.0, "grad_norm": 5.014580703257244, "language_loss": 0.71337032, "learning_rate": 3.5880332822884783e-07, "loss": 0.73786384, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19714355, "step": 13508, "time_per_iteration": 2.8471953868865967 }, { "auxiliary_loss_clip": 0.01393624, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.2349, "balance_loss_mlp": 1.01354778, "epoch": 0.8122050202915978, "flos": 22174374476160.0, "grad_norm": 19.045173736050675, "language_loss": 0.76612306, "learning_rate": 3.585807799107785e-07, "loss": 0.79038441, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18981934, "step": 13509, "time_per_iteration": 2.8435637950897217 }, { "auxiliary_loss_clip": 0.01410409, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.24751461, "balance_loss_mlp": 1.01383829, "epoch": 0.8122651435442657, "flos": 23269528316160.0, "grad_norm": 2.3691179832243416, "language_loss": 0.77860075, "learning_rate": 3.58358293835491e-07, "loss": 0.8030355, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19213867, "step": 13510, "time_per_iteration": 2.91888689994812 }, { "auxiliary_loss_clip": 0.01416841, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.25255942, "balance_loss_mlp": 1.01439548, "epoch": 0.8123252667969337, "flos": 16147657618560.0, "grad_norm": 2.056496193666657, "language_loss": 0.70399904, "learning_rate": 3.581358700114212e-07, "loss": 0.72850633, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19494629, "step": 13511, "time_per_iteration": 2.8182337284088135 }, { "auxiliary_loss_clip": 0.01408433, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.24589694, "balance_loss_mlp": 1.01455426, "epoch": 0.8123853900496016, "flos": 21253685160960.0, "grad_norm": 2.2762200634618153, "language_loss": 0.79794562, "learning_rate": 3.57913508447004e-07, "loss": 0.82236874, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19311523, "step": 13512, "time_per_iteration": 4.245841026306152 }, { "auxiliary_loss_clip": 0.0140241, "auxiliary_loss_mlp": 0.01031691, "balance_loss_clip": 1.24207425, "balance_loss_mlp": 1.01339257, "epoch": 0.8124455133022697, "flos": 64398638229120.0, "grad_norm": 1.690663887340754, "language_loss": 0.64406532, "learning_rate": 3.5769120915067076e-07, "loss": 0.66840637, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18310547, "step": 13513, "time_per_iteration": 3.2920007705688477 }, { "auxiliary_loss_clip": 0.01411044, "auxiliary_loss_mlp": 0.01032914, "balance_loss_clip": 1.24816287, "balance_loss_mlp": 1.01359022, "epoch": 0.8125056365549376, "flos": 23852808023040.0, "grad_norm": 1.676351556656475, "language_loss": 0.72163451, "learning_rate": 3.5746897213085194e-07, "loss": 0.74607408, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19311523, "step": 13514, "time_per_iteration": 2.861069917678833 }, { "auxiliary_loss_clip": 0.01397489, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.23965168, "balance_loss_mlp": 1.01359916, "epoch": 0.8125657598076056, "flos": 23560851456000.0, "grad_norm": 2.195099394732198, "language_loss": 0.63877189, "learning_rate": 3.5724679739597364e-07, "loss": 0.66307044, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18762207, "step": 13515, "time_per_iteration": 2.917102336883545 }, { "auxiliary_loss_clip": 0.01370875, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.21892405, "balance_loss_mlp": 1.01120031, "epoch": 0.8126258830602736, "flos": 20713958213760.0, "grad_norm": 1.4684457395372836, "language_loss": 0.7569828, "learning_rate": 3.570246849544616e-07, "loss": 0.78098941, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 1.52050781, "router_z_loss_mlp": 0.18591309, "step": 13516, "time_per_iteration": 2.8092081546783447 }, { "auxiliary_loss_clip": 0.0141482, "auxiliary_loss_mlp": 0.01036833, "balance_loss_clip": 1.25246429, "balance_loss_mlp": 1.01702082, "epoch": 0.8126860063129415, "flos": 23627370591360.0, "grad_norm": 1.4521761860822036, "language_loss": 0.92041659, "learning_rate": 3.5680263481473907e-07, "loss": 0.94493306, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19812012, "step": 13517, "time_per_iteration": 4.2871012687683105 }, { "auxiliary_loss_clip": 0.01410051, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.25011039, "balance_loss_mlp": 1.02027178, "epoch": 0.8127461295656095, "flos": 25017376665600.0, "grad_norm": 1.3625454617075954, "language_loss": 0.79225785, "learning_rate": 3.565806469852244e-07, "loss": 0.81675524, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19421387, "step": 13518, "time_per_iteration": 2.888892889022827 }, { "auxiliary_loss_clip": 0.0140407, "auxiliary_loss_mlp": 0.01034838, "balance_loss_clip": 1.24506402, "balance_loss_mlp": 1.01682556, "epoch": 0.8128062528182775, "flos": 27352983957120.0, "grad_norm": 1.8822185888809235, "language_loss": 0.79822671, "learning_rate": 3.56358721474336e-07, "loss": 0.82261574, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18005371, "step": 13519, "time_per_iteration": 2.9129831790924072 }, { "auxiliary_loss_clip": 0.01398401, "auxiliary_loss_mlp": 0.01036891, "balance_loss_clip": 1.23722494, "balance_loss_mlp": 1.01844943, "epoch": 0.8128663760709455, "flos": 26517771348480.0, "grad_norm": 1.554729094038506, "language_loss": 0.70869911, "learning_rate": 3.561368582904905e-07, "loss": 0.73305202, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18444824, "step": 13520, "time_per_iteration": 2.94035005569458 }, { "auxiliary_loss_clip": 0.01405209, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.24294662, "balance_loss_mlp": 1.01211953, "epoch": 0.8129264993236134, "flos": 17940235092480.0, "grad_norm": 2.2056915212875836, "language_loss": 0.73329836, "learning_rate": 3.5591505744209925e-07, "loss": 0.75765795, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18640137, "step": 13521, "time_per_iteration": 4.249757289886475 }, { "auxiliary_loss_clip": 0.01405277, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.24350035, "balance_loss_mlp": 1.01295781, "epoch": 0.8129866225762814, "flos": 26189003496960.0, "grad_norm": 2.9470438122567457, "language_loss": 0.71075451, "learning_rate": 3.5569331893757394e-07, "loss": 0.73513341, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19641113, "step": 13522, "time_per_iteration": 2.97939133644104 }, { "auxiliary_loss_clip": 0.01391753, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.23535109, "balance_loss_mlp": 1.01413798, "epoch": 0.8130467458289493, "flos": 21041911658880.0, "grad_norm": 1.5984336671436858, "language_loss": 0.71099138, "learning_rate": 3.554716427853233e-07, "loss": 0.73523259, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18225098, "step": 13523, "time_per_iteration": 4.290147066116333 }, { "auxiliary_loss_clip": 0.01395383, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.23711014, "balance_loss_mlp": 1.01398015, "epoch": 0.8131068690816173, "flos": 15495506046720.0, "grad_norm": 2.0694671035541887, "language_loss": 0.72241986, "learning_rate": 3.5525002899375256e-07, "loss": 0.74671316, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1998291, "step": 13524, "time_per_iteration": 2.8383266925811768 }, { "auxiliary_loss_clip": 0.01389356, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.23142838, "balance_loss_mlp": 1.01260018, "epoch": 0.8131669923342852, "flos": 29363171512320.0, "grad_norm": 1.8033140219625157, "language_loss": 0.63730812, "learning_rate": 3.550284775712653e-07, "loss": 0.66151381, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1862793, "step": 13525, "time_per_iteration": 2.9527149200439453 }, { "auxiliary_loss_clip": 0.01403223, "auxiliary_loss_mlp": 0.01034239, "balance_loss_clip": 1.24391043, "balance_loss_mlp": 1.01474857, "epoch": 0.8132271155869533, "flos": 35268143316480.0, "grad_norm": 1.5629172108576377, "language_loss": 0.6585139, "learning_rate": 3.548069885262628e-07, "loss": 0.68288851, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19494629, "step": 13526, "time_per_iteration": 2.956932306289673 }, { "auxiliary_loss_clip": 0.01390152, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.23187602, "balance_loss_mlp": 1.01344371, "epoch": 0.8132872388396212, "flos": 27793316782080.0, "grad_norm": 1.8155000453708177, "language_loss": 0.75843775, "learning_rate": 3.5458556186714473e-07, "loss": 0.7826553, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1817627, "step": 13527, "time_per_iteration": 2.9314746856689453 }, { "auxiliary_loss_clip": 0.01400581, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.24044442, "balance_loss_mlp": 1.01200581, "epoch": 0.8133473620922892, "flos": 27831349676160.0, "grad_norm": 1.7040118385777634, "language_loss": 0.7118206, "learning_rate": 3.5436419760230706e-07, "loss": 0.7361334, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18701172, "step": 13528, "time_per_iteration": 2.8801779747009277 }, { "auxiliary_loss_clip": 0.01395936, "auxiliary_loss_mlp": 0.01029895, "balance_loss_clip": 1.23482645, "balance_loss_mlp": 1.01141787, "epoch": 0.8134074853449572, "flos": 18998849116800.0, "grad_norm": 1.9995951856702965, "language_loss": 0.69646776, "learning_rate": 3.5414289574014357e-07, "loss": 0.72072613, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18481445, "step": 13529, "time_per_iteration": 2.851712703704834 }, { "auxiliary_loss_clip": 0.01383788, "auxiliary_loss_mlp": 0.01033319, "balance_loss_clip": 1.22788942, "balance_loss_mlp": 1.01473379, "epoch": 0.8134676085976251, "flos": 24253388651520.0, "grad_norm": 1.368128870405857, "language_loss": 0.78070927, "learning_rate": 3.5392165628904635e-07, "loss": 0.80488032, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18579102, "step": 13530, "time_per_iteration": 2.9558002948760986 }, { "auxiliary_loss_clip": 0.01393532, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.23566103, "balance_loss_mlp": 1.01156819, "epoch": 0.8135277318502931, "flos": 19071702524160.0, "grad_norm": 1.9190069865026615, "language_loss": 0.8244375, "learning_rate": 3.537004792574052e-07, "loss": 0.8486771, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18847656, "step": 13531, "time_per_iteration": 2.864884853363037 }, { "auxiliary_loss_clip": 0.01407921, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.24574888, "balance_loss_mlp": 1.01368356, "epoch": 0.813587855102961, "flos": 17277315258240.0, "grad_norm": 2.0285763293091947, "language_loss": 0.72579634, "learning_rate": 3.534793646536065e-07, "loss": 0.75021076, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19836426, "step": 13532, "time_per_iteration": 2.801363945007324 }, { "auxiliary_loss_clip": 0.01400062, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.24135172, "balance_loss_mlp": 1.01355171, "epoch": 0.8136479783556291, "flos": 20167354056960.0, "grad_norm": 2.216635256682109, "language_loss": 0.76574361, "learning_rate": 3.5325831248603533e-07, "loss": 0.79006982, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18981934, "step": 13533, "time_per_iteration": 2.8682994842529297 }, { "auxiliary_loss_clip": 0.01407498, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.24401295, "balance_loss_mlp": 1.01390529, "epoch": 0.813708101608297, "flos": 22061949851520.0, "grad_norm": 6.7888533142617264, "language_loss": 0.77142185, "learning_rate": 3.5303732276307495e-07, "loss": 0.795829, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19311523, "step": 13534, "time_per_iteration": 2.848888635635376 }, { "auxiliary_loss_clip": 0.01407938, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.24814129, "balance_loss_mlp": 1.0093658, "epoch": 0.813768224860965, "flos": 16180261136640.0, "grad_norm": 2.044377705431547, "language_loss": 0.94264638, "learning_rate": 3.5281639549310336e-07, "loss": 0.9670136, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1940918, "step": 13535, "time_per_iteration": 2.7819979190826416 }, { "auxiliary_loss_clip": 0.01397901, "auxiliary_loss_mlp": 0.01029653, "balance_loss_clip": 1.24181354, "balance_loss_mlp": 1.01139009, "epoch": 0.8138283481136329, "flos": 24362872364160.0, "grad_norm": 1.6683097287701654, "language_loss": 0.70738614, "learning_rate": 3.52595530684499e-07, "loss": 0.73166162, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18261719, "step": 13536, "time_per_iteration": 2.8698537349700928 }, { "auxiliary_loss_clip": 0.01397007, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.23711205, "balance_loss_mlp": 1.01397443, "epoch": 0.8138884713663009, "flos": 25526309886720.0, "grad_norm": 2.4962809913961164, "language_loss": 0.75701231, "learning_rate": 3.5237472834563775e-07, "loss": 0.78131998, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19787598, "step": 13537, "time_per_iteration": 2.850372791290283 }, { "auxiliary_loss_clip": 0.01390951, "auxiliary_loss_mlp": 0.01033501, "balance_loss_clip": 1.23403311, "balance_loss_mlp": 1.01418924, "epoch": 0.8139485946189688, "flos": 22464249782400.0, "grad_norm": 1.513434597064544, "language_loss": 0.76451468, "learning_rate": 3.5215398848489163e-07, "loss": 0.78875923, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.1932373, "step": 13538, "time_per_iteration": 2.8471248149871826 }, { "auxiliary_loss_clip": 0.0140765, "auxiliary_loss_mlp": 0.01033266, "balance_loss_clip": 1.24579942, "balance_loss_mlp": 1.01555192, "epoch": 0.8140087178716369, "flos": 21259974188160.0, "grad_norm": 1.6569194589258451, "language_loss": 0.78301644, "learning_rate": 3.5193331111063176e-07, "loss": 0.80742562, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.17700195, "step": 13539, "time_per_iteration": 2.828122138977051 }, { "auxiliary_loss_clip": 0.01389705, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.23372817, "balance_loss_mlp": 1.01440334, "epoch": 0.8140688411243048, "flos": 39428841110400.0, "grad_norm": 2.858238899256375, "language_loss": 0.66952294, "learning_rate": 3.5171269623122533e-07, "loss": 0.6937449, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.1809082, "step": 13540, "time_per_iteration": 2.967628002166748 }, { "auxiliary_loss_clip": 0.01404755, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.2446692, "balance_loss_mlp": 1.01346588, "epoch": 0.8141289643769728, "flos": 25428137374080.0, "grad_norm": 1.7951704032067992, "language_loss": 0.68229127, "learning_rate": 3.5149214385503913e-07, "loss": 0.70666254, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18896484, "step": 13541, "time_per_iteration": 2.942077875137329 }, { "auxiliary_loss_clip": 0.01402197, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.24270988, "balance_loss_mlp": 1.01548648, "epoch": 0.8141890876296408, "flos": 12575714152320.0, "grad_norm": 1.8837731430623945, "language_loss": 0.6962626, "learning_rate": 3.512716539904355e-07, "loss": 0.72063255, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19299316, "step": 13542, "time_per_iteration": 2.823521137237549 }, { "auxiliary_loss_clip": 0.01423765, "auxiliary_loss_mlp": 0.01028281, "balance_loss_clip": 1.25870323, "balance_loss_mlp": 1.00879073, "epoch": 0.8142492108823087, "flos": 14973135120000.0, "grad_norm": 5.119279918072919, "language_loss": 0.81214213, "learning_rate": 3.5105122664577613e-07, "loss": 0.83666253, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19494629, "step": 13543, "time_per_iteration": 2.791187047958374 }, { "auxiliary_loss_clip": 0.01419746, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.25422668, "balance_loss_mlp": 1.01318467, "epoch": 0.8143093341349767, "flos": 12429600134400.0, "grad_norm": 3.3323498821398703, "language_loss": 0.79722989, "learning_rate": 3.5083086182942003e-07, "loss": 0.82175434, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.1953125, "step": 13544, "time_per_iteration": 2.8285703659057617 }, { "auxiliary_loss_clip": 0.0143176, "auxiliary_loss_mlp": 0.01030138, "balance_loss_clip": 1.26263475, "balance_loss_mlp": 1.01208961, "epoch": 0.8143694573876447, "flos": 11917861735680.0, "grad_norm": 3.047154412613177, "language_loss": 0.74279177, "learning_rate": 3.5061055954972264e-07, "loss": 0.76741076, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 1.69042969, "router_z_loss_mlp": 0.18054199, "step": 13545, "time_per_iteration": 2.823859453201294 }, { "auxiliary_loss_clip": 0.01392817, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.23609269, "balance_loss_mlp": 1.01171207, "epoch": 0.8144295806403127, "flos": 21222212762880.0, "grad_norm": 1.5678898567386188, "language_loss": 0.77275741, "learning_rate": 3.5039031981503776e-07, "loss": 0.796987, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18432617, "step": 13546, "time_per_iteration": 2.8518002033233643 }, { "auxiliary_loss_clip": 0.01413351, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.25243139, "balance_loss_mlp": 1.01019728, "epoch": 0.8144897038929806, "flos": 19874854552320.0, "grad_norm": 2.7186540605652163, "language_loss": 0.71753359, "learning_rate": 3.501701426337178e-07, "loss": 0.741956, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18701172, "step": 13547, "time_per_iteration": 4.267401695251465 }, { "auxiliary_loss_clip": 0.01421429, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.25696838, "balance_loss_mlp": 1.01546419, "epoch": 0.8145498271456486, "flos": 24582473216640.0, "grad_norm": 2.211363391236721, "language_loss": 0.71152198, "learning_rate": 3.49950028014111e-07, "loss": 0.7360875, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1965332, "step": 13548, "time_per_iteration": 2.9667985439300537 }, { "auxiliary_loss_clip": 0.01412259, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 1.25073051, "balance_loss_mlp": 1.01035643, "epoch": 0.8146099503983165, "flos": 20202536528640.0, "grad_norm": 2.497025247035961, "language_loss": 0.77958041, "learning_rate": 3.4972997596456444e-07, "loss": 0.80399799, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19116211, "step": 13549, "time_per_iteration": 2.8495147228240967 }, { "auxiliary_loss_clip": 0.01408734, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.24729347, "balance_loss_mlp": 1.01713097, "epoch": 0.8146700736509845, "flos": 19546448659200.0, "grad_norm": 1.9174457948763852, "language_loss": 0.71618432, "learning_rate": 3.4950998649342233e-07, "loss": 0.74063003, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18701172, "step": 13550, "time_per_iteration": 2.8415443897247314 }, { "auxiliary_loss_clip": 0.01392213, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.23547709, "balance_loss_mlp": 1.01400006, "epoch": 0.8147301969036524, "flos": 18050533211520.0, "grad_norm": 1.8368164783651235, "language_loss": 0.72357595, "learning_rate": 3.4929005960902826e-07, "loss": 0.74783254, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19458008, "step": 13551, "time_per_iteration": 2.8269121646881104 }, { "auxiliary_loss_clip": 0.01422562, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.25694931, "balance_loss_mlp": 1.01482654, "epoch": 0.8147903201563205, "flos": 18013857661440.0, "grad_norm": 2.3669171851338904, "language_loss": 0.69401944, "learning_rate": 3.4907019531971926e-07, "loss": 0.71859586, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20263672, "step": 13552, "time_per_iteration": 4.2146618366241455 }, { "auxiliary_loss_clip": 0.01402661, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 1.24277711, "balance_loss_mlp": 1.01283216, "epoch": 0.8148504434089884, "flos": 20267110137600.0, "grad_norm": 2.152403379533042, "language_loss": 0.83003736, "learning_rate": 3.4885039363383407e-07, "loss": 0.85437745, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18530273, "step": 13553, "time_per_iteration": 2.8352210521698 }, { "auxiliary_loss_clip": 0.01407573, "auxiliary_loss_mlp": 0.01030901, "balance_loss_clip": 1.24663806, "balance_loss_mlp": 1.01251936, "epoch": 0.8149105666616564, "flos": 12502272562560.0, "grad_norm": 2.1078577879750053, "language_loss": 0.68576002, "learning_rate": 3.4863065455970795e-07, "loss": 0.7101447, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18395996, "step": 13554, "time_per_iteration": 2.8004610538482666 }, { "auxiliary_loss_clip": 0.01399359, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.23927426, "balance_loss_mlp": 1.01618314, "epoch": 0.8149706899143244, "flos": 32535982183680.0, "grad_norm": 1.8774545221828276, "language_loss": 0.66914481, "learning_rate": 3.484109781056723e-07, "loss": 0.69349384, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19348145, "step": 13555, "time_per_iteration": 2.92211651802063 }, { "auxiliary_loss_clip": 0.01411673, "auxiliary_loss_mlp": 0.01034318, "balance_loss_clip": 1.24681532, "balance_loss_mlp": 1.01519656, "epoch": 0.8150308131669923, "flos": 19394271838080.0, "grad_norm": 2.1583631178587943, "language_loss": 0.74342304, "learning_rate": 3.4819136428005844e-07, "loss": 0.76788294, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19128418, "step": 13556, "time_per_iteration": 4.223504066467285 }, { "auxiliary_loss_clip": 0.01405625, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.24709916, "balance_loss_mlp": 1.01270866, "epoch": 0.8150909364196604, "flos": 17430487464960.0, "grad_norm": 1.5634842674435958, "language_loss": 0.81100249, "learning_rate": 3.4797181309119307e-07, "loss": 0.83535999, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17419434, "step": 13557, "time_per_iteration": 2.8169989585876465 }, { "auxiliary_loss_clip": 0.01416098, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.2524457, "balance_loss_mlp": 1.0133822, "epoch": 0.8151510596723283, "flos": 27174356910720.0, "grad_norm": 1.546706155200626, "language_loss": 0.6626333, "learning_rate": 3.4775232454740255e-07, "loss": 0.68713355, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20507812, "step": 13558, "time_per_iteration": 4.331890821456909 }, { "auxiliary_loss_clip": 0.01183451, "auxiliary_loss_mlp": 0.01021018, "balance_loss_clip": 1.09469497, "balance_loss_mlp": 1.00032318, "epoch": 0.8152111829249963, "flos": 64246253166720.0, "grad_norm": 1.05260805802815, "language_loss": 0.56966209, "learning_rate": 3.4753289865700896e-07, "loss": 0.59170675, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20703125, "step": 13559, "time_per_iteration": 3.3095040321350098 }, { "auxiliary_loss_clip": 0.01179237, "auxiliary_loss_mlp": 0.01022681, "balance_loss_clip": 1.09365773, "balance_loss_mlp": 1.00303578, "epoch": 0.8152713061776642, "flos": 67101788165760.0, "grad_norm": 0.6822284107158197, "language_loss": 0.55229557, "learning_rate": 3.473135354283334e-07, "loss": 0.57431471, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.19628906, "step": 13560, "time_per_iteration": 3.148033380508423 }, { "auxiliary_loss_clip": 0.01396496, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.23767126, "balance_loss_mlp": 1.01430416, "epoch": 0.8153314294303322, "flos": 14398949617920.0, "grad_norm": 1.6336265634506493, "language_loss": 0.6763739, "learning_rate": 3.470942348696948e-07, "loss": 0.7006706, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18859863, "step": 13561, "time_per_iteration": 2.8311214447021484 }, { "auxiliary_loss_clip": 0.0141314, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.24955237, "balance_loss_mlp": 1.01324892, "epoch": 0.8153915526830001, "flos": 25632897932160.0, "grad_norm": 1.6048239218587728, "language_loss": 0.8229087, "learning_rate": 3.468749969894085e-07, "loss": 0.84735841, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18566895, "step": 13562, "time_per_iteration": 2.880063533782959 }, { "auxiliary_loss_clip": 0.01405866, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.24431086, "balance_loss_mlp": 1.01079369, "epoch": 0.8154516759356681, "flos": 23379962169600.0, "grad_norm": 1.5071510172553437, "language_loss": 0.72423738, "learning_rate": 3.4665582179578734e-07, "loss": 0.74859393, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18994141, "step": 13563, "time_per_iteration": 2.8954052925109863 }, { "auxiliary_loss_clip": 0.01395499, "auxiliary_loss_mlp": 0.01033725, "balance_loss_clip": 1.23416996, "balance_loss_mlp": 1.0129106, "epoch": 0.815511799188336, "flos": 28161339137280.0, "grad_norm": 1.6858600436701376, "language_loss": 0.70792514, "learning_rate": 3.4643670929714387e-07, "loss": 0.73221743, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.20788574, "step": 13564, "time_per_iteration": 2.894212245941162 }, { "auxiliary_loss_clip": 0.01406503, "auxiliary_loss_mlp": 0.01032143, "balance_loss_clip": 1.2446394, "balance_loss_mlp": 1.01377249, "epoch": 0.8155719224410041, "flos": 16992371635200.0, "grad_norm": 1.837469886573768, "language_loss": 0.71225548, "learning_rate": 3.462176595017854e-07, "loss": 0.736642, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18383789, "step": 13565, "time_per_iteration": 2.864867925643921 }, { "auxiliary_loss_clip": 0.01394427, "auxiliary_loss_mlp": 0.01031878, "balance_loss_clip": 1.23589516, "balance_loss_mlp": 1.01332939, "epoch": 0.815632045693672, "flos": 24692635601280.0, "grad_norm": 1.9626181086217405, "language_loss": 0.79486388, "learning_rate": 3.459986724180188e-07, "loss": 0.8191269, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18554688, "step": 13566, "time_per_iteration": 2.9344594478607178 }, { "auxiliary_loss_clip": 0.01401934, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.24482656, "balance_loss_mlp": 1.01156259, "epoch": 0.81569216894634, "flos": 19947934183680.0, "grad_norm": 1.7532638678498138, "language_loss": 0.82946861, "learning_rate": 3.457797480541491e-07, "loss": 0.85378218, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17858887, "step": 13567, "time_per_iteration": 2.90010142326355 }, { "auxiliary_loss_clip": 0.0139531, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.23741698, "balance_loss_mlp": 1.00942898, "epoch": 0.8157522921990079, "flos": 21809564501760.0, "grad_norm": 1.9504872357600622, "language_loss": 0.80785686, "learning_rate": 3.455608864184771e-07, "loss": 0.8320815, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17736816, "step": 13568, "time_per_iteration": 2.940246343612671 }, { "auxiliary_loss_clip": 0.01389113, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.23271465, "balance_loss_mlp": 1.01562154, "epoch": 0.8158124154516759, "flos": 18515732693760.0, "grad_norm": 1.737845573299104, "language_loss": 0.78044611, "learning_rate": 3.453420875193016e-07, "loss": 0.80468035, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18688965, "step": 13569, "time_per_iteration": 2.876735210418701 }, { "auxiliary_loss_clip": 0.01402434, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.24333107, "balance_loss_mlp": 1.01407003, "epoch": 0.815872538704344, "flos": 26841336048000.0, "grad_norm": 2.5153069587250654, "language_loss": 0.60361445, "learning_rate": 3.451233513649199e-07, "loss": 0.62796199, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18249512, "step": 13570, "time_per_iteration": 2.878286123275757 }, { "auxiliary_loss_clip": 0.01420727, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.2552911, "balance_loss_mlp": 1.02163768, "epoch": 0.8159326619570119, "flos": 21735670464000.0, "grad_norm": 2.1566803612483456, "language_loss": 0.82799274, "learning_rate": 3.4490467796362687e-07, "loss": 0.85261428, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19775391, "step": 13571, "time_per_iteration": 2.854039192199707 }, { "auxiliary_loss_clip": 0.0140083, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.2400918, "balance_loss_mlp": 1.01650381, "epoch": 0.8159927852096799, "flos": 13847594757120.0, "grad_norm": 7.84937214120772, "language_loss": 0.79818088, "learning_rate": 3.446860673237142e-07, "loss": 0.82255018, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19592285, "step": 13572, "time_per_iteration": 2.8260018825531006 }, { "auxiliary_loss_clip": 0.01390885, "auxiliary_loss_mlp": 0.01032422, "balance_loss_clip": 1.23120713, "balance_loss_mlp": 1.0136826, "epoch": 0.8160529084623478, "flos": 24510434215680.0, "grad_norm": 1.6316371337733233, "language_loss": 0.65803164, "learning_rate": 3.4446751945347186e-07, "loss": 0.68226469, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1875, "step": 13573, "time_per_iteration": 2.8815722465515137 }, { "auxiliary_loss_clip": 0.01386164, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.22865462, "balance_loss_mlp": 1.0165832, "epoch": 0.8161130317150158, "flos": 24836939827200.0, "grad_norm": 1.60587970426764, "language_loss": 0.75788772, "learning_rate": 3.442490343611868e-07, "loss": 0.7820996, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18444824, "step": 13574, "time_per_iteration": 2.892906427383423 }, { "auxiliary_loss_clip": 0.01414185, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.25107038, "balance_loss_mlp": 1.01535404, "epoch": 0.8161731549676837, "flos": 30968887386240.0, "grad_norm": 1.8672883125320314, "language_loss": 0.60763943, "learning_rate": 3.4403061205514485e-07, "loss": 0.6321224, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.1875, "step": 13575, "time_per_iteration": 2.909097194671631 }, { "auxiliary_loss_clip": 0.01401962, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.2421844, "balance_loss_mlp": 1.0131762, "epoch": 0.8162332782203517, "flos": 18561276224640.0, "grad_norm": 1.9168040804998403, "language_loss": 0.74944532, "learning_rate": 3.4381225254362736e-07, "loss": 0.77379477, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19787598, "step": 13576, "time_per_iteration": 2.8313894271850586 }, { "auxiliary_loss_clip": 0.01183961, "auxiliary_loss_mlp": 0.01020512, "balance_loss_clip": 1.09604371, "balance_loss_mlp": 0.99819583, "epoch": 0.8162934014730197, "flos": 70416097885440.0, "grad_norm": 0.8489935748313457, "language_loss": 0.58721066, "learning_rate": 3.435939558349155e-07, "loss": 0.60925531, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.22363281, "step": 13577, "time_per_iteration": 3.358959674835205 }, { "auxiliary_loss_clip": 0.01387996, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.23156059, "balance_loss_mlp": 1.01256263, "epoch": 0.8163535247256877, "flos": 21224655982080.0, "grad_norm": 1.892239910634745, "language_loss": 0.71718776, "learning_rate": 3.4337572193728747e-07, "loss": 0.74137819, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18493652, "step": 13578, "time_per_iteration": 2.852933645248413 }, { "auxiliary_loss_clip": 0.01405302, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.24514103, "balance_loss_mlp": 1.0153327, "epoch": 0.8164136479783556, "flos": 21106937715840.0, "grad_norm": 1.7037479012927401, "language_loss": 0.74561465, "learning_rate": 3.431575508590172e-07, "loss": 0.77000529, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18432617, "step": 13579, "time_per_iteration": 2.894939661026001 }, { "auxiliary_loss_clip": 0.01403014, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.24042416, "balance_loss_mlp": 1.01326227, "epoch": 0.8164737712310236, "flos": 21729290947200.0, "grad_norm": 1.9876260065042013, "language_loss": 0.79443181, "learning_rate": 3.4293944260837873e-07, "loss": 0.81878209, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18762207, "step": 13580, "time_per_iteration": 2.8581268787384033 }, { "auxiliary_loss_clip": 0.01384843, "auxiliary_loss_mlp": 0.01031269, "balance_loss_clip": 1.22907853, "balance_loss_mlp": 1.01246989, "epoch": 0.8165338944836915, "flos": 19546267680000.0, "grad_norm": 1.813379842606365, "language_loss": 0.70175481, "learning_rate": 3.4272139719364314e-07, "loss": 0.72591591, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18786621, "step": 13581, "time_per_iteration": 4.268720626831055 }, { "auxiliary_loss_clip": 0.0141223, "auxiliary_loss_mlp": 0.0103002, "balance_loss_clip": 1.25189352, "balance_loss_mlp": 1.0112443, "epoch": 0.8165940177363595, "flos": 22938543469440.0, "grad_norm": 2.535597630302302, "language_loss": 0.60517716, "learning_rate": 3.4250341462307786e-07, "loss": 0.62959969, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18774414, "step": 13582, "time_per_iteration": 2.853557586669922 }, { "auxiliary_loss_clip": 0.01383944, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.22956252, "balance_loss_mlp": 1.01135635, "epoch": 0.8166541409890276, "flos": 23381500492800.0, "grad_norm": 1.4219648572864834, "language_loss": 0.8270306, "learning_rate": 3.4228549490494897e-07, "loss": 0.85117424, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.19067383, "step": 13583, "time_per_iteration": 2.8908979892730713 }, { "auxiliary_loss_clip": 0.01406107, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.24483752, "balance_loss_mlp": 1.011729, "epoch": 0.8167142642416955, "flos": 18451113840000.0, "grad_norm": 1.7027146221963114, "language_loss": 0.74572939, "learning_rate": 3.4206763804752093e-07, "loss": 0.77009469, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18688965, "step": 13584, "time_per_iteration": 2.842946767807007 }, { "auxiliary_loss_clip": 0.01413256, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.25113368, "balance_loss_mlp": 1.01237655, "epoch": 0.8167743874943635, "flos": 21224836961280.0, "grad_norm": 1.5913049334934954, "language_loss": 0.74825937, "learning_rate": 3.4184984405905405e-07, "loss": 0.77270699, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19128418, "step": 13585, "time_per_iteration": 2.8418047428131104 }, { "auxiliary_loss_clip": 0.01399488, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.24019933, "balance_loss_mlp": 1.01253331, "epoch": 0.8168345107470314, "flos": 18706937794560.0, "grad_norm": 1.5264375937352501, "language_loss": 0.69754159, "learning_rate": 3.416321129478068e-07, "loss": 0.72185504, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19311523, "step": 13586, "time_per_iteration": 4.432991027832031 }, { "auxiliary_loss_clip": 0.01398819, "auxiliary_loss_mlp": 0.01034955, "balance_loss_clip": 1.23985755, "balance_loss_mlp": 1.01679909, "epoch": 0.8168946339996994, "flos": 16261711056000.0, "grad_norm": 1.6887032516169675, "language_loss": 0.6168133, "learning_rate": 3.4141444472203594e-07, "loss": 0.64115107, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18164062, "step": 13587, "time_per_iteration": 2.836973190307617 }, { "auxiliary_loss_clip": 0.01424557, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.25949359, "balance_loss_mlp": 1.01589823, "epoch": 0.8169547572523673, "flos": 26952493818240.0, "grad_norm": 2.178159926810593, "language_loss": 0.70101333, "learning_rate": 3.4119683938999624e-07, "loss": 0.72560465, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.18688965, "step": 13588, "time_per_iteration": 2.8740203380584717 }, { "auxiliary_loss_clip": 0.01409328, "auxiliary_loss_mlp": 0.01034508, "balance_loss_clip": 1.24706411, "balance_loss_mlp": 1.01474333, "epoch": 0.8170148805050353, "flos": 18961449649920.0, "grad_norm": 1.4605860792696277, "language_loss": 0.73127711, "learning_rate": 3.4097929695993854e-07, "loss": 0.75571549, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19750977, "step": 13589, "time_per_iteration": 2.834681749343872 }, { "auxiliary_loss_clip": 0.01390468, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.23284101, "balance_loss_mlp": 1.01668739, "epoch": 0.8170750037577033, "flos": 21844837463040.0, "grad_norm": 1.751038553397567, "language_loss": 0.74025905, "learning_rate": 3.4076181744011166e-07, "loss": 0.76452565, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19506836, "step": 13590, "time_per_iteration": 2.861180067062378 }, { "auxiliary_loss_clip": 0.0142486, "auxiliary_loss_mlp": 0.01037573, "balance_loss_clip": 1.25890338, "balance_loss_mlp": 1.01776052, "epoch": 0.8171351270103713, "flos": 33519299581440.0, "grad_norm": 1.8209665943625968, "language_loss": 0.65270162, "learning_rate": 3.4054440083876345e-07, "loss": 0.67732596, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19812012, "step": 13591, "time_per_iteration": 4.374577522277832 }, { "auxiliary_loss_clip": 0.01416304, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.25123262, "balance_loss_mlp": 1.01158547, "epoch": 0.8171952502630392, "flos": 22717721007360.0, "grad_norm": 2.023447457249368, "language_loss": 0.69270837, "learning_rate": 3.403270471641373e-07, "loss": 0.7171731, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18579102, "step": 13592, "time_per_iteration": 4.260854005813599 }, { "auxiliary_loss_clip": 0.01408693, "auxiliary_loss_mlp": 0.01030594, "balance_loss_clip": 1.24690926, "balance_loss_mlp": 1.01252246, "epoch": 0.8172553735157072, "flos": 26734838492160.0, "grad_norm": 2.230681192681461, "language_loss": 0.67442727, "learning_rate": 3.401097564244759e-07, "loss": 0.69882017, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18078613, "step": 13593, "time_per_iteration": 2.8703136444091797 }, { "auxiliary_loss_clip": 0.01403819, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.24388587, "balance_loss_mlp": 1.01522195, "epoch": 0.8173154967683751, "flos": 15969573509760.0, "grad_norm": 1.7685695493507874, "language_loss": 0.69900858, "learning_rate": 3.398925286280188e-07, "loss": 0.72338438, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18530273, "step": 13594, "time_per_iteration": 2.8059043884277344 }, { "auxiliary_loss_clip": 0.01417144, "auxiliary_loss_mlp": 0.01037006, "balance_loss_clip": 1.25364912, "balance_loss_mlp": 1.01815915, "epoch": 0.8173756200210431, "flos": 25995762380160.0, "grad_norm": 2.043034371449735, "language_loss": 0.66788328, "learning_rate": 3.3967536378300456e-07, "loss": 0.69242477, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18835449, "step": 13595, "time_per_iteration": 2.8718745708465576 }, { "auxiliary_loss_clip": 0.01414343, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.24895597, "balance_loss_mlp": 1.01178277, "epoch": 0.8174357432737112, "flos": 25674957613440.0, "grad_norm": 1.691058253791697, "language_loss": 0.79323345, "learning_rate": 3.394582618976658e-07, "loss": 0.81767833, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18359375, "step": 13596, "time_per_iteration": 2.895853281021118 }, { "auxiliary_loss_clip": 0.01389208, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 1.23142052, "balance_loss_mlp": 1.011343, "epoch": 0.8174958665263791, "flos": 21845063687040.0, "grad_norm": 10.435880081979729, "language_loss": 0.58622366, "learning_rate": 3.392412229802362e-07, "loss": 0.6104154, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18615723, "step": 13597, "time_per_iteration": 2.862119436264038 }, { "auxiliary_loss_clip": 0.0139786, "auxiliary_loss_mlp": 0.01037136, "balance_loss_clip": 1.24030149, "balance_loss_mlp": 1.01853895, "epoch": 0.8175559897790471, "flos": 22465697616000.0, "grad_norm": 1.5786245472374147, "language_loss": 0.82988024, "learning_rate": 3.390242470389462e-07, "loss": 0.85423023, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18603516, "step": 13598, "time_per_iteration": 2.9172825813293457 }, { "auxiliary_loss_clip": 0.01417203, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.25421119, "balance_loss_mlp": 1.01518559, "epoch": 0.817616113031715, "flos": 23624384434560.0, "grad_norm": 1.8601046879346308, "language_loss": 0.8353442, "learning_rate": 3.3880733408202277e-07, "loss": 0.85985589, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.18774414, "step": 13599, "time_per_iteration": 2.854243278503418 }, { "auxiliary_loss_clip": 0.01399065, "auxiliary_loss_mlp": 0.01036945, "balance_loss_clip": 1.24137664, "balance_loss_mlp": 1.01691794, "epoch": 0.817676236284383, "flos": 27683606845440.0, "grad_norm": 1.9713277418723483, "language_loss": 0.84412074, "learning_rate": 3.3859048411769186e-07, "loss": 0.8684808, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.20043945, "step": 13600, "time_per_iteration": 2.8763363361358643 }, { "auxiliary_loss_clip": 0.01410486, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.24817443, "balance_loss_mlp": 1.01456881, "epoch": 0.8177363595370509, "flos": 24691730705280.0, "grad_norm": 1.8280905636982194, "language_loss": 0.74341416, "learning_rate": 3.383736971541766e-07, "loss": 0.76785815, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19348145, "step": 13601, "time_per_iteration": 2.86928129196167 }, { "auxiliary_loss_clip": 0.01425362, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.25818324, "balance_loss_mlp": 1.01215112, "epoch": 0.817796482789719, "flos": 17354874124800.0, "grad_norm": 2.1129279625100956, "language_loss": 0.69408834, "learning_rate": 3.3815697319969737e-07, "loss": 0.71865851, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19519043, "step": 13602, "time_per_iteration": 2.836453676223755 }, { "auxiliary_loss_clip": 0.01398296, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.24033856, "balance_loss_mlp": 1.01434481, "epoch": 0.8178566060423869, "flos": 17785660296960.0, "grad_norm": 3.448251294289978, "language_loss": 0.84687346, "learning_rate": 3.379403122624718e-07, "loss": 0.8711831, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18334961, "step": 13603, "time_per_iteration": 2.858989715576172 }, { "auxiliary_loss_clip": 0.01400519, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.24019825, "balance_loss_mlp": 1.01278913, "epoch": 0.8179167292950549, "flos": 24984139720320.0, "grad_norm": 1.6662148799399195, "language_loss": 0.7037878, "learning_rate": 3.377237143507159e-07, "loss": 0.72810459, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18371582, "step": 13604, "time_per_iteration": 2.9248199462890625 }, { "auxiliary_loss_clip": 0.01399734, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.2418623, "balance_loss_mlp": 1.01459026, "epoch": 0.8179768525477228, "flos": 22867047406080.0, "grad_norm": 1.7264286835492468, "language_loss": 0.74692047, "learning_rate": 3.3750717947264406e-07, "loss": 0.77126062, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19689941, "step": 13605, "time_per_iteration": 2.868864059448242 }, { "auxiliary_loss_clip": 0.01387982, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.23077941, "balance_loss_mlp": 1.01979232, "epoch": 0.8180369758003908, "flos": 18524329205760.0, "grad_norm": 3.6635283318678207, "language_loss": 0.74789143, "learning_rate": 3.372907076364666e-07, "loss": 0.77215445, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18530273, "step": 13606, "time_per_iteration": 2.8313395977020264 }, { "auxiliary_loss_clip": 0.01392122, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.23479259, "balance_loss_mlp": 1.01663482, "epoch": 0.8180970990530587, "flos": 33195010965120.0, "grad_norm": 1.8365911765911214, "language_loss": 0.66687715, "learning_rate": 3.370742988503916e-07, "loss": 0.69113958, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17492676, "step": 13607, "time_per_iteration": 2.9095962047576904 }, { "auxiliary_loss_clip": 0.01402427, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.24139774, "balance_loss_mlp": 1.01311922, "epoch": 0.8181572223057267, "flos": 25020996249600.0, "grad_norm": 1.9821678733533423, "language_loss": 0.71513951, "learning_rate": 3.3685795312262634e-07, "loss": 0.73948133, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18640137, "step": 13608, "time_per_iteration": 2.873129367828369 }, { "auxiliary_loss_clip": 0.01395743, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.23660767, "balance_loss_mlp": 1.01468205, "epoch": 0.8182173455583948, "flos": 28560517176960.0, "grad_norm": 1.7249551515619423, "language_loss": 0.80563682, "learning_rate": 3.366416704613735e-07, "loss": 0.82992333, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18237305, "step": 13609, "time_per_iteration": 2.97982120513916 }, { "auxiliary_loss_clip": 0.01183958, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.09492874, "balance_loss_mlp": 1.00341892, "epoch": 0.8182774688110627, "flos": 72057539168640.0, "grad_norm": 0.7703080376404929, "language_loss": 0.5591746, "learning_rate": 3.3642545087483544e-07, "loss": 0.58126396, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.21582031, "step": 13610, "time_per_iteration": 3.4243316650390625 }, { "auxiliary_loss_clip": 0.01382246, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.22831011, "balance_loss_mlp": 1.01374888, "epoch": 0.8183375920637307, "flos": 19764782657280.0, "grad_norm": 2.4211628357598856, "language_loss": 0.78486025, "learning_rate": 3.362092943712107e-07, "loss": 0.80900127, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.1809082, "step": 13611, "time_per_iteration": 2.8364012241363525 }, { "auxiliary_loss_clip": 0.01426859, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.2589817, "balance_loss_mlp": 1.0133158, "epoch": 0.8183977153163986, "flos": 22350965506560.0, "grad_norm": 2.193083279597007, "language_loss": 0.77722037, "learning_rate": 3.3599320095869745e-07, "loss": 0.80182135, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19921875, "step": 13612, "time_per_iteration": 2.889310836791992 }, { "auxiliary_loss_clip": 0.01397209, "auxiliary_loss_mlp": 0.01030662, "balance_loss_clip": 1.23885846, "balance_loss_mlp": 1.01186252, "epoch": 0.8184578385690666, "flos": 17721448646400.0, "grad_norm": 2.095936772390873, "language_loss": 0.87192839, "learning_rate": 3.3577717064548793e-07, "loss": 0.89620709, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18798828, "step": 13613, "time_per_iteration": 2.864138603210449 }, { "auxiliary_loss_clip": 0.01402594, "auxiliary_loss_mlp": 0.01036441, "balance_loss_clip": 1.24379086, "balance_loss_mlp": 1.01772523, "epoch": 0.8185179618217345, "flos": 25711768897920.0, "grad_norm": 1.4036608954610408, "language_loss": 0.73503768, "learning_rate": 3.355612034397746e-07, "loss": 0.75942802, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18713379, "step": 13614, "time_per_iteration": 2.873224973678589 }, { "auxiliary_loss_clip": 0.01415296, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.25299859, "balance_loss_mlp": 1.01620936, "epoch": 0.8185780850744026, "flos": 25971981598080.0, "grad_norm": 5.186502430899813, "language_loss": 0.81854641, "learning_rate": 3.353452993497479e-07, "loss": 0.84305227, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19067383, "step": 13615, "time_per_iteration": 2.9031550884246826 }, { "auxiliary_loss_clip": 0.01402397, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.24237633, "balance_loss_mlp": 1.01529276, "epoch": 0.8186382083270705, "flos": 25239330247680.0, "grad_norm": 2.1206491673360897, "language_loss": 0.76197088, "learning_rate": 3.3512945838359375e-07, "loss": 0.78634256, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19482422, "step": 13616, "time_per_iteration": 4.312572240829468 }, { "auxiliary_loss_clip": 0.01393095, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.23569465, "balance_loss_mlp": 1.01511216, "epoch": 0.8186983315797385, "flos": 22424135627520.0, "grad_norm": 1.8628166690582386, "language_loss": 0.7567479, "learning_rate": 3.349136805494979e-07, "loss": 0.7810201, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19006348, "step": 13617, "time_per_iteration": 2.901494026184082 }, { "auxiliary_loss_clip": 0.01389898, "auxiliary_loss_mlp": 0.01033116, "balance_loss_clip": 1.23222899, "balance_loss_mlp": 1.01463902, "epoch": 0.8187584548324064, "flos": 22028169968640.0, "grad_norm": 1.9961347428385463, "language_loss": 0.68744707, "learning_rate": 3.346979658556415e-07, "loss": 0.71167719, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18469238, "step": 13618, "time_per_iteration": 2.9176394939422607 }, { "auxiliary_loss_clip": 0.01423412, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.25669456, "balance_loss_mlp": 1.01536798, "epoch": 0.8188185780850744, "flos": 29253552065280.0, "grad_norm": 2.062410412721167, "language_loss": 0.70580137, "learning_rate": 3.344823143102058e-07, "loss": 0.73038602, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19665527, "step": 13619, "time_per_iteration": 2.938044548034668 }, { "auxiliary_loss_clip": 0.01404368, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.24291658, "balance_loss_mlp": 1.01553607, "epoch": 0.8188787013377423, "flos": 20704230581760.0, "grad_norm": 1.9534950082902396, "language_loss": 0.7440114, "learning_rate": 3.3426672592136694e-07, "loss": 0.76840389, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19335938, "step": 13620, "time_per_iteration": 2.8428852558135986 }, { "auxiliary_loss_clip": 0.0139245, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.23495793, "balance_loss_mlp": 1.01205897, "epoch": 0.8189388245904103, "flos": 23743595779200.0, "grad_norm": 1.8107132326994495, "language_loss": 0.76842308, "learning_rate": 3.340512006973011e-07, "loss": 0.7926569, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18859863, "step": 13621, "time_per_iteration": 4.277806520462036 }, { "auxiliary_loss_clip": 0.01402575, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.2445159, "balance_loss_mlp": 1.01480889, "epoch": 0.8189989478430784, "flos": 28266343614720.0, "grad_norm": 2.2115070196176814, "language_loss": 0.6653049, "learning_rate": 3.3383573864618076e-07, "loss": 0.68966341, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18469238, "step": 13622, "time_per_iteration": 2.956760883331299 }, { "auxiliary_loss_clip": 0.01408058, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.24783945, "balance_loss_mlp": 1.01221371, "epoch": 0.8190590710957463, "flos": 21407671774080.0, "grad_norm": 1.7694429607167805, "language_loss": 0.7554701, "learning_rate": 3.3362033977617653e-07, "loss": 0.77987111, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19812012, "step": 13623, "time_per_iteration": 2.9342575073242188 }, { "auxiliary_loss_clip": 0.01407345, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.24617624, "balance_loss_mlp": 1.01653743, "epoch": 0.8191191943484143, "flos": 38809655015040.0, "grad_norm": 1.8929206703065888, "language_loss": 0.63991237, "learning_rate": 3.3340500409545527e-07, "loss": 0.66434741, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19628906, "step": 13624, "time_per_iteration": 3.0209832191467285 }, { "auxiliary_loss_clip": 0.01389911, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.23346376, "balance_loss_mlp": 1.01312184, "epoch": 0.8191793176010822, "flos": 25457257042560.0, "grad_norm": 2.4609487857970276, "language_loss": 0.79125261, "learning_rate": 3.3318973161218386e-07, "loss": 0.81547356, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19055176, "step": 13625, "time_per_iteration": 2.9937992095947266 }, { "auxiliary_loss_clip": 0.01429656, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.25998366, "balance_loss_mlp": 1.01622057, "epoch": 0.8192394408537502, "flos": 25094030636160.0, "grad_norm": 1.9500094621619282, "language_loss": 0.76437545, "learning_rate": 3.329745223345244e-07, "loss": 0.78901905, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.18469238, "step": 13626, "time_per_iteration": 2.8793458938598633 }, { "auxiliary_loss_clip": 0.01397251, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.23877645, "balance_loss_mlp": 1.01713729, "epoch": 0.8192995641064181, "flos": 27685461882240.0, "grad_norm": 2.0789645144914783, "language_loss": 0.74311423, "learning_rate": 3.3275937627063823e-07, "loss": 0.76745099, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19262695, "step": 13627, "time_per_iteration": 4.376024961471558 }, { "auxiliary_loss_clip": 0.01414715, "auxiliary_loss_mlp": 0.01032839, "balance_loss_clip": 1.25084043, "balance_loss_mlp": 1.01477861, "epoch": 0.8193596873590862, "flos": 21298504775040.0, "grad_norm": 1.9376342572017424, "language_loss": 0.69242108, "learning_rate": 3.3254429342868353e-07, "loss": 0.71689665, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18041992, "step": 13628, "time_per_iteration": 2.8965373039245605 }, { "auxiliary_loss_clip": 0.01421562, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.25629222, "balance_loss_mlp": 1.0165813, "epoch": 0.8194198106117541, "flos": 17501304856320.0, "grad_norm": 1.6044243598610655, "language_loss": 0.85823864, "learning_rate": 3.323292738168171e-07, "loss": 0.88282204, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20178223, "step": 13629, "time_per_iteration": 2.8026208877563477 }, { "auxiliary_loss_clip": 0.0140338, "auxiliary_loss_mlp": 0.01033143, "balance_loss_clip": 1.24225378, "balance_loss_mlp": 1.01440322, "epoch": 0.8194799338644221, "flos": 15276267152640.0, "grad_norm": 1.9979581057163118, "language_loss": 0.74643719, "learning_rate": 3.3211431744319084e-07, "loss": 0.77080244, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1875, "step": 13630, "time_per_iteration": 2.781601667404175 }, { "auxiliary_loss_clip": 0.01406999, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.24410391, "balance_loss_mlp": 1.01312423, "epoch": 0.81954005711709, "flos": 14726541104640.0, "grad_norm": 6.1177006827861495, "language_loss": 0.72865582, "learning_rate": 3.31899424315957e-07, "loss": 0.75304383, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18676758, "step": 13631, "time_per_iteration": 2.8208000659942627 }, { "auxiliary_loss_clip": 0.01409072, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.24673104, "balance_loss_mlp": 1.01669145, "epoch": 0.819600180369758, "flos": 23083933570560.0, "grad_norm": 1.6783451019322302, "language_loss": 0.76942509, "learning_rate": 3.3168459444326447e-07, "loss": 0.79386747, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18493652, "step": 13632, "time_per_iteration": 2.8577749729156494 }, { "auxiliary_loss_clip": 0.01406405, "auxiliary_loss_mlp": 0.01029406, "balance_loss_clip": 1.24502659, "balance_loss_mlp": 1.01138186, "epoch": 0.8196603036224259, "flos": 27611251130880.0, "grad_norm": 1.8870648514627582, "language_loss": 0.66182685, "learning_rate": 3.314698278332588e-07, "loss": 0.686185, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18029785, "step": 13633, "time_per_iteration": 2.881197929382324 }, { "auxiliary_loss_clip": 0.01390403, "auxiliary_loss_mlp": 0.01028773, "balance_loss_clip": 1.23377073, "balance_loss_mlp": 1.01073694, "epoch": 0.8197204268750939, "flos": 28592984960640.0, "grad_norm": 1.483150896501977, "language_loss": 0.75866783, "learning_rate": 3.3125512449408513e-07, "loss": 0.78285962, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18054199, "step": 13634, "time_per_iteration": 2.9018876552581787 }, { "auxiliary_loss_clip": 0.01396342, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.23928833, "balance_loss_mlp": 1.0172708, "epoch": 0.819780550127762, "flos": 23268713909760.0, "grad_norm": 1.9968649437151198, "language_loss": 0.82090646, "learning_rate": 3.310404844338841e-07, "loss": 0.84522498, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18261719, "step": 13635, "time_per_iteration": 2.8636741638183594 }, { "auxiliary_loss_clip": 0.01399439, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.23808312, "balance_loss_mlp": 1.01343608, "epoch": 0.8198406733804299, "flos": 26695855457280.0, "grad_norm": 2.200115374195653, "language_loss": 0.76791167, "learning_rate": 3.308259076607949e-07, "loss": 0.7922377, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19714355, "step": 13636, "time_per_iteration": 2.864133358001709 }, { "auxiliary_loss_clip": 0.01405985, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.24704552, "balance_loss_mlp": 1.01466012, "epoch": 0.8199007966330979, "flos": 20093686243200.0, "grad_norm": 1.9848408531013424, "language_loss": 0.81718493, "learning_rate": 3.3061139418295445e-07, "loss": 0.84157205, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18066406, "step": 13637, "time_per_iteration": 2.825047016143799 }, { "auxiliary_loss_clip": 0.0139592, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.23747778, "balance_loss_mlp": 1.01566863, "epoch": 0.8199609198857658, "flos": 31914760072320.0, "grad_norm": 2.4306871214085923, "language_loss": 0.72029632, "learning_rate": 3.3039694400849725e-07, "loss": 0.74460137, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18933105, "step": 13638, "time_per_iteration": 2.930588483810425 }, { "auxiliary_loss_clip": 0.01409199, "auxiliary_loss_mlp": 0.01032568, "balance_loss_clip": 1.24464035, "balance_loss_mlp": 1.01237416, "epoch": 0.8200210431384338, "flos": 26481683980800.0, "grad_norm": 3.385886401367235, "language_loss": 0.80704951, "learning_rate": 3.3018255714555564e-07, "loss": 0.83146721, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.2019043, "step": 13639, "time_per_iteration": 2.9667327404022217 }, { "auxiliary_loss_clip": 0.01389367, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.23132193, "balance_loss_mlp": 1.01337707, "epoch": 0.8200811663911017, "flos": 22101747292800.0, "grad_norm": 1.7304522804709102, "language_loss": 0.7956779, "learning_rate": 3.299682336022589e-07, "loss": 0.81989324, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18811035, "step": 13640, "time_per_iteration": 2.8516106605529785 }, { "auxiliary_loss_clip": 0.01423125, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.25537777, "balance_loss_mlp": 1.01555538, "epoch": 0.8201412896437698, "flos": 37606872499200.0, "grad_norm": 2.3505325070472436, "language_loss": 0.64389557, "learning_rate": 3.297539733867336e-07, "loss": 0.66847944, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 1.67871094, "router_z_loss_mlp": 0.19714355, "step": 13641, "time_per_iteration": 3.010220766067505 }, { "auxiliary_loss_clip": 0.01396585, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.23675311, "balance_loss_mlp": 1.01350474, "epoch": 0.8202014128964377, "flos": 19655841882240.0, "grad_norm": 1.721750478810888, "language_loss": 0.74162078, "learning_rate": 3.295397765071055e-07, "loss": 0.76590776, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18603516, "step": 13642, "time_per_iteration": 2.8540937900543213 }, { "auxiliary_loss_clip": 0.01397167, "auxiliary_loss_mlp": 0.01035384, "balance_loss_clip": 1.23837531, "balance_loss_mlp": 1.01525009, "epoch": 0.8202615361491057, "flos": 31479811378560.0, "grad_norm": 1.5632308343215724, "language_loss": 0.71268678, "learning_rate": 3.2932564297149615e-07, "loss": 0.73701227, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.20129395, "step": 13643, "time_per_iteration": 2.9355967044830322 }, { "auxiliary_loss_clip": 0.01400363, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.24161983, "balance_loss_mlp": 1.0152688, "epoch": 0.8203216594017736, "flos": 24725782056960.0, "grad_norm": 1.7839900812620881, "language_loss": 0.66279197, "learning_rate": 3.291115727880256e-07, "loss": 0.68713522, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18701172, "step": 13644, "time_per_iteration": 2.872098684310913 }, { "auxiliary_loss_clip": 0.01408262, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.24662268, "balance_loss_mlp": 1.01611459, "epoch": 0.8203817826544416, "flos": 26043115703040.0, "grad_norm": 1.4066573696012568, "language_loss": 0.71743143, "learning_rate": 3.2889756596481234e-07, "loss": 0.74186182, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18664551, "step": 13645, "time_per_iteration": 2.879178524017334 }, { "auxiliary_loss_clip": 0.01393253, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.23606634, "balance_loss_mlp": 1.01731277, "epoch": 0.8204419059071095, "flos": 25964697185280.0, "grad_norm": 1.9392072925540567, "language_loss": 0.71655375, "learning_rate": 3.286836225099707e-07, "loss": 0.740839, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.17980957, "step": 13646, "time_per_iteration": 2.9040184020996094 }, { "auxiliary_loss_clip": 0.01418689, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.25494301, "balance_loss_mlp": 1.01476657, "epoch": 0.8205020291597775, "flos": 23588840004480.0, "grad_norm": 2.3642757223903823, "language_loss": 0.79674739, "learning_rate": 3.284697424316132e-07, "loss": 0.82127136, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18933105, "step": 13647, "time_per_iteration": 2.849364757537842 }, { "auxiliary_loss_clip": 0.01395457, "auxiliary_loss_mlp": 0.0103005, "balance_loss_clip": 1.23937583, "balance_loss_mlp": 1.01184702, "epoch": 0.8205621524124456, "flos": 26810949525120.0, "grad_norm": 1.421317001165268, "language_loss": 0.68978417, "learning_rate": 3.2825592573785034e-07, "loss": 0.71403921, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18212891, "step": 13648, "time_per_iteration": 2.898265838623047 }, { "auxiliary_loss_clip": 0.01408909, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.24726343, "balance_loss_mlp": 1.01338434, "epoch": 0.8206222756651135, "flos": 27539574088320.0, "grad_norm": 3.9286076544463806, "language_loss": 0.80893523, "learning_rate": 3.28042172436791e-07, "loss": 0.83334088, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18261719, "step": 13649, "time_per_iteration": 2.8910861015319824 }, { "auxiliary_loss_clip": 0.01409428, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.24831748, "balance_loss_mlp": 1.01210141, "epoch": 0.8206823989177815, "flos": 21188478124800.0, "grad_norm": 1.6626201439531358, "language_loss": 0.69518292, "learning_rate": 3.278284825365396e-07, "loss": 0.71960139, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20349121, "step": 13650, "time_per_iteration": 2.978311777114868 }, { "auxiliary_loss_clip": 0.01407334, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.24645329, "balance_loss_mlp": 1.01235819, "epoch": 0.8207425221704494, "flos": 11516919148800.0, "grad_norm": 2.0958116947717733, "language_loss": 0.62388122, "learning_rate": 3.276148560452001e-07, "loss": 0.64827031, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19226074, "step": 13651, "time_per_iteration": 4.307434558868408 }, { "auxiliary_loss_clip": 0.01421374, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.25898623, "balance_loss_mlp": 1.0136677, "epoch": 0.8208026454231174, "flos": 19801458207360.0, "grad_norm": 2.0262157922183386, "language_loss": 0.73311204, "learning_rate": 3.2740129297087293e-07, "loss": 0.75765198, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1895752, "step": 13652, "time_per_iteration": 2.967442512512207 }, { "auxiliary_loss_clip": 0.01382652, "auxiliary_loss_mlp": 0.01031066, "balance_loss_clip": 1.22748733, "balance_loss_mlp": 1.01302922, "epoch": 0.8208627686757853, "flos": 15675535681920.0, "grad_norm": 2.1597454015734967, "language_loss": 0.73173773, "learning_rate": 3.271877933216558e-07, "loss": 0.75587487, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18041992, "step": 13653, "time_per_iteration": 2.8257453441619873 }, { "auxiliary_loss_clip": 0.01428657, "auxiliary_loss_mlp": 0.01036636, "balance_loss_clip": 1.26314211, "balance_loss_mlp": 1.01614428, "epoch": 0.8209228919284534, "flos": 37495443260160.0, "grad_norm": 1.8268479763457002, "language_loss": 0.63936377, "learning_rate": 3.269743571056451e-07, "loss": 0.66401672, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.20495605, "step": 13654, "time_per_iteration": 2.971712589263916 }, { "auxiliary_loss_clip": 0.01406561, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.2442807, "balance_loss_mlp": 1.01284254, "epoch": 0.8209830151811213, "flos": 23123504787840.0, "grad_norm": 1.8026333341910348, "language_loss": 0.7085861, "learning_rate": 3.2676098433093447e-07, "loss": 0.73296189, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.1817627, "step": 13655, "time_per_iteration": 2.8667402267456055 }, { "auxiliary_loss_clip": 0.01396301, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.23802543, "balance_loss_mlp": 1.01624, "epoch": 0.8210431384337893, "flos": 21298097571840.0, "grad_norm": 1.9666750434396565, "language_loss": 0.82932436, "learning_rate": 3.265476750056162e-07, "loss": 0.85363698, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18713379, "step": 13656, "time_per_iteration": 4.252135992050171 }, { "auxiliary_loss_clip": 0.01387595, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.23169196, "balance_loss_mlp": 1.01626468, "epoch": 0.8211032616864572, "flos": 11507824944000.0, "grad_norm": 2.1447991137201807, "language_loss": 0.74644238, "learning_rate": 3.2633442913777654e-07, "loss": 0.7706818, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.20080566, "step": 13657, "time_per_iteration": 2.8247885704040527 }, { "auxiliary_loss_clip": 0.01397819, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.23885417, "balance_loss_mlp": 1.01470935, "epoch": 0.8211633849391252, "flos": 29832488271360.0, "grad_norm": 1.7331919185024978, "language_loss": 0.56518334, "learning_rate": 3.2612124673550325e-07, "loss": 0.58949363, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18493652, "step": 13658, "time_per_iteration": 2.9081149101257324 }, { "auxiliary_loss_clip": 0.01400373, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.23804402, "balance_loss_mlp": 1.01723444, "epoch": 0.8212235081917931, "flos": 13123268449920.0, "grad_norm": 2.2489389178019703, "language_loss": 0.80101293, "learning_rate": 3.259081278068805e-07, "loss": 0.82539034, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20129395, "step": 13659, "time_per_iteration": 2.8728461265563965 }, { "auxiliary_loss_clip": 0.01389215, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.23394406, "balance_loss_mlp": 1.01249719, "epoch": 0.8212836314444611, "flos": 40530057753600.0, "grad_norm": 1.5825785860118864, "language_loss": 0.60191619, "learning_rate": 3.256950723599887e-07, "loss": 0.62611043, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.17724609, "step": 13660, "time_per_iteration": 2.982999801635742 }, { "auxiliary_loss_clip": 0.01413157, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.25083923, "balance_loss_mlp": 1.0163635, "epoch": 0.8213437546971292, "flos": 18779791201920.0, "grad_norm": 1.9095221264378233, "language_loss": 0.7402159, "learning_rate": 3.254820804029075e-07, "loss": 0.76470077, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18969727, "step": 13661, "time_per_iteration": 2.850397825241089 }, { "auxiliary_loss_clip": 0.01415593, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.25210583, "balance_loss_mlp": 1.01694059, "epoch": 0.8214038779497971, "flos": 19691657781120.0, "grad_norm": 3.0480544883713634, "language_loss": 0.76007998, "learning_rate": 3.252691519437143e-07, "loss": 0.78459865, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19335938, "step": 13662, "time_per_iteration": 4.318909406661987 }, { "auxiliary_loss_clip": 0.0117783, "auxiliary_loss_mlp": 0.01024624, "balance_loss_clip": 1.09023976, "balance_loss_mlp": 1.0016408, "epoch": 0.8214640012024651, "flos": 71635666222080.0, "grad_norm": 0.744445010058128, "language_loss": 0.54041034, "learning_rate": 3.250562869904825e-07, "loss": 0.56243491, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.22949219, "step": 13663, "time_per_iteration": 3.4888930320739746 }, { "auxiliary_loss_clip": 0.01400466, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.23901188, "balance_loss_mlp": 1.01611114, "epoch": 0.821524124455133, "flos": 14765433649920.0, "grad_norm": 2.920265910623727, "language_loss": 0.66848946, "learning_rate": 3.248434855512838e-07, "loss": 0.69284701, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19177246, "step": 13664, "time_per_iteration": 2.852261543273926 }, { "auxiliary_loss_clip": 0.01398735, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.24135566, "balance_loss_mlp": 1.01462483, "epoch": 0.821584247707801, "flos": 25093080495360.0, "grad_norm": 1.9052408941484495, "language_loss": 0.75660419, "learning_rate": 3.246307476341881e-07, "loss": 0.78091788, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17993164, "step": 13665, "time_per_iteration": 2.8690290451049805 }, { "auxiliary_loss_clip": 0.01402423, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.24179065, "balance_loss_mlp": 1.01588786, "epoch": 0.8216443709604689, "flos": 36844558542720.0, "grad_norm": 2.0451820159758856, "language_loss": 0.66091228, "learning_rate": 3.2441807324726256e-07, "loss": 0.68527883, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18347168, "step": 13666, "time_per_iteration": 2.970503807067871 }, { "auxiliary_loss_clip": 0.01412781, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.25355363, "balance_loss_mlp": 1.01930714, "epoch": 0.821704494213137, "flos": 25092492312960.0, "grad_norm": 1.6336781317191988, "language_loss": 0.7740593, "learning_rate": 3.2420546239857174e-07, "loss": 0.7985664, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18603516, "step": 13667, "time_per_iteration": 2.862119674682617 }, { "auxiliary_loss_clip": 0.01409724, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.2481811, "balance_loss_mlp": 1.01493382, "epoch": 0.8217646174658049, "flos": 14364310083840.0, "grad_norm": 1.778425629772752, "language_loss": 0.77607071, "learning_rate": 3.239929150961773e-07, "loss": 0.80051142, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19384766, "step": 13668, "time_per_iteration": 2.7893452644348145 }, { "auxiliary_loss_clip": 0.01400497, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.24206471, "balance_loss_mlp": 1.01368856, "epoch": 0.8218247407184729, "flos": 22100616172800.0, "grad_norm": 1.942083401316505, "language_loss": 0.74853319, "learning_rate": 3.2378043134813984e-07, "loss": 0.77286112, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18603516, "step": 13669, "time_per_iteration": 2.8443350791931152 }, { "auxiliary_loss_clip": 0.0140475, "auxiliary_loss_mlp": 0.01031045, "balance_loss_clip": 1.2454809, "balance_loss_mlp": 1.01223409, "epoch": 0.8218848639711408, "flos": 16772816027520.0, "grad_norm": 1.605876823669812, "language_loss": 0.79401231, "learning_rate": 3.235680111625161e-07, "loss": 0.81837034, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18811035, "step": 13670, "time_per_iteration": 2.8055572509765625 }, { "auxiliary_loss_clip": 0.01421952, "auxiliary_loss_mlp": 0.0103486, "balance_loss_clip": 1.25859547, "balance_loss_mlp": 1.01625097, "epoch": 0.8219449872238088, "flos": 26005942460160.0, "grad_norm": 2.293031743267957, "language_loss": 0.75486302, "learning_rate": 3.2335565454736123e-07, "loss": 0.77943116, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18591309, "step": 13671, "time_per_iteration": 2.8439908027648926 }, { "auxiliary_loss_clip": 0.0142661, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.26085401, "balance_loss_mlp": 1.01600814, "epoch": 0.8220051104764767, "flos": 20788078475520.0, "grad_norm": 1.6870705656880354, "language_loss": 0.77398109, "learning_rate": 3.23143361510728e-07, "loss": 0.79859346, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.1862793, "step": 13672, "time_per_iteration": 2.8243961334228516 }, { "auxiliary_loss_clip": 0.0141131, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 1.25120795, "balance_loss_mlp": 1.01473665, "epoch": 0.8220652337291448, "flos": 14583051285120.0, "grad_norm": 2.7563548972293423, "language_loss": 0.75327229, "learning_rate": 3.2293113206066733e-07, "loss": 0.77772266, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18994141, "step": 13673, "time_per_iteration": 2.8409719467163086 }, { "auxiliary_loss_clip": 0.0141031, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.24593389, "balance_loss_mlp": 1.01389098, "epoch": 0.8221253569818128, "flos": 23816494431360.0, "grad_norm": 1.6840168862864786, "language_loss": 0.80428112, "learning_rate": 3.227189662052254e-07, "loss": 0.82872128, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19824219, "step": 13674, "time_per_iteration": 2.9772467613220215 }, { "auxiliary_loss_clip": 0.01407106, "auxiliary_loss_mlp": 0.01032204, "balance_loss_clip": 1.24663913, "balance_loss_mlp": 1.01314282, "epoch": 0.8221854802344807, "flos": 21298188061440.0, "grad_norm": 1.802651664678103, "language_loss": 0.71545148, "learning_rate": 3.225068639524484e-07, "loss": 0.73984456, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19055176, "step": 13675, "time_per_iteration": 2.8192877769470215 }, { "auxiliary_loss_clip": 0.01395033, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.23851407, "balance_loss_mlp": 1.014189, "epoch": 0.8222456034871487, "flos": 20965845870720.0, "grad_norm": 2.660701636763627, "language_loss": 0.74564385, "learning_rate": 3.2229482531037965e-07, "loss": 0.76991862, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18261719, "step": 13676, "time_per_iteration": 2.835282564163208 }, { "auxiliary_loss_clip": 0.01406398, "auxiliary_loss_mlp": 0.01033644, "balance_loss_clip": 1.24763227, "balance_loss_mlp": 1.01501179, "epoch": 0.8223057267398166, "flos": 21407445550080.0, "grad_norm": 1.8142350739896407, "language_loss": 0.81142485, "learning_rate": 3.2208285028705893e-07, "loss": 0.83582526, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1862793, "step": 13677, "time_per_iteration": 2.8345768451690674 }, { "auxiliary_loss_clip": 0.01409676, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.24912667, "balance_loss_mlp": 1.01701176, "epoch": 0.8223658499924846, "flos": 15276629111040.0, "grad_norm": 3.22012263315414, "language_loss": 0.70468175, "learning_rate": 3.218709388905245e-07, "loss": 0.72913909, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19055176, "step": 13678, "time_per_iteration": 2.941174268722534 }, { "auxiliary_loss_clip": 0.0139823, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.23882735, "balance_loss_mlp": 1.01617157, "epoch": 0.8224259732451525, "flos": 31262246542080.0, "grad_norm": 1.7349581133261578, "language_loss": 0.7197749, "learning_rate": 3.216590911288133e-07, "loss": 0.74410921, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19018555, "step": 13679, "time_per_iteration": 2.924630880355835 }, { "auxiliary_loss_clip": 0.01393328, "auxiliary_loss_mlp": 0.01030124, "balance_loss_clip": 1.23559165, "balance_loss_mlp": 1.01192057, "epoch": 0.8224860964978206, "flos": 21583222174080.0, "grad_norm": 2.030442085679249, "language_loss": 0.70538437, "learning_rate": 3.214473070099564e-07, "loss": 0.72961891, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18200684, "step": 13680, "time_per_iteration": 2.9089460372924805 }, { "auxiliary_loss_clip": 0.01405769, "auxiliary_loss_mlp": 0.01030524, "balance_loss_clip": 1.24729967, "balance_loss_mlp": 1.01154637, "epoch": 0.8225462197504885, "flos": 25494023082240.0, "grad_norm": 2.3329270094302, "language_loss": 0.60459745, "learning_rate": 3.21235586541986e-07, "loss": 0.62896037, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18981934, "step": 13681, "time_per_iteration": 2.9251773357391357 }, { "auxiliary_loss_clip": 0.01419393, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.2548039, "balance_loss_mlp": 1.01849747, "epoch": 0.8226063430031565, "flos": 39400173889920.0, "grad_norm": 2.497758816655288, "language_loss": 0.69925141, "learning_rate": 3.2102392973293047e-07, "loss": 0.72381788, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1875, "step": 13682, "time_per_iteration": 3.005311965942383 }, { "auxiliary_loss_clip": 0.01411411, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.25019026, "balance_loss_mlp": 1.01433063, "epoch": 0.8226664662558244, "flos": 22824218563200.0, "grad_norm": 2.618459019011458, "language_loss": 0.80074084, "learning_rate": 3.20812336590816e-07, "loss": 0.82519579, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19750977, "step": 13683, "time_per_iteration": 2.8630001544952393 }, { "auxiliary_loss_clip": 0.01392193, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.23580647, "balance_loss_mlp": 1.01682103, "epoch": 0.8227265895084924, "flos": 25676450691840.0, "grad_norm": 4.048485467748755, "language_loss": 0.86977589, "learning_rate": 3.206008071236661e-07, "loss": 0.89404625, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18017578, "step": 13684, "time_per_iteration": 2.877929925918579 }, { "auxiliary_loss_clip": 0.01382083, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.22768927, "balance_loss_mlp": 1.01716256, "epoch": 0.8227867127611603, "flos": 26190994268160.0, "grad_norm": 3.303770459842274, "language_loss": 0.79905272, "learning_rate": 3.2038934133950157e-07, "loss": 0.82322645, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18151855, "step": 13685, "time_per_iteration": 2.908799171447754 }, { "auxiliary_loss_clip": 0.01407954, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.24854147, "balance_loss_mlp": 1.01346123, "epoch": 0.8228468360138284, "flos": 22028215213440.0, "grad_norm": 1.5224651959201947, "language_loss": 0.68844295, "learning_rate": 3.2017793924634194e-07, "loss": 0.71284819, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19116211, "step": 13686, "time_per_iteration": 4.27991795539856 }, { "auxiliary_loss_clip": 0.01402572, "auxiliary_loss_mlp": 0.0103471, "balance_loss_clip": 1.24099517, "balance_loss_mlp": 1.01536214, "epoch": 0.8229069592664963, "flos": 14911683402240.0, "grad_norm": 2.7821443233419707, "language_loss": 0.79162288, "learning_rate": 3.1996660085220263e-07, "loss": 0.81599569, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19348145, "step": 13687, "time_per_iteration": 2.7884604930877686 }, { "auxiliary_loss_clip": 0.01396562, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.23727822, "balance_loss_mlp": 1.00990915, "epoch": 0.8229670825191643, "flos": 15677978901120.0, "grad_norm": 2.4046791614399488, "language_loss": 0.72995955, "learning_rate": 3.1975532616509825e-07, "loss": 0.75421834, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1940918, "step": 13688, "time_per_iteration": 2.8004329204559326 }, { "auxiliary_loss_clip": 0.01406315, "auxiliary_loss_mlp": 0.01036474, "balance_loss_clip": 1.24626565, "balance_loss_mlp": 1.01852095, "epoch": 0.8230272057718323, "flos": 23193417283200.0, "grad_norm": 1.6211196503663652, "language_loss": 0.73996115, "learning_rate": 3.1954411519304025e-07, "loss": 0.76438904, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.17944336, "step": 13689, "time_per_iteration": 2.8848063945770264 }, { "auxiliary_loss_clip": 0.01405685, "auxiliary_loss_mlp": 0.01036669, "balance_loss_clip": 1.24406278, "balance_loss_mlp": 1.01854944, "epoch": 0.8230873290245002, "flos": 21042183127680.0, "grad_norm": 1.925028251133866, "language_loss": 0.70363772, "learning_rate": 3.1933296794403887e-07, "loss": 0.7280612, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18139648, "step": 13690, "time_per_iteration": 2.8648109436035156 }, { "auxiliary_loss_clip": 0.01419054, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.25673807, "balance_loss_mlp": 1.01467919, "epoch": 0.8231474522771682, "flos": 21259657474560.0, "grad_norm": 1.8609847938000594, "language_loss": 0.86105227, "learning_rate": 3.191218844260988e-07, "loss": 0.88557291, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18334961, "step": 13691, "time_per_iteration": 4.24226713180542 }, { "auxiliary_loss_clip": 0.01414804, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.25297797, "balance_loss_mlp": 1.01446187, "epoch": 0.8232075755298361, "flos": 23852581799040.0, "grad_norm": 1.6825036275418859, "language_loss": 0.77793479, "learning_rate": 3.189108646472252e-07, "loss": 0.80242044, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19299316, "step": 13692, "time_per_iteration": 2.8536882400512695 }, { "auxiliary_loss_clip": 0.01404193, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.24601579, "balance_loss_mlp": 1.01470542, "epoch": 0.8232676987825042, "flos": 21664219645440.0, "grad_norm": 1.9760270965883517, "language_loss": 0.72481072, "learning_rate": 3.186999086154205e-07, "loss": 0.74918413, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18444824, "step": 13693, "time_per_iteration": 2.844651222229004 }, { "auxiliary_loss_clip": 0.01394428, "auxiliary_loss_mlp": 0.01032594, "balance_loss_clip": 1.23865128, "balance_loss_mlp": 1.01501119, "epoch": 0.8233278220351721, "flos": 26333805415680.0, "grad_norm": 1.912882273387837, "language_loss": 0.84446073, "learning_rate": 3.1848901633868355e-07, "loss": 0.8687309, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.17602539, "step": 13694, "time_per_iteration": 2.892763376235962 }, { "auxiliary_loss_clip": 0.0142096, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.25699389, "balance_loss_mlp": 1.01656556, "epoch": 0.8233879452878401, "flos": 21735806198400.0, "grad_norm": 2.0077266461899637, "language_loss": 0.77817816, "learning_rate": 3.182781878250118e-07, "loss": 0.80273515, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1817627, "step": 13695, "time_per_iteration": 2.9473884105682373 }, { "auxiliary_loss_clip": 0.0140359, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.24374652, "balance_loss_mlp": 1.01244473, "epoch": 0.823448068540508, "flos": 20567301258240.0, "grad_norm": 4.891700751175468, "language_loss": 0.82089019, "learning_rate": 3.1806742308239985e-07, "loss": 0.84523511, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18457031, "step": 13696, "time_per_iteration": 2.898198366165161 }, { "auxiliary_loss_clip": 0.01179682, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.09335947, "balance_loss_mlp": 1.00190818, "epoch": 0.823508191793176, "flos": 67308512232960.0, "grad_norm": 0.7302280209889646, "language_loss": 0.63897443, "learning_rate": 3.178567221188393e-07, "loss": 0.66102874, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.23828125, "step": 13697, "time_per_iteration": 6.162862777709961 }, { "auxiliary_loss_clip": 0.01393302, "auxiliary_loss_mlp": 0.01029466, "balance_loss_clip": 1.23816562, "balance_loss_mlp": 1.01176369, "epoch": 0.8235683150458439, "flos": 17936660753280.0, "grad_norm": 1.506967598769438, "language_loss": 0.73304987, "learning_rate": 3.1764608494232037e-07, "loss": 0.75727755, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.17700195, "step": 13698, "time_per_iteration": 2.8528225421905518 }, { "auxiliary_loss_clip": 0.01400788, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.23958838, "balance_loss_mlp": 1.01199734, "epoch": 0.823628438298512, "flos": 18925271792640.0, "grad_norm": 2.273147455085372, "language_loss": 0.72456121, "learning_rate": 3.174355115608305e-07, "loss": 0.74887681, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18762207, "step": 13699, "time_per_iteration": 2.8470635414123535 }, { "auxiliary_loss_clip": 0.0139836, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.24066019, "balance_loss_mlp": 1.01605356, "epoch": 0.8236885615511799, "flos": 18705399471360.0, "grad_norm": 1.908514163666442, "language_loss": 0.82600856, "learning_rate": 3.1722500198235526e-07, "loss": 0.85033554, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18273926, "step": 13700, "time_per_iteration": 2.822153329849243 }, { "auxiliary_loss_clip": 0.01411716, "auxiliary_loss_mlp": 0.01034141, "balance_loss_clip": 1.24961019, "balance_loss_mlp": 1.01596141, "epoch": 0.8237486848038479, "flos": 23705246171520.0, "grad_norm": 2.026187317590143, "language_loss": 0.73557115, "learning_rate": 3.170145562148763e-07, "loss": 0.76002967, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18188477, "step": 13701, "time_per_iteration": 2.8757972717285156 }, { "auxiliary_loss_clip": 0.01408122, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.24568987, "balance_loss_mlp": 1.01664042, "epoch": 0.8238088080565159, "flos": 23451910680960.0, "grad_norm": 1.6268885944412874, "language_loss": 0.6972304, "learning_rate": 3.1680417426637384e-07, "loss": 0.72166497, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18676758, "step": 13702, "time_per_iteration": 2.8842036724090576 }, { "auxiliary_loss_clip": 0.01401525, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.24229956, "balance_loss_mlp": 1.01328778, "epoch": 0.8238689313091838, "flos": 22756568307840.0, "grad_norm": 1.7441570479672652, "language_loss": 0.7555939, "learning_rate": 3.1659385614482603e-07, "loss": 0.77993107, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18908691, "step": 13703, "time_per_iteration": 2.9083786010742188 }, { "auxiliary_loss_clip": 0.01435508, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.26770413, "balance_loss_mlp": 1.01753426, "epoch": 0.8239290545618518, "flos": 25641177730560.0, "grad_norm": 2.005217590859395, "language_loss": 0.71516812, "learning_rate": 3.1638360185820755e-07, "loss": 0.73988634, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 1.67773438, "router_z_loss_mlp": 0.18798828, "step": 13704, "time_per_iteration": 2.8713295459747314 }, { "auxiliary_loss_clip": 0.01406379, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 1.24645352, "balance_loss_mlp": 1.01523674, "epoch": 0.8239891778145197, "flos": 26036374227840.0, "grad_norm": 1.6738311814079343, "language_loss": 0.6477505, "learning_rate": 3.161734114144916e-07, "loss": 0.67214882, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18212891, "step": 13705, "time_per_iteration": 2.881086826324463 }, { "auxiliary_loss_clip": 0.01411423, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.24900341, "balance_loss_mlp": 1.01580906, "epoch": 0.8240493010671878, "flos": 21842756202240.0, "grad_norm": 2.306487423343465, "language_loss": 0.70771974, "learning_rate": 3.1596328482164915e-07, "loss": 0.73217851, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18652344, "step": 13706, "time_per_iteration": 2.872955322265625 }, { "auxiliary_loss_clip": 0.01416295, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.25522041, "balance_loss_mlp": 1.01521468, "epoch": 0.8241094243198557, "flos": 18561185735040.0, "grad_norm": 1.7095139559548214, "language_loss": 0.70243692, "learning_rate": 3.157532220876475e-07, "loss": 0.72694033, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18835449, "step": 13707, "time_per_iteration": 2.8490333557128906 }, { "auxiliary_loss_clip": 0.014074, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.24694884, "balance_loss_mlp": 1.01576138, "epoch": 0.8241695475725237, "flos": 25458071448960.0, "grad_norm": 1.977391220442281, "language_loss": 0.79873747, "learning_rate": 3.1554322322045226e-07, "loss": 0.82316351, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19433594, "step": 13708, "time_per_iteration": 2.852030038833618 }, { "auxiliary_loss_clip": 0.01409765, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.24812174, "balance_loss_mlp": 1.01587915, "epoch": 0.8242296708251916, "flos": 18999030096000.0, "grad_norm": 2.205801656874968, "language_loss": 0.6917944, "learning_rate": 3.1533328822802664e-07, "loss": 0.716241, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19018555, "step": 13709, "time_per_iteration": 2.8380861282348633 }, { "auxiliary_loss_clip": 0.0141284, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.25091231, "balance_loss_mlp": 1.01333523, "epoch": 0.8242897940778596, "flos": 22610951982720.0, "grad_norm": 1.9614632224739392, "language_loss": 0.8326726, "learning_rate": 3.151234171183319e-07, "loss": 0.85711771, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18347168, "step": 13710, "time_per_iteration": 2.957716703414917 }, { "auxiliary_loss_clip": 0.01394667, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.23600721, "balance_loss_mlp": 1.01109254, "epoch": 0.8243499173305275, "flos": 21477493779840.0, "grad_norm": 2.0642391910129256, "language_loss": 0.79153091, "learning_rate": 3.149136098993257e-07, "loss": 0.81578183, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19335938, "step": 13711, "time_per_iteration": 2.830284833908081 }, { "auxiliary_loss_clip": 0.01405929, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.24577034, "balance_loss_mlp": 1.01544619, "epoch": 0.8244100405831956, "flos": 20019746960640.0, "grad_norm": 2.199617135723057, "language_loss": 0.66096169, "learning_rate": 3.1470386657896473e-07, "loss": 0.68536079, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1854248, "step": 13712, "time_per_iteration": 2.814237117767334 }, { "auxiliary_loss_clip": 0.01411346, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.25011635, "balance_loss_mlp": 1.01473343, "epoch": 0.8244701638358635, "flos": 26441434091520.0, "grad_norm": 1.7541744669652763, "language_loss": 0.75490308, "learning_rate": 3.14494187165202e-07, "loss": 0.77935433, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19055176, "step": 13713, "time_per_iteration": 2.8558037281036377 }, { "auxiliary_loss_clip": 0.01418171, "auxiliary_loss_mlp": 0.01032224, "balance_loss_clip": 1.25479245, "balance_loss_mlp": 1.01435435, "epoch": 0.8245302870885315, "flos": 17649092931840.0, "grad_norm": 2.0685678440554325, "language_loss": 0.81550086, "learning_rate": 3.1428457166598833e-07, "loss": 0.84000486, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.17858887, "step": 13714, "time_per_iteration": 2.8215112686157227 }, { "auxiliary_loss_clip": 0.01402569, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.24466348, "balance_loss_mlp": 1.01755655, "epoch": 0.8245904103411995, "flos": 26219616243840.0, "grad_norm": 1.762720717520167, "language_loss": 0.66916823, "learning_rate": 3.1407502008927235e-07, "loss": 0.69356269, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1932373, "step": 13715, "time_per_iteration": 2.9206020832061768 }, { "auxiliary_loss_clip": 0.01418591, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.25570917, "balance_loss_mlp": 1.0155139, "epoch": 0.8246505335938674, "flos": 24215265267840.0, "grad_norm": 1.6756661431840223, "language_loss": 0.7576068, "learning_rate": 3.1386553244300086e-07, "loss": 0.78213179, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18395996, "step": 13716, "time_per_iteration": 2.868501663208008 }, { "auxiliary_loss_clip": 0.01184444, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.09770584, "balance_loss_mlp": 1.00528073, "epoch": 0.8247106568465354, "flos": 67127215743360.0, "grad_norm": 0.7165589340588783, "language_loss": 0.59094143, "learning_rate": 3.136561087351175e-07, "loss": 0.61308855, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.25, "step": 13717, "time_per_iteration": 3.4550724029541016 }, { "auxiliary_loss_clip": 0.01410229, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.25120437, "balance_loss_mlp": 1.01468456, "epoch": 0.8247707800992033, "flos": 12575940376320.0, "grad_norm": 8.853193984922592, "language_loss": 0.80690682, "learning_rate": 3.1344674897356373e-07, "loss": 0.83133405, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.17810059, "step": 13718, "time_per_iteration": 2.8253233432769775 }, { "auxiliary_loss_clip": 0.0139575, "auxiliary_loss_mlp": 0.01037755, "balance_loss_clip": 1.23833227, "balance_loss_mlp": 1.01957536, "epoch": 0.8248309033518714, "flos": 15931857329280.0, "grad_norm": 1.6259874705982873, "language_loss": 0.69496262, "learning_rate": 3.132374531662778e-07, "loss": 0.71929765, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.1817627, "step": 13719, "time_per_iteration": 2.822535276412964 }, { "auxiliary_loss_clip": 0.01406593, "auxiliary_loss_mlp": 0.01039271, "balance_loss_clip": 1.24706852, "balance_loss_mlp": 1.02012634, "epoch": 0.8248910266045393, "flos": 17573208122880.0, "grad_norm": 2.8811594534899254, "language_loss": 0.70929086, "learning_rate": 3.13028221321197e-07, "loss": 0.73374951, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19140625, "step": 13720, "time_per_iteration": 2.8427984714508057 }, { "auxiliary_loss_clip": 0.01422769, "auxiliary_loss_mlp": 0.01037122, "balance_loss_clip": 1.26087046, "balance_loss_mlp": 1.01801312, "epoch": 0.8249511498572073, "flos": 28630927365120.0, "grad_norm": 1.5901112056385813, "language_loss": 0.76075691, "learning_rate": 3.1281905344625467e-07, "loss": 0.78535581, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19104004, "step": 13721, "time_per_iteration": 4.453307867050171 }, { "auxiliary_loss_clip": 0.01407087, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.24813998, "balance_loss_mlp": 1.01254022, "epoch": 0.8250112731098752, "flos": 25567509916800.0, "grad_norm": 10.6541046040222, "language_loss": 0.78538036, "learning_rate": 3.1260994954938305e-07, "loss": 0.80976605, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18945312, "step": 13722, "time_per_iteration": 2.8728177547454834 }, { "auxiliary_loss_clip": 0.01404226, "auxiliary_loss_mlp": 0.01036415, "balance_loss_clip": 1.24560559, "balance_loss_mlp": 1.01790166, "epoch": 0.8250713963625432, "flos": 27757772352000.0, "grad_norm": 1.607029304056864, "language_loss": 0.63471437, "learning_rate": 3.1240090963851205e-07, "loss": 0.6591208, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18530273, "step": 13723, "time_per_iteration": 2.8797552585601807 }, { "auxiliary_loss_clip": 0.01407451, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.24724233, "balance_loss_mlp": 1.01518714, "epoch": 0.8251315196152111, "flos": 21619083317760.0, "grad_norm": 1.8058379640118993, "language_loss": 0.7544533, "learning_rate": 3.121919337215666e-07, "loss": 0.77886862, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18908691, "step": 13724, "time_per_iteration": 2.8725147247314453 }, { "auxiliary_loss_clip": 0.0140938, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.2487911, "balance_loss_mlp": 1.01510406, "epoch": 0.8251916428678792, "flos": 28589636845440.0, "grad_norm": 1.7843508208277452, "language_loss": 0.64586318, "learning_rate": 3.1198302180647253e-07, "loss": 0.6702981, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18994141, "step": 13725, "time_per_iteration": 2.8608803749084473 }, { "auxiliary_loss_clip": 0.01394327, "auxiliary_loss_mlp": 0.01030039, "balance_loss_clip": 1.2355535, "balance_loss_mlp": 1.01196718, "epoch": 0.8252517661205471, "flos": 23085381404160.0, "grad_norm": 1.6374268147076583, "language_loss": 0.82614505, "learning_rate": 3.1177417390115125e-07, "loss": 0.85038877, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18078613, "step": 13726, "time_per_iteration": 2.832667112350464 }, { "auxiliary_loss_clip": 0.01389294, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.23469532, "balance_loss_mlp": 1.01511455, "epoch": 0.8253118893732151, "flos": 31772220393600.0, "grad_norm": 2.527850209555812, "language_loss": 0.7128911, "learning_rate": 3.1156539001352286e-07, "loss": 0.73711377, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.17858887, "step": 13727, "time_per_iteration": 4.378351926803589 }, { "auxiliary_loss_clip": 0.01420961, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.25768042, "balance_loss_mlp": 1.01698506, "epoch": 0.8253720126258831, "flos": 18305588004480.0, "grad_norm": 2.6586294009771736, "language_loss": 0.64198291, "learning_rate": 3.113566701515036e-07, "loss": 0.66654539, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18310547, "step": 13728, "time_per_iteration": 2.8443081378936768 }, { "auxiliary_loss_clip": 0.01432568, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.26677144, "balance_loss_mlp": 1.01881886, "epoch": 0.825432135878551, "flos": 26808642040320.0, "grad_norm": 1.6376045926463716, "language_loss": 0.72119468, "learning_rate": 3.111480143230092e-07, "loss": 0.74589384, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.18518066, "step": 13729, "time_per_iteration": 2.8461825847625732 }, { "auxiliary_loss_clip": 0.01183886, "auxiliary_loss_mlp": 0.0102706, "balance_loss_clip": 1.09555674, "balance_loss_mlp": 1.00522089, "epoch": 0.825492259131219, "flos": 54242985427200.0, "grad_norm": 0.848553240234136, "language_loss": 0.6271975, "learning_rate": 3.109394225359514e-07, "loss": 0.64930701, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21875, "step": 13730, "time_per_iteration": 3.1700479984283447 }, { "auxiliary_loss_clip": 0.01405783, "auxiliary_loss_mlp": 0.01035656, "balance_loss_clip": 1.24627304, "balance_loss_mlp": 1.01709485, "epoch": 0.825552382383887, "flos": 43770020987520.0, "grad_norm": 2.004350647662021, "language_loss": 0.64086956, "learning_rate": 3.1073089479823945e-07, "loss": 0.66528392, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18554688, "step": 13731, "time_per_iteration": 3.0116758346557617 }, { "auxiliary_loss_clip": 0.01429, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.26259851, "balance_loss_mlp": 1.01554847, "epoch": 0.825612505636555, "flos": 12610036972800.0, "grad_norm": 2.1265347450304963, "language_loss": 0.69903213, "learning_rate": 3.105224311177812e-07, "loss": 0.72366595, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.18835449, "step": 13732, "time_per_iteration": 5.640879154205322 }, { "auxiliary_loss_clip": 0.01424737, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.26018846, "balance_loss_mlp": 1.01487708, "epoch": 0.8256726288892229, "flos": 17602644504960.0, "grad_norm": 5.171241984067599, "language_loss": 0.72795928, "learning_rate": 3.103140315024817e-07, "loss": 0.75254595, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19055176, "step": 13733, "time_per_iteration": 2.8274881839752197 }, { "auxiliary_loss_clip": 0.0139783, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.23984754, "balance_loss_mlp": 1.0134778, "epoch": 0.8257327521418909, "flos": 23816403941760.0, "grad_norm": 1.487371191873629, "language_loss": 0.83078879, "learning_rate": 3.1010569596024437e-07, "loss": 0.85509032, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18859863, "step": 13734, "time_per_iteration": 2.8727948665618896 }, { "auxiliary_loss_clip": 0.01400769, "auxiliary_loss_mlp": 0.01031951, "balance_loss_clip": 1.2424202, "balance_loss_mlp": 1.01267505, "epoch": 0.8257928753945588, "flos": 19290443725440.0, "grad_norm": 2.1979895668679825, "language_loss": 0.83594298, "learning_rate": 3.098974244989676e-07, "loss": 0.8602702, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19262695, "step": 13735, "time_per_iteration": 2.815056800842285 }, { "auxiliary_loss_clip": 0.01401586, "auxiliary_loss_mlp": 0.01035, "balance_loss_clip": 1.24135518, "balance_loss_mlp": 1.01661766, "epoch": 0.8258529986472268, "flos": 18488060858880.0, "grad_norm": 2.1174813257934297, "language_loss": 0.71546495, "learning_rate": 3.096892171265497e-07, "loss": 0.73983085, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18371582, "step": 13736, "time_per_iteration": 2.8014867305755615 }, { "auxiliary_loss_clip": 0.01184382, "auxiliary_loss_mlp": 0.01023942, "balance_loss_clip": 1.09723377, "balance_loss_mlp": 1.003438, "epoch": 0.8259131218998947, "flos": 62164252834560.0, "grad_norm": 1.5117590762938031, "language_loss": 0.68055987, "learning_rate": 3.0948107385088665e-07, "loss": 0.70264304, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.20507812, "step": 13737, "time_per_iteration": 3.361929416656494 }, { "auxiliary_loss_clip": 0.01414392, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.2537328, "balance_loss_mlp": 1.01609325, "epoch": 0.8259732451525628, "flos": 22167949714560.0, "grad_norm": 1.808236915294887, "language_loss": 0.70490253, "learning_rate": 3.0927299467987e-07, "loss": 0.72938824, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18078613, "step": 13738, "time_per_iteration": 2.9188954830169678 }, { "auxiliary_loss_clip": 0.0141481, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.25250483, "balance_loss_mlp": 1.01312065, "epoch": 0.8260333684052307, "flos": 38375520727680.0, "grad_norm": 1.9203237173667616, "language_loss": 0.64009142, "learning_rate": 3.090649796213911e-07, "loss": 0.66458035, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20959473, "step": 13739, "time_per_iteration": 3.0387895107269287 }, { "auxiliary_loss_clip": 0.01184607, "auxiliary_loss_mlp": 0.01025515, "balance_loss_clip": 1.0962646, "balance_loss_mlp": 1.0036757, "epoch": 0.8260934916578987, "flos": 62214818538240.0, "grad_norm": 0.8421983917953507, "language_loss": 0.59414899, "learning_rate": 3.0885702868333853e-07, "loss": 0.61625016, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21875, "step": 13740, "time_per_iteration": 3.336470365524292 }, { "auxiliary_loss_clip": 0.01421263, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.25591254, "balance_loss_mlp": 1.01296139, "epoch": 0.8261536149105667, "flos": 22575814755840.0, "grad_norm": 1.9579697471801056, "language_loss": 0.76098162, "learning_rate": 3.086491418735959e-07, "loss": 0.785519, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19506836, "step": 13741, "time_per_iteration": 2.8691625595092773 }, { "auxiliary_loss_clip": 0.01397803, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.23838413, "balance_loss_mlp": 1.01293242, "epoch": 0.8262137381632346, "flos": 32538244423680.0, "grad_norm": 1.9757907431425563, "language_loss": 0.63234794, "learning_rate": 3.0844131920004726e-07, "loss": 0.65664101, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18591309, "step": 13742, "time_per_iteration": 2.899017095565796 }, { "auxiliary_loss_clip": 0.01432013, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.26349616, "balance_loss_mlp": 1.01511717, "epoch": 0.8262738614159026, "flos": 14144392517760.0, "grad_norm": 2.38240794557015, "language_loss": 0.66505694, "learning_rate": 3.0823356067057327e-07, "loss": 0.68973362, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 1.68652344, "router_z_loss_mlp": 0.20544434, "step": 13743, "time_per_iteration": 2.8157413005828857 }, { "auxiliary_loss_clip": 0.01416534, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.25632882, "balance_loss_mlp": 1.01814961, "epoch": 0.8263339846685706, "flos": 19834016480640.0, "grad_norm": 1.6686668017188593, "language_loss": 0.66942811, "learning_rate": 3.0802586629305283e-07, "loss": 0.69396907, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1940918, "step": 13744, "time_per_iteration": 2.8694705963134766 }, { "auxiliary_loss_clip": 0.01405575, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.24598157, "balance_loss_mlp": 1.01461864, "epoch": 0.8263941079212386, "flos": 22755844391040.0, "grad_norm": 1.9069398187531013, "language_loss": 0.7604087, "learning_rate": 3.078182360753612e-07, "loss": 0.78478813, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.17773438, "step": 13745, "time_per_iteration": 2.843214988708496 }, { "auxiliary_loss_clip": 0.01380555, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.22637522, "balance_loss_mlp": 1.01275074, "epoch": 0.8264542311739065, "flos": 20130271303680.0, "grad_norm": 1.784709928131952, "language_loss": 0.80036473, "learning_rate": 3.076106700253709e-07, "loss": 0.8244803, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18249512, "step": 13746, "time_per_iteration": 2.8357508182525635 }, { "auxiliary_loss_clip": 0.01429658, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.26442313, "balance_loss_mlp": 1.01436925, "epoch": 0.8265143544265745, "flos": 16845986148480.0, "grad_norm": 1.9569171501791687, "language_loss": 0.69092274, "learning_rate": 3.0740316815095415e-07, "loss": 0.71555543, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19238281, "step": 13747, "time_per_iteration": 2.8320279121398926 }, { "auxiliary_loss_clip": 0.01400327, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.23976874, "balance_loss_mlp": 1.01347184, "epoch": 0.8265744776792424, "flos": 22028893885440.0, "grad_norm": 2.5548706761618774, "language_loss": 0.76131427, "learning_rate": 3.0719573045997835e-07, "loss": 0.78563738, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18518066, "step": 13748, "time_per_iteration": 2.884262800216675 }, { "auxiliary_loss_clip": 0.0139296, "auxiliary_loss_mlp": 0.01036565, "balance_loss_clip": 1.23738074, "balance_loss_mlp": 1.01925552, "epoch": 0.8266346009319104, "flos": 19254220623360.0, "grad_norm": 1.6629917769944962, "language_loss": 0.63709313, "learning_rate": 3.069883569603102e-07, "loss": 0.66138834, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.17333984, "step": 13749, "time_per_iteration": 2.8175060749053955 }, { "auxiliary_loss_clip": 0.01399633, "auxiliary_loss_mlp": 0.01031164, "balance_loss_clip": 1.23947668, "balance_loss_mlp": 1.01251972, "epoch": 0.8266947241845783, "flos": 24176463212160.0, "grad_norm": 1.652893679768725, "language_loss": 0.74252528, "learning_rate": 3.067810476598132e-07, "loss": 0.76683331, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18640137, "step": 13750, "time_per_iteration": 2.9163479804992676 }, { "auxiliary_loss_clip": 0.01422627, "auxiliary_loss_mlp": 0.01037175, "balance_loss_clip": 1.26032436, "balance_loss_mlp": 1.01760137, "epoch": 0.8267548474372464, "flos": 21115715207040.0, "grad_norm": 1.8012734655977065, "language_loss": 0.66227949, "learning_rate": 3.065738025663496e-07, "loss": 0.68687749, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19567871, "step": 13751, "time_per_iteration": 2.8824455738067627 }, { "auxiliary_loss_clip": 0.01397305, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.23848987, "balance_loss_mlp": 1.01509237, "epoch": 0.8268149706899143, "flos": 39982910659200.0, "grad_norm": 1.4780643242392275, "language_loss": 0.61412036, "learning_rate": 3.0636662168777607e-07, "loss": 0.63842094, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.17651367, "step": 13752, "time_per_iteration": 3.0032575130462646 }, { "auxiliary_loss_clip": 0.01183814, "auxiliary_loss_mlp": 0.01019234, "balance_loss_clip": 1.0966686, "balance_loss_mlp": 1.00073326, "epoch": 0.8268750939425823, "flos": 65808597260160.0, "grad_norm": 1.0084175287825399, "language_loss": 0.57485861, "learning_rate": 3.0615950503194986e-07, "loss": 0.59688914, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.18457031, "step": 13753, "time_per_iteration": 3.4065821170806885 }, { "auxiliary_loss_clip": 0.01184998, "auxiliary_loss_mlp": 0.01019948, "balance_loss_clip": 1.09876657, "balance_loss_mlp": 0.99801332, "epoch": 0.8269352171952503, "flos": 53005744356480.0, "grad_norm": 0.7030518731018391, "language_loss": 0.54940444, "learning_rate": 3.0595245260672563e-07, "loss": 0.57145393, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.21972656, "step": 13754, "time_per_iteration": 3.374541997909546 }, { "auxiliary_loss_clip": 0.01401427, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.24297345, "balance_loss_mlp": 1.01296091, "epoch": 0.8269953404479182, "flos": 23086557768960.0, "grad_norm": 2.313020444720935, "language_loss": 0.70086014, "learning_rate": 3.0574546441995354e-07, "loss": 0.7251789, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.17480469, "step": 13755, "time_per_iteration": 2.8698792457580566 }, { "auxiliary_loss_clip": 0.01400072, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.24168301, "balance_loss_mlp": 1.01910353, "epoch": 0.8270554637005862, "flos": 14218105576320.0, "grad_norm": 2.0617768681647894, "language_loss": 0.70856071, "learning_rate": 3.0553854047948324e-07, "loss": 0.73292637, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.17382812, "step": 13756, "time_per_iteration": 2.8703560829162598 }, { "auxiliary_loss_clip": 0.01404792, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.24608791, "balance_loss_mlp": 1.0137651, "epoch": 0.8271155869532542, "flos": 21771984055680.0, "grad_norm": 2.028972456462824, "language_loss": 0.73579741, "learning_rate": 3.053316807931623e-07, "loss": 0.76016378, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18078613, "step": 13757, "time_per_iteration": 4.338456630706787 }, { "auxiliary_loss_clip": 0.01424635, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.26043785, "balance_loss_mlp": 1.01479912, "epoch": 0.8271757102059222, "flos": 15128388587520.0, "grad_norm": 2.6137350218753745, "language_loss": 0.69496179, "learning_rate": 3.0512488536883283e-07, "loss": 0.71956527, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20922852, "step": 13758, "time_per_iteration": 2.8267078399658203 }, { "auxiliary_loss_clip": 0.01393105, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.23657393, "balance_loss_mlp": 1.01167202, "epoch": 0.8272358334585901, "flos": 24144312142080.0, "grad_norm": 1.53940253404656, "language_loss": 0.69925618, "learning_rate": 3.0491815421433775e-07, "loss": 0.7234866, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18273926, "step": 13759, "time_per_iteration": 2.898010015487671 }, { "auxiliary_loss_clip": 0.01395213, "auxiliary_loss_mlp": 0.01032926, "balance_loss_clip": 1.23701644, "balance_loss_mlp": 1.01426995, "epoch": 0.8272959567112581, "flos": 19000342195200.0, "grad_norm": 1.8999404795689314, "language_loss": 0.71319187, "learning_rate": 3.047114873375161e-07, "loss": 0.73747325, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18664551, "step": 13760, "time_per_iteration": 2.887220621109009 }, { "auxiliary_loss_clip": 0.01398198, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.24182701, "balance_loss_mlp": 1.01115918, "epoch": 0.827356079963926, "flos": 20641421520000.0, "grad_norm": 1.9251799038220658, "language_loss": 0.77895284, "learning_rate": 3.0450488474620505e-07, "loss": 0.80322313, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17687988, "step": 13761, "time_per_iteration": 4.296947717666626 }, { "auxiliary_loss_clip": 0.01399116, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.24273634, "balance_loss_mlp": 1.01568246, "epoch": 0.827416203216594, "flos": 22426171643520.0, "grad_norm": 1.646053291900062, "language_loss": 0.71341634, "learning_rate": 3.042983464482387e-07, "loss": 0.73774076, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17651367, "step": 13762, "time_per_iteration": 2.8422939777374268 }, { "auxiliary_loss_clip": 0.01393132, "auxiliary_loss_mlp": 0.01028754, "balance_loss_clip": 1.2357074, "balance_loss_mlp": 1.01120651, "epoch": 0.827476326469262, "flos": 19035569911680.0, "grad_norm": 1.9245112795267285, "language_loss": 0.71041214, "learning_rate": 3.0409187245144853e-07, "loss": 0.734631, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17541504, "step": 13763, "time_per_iteration": 2.8434808254241943 }, { "auxiliary_loss_clip": 0.01186579, "auxiliary_loss_mlp": 0.01020491, "balance_loss_clip": 1.09872591, "balance_loss_mlp": 0.99769861, "epoch": 0.82753644972193, "flos": 68532179863680.0, "grad_norm": 0.8411010192207616, "language_loss": 0.65161258, "learning_rate": 3.038854627636651e-07, "loss": 0.67368329, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.22753906, "step": 13764, "time_per_iteration": 3.435088634490967 }, { "auxiliary_loss_clip": 0.01407477, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.24837232, "balance_loss_mlp": 1.01280892, "epoch": 0.8275965729745979, "flos": 18414483534720.0, "grad_norm": 1.9768870839153099, "language_loss": 0.78711909, "learning_rate": 3.0367911739271423e-07, "loss": 0.81151038, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18847656, "step": 13765, "time_per_iteration": 2.8451600074768066 }, { "auxiliary_loss_clip": 0.01414489, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.25092626, "balance_loss_mlp": 1.01636314, "epoch": 0.8276566962272659, "flos": 28523796382080.0, "grad_norm": 1.5531220176619522, "language_loss": 0.62977827, "learning_rate": 3.034728363464214e-07, "loss": 0.65427274, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18603516, "step": 13766, "time_per_iteration": 2.9132919311523438 }, { "auxiliary_loss_clip": 0.01406527, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.24676228, "balance_loss_mlp": 1.01491249, "epoch": 0.8277168194799339, "flos": 20239935995520.0, "grad_norm": 2.690903767771868, "language_loss": 0.83442956, "learning_rate": 3.03266619632609e-07, "loss": 0.8588348, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1907959, "step": 13767, "time_per_iteration": 5.788644075393677 }, { "auxiliary_loss_clip": 0.01418465, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.2567004, "balance_loss_mlp": 1.01695538, "epoch": 0.8277769427326018, "flos": 28488613910400.0, "grad_norm": 1.6231991649956503, "language_loss": 0.69332874, "learning_rate": 3.030604672590964e-07, "loss": 0.71786845, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.1854248, "step": 13768, "time_per_iteration": 2.932056188583374 }, { "auxiliary_loss_clip": 0.0138929, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.23221564, "balance_loss_mlp": 1.01119816, "epoch": 0.8278370659852698, "flos": 27208770220800.0, "grad_norm": 3.0577814055711485, "language_loss": 0.75034124, "learning_rate": 3.028543792337006e-07, "loss": 0.77453005, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18395996, "step": 13769, "time_per_iteration": 2.926689386367798 }, { "auxiliary_loss_clip": 0.01403393, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.24299479, "balance_loss_mlp": 1.01335931, "epoch": 0.8278971892379378, "flos": 37830319159680.0, "grad_norm": 1.7583898899526764, "language_loss": 0.74998093, "learning_rate": 3.0264835556423675e-07, "loss": 0.77433598, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1875, "step": 13770, "time_per_iteration": 3.0069572925567627 }, { "auxiliary_loss_clip": 0.01406937, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.2455107, "balance_loss_mlp": 1.01177561, "epoch": 0.8279573124906058, "flos": 22569118525440.0, "grad_norm": 1.6382520926663422, "language_loss": 0.76394677, "learning_rate": 3.0244239625851785e-07, "loss": 0.78832066, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18688965, "step": 13771, "time_per_iteration": 2.866610527038574 }, { "auxiliary_loss_clip": 0.01404195, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.2438271, "balance_loss_mlp": 1.01375866, "epoch": 0.8280174357432737, "flos": 36078308288640.0, "grad_norm": 2.6325633123140726, "language_loss": 0.73431253, "learning_rate": 3.0223650132435284e-07, "loss": 0.75867218, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18029785, "step": 13772, "time_per_iteration": 2.960813283920288 }, { "auxiliary_loss_clip": 0.01392083, "auxiliary_loss_mlp": 0.01039428, "balance_loss_clip": 1.23453987, "balance_loss_mlp": 1.02060461, "epoch": 0.8280775589959417, "flos": 22970015867520.0, "grad_norm": 2.257871008968038, "language_loss": 0.75212884, "learning_rate": 3.0203067076955035e-07, "loss": 0.77644396, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18835449, "step": 13773, "time_per_iteration": 2.851172685623169 }, { "auxiliary_loss_clip": 0.01393666, "auxiliary_loss_mlp": 0.01031644, "balance_loss_clip": 1.23665917, "balance_loss_mlp": 1.01315498, "epoch": 0.8281376822486096, "flos": 26073276001920.0, "grad_norm": 2.031947889309481, "language_loss": 0.76354223, "learning_rate": 3.01824904601915e-07, "loss": 0.78779531, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18481445, "step": 13774, "time_per_iteration": 2.8817636966705322 }, { "auxiliary_loss_clip": 0.01425315, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.25986528, "balance_loss_mlp": 1.02037084, "epoch": 0.8281978055012776, "flos": 20677508887680.0, "grad_norm": 1.726487148782234, "language_loss": 0.75166571, "learning_rate": 3.01619202829249e-07, "loss": 0.77632034, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19763184, "step": 13775, "time_per_iteration": 2.852985382080078 }, { "auxiliary_loss_clip": 0.0142076, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.25567436, "balance_loss_mlp": 1.01245975, "epoch": 0.8282579287539455, "flos": 29327355613440.0, "grad_norm": 2.041481646389865, "language_loss": 0.73855948, "learning_rate": 3.01413565459353e-07, "loss": 0.76308012, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18823242, "step": 13776, "time_per_iteration": 2.9063913822174072 }, { "auxiliary_loss_clip": 0.01407323, "auxiliary_loss_mlp": 0.01033847, "balance_loss_clip": 1.24539042, "balance_loss_mlp": 1.01420093, "epoch": 0.8283180520066136, "flos": 15714744940800.0, "grad_norm": 2.023957322666098, "language_loss": 0.77834904, "learning_rate": 3.0120799250002483e-07, "loss": 0.80276078, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19641113, "step": 13777, "time_per_iteration": 2.799372673034668 }, { "auxiliary_loss_clip": 0.01407927, "auxiliary_loss_mlp": 0.01034612, "balance_loss_clip": 1.24979091, "balance_loss_mlp": 1.0169456, "epoch": 0.8283781752592815, "flos": 24802481272320.0, "grad_norm": 1.4531827329469804, "language_loss": 0.82534993, "learning_rate": 3.010024839590604e-07, "loss": 0.84977531, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.17663574, "step": 13778, "time_per_iteration": 2.8737404346466064 }, { "auxiliary_loss_clip": 0.01385053, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.2297852, "balance_loss_mlp": 1.01123571, "epoch": 0.8284382985119495, "flos": 18990388339200.0, "grad_norm": 3.358588190796583, "language_loss": 0.748963, "learning_rate": 3.0079703984425187e-07, "loss": 0.77310896, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.18310547, "step": 13779, "time_per_iteration": 2.8725521564483643 }, { "auxiliary_loss_clip": 0.01185545, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.09712458, "balance_loss_mlp": 1.00422049, "epoch": 0.8284984217646175, "flos": 61067623898880.0, "grad_norm": 0.7986145217143393, "language_loss": 0.56741965, "learning_rate": 3.0059166016338954e-07, "loss": 0.58954608, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22851562, "step": 13780, "time_per_iteration": 3.4343576431274414 }, { "auxiliary_loss_clip": 0.01394534, "auxiliary_loss_mlp": 0.01029032, "balance_loss_clip": 1.23563695, "balance_loss_mlp": 1.01003027, "epoch": 0.8285585450172854, "flos": 19722949200000.0, "grad_norm": 1.7458301962813136, "language_loss": 0.80901825, "learning_rate": 3.0038634492426205e-07, "loss": 0.83325398, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18994141, "step": 13781, "time_per_iteration": 2.842149496078491 }, { "auxiliary_loss_clip": 0.01396853, "auxiliary_loss_mlp": 0.01036566, "balance_loss_clip": 1.2374444, "balance_loss_mlp": 1.01814842, "epoch": 0.8286186682699535, "flos": 21698768689920.0, "grad_norm": 5.99504751780044, "language_loss": 0.76583409, "learning_rate": 3.001810941346543e-07, "loss": 0.79016829, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18408203, "step": 13782, "time_per_iteration": 2.8272643089294434 }, { "auxiliary_loss_clip": 0.01411777, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.24981093, "balance_loss_mlp": 1.01518762, "epoch": 0.8286787915226214, "flos": 25786567831680.0, "grad_norm": 1.5634398522893869, "language_loss": 0.76641321, "learning_rate": 2.9997590780234983e-07, "loss": 0.79087698, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1940918, "step": 13783, "time_per_iteration": 2.911400079727173 }, { "auxiliary_loss_clip": 0.01416907, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.25528622, "balance_loss_mlp": 1.01416206, "epoch": 0.8287389147752894, "flos": 21298323795840.0, "grad_norm": 1.9597940307982569, "language_loss": 0.74358737, "learning_rate": 2.997707859351304e-07, "loss": 0.7680887, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19055176, "step": 13784, "time_per_iteration": 2.8289148807525635 }, { "auxiliary_loss_clip": 0.01414669, "auxiliary_loss_mlp": 0.01035643, "balance_loss_clip": 1.25117874, "balance_loss_mlp": 1.01630747, "epoch": 0.8287990380279573, "flos": 33557332475520.0, "grad_norm": 1.4692543817935058, "language_loss": 0.70422202, "learning_rate": 2.99565728540772e-07, "loss": 0.72872519, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19335938, "step": 13785, "time_per_iteration": 2.9389724731445312 }, { "auxiliary_loss_clip": 0.01405161, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.24431038, "balance_loss_mlp": 1.01434493, "epoch": 0.8288591612806253, "flos": 22976666853120.0, "grad_norm": 1.4488878342717473, "language_loss": 0.69297773, "learning_rate": 2.993607356270516e-07, "loss": 0.71736509, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19238281, "step": 13786, "time_per_iteration": 2.867943286895752 }, { "auxiliary_loss_clip": 0.01430088, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.26393116, "balance_loss_mlp": 1.01873147, "epoch": 0.8289192845332932, "flos": 18598404222720.0, "grad_norm": 1.9811070332490766, "language_loss": 0.7779994, "learning_rate": 2.991558072017426e-07, "loss": 0.80267662, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.18908691, "step": 13787, "time_per_iteration": 2.8462703227996826 }, { "auxiliary_loss_clip": 0.01400159, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.24199784, "balance_loss_mlp": 1.0148958, "epoch": 0.8289794077859612, "flos": 15458830496640.0, "grad_norm": 2.689517287414441, "language_loss": 0.80811387, "learning_rate": 2.989509432726163e-07, "loss": 0.83245379, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18933105, "step": 13788, "time_per_iteration": 2.8292412757873535 }, { "auxiliary_loss_clip": 0.01398528, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.239923, "balance_loss_mlp": 1.01593149, "epoch": 0.8290395310386292, "flos": 28889918455680.0, "grad_norm": 1.4929914655813683, "language_loss": 0.71942478, "learning_rate": 2.9874614384744014e-07, "loss": 0.74375319, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18383789, "step": 13789, "time_per_iteration": 2.8737614154815674 }, { "auxiliary_loss_clip": 0.01403328, "auxiliary_loss_mlp": 0.01032042, "balance_loss_clip": 1.24151278, "balance_loss_mlp": 1.01267004, "epoch": 0.8290996542912972, "flos": 36590318156160.0, "grad_norm": 1.8192005061594356, "language_loss": 0.68488634, "learning_rate": 2.985414089339813e-07, "loss": 0.70923996, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19372559, "step": 13790, "time_per_iteration": 2.9968698024749756 }, { "auxiliary_loss_clip": 0.01413354, "auxiliary_loss_mlp": 0.01037914, "balance_loss_clip": 1.25065303, "balance_loss_mlp": 1.01750541, "epoch": 0.8291597775439651, "flos": 23633478639360.0, "grad_norm": 1.5845937965421082, "language_loss": 0.77940857, "learning_rate": 2.9833673854000265e-07, "loss": 0.80392122, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20397949, "step": 13791, "time_per_iteration": 2.896021842956543 }, { "auxiliary_loss_clip": 0.01403966, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.24692225, "balance_loss_mlp": 1.0136801, "epoch": 0.8292199007966331, "flos": 21407400305280.0, "grad_norm": 4.68108229797551, "language_loss": 0.7064569, "learning_rate": 2.981321326732651e-07, "loss": 0.73080927, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.17602539, "step": 13792, "time_per_iteration": 4.34114408493042 }, { "auxiliary_loss_clip": 0.01403476, "auxiliary_loss_mlp": 0.0103045, "balance_loss_clip": 1.24260378, "balance_loss_mlp": 1.01141191, "epoch": 0.829280024049301, "flos": 28779439357440.0, "grad_norm": 1.7909958016959286, "language_loss": 0.65817082, "learning_rate": 2.9792759134152736e-07, "loss": 0.68251002, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19042969, "step": 13793, "time_per_iteration": 2.9298954010009766 }, { "auxiliary_loss_clip": 0.01404548, "auxiliary_loss_mlp": 0.01031597, "balance_loss_clip": 1.24254477, "balance_loss_mlp": 1.01257157, "epoch": 0.829340147301969, "flos": 19947617470080.0, "grad_norm": 2.2841251902486563, "language_loss": 0.66851056, "learning_rate": 2.977231145525461e-07, "loss": 0.69287205, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19030762, "step": 13794, "time_per_iteration": 2.8325254917144775 }, { "auxiliary_loss_clip": 0.01409985, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.24799776, "balance_loss_mlp": 1.0160408, "epoch": 0.829400270554637, "flos": 25239511226880.0, "grad_norm": 1.8604143880332655, "language_loss": 0.66532934, "learning_rate": 2.975187023140757e-07, "loss": 0.68977886, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18933105, "step": 13795, "time_per_iteration": 2.885525703430176 }, { "auxiliary_loss_clip": 0.01387172, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 1.23237062, "balance_loss_mlp": 1.01362419, "epoch": 0.829460393807305, "flos": 24474618316800.0, "grad_norm": 1.837352494805835, "language_loss": 0.67568767, "learning_rate": 2.973143546338661e-07, "loss": 0.69987386, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.17822266, "step": 13796, "time_per_iteration": 4.278230428695679 }, { "auxiliary_loss_clip": 0.0139088, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.23388004, "balance_loss_mlp": 1.01352751, "epoch": 0.829520517059973, "flos": 15130922296320.0, "grad_norm": 1.5742848315900135, "language_loss": 0.71950781, "learning_rate": 2.971100715196666e-07, "loss": 0.74373847, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18664551, "step": 13797, "time_per_iteration": 2.7943508625030518 }, { "auxiliary_loss_clip": 0.01414937, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.25459433, "balance_loss_mlp": 1.0121069, "epoch": 0.8295806403126409, "flos": 21589963649280.0, "grad_norm": 2.144577036198014, "language_loss": 0.73782277, "learning_rate": 2.969058529792243e-07, "loss": 0.7622745, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18127441, "step": 13798, "time_per_iteration": 2.90126371383667 }, { "auxiliary_loss_clip": 0.01390248, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.23448038, "balance_loss_mlp": 1.01290131, "epoch": 0.8296407635653089, "flos": 21736756339200.0, "grad_norm": 1.6750068483197647, "language_loss": 0.77115399, "learning_rate": 2.967016990202822e-07, "loss": 0.79536778, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18237305, "step": 13799, "time_per_iteration": 2.871572256088257 }, { "auxiliary_loss_clip": 0.01404864, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.2465831, "balance_loss_mlp": 1.01352596, "epoch": 0.8297008868179768, "flos": 11188332276480.0, "grad_norm": 1.9647387688228162, "language_loss": 0.67741919, "learning_rate": 2.9649760965058245e-07, "loss": 0.70179206, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18908691, "step": 13800, "time_per_iteration": 2.9485418796539307 }, { "auxiliary_loss_clip": 0.0141793, "auxiliary_loss_mlp": 0.01037387, "balance_loss_clip": 1.25439525, "balance_loss_mlp": 1.01735997, "epoch": 0.8297610100706448, "flos": 20672803428480.0, "grad_norm": 1.7255768260841884, "language_loss": 0.74872392, "learning_rate": 2.9629358487786515e-07, "loss": 0.7732771, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20043945, "step": 13801, "time_per_iteration": 2.8644356727600098 }, { "auxiliary_loss_clip": 0.01411458, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.24915183, "balance_loss_mlp": 1.01486802, "epoch": 0.8298211333233128, "flos": 20386095258240.0, "grad_norm": 2.081547259914401, "language_loss": 0.73949367, "learning_rate": 2.9608962470986476e-07, "loss": 0.76393861, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1817627, "step": 13802, "time_per_iteration": 5.637048721313477 }, { "auxiliary_loss_clip": 0.01405094, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.24443662, "balance_loss_mlp": 1.01638603, "epoch": 0.8298812565759808, "flos": 21519101013120.0, "grad_norm": 2.5048370222987604, "language_loss": 0.75396919, "learning_rate": 2.9588572915431644e-07, "loss": 0.77837163, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18762207, "step": 13803, "time_per_iteration": 2.857107162475586 }, { "auxiliary_loss_clip": 0.01402334, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.24280059, "balance_loss_mlp": 1.01563549, "epoch": 0.8299413798286487, "flos": 22828516819200.0, "grad_norm": 1.7502463301415667, "language_loss": 0.79656792, "learning_rate": 2.9568189821895215e-07, "loss": 0.82093596, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18823242, "step": 13804, "time_per_iteration": 2.820383310317993 }, { "auxiliary_loss_clip": 0.01401974, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.24303126, "balance_loss_mlp": 1.01113129, "epoch": 0.8300015030813167, "flos": 29691124957440.0, "grad_norm": 1.4957993418318138, "language_loss": 0.73753601, "learning_rate": 2.954781319115016e-07, "loss": 0.76184869, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18164062, "step": 13805, "time_per_iteration": 2.88493013381958 }, { "auxiliary_loss_clip": 0.01404881, "auxiliary_loss_mlp": 0.01030593, "balance_loss_clip": 1.24175, "balance_loss_mlp": 1.01228237, "epoch": 0.8300616263339846, "flos": 19729057248000.0, "grad_norm": 2.4477701978023902, "language_loss": 0.77657557, "learning_rate": 2.952744302396906e-07, "loss": 0.80093026, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18322754, "step": 13806, "time_per_iteration": 2.9343559741973877 }, { "auxiliary_loss_clip": 0.01414687, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.25068212, "balance_loss_mlp": 1.01465893, "epoch": 0.8301217495866526, "flos": 19911575347200.0, "grad_norm": 1.6372776010493362, "language_loss": 0.64422947, "learning_rate": 2.950707932112444e-07, "loss": 0.66871607, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19287109, "step": 13807, "time_per_iteration": 2.8344202041625977 }, { "auxiliary_loss_clip": 0.01406424, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.24675608, "balance_loss_mlp": 1.01516271, "epoch": 0.8301818728393207, "flos": 19724397033600.0, "grad_norm": 1.9124147141527659, "language_loss": 0.74296045, "learning_rate": 2.948672208338847e-07, "loss": 0.76736224, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18591309, "step": 13808, "time_per_iteration": 2.8325653076171875 }, { "auxiliary_loss_clip": 0.014114, "auxiliary_loss_mlp": 0.01040943, "balance_loss_clip": 1.2478472, "balance_loss_mlp": 1.02105856, "epoch": 0.8302419960919886, "flos": 28305462384000.0, "grad_norm": 1.8950540626210772, "language_loss": 0.6786449, "learning_rate": 2.9466371311533046e-07, "loss": 0.70316833, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19885254, "step": 13809, "time_per_iteration": 3.017317771911621 }, { "auxiliary_loss_clip": 0.01407156, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.24646986, "balance_loss_mlp": 1.01459098, "epoch": 0.8303021193446566, "flos": 18232598862720.0, "grad_norm": 1.9176571859516467, "language_loss": 0.74504006, "learning_rate": 2.9446027006329896e-07, "loss": 0.76946127, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20373535, "step": 13810, "time_per_iteration": 2.8718161582946777 }, { "auxiliary_loss_clip": 0.0139346, "auxiliary_loss_mlp": 0.01032841, "balance_loss_clip": 1.23827004, "balance_loss_mlp": 1.01515007, "epoch": 0.8303622425973245, "flos": 23121559261440.0, "grad_norm": 1.627521370914086, "language_loss": 0.81933415, "learning_rate": 2.94256891685505e-07, "loss": 0.84359717, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.17675781, "step": 13811, "time_per_iteration": 2.856236219406128 }, { "auxiliary_loss_clip": 0.01414896, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.25242352, "balance_loss_mlp": 1.0160954, "epoch": 0.8304223658499925, "flos": 19582219313280.0, "grad_norm": 2.82902023080237, "language_loss": 0.73637748, "learning_rate": 2.9405357798966156e-07, "loss": 0.76087409, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18664551, "step": 13812, "time_per_iteration": 2.827937602996826 }, { "auxiliary_loss_clip": 0.01387719, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.23249006, "balance_loss_mlp": 1.01384091, "epoch": 0.8304824891026604, "flos": 24436902136320.0, "grad_norm": 1.6528263411759951, "language_loss": 0.78852808, "learning_rate": 2.9385032898347664e-07, "loss": 0.81272727, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18371582, "step": 13813, "time_per_iteration": 2.938697338104248 }, { "auxiliary_loss_clip": 0.01406632, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.24387729, "balance_loss_mlp": 1.0124439, "epoch": 0.8305426123553284, "flos": 22391758333440.0, "grad_norm": 2.116929789843223, "language_loss": 0.71769214, "learning_rate": 2.93647144674658e-07, "loss": 0.7420733, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19030762, "step": 13814, "time_per_iteration": 2.9068620204925537 }, { "auxiliary_loss_clip": 0.01430822, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.26092482, "balance_loss_mlp": 1.01615644, "epoch": 0.8306027356079964, "flos": 14911954871040.0, "grad_norm": 2.0604127175040228, "language_loss": 0.6897819, "learning_rate": 2.9344402507091116e-07, "loss": 0.71444631, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 1.70117188, "router_z_loss_mlp": 0.19458008, "step": 13815, "time_per_iteration": 2.8263626098632812 }, { "auxiliary_loss_clip": 0.01399868, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.2405417, "balance_loss_mlp": 1.01205194, "epoch": 0.8306628588606644, "flos": 19653624887040.0, "grad_norm": 1.9877277782258944, "language_loss": 0.77261865, "learning_rate": 2.9324097017993745e-07, "loss": 0.79693401, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19604492, "step": 13816, "time_per_iteration": 2.861412763595581 }, { "auxiliary_loss_clip": 0.01400284, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.24258304, "balance_loss_mlp": 1.01340902, "epoch": 0.8307229821133323, "flos": 24400543299840.0, "grad_norm": 1.808198606047099, "language_loss": 0.82005745, "learning_rate": 2.930379800094371e-07, "loss": 0.8443774, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18286133, "step": 13817, "time_per_iteration": 2.8481698036193848 }, { "auxiliary_loss_clip": 0.01410262, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.24862266, "balance_loss_mlp": 1.01115096, "epoch": 0.8307831053660003, "flos": 21006955411200.0, "grad_norm": 1.501406288564367, "language_loss": 0.7887916, "learning_rate": 2.9283505456710875e-07, "loss": 0.81320578, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19995117, "step": 13818, "time_per_iteration": 2.841456651687622 }, { "auxiliary_loss_clip": 0.01413703, "auxiliary_loss_mlp": 0.01036932, "balance_loss_clip": 1.25309372, "balance_loss_mlp": 1.01779938, "epoch": 0.8308432286186682, "flos": 21407174081280.0, "grad_norm": 1.9512801902869126, "language_loss": 0.82265401, "learning_rate": 2.926321938606453e-07, "loss": 0.84716034, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19128418, "step": 13819, "time_per_iteration": 2.8304800987243652 }, { "auxiliary_loss_clip": 0.01182486, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.09132421, "balance_loss_mlp": 1.00159132, "epoch": 0.8309033518713362, "flos": 62558589680640.0, "grad_norm": 1.1820763034884851, "language_loss": 0.5624426, "learning_rate": 2.924293978977399e-07, "loss": 0.58454466, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.26171875, "step": 13820, "time_per_iteration": 3.3950560092926025 }, { "auxiliary_loss_clip": 0.01396942, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.23900807, "balance_loss_mlp": 1.01132274, "epoch": 0.8309634751240043, "flos": 16986942259200.0, "grad_norm": 1.8473455458226038, "language_loss": 0.69060791, "learning_rate": 2.922266666860831e-07, "loss": 0.71487957, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18920898, "step": 13821, "time_per_iteration": 2.8396365642547607 }, { "auxiliary_loss_clip": 0.01414288, "auxiliary_loss_mlp": 0.0103569, "balance_loss_clip": 1.24927878, "balance_loss_mlp": 1.01652122, "epoch": 0.8310235983766722, "flos": 22684710286080.0, "grad_norm": 2.3926540006880757, "language_loss": 0.69749916, "learning_rate": 2.920240002333625e-07, "loss": 0.72199899, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19152832, "step": 13822, "time_per_iteration": 2.8595006465911865 }, { "auxiliary_loss_clip": 0.01396995, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.23948371, "balance_loss_mlp": 1.01536727, "epoch": 0.8310837216293402, "flos": 30823994977920.0, "grad_norm": 1.8626167078370575, "language_loss": 0.62709379, "learning_rate": 2.918213985472631e-07, "loss": 0.65139717, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17993164, "step": 13823, "time_per_iteration": 3.0041024684906006 }, { "auxiliary_loss_clip": 0.01180135, "auxiliary_loss_mlp": 0.01026304, "balance_loss_clip": 1.09215927, "balance_loss_mlp": 1.00322461, "epoch": 0.8311438448820081, "flos": 71309070120960.0, "grad_norm": 0.993166684443333, "language_loss": 0.62023139, "learning_rate": 2.916188616354669e-07, "loss": 0.64229584, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.23046875, "step": 13824, "time_per_iteration": 3.3900563716888428 }, { "auxiliary_loss_clip": 0.01409252, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.24897337, "balance_loss_mlp": 1.01510561, "epoch": 0.8312039681346761, "flos": 20896974005760.0, "grad_norm": 1.6203451717894926, "language_loss": 0.7467314, "learning_rate": 2.914163895056552e-07, "loss": 0.77115893, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18395996, "step": 13825, "time_per_iteration": 2.855875253677368 }, { "auxiliary_loss_clip": 0.01414659, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.25130475, "balance_loss_mlp": 1.01547718, "epoch": 0.831264091387344, "flos": 17025880049280.0, "grad_norm": 1.8346408364245799, "language_loss": 0.8073535, "learning_rate": 2.9121398216550486e-07, "loss": 0.83184648, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19152832, "step": 13826, "time_per_iteration": 2.8469276428222656 }, { "auxiliary_loss_clip": 0.0141465, "auxiliary_loss_mlp": 0.01034175, "balance_loss_clip": 1.25293374, "balance_loss_mlp": 1.01494694, "epoch": 0.831324214640012, "flos": 24428667582720.0, "grad_norm": 1.640102811947179, "language_loss": 0.68756795, "learning_rate": 2.910116396226914e-07, "loss": 0.71205622, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19226074, "step": 13827, "time_per_iteration": 4.3899242877960205 }, { "auxiliary_loss_clip": 0.0140934, "auxiliary_loss_mlp": 0.01030192, "balance_loss_clip": 1.24890327, "balance_loss_mlp": 1.01253724, "epoch": 0.83138433789268, "flos": 13551520913280.0, "grad_norm": 5.323310824007471, "language_loss": 0.74745518, "learning_rate": 2.9080936188488834e-07, "loss": 0.77185047, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.17651367, "step": 13828, "time_per_iteration": 2.795210123062134 }, { "auxiliary_loss_clip": 0.01403796, "auxiliary_loss_mlp": 0.01035862, "balance_loss_clip": 1.24223542, "balance_loss_mlp": 1.01716995, "epoch": 0.831444461145348, "flos": 44508146958720.0, "grad_norm": 1.8575086513734091, "language_loss": 0.68263209, "learning_rate": 2.906071489597657e-07, "loss": 0.70702869, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18688965, "step": 13829, "time_per_iteration": 3.040065050125122 }, { "auxiliary_loss_clip": 0.01418224, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.25390506, "balance_loss_mlp": 1.01621652, "epoch": 0.8315045843980159, "flos": 22713739464960.0, "grad_norm": 1.5897188303259708, "language_loss": 0.83306175, "learning_rate": 2.9040500085499054e-07, "loss": 0.85758996, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18383789, "step": 13830, "time_per_iteration": 2.8714101314544678 }, { "auxiliary_loss_clip": 0.01407068, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.24756122, "balance_loss_mlp": 1.01555705, "epoch": 0.8315647076506839, "flos": 16882299740160.0, "grad_norm": 2.0155672007913883, "language_loss": 0.75130963, "learning_rate": 2.9020291757822925e-07, "loss": 0.77572262, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18676758, "step": 13831, "time_per_iteration": 4.271459579467773 }, { "auxiliary_loss_clip": 0.01412166, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.25067103, "balance_loss_mlp": 1.01791167, "epoch": 0.8316248309033518, "flos": 13816755786240.0, "grad_norm": 2.2134743491473974, "language_loss": 0.72110856, "learning_rate": 2.9000089913714523e-07, "loss": 0.74560696, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19750977, "step": 13832, "time_per_iteration": 2.842864513397217 }, { "auxiliary_loss_clip": 0.01410522, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.25029778, "balance_loss_mlp": 1.01617241, "epoch": 0.8316849541560198, "flos": 23522682827520.0, "grad_norm": 3.29177582688134, "language_loss": 0.85176039, "learning_rate": 2.897989455393979e-07, "loss": 0.87621248, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18493652, "step": 13833, "time_per_iteration": 2.8523359298706055 }, { "auxiliary_loss_clip": 0.0142119, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.25745964, "balance_loss_mlp": 1.01523435, "epoch": 0.8317450774086879, "flos": 23782488324480.0, "grad_norm": 1.6525794275635919, "language_loss": 0.77042842, "learning_rate": 2.8959705679264625e-07, "loss": 0.79497921, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18664551, "step": 13834, "time_per_iteration": 2.8781626224517822 }, { "auxiliary_loss_clip": 0.01396276, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 1.23860919, "balance_loss_mlp": 1.01398587, "epoch": 0.8318052006613558, "flos": 16223859141120.0, "grad_norm": 2.375229026882135, "language_loss": 0.80512953, "learning_rate": 2.893952329045459e-07, "loss": 0.82940996, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.17797852, "step": 13835, "time_per_iteration": 2.805330753326416 }, { "auxiliary_loss_clip": 0.01421107, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.25842953, "balance_loss_mlp": 1.01760066, "epoch": 0.8318653239140238, "flos": 19984112040960.0, "grad_norm": 1.9400805507704162, "language_loss": 0.81673443, "learning_rate": 2.8919347388274905e-07, "loss": 0.84131539, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19396973, "step": 13836, "time_per_iteration": 4.276665449142456 }, { "auxiliary_loss_clip": 0.01401442, "auxiliary_loss_mlp": 0.01035365, "balance_loss_clip": 1.24356163, "balance_loss_mlp": 1.01668453, "epoch": 0.8319254471666917, "flos": 17711675769600.0, "grad_norm": 1.9503868915883795, "language_loss": 0.77871293, "learning_rate": 2.8899177973490727e-07, "loss": 0.80308104, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18676758, "step": 13837, "time_per_iteration": 4.270092010498047 }, { "auxiliary_loss_clip": 0.01421125, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.25518894, "balance_loss_mlp": 1.01369619, "epoch": 0.8319855704193597, "flos": 19545950966400.0, "grad_norm": 1.6261930101606357, "language_loss": 0.84394324, "learning_rate": 2.887901504686685e-07, "loss": 0.86848587, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19433594, "step": 13838, "time_per_iteration": 2.895066261291504 }, { "auxiliary_loss_clip": 0.01411802, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.25292051, "balance_loss_mlp": 1.01844656, "epoch": 0.8320456936720276, "flos": 21187799452800.0, "grad_norm": 1.9240680628079518, "language_loss": 0.75392807, "learning_rate": 2.885885860916795e-07, "loss": 0.77842426, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19384766, "step": 13839, "time_per_iteration": 2.8934481143951416 }, { "auxiliary_loss_clip": 0.0141155, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.25145805, "balance_loss_mlp": 1.01493168, "epoch": 0.8321058169246957, "flos": 33262706465280.0, "grad_norm": 1.5663870059101805, "language_loss": 0.68276143, "learning_rate": 2.8838708661158253e-07, "loss": 0.70721006, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18395996, "step": 13840, "time_per_iteration": 2.919400691986084 }, { "auxiliary_loss_clip": 0.01405822, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.24404609, "balance_loss_mlp": 1.01609886, "epoch": 0.8321659401773636, "flos": 14215752846720.0, "grad_norm": 2.373222359171838, "language_loss": 0.79920954, "learning_rate": 2.8818565203601843e-07, "loss": 0.82361561, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18676758, "step": 13841, "time_per_iteration": 2.7953550815582275 }, { "auxiliary_loss_clip": 0.01406656, "auxiliary_loss_mlp": 0.01034468, "balance_loss_clip": 1.24699688, "balance_loss_mlp": 1.01588345, "epoch": 0.8322260634300316, "flos": 15166511971200.0, "grad_norm": 1.867156740371621, "language_loss": 0.69497192, "learning_rate": 2.879842823726262e-07, "loss": 0.71938312, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18591309, "step": 13842, "time_per_iteration": 2.806941032409668 }, { "auxiliary_loss_clip": 0.0139779, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.23988509, "balance_loss_mlp": 1.01521826, "epoch": 0.8322861866826995, "flos": 25311595472640.0, "grad_norm": 1.5229698001585692, "language_loss": 0.73444366, "learning_rate": 2.8778297762904124e-07, "loss": 0.75876647, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19287109, "step": 13843, "time_per_iteration": 2.869706869125366 }, { "auxiliary_loss_clip": 0.01405347, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.24696445, "balance_loss_mlp": 1.0134871, "epoch": 0.8323463099353675, "flos": 17028187534080.0, "grad_norm": 2.5784479513297853, "language_loss": 0.78823864, "learning_rate": 2.875817378128975e-07, "loss": 0.81262028, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.1932373, "step": 13844, "time_per_iteration": 2.8084371089935303 }, { "auxiliary_loss_clip": 0.01183684, "auxiliary_loss_mlp": 0.01018315, "balance_loss_clip": 1.09411907, "balance_loss_mlp": 0.99914646, "epoch": 0.8324064331880354, "flos": 55632720032640.0, "grad_norm": 0.7832195667968963, "language_loss": 0.55308032, "learning_rate": 2.8738056293182624e-07, "loss": 0.5751003, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.19140625, "step": 13845, "time_per_iteration": 3.24932861328125 }, { "auxiliary_loss_clip": 0.01416313, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.25295496, "balance_loss_mlp": 1.01818442, "epoch": 0.8324665564407034, "flos": 26149251300480.0, "grad_norm": 2.5912098060551925, "language_loss": 0.76222706, "learning_rate": 2.871794529934555e-07, "loss": 0.78676748, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19543457, "step": 13846, "time_per_iteration": 2.862269401550293 }, { "auxiliary_loss_clip": 0.01422389, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.25567651, "balance_loss_mlp": 1.01574314, "epoch": 0.8325266796933715, "flos": 22057968309120.0, "grad_norm": 1.7820662776425409, "language_loss": 0.79836935, "learning_rate": 2.8697840800541115e-07, "loss": 0.82295096, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20031738, "step": 13847, "time_per_iteration": 2.8809046745300293 }, { "auxiliary_loss_clip": 0.01404049, "auxiliary_loss_mlp": 0.01030413, "balance_loss_clip": 1.24629164, "balance_loss_mlp": 1.01317585, "epoch": 0.8325868029460394, "flos": 22825937865600.0, "grad_norm": 1.7267223839842822, "language_loss": 0.74535555, "learning_rate": 2.867774279753175e-07, "loss": 0.76970017, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.17224121, "step": 13848, "time_per_iteration": 2.8612771034240723 }, { "auxiliary_loss_clip": 0.01405491, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.24647772, "balance_loss_mlp": 1.0127027, "epoch": 0.8326469261987074, "flos": 14765750363520.0, "grad_norm": 2.0460058860945267, "language_loss": 0.65023941, "learning_rate": 2.8657651291079554e-07, "loss": 0.6746214, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19995117, "step": 13849, "time_per_iteration": 2.9180591106414795 }, { "auxiliary_loss_clip": 0.01411809, "auxiliary_loss_mlp": 0.01040127, "balance_loss_clip": 1.24982762, "balance_loss_mlp": 1.02105308, "epoch": 0.8327070494513753, "flos": 22935466823040.0, "grad_norm": 2.23010299603168, "language_loss": 0.80088645, "learning_rate": 2.863756628194638e-07, "loss": 0.82540584, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19067383, "step": 13850, "time_per_iteration": 2.867211103439331 }, { "auxiliary_loss_clip": 0.01389462, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.23393154, "balance_loss_mlp": 1.02092266, "epoch": 0.8327671727040433, "flos": 20674432241280.0, "grad_norm": 1.515443416147512, "language_loss": 0.79073322, "learning_rate": 2.8617487770893877e-07, "loss": 0.81502104, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18395996, "step": 13851, "time_per_iteration": 2.847804069519043 }, { "auxiliary_loss_clip": 0.01182526, "auxiliary_loss_mlp": 0.01023815, "balance_loss_clip": 1.09322929, "balance_loss_mlp": 1.00149882, "epoch": 0.8328272959567112, "flos": 56089612454400.0, "grad_norm": 0.7699257045296197, "language_loss": 0.55915427, "learning_rate": 2.859741575868344e-07, "loss": 0.58121771, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.22363281, "step": 13852, "time_per_iteration": 3.3332736492156982 }, { "auxiliary_loss_clip": 0.01398173, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.24042654, "balance_loss_mlp": 1.01225173, "epoch": 0.8328874192093793, "flos": 32314888252800.0, "grad_norm": 1.6827397607539019, "language_loss": 0.68080699, "learning_rate": 2.8577350246076125e-07, "loss": 0.70509756, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18640137, "step": 13853, "time_per_iteration": 2.966142177581787 }, { "auxiliary_loss_clip": 0.01402035, "auxiliary_loss_mlp": 0.01034505, "balance_loss_clip": 1.24217057, "balance_loss_mlp": 1.01620638, "epoch": 0.8329475424620472, "flos": 23522909051520.0, "grad_norm": 1.674863727058401, "language_loss": 0.79106653, "learning_rate": 2.855729123383286e-07, "loss": 0.81543189, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18286133, "step": 13854, "time_per_iteration": 2.945713520050049 }, { "auxiliary_loss_clip": 0.01182348, "auxiliary_loss_mlp": 0.01022158, "balance_loss_clip": 1.0912801, "balance_loss_mlp": 1.00270331, "epoch": 0.8330076657147152, "flos": 67871296045440.0, "grad_norm": 0.780173540828259, "language_loss": 0.58697218, "learning_rate": 2.8537238722714295e-07, "loss": 0.60901725, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.19433594, "step": 13855, "time_per_iteration": 3.1872167587280273 }, { "auxiliary_loss_clip": 0.01410541, "auxiliary_loss_mlp": 0.0103211, "balance_loss_clip": 1.2496922, "balance_loss_mlp": 1.01296473, "epoch": 0.8330677889673831, "flos": 22902953794560.0, "grad_norm": 1.8743124848458799, "language_loss": 0.72788632, "learning_rate": 2.8517192713480853e-07, "loss": 0.7523129, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19152832, "step": 13856, "time_per_iteration": 2.8364670276641846 }, { "auxiliary_loss_clip": 0.01406986, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.2457366, "balance_loss_mlp": 1.01345706, "epoch": 0.8331279122200511, "flos": 27356603541120.0, "grad_norm": 1.7386905376675375, "language_loss": 0.76379251, "learning_rate": 2.8497153206892677e-07, "loss": 0.78819013, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1932373, "step": 13857, "time_per_iteration": 2.898599624633789 }, { "auxiliary_loss_clip": 0.01396479, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.24187636, "balance_loss_mlp": 1.01776838, "epoch": 0.833188035472719, "flos": 19947753204480.0, "grad_norm": 1.511043509409384, "language_loss": 0.74041522, "learning_rate": 2.847712020370958e-07, "loss": 0.76473212, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.17443848, "step": 13858, "time_per_iteration": 2.8795907497406006 }, { "auxiliary_loss_clip": 0.01426098, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.26010764, "balance_loss_mlp": 1.01478398, "epoch": 0.833248158725387, "flos": 15241265660160.0, "grad_norm": 1.7936178209373406, "language_loss": 0.74157113, "learning_rate": 2.8457093704691316e-07, "loss": 0.76616973, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.18969727, "step": 13859, "time_per_iteration": 2.8969931602478027 }, { "auxiliary_loss_clip": 0.01404101, "auxiliary_loss_mlp": 0.01033662, "balance_loss_clip": 1.24699986, "balance_loss_mlp": 1.01563799, "epoch": 0.8333082819780551, "flos": 24546340604160.0, "grad_norm": 1.7744853638417033, "language_loss": 0.80002677, "learning_rate": 2.8437073710597205e-07, "loss": 0.82440448, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18029785, "step": 13860, "time_per_iteration": 2.9113199710845947 }, { "auxiliary_loss_clip": 0.0139481, "auxiliary_loss_mlp": 0.01031831, "balance_loss_clip": 1.23757982, "balance_loss_mlp": 1.01338887, "epoch": 0.833368405230723, "flos": 31479901868160.0, "grad_norm": 1.456239034158224, "language_loss": 0.82910335, "learning_rate": 2.841706022218644e-07, "loss": 0.85336977, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18444824, "step": 13861, "time_per_iteration": 2.925389289855957 }, { "auxiliary_loss_clip": 0.01408991, "auxiliary_loss_mlp": 0.01034096, "balance_loss_clip": 1.24823523, "balance_loss_mlp": 1.01522493, "epoch": 0.833428528483391, "flos": 14910868995840.0, "grad_norm": 1.922780933051717, "language_loss": 0.79869366, "learning_rate": 2.839705324021806e-07, "loss": 0.82312453, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18859863, "step": 13862, "time_per_iteration": 4.251455068588257 }, { "auxiliary_loss_clip": 0.01413804, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.25046587, "balance_loss_mlp": 1.01465154, "epoch": 0.8334886517360589, "flos": 22209873661440.0, "grad_norm": 4.156508603609199, "language_loss": 0.7596935, "learning_rate": 2.83770527654505e-07, "loss": 0.78416914, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19116211, "step": 13863, "time_per_iteration": 2.832001209259033 }, { "auxiliary_loss_clip": 0.01399361, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.2438972, "balance_loss_mlp": 1.01610756, "epoch": 0.8335487749887269, "flos": 30384702783360.0, "grad_norm": 3.043690085029642, "language_loss": 0.76050162, "learning_rate": 2.835705879864232e-07, "loss": 0.78483546, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.17919922, "step": 13864, "time_per_iteration": 2.9099371433258057 }, { "auxiliary_loss_clip": 0.01403872, "auxiliary_loss_mlp": 0.01038387, "balance_loss_clip": 1.24377835, "balance_loss_mlp": 1.01826429, "epoch": 0.8336088982413948, "flos": 24691911684480.0, "grad_norm": 1.8396182014522282, "language_loss": 0.70099396, "learning_rate": 2.833707134055168e-07, "loss": 0.72541654, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.20129395, "step": 13865, "time_per_iteration": 2.8608877658843994 }, { "auxiliary_loss_clip": 0.01410709, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.25093472, "balance_loss_mlp": 1.01538157, "epoch": 0.8336690214940629, "flos": 38190514164480.0, "grad_norm": 1.8325559590988134, "language_loss": 0.75967425, "learning_rate": 2.831709039193653e-07, "loss": 0.78411973, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18457031, "step": 13866, "time_per_iteration": 4.4581992626190186 }, { "auxiliary_loss_clip": 0.01183511, "auxiliary_loss_mlp": 0.01018019, "balance_loss_clip": 1.09323156, "balance_loss_mlp": 0.99865896, "epoch": 0.8337291447467308, "flos": 55588307621760.0, "grad_norm": 0.8792470021246632, "language_loss": 0.6320833, "learning_rate": 2.8297115953554465e-07, "loss": 0.65409851, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.19335938, "step": 13867, "time_per_iteration": 3.2464215755462646 }, { "auxiliary_loss_clip": 0.01406209, "auxiliary_loss_mlp": 0.0102887, "balance_loss_clip": 1.24758303, "balance_loss_mlp": 1.01109576, "epoch": 0.8337892679993988, "flos": 24144085918080.0, "grad_norm": 1.9120264471280546, "language_loss": 0.72810918, "learning_rate": 2.827714802616301e-07, "loss": 0.75246, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1776123, "step": 13868, "time_per_iteration": 2.8798344135284424 }, { "auxiliary_loss_clip": 0.01411995, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.25134325, "balance_loss_mlp": 1.01413703, "epoch": 0.8338493912520667, "flos": 28195616712960.0, "grad_norm": 1.4009839750666562, "language_loss": 0.81275666, "learning_rate": 2.8257186610519325e-07, "loss": 0.83721346, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1953125, "step": 13869, "time_per_iteration": 2.957946538925171 }, { "auxiliary_loss_clip": 0.01406974, "auxiliary_loss_mlp": 0.01036981, "balance_loss_clip": 1.24724853, "balance_loss_mlp": 1.0181694, "epoch": 0.8339095145047347, "flos": 22167090063360.0, "grad_norm": 1.5754887996260096, "language_loss": 0.82944882, "learning_rate": 2.823723170738028e-07, "loss": 0.85388833, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18798828, "step": 13870, "time_per_iteration": 2.9224157333374023 }, { "auxiliary_loss_clip": 0.01404178, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.24225235, "balance_loss_mlp": 1.01382089, "epoch": 0.8339696377574026, "flos": 17314940949120.0, "grad_norm": 2.7426964735687553, "language_loss": 0.71123314, "learning_rate": 2.821728331750264e-07, "loss": 0.73560148, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18835449, "step": 13871, "time_per_iteration": 4.29019832611084 }, { "auxiliary_loss_clip": 0.01385377, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.22964585, "balance_loss_mlp": 1.01772094, "epoch": 0.8340297610100706, "flos": 20678232804480.0, "grad_norm": 1.7478522332125257, "language_loss": 0.69703054, "learning_rate": 2.8197341441642853e-07, "loss": 0.72124416, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18273926, "step": 13872, "time_per_iteration": 4.409483909606934 }, { "auxiliary_loss_clip": 0.01406006, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.2455343, "balance_loss_mlp": 1.01136613, "epoch": 0.8340898842627387, "flos": 20523477029760.0, "grad_norm": 2.0064878526365875, "language_loss": 0.74492508, "learning_rate": 2.817740608055712e-07, "loss": 0.7692821, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18334961, "step": 13873, "time_per_iteration": 2.9339370727539062 }, { "auxiliary_loss_clip": 0.01420818, "auxiliary_loss_mlp": 0.01032214, "balance_loss_clip": 1.25542164, "balance_loss_mlp": 1.01221073, "epoch": 0.8341500075154066, "flos": 21433669551360.0, "grad_norm": 2.2333903213977306, "language_loss": 0.76456505, "learning_rate": 2.81574772350013e-07, "loss": 0.7890954, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.1998291, "step": 13874, "time_per_iteration": 2.825222969055176 }, { "auxiliary_loss_clip": 0.0140753, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.24841034, "balance_loss_mlp": 1.01482654, "epoch": 0.8342101307680746, "flos": 22101159110400.0, "grad_norm": 2.356400384328307, "language_loss": 0.67237532, "learning_rate": 2.813755490573118e-07, "loss": 0.69677567, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.17687988, "step": 13875, "time_per_iteration": 2.8709003925323486 }, { "auxiliary_loss_clip": 0.01398479, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.23875451, "balance_loss_mlp": 1.01436794, "epoch": 0.8342702540207425, "flos": 21881422523520.0, "grad_norm": 2.5329766377699197, "language_loss": 0.80543971, "learning_rate": 2.8117639093502243e-07, "loss": 0.82975221, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18395996, "step": 13876, "time_per_iteration": 2.849001169204712 }, { "auxiliary_loss_clip": 0.01408988, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.25067973, "balance_loss_mlp": 1.01497388, "epoch": 0.8343303772734105, "flos": 22538686757760.0, "grad_norm": 1.8498019839442936, "language_loss": 0.88504064, "learning_rate": 2.8097729799069615e-07, "loss": 0.90946889, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18859863, "step": 13877, "time_per_iteration": 2.869783878326416 }, { "auxiliary_loss_clip": 0.0140066, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.24160123, "balance_loss_mlp": 1.01425469, "epoch": 0.8343905005260784, "flos": 14948087483520.0, "grad_norm": 1.7762660143664695, "language_loss": 0.70258051, "learning_rate": 2.807782702318828e-07, "loss": 0.72691691, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.1875, "step": 13878, "time_per_iteration": 2.825199604034424 }, { "auxiliary_loss_clip": 0.01403395, "auxiliary_loss_mlp": 0.01029395, "balance_loss_clip": 1.24283004, "balance_loss_mlp": 1.01175213, "epoch": 0.8344506237787465, "flos": 15020714666880.0, "grad_norm": 1.812823793934417, "language_loss": 0.80112398, "learning_rate": 2.805793076661309e-07, "loss": 0.82545185, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.17626953, "step": 13879, "time_per_iteration": 2.8953380584716797 }, { "auxiliary_loss_clip": 0.01391049, "auxiliary_loss_mlp": 0.01029635, "balance_loss_clip": 1.23366296, "balance_loss_mlp": 1.01131296, "epoch": 0.8345107470314144, "flos": 17567733502080.0, "grad_norm": 2.2159556496720993, "language_loss": 0.83977717, "learning_rate": 2.803804103009828e-07, "loss": 0.86398405, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18322754, "step": 13880, "time_per_iteration": 2.795341968536377 }, { "auxiliary_loss_clip": 0.01417016, "auxiliary_loss_mlp": 0.0102645, "balance_loss_clip": 1.25530303, "balance_loss_mlp": 1.00874734, "epoch": 0.8345708702840824, "flos": 25196953852800.0, "grad_norm": 1.5396875610739729, "language_loss": 0.78907806, "learning_rate": 2.80181578143982e-07, "loss": 0.81351274, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.17712402, "step": 13881, "time_per_iteration": 2.9095630645751953 }, { "auxiliary_loss_clip": 0.01389299, "auxiliary_loss_mlp": 0.01032223, "balance_loss_clip": 1.23502111, "balance_loss_mlp": 1.01473522, "epoch": 0.8346309935367503, "flos": 15091351079040.0, "grad_norm": 2.5731712831638625, "language_loss": 0.78991967, "learning_rate": 2.7998281120266807e-07, "loss": 0.8141349, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.17504883, "step": 13882, "time_per_iteration": 2.8088419437408447 }, { "auxiliary_loss_clip": 0.01399286, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.24027646, "balance_loss_mlp": 1.01630855, "epoch": 0.8346911167894183, "flos": 22941122423040.0, "grad_norm": 2.5224582422761883, "language_loss": 0.80948901, "learning_rate": 2.79784109484579e-07, "loss": 0.83383989, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19482422, "step": 13883, "time_per_iteration": 2.8681557178497314 }, { "auxiliary_loss_clip": 0.01404332, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.2428863, "balance_loss_mlp": 1.01194263, "epoch": 0.8347512400420862, "flos": 20202536528640.0, "grad_norm": 1.937858705413825, "language_loss": 0.74800622, "learning_rate": 2.795854729972482e-07, "loss": 0.77236032, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19128418, "step": 13884, "time_per_iteration": 2.872476100921631 }, { "auxiliary_loss_clip": 0.01439107, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.26878726, "balance_loss_mlp": 1.01753736, "epoch": 0.8348113632947542, "flos": 25965059143680.0, "grad_norm": 1.8229467995668094, "language_loss": 0.71083599, "learning_rate": 2.7938690174820913e-07, "loss": 0.73559511, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 1.703125, "router_z_loss_mlp": 0.19287109, "step": 13885, "time_per_iteration": 2.876168727874756 }, { "auxiliary_loss_clip": 0.01410315, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.24966145, "balance_loss_mlp": 1.01319313, "epoch": 0.8348714865474223, "flos": 34217944824960.0, "grad_norm": 1.723062030652736, "language_loss": 0.7094149, "learning_rate": 2.791883957449912e-07, "loss": 0.73383999, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19006348, "step": 13886, "time_per_iteration": 3.0203053951263428 }, { "auxiliary_loss_clip": 0.01402096, "auxiliary_loss_mlp": 0.01030209, "balance_loss_clip": 1.24332082, "balance_loss_mlp": 1.01070642, "epoch": 0.8349316098000902, "flos": 24401040992640.0, "grad_norm": 1.685783299382979, "language_loss": 0.79473382, "learning_rate": 2.7898995499512134e-07, "loss": 0.81905687, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19506836, "step": 13887, "time_per_iteration": 2.9629976749420166 }, { "auxiliary_loss_clip": 0.0142936, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.26398802, "balance_loss_mlp": 1.01070738, "epoch": 0.8349917330527582, "flos": 23041376196480.0, "grad_norm": 2.320539404251607, "language_loss": 0.64402163, "learning_rate": 2.7879157950612467e-07, "loss": 0.6686191, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19677734, "step": 13888, "time_per_iteration": 2.8860671520233154 }, { "auxiliary_loss_clip": 0.01416429, "auxiliary_loss_mlp": 0.01028968, "balance_loss_clip": 1.25200987, "balance_loss_mlp": 1.01013279, "epoch": 0.8350518563054261, "flos": 13633739994240.0, "grad_norm": 2.046323794697516, "language_loss": 0.68476474, "learning_rate": 2.785932692855244e-07, "loss": 0.70921874, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18811035, "step": 13889, "time_per_iteration": 2.866093635559082 }, { "auxiliary_loss_clip": 0.01400907, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.24149394, "balance_loss_mlp": 1.0108211, "epoch": 0.8351119795580941, "flos": 21589737425280.0, "grad_norm": 1.96736348238121, "language_loss": 0.69393754, "learning_rate": 2.783950243408399e-07, "loss": 0.71823084, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.17602539, "step": 13890, "time_per_iteration": 2.8298912048339844 }, { "auxiliary_loss_clip": 0.01418993, "auxiliary_loss_mlp": 0.01035508, "balance_loss_clip": 1.25724316, "balance_loss_mlp": 1.0178411, "epoch": 0.835172102810762, "flos": 20045473269120.0, "grad_norm": 2.0535049419437734, "language_loss": 0.59969527, "learning_rate": 2.7819684467958817e-07, "loss": 0.62424028, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.17663574, "step": 13891, "time_per_iteration": 2.809206485748291 }, { "auxiliary_loss_clip": 0.01403179, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.24313045, "balance_loss_mlp": 1.0122056, "epoch": 0.8352322260634301, "flos": 25120526106240.0, "grad_norm": 1.6719112050326395, "language_loss": 0.72193933, "learning_rate": 2.779987303092846e-07, "loss": 0.74628401, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19091797, "step": 13892, "time_per_iteration": 2.8929800987243652 }, { "auxiliary_loss_clip": 0.01389302, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.23267126, "balance_loss_mlp": 1.01292539, "epoch": 0.835292349316098, "flos": 24874610762880.0, "grad_norm": 1.4996873557244974, "language_loss": 0.66177332, "learning_rate": 2.7780068123744207e-07, "loss": 0.68598223, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18676758, "step": 13893, "time_per_iteration": 2.8998019695281982 }, { "auxiliary_loss_clip": 0.01406109, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.24430752, "balance_loss_mlp": 1.01234674, "epoch": 0.835352472568766, "flos": 19875216510720.0, "grad_norm": 1.924401388351334, "language_loss": 0.78811485, "learning_rate": 2.7760269747156996e-07, "loss": 0.81248289, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18347168, "step": 13894, "time_per_iteration": 2.853407859802246 }, { "auxiliary_loss_clip": 0.0138606, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.23235607, "balance_loss_mlp": 1.01507807, "epoch": 0.8354125958214339, "flos": 22064981253120.0, "grad_norm": 1.8328373977370997, "language_loss": 0.73549378, "learning_rate": 2.7740477901917625e-07, "loss": 0.75968993, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.18481445, "step": 13895, "time_per_iteration": 2.9471235275268555 }, { "auxiliary_loss_clip": 0.0142176, "auxiliary_loss_mlp": 0.01036358, "balance_loss_clip": 1.25667024, "balance_loss_mlp": 1.01650977, "epoch": 0.8354727190741019, "flos": 21407807508480.0, "grad_norm": 2.2336434018089806, "language_loss": 0.72949511, "learning_rate": 2.772069258877667e-07, "loss": 0.7540763, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19848633, "step": 13896, "time_per_iteration": 2.891248941421509 }, { "auxiliary_loss_clip": 0.01393676, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.23731411, "balance_loss_mlp": 1.01524425, "epoch": 0.8355328423267698, "flos": 50858428515840.0, "grad_norm": 2.358987912744662, "language_loss": 0.59107828, "learning_rate": 2.770091380848423e-07, "loss": 0.61535239, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18481445, "step": 13897, "time_per_iteration": 4.5042359828948975 }, { "auxiliary_loss_clip": 0.01181191, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.09054875, "balance_loss_mlp": 1.00944543, "epoch": 0.8355929655794379, "flos": 65583404035200.0, "grad_norm": 0.7202605376597025, "language_loss": 0.57729656, "learning_rate": 2.7681141561790423e-07, "loss": 0.5994156, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.21289062, "step": 13898, "time_per_iteration": 3.381085157394409 }, { "auxiliary_loss_clip": 0.01414738, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.2513994, "balance_loss_mlp": 1.01505864, "epoch": 0.8356530888321058, "flos": 19179512179200.0, "grad_norm": 1.7703441690426864, "language_loss": 0.80844539, "learning_rate": 2.7661375849444967e-07, "loss": 0.83294082, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.1973877, "step": 13899, "time_per_iteration": 2.8491294384002686 }, { "auxiliary_loss_clip": 0.01422181, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.2603476, "balance_loss_mlp": 1.0137881, "epoch": 0.8357132120847738, "flos": 44142115374720.0, "grad_norm": 1.845950851198546, "language_loss": 0.69623697, "learning_rate": 2.764161667219749e-07, "loss": 0.72077835, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18139648, "step": 13900, "time_per_iteration": 3.036602258682251 }, { "auxiliary_loss_clip": 0.0139744, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.23973227, "balance_loss_mlp": 1.01383209, "epoch": 0.8357733353374418, "flos": 24400498055040.0, "grad_norm": 1.41890178141524, "language_loss": 0.71882915, "learning_rate": 2.762186403079716e-07, "loss": 0.7431286, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18664551, "step": 13901, "time_per_iteration": 4.311877250671387 }, { "auxiliary_loss_clip": 0.01421756, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.2566458, "balance_loss_mlp": 1.01429462, "epoch": 0.8358334585901097, "flos": 20924329127040.0, "grad_norm": 2.2262175937414463, "language_loss": 0.80769479, "learning_rate": 2.7602117925992963e-07, "loss": 0.83225131, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19628906, "step": 13902, "time_per_iteration": 2.820218086242676 }, { "auxiliary_loss_clip": 0.01398711, "auxiliary_loss_mlp": 0.01037801, "balance_loss_clip": 1.24114561, "balance_loss_mlp": 1.01842999, "epoch": 0.8358935818427777, "flos": 19253360972160.0, "grad_norm": 1.6098319652112272, "language_loss": 0.63061959, "learning_rate": 2.758237835853379e-07, "loss": 0.65498471, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19372559, "step": 13903, "time_per_iteration": 2.8082101345062256 }, { "auxiliary_loss_clip": 0.01403705, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.24524307, "balance_loss_mlp": 1.01445699, "epoch": 0.8359537050954456, "flos": 24144719345280.0, "grad_norm": 1.6548987736716696, "language_loss": 0.75131696, "learning_rate": 2.7562645329168054e-07, "loss": 0.77568287, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1842041, "step": 13904, "time_per_iteration": 2.862440824508667 }, { "auxiliary_loss_clip": 0.01387522, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.23108494, "balance_loss_mlp": 1.01414943, "epoch": 0.8360138283481137, "flos": 16189083872640.0, "grad_norm": 5.564296144797552, "language_loss": 0.73314369, "learning_rate": 2.7542918838644104e-07, "loss": 0.75734925, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18884277, "step": 13905, "time_per_iteration": 2.861351728439331 }, { "auxiliary_loss_clip": 0.01382405, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 1.2277894, "balance_loss_mlp": 1.01740885, "epoch": 0.8360739516007816, "flos": 22208697296640.0, "grad_norm": 2.5498966064721778, "language_loss": 0.67182642, "learning_rate": 2.752319888771e-07, "loss": 0.69600099, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.17651367, "step": 13906, "time_per_iteration": 4.272448539733887 }, { "auxiliary_loss_clip": 0.01398935, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.24042249, "balance_loss_mlp": 1.0141139, "epoch": 0.8361340748534496, "flos": 20932925639040.0, "grad_norm": 1.435446814284113, "language_loss": 0.74621993, "learning_rate": 2.7503485477113475e-07, "loss": 0.77054578, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1953125, "step": 13907, "time_per_iteration": 4.302859544754028 }, { "auxiliary_loss_clip": 0.01410768, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.24638724, "balance_loss_mlp": 1.01424766, "epoch": 0.8361941981061175, "flos": 26183845589760.0, "grad_norm": 1.8116171619853463, "language_loss": 0.7621184, "learning_rate": 2.7483778607602005e-07, "loss": 0.78656268, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1940918, "step": 13908, "time_per_iteration": 2.940707206726074 }, { "auxiliary_loss_clip": 0.01407671, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.24823308, "balance_loss_mlp": 1.01406324, "epoch": 0.8362543213587855, "flos": 24428396113920.0, "grad_norm": 4.132367260813974, "language_loss": 0.72335607, "learning_rate": 2.7464078279922964e-07, "loss": 0.74776787, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19445801, "step": 13909, "time_per_iteration": 2.9598684310913086 }, { "auxiliary_loss_clip": 0.01417396, "auxiliary_loss_mlp": 0.01031416, "balance_loss_clip": 1.25337243, "balance_loss_mlp": 1.01248538, "epoch": 0.8363144446114534, "flos": 17211429550080.0, "grad_norm": 1.9186421147274122, "language_loss": 0.74084496, "learning_rate": 2.744438449482338e-07, "loss": 0.76533312, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18920898, "step": 13910, "time_per_iteration": 2.835066556930542 }, { "auxiliary_loss_clip": 0.01409778, "auxiliary_loss_mlp": 0.01033219, "balance_loss_clip": 1.24920583, "balance_loss_mlp": 1.01415718, "epoch": 0.8363745678641215, "flos": 19288271975040.0, "grad_norm": 1.8034510714344314, "language_loss": 0.73758256, "learning_rate": 2.742469725305001e-07, "loss": 0.76201248, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19055176, "step": 13911, "time_per_iteration": 3.0157644748687744 }, { "auxiliary_loss_clip": 0.01419699, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.25566244, "balance_loss_mlp": 1.01495147, "epoch": 0.8364346911167894, "flos": 11882362550400.0, "grad_norm": 1.934727731985863, "language_loss": 0.79838371, "learning_rate": 2.740501655534946e-07, "loss": 0.82291663, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18640137, "step": 13912, "time_per_iteration": 2.8285460472106934 }, { "auxiliary_loss_clip": 0.01397455, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.23935044, "balance_loss_mlp": 1.01288307, "epoch": 0.8364948143694574, "flos": 20233601723520.0, "grad_norm": 1.7014679610261707, "language_loss": 0.79630959, "learning_rate": 2.738534240246797e-07, "loss": 0.82059866, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18566895, "step": 13913, "time_per_iteration": 2.8319194316864014 }, { "auxiliary_loss_clip": 0.0141164, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.24889946, "balance_loss_mlp": 1.01460958, "epoch": 0.8365549376221254, "flos": 21621978984960.0, "grad_norm": 1.707706399209918, "language_loss": 0.73967183, "learning_rate": 2.736567479515153e-07, "loss": 0.76413178, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19750977, "step": 13914, "time_per_iteration": 2.8211283683776855 }, { "auxiliary_loss_clip": 0.01404172, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.24490404, "balance_loss_mlp": 1.01703405, "epoch": 0.8366150608747933, "flos": 23304484563840.0, "grad_norm": 1.685362826316315, "language_loss": 0.72123945, "learning_rate": 2.7346013734146025e-07, "loss": 0.74564248, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19091797, "step": 13915, "time_per_iteration": 2.894049882888794 }, { "auxiliary_loss_clip": 0.014043, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.24264073, "balance_loss_mlp": 1.01413751, "epoch": 0.8366751841274613, "flos": 15275950439040.0, "grad_norm": 1.9358469630159034, "language_loss": 0.73370576, "learning_rate": 2.7326359220197035e-07, "loss": 0.75807363, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18347168, "step": 13916, "time_per_iteration": 2.8390722274780273 }, { "auxiliary_loss_clip": 0.01398663, "auxiliary_loss_mlp": 0.01031214, "balance_loss_clip": 1.23779154, "balance_loss_mlp": 1.01154482, "epoch": 0.8367353073801292, "flos": 13232797407360.0, "grad_norm": 2.6208900988817985, "language_loss": 0.75303942, "learning_rate": 2.7306711254049755e-07, "loss": 0.77733827, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19677734, "step": 13917, "time_per_iteration": 2.799624443054199 }, { "auxiliary_loss_clip": 0.0138751, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.23336649, "balance_loss_mlp": 1.01587439, "epoch": 0.8367954306327973, "flos": 24215265267840.0, "grad_norm": 1.5881097109904536, "language_loss": 0.79972744, "learning_rate": 2.728706983644933e-07, "loss": 0.8239429, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.1817627, "step": 13918, "time_per_iteration": 2.866183280944824 }, { "auxiliary_loss_clip": 0.01407569, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.246737, "balance_loss_mlp": 1.01916027, "epoch": 0.8368555538854652, "flos": 24545345218560.0, "grad_norm": 1.9121510634785384, "language_loss": 0.68366396, "learning_rate": 2.7267434968140457e-07, "loss": 0.70812094, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18969727, "step": 13919, "time_per_iteration": 2.892137289047241 }, { "auxiliary_loss_clip": 0.01393937, "auxiliary_loss_mlp": 0.0103038, "balance_loss_clip": 1.23504996, "balance_loss_mlp": 1.01147377, "epoch": 0.8369156771381332, "flos": 20267155382400.0, "grad_norm": 1.919930955129626, "language_loss": 0.74116743, "learning_rate": 2.7247806649867835e-07, "loss": 0.76541054, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18908691, "step": 13920, "time_per_iteration": 2.8685429096221924 }, { "auxiliary_loss_clip": 0.01396682, "auxiliary_loss_mlp": 0.01032117, "balance_loss_clip": 1.23681164, "balance_loss_mlp": 1.01288879, "epoch": 0.8369758003908011, "flos": 21845651869440.0, "grad_norm": 1.6977306123200961, "language_loss": 0.69989985, "learning_rate": 2.722818488237566e-07, "loss": 0.72418785, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1920166, "step": 13921, "time_per_iteration": 2.842175245285034 }, { "auxiliary_loss_clip": 0.01423824, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.25927579, "balance_loss_mlp": 1.01770008, "epoch": 0.8370359236434691, "flos": 21727616889600.0, "grad_norm": 2.3822067493979704, "language_loss": 0.85812199, "learning_rate": 2.720856966640801e-07, "loss": 0.88272405, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18688965, "step": 13922, "time_per_iteration": 2.8453729152679443 }, { "auxiliary_loss_clip": 0.01398367, "auxiliary_loss_mlp": 0.01033186, "balance_loss_clip": 1.24185669, "balance_loss_mlp": 1.01444674, "epoch": 0.837096046896137, "flos": 23159275441920.0, "grad_norm": 1.597410488278112, "language_loss": 0.72758615, "learning_rate": 2.71889610027088e-07, "loss": 0.75190175, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1875, "step": 13923, "time_per_iteration": 2.834040641784668 }, { "auxiliary_loss_clip": 0.01389593, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.23248267, "balance_loss_mlp": 1.01287663, "epoch": 0.8371561701488051, "flos": 24502063927680.0, "grad_norm": 2.0362684770968493, "language_loss": 0.76862746, "learning_rate": 2.7169358892021433e-07, "loss": 0.79285371, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.20166016, "step": 13924, "time_per_iteration": 2.8537347316741943 }, { "auxiliary_loss_clip": 0.01396021, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.23732257, "balance_loss_mlp": 1.01033926, "epoch": 0.837216293401473, "flos": 29219003020800.0, "grad_norm": 1.6091623228886887, "language_loss": 0.65069127, "learning_rate": 2.7149763335089293e-07, "loss": 0.67494416, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18945312, "step": 13925, "time_per_iteration": 2.902864694595337 }, { "auxiliary_loss_clip": 0.0142105, "auxiliary_loss_mlp": 0.01037125, "balance_loss_clip": 1.25720608, "balance_loss_mlp": 1.01720476, "epoch": 0.837276416654141, "flos": 25276277266560.0, "grad_norm": 1.6443519821394001, "language_loss": 0.74940026, "learning_rate": 2.713017433265543e-07, "loss": 0.77398205, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19921875, "step": 13926, "time_per_iteration": 2.866297960281372 }, { "auxiliary_loss_clip": 0.01407973, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.24649465, "balance_loss_mlp": 1.01916456, "epoch": 0.837336539906809, "flos": 13890152131200.0, "grad_norm": 2.06925591618755, "language_loss": 0.71878636, "learning_rate": 2.711059188546274e-07, "loss": 0.74323827, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18066406, "step": 13927, "time_per_iteration": 2.8178000450134277 }, { "auxiliary_loss_clip": 0.011821, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.09118104, "balance_loss_mlp": 1.00503659, "epoch": 0.8373966631594769, "flos": 68903640823680.0, "grad_norm": 0.7140512127585137, "language_loss": 0.58863282, "learning_rate": 2.7091015994253695e-07, "loss": 0.61072063, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.21679688, "step": 13928, "time_per_iteration": 3.6446895599365234 }, { "auxiliary_loss_clip": 0.01411269, "auxiliary_loss_mlp": 0.01032787, "balance_loss_clip": 1.25055277, "balance_loss_mlp": 1.01328397, "epoch": 0.8374567864121449, "flos": 20458677196800.0, "grad_norm": 1.6795796961336529, "language_loss": 0.70324284, "learning_rate": 2.707144665977068e-07, "loss": 0.72768337, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19506836, "step": 13929, "time_per_iteration": 2.8510799407958984 }, { "auxiliary_loss_clip": 0.01426436, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.26130795, "balance_loss_mlp": 1.01126373, "epoch": 0.8375169096648128, "flos": 41919882848640.0, "grad_norm": 1.6135138752538007, "language_loss": 0.67665017, "learning_rate": 2.705188388275574e-07, "loss": 0.70122451, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19726562, "step": 13930, "time_per_iteration": 3.0626227855682373 }, { "auxiliary_loss_clip": 0.01397686, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.23971331, "balance_loss_mlp": 1.01240301, "epoch": 0.8375770329174809, "flos": 20018480106240.0, "grad_norm": 1.700432023570513, "language_loss": 0.72284532, "learning_rate": 2.703232766395067e-07, "loss": 0.74712634, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18017578, "step": 13931, "time_per_iteration": 2.8467986583709717 }, { "auxiliary_loss_clip": 0.01397653, "auxiliary_loss_mlp": 0.01032903, "balance_loss_clip": 1.23996973, "balance_loss_mlp": 1.01430631, "epoch": 0.8376371561701488, "flos": 22793786795520.0, "grad_norm": 1.6809303652101704, "language_loss": 0.72276425, "learning_rate": 2.701277800409705e-07, "loss": 0.74706978, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18591309, "step": 13932, "time_per_iteration": 4.403635501861572 }, { "auxiliary_loss_clip": 0.01403149, "auxiliary_loss_mlp": 0.01032676, "balance_loss_clip": 1.24481201, "balance_loss_mlp": 1.01458025, "epoch": 0.8376972794228168, "flos": 23924892268800.0, "grad_norm": 2.1940090842668263, "language_loss": 0.66682041, "learning_rate": 2.699323490393628e-07, "loss": 0.69117868, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18115234, "step": 13933, "time_per_iteration": 2.9162707328796387 }, { "auxiliary_loss_clip": 0.01397593, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.24016929, "balance_loss_mlp": 1.01660061, "epoch": 0.8377574026754847, "flos": 13742454545280.0, "grad_norm": 2.1538959315831194, "language_loss": 0.77074635, "learning_rate": 2.697369836420933e-07, "loss": 0.79507768, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18933105, "step": 13934, "time_per_iteration": 2.9695489406585693 }, { "auxiliary_loss_clip": 0.01401692, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.24361217, "balance_loss_mlp": 1.01212239, "epoch": 0.8378175259281527, "flos": 21660916775040.0, "grad_norm": 1.5030183818590426, "language_loss": 0.77686167, "learning_rate": 2.6954168385657115e-07, "loss": 0.80118668, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18688965, "step": 13935, "time_per_iteration": 4.333244562149048 }, { "auxiliary_loss_clip": 0.01401566, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.24135554, "balance_loss_mlp": 1.01211405, "epoch": 0.8378776491808206, "flos": 15456794480640.0, "grad_norm": 2.3947132158919477, "language_loss": 0.57104003, "learning_rate": 2.6934644969020135e-07, "loss": 0.59536606, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18896484, "step": 13936, "time_per_iteration": 2.8110313415527344 }, { "auxiliary_loss_clip": 0.01402835, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.24228811, "balance_loss_mlp": 1.01435089, "epoch": 0.8379377724334887, "flos": 14728531875840.0, "grad_norm": 1.7597549099069183, "language_loss": 0.90355003, "learning_rate": 2.691512811503882e-07, "loss": 0.92791235, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19042969, "step": 13937, "time_per_iteration": 2.830812931060791 }, { "auxiliary_loss_clip": 0.01400456, "auxiliary_loss_mlp": 0.01031201, "balance_loss_clip": 1.24126828, "balance_loss_mlp": 1.0119009, "epoch": 0.8379978956861566, "flos": 24545978645760.0, "grad_norm": 1.9717649476116534, "language_loss": 0.82537419, "learning_rate": 2.689561782445313e-07, "loss": 0.84969074, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19299316, "step": 13938, "time_per_iteration": 2.870368480682373 }, { "auxiliary_loss_clip": 0.01409974, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.24779606, "balance_loss_mlp": 1.01199675, "epoch": 0.8380580189388246, "flos": 18961811608320.0, "grad_norm": 1.622112413648304, "language_loss": 0.7161777, "learning_rate": 2.6876114098002965e-07, "loss": 0.74058461, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18701172, "step": 13939, "time_per_iteration": 2.8313231468200684 }, { "auxiliary_loss_clip": 0.01407593, "auxiliary_loss_mlp": 0.01039718, "balance_loss_clip": 1.24468732, "balance_loss_mlp": 1.01995265, "epoch": 0.8381181421914926, "flos": 26551098783360.0, "grad_norm": 1.6406526021446022, "language_loss": 0.76902896, "learning_rate": 2.6856616936428e-07, "loss": 0.79350209, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19750977, "step": 13940, "time_per_iteration": 2.914124011993408 }, { "auxiliary_loss_clip": 0.01398742, "auxiliary_loss_mlp": 0.01032682, "balance_loss_clip": 1.23980784, "balance_loss_mlp": 1.01366794, "epoch": 0.8381782654441605, "flos": 23301045959040.0, "grad_norm": 3.331534505850874, "language_loss": 0.77226108, "learning_rate": 2.6837126340467374e-07, "loss": 0.79657531, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19006348, "step": 13941, "time_per_iteration": 5.6159162521362305 }, { "auxiliary_loss_clip": 0.01414839, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 1.24938965, "balance_loss_mlp": 1.01336169, "epoch": 0.8382383886968285, "flos": 26769568515840.0, "grad_norm": 2.186201781476688, "language_loss": 0.73839438, "learning_rate": 2.6817642310860276e-07, "loss": 0.76287711, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20068359, "step": 13942, "time_per_iteration": 2.87819766998291 }, { "auxiliary_loss_clip": 0.01414789, "auxiliary_loss_mlp": 0.01035813, "balance_loss_clip": 1.24739051, "balance_loss_mlp": 1.01626241, "epoch": 0.8382985119494964, "flos": 26115788131200.0, "grad_norm": 1.4377789064850224, "language_loss": 0.79705822, "learning_rate": 2.679816484834554e-07, "loss": 0.82156432, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 1.67382812, "router_z_loss_mlp": 0.19555664, "step": 13943, "time_per_iteration": 2.883617877960205 }, { "auxiliary_loss_clip": 0.0140912, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.24918377, "balance_loss_mlp": 1.0157541, "epoch": 0.8383586352021645, "flos": 16443686217600.0, "grad_norm": 2.033514331247748, "language_loss": 0.85496539, "learning_rate": 2.6778693953661766e-07, "loss": 0.87939501, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18078613, "step": 13944, "time_per_iteration": 2.8480868339538574 }, { "auxiliary_loss_clip": 0.01177323, "auxiliary_loss_mlp": 0.01015789, "balance_loss_clip": 1.08766484, "balance_loss_mlp": 0.9990043, "epoch": 0.8384187584548324, "flos": 64226091968640.0, "grad_norm": 0.6664612577384531, "language_loss": 0.50230801, "learning_rate": 2.6759229627547263e-07, "loss": 0.52423906, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.16796875, "step": 13945, "time_per_iteration": 3.4434239864349365 }, { "auxiliary_loss_clip": 0.01389066, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.23236561, "balance_loss_mlp": 1.01280499, "epoch": 0.8384788817075004, "flos": 22393160922240.0, "grad_norm": 1.6895483874491655, "language_loss": 0.65392959, "learning_rate": 2.673977187074017e-07, "loss": 0.6781438, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.19543457, "step": 13946, "time_per_iteration": 2.8558883666992188 }, { "auxiliary_loss_clip": 0.01404297, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.2424835, "balance_loss_mlp": 1.01260591, "epoch": 0.8385390049601683, "flos": 29508742592640.0, "grad_norm": 1.6714679024137806, "language_loss": 0.67977899, "learning_rate": 2.672032068397829e-07, "loss": 0.70414686, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19897461, "step": 13947, "time_per_iteration": 2.9351742267608643 }, { "auxiliary_loss_clip": 0.01399894, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.23885536, "balance_loss_mlp": 1.01557636, "epoch": 0.8385991282128363, "flos": 32720581543680.0, "grad_norm": 1.381900975320931, "language_loss": 0.70254302, "learning_rate": 2.6700876067999176e-07, "loss": 0.72689015, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19238281, "step": 13948, "time_per_iteration": 2.9222140312194824 }, { "auxiliary_loss_clip": 0.01386266, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.23184764, "balance_loss_mlp": 1.01402211, "epoch": 0.8386592514655042, "flos": 25450334588160.0, "grad_norm": 1.931124358195655, "language_loss": 0.85809433, "learning_rate": 2.6681438023540194e-07, "loss": 0.88227606, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.17883301, "step": 13949, "time_per_iteration": 2.905339241027832 }, { "auxiliary_loss_clip": 0.01382171, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.22687054, "balance_loss_mlp": 1.01477289, "epoch": 0.8387193747181723, "flos": 22025500525440.0, "grad_norm": 2.2664890783047134, "language_loss": 0.70996475, "learning_rate": 2.66620065513385e-07, "loss": 0.73412913, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.19482422, "step": 13950, "time_per_iteration": 2.8980000019073486 }, { "auxiliary_loss_clip": 0.01391087, "auxiliary_loss_mlp": 0.01032117, "balance_loss_clip": 1.23431444, "balance_loss_mlp": 1.01342475, "epoch": 0.8387794979708402, "flos": 18159293007360.0, "grad_norm": 1.5937641291998477, "language_loss": 0.65783012, "learning_rate": 2.6642581652130913e-07, "loss": 0.68206215, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18688965, "step": 13951, "time_per_iteration": 2.842414617538452 }, { "auxiliary_loss_clip": 0.01413551, "auxiliary_loss_mlp": 0.01034513, "balance_loss_clip": 1.25234103, "balance_loss_mlp": 1.01553524, "epoch": 0.8388396212235082, "flos": 25422255550080.0, "grad_norm": 1.4551412076159658, "language_loss": 0.71319884, "learning_rate": 2.662316332665393e-07, "loss": 0.73767954, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18981934, "step": 13952, "time_per_iteration": 2.8691084384918213 }, { "auxiliary_loss_clip": 0.01396625, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.23789978, "balance_loss_mlp": 1.0154978, "epoch": 0.8388997444761762, "flos": 22283179516800.0, "grad_norm": 1.8493916855473893, "language_loss": 0.73882568, "learning_rate": 2.6603751575643987e-07, "loss": 0.76313037, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18334961, "step": 13953, "time_per_iteration": 2.826960325241089 }, { "auxiliary_loss_clip": 0.01395356, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.23661709, "balance_loss_mlp": 1.01416731, "epoch": 0.8389598677288441, "flos": 19582671761280.0, "grad_norm": 3.509980169931398, "language_loss": 0.68843085, "learning_rate": 2.6584346399837176e-07, "loss": 0.71270871, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18249512, "step": 13954, "time_per_iteration": 2.8485045433044434 }, { "auxiliary_loss_clip": 0.01397298, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.23684478, "balance_loss_mlp": 1.01883984, "epoch": 0.8390199909815121, "flos": 17393766670080.0, "grad_norm": 1.8539922198264231, "language_loss": 0.73991084, "learning_rate": 2.656494779996932e-07, "loss": 0.76425844, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18615723, "step": 13955, "time_per_iteration": 2.840822458267212 }, { "auxiliary_loss_clip": 0.01400164, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.24026179, "balance_loss_mlp": 1.01361585, "epoch": 0.83908011423418, "flos": 24649082841600.0, "grad_norm": 3.3916334914917434, "language_loss": 0.67094153, "learning_rate": 2.6545555776775995e-07, "loss": 0.69527507, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19567871, "step": 13956, "time_per_iteration": 2.8404014110565186 }, { "auxiliary_loss_clip": 0.01411533, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.24856496, "balance_loss_mlp": 1.01495969, "epoch": 0.8391402374868481, "flos": 24729311151360.0, "grad_norm": 1.7482755051271377, "language_loss": 0.81351781, "learning_rate": 2.6526170330992667e-07, "loss": 0.83797735, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19470215, "step": 13957, "time_per_iteration": 2.8553578853607178 }, { "auxiliary_loss_clip": 0.01183998, "auxiliary_loss_mlp": 0.01019921, "balance_loss_clip": 1.09144902, "balance_loss_mlp": 0.99588889, "epoch": 0.839200360739516, "flos": 56902899317760.0, "grad_norm": 0.7493366274113707, "language_loss": 0.53286922, "learning_rate": 2.6506791463354283e-07, "loss": 0.55490845, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 0.24023438, "step": 13958, "time_per_iteration": 3.4444868564605713 }, { "auxiliary_loss_clip": 0.01394268, "auxiliary_loss_mlp": 0.01035454, "balance_loss_clip": 1.23611057, "balance_loss_mlp": 1.01584458, "epoch": 0.839260483992184, "flos": 18341946840960.0, "grad_norm": 2.116569525507219, "language_loss": 0.74750829, "learning_rate": 2.648741917459574e-07, "loss": 0.77180552, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19616699, "step": 13959, "time_per_iteration": 2.8812437057495117 }, { "auxiliary_loss_clip": 0.01398504, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.24117589, "balance_loss_mlp": 1.01278293, "epoch": 0.8393206072448519, "flos": 27098969794560.0, "grad_norm": 1.9993627601207402, "language_loss": 0.56261194, "learning_rate": 2.646805346545169e-07, "loss": 0.58690709, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18237305, "step": 13960, "time_per_iteration": 2.8889992237091064 }, { "auxiliary_loss_clip": 0.01183309, "auxiliary_loss_mlp": 0.0102798, "balance_loss_clip": 1.09079707, "balance_loss_mlp": 1.00203979, "epoch": 0.8393807304975199, "flos": 61548233875200.0, "grad_norm": 0.7820621385232547, "language_loss": 0.60725647, "learning_rate": 2.6448694336656397e-07, "loss": 0.62936932, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 0.25976562, "step": 13961, "time_per_iteration": 3.3391737937927246 }, { "auxiliary_loss_clip": 0.01400971, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.23952496, "balance_loss_mlp": 1.01348519, "epoch": 0.8394408537501878, "flos": 14900960384640.0, "grad_norm": 3.0016163955632167, "language_loss": 0.69013262, "learning_rate": 2.642934178894405e-07, "loss": 0.71446961, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19238281, "step": 13962, "time_per_iteration": 2.8231360912323 }, { "auxiliary_loss_clip": 0.01404651, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.24234581, "balance_loss_mlp": 1.01483285, "epoch": 0.8395009770028559, "flos": 17418904796160.0, "grad_norm": 1.8932022499011594, "language_loss": 0.74412638, "learning_rate": 2.640999582304841e-07, "loss": 0.76852453, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20349121, "step": 13963, "time_per_iteration": 2.7899603843688965 }, { "auxiliary_loss_clip": 0.01400741, "auxiliary_loss_mlp": 0.01035664, "balance_loss_clip": 1.24133825, "balance_loss_mlp": 1.01746094, "epoch": 0.8395611002555238, "flos": 27935404012800.0, "grad_norm": 1.5303858358361293, "language_loss": 0.76473188, "learning_rate": 2.6390656439703173e-07, "loss": 0.78909594, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18200684, "step": 13964, "time_per_iteration": 2.905437707901001 }, { "auxiliary_loss_clip": 0.01420285, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.25443769, "balance_loss_mlp": 1.02009773, "epoch": 0.8396212235081918, "flos": 11106203685120.0, "grad_norm": 1.899126811316887, "language_loss": 0.78586125, "learning_rate": 2.637132363964161e-07, "loss": 0.81046355, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 1.66113281, "router_z_loss_mlp": 0.19848633, "step": 13965, "time_per_iteration": 2.81062388420105 }, { "auxiliary_loss_clip": 0.01390577, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 1.23255324, "balance_loss_mlp": 1.01383674, "epoch": 0.8396813467608598, "flos": 35749087989120.0, "grad_norm": 1.4859677547340322, "language_loss": 0.66136396, "learning_rate": 2.635199742359684e-07, "loss": 0.68559694, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18908691, "step": 13966, "time_per_iteration": 2.9869091510772705 }, { "auxiliary_loss_clip": 0.01392111, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.23448873, "balance_loss_mlp": 1.01376534, "epoch": 0.8397414700135277, "flos": 26187781887360.0, "grad_norm": 1.9362657072499365, "language_loss": 0.75405693, "learning_rate": 2.633267779230177e-07, "loss": 0.77830988, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1940918, "step": 13967, "time_per_iteration": 4.403401136398315 }, { "auxiliary_loss_clip": 0.01399362, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.24148583, "balance_loss_mlp": 1.01134026, "epoch": 0.8398015932661957, "flos": 18342354044160.0, "grad_norm": 2.0272156144757765, "language_loss": 0.83148253, "learning_rate": 2.6313364746488974e-07, "loss": 0.85576808, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1784668, "step": 13968, "time_per_iteration": 2.8812739849090576 }, { "auxiliary_loss_clip": 0.0140976, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.24734378, "balance_loss_mlp": 1.01710916, "epoch": 0.8398617165188637, "flos": 17387206174080.0, "grad_norm": 1.9894369146000601, "language_loss": 0.77910006, "learning_rate": 2.629405828689075e-07, "loss": 0.80355227, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18371582, "step": 13969, "time_per_iteration": 2.807709217071533 }, { "auxiliary_loss_clip": 0.01415368, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.25070477, "balance_loss_mlp": 1.01267242, "epoch": 0.8399218397715317, "flos": 22939946058240.0, "grad_norm": 2.0052688715291915, "language_loss": 0.77811712, "learning_rate": 2.627475841423923e-07, "loss": 0.80259806, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.20056152, "step": 13970, "time_per_iteration": 4.2758636474609375 }, { "auxiliary_loss_clip": 0.01416709, "auxiliary_loss_mlp": 0.01039422, "balance_loss_clip": 1.25458407, "balance_loss_mlp": 1.02034855, "epoch": 0.8399819630241996, "flos": 23160135093120.0, "grad_norm": 1.7957056013792068, "language_loss": 0.72356093, "learning_rate": 2.625546512926633e-07, "loss": 0.74812222, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19055176, "step": 13971, "time_per_iteration": 2.8453540802001953 }, { "auxiliary_loss_clip": 0.01402431, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.24173141, "balance_loss_mlp": 1.01418948, "epoch": 0.8400420862768676, "flos": 16405653323520.0, "grad_norm": 1.6414766583973475, "language_loss": 0.78135359, "learning_rate": 2.623617843270358e-07, "loss": 0.80571151, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19165039, "step": 13972, "time_per_iteration": 2.835982322692871 }, { "auxiliary_loss_clip": 0.01395907, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.23892415, "balance_loss_mlp": 1.01245379, "epoch": 0.8401022095295355, "flos": 21297237920640.0, "grad_norm": 1.2748980920187014, "language_loss": 0.68940878, "learning_rate": 2.6216898325282333e-07, "loss": 0.71368235, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19006348, "step": 13973, "time_per_iteration": 2.8782694339752197 }, { "auxiliary_loss_clip": 0.01399249, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.23939323, "balance_loss_mlp": 1.01621294, "epoch": 0.8401623327822035, "flos": 17320234590720.0, "grad_norm": 2.3767853057992423, "language_loss": 0.78276551, "learning_rate": 2.619762480773382e-07, "loss": 0.80711639, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19616699, "step": 13974, "time_per_iteration": 2.8108198642730713 }, { "auxiliary_loss_clip": 0.01398378, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.23662531, "balance_loss_mlp": 1.01322937, "epoch": 0.8402224560348714, "flos": 22246820680320.0, "grad_norm": 1.4365673778628576, "language_loss": 0.73441088, "learning_rate": 2.617835788078868e-07, "loss": 0.75871253, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.1854248, "step": 13975, "time_per_iteration": 2.914034128189087 }, { "auxiliary_loss_clip": 0.01396527, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.23693585, "balance_loss_mlp": 1.01348531, "epoch": 0.8402825792875395, "flos": 20239574037120.0, "grad_norm": 1.7109512656736308, "language_loss": 0.72944504, "learning_rate": 2.6159097545177645e-07, "loss": 0.75373447, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18933105, "step": 13976, "time_per_iteration": 5.592138051986694 }, { "auxiliary_loss_clip": 0.0141134, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.2504189, "balance_loss_mlp": 1.01392603, "epoch": 0.8403427025402074, "flos": 23299145677440.0, "grad_norm": 2.8546557005760547, "language_loss": 0.72964489, "learning_rate": 2.61398438016311e-07, "loss": 0.75408316, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18554688, "step": 13977, "time_per_iteration": 2.9244132041931152 }, { "auxiliary_loss_clip": 0.01405731, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.24366415, "balance_loss_mlp": 1.0128386, "epoch": 0.8404028257928754, "flos": 32689471104000.0, "grad_norm": 7.473532177222819, "language_loss": 0.69331431, "learning_rate": 2.6120596650879043e-07, "loss": 0.71768975, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18969727, "step": 13978, "time_per_iteration": 2.945669651031494 }, { "auxiliary_loss_clip": 0.01379781, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.22434354, "balance_loss_mlp": 1.0145669, "epoch": 0.8404629490455434, "flos": 16189219607040.0, "grad_norm": 2.2243155366717526, "language_loss": 0.78293043, "learning_rate": 2.610135609365145e-07, "loss": 0.80706, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18603516, "step": 13979, "time_per_iteration": 2.8013103008270264 }, { "auxiliary_loss_clip": 0.01406392, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.24607885, "balance_loss_mlp": 1.01642382, "epoch": 0.8405230722982113, "flos": 15202644583680.0, "grad_norm": 1.820207708901894, "language_loss": 0.78726298, "learning_rate": 2.60821221306778e-07, "loss": 0.8116802, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18908691, "step": 13980, "time_per_iteration": 2.895350933074951 }, { "auxiliary_loss_clip": 0.01394193, "auxiliary_loss_mlp": 0.01032281, "balance_loss_clip": 1.23811471, "balance_loss_mlp": 1.01327896, "epoch": 0.8405831955508793, "flos": 27822300716160.0, "grad_norm": 1.9496483916650684, "language_loss": 0.86814475, "learning_rate": 2.606289476268757e-07, "loss": 0.8924095, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.19006348, "step": 13981, "time_per_iteration": 2.8705804347991943 }, { "auxiliary_loss_clip": 0.01397973, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.23919606, "balance_loss_mlp": 1.01484728, "epoch": 0.8406433188035473, "flos": 23780090350080.0, "grad_norm": 10.83214169378516, "language_loss": 0.68579948, "learning_rate": 2.6043673990409745e-07, "loss": 0.71012026, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19250488, "step": 13982, "time_per_iteration": 2.8297414779663086 }, { "auxiliary_loss_clip": 0.01418621, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.25765657, "balance_loss_mlp": 1.01721478, "epoch": 0.8407034420562153, "flos": 29217690921600.0, "grad_norm": 1.6043755784400509, "language_loss": 0.68812549, "learning_rate": 2.602445981457324e-07, "loss": 0.71267569, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19189453, "step": 13983, "time_per_iteration": 2.9001975059509277 }, { "auxiliary_loss_clip": 0.01408723, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.24647093, "balance_loss_mlp": 1.01487958, "epoch": 0.8407635653088832, "flos": 26371431106560.0, "grad_norm": 4.0773522768000126, "language_loss": 0.79893064, "learning_rate": 2.6005252235906684e-07, "loss": 0.82335865, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.1920166, "step": 13984, "time_per_iteration": 2.863291025161743 }, { "auxiliary_loss_clip": 0.01397152, "auxiliary_loss_mlp": 0.01035379, "balance_loss_clip": 1.238464, "balance_loss_mlp": 1.01634109, "epoch": 0.8408236885615512, "flos": 21478308186240.0, "grad_norm": 1.9310216823021238, "language_loss": 0.61477697, "learning_rate": 2.598605125513842e-07, "loss": 0.63910234, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19042969, "step": 13985, "time_per_iteration": 2.8415231704711914 }, { "auxiliary_loss_clip": 0.01395694, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.23365879, "balance_loss_mlp": 1.0151484, "epoch": 0.8408838118142191, "flos": 22973363982720.0, "grad_norm": 1.6570744939726703, "language_loss": 0.82325935, "learning_rate": 2.5966856872996467e-07, "loss": 0.84756339, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19567871, "step": 13986, "time_per_iteration": 2.8335986137390137 }, { "auxiliary_loss_clip": 0.01402578, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.2440877, "balance_loss_mlp": 1.01797652, "epoch": 0.8409439350668871, "flos": 26812035400320.0, "grad_norm": 1.4798586919145345, "language_loss": 0.66696197, "learning_rate": 2.5947669090208755e-07, "loss": 0.6913619, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19433594, "step": 13987, "time_per_iteration": 2.942720413208008 }, { "auxiliary_loss_clip": 0.01407417, "auxiliary_loss_mlp": 0.0103569, "balance_loss_clip": 1.24867523, "balance_loss_mlp": 1.01625896, "epoch": 0.841004058319555, "flos": 26589991328640.0, "grad_norm": 1.9381844313545067, "language_loss": 0.67870718, "learning_rate": 2.5928487907502906e-07, "loss": 0.70313823, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19433594, "step": 13988, "time_per_iteration": 2.9030158519744873 }, { "auxiliary_loss_clip": 0.0142058, "auxiliary_loss_mlp": 0.01036487, "balance_loss_clip": 1.25630975, "balance_loss_mlp": 1.01620984, "epoch": 0.8410641815722231, "flos": 14510062143360.0, "grad_norm": 3.161528759633698, "language_loss": 0.82151449, "learning_rate": 2.590931332560622e-07, "loss": 0.84608519, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20263672, "step": 13989, "time_per_iteration": 2.8576128482818604 }, { "auxiliary_loss_clip": 0.01417172, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.25343299, "balance_loss_mlp": 1.01565242, "epoch": 0.841124304824891, "flos": 29178029214720.0, "grad_norm": 1.9749868923677618, "language_loss": 0.76186585, "learning_rate": 2.5890145345245826e-07, "loss": 0.78637826, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18395996, "step": 13990, "time_per_iteration": 2.9193875789642334 }, { "auxiliary_loss_clip": 0.0138799, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.23160708, "balance_loss_mlp": 1.0180006, "epoch": 0.841184428077559, "flos": 22420380309120.0, "grad_norm": 1.8169122782514824, "language_loss": 0.81004679, "learning_rate": 2.5870983967148597e-07, "loss": 0.83430207, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.19543457, "step": 13991, "time_per_iteration": 2.857771635055542 }, { "auxiliary_loss_clip": 0.01394114, "auxiliary_loss_mlp": 0.01028446, "balance_loss_clip": 1.23481679, "balance_loss_mlp": 1.01007605, "epoch": 0.841244551330227, "flos": 22972187617920.0, "grad_norm": 2.1073268630272635, "language_loss": 0.713925, "learning_rate": 2.585182919204105e-07, "loss": 0.7381506, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18371582, "step": 13992, "time_per_iteration": 2.9334957599639893 }, { "auxiliary_loss_clip": 0.01404595, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.24346304, "balance_loss_mlp": 1.01078129, "epoch": 0.8413046745828949, "flos": 21042590330880.0, "grad_norm": 1.5894082270112864, "language_loss": 0.77407259, "learning_rate": 2.583268102064959e-07, "loss": 0.79840684, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18066406, "step": 13993, "time_per_iteration": 2.8471293449401855 }, { "auxiliary_loss_clip": 0.01426009, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.25944042, "balance_loss_mlp": 1.01536655, "epoch": 0.841364797835563, "flos": 27063289630080.0, "grad_norm": 2.032309038095702, "language_loss": 0.74780703, "learning_rate": 2.5813539453700393e-07, "loss": 0.77242124, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.20043945, "step": 13994, "time_per_iteration": 3.020331859588623 }, { "auxiliary_loss_clip": 0.01389336, "auxiliary_loss_mlp": 0.01032617, "balance_loss_clip": 1.23436773, "balance_loss_mlp": 1.01412725, "epoch": 0.8414249210882309, "flos": 17904916886400.0, "grad_norm": 1.795483429548079, "language_loss": 0.60257363, "learning_rate": 2.5794404491919163e-07, "loss": 0.62679315, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.18505859, "step": 13995, "time_per_iteration": 2.839932680130005 }, { "auxiliary_loss_clip": 0.01403419, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.24374342, "balance_loss_mlp": 1.01245248, "epoch": 0.8414850443408989, "flos": 25450560812160.0, "grad_norm": 1.585408860258319, "language_loss": 0.7236557, "learning_rate": 2.577527613603163e-07, "loss": 0.7480033, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18908691, "step": 13996, "time_per_iteration": 2.8335020542144775 }, { "auxiliary_loss_clip": 0.01391194, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.23235703, "balance_loss_mlp": 1.01525521, "epoch": 0.8415451675935668, "flos": 23230002343680.0, "grad_norm": 1.839808101686761, "language_loss": 0.65414762, "learning_rate": 2.5756154386763017e-07, "loss": 0.67841017, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19812012, "step": 13997, "time_per_iteration": 2.8677492141723633 }, { "auxiliary_loss_clip": 0.01419395, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.25338411, "balance_loss_mlp": 1.01454806, "epoch": 0.8416052908462348, "flos": 18554353770240.0, "grad_norm": 1.9356833364701034, "language_loss": 0.82644135, "learning_rate": 2.5737039244838565e-07, "loss": 0.85098028, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19958496, "step": 13998, "time_per_iteration": 2.8125858306884766 }, { "auxiliary_loss_clip": 0.0139991, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.23861814, "balance_loss_mlp": 1.01228273, "epoch": 0.8416654140989027, "flos": 26116602537600.0, "grad_norm": 1.4281029134850054, "language_loss": 0.80963486, "learning_rate": 2.5717930710982984e-07, "loss": 0.83395493, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19812012, "step": 13999, "time_per_iteration": 2.904327869415283 }, { "auxiliary_loss_clip": 0.01417267, "auxiliary_loss_mlp": 0.0103735, "balance_loss_clip": 1.25314426, "balance_loss_mlp": 1.01709592, "epoch": 0.8417255373515707, "flos": 26444691717120.0, "grad_norm": 2.614780787618831, "language_loss": 0.67592084, "learning_rate": 2.569882878592096e-07, "loss": 0.70046699, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.20251465, "step": 14000, "time_per_iteration": 2.9527430534362793 }, { "auxiliary_loss_clip": 0.01413144, "auxiliary_loss_mlp": 0.01031231, "balance_loss_clip": 1.25063562, "balance_loss_mlp": 1.01199055, "epoch": 0.8417856606042387, "flos": 24728541989760.0, "grad_norm": 2.0688942998555673, "language_loss": 0.80078554, "learning_rate": 2.5679733470376885e-07, "loss": 0.82522935, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19238281, "step": 14001, "time_per_iteration": 2.8715648651123047 }, { "auxiliary_loss_clip": 0.01388933, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.22924399, "balance_loss_mlp": 1.01111901, "epoch": 0.8418457838569067, "flos": 20860796148480.0, "grad_norm": 1.633157585357178, "language_loss": 0.79269814, "learning_rate": 2.5660644765074703e-07, "loss": 0.81687671, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17810059, "step": 14002, "time_per_iteration": 4.279980659484863 }, { "auxiliary_loss_clip": 0.01387431, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.22944629, "balance_loss_mlp": 1.01344419, "epoch": 0.8419059071095746, "flos": 28673077536000.0, "grad_norm": 1.464944761651403, "language_loss": 0.78592384, "learning_rate": 2.5641562670738334e-07, "loss": 0.81012857, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19604492, "step": 14003, "time_per_iteration": 2.9127037525177 }, { "auxiliary_loss_clip": 0.01399814, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.23929751, "balance_loss_mlp": 1.01732373, "epoch": 0.8419660303622426, "flos": 21663767197440.0, "grad_norm": 1.611493621218885, "language_loss": 0.66163713, "learning_rate": 2.5622487188091436e-07, "loss": 0.68601, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20153809, "step": 14004, "time_per_iteration": 2.8231866359710693 }, { "auxiliary_loss_clip": 0.01414874, "auxiliary_loss_mlp": 0.01034245, "balance_loss_clip": 1.25103974, "balance_loss_mlp": 1.01439667, "epoch": 0.8420261536149106, "flos": 25311957431040.0, "grad_norm": 2.248147721712523, "language_loss": 0.76975018, "learning_rate": 2.560341831785724e-07, "loss": 0.79424137, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19848633, "step": 14005, "time_per_iteration": 4.3720128536224365 }, { "auxiliary_loss_clip": 0.01410089, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.24692369, "balance_loss_mlp": 1.01382709, "epoch": 0.8420862768675785, "flos": 18770787486720.0, "grad_norm": 1.6737763087526984, "language_loss": 0.78879356, "learning_rate": 2.5584356060758906e-07, "loss": 0.81322759, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19494629, "step": 14006, "time_per_iteration": 2.8771321773529053 }, { "auxiliary_loss_clip": 0.01394934, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.23615634, "balance_loss_mlp": 1.01414371, "epoch": 0.8421464001202466, "flos": 18335974527360.0, "grad_norm": 1.7427925446765211, "language_loss": 0.77876103, "learning_rate": 2.556530041751932e-07, "loss": 0.80303812, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18640137, "step": 14007, "time_per_iteration": 2.8132781982421875 }, { "auxiliary_loss_clip": 0.01410241, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.24856305, "balance_loss_mlp": 1.01416469, "epoch": 0.8422065233729145, "flos": 31548864222720.0, "grad_norm": 1.7885499402004554, "language_loss": 0.66601896, "learning_rate": 2.554625138886102e-07, "loss": 0.69045711, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1940918, "step": 14008, "time_per_iteration": 2.9286139011383057 }, { "auxiliary_loss_clip": 0.01186723, "auxiliary_loss_mlp": 0.01028146, "balance_loss_clip": 1.09479237, "balance_loss_mlp": 1.00602078, "epoch": 0.8422666466255825, "flos": 64326933924480.0, "grad_norm": 0.7185765739731961, "language_loss": 0.57020092, "learning_rate": 2.552720897550631e-07, "loss": 0.59234965, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 0.22167969, "step": 14009, "time_per_iteration": 3.4123823642730713 }, { "auxiliary_loss_clip": 0.01394996, "auxiliary_loss_mlp": 0.01033248, "balance_loss_clip": 1.23763061, "balance_loss_mlp": 1.01550937, "epoch": 0.8423267698782504, "flos": 24327508913280.0, "grad_norm": 1.193180299352751, "language_loss": 0.78302956, "learning_rate": 2.5508173178177304e-07, "loss": 0.80731201, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.17724609, "step": 14010, "time_per_iteration": 4.339692115783691 }, { "auxiliary_loss_clip": 0.01412843, "auxiliary_loss_mlp": 0.0104226, "balance_loss_clip": 1.25071514, "balance_loss_mlp": 1.02169657, "epoch": 0.8423868931309184, "flos": 18305180801280.0, "grad_norm": 1.8978786799752194, "language_loss": 0.7359935, "learning_rate": 2.548914399759592e-07, "loss": 0.76054454, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20556641, "step": 14011, "time_per_iteration": 4.234750986099243 }, { "auxiliary_loss_clip": 0.01407691, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.24724996, "balance_loss_mlp": 1.01488614, "epoch": 0.8424470163835863, "flos": 23560715721600.0, "grad_norm": 5.899233809465174, "language_loss": 0.85457611, "learning_rate": 2.5470121434483636e-07, "loss": 0.87899023, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18835449, "step": 14012, "time_per_iteration": 2.871147871017456 }, { "auxiliary_loss_clip": 0.0137625, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.22407401, "balance_loss_mlp": 1.0122602, "epoch": 0.8425071396362543, "flos": 23780135594880.0, "grad_norm": 1.7313952169243632, "language_loss": 0.68472558, "learning_rate": 2.5451105489561884e-07, "loss": 0.70880663, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 1.52148438, "router_z_loss_mlp": 0.19604492, "step": 14013, "time_per_iteration": 2.9019460678100586 }, { "auxiliary_loss_clip": 0.01402437, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.23883629, "balance_loss_mlp": 1.01500654, "epoch": 0.8425672628889223, "flos": 16187183591040.0, "grad_norm": 2.184885725926289, "language_loss": 0.79905903, "learning_rate": 2.5432096163551644e-07, "loss": 0.82342827, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19494629, "step": 14014, "time_per_iteration": 2.8369927406311035 }, { "auxiliary_loss_clip": 0.01398523, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.2377454, "balance_loss_mlp": 1.01653612, "epoch": 0.8426273861415903, "flos": 23159592155520.0, "grad_norm": 2.6997474836999675, "language_loss": 0.68389481, "learning_rate": 2.5413093457173884e-07, "loss": 0.70824122, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19580078, "step": 14015, "time_per_iteration": 2.821166753768921 }, { "auxiliary_loss_clip": 0.01405811, "auxiliary_loss_mlp": 0.0103282, "balance_loss_clip": 1.24511862, "balance_loss_mlp": 1.01359177, "epoch": 0.8426875093942582, "flos": 17466846301440.0, "grad_norm": 3.8025805908371404, "language_loss": 0.76868343, "learning_rate": 2.5394097371149036e-07, "loss": 0.79306972, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19226074, "step": 14016, "time_per_iteration": 2.8158862590789795 }, { "auxiliary_loss_clip": 0.01401445, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.24006104, "balance_loss_mlp": 1.01813316, "epoch": 0.8427476326469262, "flos": 19648557469440.0, "grad_norm": 1.9541873987553064, "language_loss": 0.80024534, "learning_rate": 2.5375107906197544e-07, "loss": 0.82463515, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19384766, "step": 14017, "time_per_iteration": 2.81432843208313 }, { "auxiliary_loss_clip": 0.01397291, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.23745728, "balance_loss_mlp": 1.01515448, "epoch": 0.8428077558995941, "flos": 11946936159360.0, "grad_norm": 2.6194199379643544, "language_loss": 0.63767236, "learning_rate": 2.5356125063039525e-07, "loss": 0.66198289, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18603516, "step": 14018, "time_per_iteration": 2.7852909564971924 }, { "auxiliary_loss_clip": 0.0139755, "auxiliary_loss_mlp": 0.01035351, "balance_loss_clip": 1.23768008, "balance_loss_mlp": 1.01673102, "epoch": 0.8428678791522621, "flos": 10458802817280.0, "grad_norm": 1.843133297419182, "language_loss": 0.7968778, "learning_rate": 2.5337148842394687e-07, "loss": 0.82120681, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18615723, "step": 14019, "time_per_iteration": 2.961827516555786 }, { "auxiliary_loss_clip": 0.01408013, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.24648499, "balance_loss_mlp": 1.01389933, "epoch": 0.8429280024049302, "flos": 28778986909440.0, "grad_norm": 2.0026458643645126, "language_loss": 0.79005104, "learning_rate": 2.531817924498265e-07, "loss": 0.81445789, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18774414, "step": 14020, "time_per_iteration": 2.9138615131378174 }, { "auxiliary_loss_clip": 0.01402679, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.24252677, "balance_loss_mlp": 1.01335919, "epoch": 0.8429881256575981, "flos": 19546629638400.0, "grad_norm": 1.7089695759067411, "language_loss": 0.72037727, "learning_rate": 2.5299216271522805e-07, "loss": 0.74472541, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18762207, "step": 14021, "time_per_iteration": 2.829550266265869 }, { "auxiliary_loss_clip": 0.01420801, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.25734258, "balance_loss_mlp": 1.01669693, "epoch": 0.8430482489102661, "flos": 24801802600320.0, "grad_norm": 2.2519606427381214, "language_loss": 0.70545435, "learning_rate": 2.5280259922734125e-07, "loss": 0.73002911, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19995117, "step": 14022, "time_per_iteration": 2.8840301036834717 }, { "auxiliary_loss_clip": 0.01417491, "auxiliary_loss_mlp": 0.01035859, "balance_loss_clip": 1.25436699, "balance_loss_mlp": 1.01648736, "epoch": 0.843108372162934, "flos": 21554419219200.0, "grad_norm": 3.157009418094002, "language_loss": 0.73144579, "learning_rate": 2.526131019933553e-07, "loss": 0.75597924, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19360352, "step": 14023, "time_per_iteration": 2.893638849258423 }, { "auxiliary_loss_clip": 0.01401371, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.24172664, "balance_loss_mlp": 1.01648021, "epoch": 0.843168495415602, "flos": 24619691704320.0, "grad_norm": 1.4938412093004372, "language_loss": 0.67298329, "learning_rate": 2.524236710204559e-07, "loss": 0.69736743, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.20568848, "step": 14024, "time_per_iteration": 2.92244815826416 }, { "auxiliary_loss_clip": 0.01388652, "auxiliary_loss_mlp": 0.01035314, "balance_loss_clip": 1.23159051, "balance_loss_mlp": 1.0160737, "epoch": 0.8432286186682699, "flos": 15131239009920.0, "grad_norm": 1.667957607068702, "language_loss": 0.81677973, "learning_rate": 2.522343063158261e-07, "loss": 0.84101939, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19250488, "step": 14025, "time_per_iteration": 2.8040194511413574 }, { "auxiliary_loss_clip": 0.01396167, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.23940563, "balance_loss_mlp": 1.01594448, "epoch": 0.843288741920938, "flos": 20311251079680.0, "grad_norm": 1.9945489373733447, "language_loss": 0.77704805, "learning_rate": 2.5204500788664606e-07, "loss": 0.8013494, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18029785, "step": 14026, "time_per_iteration": 2.876418113708496 }, { "auxiliary_loss_clip": 0.01388701, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.23132992, "balance_loss_mlp": 1.01251709, "epoch": 0.8433488651736059, "flos": 23342743681920.0, "grad_norm": 1.4648095541988384, "language_loss": 0.83399206, "learning_rate": 2.518557757400945e-07, "loss": 0.85819727, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19299316, "step": 14027, "time_per_iteration": 2.8891735076904297 }, { "auxiliary_loss_clip": 0.01393836, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 1.23467183, "balance_loss_mlp": 1.01253247, "epoch": 0.8434089884262739, "flos": 39472755828480.0, "grad_norm": 1.378020030534551, "language_loss": 0.56909472, "learning_rate": 2.5166660988334754e-07, "loss": 0.59335202, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19360352, "step": 14028, "time_per_iteration": 3.07761812210083 }, { "auxiliary_loss_clip": 0.01400661, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.24057674, "balance_loss_mlp": 1.01138484, "epoch": 0.8434691116789418, "flos": 23779321188480.0, "grad_norm": 1.69082563606802, "language_loss": 0.6446172, "learning_rate": 2.51477510323578e-07, "loss": 0.66892815, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19042969, "step": 14029, "time_per_iteration": 2.9855735301971436 }, { "auxiliary_loss_clip": 0.01392125, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.23664927, "balance_loss_mlp": 1.00891948, "epoch": 0.8435292349316098, "flos": 22680864478080.0, "grad_norm": 1.6694303451597923, "language_loss": 0.76098561, "learning_rate": 2.51288477067956e-07, "loss": 0.78519237, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.19628906, "step": 14030, "time_per_iteration": 2.9074184894561768 }, { "auxiliary_loss_clip": 0.01386752, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.23079336, "balance_loss_mlp": 1.0164454, "epoch": 0.8435893581842777, "flos": 18852916078080.0, "grad_norm": 1.801988249929396, "language_loss": 0.84223819, "learning_rate": 2.510995101236502e-07, "loss": 0.86645961, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18933105, "step": 14031, "time_per_iteration": 2.899329423904419 }, { "auxiliary_loss_clip": 0.01393593, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 1.23636222, "balance_loss_mlp": 1.01484632, "epoch": 0.8436494814369457, "flos": 20713958213760.0, "grad_norm": 3.6896473173667053, "language_loss": 0.81289172, "learning_rate": 2.509106094978266e-07, "loss": 0.83716547, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18920898, "step": 14032, "time_per_iteration": 2.9662675857543945 }, { "auxiliary_loss_clip": 0.01400196, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.24037409, "balance_loss_mlp": 1.0157423, "epoch": 0.8437096046896138, "flos": 22684348327680.0, "grad_norm": 1.525456629786665, "language_loss": 0.76273894, "learning_rate": 2.507217751976478e-07, "loss": 0.78709632, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19812012, "step": 14033, "time_per_iteration": 3.0021893978118896 }, { "auxiliary_loss_clip": 0.01400641, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.24146628, "balance_loss_mlp": 1.01513863, "epoch": 0.8437697279422817, "flos": 16188721914240.0, "grad_norm": 1.7652070908045971, "language_loss": 0.83980298, "learning_rate": 2.505330072302743e-07, "loss": 0.86414444, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18371582, "step": 14034, "time_per_iteration": 2.9431095123291016 }, { "auxiliary_loss_clip": 0.01392578, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.23262691, "balance_loss_mlp": 1.01235414, "epoch": 0.8438298511949497, "flos": 28777222362240.0, "grad_norm": 1.5164761923902352, "language_loss": 0.78895748, "learning_rate": 2.503443056028656e-07, "loss": 0.81320357, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19665527, "step": 14035, "time_per_iteration": 2.912457227706909 }, { "auxiliary_loss_clip": 0.01400127, "auxiliary_loss_mlp": 0.01033985, "balance_loss_clip": 1.24066854, "balance_loss_mlp": 1.01473308, "epoch": 0.8438899744476176, "flos": 33736004766720.0, "grad_norm": 1.5953412901726294, "language_loss": 0.72733366, "learning_rate": 2.501556703225751e-07, "loss": 0.75167477, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19250488, "step": 14036, "time_per_iteration": 2.972230911254883 }, { "auxiliary_loss_clip": 0.01390745, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.23520398, "balance_loss_mlp": 1.01175594, "epoch": 0.8439500977002856, "flos": 25119530720640.0, "grad_norm": 1.679280939655633, "language_loss": 0.69888151, "learning_rate": 2.49967101396557e-07, "loss": 0.72308743, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18103027, "step": 14037, "time_per_iteration": 4.297449827194214 }, { "auxiliary_loss_clip": 0.01385519, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.22826636, "balance_loss_mlp": 1.01522994, "epoch": 0.8440102209529535, "flos": 32862306816000.0, "grad_norm": 1.5125375961035357, "language_loss": 0.69863766, "learning_rate": 2.4977859883196227e-07, "loss": 0.72283363, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18859863, "step": 14038, "time_per_iteration": 2.9468817710876465 }, { "auxiliary_loss_clip": 0.01395971, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.23572791, "balance_loss_mlp": 1.01277399, "epoch": 0.8440703442056215, "flos": 23740111929600.0, "grad_norm": 1.6153001544148324, "language_loss": 0.7682991, "learning_rate": 2.49590162635938e-07, "loss": 0.79258335, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19689941, "step": 14039, "time_per_iteration": 2.8628385066986084 }, { "auxiliary_loss_clip": 0.0141767, "auxiliary_loss_mlp": 0.01030585, "balance_loss_clip": 1.25212717, "balance_loss_mlp": 1.01234567, "epoch": 0.8441304674582895, "flos": 20203396179840.0, "grad_norm": 1.9768555654996691, "language_loss": 0.80054134, "learning_rate": 2.4940179281563046e-07, "loss": 0.82502389, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.18237305, "step": 14040, "time_per_iteration": 4.302589654922485 }, { "auxiliary_loss_clip": 0.01397663, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.2381866, "balance_loss_mlp": 1.01541829, "epoch": 0.8441905907109575, "flos": 20226905493120.0, "grad_norm": 1.9506797322979943, "language_loss": 0.70340782, "learning_rate": 2.492134893781821e-07, "loss": 0.72773015, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19165039, "step": 14041, "time_per_iteration": 2.8281705379486084 }, { "auxiliary_loss_clip": 0.01398986, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.236763, "balance_loss_mlp": 1.01600313, "epoch": 0.8442507139636254, "flos": 13524301526400.0, "grad_norm": 4.700954723765955, "language_loss": 0.70168012, "learning_rate": 2.490252523307341e-07, "loss": 0.72603381, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20397949, "step": 14042, "time_per_iteration": 2.8498895168304443 }, { "auxiliary_loss_clip": 0.01389365, "auxiliary_loss_mlp": 0.0103246, "balance_loss_clip": 1.23159695, "balance_loss_mlp": 1.01344562, "epoch": 0.8443108372162934, "flos": 18228526830720.0, "grad_norm": 1.9204816030127811, "language_loss": 0.75885171, "learning_rate": 2.4883708168042373e-07, "loss": 0.78306997, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19018555, "step": 14043, "time_per_iteration": 2.810023069381714 }, { "auxiliary_loss_clip": 0.01394173, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.23521185, "balance_loss_mlp": 1.01288188, "epoch": 0.8443709604689613, "flos": 16113153818880.0, "grad_norm": 2.2241752936274186, "language_loss": 0.72868967, "learning_rate": 2.486489774343865e-07, "loss": 0.75294352, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18334961, "step": 14044, "time_per_iteration": 2.8344521522521973 }, { "auxiliary_loss_clip": 0.01382622, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.22641492, "balance_loss_mlp": 1.01350784, "epoch": 0.8444310837216293, "flos": 18520528642560.0, "grad_norm": 1.60083202108416, "language_loss": 0.75549293, "learning_rate": 2.484609395997559e-07, "loss": 0.77964115, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18688965, "step": 14045, "time_per_iteration": 4.23340630531311 }, { "auxiliary_loss_clip": 0.01390044, "auxiliary_loss_mlp": 0.01033152, "balance_loss_clip": 1.2318424, "balance_loss_mlp": 1.01423311, "epoch": 0.8444912069742974, "flos": 14948177973120.0, "grad_norm": 7.105024923158671, "language_loss": 0.79303914, "learning_rate": 2.4827296818366216e-07, "loss": 0.81727111, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18908691, "step": 14046, "time_per_iteration": 4.300089120864868 }, { "auxiliary_loss_clip": 0.0139512, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.23520589, "balance_loss_mlp": 1.01203382, "epoch": 0.8445513302269653, "flos": 20129864100480.0, "grad_norm": 2.0641921791428137, "language_loss": 0.78594232, "learning_rate": 2.4808506319323255e-07, "loss": 0.81020957, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19580078, "step": 14047, "time_per_iteration": 2.8362908363342285 }, { "auxiliary_loss_clip": 0.01393087, "auxiliary_loss_mlp": 0.0103231, "balance_loss_clip": 1.23529971, "balance_loss_mlp": 1.01289129, "epoch": 0.8446114534796333, "flos": 31182153966720.0, "grad_norm": 1.7229578097202274, "language_loss": 0.72707033, "learning_rate": 2.478972246355935e-07, "loss": 0.7513243, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19421387, "step": 14048, "time_per_iteration": 2.9037089347839355 }, { "auxiliary_loss_clip": 0.01397783, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.23913991, "balance_loss_mlp": 1.01649404, "epoch": 0.8446715767323012, "flos": 23958083969280.0, "grad_norm": 1.5303384440972754, "language_loss": 0.73866558, "learning_rate": 2.477094525178667e-07, "loss": 0.7630024, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19421387, "step": 14049, "time_per_iteration": 3.0763416290283203 }, { "auxiliary_loss_clip": 0.01183768, "auxiliary_loss_mlp": 0.01023691, "balance_loss_clip": 1.09254694, "balance_loss_mlp": 1.00023019, "epoch": 0.8447316999849692, "flos": 68015102578560.0, "grad_norm": 0.8098737491884259, "language_loss": 0.60723925, "learning_rate": 2.475217468471729e-07, "loss": 0.62931383, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.234375, "step": 14050, "time_per_iteration": 3.2833986282348633 }, { "auxiliary_loss_clip": 0.01390255, "auxiliary_loss_mlp": 0.01031518, "balance_loss_clip": 1.23118401, "balance_loss_mlp": 1.01234961, "epoch": 0.8447918232376371, "flos": 22429067310720.0, "grad_norm": 2.2687726556183576, "language_loss": 0.73104244, "learning_rate": 2.473341076306303e-07, "loss": 0.75526011, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19165039, "step": 14051, "time_per_iteration": 2.8351612091064453 }, { "auxiliary_loss_clip": 0.01390097, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.23277211, "balance_loss_mlp": 1.01567125, "epoch": 0.8448519464903052, "flos": 23704160296320.0, "grad_norm": 1.8473062092196968, "language_loss": 0.75881815, "learning_rate": 2.471465348753547e-07, "loss": 0.78306687, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19116211, "step": 14052, "time_per_iteration": 2.8575470447540283 }, { "auxiliary_loss_clip": 0.01377605, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.2247299, "balance_loss_mlp": 1.01596463, "epoch": 0.8449120697429731, "flos": 13743947623680.0, "grad_norm": 1.8851632553705528, "language_loss": 0.74547708, "learning_rate": 2.469590285884575e-07, "loss": 0.76958907, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.17626953, "step": 14053, "time_per_iteration": 2.8978829383850098 }, { "auxiliary_loss_clip": 0.01400379, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.2424866, "balance_loss_mlp": 1.01244485, "epoch": 0.8449721929956411, "flos": 20896612047360.0, "grad_norm": 1.7014553033474542, "language_loss": 0.75862479, "learning_rate": 2.467715887770494e-07, "loss": 0.78294337, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19030762, "step": 14054, "time_per_iteration": 2.894753932952881 }, { "auxiliary_loss_clip": 0.01417086, "auxiliary_loss_mlp": 0.01030016, "balance_loss_clip": 1.25263476, "balance_loss_mlp": 1.01093078, "epoch": 0.845032316248309, "flos": 33229424275200.0, "grad_norm": 3.671333187467097, "language_loss": 0.78786051, "learning_rate": 2.4658421544823895e-07, "loss": 0.81233156, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19091797, "step": 14055, "time_per_iteration": 2.957921266555786 }, { "auxiliary_loss_clip": 0.01391817, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.23378897, "balance_loss_mlp": 1.01295674, "epoch": 0.845092439500977, "flos": 23595355255680.0, "grad_norm": 1.7689835384333112, "language_loss": 0.73786122, "learning_rate": 2.463969086091302e-07, "loss": 0.76210189, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19287109, "step": 14056, "time_per_iteration": 2.851419687271118 }, { "auxiliary_loss_clip": 0.01406166, "auxiliary_loss_mlp": 0.01037132, "balance_loss_clip": 1.24323201, "balance_loss_mlp": 1.01754582, "epoch": 0.8451525627536449, "flos": 13341692937600.0, "grad_norm": 2.6701080559504726, "language_loss": 0.69171166, "learning_rate": 2.4620966826682686e-07, "loss": 0.71614468, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19604492, "step": 14057, "time_per_iteration": 2.814854621887207 }, { "auxiliary_loss_clip": 0.01397435, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.23817134, "balance_loss_mlp": 1.01174629, "epoch": 0.8452126860063129, "flos": 27829313660160.0, "grad_norm": 1.7300387686375354, "language_loss": 0.78243804, "learning_rate": 2.460224944284284e-07, "loss": 0.80672628, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19665527, "step": 14058, "time_per_iteration": 2.8979766368865967 }, { "auxiliary_loss_clip": 0.01403679, "auxiliary_loss_mlp": 0.01036125, "balance_loss_clip": 1.24287021, "balance_loss_mlp": 1.01696837, "epoch": 0.845272809258981, "flos": 27135826323840.0, "grad_norm": 1.8755313915809648, "language_loss": 0.70109606, "learning_rate": 2.45835387101033e-07, "loss": 0.72549415, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19177246, "step": 14059, "time_per_iteration": 2.8769338130950928 }, { "auxiliary_loss_clip": 0.01417166, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.25158858, "balance_loss_mlp": 1.01561022, "epoch": 0.8453329325116489, "flos": 18341675372160.0, "grad_norm": 2.372029097125862, "language_loss": 0.59000075, "learning_rate": 2.4564834629173516e-07, "loss": 0.61452538, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19689941, "step": 14060, "time_per_iteration": 2.881788492202759 }, { "auxiliary_loss_clip": 0.01416742, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.2520566, "balance_loss_mlp": 1.01490605, "epoch": 0.8453930557643169, "flos": 22685705671680.0, "grad_norm": 1.7081899711250015, "language_loss": 0.76331317, "learning_rate": 2.454613720076277e-07, "loss": 0.78782994, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20019531, "step": 14061, "time_per_iteration": 2.9991226196289062 }, { "auxiliary_loss_clip": 0.01403486, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.24060106, "balance_loss_mlp": 1.01238239, "epoch": 0.8454531790169848, "flos": 22496219873280.0, "grad_norm": 2.2299601130342364, "language_loss": 0.71365166, "learning_rate": 2.452744642558013e-07, "loss": 0.73801816, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.2076416, "step": 14062, "time_per_iteration": 3.018763542175293 }, { "auxiliary_loss_clip": 0.01183974, "auxiliary_loss_mlp": 0.01024561, "balance_loss_clip": 1.09306479, "balance_loss_mlp": 1.00310373, "epoch": 0.8455133022696528, "flos": 58305058260480.0, "grad_norm": 0.6321745036081599, "language_loss": 0.52633214, "learning_rate": 2.450876230433432e-07, "loss": 0.54841751, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.21484375, "step": 14063, "time_per_iteration": 3.4655587673187256 }, { "auxiliary_loss_clip": 0.01390443, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.23465967, "balance_loss_mlp": 1.01455307, "epoch": 0.8455734255223207, "flos": 21371267692800.0, "grad_norm": 1.9764796128059385, "language_loss": 0.82882196, "learning_rate": 2.449008483773378e-07, "loss": 0.8530525, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18054199, "step": 14064, "time_per_iteration": 2.8386967182159424 }, { "auxiliary_loss_clip": 0.01409268, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.24672747, "balance_loss_mlp": 1.01328182, "epoch": 0.8456335487749888, "flos": 20459039155200.0, "grad_norm": 2.0067298271418528, "language_loss": 0.73294067, "learning_rate": 2.447141402648685e-07, "loss": 0.75737, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.20385742, "step": 14065, "time_per_iteration": 2.8496978282928467 }, { "auxiliary_loss_clip": 0.01379985, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.22485352, "balance_loss_mlp": 1.01337278, "epoch": 0.8456936720276567, "flos": 28852835702400.0, "grad_norm": 1.4510701780055577, "language_loss": 0.78220761, "learning_rate": 2.445274987130146e-07, "loss": 0.80632144, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18017578, "step": 14066, "time_per_iteration": 2.9248321056365967 }, { "auxiliary_loss_clip": 0.01398685, "auxiliary_loss_mlp": 0.01036135, "balance_loss_clip": 1.23871207, "balance_loss_mlp": 1.01718104, "epoch": 0.8457537952803247, "flos": 22682719514880.0, "grad_norm": 1.6260914943367188, "language_loss": 0.70481652, "learning_rate": 2.4434092372885363e-07, "loss": 0.72916472, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1895752, "step": 14067, "time_per_iteration": 2.880302667617798 }, { "auxiliary_loss_clip": 0.01396593, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.23845983, "balance_loss_mlp": 1.0132935, "epoch": 0.8458139185329926, "flos": 33816142586880.0, "grad_norm": 1.8111612529228476, "language_loss": 0.72121322, "learning_rate": 2.4415441531946144e-07, "loss": 0.74550676, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19458008, "step": 14068, "time_per_iteration": 2.9755825996398926 }, { "auxiliary_loss_clip": 0.01185967, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.09431696, "balance_loss_mlp": 1.01702833, "epoch": 0.8458740417856606, "flos": 70329960489600.0, "grad_norm": 0.6937517197851208, "language_loss": 0.60526264, "learning_rate": 2.4396797349190976e-07, "loss": 0.6275692, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 0.27734375, "step": 14069, "time_per_iteration": 3.4512226581573486 }, { "auxiliary_loss_clip": 0.01397515, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.23582983, "balance_loss_mlp": 1.01248813, "epoch": 0.8459341650383285, "flos": 24181621119360.0, "grad_norm": 1.4284551395211194, "language_loss": 0.74987113, "learning_rate": 2.4378159825326804e-07, "loss": 0.77415031, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.17932129, "step": 14070, "time_per_iteration": 2.9037904739379883 }, { "auxiliary_loss_clip": 0.01394924, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.23614168, "balance_loss_mlp": 1.01279235, "epoch": 0.8459942882909965, "flos": 38195491092480.0, "grad_norm": 1.5570534143166308, "language_loss": 0.67560756, "learning_rate": 2.435952896106039e-07, "loss": 0.69987047, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18579102, "step": 14071, "time_per_iteration": 2.975773572921753 }, { "auxiliary_loss_clip": 0.01183041, "auxiliary_loss_mlp": 0.01033356, "balance_loss_clip": 1.09303582, "balance_loss_mlp": 1.01227999, "epoch": 0.8460544115436646, "flos": 64147673450880.0, "grad_norm": 0.738063260810199, "language_loss": 0.61062402, "learning_rate": 2.4340904757098313e-07, "loss": 0.63278794, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.2109375, "step": 14072, "time_per_iteration": 4.529989957809448 }, { "auxiliary_loss_clip": 0.01403547, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 1.24147129, "balance_loss_mlp": 1.01363242, "epoch": 0.8461145347963325, "flos": 24181575874560.0, "grad_norm": 1.8996454302059484, "language_loss": 0.73456872, "learning_rate": 2.4322287214146664e-07, "loss": 0.75894523, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20471191, "step": 14073, "time_per_iteration": 2.8788063526153564 }, { "auxiliary_loss_clip": 0.01430995, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.26275611, "balance_loss_mlp": 1.01112151, "epoch": 0.8461746580490005, "flos": 34906319498880.0, "grad_norm": 1.5968170621135314, "language_loss": 0.78562951, "learning_rate": 2.430367633291155e-07, "loss": 0.81025785, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20703125, "step": 14074, "time_per_iteration": 2.974012613296509 }, { "auxiliary_loss_clip": 0.01394466, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.23484445, "balance_loss_mlp": 1.01577556, "epoch": 0.8462347813016684, "flos": 25568052854400.0, "grad_norm": 2.2338889269943665, "language_loss": 0.75849229, "learning_rate": 2.4285072114098583e-07, "loss": 0.78279018, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19555664, "step": 14075, "time_per_iteration": 4.294775009155273 }, { "auxiliary_loss_clip": 0.01384875, "auxiliary_loss_mlp": 0.01034282, "balance_loss_clip": 1.22780919, "balance_loss_mlp": 1.01534009, "epoch": 0.8462949045543364, "flos": 21335451793920.0, "grad_norm": 2.3466413463148883, "language_loss": 0.74136019, "learning_rate": 2.4266474558413355e-07, "loss": 0.76555169, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18945312, "step": 14076, "time_per_iteration": 2.828092336654663 }, { "auxiliary_loss_clip": 0.01408441, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.24548423, "balance_loss_mlp": 1.01504481, "epoch": 0.8463550278070043, "flos": 22647537043200.0, "grad_norm": 3.0719584013184726, "language_loss": 0.78589386, "learning_rate": 2.4247883666560945e-07, "loss": 0.81032073, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.1920166, "step": 14077, "time_per_iteration": 2.826051712036133 }, { "auxiliary_loss_clip": 0.01412899, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.24944556, "balance_loss_mlp": 1.01662064, "epoch": 0.8464151510596724, "flos": 13013287044480.0, "grad_norm": 1.937292384427296, "language_loss": 0.76162827, "learning_rate": 2.422929943924643e-07, "loss": 0.78612626, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20275879, "step": 14078, "time_per_iteration": 2.8548712730407715 }, { "auxiliary_loss_clip": 0.01393524, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.2356565, "balance_loss_mlp": 1.00968111, "epoch": 0.8464752743123403, "flos": 15713161372800.0, "grad_norm": 2.1216094155452128, "language_loss": 0.85598409, "learning_rate": 2.4210721877174565e-07, "loss": 0.88021088, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19470215, "step": 14079, "time_per_iteration": 2.956251621246338 }, { "auxiliary_loss_clip": 0.01442036, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.27260256, "balance_loss_mlp": 1.01303577, "epoch": 0.8465353975650083, "flos": 21664219645440.0, "grad_norm": 2.317006158762273, "language_loss": 0.59383851, "learning_rate": 2.419215098104965e-07, "loss": 0.61859691, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 1.69433594, "router_z_loss_mlp": 0.20751953, "step": 14080, "time_per_iteration": 4.298716068267822 }, { "auxiliary_loss_clip": 0.01419281, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.25228941, "balance_loss_mlp": 1.01475763, "epoch": 0.8465955208176762, "flos": 18524962632960.0, "grad_norm": 13.785412436295507, "language_loss": 0.67266643, "learning_rate": 2.4173586751576014e-07, "loss": 0.69720149, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.19470215, "step": 14081, "time_per_iteration": 4.385536193847656 }, { "auxiliary_loss_clip": 0.01407131, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.24498224, "balance_loss_mlp": 1.01532483, "epoch": 0.8466556440703442, "flos": 24209564423040.0, "grad_norm": 2.266453586959632, "language_loss": 0.73319328, "learning_rate": 2.41550291894576e-07, "loss": 0.75760347, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18579102, "step": 14082, "time_per_iteration": 2.8854384422302246 }, { "auxiliary_loss_clip": 0.01401426, "auxiliary_loss_mlp": 0.01032087, "balance_loss_clip": 1.23974466, "balance_loss_mlp": 1.01260829, "epoch": 0.8467157673230121, "flos": 20385552320640.0, "grad_norm": 1.8504139442598664, "language_loss": 0.76567352, "learning_rate": 2.413647829539809e-07, "loss": 0.79000866, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19482422, "step": 14083, "time_per_iteration": 2.8527870178222656 }, { "auxiliary_loss_clip": 0.01407858, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.24415302, "balance_loss_mlp": 1.01070178, "epoch": 0.8467758905756801, "flos": 28484858592000.0, "grad_norm": 2.000356250133995, "language_loss": 0.66680944, "learning_rate": 2.411793407010092e-07, "loss": 0.69120049, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.2052002, "step": 14084, "time_per_iteration": 2.9038772583007812 }, { "auxiliary_loss_clip": 0.01398665, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.23997426, "balance_loss_mlp": 1.01390123, "epoch": 0.8468360138283482, "flos": 11700025430400.0, "grad_norm": 2.182498675441338, "language_loss": 0.71277976, "learning_rate": 2.409939651426938e-07, "loss": 0.73710287, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1973877, "step": 14085, "time_per_iteration": 2.844148874282837 }, { "auxiliary_loss_clip": 0.0139605, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 1.23620784, "balance_loss_mlp": 1.01287484, "epoch": 0.8468961370810161, "flos": 24618515339520.0, "grad_norm": 1.5123008147622037, "language_loss": 0.71697128, "learning_rate": 2.408086562860634e-07, "loss": 0.74124777, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18725586, "step": 14086, "time_per_iteration": 2.9022231101989746 }, { "auxiliary_loss_clip": 0.01402756, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.24327636, "balance_loss_mlp": 1.01529145, "epoch": 0.8469562603336841, "flos": 19619302066560.0, "grad_norm": 1.765669431282174, "language_loss": 0.7574113, "learning_rate": 2.4062341413814445e-07, "loss": 0.78178596, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19433594, "step": 14087, "time_per_iteration": 2.9288957118988037 }, { "auxiliary_loss_clip": 0.01399685, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.23958039, "balance_loss_mlp": 1.01140189, "epoch": 0.847016383586352, "flos": 22649437324800.0, "grad_norm": 1.5933509585098458, "language_loss": 0.74588549, "learning_rate": 2.4043823870596227e-07, "loss": 0.77017784, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18139648, "step": 14088, "time_per_iteration": 2.8403165340423584 }, { "auxiliary_loss_clip": 0.01404396, "auxiliary_loss_mlp": 0.01032357, "balance_loss_clip": 1.24326825, "balance_loss_mlp": 1.0124495, "epoch": 0.84707650683902, "flos": 20970098881920.0, "grad_norm": 2.0095607393170787, "language_loss": 0.73267198, "learning_rate": 2.402531299965387e-07, "loss": 0.75703955, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19909668, "step": 14089, "time_per_iteration": 2.934007167816162 }, { "auxiliary_loss_clip": 0.01396761, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.24021411, "balance_loss_mlp": 1.01497698, "epoch": 0.8471366300916879, "flos": 24102478684800.0, "grad_norm": 1.9950341709236414, "language_loss": 0.79707968, "learning_rate": 2.400680880168928e-07, "loss": 0.82138574, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1887207, "step": 14090, "time_per_iteration": 2.8600449562072754 }, { "auxiliary_loss_clip": 0.01410367, "auxiliary_loss_mlp": 0.01036223, "balance_loss_clip": 1.24829602, "balance_loss_mlp": 1.01682734, "epoch": 0.847196753344356, "flos": 18342082575360.0, "grad_norm": 5.1436643837460725, "language_loss": 0.78105164, "learning_rate": 2.3988311277404085e-07, "loss": 0.80551755, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19396973, "step": 14091, "time_per_iteration": 2.8164889812469482 }, { "auxiliary_loss_clip": 0.01182556, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.09144866, "balance_loss_mlp": 1.0090332, "epoch": 0.8472568765970239, "flos": 49595026671360.0, "grad_norm": 0.8188321023788444, "language_loss": 0.59393477, "learning_rate": 2.396982042749982e-07, "loss": 0.61609101, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.24023438, "step": 14092, "time_per_iteration": 3.4411823749542236 }, { "auxiliary_loss_clip": 0.01401124, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.24010205, "balance_loss_mlp": 1.0173347, "epoch": 0.8473169998496919, "flos": 19287864771840.0, "grad_norm": 1.804308359681766, "language_loss": 0.71115941, "learning_rate": 2.395133625267756e-07, "loss": 0.73554552, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20166016, "step": 14093, "time_per_iteration": 2.8487629890441895 }, { "auxiliary_loss_clip": 0.01381803, "auxiliary_loss_mlp": 0.01028688, "balance_loss_clip": 1.22579002, "balance_loss_mlp": 1.01003218, "epoch": 0.8473771231023598, "flos": 17684411137920.0, "grad_norm": 1.905838569111715, "language_loss": 0.84005809, "learning_rate": 2.3932858753638263e-07, "loss": 0.86416304, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18652344, "step": 14094, "time_per_iteration": 2.8359169960021973 }, { "auxiliary_loss_clip": 0.01385691, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.23019695, "balance_loss_mlp": 1.01758337, "epoch": 0.8474372463550278, "flos": 26371023903360.0, "grad_norm": 1.7589072013503861, "language_loss": 0.71892744, "learning_rate": 2.3914387931082626e-07, "loss": 0.74315357, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.1932373, "step": 14095, "time_per_iteration": 2.8978829383850098 }, { "auxiliary_loss_clip": 0.01398435, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.24070072, "balance_loss_mlp": 1.0158186, "epoch": 0.8474973696076957, "flos": 23411932260480.0, "grad_norm": 3.6227634462899116, "language_loss": 0.81938475, "learning_rate": 2.3895923785711105e-07, "loss": 0.84372473, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1973877, "step": 14096, "time_per_iteration": 2.829822063446045 }, { "auxiliary_loss_clip": 0.0141713, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.25204182, "balance_loss_mlp": 1.01683712, "epoch": 0.8475574928603637, "flos": 25084755452160.0, "grad_norm": 1.9476072205535784, "language_loss": 0.78154504, "learning_rate": 2.387746631822374e-07, "loss": 0.80607903, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19433594, "step": 14097, "time_per_iteration": 2.863884687423706 }, { "auxiliary_loss_clip": 0.01395354, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.23678565, "balance_loss_mlp": 1.01350296, "epoch": 0.8476176161130318, "flos": 19974203429760.0, "grad_norm": 1.909235513062762, "language_loss": 0.81206727, "learning_rate": 2.385901552932048e-07, "loss": 0.83634162, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18579102, "step": 14098, "time_per_iteration": 2.8245351314544678 }, { "auxiliary_loss_clip": 0.01391733, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.23385739, "balance_loss_mlp": 1.01707101, "epoch": 0.8476777393656997, "flos": 21295156659840.0, "grad_norm": 4.528267847629165, "language_loss": 0.72699815, "learning_rate": 2.3840571419701062e-07, "loss": 0.7512877, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.20141602, "step": 14099, "time_per_iteration": 2.8637490272521973 }, { "auxiliary_loss_clip": 0.01398262, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.23841059, "balance_loss_mlp": 1.01667929, "epoch": 0.8477378626183677, "flos": 29983579217280.0, "grad_norm": 3.0645556312214657, "language_loss": 0.64311749, "learning_rate": 2.3822133990064787e-07, "loss": 0.6674757, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20849609, "step": 14100, "time_per_iteration": 2.9130966663360596 }, { "auxiliary_loss_clip": 0.01412719, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.24945569, "balance_loss_mlp": 1.01140749, "epoch": 0.8477979858710356, "flos": 24246918645120.0, "grad_norm": 2.0197228320848426, "language_loss": 0.74640363, "learning_rate": 2.380370324111085e-07, "loss": 0.77083969, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19482422, "step": 14101, "time_per_iteration": 2.839693307876587 }, { "auxiliary_loss_clip": 0.01404163, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.24379134, "balance_loss_mlp": 1.01145363, "epoch": 0.8478581091237036, "flos": 25604773649280.0, "grad_norm": 1.7983492302700896, "language_loss": 0.72435027, "learning_rate": 2.3785279173538163e-07, "loss": 0.74869412, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18786621, "step": 14102, "time_per_iteration": 2.920447587966919 }, { "auxiliary_loss_clip": 0.01409953, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.24599469, "balance_loss_mlp": 1.01369679, "epoch": 0.8479182323763715, "flos": 12064609180800.0, "grad_norm": 1.8631524375377486, "language_loss": 0.82619357, "learning_rate": 2.3766861788045366e-07, "loss": 0.85062695, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19714355, "step": 14103, "time_per_iteration": 2.8113646507263184 }, { "auxiliary_loss_clip": 0.01403205, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.24548137, "balance_loss_mlp": 1.01388347, "epoch": 0.8479783556290396, "flos": 21443170959360.0, "grad_norm": 2.7642406611944854, "language_loss": 0.78930169, "learning_rate": 2.374845108533079e-07, "loss": 0.81366318, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19055176, "step": 14104, "time_per_iteration": 2.847508192062378 }, { "auxiliary_loss_clip": 0.01406072, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.2447778, "balance_loss_mlp": 1.01694083, "epoch": 0.8480384788817075, "flos": 19651181667840.0, "grad_norm": 2.113667057088085, "language_loss": 0.79626828, "learning_rate": 2.3730047066092607e-07, "loss": 0.82069904, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.20056152, "step": 14105, "time_per_iteration": 2.8985023498535156 }, { "auxiliary_loss_clip": 0.01424055, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.25854635, "balance_loss_mlp": 1.01640892, "epoch": 0.8480986021343755, "flos": 22498663092480.0, "grad_norm": 1.7366685444946626, "language_loss": 0.50716525, "learning_rate": 2.3711649731028749e-07, "loss": 0.53177732, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20739746, "step": 14106, "time_per_iteration": 2.8652591705322266 }, { "auxiliary_loss_clip": 0.01393129, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.23404026, "balance_loss_mlp": 1.01419449, "epoch": 0.8481587253870434, "flos": 22100616172800.0, "grad_norm": 4.266370736721215, "language_loss": 0.75738758, "learning_rate": 2.3693259080836792e-07, "loss": 0.7816568, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19604492, "step": 14107, "time_per_iteration": 4.2944653034210205 }, { "auxiliary_loss_clip": 0.01395806, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.23668408, "balance_loss_mlp": 1.01479352, "epoch": 0.8482188486397114, "flos": 33595501104000.0, "grad_norm": 3.0255009279742633, "language_loss": 0.73872405, "learning_rate": 2.3674875116214087e-07, "loss": 0.76302624, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19616699, "step": 14108, "time_per_iteration": 2.9484353065490723 }, { "auxiliary_loss_clip": 0.01392536, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.23629057, "balance_loss_mlp": 1.01080728, "epoch": 0.8482789718923793, "flos": 20928220179840.0, "grad_norm": 1.5748626884519283, "language_loss": 0.73647749, "learning_rate": 2.3656497837857836e-07, "loss": 0.76070714, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19641113, "step": 14109, "time_per_iteration": 4.302866458892822 }, { "auxiliary_loss_clip": 0.01398518, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.23959875, "balance_loss_mlp": 1.01300967, "epoch": 0.8483390951450474, "flos": 12903893821440.0, "grad_norm": 2.028738728930584, "language_loss": 0.74783224, "learning_rate": 2.3638127246464811e-07, "loss": 0.77213705, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18933105, "step": 14110, "time_per_iteration": 2.8216323852539062 }, { "auxiliary_loss_clip": 0.01402088, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.2418648, "balance_loss_mlp": 1.01578045, "epoch": 0.8483992183977154, "flos": 25092266088960.0, "grad_norm": 1.6082793889656823, "language_loss": 0.76520395, "learning_rate": 2.3619763342731658e-07, "loss": 0.78957641, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19372559, "step": 14111, "time_per_iteration": 2.8686282634735107 }, { "auxiliary_loss_clip": 0.01394171, "auxiliary_loss_mlp": 0.01033498, "balance_loss_clip": 1.23629832, "balance_loss_mlp": 1.01447201, "epoch": 0.8484593416503833, "flos": 25568595792000.0, "grad_norm": 2.5235035333807203, "language_loss": 0.68098783, "learning_rate": 2.3601406127354772e-07, "loss": 0.70526451, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19018555, "step": 14112, "time_per_iteration": 2.883678436279297 }, { "auxiliary_loss_clip": 0.01392829, "auxiliary_loss_mlp": 0.01028168, "balance_loss_clip": 1.23190093, "balance_loss_mlp": 1.00992942, "epoch": 0.8485194649030513, "flos": 27209810851200.0, "grad_norm": 1.726562405186961, "language_loss": 0.74196613, "learning_rate": 2.3583055601030312e-07, "loss": 0.76617616, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18237305, "step": 14113, "time_per_iteration": 2.8724875450134277 }, { "auxiliary_loss_clip": 0.0140044, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.24115705, "balance_loss_mlp": 1.01523316, "epoch": 0.8485795881557192, "flos": 24216305898240.0, "grad_norm": 2.447760845087055, "language_loss": 0.66718143, "learning_rate": 2.3564711764454003e-07, "loss": 0.69153643, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19824219, "step": 14114, "time_per_iteration": 2.9304282665252686 }, { "auxiliary_loss_clip": 0.01410288, "auxiliary_loss_mlp": 0.01034216, "balance_loss_clip": 1.24701476, "balance_loss_mlp": 1.01451111, "epoch": 0.8486397114083872, "flos": 21151395371520.0, "grad_norm": 1.6742487043588419, "language_loss": 0.7976765, "learning_rate": 2.3546374618321495e-07, "loss": 0.8221215, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19714355, "step": 14115, "time_per_iteration": 4.239477872848511 }, { "auxiliary_loss_clip": 0.01411613, "auxiliary_loss_mlp": 0.01034255, "balance_loss_clip": 1.25007474, "balance_loss_mlp": 1.01600409, "epoch": 0.8486998346610551, "flos": 19984383509760.0, "grad_norm": 2.030838320883373, "language_loss": 0.79555321, "learning_rate": 2.3528044163328187e-07, "loss": 0.82001191, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18261719, "step": 14116, "time_per_iteration": 4.330217361450195 }, { "auxiliary_loss_clip": 0.01407349, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.24410033, "balance_loss_mlp": 1.01329064, "epoch": 0.8487599579137232, "flos": 19801865410560.0, "grad_norm": 1.976631536553084, "language_loss": 0.69808924, "learning_rate": 2.3509720400169076e-07, "loss": 0.72248852, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19299316, "step": 14117, "time_per_iteration": 2.844951629638672 }, { "auxiliary_loss_clip": 0.01404394, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.24158561, "balance_loss_mlp": 1.01361728, "epoch": 0.8488200811663911, "flos": 26407247005440.0, "grad_norm": 2.473468579795717, "language_loss": 0.65884209, "learning_rate": 2.3491403329539096e-07, "loss": 0.6832158, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19360352, "step": 14118, "time_per_iteration": 2.8633108139038086 }, { "auxiliary_loss_clip": 0.01400867, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.24244463, "balance_loss_mlp": 1.01411712, "epoch": 0.8488802044190591, "flos": 16367077491840.0, "grad_norm": 1.5294007173838253, "language_loss": 0.73563516, "learning_rate": 2.3473092952132757e-07, "loss": 0.75996983, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18493652, "step": 14119, "time_per_iteration": 2.8399314880371094 }, { "auxiliary_loss_clip": 0.01407574, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.24678636, "balance_loss_mlp": 1.01521337, "epoch": 0.848940327671727, "flos": 19218359479680.0, "grad_norm": 1.9660120257505227, "language_loss": 0.79001498, "learning_rate": 2.345478926864446e-07, "loss": 0.81444514, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20227051, "step": 14120, "time_per_iteration": 2.8365790843963623 }, { "auxiliary_loss_clip": 0.01420202, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.25684118, "balance_loss_mlp": 1.01548052, "epoch": 0.849000450924395, "flos": 21881060565120.0, "grad_norm": 1.9413138475671718, "language_loss": 0.76493841, "learning_rate": 2.3436492279768227e-07, "loss": 0.78948766, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19250488, "step": 14121, "time_per_iteration": 3.0033068656921387 }, { "auxiliary_loss_clip": 0.01184682, "auxiliary_loss_mlp": 0.01020501, "balance_loss_clip": 1.09399056, "balance_loss_mlp": 0.99704039, "epoch": 0.8490605741770629, "flos": 71199405429120.0, "grad_norm": 0.8148754852756624, "language_loss": 0.60228276, "learning_rate": 2.3418201986197883e-07, "loss": 0.62433463, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.234375, "step": 14122, "time_per_iteration": 3.335278034210205 }, { "auxiliary_loss_clip": 0.01395297, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.23435557, "balance_loss_mlp": 1.01399076, "epoch": 0.849120697429731, "flos": 24984275454720.0, "grad_norm": 1.8501848679385762, "language_loss": 0.80730653, "learning_rate": 2.3399918388627048e-07, "loss": 0.83158636, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18688965, "step": 14123, "time_per_iteration": 2.8891613483428955 }, { "auxiliary_loss_clip": 0.01386924, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.23187041, "balance_loss_mlp": 1.01241612, "epoch": 0.8491808206823989, "flos": 23040788014080.0, "grad_norm": 2.6008236687248907, "language_loss": 0.83899271, "learning_rate": 2.3381641487749016e-07, "loss": 0.86316782, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18188477, "step": 14124, "time_per_iteration": 2.849990129470825 }, { "auxiliary_loss_clip": 0.01397466, "auxiliary_loss_mlp": 0.01031166, "balance_loss_clip": 1.23889709, "balance_loss_mlp": 1.01172292, "epoch": 0.8492409439350669, "flos": 23889121614720.0, "grad_norm": 1.8047793083671353, "language_loss": 0.73435718, "learning_rate": 2.3363371284256805e-07, "loss": 0.75864351, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19421387, "step": 14125, "time_per_iteration": 2.8531100749969482 }, { "auxiliary_loss_clip": 0.01420295, "auxiliary_loss_mlp": 0.01035811, "balance_loss_clip": 1.25469029, "balance_loss_mlp": 1.01560545, "epoch": 0.8493010671877349, "flos": 22430288920320.0, "grad_norm": 1.6789393192214688, "language_loss": 0.74039727, "learning_rate": 2.3345107778843288e-07, "loss": 0.76495832, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.20202637, "step": 14126, "time_per_iteration": 2.8507165908813477 }, { "auxiliary_loss_clip": 0.01388725, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.23217988, "balance_loss_mlp": 1.01589751, "epoch": 0.8493611904404028, "flos": 17538206630400.0, "grad_norm": 1.4526971388102072, "language_loss": 0.68510675, "learning_rate": 2.3326850972200928e-07, "loss": 0.70933765, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18469238, "step": 14127, "time_per_iteration": 2.8638088703155518 }, { "auxiliary_loss_clip": 0.01407233, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.24383402, "balance_loss_mlp": 1.01255679, "epoch": 0.8494213136930708, "flos": 19472147418240.0, "grad_norm": 2.0612697359388448, "language_loss": 0.69704425, "learning_rate": 2.330860086502211e-07, "loss": 0.72144258, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20068359, "step": 14128, "time_per_iteration": 2.834747552871704 }, { "auxiliary_loss_clip": 0.01396352, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.23812091, "balance_loss_mlp": 1.01695716, "epoch": 0.8494814369457387, "flos": 18779474488320.0, "grad_norm": 1.7379983379315502, "language_loss": 0.78846264, "learning_rate": 2.3290357457998855e-07, "loss": 0.81278956, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19396973, "step": 14129, "time_per_iteration": 2.8770384788513184 }, { "auxiliary_loss_clip": 0.01409567, "auxiliary_loss_mlp": 0.01038257, "balance_loss_clip": 1.24975395, "balance_loss_mlp": 1.01957726, "epoch": 0.8495415601984068, "flos": 23341974520320.0, "grad_norm": 1.5940866403287715, "language_loss": 0.68727148, "learning_rate": 2.3272120751823031e-07, "loss": 0.71174973, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18701172, "step": 14130, "time_per_iteration": 2.984592914581299 }, { "auxiliary_loss_clip": 0.01399714, "auxiliary_loss_mlp": 0.0103341, "balance_loss_clip": 1.23860443, "balance_loss_mlp": 1.01354957, "epoch": 0.8496016834510747, "flos": 26623590232320.0, "grad_norm": 1.7148812664605608, "language_loss": 0.72175026, "learning_rate": 2.3253890747186e-07, "loss": 0.74608147, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1986084, "step": 14131, "time_per_iteration": 2.9303297996520996 }, { "auxiliary_loss_clip": 0.0141008, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.24696612, "balance_loss_mlp": 1.01336551, "epoch": 0.8496618067037427, "flos": 25490448743040.0, "grad_norm": 2.1804339083841615, "language_loss": 0.69269729, "learning_rate": 2.3235667444779162e-07, "loss": 0.71710849, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.17675781, "step": 14132, "time_per_iteration": 2.871819496154785 }, { "auxiliary_loss_clip": 0.01398671, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.23976099, "balance_loss_mlp": 1.01765656, "epoch": 0.8497219299564106, "flos": 25385580000000.0, "grad_norm": 2.157538710565242, "language_loss": 0.70689487, "learning_rate": 2.3217450845293564e-07, "loss": 0.73123479, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17675781, "step": 14133, "time_per_iteration": 2.8966238498687744 }, { "auxiliary_loss_clip": 0.01182718, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.09558046, "balance_loss_mlp": 1.01286054, "epoch": 0.8497820532090786, "flos": 67814142583680.0, "grad_norm": 0.7315035848374647, "language_loss": 0.57616919, "learning_rate": 2.3199240949419918e-07, "loss": 0.59838629, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.26171875, "step": 14134, "time_per_iteration": 3.4342432022094727 }, { "auxiliary_loss_clip": 0.01419069, "auxiliary_loss_mlp": 0.01030413, "balance_loss_clip": 1.25469947, "balance_loss_mlp": 1.01172054, "epoch": 0.8498421764617465, "flos": 23450960540160.0, "grad_norm": 1.891978008068613, "language_loss": 0.79184234, "learning_rate": 2.3181037757848787e-07, "loss": 0.81633711, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18688965, "step": 14135, "time_per_iteration": 2.8613173961639404 }, { "auxiliary_loss_clip": 0.01412241, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.24871552, "balance_loss_mlp": 1.01348555, "epoch": 0.8499022997144146, "flos": 17721629625600.0, "grad_norm": 1.9572193858628646, "language_loss": 0.64947826, "learning_rate": 2.316284127127044e-07, "loss": 0.67393422, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19873047, "step": 14136, "time_per_iteration": 2.811774253845215 }, { "auxiliary_loss_clip": 0.01420207, "auxiliary_loss_mlp": 0.01034384, "balance_loss_clip": 1.25582004, "balance_loss_mlp": 1.01492906, "epoch": 0.8499624229670825, "flos": 18597816040320.0, "grad_norm": 1.9588103129650019, "language_loss": 0.84644175, "learning_rate": 2.3144651490374835e-07, "loss": 0.87098765, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19445801, "step": 14137, "time_per_iteration": 2.846207857131958 }, { "auxiliary_loss_clip": 0.01392016, "auxiliary_loss_mlp": 0.01032459, "balance_loss_clip": 1.23514378, "balance_loss_mlp": 1.01321888, "epoch": 0.8500225462197505, "flos": 24354728300160.0, "grad_norm": 2.0130237336770174, "language_loss": 0.79566038, "learning_rate": 2.3126468415851773e-07, "loss": 0.8199051, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.19250488, "step": 14138, "time_per_iteration": 2.8721206188201904 }, { "auxiliary_loss_clip": 0.01402405, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.24232364, "balance_loss_mlp": 1.01503301, "epoch": 0.8500826694724185, "flos": 16554346295040.0, "grad_norm": 3.211481603544995, "language_loss": 0.64732546, "learning_rate": 2.310829204839073e-07, "loss": 0.67169189, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19189453, "step": 14139, "time_per_iteration": 2.8846235275268555 }, { "auxiliary_loss_clip": 0.01400708, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.24133086, "balance_loss_mlp": 1.01020706, "epoch": 0.8501427927250864, "flos": 16297979402880.0, "grad_norm": 2.4421866563182033, "language_loss": 0.71434915, "learning_rate": 2.3090122388681043e-07, "loss": 0.73864669, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18847656, "step": 14140, "time_per_iteration": 2.8724172115325928 }, { "auxiliary_loss_clip": 0.01421937, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.25683999, "balance_loss_mlp": 1.019418, "epoch": 0.8502029159777544, "flos": 26699203572480.0, "grad_norm": 1.976687355959325, "language_loss": 0.64821959, "learning_rate": 2.3071959437411648e-07, "loss": 0.67282534, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19213867, "step": 14141, "time_per_iteration": 2.8797781467437744 }, { "auxiliary_loss_clip": 0.01400999, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.24127352, "balance_loss_mlp": 1.01959419, "epoch": 0.8502630392304223, "flos": 35604467049600.0, "grad_norm": 1.6551513780304723, "language_loss": 0.71471483, "learning_rate": 2.3053803195271214e-07, "loss": 0.73911142, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19055176, "step": 14142, "time_per_iteration": 4.364487886428833 }, { "auxiliary_loss_clip": 0.014036, "auxiliary_loss_mlp": 0.0103162, "balance_loss_clip": 1.24191439, "balance_loss_mlp": 1.01308298, "epoch": 0.8503231624830904, "flos": 21658564045440.0, "grad_norm": 1.448282235217926, "language_loss": 0.66237742, "learning_rate": 2.3035653662948375e-07, "loss": 0.68672961, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18530273, "step": 14143, "time_per_iteration": 2.9855539798736572 }, { "auxiliary_loss_clip": 0.01413523, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 1.2485013, "balance_loss_mlp": 1.01500821, "epoch": 0.8503832857357583, "flos": 22427212273920.0, "grad_norm": 1.7499364289532935, "language_loss": 0.68614095, "learning_rate": 2.3017510841131216e-07, "loss": 0.71061713, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.1907959, "step": 14144, "time_per_iteration": 4.286494016647339 }, { "auxiliary_loss_clip": 0.01393676, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.23666573, "balance_loss_mlp": 1.01385283, "epoch": 0.8504434089884263, "flos": 18707299752960.0, "grad_norm": 2.29548064829877, "language_loss": 0.65724361, "learning_rate": 2.299937473050777e-07, "loss": 0.68151402, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.1953125, "step": 14145, "time_per_iteration": 2.8191776275634766 }, { "auxiliary_loss_clip": 0.01399718, "auxiliary_loss_mlp": 0.01036212, "balance_loss_clip": 1.24013782, "balance_loss_mlp": 1.01724565, "epoch": 0.8505035322410942, "flos": 20016760803840.0, "grad_norm": 1.655892941909803, "language_loss": 0.85966623, "learning_rate": 2.2981245331765842e-07, "loss": 0.88402551, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18969727, "step": 14146, "time_per_iteration": 2.8484158515930176 }, { "auxiliary_loss_clip": 0.01396308, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.23722708, "balance_loss_mlp": 1.01244187, "epoch": 0.8505636554937622, "flos": 20821541644800.0, "grad_norm": 2.3178007780700365, "language_loss": 0.84702408, "learning_rate": 2.2963122645592814e-07, "loss": 0.87129605, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18432617, "step": 14147, "time_per_iteration": 2.886324167251587 }, { "auxiliary_loss_clip": 0.01410415, "auxiliary_loss_mlp": 0.01032408, "balance_loss_clip": 1.2460494, "balance_loss_mlp": 1.01323915, "epoch": 0.8506237787464301, "flos": 14182787370240.0, "grad_norm": 2.2339386913747403, "language_loss": 0.87367642, "learning_rate": 2.2945006672675894e-07, "loss": 0.89810467, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19152832, "step": 14148, "time_per_iteration": 2.9008705615997314 }, { "auxiliary_loss_clip": 0.01391977, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.23448992, "balance_loss_mlp": 1.01710486, "epoch": 0.8506839019990982, "flos": 23268804399360.0, "grad_norm": 1.648611962043415, "language_loss": 0.72754019, "learning_rate": 2.292689741370204e-07, "loss": 0.75182211, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19104004, "step": 14149, "time_per_iteration": 2.857525587081909 }, { "auxiliary_loss_clip": 0.01412721, "auxiliary_loss_mlp": 0.01033561, "balance_loss_clip": 1.25291753, "balance_loss_mlp": 1.014678, "epoch": 0.8507440252517661, "flos": 23669611251840.0, "grad_norm": 1.591690103621371, "language_loss": 0.76911056, "learning_rate": 2.290879486935804e-07, "loss": 0.79357338, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18884277, "step": 14150, "time_per_iteration": 4.298710346221924 }, { "auxiliary_loss_clip": 0.01397345, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.24065685, "balance_loss_mlp": 1.01707506, "epoch": 0.8508041485044341, "flos": 18670624202880.0, "grad_norm": 1.5005285808726683, "language_loss": 0.73296428, "learning_rate": 2.2890699040330231e-07, "loss": 0.7573002, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.19165039, "step": 14151, "time_per_iteration": 4.236381530761719 }, { "auxiliary_loss_clip": 0.01178621, "auxiliary_loss_mlp": 0.0103451, "balance_loss_clip": 1.09057689, "balance_loss_mlp": 1.01219392, "epoch": 0.8508642717571021, "flos": 52536065639040.0, "grad_norm": 0.8871600074636261, "language_loss": 0.59602201, "learning_rate": 2.2872609927304909e-07, "loss": 0.61815333, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.22363281, "step": 14152, "time_per_iteration": 3.104869842529297 }, { "auxiliary_loss_clip": 0.01181994, "auxiliary_loss_mlp": 0.01017763, "balance_loss_clip": 1.09126091, "balance_loss_mlp": 0.9989754, "epoch": 0.85092439500977, "flos": 69327368807040.0, "grad_norm": 0.6910319944795257, "language_loss": 0.61306989, "learning_rate": 2.285452753096797e-07, "loss": 0.63506746, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.1875, "step": 14153, "time_per_iteration": 3.3026161193847656 }, { "auxiliary_loss_clip": 0.01392445, "auxiliary_loss_mlp": 0.01033523, "balance_loss_clip": 1.23398352, "balance_loss_mlp": 1.01388884, "epoch": 0.850984518262438, "flos": 24400543299840.0, "grad_norm": 2.743755963588363, "language_loss": 0.81340617, "learning_rate": 2.2836451852005067e-07, "loss": 0.83766586, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19641113, "step": 14154, "time_per_iteration": 2.8746299743652344 }, { "auxiliary_loss_clip": 0.01379446, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.22415996, "balance_loss_mlp": 1.01515591, "epoch": 0.851044641515106, "flos": 23305253725440.0, "grad_norm": 1.760898297395744, "language_loss": 0.80215758, "learning_rate": 2.281838289110165e-07, "loss": 0.82628089, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.17724609, "step": 14155, "time_per_iteration": 2.886051893234253 }, { "auxiliary_loss_clip": 0.01413865, "auxiliary_loss_mlp": 0.01030028, "balance_loss_clip": 1.24990821, "balance_loss_mlp": 1.01118124, "epoch": 0.851104764767774, "flos": 22059099429120.0, "grad_norm": 1.631006370013621, "language_loss": 0.71377325, "learning_rate": 2.2800320648942904e-07, "loss": 0.73821223, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.18847656, "step": 14156, "time_per_iteration": 2.8652024269104004 }, { "auxiliary_loss_clip": 0.01388549, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.23292685, "balance_loss_mlp": 1.01341712, "epoch": 0.8511648880204419, "flos": 20714591640960.0, "grad_norm": 1.8513618707799206, "language_loss": 0.74280536, "learning_rate": 2.278226512621386e-07, "loss": 0.76701093, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18603516, "step": 14157, "time_per_iteration": 2.830643653869629 }, { "auxiliary_loss_clip": 0.01391239, "auxiliary_loss_mlp": 0.01031957, "balance_loss_clip": 1.23496199, "balance_loss_mlp": 1.01473117, "epoch": 0.8512250112731099, "flos": 24035326122240.0, "grad_norm": 1.9539724572250803, "language_loss": 0.80189592, "learning_rate": 2.2764216323598995e-07, "loss": 0.82612795, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17224121, "step": 14158, "time_per_iteration": 2.84194016456604 }, { "auxiliary_loss_clip": 0.01402387, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.24187255, "balance_loss_mlp": 1.01486897, "epoch": 0.8512851345257778, "flos": 22024957587840.0, "grad_norm": 2.1219378123955073, "language_loss": 0.7993542, "learning_rate": 2.27461742417828e-07, "loss": 0.82372546, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19873047, "step": 14159, "time_per_iteration": 2.9250481128692627 }, { "auxiliary_loss_clip": 0.01394435, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.23534286, "balance_loss_mlp": 1.01441061, "epoch": 0.8513452577784458, "flos": 14838468036480.0, "grad_norm": 3.5911648205145785, "language_loss": 0.71854603, "learning_rate": 2.2728138881449488e-07, "loss": 0.74282819, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19372559, "step": 14160, "time_per_iteration": 2.84149169921875 }, { "auxiliary_loss_clip": 0.01422353, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.25623453, "balance_loss_mlp": 1.01413941, "epoch": 0.8514053810311137, "flos": 33048399254400.0, "grad_norm": 2.6315464233661348, "language_loss": 0.71143401, "learning_rate": 2.2710110243282866e-07, "loss": 0.7360096, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.21069336, "step": 14161, "time_per_iteration": 2.9425337314605713 }, { "auxiliary_loss_clip": 0.01406697, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.2425822, "balance_loss_mlp": 1.01446211, "epoch": 0.8514655042837818, "flos": 27576204393600.0, "grad_norm": 2.618115184030919, "language_loss": 0.78748643, "learning_rate": 2.2692088327966653e-07, "loss": 0.81188798, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18994141, "step": 14162, "time_per_iteration": 2.928743839263916 }, { "auxiliary_loss_clip": 0.014036, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.2430464, "balance_loss_mlp": 1.01566148, "epoch": 0.8515256275364497, "flos": 35570189473920.0, "grad_norm": 2.3302240226559414, "language_loss": 0.78060234, "learning_rate": 2.2674073136184235e-07, "loss": 0.80498564, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19067383, "step": 14163, "time_per_iteration": 2.9942266941070557 }, { "auxiliary_loss_clip": 0.01182202, "auxiliary_loss_mlp": 0.01018431, "balance_loss_clip": 1.09218347, "balance_loss_mlp": 0.99878544, "epoch": 0.8515857507891177, "flos": 70237335104640.0, "grad_norm": 0.7161782784210835, "language_loss": 0.55095571, "learning_rate": 2.2656064668618735e-07, "loss": 0.57296211, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19628906, "step": 14164, "time_per_iteration": 3.4478697776794434 }, { "auxiliary_loss_clip": 0.01396057, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.23672438, "balance_loss_mlp": 1.01847625, "epoch": 0.8516458740417857, "flos": 22685750916480.0, "grad_norm": 2.055495676738981, "language_loss": 0.73847765, "learning_rate": 2.2638062925953005e-07, "loss": 0.76281184, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18884277, "step": 14165, "time_per_iteration": 2.8972549438476562 }, { "auxiliary_loss_clip": 0.01387868, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.23081446, "balance_loss_mlp": 1.01527166, "epoch": 0.8517059972944536, "flos": 22757699427840.0, "grad_norm": 1.771718443121813, "language_loss": 0.68606102, "learning_rate": 2.26200679088697e-07, "loss": 0.71028471, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19238281, "step": 14166, "time_per_iteration": 2.8817315101623535 }, { "auxiliary_loss_clip": 0.01404358, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.24373245, "balance_loss_mlp": 1.01375639, "epoch": 0.8517661205471216, "flos": 21699085403520.0, "grad_norm": 1.8107393039580624, "language_loss": 0.73966074, "learning_rate": 2.260207961805125e-07, "loss": 0.76402777, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18603516, "step": 14167, "time_per_iteration": 2.907773494720459 }, { "auxiliary_loss_clip": 0.01404621, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.24517107, "balance_loss_mlp": 1.01729906, "epoch": 0.8518262437997896, "flos": 25385896713600.0, "grad_norm": 1.6365171973575836, "language_loss": 0.81274581, "learning_rate": 2.258409805417969e-07, "loss": 0.83715504, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18994141, "step": 14168, "time_per_iteration": 2.8829052448272705 }, { "auxiliary_loss_clip": 0.01390564, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.23222911, "balance_loss_mlp": 1.01022351, "epoch": 0.8518863670524576, "flos": 27246893604480.0, "grad_norm": 1.9860201871148038, "language_loss": 0.76820838, "learning_rate": 2.2566123217936893e-07, "loss": 0.79240382, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18737793, "step": 14169, "time_per_iteration": 2.9208052158355713 }, { "auxiliary_loss_clip": 0.01411146, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.24834347, "balance_loss_mlp": 1.01321554, "epoch": 0.8519464903051255, "flos": 20969284475520.0, "grad_norm": 1.83677748534767, "language_loss": 0.64964944, "learning_rate": 2.254815511000452e-07, "loss": 0.67409062, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19750977, "step": 14170, "time_per_iteration": 2.858258008956909 }, { "auxiliary_loss_clip": 0.01392848, "auxiliary_loss_mlp": 0.01031962, "balance_loss_clip": 1.23367047, "balance_loss_mlp": 1.01234007, "epoch": 0.8520066135577935, "flos": 18450797126400.0, "grad_norm": 2.4016194266621453, "language_loss": 0.87219036, "learning_rate": 2.253019373106384e-07, "loss": 0.89643848, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19628906, "step": 14171, "time_per_iteration": 2.8158669471740723 }, { "auxiliary_loss_clip": 0.01420363, "auxiliary_loss_mlp": 0.01031062, "balance_loss_clip": 1.25976515, "balance_loss_mlp": 1.01191747, "epoch": 0.8520667368104614, "flos": 29141579888640.0, "grad_norm": 2.0874027608566132, "language_loss": 0.55684102, "learning_rate": 2.2512239081796003e-07, "loss": 0.58135521, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19140625, "step": 14172, "time_per_iteration": 2.906567096710205 }, { "auxiliary_loss_clip": 0.01385901, "auxiliary_loss_mlp": 0.01028848, "balance_loss_clip": 1.23006976, "balance_loss_mlp": 1.01183677, "epoch": 0.8521268600631294, "flos": 16042969854720.0, "grad_norm": 2.0681218714823904, "language_loss": 0.69905043, "learning_rate": 2.2494291162881862e-07, "loss": 0.72319794, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.17016602, "step": 14173, "time_per_iteration": 2.8348536491394043 }, { "auxiliary_loss_clip": 0.01401563, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.24124718, "balance_loss_mlp": 1.01635885, "epoch": 0.8521869833157973, "flos": 22464883209600.0, "grad_norm": 20.945802773652677, "language_loss": 0.77497113, "learning_rate": 2.247634997500205e-07, "loss": 0.79933876, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18835449, "step": 14174, "time_per_iteration": 2.874356985092163 }, { "auxiliary_loss_clip": 0.01419927, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.25699687, "balance_loss_mlp": 1.01537156, "epoch": 0.8522471065684654, "flos": 24982601397120.0, "grad_norm": 1.705523164391855, "language_loss": 0.82445121, "learning_rate": 2.245841551883676e-07, "loss": 0.84899402, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18969727, "step": 14175, "time_per_iteration": 2.8754940032958984 }, { "auxiliary_loss_clip": 0.01419609, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.25528872, "balance_loss_mlp": 1.01493025, "epoch": 0.8523072298211333, "flos": 17719095916800.0, "grad_norm": 2.1905907678945833, "language_loss": 0.66042018, "learning_rate": 2.2440487795066153e-07, "loss": 0.68495369, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18811035, "step": 14176, "time_per_iteration": 2.8800606727600098 }, { "auxiliary_loss_clip": 0.01386417, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.23039746, "balance_loss_mlp": 1.01271844, "epoch": 0.8523673530738013, "flos": 25456849839360.0, "grad_norm": 2.699769779170139, "language_loss": 0.79131091, "learning_rate": 2.2422566804370068e-07, "loss": 0.81549203, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.1895752, "step": 14177, "time_per_iteration": 4.40116810798645 }, { "auxiliary_loss_clip": 0.01409921, "auxiliary_loss_mlp": 0.01031987, "balance_loss_clip": 1.25025392, "balance_loss_mlp": 1.01265085, "epoch": 0.8524274763264693, "flos": 31441733239680.0, "grad_norm": 1.616747128994564, "language_loss": 0.74163902, "learning_rate": 2.2404652547428026e-07, "loss": 0.76605809, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19348145, "step": 14178, "time_per_iteration": 2.917104482650757 }, { "auxiliary_loss_clip": 0.01405891, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.24370909, "balance_loss_mlp": 1.01757264, "epoch": 0.8524875995791372, "flos": 17721674870400.0, "grad_norm": 1.780891627058962, "language_loss": 0.76216567, "learning_rate": 2.238674502491935e-07, "loss": 0.78659189, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19152832, "step": 14179, "time_per_iteration": 4.3328235149383545 }, { "auxiliary_loss_clip": 0.0139382, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.23694611, "balance_loss_mlp": 1.01491106, "epoch": 0.8525477228318052, "flos": 21696777918720.0, "grad_norm": 1.9590392443048363, "language_loss": 0.83242726, "learning_rate": 2.2368844237523165e-07, "loss": 0.85670686, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19226074, "step": 14180, "time_per_iteration": 2.871492624282837 }, { "auxiliary_loss_clip": 0.01397657, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.2385323, "balance_loss_mlp": 1.01566839, "epoch": 0.8526078460844732, "flos": 24837844723200.0, "grad_norm": 4.009947651833669, "language_loss": 0.61770582, "learning_rate": 2.235095018591815e-07, "loss": 0.64203042, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19128418, "step": 14181, "time_per_iteration": 2.864065647125244 }, { "auxiliary_loss_clip": 0.0138786, "auxiliary_loss_mlp": 0.0103242, "balance_loss_clip": 1.23110163, "balance_loss_mlp": 1.0136447, "epoch": 0.8526679693371412, "flos": 13524256281600.0, "grad_norm": 2.2034171179376734, "language_loss": 0.72416836, "learning_rate": 2.2333062870782894e-07, "loss": 0.74837118, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18786621, "step": 14182, "time_per_iteration": 2.8313300609588623 }, { "auxiliary_loss_clip": 0.0139637, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.23841739, "balance_loss_mlp": 1.01276875, "epoch": 0.8527280925898091, "flos": 23524673598720.0, "grad_norm": 1.740743842319948, "language_loss": 0.70624185, "learning_rate": 2.2315182292795697e-07, "loss": 0.73052037, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18725586, "step": 14183, "time_per_iteration": 2.9570257663726807 }, { "auxiliary_loss_clip": 0.01396254, "auxiliary_loss_mlp": 0.01034273, "balance_loss_clip": 1.23823142, "balance_loss_mlp": 1.0161773, "epoch": 0.8527882158424771, "flos": 20312698913280.0, "grad_norm": 1.663037909323201, "language_loss": 0.73018652, "learning_rate": 2.2297308452634644e-07, "loss": 0.75449181, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18103027, "step": 14184, "time_per_iteration": 2.8784327507019043 }, { "auxiliary_loss_clip": 0.01395941, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.23759747, "balance_loss_mlp": 1.01455092, "epoch": 0.852848339095145, "flos": 17211565284480.0, "grad_norm": 1.633183824657198, "language_loss": 0.77420485, "learning_rate": 2.2279441350977457e-07, "loss": 0.79849696, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18725586, "step": 14185, "time_per_iteration": 4.240323781967163 }, { "auxiliary_loss_clip": 0.01410148, "auxiliary_loss_mlp": 0.01031289, "balance_loss_clip": 1.24776864, "balance_loss_mlp": 1.01295471, "epoch": 0.852908462347813, "flos": 18378079453440.0, "grad_norm": 1.847150033770184, "language_loss": 0.803491, "learning_rate": 2.2261580988501637e-07, "loss": 0.82790536, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18322754, "step": 14186, "time_per_iteration": 2.856426477432251 }, { "auxiliary_loss_clip": 0.0141554, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.25197124, "balance_loss_mlp": 1.01565254, "epoch": 0.8529685856004809, "flos": 18634174876800.0, "grad_norm": 1.6660427133494256, "language_loss": 0.63118321, "learning_rate": 2.224372736588449e-07, "loss": 0.65568596, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19067383, "step": 14187, "time_per_iteration": 4.305727958679199 }, { "auxiliary_loss_clip": 0.01416335, "auxiliary_loss_mlp": 0.0103241, "balance_loss_clip": 1.25206399, "balance_loss_mlp": 1.01258564, "epoch": 0.853028708853149, "flos": 29619945607680.0, "grad_norm": 1.6056515081977225, "language_loss": 0.76696408, "learning_rate": 2.2225880483803005e-07, "loss": 0.79145157, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19836426, "step": 14188, "time_per_iteration": 2.9282584190368652 }, { "auxiliary_loss_clip": 0.01407798, "auxiliary_loss_mlp": 0.0103373, "balance_loss_clip": 1.24495983, "balance_loss_mlp": 1.01332188, "epoch": 0.8530888321058169, "flos": 26362698860160.0, "grad_norm": 1.4966076422687684, "language_loss": 0.78703231, "learning_rate": 2.2208040342933932e-07, "loss": 0.81144762, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20410156, "step": 14189, "time_per_iteration": 3.0081801414489746 }, { "auxiliary_loss_clip": 0.0140437, "auxiliary_loss_mlp": 0.01034865, "balance_loss_clip": 1.24352574, "balance_loss_mlp": 1.01560044, "epoch": 0.8531489553584849, "flos": 20531711583360.0, "grad_norm": 2.292702199813911, "language_loss": 0.80397189, "learning_rate": 2.2190206943953793e-07, "loss": 0.82836425, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19262695, "step": 14190, "time_per_iteration": 2.8562631607055664 }, { "auxiliary_loss_clip": 0.01402336, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.24222374, "balance_loss_mlp": 1.0120337, "epoch": 0.8532090786111529, "flos": 20714048703360.0, "grad_norm": 2.083166879948958, "language_loss": 0.76991451, "learning_rate": 2.2172380287538894e-07, "loss": 0.79424679, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18859863, "step": 14191, "time_per_iteration": 2.854310989379883 }, { "auxiliary_loss_clip": 0.01391033, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.23397851, "balance_loss_mlp": 1.01256275, "epoch": 0.8532692018638208, "flos": 19838495715840.0, "grad_norm": 1.8641160502799248, "language_loss": 0.69696367, "learning_rate": 2.2154560374365073e-07, "loss": 0.72118735, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18786621, "step": 14192, "time_per_iteration": 2.8516666889190674 }, { "auxiliary_loss_clip": 0.01426204, "auxiliary_loss_mlp": 0.01043213, "balance_loss_clip": 1.25924766, "balance_loss_mlp": 1.02255368, "epoch": 0.8533293251164888, "flos": 21006774432000.0, "grad_norm": 2.1725221233310603, "language_loss": 0.63801205, "learning_rate": 2.2136747205108164e-07, "loss": 0.66270626, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.20666504, "step": 14193, "time_per_iteration": 2.8607845306396484 }, { "auxiliary_loss_clip": 0.01397327, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.23812222, "balance_loss_mlp": 1.01052117, "epoch": 0.8533894483691568, "flos": 22429926961920.0, "grad_norm": 1.8082839002133828, "language_loss": 0.77658343, "learning_rate": 2.211894078044365e-07, "loss": 0.80084604, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18408203, "step": 14194, "time_per_iteration": 2.8508384227752686 }, { "auxiliary_loss_clip": 0.01396938, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.23746741, "balance_loss_mlp": 1.01012254, "epoch": 0.8534495716218248, "flos": 21626548709760.0, "grad_norm": 1.7868816415381785, "language_loss": 0.70495379, "learning_rate": 2.2101141101046705e-07, "loss": 0.72921038, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18579102, "step": 14195, "time_per_iteration": 2.8987576961517334 }, { "auxiliary_loss_clip": 0.01392576, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.23337245, "balance_loss_mlp": 1.01473844, "epoch": 0.8535096948744927, "flos": 22356485372160.0, "grad_norm": 1.9905930904684503, "language_loss": 0.86767435, "learning_rate": 2.2083348167592343e-07, "loss": 0.89192981, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18237305, "step": 14196, "time_per_iteration": 2.958568811416626 }, { "auxiliary_loss_clip": 0.01182802, "auxiliary_loss_mlp": 0.01021888, "balance_loss_clip": 1.09420598, "balance_loss_mlp": 0.99919063, "epoch": 0.8535698181271607, "flos": 52786731686400.0, "grad_norm": 0.7715488349513239, "language_loss": 0.55137146, "learning_rate": 2.2065561980755243e-07, "loss": 0.57341838, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.2265625, "step": 14197, "time_per_iteration": 3.296600580215454 }, { "auxiliary_loss_clip": 0.01380319, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.22444487, "balance_loss_mlp": 1.01475716, "epoch": 0.8536299413798286, "flos": 19072335951360.0, "grad_norm": 1.5948230222836395, "language_loss": 0.81960535, "learning_rate": 2.2047782541209826e-07, "loss": 0.84374636, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.19030762, "step": 14198, "time_per_iteration": 2.8763599395751953 }, { "auxiliary_loss_clip": 0.01400749, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.24150538, "balance_loss_mlp": 1.01812947, "epoch": 0.8536900646324966, "flos": 49361698661760.0, "grad_norm": 1.4708557202917214, "language_loss": 0.69403934, "learning_rate": 2.203000984963035e-07, "loss": 0.71840304, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.17504883, "step": 14199, "time_per_iteration": 3.1298577785491943 }, { "auxiliary_loss_clip": 0.01385378, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.23029327, "balance_loss_mlp": 1.01170301, "epoch": 0.8537501878851645, "flos": 21772346014080.0, "grad_norm": 1.480048957909808, "language_loss": 0.86894953, "learning_rate": 2.201224390669072e-07, "loss": 0.89310777, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18737793, "step": 14200, "time_per_iteration": 2.9250011444091797 }, { "auxiliary_loss_clip": 0.01398377, "auxiliary_loss_mlp": 0.01031098, "balance_loss_clip": 1.23874331, "balance_loss_mlp": 1.01281118, "epoch": 0.8538103111378326, "flos": 22278293078400.0, "grad_norm": 2.325620477315977, "language_loss": 0.78648847, "learning_rate": 2.1994484713064666e-07, "loss": 0.81078327, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1829834, "step": 14201, "time_per_iteration": 2.9130282402038574 }, { "auxiliary_loss_clip": 0.01394331, "auxiliary_loss_mlp": 0.01033319, "balance_loss_clip": 1.23796988, "balance_loss_mlp": 1.01450777, "epoch": 0.8538704343905005, "flos": 20313830033280.0, "grad_norm": 1.7161690835959797, "language_loss": 0.69690162, "learning_rate": 2.19767322694256e-07, "loss": 0.72117817, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18811035, "step": 14202, "time_per_iteration": 2.8443245887756348 }, { "auxiliary_loss_clip": 0.01399196, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.23914194, "balance_loss_mlp": 1.01470876, "epoch": 0.8539305576431685, "flos": 24766212925440.0, "grad_norm": 1.717924980013312, "language_loss": 0.80813205, "learning_rate": 2.195898657644666e-07, "loss": 0.83245564, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18457031, "step": 14203, "time_per_iteration": 2.9611542224884033 }, { "auxiliary_loss_clip": 0.01403298, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 1.24232447, "balance_loss_mlp": 1.01534486, "epoch": 0.8539906808958365, "flos": 26698841614080.0, "grad_norm": 2.0905254135405005, "language_loss": 0.67079747, "learning_rate": 2.1941247634800808e-07, "loss": 0.6951741, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18994141, "step": 14204, "time_per_iteration": 2.9992730617523193 }, { "auxiliary_loss_clip": 0.01407063, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.24451983, "balance_loss_mlp": 1.01177716, "epoch": 0.8540508041485044, "flos": 13372034215680.0, "grad_norm": 3.8520622741422685, "language_loss": 0.6138829, "learning_rate": 2.1923515445160667e-07, "loss": 0.63825238, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18103027, "step": 14205, "time_per_iteration": 2.8239667415618896 }, { "auxiliary_loss_clip": 0.0139638, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.23740757, "balance_loss_mlp": 1.0124681, "epoch": 0.8541109274011724, "flos": 32793706419840.0, "grad_norm": 2.3752266785771394, "language_loss": 0.72852588, "learning_rate": 2.1905790008198655e-07, "loss": 0.75280678, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19238281, "step": 14206, "time_per_iteration": 2.9304983615875244 }, { "auxiliary_loss_clip": 0.01400257, "auxiliary_loss_mlp": 0.01030687, "balance_loss_clip": 1.23835897, "balance_loss_mlp": 1.01145887, "epoch": 0.8541710506538404, "flos": 17648188035840.0, "grad_norm": 2.8053383979218385, "language_loss": 0.7714147, "learning_rate": 2.1888071324586987e-07, "loss": 0.79572415, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19226074, "step": 14207, "time_per_iteration": 2.9093291759490967 }, { "auxiliary_loss_clip": 0.01403922, "auxiliary_loss_mlp": 0.01035942, "balance_loss_clip": 1.2442292, "balance_loss_mlp": 1.01669002, "epoch": 0.8542311739065084, "flos": 20271815596800.0, "grad_norm": 2.0042638633316385, "language_loss": 0.85283071, "learning_rate": 2.1870359394997485e-07, "loss": 0.87722933, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19250488, "step": 14208, "time_per_iteration": 2.8831071853637695 }, { "auxiliary_loss_clip": 0.01400451, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.24015319, "balance_loss_mlp": 1.01730597, "epoch": 0.8542912971591763, "flos": 17794709256960.0, "grad_norm": 3.446050582546343, "language_loss": 0.67124462, "learning_rate": 2.1852654220101785e-07, "loss": 0.69560599, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18395996, "step": 14209, "time_per_iteration": 2.843174934387207 }, { "auxiliary_loss_clip": 0.01401903, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.24334431, "balance_loss_mlp": 1.0133779, "epoch": 0.8543514204118443, "flos": 26990798181120.0, "grad_norm": 2.040388143902975, "language_loss": 0.70775181, "learning_rate": 2.1834955800571287e-07, "loss": 0.73208952, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18481445, "step": 14210, "time_per_iteration": 2.9351930618286133 }, { "auxiliary_loss_clip": 0.01401816, "auxiliary_loss_mlp": 0.01033503, "balance_loss_clip": 1.24130297, "balance_loss_mlp": 1.01497746, "epoch": 0.8544115436645122, "flos": 24035235632640.0, "grad_norm": 1.3605782122619219, "language_loss": 0.70757532, "learning_rate": 2.1817264137077141e-07, "loss": 0.73192847, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18530273, "step": 14211, "time_per_iteration": 2.910698413848877 }, { "auxiliary_loss_clip": 0.01410278, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.24915409, "balance_loss_mlp": 1.01557612, "epoch": 0.8544716669171802, "flos": 16626702009600.0, "grad_norm": 29.200085560982096, "language_loss": 0.82451737, "learning_rate": 2.1799579230290166e-07, "loss": 0.84896511, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18920898, "step": 14212, "time_per_iteration": 4.22799825668335 }, { "auxiliary_loss_clip": 0.01402903, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.24241662, "balance_loss_mlp": 1.01291645, "epoch": 0.8545317901698481, "flos": 40020672084480.0, "grad_norm": 9.932814825348464, "language_loss": 0.67067146, "learning_rate": 2.178190108088105e-07, "loss": 0.69502383, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19421387, "step": 14213, "time_per_iteration": 3.026350498199463 }, { "auxiliary_loss_clip": 0.01399288, "auxiliary_loss_mlp": 0.01029571, "balance_loss_clip": 1.24120855, "balance_loss_mlp": 1.01170194, "epoch": 0.8545919134225162, "flos": 19912299264000.0, "grad_norm": 1.6945239643505698, "language_loss": 0.79177696, "learning_rate": 2.1764229689520098e-07, "loss": 0.81606555, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.17871094, "step": 14214, "time_per_iteration": 4.356052875518799 }, { "auxiliary_loss_clip": 0.01414766, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.24951959, "balance_loss_mlp": 1.01411664, "epoch": 0.8546520366751841, "flos": 18962264056320.0, "grad_norm": 2.2367747968214715, "language_loss": 0.67809623, "learning_rate": 2.1746565056877397e-07, "loss": 0.70258391, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19885254, "step": 14215, "time_per_iteration": 2.8245315551757812 }, { "auxiliary_loss_clip": 0.01394713, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.23574233, "balance_loss_mlp": 1.01277936, "epoch": 0.8547121599278521, "flos": 35633631962880.0, "grad_norm": 1.6602010047135498, "language_loss": 0.63873345, "learning_rate": 2.172890718362279e-07, "loss": 0.66299433, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18615723, "step": 14216, "time_per_iteration": 2.9630746841430664 }, { "auxiliary_loss_clip": 0.0140902, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.24760127, "balance_loss_mlp": 1.01578259, "epoch": 0.8547722831805201, "flos": 16918884800640.0, "grad_norm": 1.8934271586189317, "language_loss": 0.66162258, "learning_rate": 2.17112560704259e-07, "loss": 0.6860556, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18505859, "step": 14217, "time_per_iteration": 2.878323793411255 }, { "auxiliary_loss_clip": 0.01405289, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.2461164, "balance_loss_mlp": 1.01543212, "epoch": 0.854832406433188, "flos": 23012889955200.0, "grad_norm": 1.3642839177199304, "language_loss": 0.65739846, "learning_rate": 2.1693611717956072e-07, "loss": 0.68179011, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18432617, "step": 14218, "time_per_iteration": 2.8977439403533936 }, { "auxiliary_loss_clip": 0.01415375, "auxiliary_loss_mlp": 0.01031201, "balance_loss_clip": 1.25110352, "balance_loss_mlp": 1.01237833, "epoch": 0.854892529685856, "flos": 20422318360320.0, "grad_norm": 1.8259067022781588, "language_loss": 0.7085588, "learning_rate": 2.167597412688238e-07, "loss": 0.7330246, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.18811035, "step": 14219, "time_per_iteration": 2.8991353511810303 }, { "auxiliary_loss_clip": 0.01410979, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.24712908, "balance_loss_mlp": 1.014364, "epoch": 0.854952652938524, "flos": 16407282136320.0, "grad_norm": 2.2622923875129946, "language_loss": 0.68905157, "learning_rate": 2.1658343297873549e-07, "loss": 0.71349311, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18823242, "step": 14220, "time_per_iteration": 4.270198345184326 }, { "auxiliary_loss_clip": 0.01389476, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.23267436, "balance_loss_mlp": 1.01157033, "epoch": 0.855012776191192, "flos": 21188568614400.0, "grad_norm": 1.956647764711893, "language_loss": 0.72750676, "learning_rate": 2.164071923159827e-07, "loss": 0.75171191, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19470215, "step": 14221, "time_per_iteration": 2.8656015396118164 }, { "auxiliary_loss_clip": 0.01412316, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.2506696, "balance_loss_mlp": 1.01495433, "epoch": 0.8550728994438599, "flos": 26152237457280.0, "grad_norm": 2.033574941804207, "language_loss": 0.60873044, "learning_rate": 2.1623101928724763e-07, "loss": 0.63320345, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20031738, "step": 14222, "time_per_iteration": 4.238384962081909 }, { "auxiliary_loss_clip": 0.01386735, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.23032391, "balance_loss_mlp": 1.0120039, "epoch": 0.8551330226965279, "flos": 22797813582720.0, "grad_norm": 1.6014204254420639, "language_loss": 0.84770155, "learning_rate": 2.1605491389921093e-07, "loss": 0.87187052, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18151855, "step": 14223, "time_per_iteration": 2.8818936347961426 }, { "auxiliary_loss_clip": 0.01402556, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.24324226, "balance_loss_mlp": 1.01196694, "epoch": 0.8551931459491958, "flos": 22429203045120.0, "grad_norm": 1.4482261049951097, "language_loss": 0.74511635, "learning_rate": 2.158788761585515e-07, "loss": 0.76945126, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18981934, "step": 14224, "time_per_iteration": 2.835235118865967 }, { "auxiliary_loss_clip": 0.01399166, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.2405746, "balance_loss_mlp": 1.01370645, "epoch": 0.8552532692018638, "flos": 19582626516480.0, "grad_norm": 1.7535323636436766, "language_loss": 0.76165074, "learning_rate": 2.1570290607194307e-07, "loss": 0.78597116, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19165039, "step": 14225, "time_per_iteration": 2.8344624042510986 }, { "auxiliary_loss_clip": 0.01397222, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.23900783, "balance_loss_mlp": 1.01678419, "epoch": 0.8553133924545318, "flos": 26444058289920.0, "grad_norm": 1.9523988930239955, "language_loss": 0.78024346, "learning_rate": 2.1552700364605925e-07, "loss": 0.80456322, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.1796875, "step": 14226, "time_per_iteration": 2.864877462387085 }, { "auxiliary_loss_clip": 0.01412488, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.24887812, "balance_loss_mlp": 1.01164818, "epoch": 0.8553735157071998, "flos": 16371059034240.0, "grad_norm": 1.991450303471412, "language_loss": 0.55131447, "learning_rate": 2.153511688875702e-07, "loss": 0.57574844, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19262695, "step": 14227, "time_per_iteration": 2.843902111053467 }, { "auxiliary_loss_clip": 0.01392066, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.23417628, "balance_loss_mlp": 1.01304722, "epoch": 0.8554336389598677, "flos": 20897290719360.0, "grad_norm": 2.0178742720915506, "language_loss": 0.66681743, "learning_rate": 2.151754018031442e-07, "loss": 0.69105327, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18481445, "step": 14228, "time_per_iteration": 2.859248399734497 }, { "auxiliary_loss_clip": 0.01410859, "auxiliary_loss_mlp": 0.01038865, "balance_loss_clip": 1.24759996, "balance_loss_mlp": 1.01909983, "epoch": 0.8554937622125357, "flos": 21293754071040.0, "grad_norm": 2.564422376240963, "language_loss": 0.74624473, "learning_rate": 2.1499970239944542e-07, "loss": 0.77074194, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19763184, "step": 14229, "time_per_iteration": 2.9162745475769043 }, { "auxiliary_loss_clip": 0.01391067, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.23334908, "balance_loss_mlp": 1.01442838, "epoch": 0.8555538854652037, "flos": 22421828142720.0, "grad_norm": 1.7513367405548033, "language_loss": 0.73745441, "learning_rate": 2.1482407068313724e-07, "loss": 0.76168954, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18017578, "step": 14230, "time_per_iteration": 2.8326966762542725 }, { "auxiliary_loss_clip": 0.01406913, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.24778104, "balance_loss_mlp": 1.01310635, "epoch": 0.8556140087178716, "flos": 20203396179840.0, "grad_norm": 1.8776566713441984, "language_loss": 0.83858025, "learning_rate": 2.1464850666087897e-07, "loss": 0.86296737, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18688965, "step": 14231, "time_per_iteration": 2.8224170207977295 }, { "auxiliary_loss_clip": 0.01411439, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.24923635, "balance_loss_mlp": 1.01202583, "epoch": 0.8556741319705397, "flos": 22648396694400.0, "grad_norm": 2.5694770406992244, "language_loss": 0.68577302, "learning_rate": 2.1447301033932796e-07, "loss": 0.7101965, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18884277, "step": 14232, "time_per_iteration": 2.935227394104004 }, { "auxiliary_loss_clip": 0.01415389, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.25276351, "balance_loss_mlp": 1.01578128, "epoch": 0.8557342552232076, "flos": 23559584601600.0, "grad_norm": 1.656013210203904, "language_loss": 0.67168397, "learning_rate": 2.1429758172513955e-07, "loss": 0.69618559, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18994141, "step": 14233, "time_per_iteration": 2.8890607357025146 }, { "auxiliary_loss_clip": 0.01395343, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.23667932, "balance_loss_mlp": 1.01416135, "epoch": 0.8557943784758756, "flos": 19619392556160.0, "grad_norm": 1.6133727773475648, "language_loss": 0.77639788, "learning_rate": 2.1412222082496556e-07, "loss": 0.80068111, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18823242, "step": 14234, "time_per_iteration": 2.8772165775299072 }, { "auxiliary_loss_clip": 0.01179895, "auxiliary_loss_mlp": 0.01023509, "balance_loss_clip": 1.08997691, "balance_loss_mlp": 1.00262392, "epoch": 0.8558545017285435, "flos": 70671107433600.0, "grad_norm": 0.7585330507130328, "language_loss": 0.58052206, "learning_rate": 2.1394692764545684e-07, "loss": 0.60255611, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20898438, "step": 14235, "time_per_iteration": 3.321000337600708 }, { "auxiliary_loss_clip": 0.01181566, "auxiliary_loss_mlp": 0.01016967, "balance_loss_clip": 1.09155083, "balance_loss_mlp": 0.9977026, "epoch": 0.8559146249812115, "flos": 56680402798080.0, "grad_norm": 0.7796074000725324, "language_loss": 0.56648135, "learning_rate": 2.1377170219325858e-07, "loss": 0.58846664, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19238281, "step": 14236, "time_per_iteration": 3.1856298446655273 }, { "auxiliary_loss_clip": 0.01400776, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.24037373, "balance_loss_mlp": 1.01740313, "epoch": 0.8559747482338794, "flos": 22897207704960.0, "grad_norm": 2.729575602379337, "language_loss": 0.70763588, "learning_rate": 2.1359654447501673e-07, "loss": 0.73200715, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18945312, "step": 14237, "time_per_iteration": 2.9335579872131348 }, { "auxiliary_loss_clip": 0.01394659, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.23477089, "balance_loss_mlp": 1.01219118, "epoch": 0.8560348714865474, "flos": 22612264081920.0, "grad_norm": 3.7936461117371305, "language_loss": 0.64581215, "learning_rate": 2.1342145449737314e-07, "loss": 0.67006117, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18054199, "step": 14238, "time_per_iteration": 2.8774185180664062 }, { "auxiliary_loss_clip": 0.01390692, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.23502779, "balance_loss_mlp": 1.01215351, "epoch": 0.8560949947392154, "flos": 17940506561280.0, "grad_norm": 1.557581613063183, "language_loss": 0.69777369, "learning_rate": 2.1324643226696648e-07, "loss": 0.72197014, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.16796875, "step": 14239, "time_per_iteration": 2.954288959503174 }, { "auxiliary_loss_clip": 0.01416025, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.25157309, "balance_loss_mlp": 1.01521373, "epoch": 0.8561551179918834, "flos": 31037714006400.0, "grad_norm": 2.7345147270848225, "language_loss": 0.67623019, "learning_rate": 2.1307147779043455e-07, "loss": 0.70074117, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19848633, "step": 14240, "time_per_iteration": 3.0271949768066406 }, { "auxiliary_loss_clip": 0.01405952, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.24352801, "balance_loss_mlp": 1.01332545, "epoch": 0.8562152412445513, "flos": 30677609491200.0, "grad_norm": 1.762147566644047, "language_loss": 0.62244076, "learning_rate": 2.1289659107441182e-07, "loss": 0.64683378, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20019531, "step": 14241, "time_per_iteration": 3.0377180576324463 }, { "auxiliary_loss_clip": 0.01423792, "auxiliary_loss_mlp": 0.01042623, "balance_loss_clip": 1.25679374, "balance_loss_mlp": 1.02253604, "epoch": 0.8562753644972193, "flos": 31588480684800.0, "grad_norm": 1.7995489849670472, "language_loss": 0.75162268, "learning_rate": 2.1272177212552855e-07, "loss": 0.77628684, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20080566, "step": 14242, "time_per_iteration": 2.977404832839966 }, { "auxiliary_loss_clip": 0.01404081, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.2422415, "balance_loss_mlp": 1.01439977, "epoch": 0.8563354877498872, "flos": 26224819395840.0, "grad_norm": 1.9949948377397884, "language_loss": 0.77345669, "learning_rate": 2.1254702095041498e-07, "loss": 0.79783773, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19604492, "step": 14243, "time_per_iteration": 2.8753256797790527 }, { "auxiliary_loss_clip": 0.01400515, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.24053049, "balance_loss_mlp": 1.01534319, "epoch": 0.8563956110025552, "flos": 24145081303680.0, "grad_norm": 1.7731391617831433, "language_loss": 0.68434417, "learning_rate": 2.123723375556974e-07, "loss": 0.70868468, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18188477, "step": 14244, "time_per_iteration": 2.886712074279785 }, { "auxiliary_loss_clip": 0.01178433, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.0887301, "balance_loss_mlp": 1.0059967, "epoch": 0.8564557342552233, "flos": 56298897492480.0, "grad_norm": 0.7544603609226492, "language_loss": 0.584894, "learning_rate": 2.1219772194800046e-07, "loss": 0.60694236, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.20410156, "step": 14245, "time_per_iteration": 3.2561848163604736 }, { "auxiliary_loss_clip": 0.01420805, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.25593817, "balance_loss_mlp": 1.01455283, "epoch": 0.8565158575078912, "flos": 23451051029760.0, "grad_norm": 1.7255242016901762, "language_loss": 0.78265405, "learning_rate": 2.1202317413394488e-07, "loss": 0.80719352, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.18579102, "step": 14246, "time_per_iteration": 2.910412549972534 }, { "auxiliary_loss_clip": 0.01399481, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.23969698, "balance_loss_mlp": 1.01250291, "epoch": 0.8565759807605592, "flos": 20385597565440.0, "grad_norm": 1.9272381886628795, "language_loss": 0.82153398, "learning_rate": 2.1184869412014938e-07, "loss": 0.8458367, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18286133, "step": 14247, "time_per_iteration": 4.245622634887695 }, { "auxiliary_loss_clip": 0.01403034, "auxiliary_loss_mlp": 0.01030209, "balance_loss_clip": 1.24226904, "balance_loss_mlp": 1.01148164, "epoch": 0.8566361040132271, "flos": 18816376262400.0, "grad_norm": 1.9044480713077034, "language_loss": 0.7815429, "learning_rate": 2.1167428191323112e-07, "loss": 0.8058753, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18725586, "step": 14248, "time_per_iteration": 2.873216390609741 }, { "auxiliary_loss_clip": 0.01413481, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.25093675, "balance_loss_mlp": 1.01553392, "epoch": 0.8566962272658951, "flos": 24546114380160.0, "grad_norm": 2.0913318142225754, "language_loss": 0.78383422, "learning_rate": 2.1149993751980278e-07, "loss": 0.80830842, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18408203, "step": 14249, "time_per_iteration": 3.0165634155273438 }, { "auxiliary_loss_clip": 0.01396901, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.23935294, "balance_loss_mlp": 1.0157578, "epoch": 0.856756350518563, "flos": 23187128256000.0, "grad_norm": 1.8536531800362324, "language_loss": 0.79246897, "learning_rate": 2.1132566094647597e-07, "loss": 0.816782, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1862793, "step": 14250, "time_per_iteration": 4.30298638343811 }, { "auxiliary_loss_clip": 0.01396442, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.24013603, "balance_loss_mlp": 1.01078176, "epoch": 0.856816473771231, "flos": 20817017164800.0, "grad_norm": 3.429242215545121, "language_loss": 0.80701792, "learning_rate": 2.1115145219985942e-07, "loss": 0.83127588, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18566895, "step": 14251, "time_per_iteration": 2.871861457824707 }, { "auxiliary_loss_clip": 0.01390889, "auxiliary_loss_mlp": 0.01031786, "balance_loss_clip": 1.23257399, "balance_loss_mlp": 1.01380932, "epoch": 0.856876597023899, "flos": 20237311797120.0, "grad_norm": 2.0893167875767533, "language_loss": 0.61717236, "learning_rate": 2.1097731128656005e-07, "loss": 0.64139915, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.17956543, "step": 14252, "time_per_iteration": 2.8249783515930176 }, { "auxiliary_loss_clip": 0.01402853, "auxiliary_loss_mlp": 0.01037955, "balance_loss_clip": 1.24136674, "balance_loss_mlp": 1.01954961, "epoch": 0.856936720276567, "flos": 18305226046080.0, "grad_norm": 2.0846849947492405, "language_loss": 0.71281713, "learning_rate": 2.1080323821317924e-07, "loss": 0.73722517, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18408203, "step": 14253, "time_per_iteration": 2.8363852500915527 }, { "auxiliary_loss_clip": 0.01182631, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.09365296, "balance_loss_mlp": 1.00528705, "epoch": 0.8569968435292349, "flos": 69908114805120.0, "grad_norm": 0.7866517407875013, "language_loss": 0.59296846, "learning_rate": 2.1062923298631907e-07, "loss": 0.61505741, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.20996094, "step": 14254, "time_per_iteration": 3.4057068824768066 }, { "auxiliary_loss_clip": 0.01394874, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.23749399, "balance_loss_mlp": 1.01667714, "epoch": 0.8570569667819029, "flos": 25859602218240.0, "grad_norm": 1.8973810647033642, "language_loss": 0.81481338, "learning_rate": 2.1045529561257825e-07, "loss": 0.83913136, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.20263672, "step": 14255, "time_per_iteration": 4.345525503158569 }, { "auxiliary_loss_clip": 0.01387904, "auxiliary_loss_mlp": 0.01030944, "balance_loss_clip": 1.23279452, "balance_loss_mlp": 1.01311016, "epoch": 0.8571170900345708, "flos": 23267220831360.0, "grad_norm": 2.1810010174447205, "language_loss": 0.68552101, "learning_rate": 2.1028142609855126e-07, "loss": 0.70970947, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.1784668, "step": 14256, "time_per_iteration": 2.877020835876465 }, { "auxiliary_loss_clip": 0.0141489, "auxiliary_loss_mlp": 0.01035664, "balance_loss_clip": 1.25313473, "balance_loss_mlp": 1.01679325, "epoch": 0.8571772132872388, "flos": 18927262563840.0, "grad_norm": 1.5958178218768961, "language_loss": 0.7107054, "learning_rate": 2.1010762445083218e-07, "loss": 0.73521101, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18859863, "step": 14257, "time_per_iteration": 4.25572395324707 }, { "auxiliary_loss_clip": 0.01393215, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.23618722, "balance_loss_mlp": 1.01262426, "epoch": 0.8572373365399069, "flos": 33261756324480.0, "grad_norm": 3.4478289516647243, "language_loss": 0.77296913, "learning_rate": 2.0993389067601197e-07, "loss": 0.79721552, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18798828, "step": 14258, "time_per_iteration": 3.0008463859558105 }, { "auxiliary_loss_clip": 0.01401293, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.24391568, "balance_loss_mlp": 1.01676893, "epoch": 0.8572974597925748, "flos": 23336997592320.0, "grad_norm": 1.5536164234470484, "language_loss": 0.6895681, "learning_rate": 2.0976022478067735e-07, "loss": 0.7139402, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19128418, "step": 14259, "time_per_iteration": 2.9589686393737793 }, { "auxiliary_loss_clip": 0.01396319, "auxiliary_loss_mlp": 0.01031359, "balance_loss_clip": 1.23609328, "balance_loss_mlp": 1.01250052, "epoch": 0.8573575830452428, "flos": 24546747807360.0, "grad_norm": 1.6724940992835782, "language_loss": 0.77909714, "learning_rate": 2.0958662677141437e-07, "loss": 0.80337393, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18859863, "step": 14260, "time_per_iteration": 2.84981632232666 }, { "auxiliary_loss_clip": 0.01399184, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.23828697, "balance_loss_mlp": 1.0160569, "epoch": 0.8574177062979107, "flos": 24174743909760.0, "grad_norm": 1.8517400339097498, "language_loss": 0.75048506, "learning_rate": 2.09413096654806e-07, "loss": 0.77483046, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19287109, "step": 14261, "time_per_iteration": 2.83603572845459 }, { "auxiliary_loss_clip": 0.01400419, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.23850727, "balance_loss_mlp": 1.01633036, "epoch": 0.8574778295505787, "flos": 17939601665280.0, "grad_norm": 1.8437899328278136, "language_loss": 0.79460979, "learning_rate": 2.0923963443743276e-07, "loss": 0.8189832, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.20593262, "step": 14262, "time_per_iteration": 2.81742262840271 }, { "auxiliary_loss_clip": 0.01392974, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.23685622, "balance_loss_mlp": 1.01412773, "epoch": 0.8575379528032466, "flos": 21590868545280.0, "grad_norm": 3.7574752809143828, "language_loss": 0.68785763, "learning_rate": 2.0906624012587203e-07, "loss": 0.71211398, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.1854248, "step": 14263, "time_per_iteration": 2.796776533126831 }, { "auxiliary_loss_clip": 0.01411654, "auxiliary_loss_mlp": 0.01035659, "balance_loss_clip": 1.24929082, "balance_loss_mlp": 1.01656199, "epoch": 0.8575980760559146, "flos": 21771395873280.0, "grad_norm": 2.948337468359391, "language_loss": 0.80088222, "learning_rate": 2.088929137266986e-07, "loss": 0.82535535, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19091797, "step": 14264, "time_per_iteration": 2.816323757171631 }, { "auxiliary_loss_clip": 0.01411288, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.25164998, "balance_loss_mlp": 1.01627886, "epoch": 0.8576581993085826, "flos": 34399739007360.0, "grad_norm": 1.2793005532971435, "language_loss": 0.70050043, "learning_rate": 2.0871965524648582e-07, "loss": 0.724967, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19091797, "step": 14265, "time_per_iteration": 2.9305152893066406 }, { "auxiliary_loss_clip": 0.01390994, "auxiliary_loss_mlp": 0.01028697, "balance_loss_clip": 1.23475385, "balance_loss_mlp": 1.01062512, "epoch": 0.8577183225612506, "flos": 23233078990080.0, "grad_norm": 1.9524002903112594, "language_loss": 0.66894913, "learning_rate": 2.085464646918027e-07, "loss": 0.69314599, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18066406, "step": 14266, "time_per_iteration": 2.8515117168426514 }, { "auxiliary_loss_clip": 0.01400802, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.24275041, "balance_loss_mlp": 1.01541853, "epoch": 0.8577784458139185, "flos": 28816024417920.0, "grad_norm": 2.3047107091933143, "language_loss": 0.7637881, "learning_rate": 2.0837334206921731e-07, "loss": 0.78814375, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19348145, "step": 14267, "time_per_iteration": 2.8905978202819824 }, { "auxiliary_loss_clip": 0.01392501, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.23501849, "balance_loss_mlp": 1.01804006, "epoch": 0.8578385690665865, "flos": 19765054126080.0, "grad_norm": 1.6953041399624071, "language_loss": 0.87907243, "learning_rate": 2.082002873852946e-07, "loss": 0.90336037, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18261719, "step": 14268, "time_per_iteration": 2.8315558433532715 }, { "auxiliary_loss_clip": 0.01419055, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.25583029, "balance_loss_mlp": 1.01568425, "epoch": 0.8578986923192544, "flos": 20713777234560.0, "grad_norm": 1.7741067857405215, "language_loss": 0.74005139, "learning_rate": 2.0802730064659667e-07, "loss": 0.76458859, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18981934, "step": 14269, "time_per_iteration": 2.8940114974975586 }, { "auxiliary_loss_clip": 0.01403299, "auxiliary_loss_mlp": 0.01032814, "balance_loss_clip": 1.24214673, "balance_loss_mlp": 1.01385999, "epoch": 0.8579588155719224, "flos": 36115572021120.0, "grad_norm": 1.519727222069618, "language_loss": 0.66777313, "learning_rate": 2.0785438185968252e-07, "loss": 0.69213426, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18945312, "step": 14270, "time_per_iteration": 2.9452457427978516 }, { "auxiliary_loss_clip": 0.01393927, "auxiliary_loss_mlp": 0.01029589, "balance_loss_clip": 1.23671508, "balance_loss_mlp": 1.0108732, "epoch": 0.8580189388245905, "flos": 22862884884480.0, "grad_norm": 1.600574567535804, "language_loss": 0.74308056, "learning_rate": 2.0768153103110997e-07, "loss": 0.76731575, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18713379, "step": 14271, "time_per_iteration": 2.8707776069641113 }, { "auxiliary_loss_clip": 0.01180227, "auxiliary_loss_mlp": 0.01021706, "balance_loss_clip": 1.09032571, "balance_loss_mlp": 1.00196517, "epoch": 0.8580790620772584, "flos": 69676723042560.0, "grad_norm": 0.8014230951458096, "language_loss": 0.59676313, "learning_rate": 2.0750874816743358e-07, "loss": 0.6187824, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19726562, "step": 14272, "time_per_iteration": 3.385039806365967 }, { "auxiliary_loss_clip": 0.01418307, "auxiliary_loss_mlp": 0.0103313, "balance_loss_clip": 1.25379729, "balance_loss_mlp": 1.01391387, "epoch": 0.8581391853299264, "flos": 13342100140800.0, "grad_norm": 2.228200143606108, "language_loss": 0.76161075, "learning_rate": 2.0733603327520499e-07, "loss": 0.78612506, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1920166, "step": 14273, "time_per_iteration": 2.8465416431427 }, { "auxiliary_loss_clip": 0.01391108, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.23172092, "balance_loss_mlp": 1.01267815, "epoch": 0.8581993085825943, "flos": 19654982231040.0, "grad_norm": 1.740841862463271, "language_loss": 0.82619756, "learning_rate": 2.0716338636097385e-07, "loss": 0.85042095, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1854248, "step": 14274, "time_per_iteration": 2.834275245666504 }, { "auxiliary_loss_clip": 0.01179138, "auxiliary_loss_mlp": 0.01019761, "balance_loss_clip": 1.08986735, "balance_loss_mlp": 1.00097394, "epoch": 0.8582594318352623, "flos": 55849515707520.0, "grad_norm": 0.7932810194096229, "language_loss": 0.60917389, "learning_rate": 2.0699080743128672e-07, "loss": 0.63116288, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.1875, "step": 14275, "time_per_iteration": 3.3716845512390137 }, { "auxiliary_loss_clip": 0.0140476, "auxiliary_loss_mlp": 0.01029427, "balance_loss_clip": 1.24260485, "balance_loss_mlp": 1.0107708, "epoch": 0.8583195550879302, "flos": 24290154691200.0, "grad_norm": 3.7938678535016113, "language_loss": 0.60165733, "learning_rate": 2.0681829649268768e-07, "loss": 0.62599921, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18640137, "step": 14276, "time_per_iteration": 2.881760597229004 }, { "auxiliary_loss_clip": 0.01399758, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.23957705, "balance_loss_mlp": 1.01318073, "epoch": 0.8583796783405983, "flos": 13452126791040.0, "grad_norm": 2.4594527959746406, "language_loss": 0.77357149, "learning_rate": 2.0664585355171838e-07, "loss": 0.79789507, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19421387, "step": 14277, "time_per_iteration": 2.8129806518554688 }, { "auxiliary_loss_clip": 0.01404939, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.24497902, "balance_loss_mlp": 1.01344419, "epoch": 0.8584398015932662, "flos": 16188902893440.0, "grad_norm": 1.6858748316256966, "language_loss": 0.83873165, "learning_rate": 2.0647347861491803e-07, "loss": 0.86310887, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19335938, "step": 14278, "time_per_iteration": 2.8463923931121826 }, { "auxiliary_loss_clip": 0.01418818, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.25525999, "balance_loss_mlp": 1.01376295, "epoch": 0.8584999248459342, "flos": 17457616362240.0, "grad_norm": 2.1239204478919, "language_loss": 0.75895715, "learning_rate": 2.0630117168882366e-07, "loss": 0.78347993, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19702148, "step": 14279, "time_per_iteration": 2.875763177871704 }, { "auxiliary_loss_clip": 0.01395815, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.23820233, "balance_loss_mlp": 1.01395786, "epoch": 0.8585600480986021, "flos": 23451548722560.0, "grad_norm": 2.215698549422693, "language_loss": 0.67116308, "learning_rate": 2.0612893277996845e-07, "loss": 0.69545043, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18969727, "step": 14280, "time_per_iteration": 3.0334174633026123 }, { "auxiliary_loss_clip": 0.01394493, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.23659587, "balance_loss_mlp": 1.01266527, "epoch": 0.8586201713512701, "flos": 19947436490880.0, "grad_norm": 1.8181279265563723, "language_loss": 0.63502955, "learning_rate": 2.0595676189488343e-07, "loss": 0.65928257, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18139648, "step": 14281, "time_per_iteration": 2.8748443126678467 }, { "auxiliary_loss_clip": 0.01400817, "auxiliary_loss_mlp": 0.01031119, "balance_loss_clip": 1.24132097, "balance_loss_mlp": 1.01208091, "epoch": 0.858680294603938, "flos": 15313214171520.0, "grad_norm": 3.2058264117172603, "language_loss": 0.74432278, "learning_rate": 2.0578465904009845e-07, "loss": 0.76864219, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19042969, "step": 14282, "time_per_iteration": 4.23938512802124 }, { "auxiliary_loss_clip": 0.01394283, "auxiliary_loss_mlp": 0.01031568, "balance_loss_clip": 1.23503411, "balance_loss_mlp": 1.01332879, "epoch": 0.858740417856606, "flos": 22721566815360.0, "grad_norm": 1.7562597010016865, "language_loss": 0.76572728, "learning_rate": 2.0561262422213832e-07, "loss": 0.78998584, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18237305, "step": 14283, "time_per_iteration": 2.868083953857422 }, { "auxiliary_loss_clip": 0.01402825, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.24332237, "balance_loss_mlp": 1.01575816, "epoch": 0.8588005411092741, "flos": 34066537165440.0, "grad_norm": 1.779748131504728, "language_loss": 0.60635078, "learning_rate": 2.0544065744752736e-07, "loss": 0.63072574, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18884277, "step": 14284, "time_per_iteration": 2.9781763553619385 }, { "auxiliary_loss_clip": 0.01383493, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.22809386, "balance_loss_mlp": 1.01115477, "epoch": 0.858860664361942, "flos": 28925462885760.0, "grad_norm": 1.6394972357545472, "language_loss": 0.76454687, "learning_rate": 2.0526875872278749e-07, "loss": 0.7886709, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.17749023, "step": 14285, "time_per_iteration": 4.360584020614624 }, { "auxiliary_loss_clip": 0.01406454, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.24523652, "balance_loss_mlp": 1.01698959, "epoch": 0.85892078761461, "flos": 19802136879360.0, "grad_norm": 2.0727942827938883, "language_loss": 0.75029778, "learning_rate": 2.0509692805443524e-07, "loss": 0.77472758, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19543457, "step": 14286, "time_per_iteration": 2.8486523628234863 }, { "auxiliary_loss_clip": 0.01178792, "auxiliary_loss_mlp": 0.0101559, "balance_loss_clip": 1.08978379, "balance_loss_mlp": 0.99756527, "epoch": 0.8589809108672779, "flos": 67135930007040.0, "grad_norm": 0.7611850822519846, "language_loss": 0.49459809, "learning_rate": 2.0492516544898718e-07, "loss": 0.51654196, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18066406, "step": 14287, "time_per_iteration": 3.316173791885376 }, { "auxiliary_loss_clip": 0.01402757, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.24327934, "balance_loss_mlp": 1.01405978, "epoch": 0.8590410341199459, "flos": 29728343445120.0, "grad_norm": 1.7418932216879512, "language_loss": 0.7948184, "learning_rate": 2.0475347091295704e-07, "loss": 0.81917644, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18994141, "step": 14288, "time_per_iteration": 2.924221992492676 }, { "auxiliary_loss_clip": 0.01412035, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.24963856, "balance_loss_mlp": 1.01364625, "epoch": 0.8591011573726138, "flos": 23997881410560.0, "grad_norm": 2.2172759602227097, "language_loss": 0.81090349, "learning_rate": 2.045818444528553e-07, "loss": 0.83536369, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20336914, "step": 14289, "time_per_iteration": 3.026127815246582 }, { "auxiliary_loss_clip": 0.01409287, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.24926555, "balance_loss_mlp": 1.01196361, "epoch": 0.8591612806252819, "flos": 14436756288000.0, "grad_norm": 1.7828910826089455, "language_loss": 0.66073084, "learning_rate": 2.0441028607518973e-07, "loss": 0.68512422, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1809082, "step": 14290, "time_per_iteration": 4.217029094696045 }, { "auxiliary_loss_clip": 0.01417874, "auxiliary_loss_mlp": 0.01034121, "balance_loss_clip": 1.25375485, "balance_loss_mlp": 1.01528633, "epoch": 0.8592214038779498, "flos": 31589747539200.0, "grad_norm": 2.0183768821284156, "language_loss": 0.56388158, "learning_rate": 2.0423879578646642e-07, "loss": 0.58840156, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18847656, "step": 14291, "time_per_iteration": 3.0566208362579346 }, { "auxiliary_loss_clip": 0.01400958, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.23976088, "balance_loss_mlp": 1.01197553, "epoch": 0.8592815271306178, "flos": 17466258119040.0, "grad_norm": 4.364510761322985, "language_loss": 0.72562099, "learning_rate": 2.0406737359318792e-07, "loss": 0.74993545, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18505859, "step": 14292, "time_per_iteration": 4.281145811080933 }, { "auxiliary_loss_clip": 0.01402922, "auxiliary_loss_mlp": 0.01030915, "balance_loss_clip": 1.24216497, "balance_loss_mlp": 1.01191282, "epoch": 0.8593416503832857, "flos": 25422436529280.0, "grad_norm": 1.7915578039424898, "language_loss": 0.71511233, "learning_rate": 2.038960195018542e-07, "loss": 0.73945069, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19018555, "step": 14293, "time_per_iteration": 2.8909337520599365 }, { "auxiliary_loss_clip": 0.01392125, "auxiliary_loss_mlp": 0.01030476, "balance_loss_clip": 1.23481822, "balance_loss_mlp": 1.01187944, "epoch": 0.8594017736359537, "flos": 21006593452800.0, "grad_norm": 2.837678306701247, "language_loss": 0.69896412, "learning_rate": 2.0372473351896358e-07, "loss": 0.72319007, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18603516, "step": 14294, "time_per_iteration": 2.843010425567627 }, { "auxiliary_loss_clip": 0.01384835, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.22878718, "balance_loss_mlp": 1.01138282, "epoch": 0.8594618968886216, "flos": 22101249600000.0, "grad_norm": 2.2177795598676826, "language_loss": 0.78144336, "learning_rate": 2.0355351565101087e-07, "loss": 0.80558705, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18151855, "step": 14295, "time_per_iteration": 2.8818867206573486 }, { "auxiliary_loss_clip": 0.014188, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.2545625, "balance_loss_mlp": 1.01624107, "epoch": 0.8595220201412896, "flos": 11663757083520.0, "grad_norm": 2.902115513057223, "language_loss": 0.7040236, "learning_rate": 2.0338236590448975e-07, "loss": 0.72857356, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19958496, "step": 14296, "time_per_iteration": 2.826753616333008 }, { "auxiliary_loss_clip": 0.01400291, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.24019694, "balance_loss_mlp": 1.01197636, "epoch": 0.8595821433939577, "flos": 25049753959680.0, "grad_norm": 2.3653577850120517, "language_loss": 0.79906833, "learning_rate": 2.0321128428588842e-07, "loss": 0.82338804, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19689941, "step": 14297, "time_per_iteration": 2.881380796432495 }, { "auxiliary_loss_clip": 0.0138889, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 1.23186505, "balance_loss_mlp": 1.00907135, "epoch": 0.8596422666466256, "flos": 28523117710080.0, "grad_norm": 2.112773337721862, "language_loss": 0.68921822, "learning_rate": 2.030402708016954e-07, "loss": 0.7133832, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18530273, "step": 14298, "time_per_iteration": 2.941148281097412 }, { "auxiliary_loss_clip": 0.0139802, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.24102485, "balance_loss_mlp": 1.01662767, "epoch": 0.8597023898992936, "flos": 13596928709760.0, "grad_norm": 2.133678940546827, "language_loss": 0.69229031, "learning_rate": 2.0286932545839576e-07, "loss": 0.71661913, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18225098, "step": 14299, "time_per_iteration": 2.833583354949951 }, { "auxiliary_loss_clip": 0.01398442, "auxiliary_loss_mlp": 0.01035307, "balance_loss_clip": 1.23746264, "balance_loss_mlp": 1.01587558, "epoch": 0.8597625131519615, "flos": 32312037830400.0, "grad_norm": 2.420628480155815, "language_loss": 0.71733624, "learning_rate": 2.0269844826247096e-07, "loss": 0.74167371, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19421387, "step": 14300, "time_per_iteration": 2.90362286567688 }, { "auxiliary_loss_clip": 0.01392077, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.23451662, "balance_loss_mlp": 1.01533794, "epoch": 0.8598226364046295, "flos": 28741994645760.0, "grad_norm": 1.80383236597738, "language_loss": 0.70341808, "learning_rate": 2.0252763922040116e-07, "loss": 0.72768301, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19067383, "step": 14301, "time_per_iteration": 2.895787477493286 }, { "auxiliary_loss_clip": 0.01402297, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.24225235, "balance_loss_mlp": 1.01338816, "epoch": 0.8598827596572974, "flos": 21881874971520.0, "grad_norm": 1.7100516132598813, "language_loss": 0.75470454, "learning_rate": 2.023568983386641e-07, "loss": 0.77905101, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1895752, "step": 14302, "time_per_iteration": 2.8710737228393555 }, { "auxiliary_loss_clip": 0.01386607, "auxiliary_loss_mlp": 0.01028521, "balance_loss_clip": 1.22995377, "balance_loss_mlp": 1.01060343, "epoch": 0.8599428829099655, "flos": 23777375662080.0, "grad_norm": 1.6880420334577724, "language_loss": 0.8446005, "learning_rate": 2.02186225623733e-07, "loss": 0.86875176, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.17919922, "step": 14303, "time_per_iteration": 2.87199068069458 }, { "auxiliary_loss_clip": 0.01413239, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.25070405, "balance_loss_mlp": 1.01529348, "epoch": 0.8600030061626334, "flos": 16220375291520.0, "grad_norm": 2.1852933972665722, "language_loss": 0.77856988, "learning_rate": 2.0201562108208025e-07, "loss": 0.80304068, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18530273, "step": 14304, "time_per_iteration": 2.802100896835327 }, { "auxiliary_loss_clip": 0.01400741, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.23985493, "balance_loss_mlp": 1.01121449, "epoch": 0.8600631294153014, "flos": 15677843166720.0, "grad_norm": 2.0471899827067648, "language_loss": 0.54987496, "learning_rate": 2.0184508472017537e-07, "loss": 0.57419109, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.1965332, "step": 14305, "time_per_iteration": 2.820127248764038 }, { "auxiliary_loss_clip": 0.01394792, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.23589492, "balance_loss_mlp": 1.0115205, "epoch": 0.8601232526679693, "flos": 17501893038720.0, "grad_norm": 1.814234228639476, "language_loss": 0.84157789, "learning_rate": 2.0167461654448558e-07, "loss": 0.86582536, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18432617, "step": 14306, "time_per_iteration": 2.8018598556518555 }, { "auxiliary_loss_clip": 0.01387214, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.23098254, "balance_loss_mlp": 1.01070237, "epoch": 0.8601833759206373, "flos": 26998897000320.0, "grad_norm": 1.3140799746086815, "language_loss": 0.72150755, "learning_rate": 2.01504216561474e-07, "loss": 0.74566865, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18200684, "step": 14307, "time_per_iteration": 2.9462833404541016 }, { "auxiliary_loss_clip": 0.01408242, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.24514282, "balance_loss_mlp": 1.01369083, "epoch": 0.8602434991733052, "flos": 25241004305280.0, "grad_norm": 1.5543447258844592, "language_loss": 0.64511454, "learning_rate": 2.0133388477760316e-07, "loss": 0.66953349, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19970703, "step": 14308, "time_per_iteration": 2.8774564266204834 }, { "auxiliary_loss_clip": 0.01179868, "auxiliary_loss_mlp": 0.01022533, "balance_loss_clip": 1.08921337, "balance_loss_mlp": 1.00107503, "epoch": 0.8603036224259732, "flos": 71046911894400.0, "grad_norm": 0.6268461084852628, "language_loss": 0.48541331, "learning_rate": 2.0116362119933172e-07, "loss": 0.50743735, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.21484375, "step": 14309, "time_per_iteration": 3.424689292907715 }, { "auxiliary_loss_clip": 0.01410791, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.24784875, "balance_loss_mlp": 1.01385856, "epoch": 0.8603637456786413, "flos": 20309667511680.0, "grad_norm": 2.2869293043614047, "language_loss": 0.6732862, "learning_rate": 2.0099342583311563e-07, "loss": 0.69771552, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18286133, "step": 14310, "time_per_iteration": 2.898127794265747 }, { "auxiliary_loss_clip": 0.01399207, "auxiliary_loss_mlp": 0.0103008, "balance_loss_clip": 1.23920631, "balance_loss_mlp": 1.01210332, "epoch": 0.8604238689313092, "flos": 21846013827840.0, "grad_norm": 1.6342848116136621, "language_loss": 0.78629059, "learning_rate": 2.0082329868540905e-07, "loss": 0.81058347, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.17980957, "step": 14311, "time_per_iteration": 2.8879222869873047 }, { "auxiliary_loss_clip": 0.0139767, "auxiliary_loss_mlp": 0.01030215, "balance_loss_clip": 1.2394557, "balance_loss_mlp": 1.01184487, "epoch": 0.8604839921839772, "flos": 18013224234240.0, "grad_norm": 1.8397629791121286, "language_loss": 0.72065175, "learning_rate": 2.006532397626639e-07, "loss": 0.74493062, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18371582, "step": 14312, "time_per_iteration": 2.87563157081604 }, { "auxiliary_loss_clip": 0.01397798, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.23809409, "balance_loss_mlp": 1.01491535, "epoch": 0.8605441154366451, "flos": 16260534691200.0, "grad_norm": 2.8628229202226696, "language_loss": 0.78489214, "learning_rate": 2.0048324907132797e-07, "loss": 0.80920392, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18444824, "step": 14313, "time_per_iteration": 2.837592363357544 }, { "auxiliary_loss_clip": 0.01390824, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.23442793, "balance_loss_mlp": 1.01355267, "epoch": 0.8606042386893131, "flos": 32278981864320.0, "grad_norm": 1.5122201915452007, "language_loss": 0.73433268, "learning_rate": 2.003133266178474e-07, "loss": 0.7585746, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19799805, "step": 14314, "time_per_iteration": 2.9720890522003174 }, { "auxiliary_loss_clip": 0.01390237, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.23144484, "balance_loss_mlp": 1.01324284, "epoch": 0.860664361941981, "flos": 20239347813120.0, "grad_norm": 2.2487109886891314, "language_loss": 0.69745255, "learning_rate": 2.001434724086657e-07, "loss": 0.72167301, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18579102, "step": 14315, "time_per_iteration": 2.859649658203125 }, { "auxiliary_loss_clip": 0.01406865, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.247172, "balance_loss_mlp": 1.01356006, "epoch": 0.8607244851946491, "flos": 25202428473600.0, "grad_norm": 1.5815990301923042, "language_loss": 0.7270028, "learning_rate": 1.9997368645022418e-07, "loss": 0.75138867, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18164062, "step": 14316, "time_per_iteration": 2.936397075653076 }, { "auxiliary_loss_clip": 0.01410958, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.24909651, "balance_loss_mlp": 1.01435971, "epoch": 0.860784608447317, "flos": 20490918756480.0, "grad_norm": 1.7010269583597377, "language_loss": 0.83747292, "learning_rate": 1.9980396874896056e-07, "loss": 0.86191559, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18933105, "step": 14317, "time_per_iteration": 4.301350831985474 }, { "auxiliary_loss_clip": 0.01388748, "auxiliary_loss_mlp": 0.01030557, "balance_loss_clip": 1.23236263, "balance_loss_mlp": 1.01153111, "epoch": 0.860844731699985, "flos": 50493347072640.0, "grad_norm": 2.289593956792784, "language_loss": 0.67217344, "learning_rate": 1.996343193113108e-07, "loss": 0.69636649, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.19030762, "step": 14318, "time_per_iteration": 3.188929319381714 }, { "auxiliary_loss_clip": 0.01386647, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.23109794, "balance_loss_mlp": 1.01446795, "epoch": 0.8609048549526529, "flos": 41187819680640.0, "grad_norm": 1.5262240236159628, "language_loss": 0.72127414, "learning_rate": 1.9946473814370911e-07, "loss": 0.74546558, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18029785, "step": 14319, "time_per_iteration": 3.112804651260376 }, { "auxiliary_loss_clip": 0.01407884, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.24632382, "balance_loss_mlp": 1.01384664, "epoch": 0.8609649782053209, "flos": 23961703553280.0, "grad_norm": 1.8725884325476636, "language_loss": 0.68218064, "learning_rate": 1.992952252525839e-07, "loss": 0.70658684, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18896484, "step": 14320, "time_per_iteration": 4.301659822463989 }, { "auxiliary_loss_clip": 0.01406816, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.24425292, "balance_loss_mlp": 1.01412487, "epoch": 0.8610251014579888, "flos": 23123007095040.0, "grad_norm": 2.1930362057304627, "language_loss": 0.80476248, "learning_rate": 1.9912578064436446e-07, "loss": 0.82916635, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19433594, "step": 14321, "time_per_iteration": 2.880366802215576 }, { "auxiliary_loss_clip": 0.01377017, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.22339416, "balance_loss_mlp": 1.01412809, "epoch": 0.8610852247106568, "flos": 19436105295360.0, "grad_norm": 1.8939193009911237, "language_loss": 0.71709406, "learning_rate": 1.9895640432547567e-07, "loss": 0.74120164, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.19616699, "step": 14322, "time_per_iteration": 2.8419291973114014 }, { "auxiliary_loss_clip": 0.01423373, "auxiliary_loss_mlp": 0.01037605, "balance_loss_clip": 1.25896311, "balance_loss_mlp": 1.01871085, "epoch": 0.8611453479633249, "flos": 19319970597120.0, "grad_norm": 1.9777990072852365, "language_loss": 0.57049304, "learning_rate": 1.9878709630234102e-07, "loss": 0.59510279, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18908691, "step": 14323, "time_per_iteration": 2.8401448726654053 }, { "auxiliary_loss_clip": 0.01397038, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.23940861, "balance_loss_mlp": 1.01341486, "epoch": 0.8612054712159928, "flos": 23262877330560.0, "grad_norm": 3.0182283050245347, "language_loss": 0.76412952, "learning_rate": 1.986178565813801e-07, "loss": 0.78842366, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1895752, "step": 14324, "time_per_iteration": 2.865325689315796 }, { "auxiliary_loss_clip": 0.01395902, "auxiliary_loss_mlp": 0.01033638, "balance_loss_clip": 1.23620892, "balance_loss_mlp": 1.01307487, "epoch": 0.8612655944686608, "flos": 16035504462720.0, "grad_norm": 2.122615404936511, "language_loss": 0.67089617, "learning_rate": 1.9844868516901036e-07, "loss": 0.69519162, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.20568848, "step": 14325, "time_per_iteration": 4.218362808227539 }, { "auxiliary_loss_clip": 0.01402373, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.24116218, "balance_loss_mlp": 1.01603794, "epoch": 0.8613257177213287, "flos": 22503097082880.0, "grad_norm": 1.6769150826950707, "language_loss": 0.65896475, "learning_rate": 1.982795820716472e-07, "loss": 0.68334496, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19604492, "step": 14326, "time_per_iteration": 2.866687297821045 }, { "auxiliary_loss_clip": 0.01415598, "auxiliary_loss_mlp": 0.0103206, "balance_loss_clip": 1.2536217, "balance_loss_mlp": 1.01372612, "epoch": 0.8613858409739967, "flos": 17246883490560.0, "grad_norm": 1.9744949956016458, "language_loss": 0.8559382, "learning_rate": 1.9811054729570253e-07, "loss": 0.88041472, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18322754, "step": 14327, "time_per_iteration": 4.171824932098389 }, { "auxiliary_loss_clip": 0.01397072, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.23874259, "balance_loss_mlp": 1.01412451, "epoch": 0.8614459642266646, "flos": 22831593465600.0, "grad_norm": 2.7403495534756823, "language_loss": 0.75486982, "learning_rate": 1.9794158084758661e-07, "loss": 0.77917224, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19055176, "step": 14328, "time_per_iteration": 2.946436643600464 }, { "auxiliary_loss_clip": 0.01394667, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.23704207, "balance_loss_mlp": 1.01240277, "epoch": 0.8615060874793327, "flos": 26515237639680.0, "grad_norm": 1.6536907637625073, "language_loss": 0.80182695, "learning_rate": 1.9777268273370673e-07, "loss": 0.82607162, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.17407227, "step": 14329, "time_per_iteration": 2.89574933052063 }, { "auxiliary_loss_clip": 0.01404794, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.24494195, "balance_loss_mlp": 1.01646376, "epoch": 0.8615662107320006, "flos": 24071639713920.0, "grad_norm": 2.196772898633525, "language_loss": 0.78323293, "learning_rate": 1.9760385296046757e-07, "loss": 0.80763453, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18908691, "step": 14330, "time_per_iteration": 2.873300313949585 }, { "auxiliary_loss_clip": 0.01383499, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.22617674, "balance_loss_mlp": 1.01466441, "epoch": 0.8616263339846686, "flos": 24174291461760.0, "grad_norm": 1.883580267607281, "language_loss": 0.65503335, "learning_rate": 1.974350915342702e-07, "loss": 0.67920619, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19116211, "step": 14331, "time_per_iteration": 2.893131971359253 }, { "auxiliary_loss_clip": 0.0138774, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.23204482, "balance_loss_mlp": 1.01442206, "epoch": 0.8616864572373365, "flos": 21733951161600.0, "grad_norm": 1.6160889310737399, "language_loss": 0.76592016, "learning_rate": 1.9726639846151506e-07, "loss": 0.79011381, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.17199707, "step": 14332, "time_per_iteration": 2.861217737197876 }, { "auxiliary_loss_clip": 0.01401555, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.24008632, "balance_loss_mlp": 1.01374435, "epoch": 0.8617465804900045, "flos": 23776561255680.0, "grad_norm": 1.857838282623686, "language_loss": 0.67591214, "learning_rate": 1.9709777374859904e-07, "loss": 0.70025802, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19287109, "step": 14333, "time_per_iteration": 2.9158599376678467 }, { "auxiliary_loss_clip": 0.01424598, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.25860572, "balance_loss_mlp": 1.01569617, "epoch": 0.8618067037426724, "flos": 37717396842240.0, "grad_norm": 2.217732669756166, "language_loss": 0.63312435, "learning_rate": 1.969292174019157e-07, "loss": 0.6577298, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.20263672, "step": 14334, "time_per_iteration": 3.045092821121216 }, { "auxiliary_loss_clip": 0.01417372, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.25387144, "balance_loss_mlp": 1.01474977, "epoch": 0.8618668269953405, "flos": 21481113363840.0, "grad_norm": 1.831423798351321, "language_loss": 0.69961691, "learning_rate": 1.967607294278577e-07, "loss": 0.72412872, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19067383, "step": 14335, "time_per_iteration": 2.8941683769226074 }, { "auxiliary_loss_clip": 0.0140698, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.24749529, "balance_loss_mlp": 1.01595664, "epoch": 0.8619269502480085, "flos": 22241798507520.0, "grad_norm": 1.6141494851472251, "language_loss": 0.83350885, "learning_rate": 1.965923098328135e-07, "loss": 0.85792184, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18359375, "step": 14336, "time_per_iteration": 2.9764790534973145 }, { "auxiliary_loss_clip": 0.01417694, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.25277948, "balance_loss_mlp": 1.01331782, "epoch": 0.8619870735006764, "flos": 22720797653760.0, "grad_norm": 1.8888517797854774, "language_loss": 0.6839658, "learning_rate": 1.9642395862316907e-07, "loss": 0.70846963, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19372559, "step": 14337, "time_per_iteration": 2.8869025707244873 }, { "auxiliary_loss_clip": 0.013966, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.23788679, "balance_loss_mlp": 1.0098871, "epoch": 0.8620471967533444, "flos": 37533385664640.0, "grad_norm": 1.6914932000710234, "language_loss": 0.6771695, "learning_rate": 1.962556758053089e-07, "loss": 0.70142442, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19006348, "step": 14338, "time_per_iteration": 3.0290868282318115 }, { "auxiliary_loss_clip": 0.01404187, "auxiliary_loss_mlp": 0.0103452, "balance_loss_clip": 1.24422264, "balance_loss_mlp": 1.01682973, "epoch": 0.8621073200060123, "flos": 19691884005120.0, "grad_norm": 4.610401707538982, "language_loss": 0.62758565, "learning_rate": 1.9608746138561448e-07, "loss": 0.65197277, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17687988, "step": 14339, "time_per_iteration": 2.9000966548919678 }, { "auxiliary_loss_clip": 0.01388594, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.23136997, "balance_loss_mlp": 1.01324558, "epoch": 0.8621674432586803, "flos": 14544656432640.0, "grad_norm": 1.9494598019263136, "language_loss": 0.63468206, "learning_rate": 1.9591931537046458e-07, "loss": 0.6588859, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18554688, "step": 14340, "time_per_iteration": 2.7810592651367188 }, { "auxiliary_loss_clip": 0.01379404, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.22673643, "balance_loss_mlp": 1.01032305, "epoch": 0.8622275665113482, "flos": 20749683623040.0, "grad_norm": 1.5873708709124237, "language_loss": 0.80581897, "learning_rate": 1.9575123776623493e-07, "loss": 0.82990021, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.18395996, "step": 14341, "time_per_iteration": 2.8484578132629395 }, { "auxiliary_loss_clip": 0.01388815, "auxiliary_loss_mlp": 0.01033655, "balance_loss_clip": 1.23102522, "balance_loss_mlp": 1.01532054, "epoch": 0.8622876897640163, "flos": 24726234504960.0, "grad_norm": 1.6942298347708242, "language_loss": 0.75036353, "learning_rate": 1.9558322857929887e-07, "loss": 0.77458823, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18347168, "step": 14342, "time_per_iteration": 2.90468168258667 }, { "auxiliary_loss_clip": 0.0141023, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.24785018, "balance_loss_mlp": 1.01445365, "epoch": 0.8623478130166842, "flos": 17466348608640.0, "grad_norm": 1.8533929931688453, "language_loss": 0.69994974, "learning_rate": 1.95415287816028e-07, "loss": 0.72439259, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19616699, "step": 14343, "time_per_iteration": 2.829237222671509 }, { "auxiliary_loss_clip": 0.01390521, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.23103118, "balance_loss_mlp": 1.0192306, "epoch": 0.8624079362693522, "flos": 18117052346880.0, "grad_norm": 1.723900139750099, "language_loss": 0.68714195, "learning_rate": 1.9524741548278967e-07, "loss": 0.71143222, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19274902, "step": 14344, "time_per_iteration": 2.816903829574585 }, { "auxiliary_loss_clip": 0.01410619, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.24810743, "balance_loss_mlp": 1.0142746, "epoch": 0.8624680595220201, "flos": 30679193059200.0, "grad_norm": 1.433944646967897, "language_loss": 0.81736368, "learning_rate": 1.9507961158595054e-07, "loss": 0.841802, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18945312, "step": 14345, "time_per_iteration": 2.9425466060638428 }, { "auxiliary_loss_clip": 0.01406344, "auxiliary_loss_mlp": 0.01032813, "balance_loss_clip": 1.24605656, "balance_loss_mlp": 1.01326251, "epoch": 0.8625281827746881, "flos": 38012837258880.0, "grad_norm": 2.0726354337847956, "language_loss": 0.523498, "learning_rate": 1.9491187613187355e-07, "loss": 0.54788953, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19555664, "step": 14346, "time_per_iteration": 3.0011844635009766 }, { "auxiliary_loss_clip": 0.01385266, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.2264843, "balance_loss_mlp": 1.01218367, "epoch": 0.862588306027356, "flos": 26260816273920.0, "grad_norm": 1.4890589733900672, "language_loss": 0.75587356, "learning_rate": 1.9474420912691913e-07, "loss": 0.78003496, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18701172, "step": 14347, "time_per_iteration": 2.8641397953033447 }, { "auxiliary_loss_clip": 0.01408765, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.24811864, "balance_loss_mlp": 1.01545858, "epoch": 0.862648429280024, "flos": 25888224193920.0, "grad_norm": 1.8465364940068945, "language_loss": 0.81481373, "learning_rate": 1.945766105774449e-07, "loss": 0.83924568, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18969727, "step": 14348, "time_per_iteration": 2.900980234146118 }, { "auxiliary_loss_clip": 0.01382168, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.22804785, "balance_loss_mlp": 1.0129869, "epoch": 0.862708552532692, "flos": 37830681118080.0, "grad_norm": 1.5781910721507775, "language_loss": 0.66392905, "learning_rate": 1.9440908048980665e-07, "loss": 0.68806493, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.18408203, "step": 14349, "time_per_iteration": 3.0167076587677 }, { "auxiliary_loss_clip": 0.0138368, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.22586977, "balance_loss_mlp": 1.01359582, "epoch": 0.86276867578536, "flos": 19099283869440.0, "grad_norm": 3.8162422637771556, "language_loss": 0.71093345, "learning_rate": 1.942416188703573e-07, "loss": 0.73509586, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18981934, "step": 14350, "time_per_iteration": 2.887859582901001 }, { "auxiliary_loss_clip": 0.01399649, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.23842621, "balance_loss_mlp": 1.01214278, "epoch": 0.862828799038028, "flos": 22174555455360.0, "grad_norm": 1.776895640338648, "language_loss": 0.78173733, "learning_rate": 1.9407422572544618e-07, "loss": 0.80604863, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19348145, "step": 14351, "time_per_iteration": 2.8710227012634277 }, { "auxiliary_loss_clip": 0.01403082, "auxiliary_loss_mlp": 0.01033512, "balance_loss_clip": 1.24430192, "balance_loss_mlp": 1.01484406, "epoch": 0.8628889222906959, "flos": 23155112920320.0, "grad_norm": 1.8814113944041224, "language_loss": 0.85484183, "learning_rate": 1.9390690106142204e-07, "loss": 0.87920773, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18676758, "step": 14352, "time_per_iteration": 4.2580389976501465 }, { "auxiliary_loss_clip": 0.01181988, "auxiliary_loss_mlp": 0.01023731, "balance_loss_clip": 1.09131169, "balance_loss_mlp": 1.00360858, "epoch": 0.8629490455433639, "flos": 57848545779840.0, "grad_norm": 0.7974411927820524, "language_loss": 0.62002707, "learning_rate": 1.9373964488462913e-07, "loss": 0.64208424, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.20117188, "step": 14353, "time_per_iteration": 3.3525543212890625 }, { "auxiliary_loss_clip": 0.01390304, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.23319912, "balance_loss_mlp": 1.01450586, "epoch": 0.8630091687960318, "flos": 15926880401280.0, "grad_norm": 5.904721604398567, "language_loss": 0.8244732, "learning_rate": 1.9357245720140948e-07, "loss": 0.84869587, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17431641, "step": 14354, "time_per_iteration": 2.8387889862060547 }, { "auxiliary_loss_clip": 0.01399892, "auxiliary_loss_mlp": 0.01033619, "balance_loss_clip": 1.24082971, "balance_loss_mlp": 1.01410484, "epoch": 0.8630692920486999, "flos": 17969128536960.0, "grad_norm": 1.801015706354795, "language_loss": 0.86698776, "learning_rate": 1.934053380181031e-07, "loss": 0.89132285, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19494629, "step": 14355, "time_per_iteration": 4.28113865852356 }, { "auxiliary_loss_clip": 0.01406151, "auxiliary_loss_mlp": 0.01034503, "balance_loss_clip": 1.24563837, "balance_loss_mlp": 1.01484561, "epoch": 0.8631294153013678, "flos": 22465245168000.0, "grad_norm": 1.9478915540883277, "language_loss": 0.59255135, "learning_rate": 1.9323828734104763e-07, "loss": 0.61695784, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1965332, "step": 14356, "time_per_iteration": 2.840611696243286 }, { "auxiliary_loss_clip": 0.01417951, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.25342536, "balance_loss_mlp": 1.01718259, "epoch": 0.8631895385540358, "flos": 16845805169280.0, "grad_norm": 1.5661686944474218, "language_loss": 0.77797973, "learning_rate": 1.9307130517657756e-07, "loss": 0.80252254, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19140625, "step": 14357, "time_per_iteration": 2.8416881561279297 }, { "auxiliary_loss_clip": 0.01407243, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.24659455, "balance_loss_mlp": 1.01714706, "epoch": 0.8632496618067037, "flos": 18706349612160.0, "grad_norm": 2.612709166259756, "language_loss": 0.78403986, "learning_rate": 1.9290439153102468e-07, "loss": 0.80847412, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19042969, "step": 14358, "time_per_iteration": 2.850579261779785 }, { "auxiliary_loss_clip": 0.01403254, "auxiliary_loss_mlp": 0.01032583, "balance_loss_clip": 1.24207616, "balance_loss_mlp": 1.0141654, "epoch": 0.8633097850593717, "flos": 24290607139200.0, "grad_norm": 1.280606414428361, "language_loss": 0.75257456, "learning_rate": 1.9273754641071816e-07, "loss": 0.7769329, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18432617, "step": 14359, "time_per_iteration": 2.9248881340026855 }, { "auxiliary_loss_clip": 0.01396679, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.24051929, "balance_loss_mlp": 1.0121665, "epoch": 0.8633699083120396, "flos": 21188432880000.0, "grad_norm": 2.0214851925395694, "language_loss": 0.71400326, "learning_rate": 1.9257076982198517e-07, "loss": 0.73828781, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19592285, "step": 14360, "time_per_iteration": 4.295778512954712 }, { "auxiliary_loss_clip": 0.01415162, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.25220621, "balance_loss_mlp": 1.01456308, "epoch": 0.8634300315647077, "flos": 19254265868160.0, "grad_norm": 2.3498925005756894, "language_loss": 0.76511878, "learning_rate": 1.9240406177114953e-07, "loss": 0.78960782, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19189453, "step": 14361, "time_per_iteration": 2.8387696743011475 }, { "auxiliary_loss_clip": 0.01181804, "auxiliary_loss_mlp": 0.01020199, "balance_loss_clip": 1.09248543, "balance_loss_mlp": 0.99969542, "epoch": 0.8634901548173756, "flos": 66225013568640.0, "grad_norm": 0.9525451811913462, "language_loss": 0.5887835, "learning_rate": 1.922374222645329e-07, "loss": 0.61080354, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.20507812, "step": 14362, "time_per_iteration": 4.858181476593018 }, { "auxiliary_loss_clip": 0.01413422, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.24947751, "balance_loss_mlp": 1.01293027, "epoch": 0.8635502780700436, "flos": 24800038053120.0, "grad_norm": 1.6546876119293759, "language_loss": 0.8133502, "learning_rate": 1.9207085130845524e-07, "loss": 0.83780903, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.1953125, "step": 14363, "time_per_iteration": 2.8830459117889404 }, { "auxiliary_loss_clip": 0.01397963, "auxiliary_loss_mlp": 0.01033356, "balance_loss_clip": 1.23694873, "balance_loss_mlp": 1.01398468, "epoch": 0.8636104013227116, "flos": 25200347212800.0, "grad_norm": 2.3580334518710684, "language_loss": 0.73702121, "learning_rate": 1.9190434890923112e-07, "loss": 0.76133442, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19360352, "step": 14364, "time_per_iteration": 2.8994593620300293 }, { "auxiliary_loss_clip": 0.01398697, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.23731744, "balance_loss_mlp": 1.01225019, "epoch": 0.8636705245753795, "flos": 23888985880320.0, "grad_norm": 1.6612991342669692, "language_loss": 0.72517765, "learning_rate": 1.917379150731755e-07, "loss": 0.74946696, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18005371, "step": 14365, "time_per_iteration": 2.906771659851074 }, { "auxiliary_loss_clip": 0.01408044, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.24490643, "balance_loss_mlp": 1.01407528, "epoch": 0.8637306478280475, "flos": 23120337651840.0, "grad_norm": 2.000355342987979, "language_loss": 0.71363658, "learning_rate": 1.915715498065993e-07, "loss": 0.73804992, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19213867, "step": 14366, "time_per_iteration": 2.867645025253296 }, { "auxiliary_loss_clip": 0.01391412, "auxiliary_loss_mlp": 0.01031372, "balance_loss_clip": 1.23472631, "balance_loss_mlp": 1.01320481, "epoch": 0.8637907710807154, "flos": 21916469260800.0, "grad_norm": 1.638527109448551, "language_loss": 0.82330406, "learning_rate": 1.9140525311581146e-07, "loss": 0.84753191, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.1817627, "step": 14367, "time_per_iteration": 2.8661606311798096 }, { "auxiliary_loss_clip": 0.01403686, "auxiliary_loss_mlp": 0.01030895, "balance_loss_clip": 1.24268639, "balance_loss_mlp": 1.0111897, "epoch": 0.8638508943333835, "flos": 23589428186880.0, "grad_norm": 2.013782625565588, "language_loss": 0.62249374, "learning_rate": 1.9123902500711743e-07, "loss": 0.64683956, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19714355, "step": 14368, "time_per_iteration": 2.913501739501953 }, { "auxiliary_loss_clip": 0.01410032, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.25097609, "balance_loss_mlp": 1.01238489, "epoch": 0.8639110175860514, "flos": 25786929790080.0, "grad_norm": 1.9450057806077643, "language_loss": 0.76938939, "learning_rate": 1.91072865486821e-07, "loss": 0.79379445, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18078613, "step": 14369, "time_per_iteration": 2.900883436203003 }, { "auxiliary_loss_clip": 0.01403544, "auxiliary_loss_mlp": 0.0103379, "balance_loss_clip": 1.24090004, "balance_loss_mlp": 1.01515722, "epoch": 0.8639711408387194, "flos": 23380324128000.0, "grad_norm": 2.21470983329804, "language_loss": 0.64885759, "learning_rate": 1.9090677456122294e-07, "loss": 0.67323101, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18640137, "step": 14370, "time_per_iteration": 2.9256601333618164 }, { "auxiliary_loss_clip": 0.01401723, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.24236, "balance_loss_mlp": 1.01355696, "epoch": 0.8640312640913873, "flos": 22137336967680.0, "grad_norm": 1.5645991047199757, "language_loss": 0.66317391, "learning_rate": 1.907407522366209e-07, "loss": 0.68752229, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19567871, "step": 14371, "time_per_iteration": 2.9041545391082764 }, { "auxiliary_loss_clip": 0.01181821, "auxiliary_loss_mlp": 0.01033519, "balance_loss_clip": 1.09142971, "balance_loss_mlp": 1.00948656, "epoch": 0.8640913873440553, "flos": 57595418530560.0, "grad_norm": 0.8640116497098699, "language_loss": 0.57004941, "learning_rate": 1.905747985193107e-07, "loss": 0.59220278, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.24023438, "step": 14372, "time_per_iteration": 3.235107898712158 }, { "auxiliary_loss_clip": 0.01381683, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.22593367, "balance_loss_mlp": 1.01458597, "epoch": 0.8641515105967232, "flos": 23997881410560.0, "grad_norm": 1.707327895498149, "language_loss": 0.8025471, "learning_rate": 1.9040891341558597e-07, "loss": 0.82669497, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18518066, "step": 14373, "time_per_iteration": 2.9178075790405273 }, { "auxiliary_loss_clip": 0.01400496, "auxiliary_loss_mlp": 0.01030494, "balance_loss_clip": 1.24038315, "balance_loss_mlp": 1.01131344, "epoch": 0.8642116338493913, "flos": 19072154972160.0, "grad_norm": 1.7912564845921626, "language_loss": 0.64736068, "learning_rate": 1.9024309693173656e-07, "loss": 0.67167056, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19177246, "step": 14374, "time_per_iteration": 2.8689777851104736 }, { "auxiliary_loss_clip": 0.01393563, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.23794413, "balance_loss_mlp": 1.0165379, "epoch": 0.8642717571020592, "flos": 18261809020800.0, "grad_norm": 7.959977227411173, "language_loss": 0.7802937, "learning_rate": 1.9007734907404993e-07, "loss": 0.80457228, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.17749023, "step": 14375, "time_per_iteration": 2.8444693088531494 }, { "auxiliary_loss_clip": 0.01398342, "auxiliary_loss_mlp": 0.01035289, "balance_loss_clip": 1.23834991, "balance_loss_mlp": 1.01619172, "epoch": 0.8643318803547272, "flos": 57682325088000.0, "grad_norm": 1.7322662486247813, "language_loss": 0.61515141, "learning_rate": 1.899116698488117e-07, "loss": 0.63948768, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19091797, "step": 14376, "time_per_iteration": 3.2649261951446533 }, { "auxiliary_loss_clip": 0.01387658, "auxiliary_loss_mlp": 0.01026306, "balance_loss_clip": 1.22875834, "balance_loss_mlp": 1.00823414, "epoch": 0.8643920036073952, "flos": 19618940108160.0, "grad_norm": 1.4741071322288308, "language_loss": 0.67018843, "learning_rate": 1.8974605926230457e-07, "loss": 0.69432807, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1809082, "step": 14377, "time_per_iteration": 2.915039539337158 }, { "auxiliary_loss_clip": 0.01402713, "auxiliary_loss_mlp": 0.01034715, "balance_loss_clip": 1.24157476, "balance_loss_mlp": 1.01528442, "epoch": 0.8644521268600631, "flos": 20860162721280.0, "grad_norm": 1.5526342341251569, "language_loss": 0.71328473, "learning_rate": 1.8958051732080804e-07, "loss": 0.73765898, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19433594, "step": 14378, "time_per_iteration": 2.8625831604003906 }, { "auxiliary_loss_clip": 0.01178444, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 1.08971655, "balance_loss_mlp": 1.00452447, "epoch": 0.8645122501127311, "flos": 66752451912960.0, "grad_norm": 0.8019795247649073, "language_loss": 0.60355073, "learning_rate": 1.894150440305995e-07, "loss": 0.62559021, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.20996094, "step": 14379, "time_per_iteration": 3.3770532608032227 }, { "auxiliary_loss_clip": 0.01392652, "auxiliary_loss_mlp": 0.01033845, "balance_loss_clip": 1.23537827, "balance_loss_mlp": 1.01515269, "epoch": 0.864572373365399, "flos": 21700171278720.0, "grad_norm": 1.6923207498825894, "language_loss": 0.74888408, "learning_rate": 1.8924963939795478e-07, "loss": 0.77314901, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18688965, "step": 14380, "time_per_iteration": 2.914415121078491 }, { "auxiliary_loss_clip": 0.01431563, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.26603889, "balance_loss_mlp": 1.01366758, "epoch": 0.8646324966180671, "flos": 20276023363200.0, "grad_norm": 2.7929577454848706, "language_loss": 0.76381123, "learning_rate": 1.8908430342914473e-07, "loss": 0.78844976, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.1862793, "step": 14381, "time_per_iteration": 2.8508009910583496 }, { "auxiliary_loss_clip": 0.01388292, "auxiliary_loss_mlp": 0.01031135, "balance_loss_clip": 1.2307725, "balance_loss_mlp": 1.01300335, "epoch": 0.864692619870735, "flos": 11953270431360.0, "grad_norm": 2.35542324904602, "language_loss": 0.85640895, "learning_rate": 1.8891903613043892e-07, "loss": 0.88060319, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18139648, "step": 14382, "time_per_iteration": 2.7891809940338135 }, { "auxiliary_loss_clip": 0.01412146, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.25161242, "balance_loss_mlp": 1.01552439, "epoch": 0.864752743123403, "flos": 21480706160640.0, "grad_norm": 1.7703924167265435, "language_loss": 0.76567537, "learning_rate": 1.8875383750810504e-07, "loss": 0.79015315, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.2010498, "step": 14383, "time_per_iteration": 2.855681896209717 }, { "auxiliary_loss_clip": 0.01387903, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.23252463, "balance_loss_mlp": 1.0149914, "epoch": 0.8648128663760709, "flos": 19537716412800.0, "grad_norm": 1.768550400359185, "language_loss": 0.85747874, "learning_rate": 1.8858870756840738e-07, "loss": 0.88170159, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.19384766, "step": 14384, "time_per_iteration": 2.8080103397369385 }, { "auxiliary_loss_clip": 0.01390193, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.23206687, "balance_loss_mlp": 1.01488304, "epoch": 0.8648729896287389, "flos": 21297826103040.0, "grad_norm": 1.8145274736956454, "language_loss": 0.81217933, "learning_rate": 1.884236463176072e-07, "loss": 0.83642012, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19006348, "step": 14385, "time_per_iteration": 2.851769208908081 }, { "auxiliary_loss_clip": 0.01415657, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.25393796, "balance_loss_mlp": 1.0126133, "epoch": 0.8649331128814068, "flos": 24614262328320.0, "grad_norm": 2.1828517755027184, "language_loss": 0.7359556, "learning_rate": 1.8825865376196437e-07, "loss": 0.76043016, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19165039, "step": 14386, "time_per_iteration": 2.860720634460449 }, { "auxiliary_loss_clip": 0.01393689, "auxiliary_loss_mlp": 0.01035293, "balance_loss_clip": 1.23607087, "balance_loss_mlp": 1.0164938, "epoch": 0.8649932361340749, "flos": 15386293802880.0, "grad_norm": 2.324843588159943, "language_loss": 0.82365382, "learning_rate": 1.8809372990773476e-07, "loss": 0.84794366, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18811035, "step": 14387, "time_per_iteration": 4.263425588607788 }, { "auxiliary_loss_clip": 0.01390181, "auxiliary_loss_mlp": 0.01031133, "balance_loss_clip": 1.23327112, "balance_loss_mlp": 1.01239312, "epoch": 0.8650533593867428, "flos": 19910670451200.0, "grad_norm": 1.986227268850098, "language_loss": 0.70099217, "learning_rate": 1.8792887476117224e-07, "loss": 0.72520536, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.1875, "step": 14388, "time_per_iteration": 2.8305718898773193 }, { "auxiliary_loss_clip": 0.01387526, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.23365402, "balance_loss_mlp": 1.01578307, "epoch": 0.8651134826394108, "flos": 25637196188160.0, "grad_norm": 1.49137727930038, "language_loss": 0.90914035, "learning_rate": 1.877640883285283e-07, "loss": 0.93335426, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.18078613, "step": 14389, "time_per_iteration": 2.894045829772949 }, { "auxiliary_loss_clip": 0.01399767, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.24284232, "balance_loss_mlp": 1.01258731, "epoch": 0.8651736058920788, "flos": 18743884813440.0, "grad_norm": 1.7517939855764342, "language_loss": 0.71514195, "learning_rate": 1.8759937061605212e-07, "loss": 0.73944956, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18395996, "step": 14390, "time_per_iteration": 4.370844841003418 }, { "auxiliary_loss_clip": 0.01400786, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.24058676, "balance_loss_mlp": 1.01544487, "epoch": 0.8652337291447467, "flos": 20786404417920.0, "grad_norm": 1.5228622622994485, "language_loss": 0.82837296, "learning_rate": 1.8743472162998941e-07, "loss": 0.85272819, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19274902, "step": 14391, "time_per_iteration": 2.863205671310425 }, { "auxiliary_loss_clip": 0.01178884, "auxiliary_loss_mlp": 0.01020258, "balance_loss_clip": 1.08936572, "balance_loss_mlp": 0.99841911, "epoch": 0.8652938523974147, "flos": 64257383387520.0, "grad_norm": 0.8085762278377054, "language_loss": 0.68123573, "learning_rate": 1.8727014137658337e-07, "loss": 0.70322716, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.21875, "step": 14392, "time_per_iteration": 3.2456107139587402 }, { "auxiliary_loss_clip": 0.01414545, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.24968219, "balance_loss_mlp": 1.01435256, "epoch": 0.8653539756500827, "flos": 18049583070720.0, "grad_norm": 1.9893432824221735, "language_loss": 0.76697272, "learning_rate": 1.8710562986207523e-07, "loss": 0.79145223, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19042969, "step": 14393, "time_per_iteration": 2.8293118476867676 }, { "auxiliary_loss_clip": 0.01396065, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.23489523, "balance_loss_mlp": 1.01504874, "epoch": 0.8654140989027507, "flos": 17390871002880.0, "grad_norm": 1.9748748430717693, "language_loss": 0.74580634, "learning_rate": 1.8694118709270357e-07, "loss": 0.7701062, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1887207, "step": 14394, "time_per_iteration": 2.8415701389312744 }, { "auxiliary_loss_clip": 0.01407461, "auxiliary_loss_mlp": 0.01034211, "balance_loss_clip": 1.2466433, "balance_loss_mlp": 1.01461315, "epoch": 0.8654742221554186, "flos": 53302750358400.0, "grad_norm": 2.1401434188140502, "language_loss": 0.66349268, "learning_rate": 1.867768130747036e-07, "loss": 0.68790942, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19592285, "step": 14395, "time_per_iteration": 4.64944314956665 }, { "auxiliary_loss_clip": 0.01406016, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.24752069, "balance_loss_mlp": 1.01210976, "epoch": 0.8655343454080866, "flos": 23925208982400.0, "grad_norm": 1.7503631542903044, "language_loss": 0.69221914, "learning_rate": 1.8661250781430838e-07, "loss": 0.71658659, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18615723, "step": 14396, "time_per_iteration": 2.862457513809204 }, { "auxiliary_loss_clip": 0.0140608, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.24392056, "balance_loss_mlp": 1.01467705, "epoch": 0.8655944686607545, "flos": 24107772326400.0, "grad_norm": 2.150152491382438, "language_loss": 0.70560205, "learning_rate": 1.8644827131774954e-07, "loss": 0.73000538, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19580078, "step": 14397, "time_per_iteration": 4.218133449554443 }, { "auxiliary_loss_clip": 0.0139384, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.23419929, "balance_loss_mlp": 1.01430798, "epoch": 0.8656545919134225, "flos": 23123188074240.0, "grad_norm": 1.968344585016124, "language_loss": 0.6456027, "learning_rate": 1.86284103591253e-07, "loss": 0.66987675, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19250488, "step": 14398, "time_per_iteration": 2.9097113609313965 }, { "auxiliary_loss_clip": 0.01394995, "auxiliary_loss_mlp": 0.01031821, "balance_loss_clip": 1.23529828, "balance_loss_mlp": 1.01305711, "epoch": 0.8657147151660904, "flos": 21151531105920.0, "grad_norm": 2.0095503066737437, "language_loss": 0.76900971, "learning_rate": 1.8612000464104517e-07, "loss": 0.79327792, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18762207, "step": 14399, "time_per_iteration": 2.8482415676116943 }, { "auxiliary_loss_clip": 0.01387682, "auxiliary_loss_mlp": 0.01030348, "balance_loss_clip": 1.23133922, "balance_loss_mlp": 1.01146483, "epoch": 0.8657748384187585, "flos": 16298024647680.0, "grad_norm": 1.8954014935638521, "language_loss": 0.93681526, "learning_rate": 1.8595597447334855e-07, "loss": 0.96099555, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18884277, "step": 14400, "time_per_iteration": 2.838705539703369 }, { "auxiliary_loss_clip": 0.01399872, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.24082279, "balance_loss_mlp": 1.01416326, "epoch": 0.8658349616714264, "flos": 30855376886400.0, "grad_norm": 1.797135061581166, "language_loss": 0.6786207, "learning_rate": 1.8579201309438353e-07, "loss": 0.70295155, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19042969, "step": 14401, "time_per_iteration": 2.931502342224121 }, { "auxiliary_loss_clip": 0.01416414, "auxiliary_loss_mlp": 0.01035722, "balance_loss_clip": 1.25315464, "balance_loss_mlp": 1.01643395, "epoch": 0.8658950849240944, "flos": 18962128321920.0, "grad_norm": 2.314460958632759, "language_loss": 0.74977833, "learning_rate": 1.8562812051036714e-07, "loss": 0.77429968, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19287109, "step": 14402, "time_per_iteration": 2.8819520473480225 }, { "auxiliary_loss_clip": 0.01386096, "auxiliary_loss_mlp": 0.01030282, "balance_loss_clip": 1.2304213, "balance_loss_mlp": 1.01185226, "epoch": 0.8659552081767624, "flos": 23373899366400.0, "grad_norm": 1.6544455849696553, "language_loss": 0.75542474, "learning_rate": 1.8546429672751397e-07, "loss": 0.77958852, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18432617, "step": 14403, "time_per_iteration": 2.841637134552002 }, { "auxiliary_loss_clip": 0.01415572, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.25438535, "balance_loss_mlp": 1.01760483, "epoch": 0.8660153314294303, "flos": 23852536554240.0, "grad_norm": 2.0954078133765415, "language_loss": 0.73843741, "learning_rate": 1.853005417520368e-07, "loss": 0.76296711, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19787598, "step": 14404, "time_per_iteration": 2.8844516277313232 }, { "auxiliary_loss_clip": 0.01386505, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.23017418, "balance_loss_mlp": 1.01337135, "epoch": 0.8660754546820983, "flos": 23122871360640.0, "grad_norm": 1.87285827112881, "language_loss": 0.7186361, "learning_rate": 1.851368555901447e-07, "loss": 0.74283713, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.20239258, "step": 14405, "time_per_iteration": 2.8448288440704346 }, { "auxiliary_loss_clip": 0.01408242, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.24521661, "balance_loss_mlp": 1.01377296, "epoch": 0.8661355779347663, "flos": 14400035493120.0, "grad_norm": 1.9237517988577033, "language_loss": 0.67279804, "learning_rate": 1.8497323824804467e-07, "loss": 0.69721049, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19226074, "step": 14406, "time_per_iteration": 2.8092682361602783 }, { "auxiliary_loss_clip": 0.01401148, "auxiliary_loss_mlp": 0.0103141, "balance_loss_clip": 1.24189794, "balance_loss_mlp": 1.01319456, "epoch": 0.8661957011874343, "flos": 21879748465920.0, "grad_norm": 1.6718396764205201, "language_loss": 0.83796453, "learning_rate": 1.8480968973194177e-07, "loss": 0.86229014, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18237305, "step": 14407, "time_per_iteration": 2.8849070072174072 }, { "auxiliary_loss_clip": 0.01396653, "auxiliary_loss_mlp": 0.0103456, "balance_loss_clip": 1.2384727, "balance_loss_mlp": 1.01505733, "epoch": 0.8662558244401022, "flos": 21845154176640.0, "grad_norm": 1.7367859356854936, "language_loss": 0.70409256, "learning_rate": 1.8464621004803748e-07, "loss": 0.7284047, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19506836, "step": 14408, "time_per_iteration": 2.8573081493377686 }, { "auxiliary_loss_clip": 0.01382281, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.22862482, "balance_loss_mlp": 1.01206613, "epoch": 0.8663159476927702, "flos": 17392997508480.0, "grad_norm": 1.9810934208554565, "language_loss": 0.78277981, "learning_rate": 1.844827992025304e-07, "loss": 0.80689573, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.17272949, "step": 14409, "time_per_iteration": 2.912222385406494 }, { "auxiliary_loss_clip": 0.01415008, "auxiliary_loss_mlp": 0.01033261, "balance_loss_clip": 1.25249982, "balance_loss_mlp": 1.01384163, "epoch": 0.8663760709454381, "flos": 22758061386240.0, "grad_norm": 1.9015993737721228, "language_loss": 0.77620447, "learning_rate": 1.8431945720161757e-07, "loss": 0.80068719, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19433594, "step": 14410, "time_per_iteration": 2.843724012374878 }, { "auxiliary_loss_clip": 0.01400142, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.24024129, "balance_loss_mlp": 1.0145936, "epoch": 0.8664361941981061, "flos": 17383858058880.0, "grad_norm": 2.6900145735173715, "language_loss": 0.78060615, "learning_rate": 1.8415618405149315e-07, "loss": 0.80494237, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18884277, "step": 14411, "time_per_iteration": 2.8351473808288574 }, { "auxiliary_loss_clip": 0.01388642, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.23035812, "balance_loss_mlp": 1.01500201, "epoch": 0.866496317450774, "flos": 16043060344320.0, "grad_norm": 1.95070639956755, "language_loss": 0.74536204, "learning_rate": 1.8399297975834794e-07, "loss": 0.76958025, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18188477, "step": 14412, "time_per_iteration": 2.8567521572113037 }, { "auxiliary_loss_clip": 0.01397502, "auxiliary_loss_mlp": 0.01030016, "balance_loss_clip": 1.24059081, "balance_loss_mlp": 1.01219416, "epoch": 0.8665564407034421, "flos": 20824889760000.0, "grad_norm": 1.7054089966208763, "language_loss": 0.70375097, "learning_rate": 1.83829844328371e-07, "loss": 0.72802615, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17834473, "step": 14413, "time_per_iteration": 2.859772205352783 }, { "auxiliary_loss_clip": 0.01404978, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.24474514, "balance_loss_mlp": 1.01259005, "epoch": 0.86661656395611, "flos": 15823640471040.0, "grad_norm": 2.390497911409765, "language_loss": 0.63685083, "learning_rate": 1.8366677776774874e-07, "loss": 0.6612193, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19274902, "step": 14414, "time_per_iteration": 2.8281304836273193 }, { "auxiliary_loss_clip": 0.01395806, "auxiliary_loss_mlp": 0.01031435, "balance_loss_clip": 1.23698163, "balance_loss_mlp": 1.0127188, "epoch": 0.866676687208778, "flos": 23046624593280.0, "grad_norm": 1.7212569201313803, "language_loss": 0.64219999, "learning_rate": 1.8350378008266377e-07, "loss": 0.66647243, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18713379, "step": 14415, "time_per_iteration": 2.856370687484741 }, { "auxiliary_loss_clip": 0.01181908, "auxiliary_loss_mlp": 0.01022455, "balance_loss_clip": 1.09084606, "balance_loss_mlp": 0.99909037, "epoch": 0.866736810461446, "flos": 63834831768960.0, "grad_norm": 0.7976802636238252, "language_loss": 0.60462558, "learning_rate": 1.8334085127929754e-07, "loss": 0.62666929, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 0.91015625, "router_z_loss_mlp": 0.23339844, "step": 14416, "time_per_iteration": 3.3933756351470947 }, { "auxiliary_loss_clip": 0.01407553, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.24423993, "balance_loss_mlp": 1.01453507, "epoch": 0.8667969337141139, "flos": 20459174889600.0, "grad_norm": 1.9128185503044737, "language_loss": 0.75799274, "learning_rate": 1.831779913638285e-07, "loss": 0.78241414, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20068359, "step": 14417, "time_per_iteration": 2.8508450984954834 }, { "auxiliary_loss_clip": 0.01395083, "auxiliary_loss_mlp": 0.01034836, "balance_loss_clip": 1.23581266, "balance_loss_mlp": 1.0159297, "epoch": 0.866857056966782, "flos": 21663902931840.0, "grad_norm": 1.4743111951565562, "language_loss": 0.75161713, "learning_rate": 1.830152003424319e-07, "loss": 0.77591634, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18896484, "step": 14418, "time_per_iteration": 2.8395800590515137 }, { "auxiliary_loss_clip": 0.01385657, "auxiliary_loss_mlp": 0.01034736, "balance_loss_clip": 1.22782433, "balance_loss_mlp": 1.0163542, "epoch": 0.8669171802194499, "flos": 22862070478080.0, "grad_norm": 1.4886076272645223, "language_loss": 0.68316185, "learning_rate": 1.8285247822128126e-07, "loss": 0.70736575, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18395996, "step": 14419, "time_per_iteration": 2.9261937141418457 }, { "auxiliary_loss_clip": 0.01406893, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.24580073, "balance_loss_mlp": 1.01050234, "epoch": 0.8669773034721179, "flos": 18743432365440.0, "grad_norm": 2.1948712456173234, "language_loss": 0.79663378, "learning_rate": 1.826898250065465e-07, "loss": 0.82099307, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1854248, "step": 14420, "time_per_iteration": 2.8792357444763184 }, { "auxiliary_loss_clip": 0.0140836, "auxiliary_loss_mlp": 0.01034035, "balance_loss_clip": 1.24904871, "balance_loss_mlp": 1.01512814, "epoch": 0.8670374267247858, "flos": 18925181303040.0, "grad_norm": 1.4585353328520247, "language_loss": 0.83790982, "learning_rate": 1.8252724070439586e-07, "loss": 0.86233383, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18908691, "step": 14421, "time_per_iteration": 2.850329637527466 }, { "auxiliary_loss_clip": 0.01180499, "auxiliary_loss_mlp": 0.01033025, "balance_loss_clip": 1.09021389, "balance_loss_mlp": 1.01147175, "epoch": 0.8670975499774538, "flos": 48845245524480.0, "grad_norm": 0.7064659136912858, "language_loss": 0.4923172, "learning_rate": 1.823647253209941e-07, "loss": 0.51445246, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 0.90234375, "router_z_loss_mlp": 0.21582031, "step": 14422, "time_per_iteration": 4.720146179199219 }, { "auxiliary_loss_clip": 0.01390493, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.23179603, "balance_loss_mlp": 1.01505446, "epoch": 0.8671576732301217, "flos": 26146129409280.0, "grad_norm": 2.1534619669700534, "language_loss": 0.74442464, "learning_rate": 1.8220227886250417e-07, "loss": 0.76866907, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18908691, "step": 14423, "time_per_iteration": 2.89670991897583 }, { "auxiliary_loss_clip": 0.01377287, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.22352839, "balance_loss_mlp": 1.01128483, "epoch": 0.8672177964827897, "flos": 18376133927040.0, "grad_norm": 2.1172135901875637, "language_loss": 0.76909268, "learning_rate": 1.8203990133508684e-07, "loss": 0.79316705, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18847656, "step": 14424, "time_per_iteration": 2.8538525104522705 }, { "auxiliary_loss_clip": 0.01384577, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 1.23085487, "balance_loss_mlp": 1.01090574, "epoch": 0.8672779197354576, "flos": 28556264165760.0, "grad_norm": 1.7940730205544564, "language_loss": 0.72162575, "learning_rate": 1.8187759274489767e-07, "loss": 0.74576223, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.1817627, "step": 14425, "time_per_iteration": 4.299913644790649 }, { "auxiliary_loss_clip": 0.01411077, "auxiliary_loss_mlp": 0.01032042, "balance_loss_clip": 1.24760675, "balance_loss_mlp": 1.01239681, "epoch": 0.8673380429881257, "flos": 22392617984640.0, "grad_norm": 1.7042950753624233, "language_loss": 0.69048333, "learning_rate": 1.817153530980926e-07, "loss": 0.71491444, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19641113, "step": 14426, "time_per_iteration": 2.8697080612182617 }, { "auxiliary_loss_clip": 0.01400717, "auxiliary_loss_mlp": 0.01029616, "balance_loss_clip": 1.23855793, "balance_loss_mlp": 1.01115012, "epoch": 0.8673981662407936, "flos": 21006321984000.0, "grad_norm": 1.850816031490293, "language_loss": 0.7160123, "learning_rate": 1.815531824008234e-07, "loss": 0.74031556, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18469238, "step": 14427, "time_per_iteration": 2.918419361114502 }, { "auxiliary_loss_clip": 0.01398066, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.24016857, "balance_loss_mlp": 1.01226544, "epoch": 0.8674582894934616, "flos": 24437761787520.0, "grad_norm": 3.590567722558237, "language_loss": 0.68896997, "learning_rate": 1.8139108065924004e-07, "loss": 0.71326804, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19470215, "step": 14428, "time_per_iteration": 2.9238839149475098 }, { "auxiliary_loss_clip": 0.01392009, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.23377228, "balance_loss_mlp": 1.01354504, "epoch": 0.8675184127461296, "flos": 20746425997440.0, "grad_norm": 1.8993200281884253, "language_loss": 0.71833885, "learning_rate": 1.812290478794889e-07, "loss": 0.74257886, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18444824, "step": 14429, "time_per_iteration": 2.82319712638855 }, { "auxiliary_loss_clip": 0.01388565, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.23071492, "balance_loss_mlp": 1.01360464, "epoch": 0.8675785359987975, "flos": 19145053624320.0, "grad_norm": 2.1073944452532314, "language_loss": 0.67976874, "learning_rate": 1.810670840677151e-07, "loss": 0.70398188, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19165039, "step": 14430, "time_per_iteration": 2.84468674659729 }, { "auxiliary_loss_clip": 0.01394984, "auxiliary_loss_mlp": 0.01036627, "balance_loss_clip": 1.23428392, "balance_loss_mlp": 1.01700497, "epoch": 0.8676386592514655, "flos": 22720842898560.0, "grad_norm": 1.8365852097187898, "language_loss": 0.69308484, "learning_rate": 1.8090518923005948e-07, "loss": 0.71740091, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19628906, "step": 14431, "time_per_iteration": 5.680430173873901 }, { "auxiliary_loss_clip": 0.01405866, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.24579763, "balance_loss_mlp": 1.01777375, "epoch": 0.8676987825041335, "flos": 14217924597120.0, "grad_norm": 2.031742437316324, "language_loss": 0.64877111, "learning_rate": 1.8074336337266116e-07, "loss": 0.67319542, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18798828, "step": 14432, "time_per_iteration": 2.796380043029785 }, { "auxiliary_loss_clip": 0.0140398, "auxiliary_loss_mlp": 0.01032768, "balance_loss_clip": 1.24498439, "balance_loss_mlp": 1.01491034, "epoch": 0.8677589057568015, "flos": 13597924095360.0, "grad_norm": 2.1360658810045576, "language_loss": 0.79724222, "learning_rate": 1.8058160650165656e-07, "loss": 0.82160974, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.17871094, "step": 14433, "time_per_iteration": 2.807097911834717 }, { "auxiliary_loss_clip": 0.01184901, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.09439826, "balance_loss_mlp": 1.00481915, "epoch": 0.8678190290094694, "flos": 68964142400640.0, "grad_norm": 0.7073135467266112, "language_loss": 0.58531809, "learning_rate": 1.804199186231805e-07, "loss": 0.60746324, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.24707031, "step": 14434, "time_per_iteration": 3.444748640060425 }, { "auxiliary_loss_clip": 0.0138382, "auxiliary_loss_mlp": 0.01036, "balance_loss_clip": 1.22765136, "balance_loss_mlp": 1.01708174, "epoch": 0.8678791522621374, "flos": 32569400108160.0, "grad_norm": 1.6699221356110487, "language_loss": 0.80723429, "learning_rate": 1.802582997433628e-07, "loss": 0.83143246, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18896484, "step": 14435, "time_per_iteration": 2.964468240737915 }, { "auxiliary_loss_clip": 0.01405935, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.24306798, "balance_loss_mlp": 1.01520538, "epoch": 0.8679392755148053, "flos": 35055962611200.0, "grad_norm": 3.2097445115886054, "language_loss": 0.63326466, "learning_rate": 1.8009674986833322e-07, "loss": 0.65766466, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18847656, "step": 14436, "time_per_iteration": 3.036362648010254 }, { "auxiliary_loss_clip": 0.01398623, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.23854136, "balance_loss_mlp": 1.01283145, "epoch": 0.8679993987674733, "flos": 18561909651840.0, "grad_norm": 2.2877030935637626, "language_loss": 0.70669997, "learning_rate": 1.7993526900421706e-07, "loss": 0.73100632, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19165039, "step": 14437, "time_per_iteration": 2.8597140312194824 }, { "auxiliary_loss_clip": 0.01398194, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.23941255, "balance_loss_mlp": 1.01055872, "epoch": 0.8680595220201412, "flos": 27465951519360.0, "grad_norm": 1.9714316225315272, "language_loss": 0.80594003, "learning_rate": 1.797738571571381e-07, "loss": 0.83021486, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18737793, "step": 14438, "time_per_iteration": 2.940873146057129 }, { "auxiliary_loss_clip": 0.01389381, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.23266065, "balance_loss_mlp": 1.01385188, "epoch": 0.8681196452728093, "flos": 19218268990080.0, "grad_norm": 1.8888794704080136, "language_loss": 0.68116248, "learning_rate": 1.7961251433321656e-07, "loss": 0.70539039, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19580078, "step": 14439, "time_per_iteration": 2.9713308811187744 }, { "auxiliary_loss_clip": 0.01394719, "auxiliary_loss_mlp": 0.01036452, "balance_loss_clip": 1.2358067, "balance_loss_mlp": 1.01774812, "epoch": 0.8681797685254772, "flos": 37574495205120.0, "grad_norm": 1.9303618816215, "language_loss": 0.64751339, "learning_rate": 1.7945124053857085e-07, "loss": 0.67182505, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18701172, "step": 14440, "time_per_iteration": 2.987187147140503 }, { "auxiliary_loss_clip": 0.01387421, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.23181403, "balance_loss_mlp": 1.01377869, "epoch": 0.8682398917781452, "flos": 23299100432640.0, "grad_norm": 1.854477245241771, "language_loss": 0.65904927, "learning_rate": 1.7929003577931722e-07, "loss": 0.68324834, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18713379, "step": 14441, "time_per_iteration": 2.8658487796783447 }, { "auxiliary_loss_clip": 0.01386658, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.2310667, "balance_loss_mlp": 1.01290774, "epoch": 0.8683000150308132, "flos": 21883096581120.0, "grad_norm": 1.7033733713127164, "language_loss": 0.66539323, "learning_rate": 1.7912890006156722e-07, "loss": 0.68957794, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18908691, "step": 14442, "time_per_iteration": 2.8447670936584473 }, { "auxiliary_loss_clip": 0.014125, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.24891376, "balance_loss_mlp": 1.01196599, "epoch": 0.8683601382834811, "flos": 14655271265280.0, "grad_norm": 1.7510821687258555, "language_loss": 0.72968501, "learning_rate": 1.7896783339143195e-07, "loss": 0.75412667, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19714355, "step": 14443, "time_per_iteration": 2.8528592586517334 }, { "auxiliary_loss_clip": 0.01401799, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.24087954, "balance_loss_mlp": 1.01251376, "epoch": 0.8684202615361492, "flos": 26371204882560.0, "grad_norm": 1.8396493179734938, "language_loss": 0.8379482, "learning_rate": 1.7880683577501877e-07, "loss": 0.86228907, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19775391, "step": 14444, "time_per_iteration": 2.8710856437683105 }, { "auxiliary_loss_clip": 0.0138848, "auxiliary_loss_mlp": 0.0103379, "balance_loss_clip": 1.2295481, "balance_loss_mlp": 1.0143348, "epoch": 0.8684803847888171, "flos": 20713686744960.0, "grad_norm": 4.080260037468577, "language_loss": 0.78015745, "learning_rate": 1.7864590721843342e-07, "loss": 0.80438006, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19458008, "step": 14445, "time_per_iteration": 2.883321523666382 }, { "auxiliary_loss_clip": 0.01401437, "auxiliary_loss_mlp": 0.01035957, "balance_loss_clip": 1.24250054, "balance_loss_mlp": 1.01583409, "epoch": 0.8685405080414851, "flos": 22648396694400.0, "grad_norm": 2.266008523187761, "language_loss": 0.68250954, "learning_rate": 1.7848504772777728e-07, "loss": 0.70688349, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20117188, "step": 14446, "time_per_iteration": 2.842822313308716 }, { "auxiliary_loss_clip": 0.01393668, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.23594213, "balance_loss_mlp": 1.01181865, "epoch": 0.868600631294153, "flos": 24831555696000.0, "grad_norm": 1.6489880375243862, "language_loss": 0.83118415, "learning_rate": 1.7832425730915102e-07, "loss": 0.85543603, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19702148, "step": 14447, "time_per_iteration": 2.8304293155670166 }, { "auxiliary_loss_clip": 0.01393203, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.23480952, "balance_loss_mlp": 1.01201999, "epoch": 0.868660754546821, "flos": 25123557507840.0, "grad_norm": 1.72116615292025, "language_loss": 0.74474394, "learning_rate": 1.781635359686515e-07, "loss": 0.76898265, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18640137, "step": 14448, "time_per_iteration": 2.8736226558685303 }, { "auxiliary_loss_clip": 0.01398781, "auxiliary_loss_mlp": 0.0103348, "balance_loss_clip": 1.23919547, "balance_loss_mlp": 1.01410818, "epoch": 0.8687208777994889, "flos": 12685605068160.0, "grad_norm": 4.725529767618878, "language_loss": 0.80926627, "learning_rate": 1.7800288371237303e-07, "loss": 0.8335889, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19360352, "step": 14449, "time_per_iteration": 2.7929675579071045 }, { "auxiliary_loss_clip": 0.0118306, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.09317994, "balance_loss_mlp": 0.99896908, "epoch": 0.8687810010521569, "flos": 65647841909760.0, "grad_norm": 0.8054968302385487, "language_loss": 0.60713381, "learning_rate": 1.7784230054640758e-07, "loss": 0.62918103, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.2265625, "step": 14450, "time_per_iteration": 3.25038743019104 }, { "auxiliary_loss_clip": 0.01405653, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.24496269, "balance_loss_mlp": 1.01049256, "epoch": 0.8688411243048249, "flos": 24254836485120.0, "grad_norm": 1.6861252503758863, "language_loss": 0.77151489, "learning_rate": 1.7768178647684517e-07, "loss": 0.79585695, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18054199, "step": 14451, "time_per_iteration": 2.8633556365966797 }, { "auxiliary_loss_clip": 0.01399083, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.24057388, "balance_loss_mlp": 1.01368713, "epoch": 0.8689012475574929, "flos": 18230562846720.0, "grad_norm": 2.6694233118463098, "language_loss": 0.72438502, "learning_rate": 1.7752134150977205e-07, "loss": 0.74869585, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18310547, "step": 14452, "time_per_iteration": 2.8013010025024414 }, { "auxiliary_loss_clip": 0.01406068, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.24467766, "balance_loss_mlp": 1.01204169, "epoch": 0.8689613708101608, "flos": 19656339575040.0, "grad_norm": 1.4343282241206077, "language_loss": 0.72962618, "learning_rate": 1.7736096565127201e-07, "loss": 0.75400281, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19543457, "step": 14453, "time_per_iteration": 2.873689651489258 }, { "auxiliary_loss_clip": 0.01395066, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.2389946, "balance_loss_mlp": 1.0174222, "epoch": 0.8690214940628288, "flos": 11736158042880.0, "grad_norm": 1.986112319122435, "language_loss": 0.74327773, "learning_rate": 1.7720065890742664e-07, "loss": 0.76759231, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.1895752, "step": 14454, "time_per_iteration": 2.853628396987915 }, { "auxiliary_loss_clip": 0.01406376, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.24797738, "balance_loss_mlp": 1.013515, "epoch": 0.8690816173154968, "flos": 34950596175360.0, "grad_norm": 2.503322793672512, "language_loss": 0.60001934, "learning_rate": 1.7704042128431552e-07, "loss": 0.62440741, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18920898, "step": 14455, "time_per_iteration": 2.9873037338256836 }, { "auxiliary_loss_clip": 0.01396965, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.2367444, "balance_loss_mlp": 1.01254439, "epoch": 0.8691417405681647, "flos": 11621018730240.0, "grad_norm": 2.1090960629262394, "language_loss": 0.81179339, "learning_rate": 1.7688025278801378e-07, "loss": 0.83609056, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20202637, "step": 14456, "time_per_iteration": 2.8134841918945312 }, { "auxiliary_loss_clip": 0.01407718, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.24503672, "balance_loss_mlp": 1.01619649, "epoch": 0.8692018638208328, "flos": 24618741563520.0, "grad_norm": 2.3690728456987875, "language_loss": 0.75566232, "learning_rate": 1.7672015342459568e-07, "loss": 0.78010154, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.20007324, "step": 14457, "time_per_iteration": 2.8665478229522705 }, { "auxiliary_loss_clip": 0.01395114, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.23811436, "balance_loss_mlp": 1.01236439, "epoch": 0.8692619870735007, "flos": 26006349663360.0, "grad_norm": 1.5228700779669977, "language_loss": 0.79031479, "learning_rate": 1.765601232001328e-07, "loss": 0.81457633, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18676758, "step": 14458, "time_per_iteration": 4.248074531555176 }, { "auxiliary_loss_clip": 0.0139958, "auxiliary_loss_mlp": 0.01036135, "balance_loss_clip": 1.24099803, "balance_loss_mlp": 1.0158937, "epoch": 0.8693221103261687, "flos": 18051121393920.0, "grad_norm": 1.5622180623771789, "language_loss": 0.71875501, "learning_rate": 1.7640016212069187e-07, "loss": 0.74311221, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.20251465, "step": 14459, "time_per_iteration": 2.8200881481170654 }, { "auxiliary_loss_clip": 0.01374506, "auxiliary_loss_mlp": 0.010291, "balance_loss_clip": 1.2212311, "balance_loss_mlp": 1.01114655, "epoch": 0.8693822335788366, "flos": 27504301127040.0, "grad_norm": 1.3105475782301086, "language_loss": 0.74421406, "learning_rate": 1.762402701923398e-07, "loss": 0.76825017, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.17956543, "step": 14460, "time_per_iteration": 2.8865227699279785 }, { "auxiliary_loss_clip": 0.0140285, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.23936594, "balance_loss_mlp": 1.01373529, "epoch": 0.8694423568315046, "flos": 24108405753600.0, "grad_norm": 2.2275915394936994, "language_loss": 0.65699315, "learning_rate": 1.7608044742113947e-07, "loss": 0.68134469, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18579102, "step": 14461, "time_per_iteration": 4.285569190979004 }, { "auxiliary_loss_clip": 0.01395934, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.23623323, "balance_loss_mlp": 1.01726353, "epoch": 0.8695024800841725, "flos": 18369437696640.0, "grad_norm": 2.2718477663883103, "language_loss": 0.8335799, "learning_rate": 1.7592069381315123e-07, "loss": 0.85791314, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.20129395, "step": 14462, "time_per_iteration": 2.8672564029693604 }, { "auxiliary_loss_clip": 0.014054, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.24488997, "balance_loss_mlp": 1.01417089, "epoch": 0.8695626033368405, "flos": 14035858945920.0, "grad_norm": 2.0055405937059323, "language_loss": 0.6574741, "learning_rate": 1.757610093744335e-07, "loss": 0.68186986, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.20007324, "step": 14463, "time_per_iteration": 2.8214285373687744 }, { "auxiliary_loss_clip": 0.01410964, "auxiliary_loss_mlp": 0.01037462, "balance_loss_clip": 1.2474122, "balance_loss_mlp": 1.01646948, "epoch": 0.8696227265895085, "flos": 16845624190080.0, "grad_norm": 1.7681321598393898, "language_loss": 0.67191786, "learning_rate": 1.7560139411104058e-07, "loss": 0.69640213, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20983887, "step": 14464, "time_per_iteration": 2.8059613704681396 }, { "auxiliary_loss_clip": 0.01410058, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.2459867, "balance_loss_mlp": 1.01356912, "epoch": 0.8696828498421765, "flos": 21809157298560.0, "grad_norm": 2.513591311232102, "language_loss": 0.63837385, "learning_rate": 1.7544184802902607e-07, "loss": 0.66280597, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19592285, "step": 14465, "time_per_iteration": 2.873141288757324 }, { "auxiliary_loss_clip": 0.01379699, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.22531772, "balance_loss_mlp": 1.0156424, "epoch": 0.8697429730948444, "flos": 22905487503360.0, "grad_norm": 1.7539122087683716, "language_loss": 0.84882736, "learning_rate": 1.7528237113443934e-07, "loss": 0.87296581, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18518066, "step": 14466, "time_per_iteration": 4.3376171588897705 }, { "auxiliary_loss_clip": 0.01412079, "auxiliary_loss_mlp": 0.01035659, "balance_loss_clip": 1.24822211, "balance_loss_mlp": 1.01589406, "epoch": 0.8698030963475124, "flos": 24728180031360.0, "grad_norm": 2.6200720319862016, "language_loss": 0.62446034, "learning_rate": 1.7512296343332779e-07, "loss": 0.6489377, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19763184, "step": 14467, "time_per_iteration": 2.8598620891571045 }, { "auxiliary_loss_clip": 0.01374683, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.22033608, "balance_loss_mlp": 1.01568592, "epoch": 0.8698632196001803, "flos": 28454743537920.0, "grad_norm": 1.4039560967162013, "language_loss": 0.69472814, "learning_rate": 1.7496362493173655e-07, "loss": 0.7188099, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.17810059, "step": 14468, "time_per_iteration": 2.9781503677368164 }, { "auxiliary_loss_clip": 0.01389726, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.23240709, "balance_loss_mlp": 1.0168258, "epoch": 0.8699233428528483, "flos": 27647971925760.0, "grad_norm": 1.7153526335948723, "language_loss": 0.71701348, "learning_rate": 1.7480435563570773e-07, "loss": 0.74128199, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.20275879, "step": 14469, "time_per_iteration": 2.9022605419158936 }, { "auxiliary_loss_clip": 0.01378816, "auxiliary_loss_mlp": 0.01030526, "balance_loss_clip": 1.22534037, "balance_loss_mlp": 1.01247764, "epoch": 0.8699834661055164, "flos": 20054748453120.0, "grad_norm": 2.090341990675311, "language_loss": 0.84679079, "learning_rate": 1.7464515555128024e-07, "loss": 0.87088418, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.18054199, "step": 14470, "time_per_iteration": 2.792091131210327 }, { "auxiliary_loss_clip": 0.01388534, "auxiliary_loss_mlp": 0.0103499, "balance_loss_clip": 1.23079896, "balance_loss_mlp": 1.01573753, "epoch": 0.8700435893581843, "flos": 23743369555200.0, "grad_norm": 1.693350516268456, "language_loss": 0.73578066, "learning_rate": 1.7448602468449148e-07, "loss": 0.76001585, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19250488, "step": 14471, "time_per_iteration": 2.919616460800171 }, { "auxiliary_loss_clip": 0.01393918, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.23686886, "balance_loss_mlp": 1.01166582, "epoch": 0.8701037126108523, "flos": 23558453481600.0, "grad_norm": 1.3290236387304863, "language_loss": 0.79605037, "learning_rate": 1.7432696304137573e-07, "loss": 0.82028908, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.1829834, "step": 14472, "time_per_iteration": 2.8884940147399902 }, { "auxiliary_loss_clip": 0.01388129, "auxiliary_loss_mlp": 0.01031342, "balance_loss_clip": 1.22993183, "balance_loss_mlp": 1.01174414, "epoch": 0.8701638358635202, "flos": 18852689854080.0, "grad_norm": 1.929711190664412, "language_loss": 0.73701346, "learning_rate": 1.741679706279644e-07, "loss": 0.76120818, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19592285, "step": 14473, "time_per_iteration": 2.849877119064331 }, { "auxiliary_loss_clip": 0.0139945, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.2385006, "balance_loss_mlp": 1.01391745, "epoch": 0.8702239591161882, "flos": 27939928492800.0, "grad_norm": 1.61462971675426, "language_loss": 0.7284565, "learning_rate": 1.7400904745028644e-07, "loss": 0.75278127, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19104004, "step": 14474, "time_per_iteration": 2.8686022758483887 }, { "auxiliary_loss_clip": 0.01394067, "auxiliary_loss_mlp": 0.01030955, "balance_loss_clip": 1.23431873, "balance_loss_mlp": 1.01186943, "epoch": 0.8702840823688561, "flos": 17241770828160.0, "grad_norm": 1.7815105904447601, "language_loss": 0.68201566, "learning_rate": 1.7385019351436925e-07, "loss": 0.70626587, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1907959, "step": 14475, "time_per_iteration": 2.7979276180267334 }, { "auxiliary_loss_clip": 0.01394235, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.23341429, "balance_loss_mlp": 1.01312232, "epoch": 0.8703442056215241, "flos": 19437055436160.0, "grad_norm": 1.5042676646798343, "language_loss": 0.77594185, "learning_rate": 1.736914088262349e-07, "loss": 0.80020344, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18811035, "step": 14476, "time_per_iteration": 2.808225393295288 }, { "auxiliary_loss_clip": 0.01395767, "auxiliary_loss_mlp": 0.01029134, "balance_loss_clip": 1.23862243, "balance_loss_mlp": 1.01035881, "epoch": 0.8704043288741921, "flos": 22284174902400.0, "grad_norm": 1.7894323974053035, "language_loss": 0.72878724, "learning_rate": 1.7353269339190525e-07, "loss": 0.75303626, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18786621, "step": 14477, "time_per_iteration": 2.870584011077881 }, { "auxiliary_loss_clip": 0.0138757, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.22870731, "balance_loss_mlp": 1.01311159, "epoch": 0.8704644521268601, "flos": 16655957412480.0, "grad_norm": 2.2258998188408827, "language_loss": 0.60105139, "learning_rate": 1.7337404721739946e-07, "loss": 0.62524676, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18847656, "step": 14478, "time_per_iteration": 2.8058855533599854 }, { "auxiliary_loss_clip": 0.0139429, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.23786926, "balance_loss_mlp": 1.0109005, "epoch": 0.870524575379528, "flos": 24290969097600.0, "grad_norm": 1.4895008568085413, "language_loss": 0.72434747, "learning_rate": 1.732154703087323e-07, "loss": 0.74857795, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17871094, "step": 14479, "time_per_iteration": 2.8648648262023926 }, { "auxiliary_loss_clip": 0.01392962, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.23393726, "balance_loss_mlp": 1.01372492, "epoch": 0.870584698632196, "flos": 28780796701440.0, "grad_norm": 1.4481402979370852, "language_loss": 0.71887904, "learning_rate": 1.7305696267191805e-07, "loss": 0.743146, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20007324, "step": 14480, "time_per_iteration": 2.9199917316436768 }, { "auxiliary_loss_clip": 0.01409102, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.24659133, "balance_loss_mlp": 1.01392734, "epoch": 0.8706448218848639, "flos": 32461635697920.0, "grad_norm": 1.675170825715028, "language_loss": 0.70224488, "learning_rate": 1.728985243129666e-07, "loss": 0.72666568, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19055176, "step": 14481, "time_per_iteration": 2.9263155460357666 }, { "auxiliary_loss_clip": 0.01388111, "auxiliary_loss_mlp": 0.01030539, "balance_loss_clip": 1.23043883, "balance_loss_mlp": 1.01227593, "epoch": 0.8707049451375319, "flos": 22758423344640.0, "grad_norm": 1.6613590737162072, "language_loss": 0.77758181, "learning_rate": 1.7274015523788643e-07, "loss": 0.8017683, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18261719, "step": 14482, "time_per_iteration": 2.8454833030700684 }, { "auxiliary_loss_clip": 0.01393786, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.23572993, "balance_loss_mlp": 1.01577163, "epoch": 0.8707650683902, "flos": 15860542245120.0, "grad_norm": 1.6826209426664924, "language_loss": 0.77559191, "learning_rate": 1.7258185545268234e-07, "loss": 0.79987884, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19128418, "step": 14483, "time_per_iteration": 2.847640037536621 }, { "auxiliary_loss_clip": 0.01418703, "auxiliary_loss_mlp": 0.01038962, "balance_loss_clip": 1.25488508, "balance_loss_mlp": 1.01895905, "epoch": 0.8708251916428679, "flos": 16475792042880.0, "grad_norm": 2.249064849524526, "language_loss": 0.62889218, "learning_rate": 1.7242362496335749e-07, "loss": 0.65346885, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20007324, "step": 14484, "time_per_iteration": 2.829704523086548 }, { "auxiliary_loss_clip": 0.01396821, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.23850346, "balance_loss_mlp": 1.01221478, "epoch": 0.8708853148955359, "flos": 15386339047680.0, "grad_norm": 1.861346753143034, "language_loss": 0.68876284, "learning_rate": 1.7226546377591222e-07, "loss": 0.71304572, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19262695, "step": 14485, "time_per_iteration": 2.8546547889709473 }, { "auxiliary_loss_clip": 0.01393558, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.2347157, "balance_loss_mlp": 1.0132494, "epoch": 0.8709454381482038, "flos": 30562560668160.0, "grad_norm": 1.9679381951053145, "language_loss": 0.63983279, "learning_rate": 1.7210737189634373e-07, "loss": 0.66410154, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.20068359, "step": 14486, "time_per_iteration": 2.8983259201049805 }, { "auxiliary_loss_clip": 0.0140618, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.24365604, "balance_loss_mlp": 1.01729488, "epoch": 0.8710055614008718, "flos": 22611630654720.0, "grad_norm": 1.889378265993119, "language_loss": 0.62403345, "learning_rate": 1.7194934933064653e-07, "loss": 0.64846277, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19445801, "step": 14487, "time_per_iteration": 2.8537940979003906 }, { "auxiliary_loss_clip": 0.01392477, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.23432207, "balance_loss_mlp": 1.01372457, "epoch": 0.8710656846535397, "flos": 18452561673600.0, "grad_norm": 1.9138746580842707, "language_loss": 0.6834873, "learning_rate": 1.7179139608481318e-07, "loss": 0.7077409, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19177246, "step": 14488, "time_per_iteration": 2.832899570465088 }, { "auxiliary_loss_clip": 0.01400447, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.24024916, "balance_loss_mlp": 1.01149762, "epoch": 0.8711258079062077, "flos": 16511336472960.0, "grad_norm": 1.7589201605360285, "language_loss": 0.85979527, "learning_rate": 1.716335121648338e-07, "loss": 0.88410544, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1907959, "step": 14489, "time_per_iteration": 2.877612352371216 }, { "auxiliary_loss_clip": 0.01415212, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.24970102, "balance_loss_mlp": 1.01557755, "epoch": 0.8711859311588757, "flos": 15670242040320.0, "grad_norm": 2.121258253828598, "language_loss": 0.76955205, "learning_rate": 1.7147569757669445e-07, "loss": 0.79405129, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.19128418, "step": 14490, "time_per_iteration": 2.812361240386963 }, { "auxiliary_loss_clip": 0.01395348, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.23321521, "balance_loss_mlp": 1.01357007, "epoch": 0.8712460544115437, "flos": 15565373297280.0, "grad_norm": 2.257525942588463, "language_loss": 0.77423251, "learning_rate": 1.7131795232638012e-07, "loss": 0.79852796, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.20593262, "step": 14491, "time_per_iteration": 2.819559097290039 }, { "auxiliary_loss_clip": 0.01395673, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.23789608, "balance_loss_mlp": 1.01081896, "epoch": 0.8713061776642116, "flos": 16772227845120.0, "grad_norm": 2.027780221765637, "language_loss": 0.67562294, "learning_rate": 1.711602764198723e-07, "loss": 0.69987959, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19177246, "step": 14492, "time_per_iteration": 4.270235538482666 }, { "auxiliary_loss_clip": 0.01382203, "auxiliary_loss_mlp": 0.01029934, "balance_loss_clip": 1.22587609, "balance_loss_mlp": 1.01143241, "epoch": 0.8713663009168796, "flos": 24290561894400.0, "grad_norm": 1.855502056993065, "language_loss": 0.70015925, "learning_rate": 1.7100266986314992e-07, "loss": 0.7242806, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18481445, "step": 14493, "time_per_iteration": 2.8644697666168213 }, { "auxiliary_loss_clip": 0.01399911, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.24001122, "balance_loss_mlp": 1.02093148, "epoch": 0.8714264241695475, "flos": 23803554418560.0, "grad_norm": 2.258292149789749, "language_loss": 0.9006319, "learning_rate": 1.7084513266218936e-07, "loss": 0.92503566, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1953125, "step": 14494, "time_per_iteration": 2.8377974033355713 }, { "auxiliary_loss_clip": 0.01391579, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.23531556, "balance_loss_mlp": 1.01611423, "epoch": 0.8714865474222155, "flos": 38012022852480.0, "grad_norm": 1.7521242887682553, "language_loss": 0.59987915, "learning_rate": 1.7068766482296514e-07, "loss": 0.62413859, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18237305, "step": 14495, "time_per_iteration": 2.9833669662475586 }, { "auxiliary_loss_clip": 0.01399854, "auxiliary_loss_mlp": 0.01033519, "balance_loss_clip": 1.23957515, "balance_loss_mlp": 1.01402879, "epoch": 0.8715466706748836, "flos": 22465923840000.0, "grad_norm": 1.8832710080933037, "language_loss": 0.81558323, "learning_rate": 1.7053026635144762e-07, "loss": 0.83991694, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19482422, "step": 14496, "time_per_iteration": 4.241694211959839 }, { "auxiliary_loss_clip": 0.01394137, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.23487568, "balance_loss_mlp": 1.01427269, "epoch": 0.8716067939275515, "flos": 21225017940480.0, "grad_norm": 2.1897571942012592, "language_loss": 0.79523218, "learning_rate": 1.7037293725360624e-07, "loss": 0.81951511, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19885254, "step": 14497, "time_per_iteration": 2.8140978813171387 }, { "auxiliary_loss_clip": 0.01399963, "auxiliary_loss_mlp": 0.01031755, "balance_loss_clip": 1.23762989, "balance_loss_mlp": 1.01144147, "epoch": 0.8716669171802195, "flos": 23006736662400.0, "grad_norm": 2.11917070523064, "language_loss": 0.67965698, "learning_rate": 1.70215677535406e-07, "loss": 0.70397419, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.203125, "step": 14498, "time_per_iteration": 2.856161117553711 }, { "auxiliary_loss_clip": 0.01398027, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.23799264, "balance_loss_mlp": 1.01036334, "epoch": 0.8717270404328874, "flos": 29794817335680.0, "grad_norm": 1.5228502102404373, "language_loss": 0.57347214, "learning_rate": 1.700584872028108e-07, "loss": 0.59774482, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18884277, "step": 14499, "time_per_iteration": 2.9195942878723145 }, { "auxiliary_loss_clip": 0.01399016, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.23794472, "balance_loss_mlp": 1.01456332, "epoch": 0.8717871636855554, "flos": 22028305703040.0, "grad_norm": 1.8921162921051295, "language_loss": 0.806647, "learning_rate": 1.6990136626178097e-07, "loss": 0.83098173, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19885254, "step": 14500, "time_per_iteration": 2.8478164672851562 }, { "auxiliary_loss_clip": 0.01393393, "auxiliary_loss_mlp": 0.0103155, "balance_loss_clip": 1.2342627, "balance_loss_mlp": 1.01263189, "epoch": 0.8718472869382233, "flos": 16662925111680.0, "grad_norm": 1.9239426401124038, "language_loss": 0.73399538, "learning_rate": 1.6974431471827466e-07, "loss": 0.75824475, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18908691, "step": 14501, "time_per_iteration": 4.219645977020264 }, { "auxiliary_loss_clip": 0.0139837, "auxiliary_loss_mlp": 0.01034573, "balance_loss_clip": 1.2355845, "balance_loss_mlp": 1.01497531, "epoch": 0.8719074101908914, "flos": 19503484081920.0, "grad_norm": 1.6566606128263373, "language_loss": 0.65280735, "learning_rate": 1.695873325782482e-07, "loss": 0.67713678, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19604492, "step": 14502, "time_per_iteration": 4.269784688949585 }, { "auxiliary_loss_clip": 0.01394865, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.23562539, "balance_loss_mlp": 1.01630759, "epoch": 0.8719675334435593, "flos": 33083310257280.0, "grad_norm": 1.5840109104823612, "language_loss": 0.69407964, "learning_rate": 1.6943041984765262e-07, "loss": 0.71839315, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.20178223, "step": 14503, "time_per_iteration": 2.9754064083099365 }, { "auxiliary_loss_clip": 0.01398741, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.23938107, "balance_loss_mlp": 1.01330388, "epoch": 0.8720276566962273, "flos": 13634237687040.0, "grad_norm": 2.1469288547328853, "language_loss": 0.70863909, "learning_rate": 1.6927357653243912e-07, "loss": 0.7329455, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18591309, "step": 14504, "time_per_iteration": 2.850471019744873 }, { "auxiliary_loss_clip": 0.01393721, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.2336576, "balance_loss_mlp": 1.01430118, "epoch": 0.8720877799488952, "flos": 23524764088320.0, "grad_norm": 1.7617838773121666, "language_loss": 0.70760155, "learning_rate": 1.691168026385552e-07, "loss": 0.73187709, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1953125, "step": 14505, "time_per_iteration": 2.954744577407837 }, { "auxiliary_loss_clip": 0.01396049, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.23810768, "balance_loss_mlp": 1.00915825, "epoch": 0.8721479032015632, "flos": 20824030108800.0, "grad_norm": 1.7287702405324836, "language_loss": 0.7912941, "learning_rate": 1.6896009817194545e-07, "loss": 0.8155297, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18347168, "step": 14506, "time_per_iteration": 2.835397720336914 }, { "auxiliary_loss_clip": 0.01414365, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.25013852, "balance_loss_mlp": 1.01061034, "epoch": 0.8722080264542311, "flos": 19473052314240.0, "grad_norm": 2.8651532762943335, "language_loss": 0.7467159, "learning_rate": 1.6880346313855221e-07, "loss": 0.77115643, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1907959, "step": 14507, "time_per_iteration": 2.850752592086792 }, { "auxiliary_loss_clip": 0.01410346, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.24646139, "balance_loss_mlp": 1.01530111, "epoch": 0.8722681497068991, "flos": 21771757831680.0, "grad_norm": 2.409411190495622, "language_loss": 0.73404622, "learning_rate": 1.686468975443156e-07, "loss": 0.75850093, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19812012, "step": 14508, "time_per_iteration": 2.8691678047180176 }, { "auxiliary_loss_clip": 0.01405364, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.24332738, "balance_loss_mlp": 1.01247096, "epoch": 0.8723282729595672, "flos": 28889013559680.0, "grad_norm": 1.6802051288749196, "language_loss": 0.69389772, "learning_rate": 1.6849040139517202e-07, "loss": 0.71827304, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19714355, "step": 14509, "time_per_iteration": 2.9333951473236084 }, { "auxiliary_loss_clip": 0.01396986, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.23791218, "balance_loss_mlp": 1.01464057, "epoch": 0.8723883962122351, "flos": 26480417126400.0, "grad_norm": 1.650314309297747, "language_loss": 0.59249383, "learning_rate": 1.683339746970558e-07, "loss": 0.61680746, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.1973877, "step": 14510, "time_per_iteration": 2.84717059135437 }, { "auxiliary_loss_clip": 0.01430416, "auxiliary_loss_mlp": 0.01035416, "balance_loss_clip": 1.26207161, "balance_loss_mlp": 1.0156157, "epoch": 0.8724485194649031, "flos": 20531078156160.0, "grad_norm": 2.0612499261695114, "language_loss": 0.68606877, "learning_rate": 1.6817761745589865e-07, "loss": 0.71072704, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 1.68457031, "router_z_loss_mlp": 0.19799805, "step": 14511, "time_per_iteration": 2.851417064666748 }, { "auxiliary_loss_clip": 0.01395702, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.23470294, "balance_loss_mlp": 1.0141865, "epoch": 0.872508642717571, "flos": 24364184463360.0, "grad_norm": 1.6713380153841249, "language_loss": 0.82528794, "learning_rate": 1.6802132967763027e-07, "loss": 0.84958196, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19506836, "step": 14512, "time_per_iteration": 2.861938238143921 }, { "auxiliary_loss_clip": 0.01175715, "auxiliary_loss_mlp": 0.01016015, "balance_loss_clip": 1.09061146, "balance_loss_mlp": 0.9992308, "epoch": 0.872568765970239, "flos": 61436578412160.0, "grad_norm": 0.7838666426177744, "language_loss": 0.58684838, "learning_rate": 1.6786511136817617e-07, "loss": 0.60876572, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.16796875, "step": 14513, "time_per_iteration": 3.2189137935638428 }, { "auxiliary_loss_clip": 0.01399984, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.24096608, "balance_loss_mlp": 1.01283872, "epoch": 0.8726288892229069, "flos": 22607377643520.0, "grad_norm": 1.7994884710298653, "language_loss": 0.77474624, "learning_rate": 1.6770896253346112e-07, "loss": 0.79906869, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19433594, "step": 14514, "time_per_iteration": 2.861544609069824 }, { "auxiliary_loss_clip": 0.01416959, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.25260353, "balance_loss_mlp": 1.0161339, "epoch": 0.872689012475575, "flos": 25896277768320.0, "grad_norm": 2.113841468721925, "language_loss": 0.6572271, "learning_rate": 1.675528831794055e-07, "loss": 0.68175209, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1940918, "step": 14515, "time_per_iteration": 2.9077186584472656 }, { "auxiliary_loss_clip": 0.01404025, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.24275291, "balance_loss_mlp": 1.01505089, "epoch": 0.8727491357282429, "flos": 21516612549120.0, "grad_norm": 2.047438846377193, "language_loss": 0.7902782, "learning_rate": 1.6739687331192842e-07, "loss": 0.81466556, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19677734, "step": 14516, "time_per_iteration": 2.839146852493286 }, { "auxiliary_loss_clip": 0.01404782, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.24231148, "balance_loss_mlp": 1.01440811, "epoch": 0.8728092589809109, "flos": 19216640177280.0, "grad_norm": 2.868521648496266, "language_loss": 0.72927523, "learning_rate": 1.672409329369453e-07, "loss": 0.75366801, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20092773, "step": 14517, "time_per_iteration": 2.8146183490753174 }, { "auxiliary_loss_clip": 0.01384079, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.22673607, "balance_loss_mlp": 1.01346159, "epoch": 0.8728693822335788, "flos": 20605153173120.0, "grad_norm": 2.7998354443366757, "language_loss": 0.7340976, "learning_rate": 1.6708506206036966e-07, "loss": 0.75825691, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18395996, "step": 14518, "time_per_iteration": 2.8527588844299316 }, { "auxiliary_loss_clip": 0.01386099, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.22984862, "balance_loss_mlp": 1.01531696, "epoch": 0.8729295054862468, "flos": 21739335292800.0, "grad_norm": 3.03799241586724, "language_loss": 0.74364018, "learning_rate": 1.6692926068811275e-07, "loss": 0.76784396, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18969727, "step": 14519, "time_per_iteration": 2.8608503341674805 }, { "auxiliary_loss_clip": 0.01405556, "auxiliary_loss_mlp": 0.01033116, "balance_loss_clip": 1.24326122, "balance_loss_mlp": 1.01285088, "epoch": 0.8729896287389147, "flos": 17681741694720.0, "grad_norm": 6.477863747667627, "language_loss": 0.77695763, "learning_rate": 1.6677352882608142e-07, "loss": 0.80134439, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20263672, "step": 14520, "time_per_iteration": 2.808223247528076 }, { "auxiliary_loss_clip": 0.01415503, "auxiliary_loss_mlp": 0.01040586, "balance_loss_clip": 1.25166368, "balance_loss_mlp": 1.0193311, "epoch": 0.8730497519915827, "flos": 24582563706240.0, "grad_norm": 2.0124447130800553, "language_loss": 0.83009541, "learning_rate": 1.666178664801816e-07, "loss": 0.85465628, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.21240234, "step": 14521, "time_per_iteration": 2.884317636489868 }, { "auxiliary_loss_clip": 0.01406393, "auxiliary_loss_mlp": 0.01033693, "balance_loss_clip": 1.24460363, "balance_loss_mlp": 1.01428533, "epoch": 0.8731098752442508, "flos": 13451629098240.0, "grad_norm": 2.0153919648532455, "language_loss": 0.77352178, "learning_rate": 1.6646227365631616e-07, "loss": 0.79792267, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19396973, "step": 14522, "time_per_iteration": 2.8259174823760986 }, { "auxiliary_loss_clip": 0.01390466, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.23269963, "balance_loss_mlp": 1.01056755, "epoch": 0.8731699984969187, "flos": 23484197485440.0, "grad_norm": 1.753240857490492, "language_loss": 0.75801015, "learning_rate": 1.66306750360385e-07, "loss": 0.78219879, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17834473, "step": 14523, "time_per_iteration": 2.865238666534424 }, { "auxiliary_loss_clip": 0.0138578, "auxiliary_loss_mlp": 0.01034133, "balance_loss_clip": 1.22820711, "balance_loss_mlp": 1.01559556, "epoch": 0.8732301217495867, "flos": 17721674870400.0, "grad_norm": 2.510104436979704, "language_loss": 0.80166829, "learning_rate": 1.6615129659828542e-07, "loss": 0.82586735, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1854248, "step": 14524, "time_per_iteration": 2.796372890472412 }, { "auxiliary_loss_clip": 0.01380259, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.22547042, "balance_loss_mlp": 1.01247287, "epoch": 0.8732902450022546, "flos": 22064483560320.0, "grad_norm": 1.998008075763111, "language_loss": 0.78809273, "learning_rate": 1.6599591237591272e-07, "loss": 0.81220198, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18200684, "step": 14525, "time_per_iteration": 2.9252774715423584 }, { "auxiliary_loss_clip": 0.01408923, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.24617076, "balance_loss_mlp": 1.01521409, "epoch": 0.8733503682549226, "flos": 22283089027200.0, "grad_norm": 1.9518667072202396, "language_loss": 0.69945467, "learning_rate": 1.6584059769915902e-07, "loss": 0.72389185, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19580078, "step": 14526, "time_per_iteration": 2.9935317039489746 }, { "auxiliary_loss_clip": 0.0141776, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.25160527, "balance_loss_mlp": 1.0169785, "epoch": 0.8734104915075905, "flos": 23373899366400.0, "grad_norm": 2.7999471478207174, "language_loss": 0.61747861, "learning_rate": 1.6568535257391326e-07, "loss": 0.64203578, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.20959473, "step": 14527, "time_per_iteration": 4.316193342208862 }, { "auxiliary_loss_clip": 0.01413173, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.24397826, "balance_loss_mlp": 1.01525891, "epoch": 0.8734706147602586, "flos": 17721222422400.0, "grad_norm": 2.0380285058876524, "language_loss": 0.67031258, "learning_rate": 1.6553017700606265e-07, "loss": 0.69481051, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 1.69238281, "router_z_loss_mlp": 0.21374512, "step": 14528, "time_per_iteration": 2.8054394721984863 }, { "auxiliary_loss_clip": 0.01387037, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.23108196, "balance_loss_mlp": 1.01318645, "epoch": 0.8735307380129265, "flos": 22058194533120.0, "grad_norm": 2.827082939554064, "language_loss": 0.91239512, "learning_rate": 1.6537507100149205e-07, "loss": 0.93658423, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18676758, "step": 14529, "time_per_iteration": 2.8768117427825928 }, { "auxiliary_loss_clip": 0.01383533, "auxiliary_loss_mlp": 0.01029845, "balance_loss_clip": 1.22843122, "balance_loss_mlp": 1.01122439, "epoch": 0.8735908612655945, "flos": 25349492632320.0, "grad_norm": 1.7121965263513501, "language_loss": 0.85498393, "learning_rate": 1.6522003456608258e-07, "loss": 0.87911773, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18615723, "step": 14530, "time_per_iteration": 4.35499906539917 }, { "auxiliary_loss_clip": 0.01395043, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.23383904, "balance_loss_mlp": 1.01574516, "epoch": 0.8736509845182624, "flos": 21550211452800.0, "grad_norm": 2.7827777118734196, "language_loss": 0.75430954, "learning_rate": 1.650650677057128e-07, "loss": 0.77859831, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18103027, "step": 14531, "time_per_iteration": 2.9153575897216797 }, { "auxiliary_loss_clip": 0.01388199, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.23183131, "balance_loss_mlp": 1.01386523, "epoch": 0.8737111077709304, "flos": 22027174583040.0, "grad_norm": 1.9844539594981296, "language_loss": 0.62101412, "learning_rate": 1.6491017042625966e-07, "loss": 0.64521772, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18310547, "step": 14532, "time_per_iteration": 2.87565016746521 }, { "auxiliary_loss_clip": 0.01175407, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.08805537, "balance_loss_mlp": 1.0066222, "epoch": 0.8737712310235983, "flos": 70097510113920.0, "grad_norm": 0.8218434768953291, "language_loss": 0.58738869, "learning_rate": 1.6475534273359704e-07, "loss": 0.60940832, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.19921875, "step": 14533, "time_per_iteration": 3.4332005977630615 }, { "auxiliary_loss_clip": 0.0138631, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.23056281, "balance_loss_mlp": 1.01161182, "epoch": 0.8738313542762663, "flos": 28670181868800.0, "grad_norm": 1.552895539278509, "language_loss": 0.77490318, "learning_rate": 1.646005846335954e-07, "loss": 0.79906023, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.17785645, "step": 14534, "time_per_iteration": 2.903783082962036 }, { "auxiliary_loss_clip": 0.0139697, "auxiliary_loss_mlp": 0.01030558, "balance_loss_clip": 1.23647594, "balance_loss_mlp": 1.01279545, "epoch": 0.8738914775289344, "flos": 22356621106560.0, "grad_norm": 6.729236427207113, "language_loss": 0.7545082, "learning_rate": 1.6444589613212357e-07, "loss": 0.77878344, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.17773438, "step": 14535, "time_per_iteration": 2.861818552017212 }, { "auxiliary_loss_clip": 0.01394883, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.23517179, "balance_loss_mlp": 1.01478982, "epoch": 0.8739516007816023, "flos": 31772310883200.0, "grad_norm": 1.8213681825937396, "language_loss": 0.75026655, "learning_rate": 1.64291277235048e-07, "loss": 0.77455491, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19165039, "step": 14536, "time_per_iteration": 5.877175569534302 }, { "auxiliary_loss_clip": 0.01402876, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.24078453, "balance_loss_mlp": 1.01655412, "epoch": 0.8740117240342703, "flos": 21220719684480.0, "grad_norm": 2.6658033817075, "language_loss": 0.64779741, "learning_rate": 1.641367279482304e-07, "loss": 0.67217743, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18566895, "step": 14537, "time_per_iteration": 2.8556387424468994 }, { "auxiliary_loss_clip": 0.01391422, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.23202801, "balance_loss_mlp": 1.01248074, "epoch": 0.8740718472869382, "flos": 25196184691200.0, "grad_norm": 1.8986344905485446, "language_loss": 0.58654159, "learning_rate": 1.6398224827753216e-07, "loss": 0.6107713, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.1907959, "step": 14538, "time_per_iteration": 2.8689358234405518 }, { "auxiliary_loss_clip": 0.01379434, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.22621775, "balance_loss_mlp": 1.01220679, "epoch": 0.8741319705396062, "flos": 19510542270720.0, "grad_norm": 1.8130766628113275, "language_loss": 0.69230205, "learning_rate": 1.6382783822881142e-07, "loss": 0.71640515, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.18652344, "step": 14539, "time_per_iteration": 2.8235397338867188 }, { "auxiliary_loss_clip": 0.01415761, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.25080454, "balance_loss_mlp": 1.01735711, "epoch": 0.8741920937922741, "flos": 14109074311680.0, "grad_norm": 1.723264057671699, "language_loss": 0.7458154, "learning_rate": 1.6367349780792262e-07, "loss": 0.77033579, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18908691, "step": 14540, "time_per_iteration": 2.834864377975464 }, { "auxiliary_loss_clip": 0.01392742, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.23385859, "balance_loss_mlp": 1.01513267, "epoch": 0.8742522170449422, "flos": 27721504005120.0, "grad_norm": 1.6711001694633634, "language_loss": 0.79411769, "learning_rate": 1.635192270207193e-07, "loss": 0.81838167, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18530273, "step": 14541, "time_per_iteration": 2.90102481842041 }, { "auxiliary_loss_clip": 0.0140872, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.24356198, "balance_loss_mlp": 1.01305151, "epoch": 0.8743123402976101, "flos": 21152571736320.0, "grad_norm": 2.5411470394636213, "language_loss": 0.67308843, "learning_rate": 1.6336502587305035e-07, "loss": 0.69750875, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.20263672, "step": 14542, "time_per_iteration": 2.826150417327881 }, { "auxiliary_loss_clip": 0.0117945, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.09071875, "balance_loss_mlp": 1.00995588, "epoch": 0.8743724635502781, "flos": 60898344543360.0, "grad_norm": 0.7832091864413139, "language_loss": 0.54505205, "learning_rate": 1.632108943707642e-07, "loss": 0.56716162, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.21582031, "step": 14543, "time_per_iteration": 3.1768436431884766 }, { "auxiliary_loss_clip": 0.01402706, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.24083781, "balance_loss_mlp": 1.01430202, "epoch": 0.874432586802946, "flos": 28120546310400.0, "grad_norm": 2.246527702620527, "language_loss": 0.71007776, "learning_rate": 1.6305683251970458e-07, "loss": 0.73443413, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1862793, "step": 14544, "time_per_iteration": 2.899484395980835 }, { "auxiliary_loss_clip": 0.0138124, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.22722459, "balance_loss_mlp": 1.01345754, "epoch": 0.874492710055614, "flos": 23560806211200.0, "grad_norm": 1.4405795940735293, "language_loss": 0.76682305, "learning_rate": 1.62902840325714e-07, "loss": 0.79094684, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.17675781, "step": 14545, "time_per_iteration": 2.933885335922241 }, { "auxiliary_loss_clip": 0.01395377, "auxiliary_loss_mlp": 0.0103762, "balance_loss_clip": 1.23621106, "balance_loss_mlp": 1.01777148, "epoch": 0.8745528333082819, "flos": 40930185934080.0, "grad_norm": 1.5568163303388918, "language_loss": 0.66541523, "learning_rate": 1.6274891779463217e-07, "loss": 0.68974519, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.1986084, "step": 14546, "time_per_iteration": 3.012847423553467 }, { "auxiliary_loss_clip": 0.01400234, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.23953891, "balance_loss_mlp": 1.01318264, "epoch": 0.87461295656095, "flos": 23633342904960.0, "grad_norm": 1.5785269501033563, "language_loss": 0.73047793, "learning_rate": 1.6259506493229536e-07, "loss": 0.75479567, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18359375, "step": 14547, "time_per_iteration": 2.8574347496032715 }, { "auxiliary_loss_clip": 0.01425768, "auxiliary_loss_mlp": 0.01036381, "balance_loss_clip": 1.25858486, "balance_loss_mlp": 1.01699781, "epoch": 0.874673079813618, "flos": 38806714103040.0, "grad_norm": 3.191415671815469, "language_loss": 0.70813894, "learning_rate": 1.6244128174453752e-07, "loss": 0.73276043, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19384766, "step": 14548, "time_per_iteration": 3.0851807594299316 }, { "auxiliary_loss_clip": 0.01405881, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.24234152, "balance_loss_mlp": 1.01565433, "epoch": 0.8747332030662859, "flos": 23706694005120.0, "grad_norm": 1.8748242458868416, "language_loss": 0.71372175, "learning_rate": 1.6228756823719093e-07, "loss": 0.73813522, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19812012, "step": 14549, "time_per_iteration": 2.9585132598876953 }, { "auxiliary_loss_clip": 0.01412341, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.24648058, "balance_loss_mlp": 1.01187992, "epoch": 0.8747933263189539, "flos": 24473215728000.0, "grad_norm": 2.486587281565092, "language_loss": 0.83685005, "learning_rate": 1.6213392441608352e-07, "loss": 0.86127973, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.18725586, "step": 14550, "time_per_iteration": 2.985731840133667 }, { "auxiliary_loss_clip": 0.0140023, "auxiliary_loss_mlp": 0.01036971, "balance_loss_clip": 1.23707414, "balance_loss_mlp": 1.01738465, "epoch": 0.8748534495716218, "flos": 13817298723840.0, "grad_norm": 1.5725863367140747, "language_loss": 0.7315315, "learning_rate": 1.6198035028704183e-07, "loss": 0.75590354, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19616699, "step": 14551, "time_per_iteration": 2.814605474472046 }, { "auxiliary_loss_clip": 0.01392083, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.23508894, "balance_loss_mlp": 1.01588321, "epoch": 0.8749135728242898, "flos": 29874185994240.0, "grad_norm": 3.4662093391090045, "language_loss": 0.6547662, "learning_rate": 1.6182684585588934e-07, "loss": 0.67904568, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19970703, "step": 14552, "time_per_iteration": 2.916895627975464 }, { "auxiliary_loss_clip": 0.01402615, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.24040937, "balance_loss_mlp": 1.01452589, "epoch": 0.8749736960769577, "flos": 24143995428480.0, "grad_norm": 1.7458883283222435, "language_loss": 0.8027088, "learning_rate": 1.616734111284479e-07, "loss": 0.82708359, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20336914, "step": 14553, "time_per_iteration": 2.8357186317443848 }, { "auxiliary_loss_clip": 0.01408618, "auxiliary_loss_mlp": 0.0103219, "balance_loss_clip": 1.24515009, "balance_loss_mlp": 1.01367712, "epoch": 0.8750338193296258, "flos": 17211610529280.0, "grad_norm": 1.9389192577948122, "language_loss": 0.71086687, "learning_rate": 1.6152004611053416e-07, "loss": 0.73527491, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18530273, "step": 14554, "time_per_iteration": 2.7916648387908936 }, { "auxiliary_loss_clip": 0.01389832, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.23172784, "balance_loss_mlp": 1.01130557, "epoch": 0.8750939425822937, "flos": 23743912492800.0, "grad_norm": 3.5453999436056907, "language_loss": 0.84358829, "learning_rate": 1.6136675080796457e-07, "loss": 0.86778241, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18273926, "step": 14555, "time_per_iteration": 2.8653101921081543 }, { "auxiliary_loss_clip": 0.01400613, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.24214292, "balance_loss_mlp": 1.01202679, "epoch": 0.8751540658349617, "flos": 26552863330560.0, "grad_norm": 2.4195983332969693, "language_loss": 0.71671999, "learning_rate": 1.6121352522655252e-07, "loss": 0.74103969, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19348145, "step": 14556, "time_per_iteration": 2.8892199993133545 }, { "auxiliary_loss_clip": 0.01406864, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.24334764, "balance_loss_mlp": 1.01240087, "epoch": 0.8752141890876296, "flos": 19395176734080.0, "grad_norm": 1.8417421632178812, "language_loss": 0.77412069, "learning_rate": 1.6106036937210732e-07, "loss": 0.79850668, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19335938, "step": 14557, "time_per_iteration": 2.81040358543396 }, { "auxiliary_loss_clip": 0.01403418, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.24356651, "balance_loss_mlp": 1.01767278, "epoch": 0.8752743123402976, "flos": 25385353776000.0, "grad_norm": 2.1886773985659587, "language_loss": 0.83521748, "learning_rate": 1.6090728325043767e-07, "loss": 0.85962033, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1920166, "step": 14558, "time_per_iteration": 2.8760411739349365 }, { "auxiliary_loss_clip": 0.01183461, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.09518564, "balance_loss_mlp": 1.00884044, "epoch": 0.8753344355929655, "flos": 59979057816960.0, "grad_norm": 0.8086867445110079, "language_loss": 0.56088138, "learning_rate": 1.6075426686734784e-07, "loss": 0.58299601, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.19140625, "step": 14559, "time_per_iteration": 3.3502023220062256 }, { "auxiliary_loss_clip": 0.01389778, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.23103571, "balance_loss_mlp": 1.01747167, "epoch": 0.8753945588456336, "flos": 17903604787200.0, "grad_norm": 1.6879200994850063, "language_loss": 0.66894281, "learning_rate": 1.606013202286407e-07, "loss": 0.69319725, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18188477, "step": 14560, "time_per_iteration": 2.832836866378784 }, { "auxiliary_loss_clip": 0.01389965, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.23142171, "balance_loss_mlp": 1.01106799, "epoch": 0.8754546820983016, "flos": 30925877564160.0, "grad_norm": 1.7609542568341663, "language_loss": 0.79805505, "learning_rate": 1.6044844334011541e-07, "loss": 0.82225358, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18823242, "step": 14561, "time_per_iteration": 2.9340782165527344 }, { "auxiliary_loss_clip": 0.01407691, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.24428749, "balance_loss_mlp": 1.01478648, "epoch": 0.8755148053509695, "flos": 20640742848000.0, "grad_norm": 1.9696508094098846, "language_loss": 0.78450501, "learning_rate": 1.6029563620756982e-07, "loss": 0.8089236, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19384766, "step": 14562, "time_per_iteration": 4.361043930053711 }, { "auxiliary_loss_clip": 0.01366914, "auxiliary_loss_mlp": 0.01031029, "balance_loss_clip": 1.21368408, "balance_loss_mlp": 1.01122844, "epoch": 0.8755749286036375, "flos": 34983063959040.0, "grad_norm": 1.354483903502839, "language_loss": 0.72335875, "learning_rate": 1.601428988367981e-07, "loss": 0.74733818, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.19787598, "step": 14563, "time_per_iteration": 3.018146514892578 }, { "auxiliary_loss_clip": 0.01410474, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.24683428, "balance_loss_mlp": 1.01440406, "epoch": 0.8756350518563054, "flos": 18195470864640.0, "grad_norm": 4.622220576002593, "language_loss": 0.66727519, "learning_rate": 1.5999023123359235e-07, "loss": 0.69171834, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19421387, "step": 14564, "time_per_iteration": 2.838650941848755 }, { "auxiliary_loss_clip": 0.01391181, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.22988224, "balance_loss_mlp": 1.01819253, "epoch": 0.8756951751089734, "flos": 20093595753600.0, "grad_norm": 1.6263175282692692, "language_loss": 0.71499807, "learning_rate": 1.598376334037408e-07, "loss": 0.7392804, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18859863, "step": 14565, "time_per_iteration": 2.835254669189453 }, { "auxiliary_loss_clip": 0.01420417, "auxiliary_loss_mlp": 0.01036082, "balance_loss_clip": 1.25334954, "balance_loss_mlp": 1.01593542, "epoch": 0.8757552983616413, "flos": 27536271217920.0, "grad_norm": 1.4425460454413448, "language_loss": 0.78255939, "learning_rate": 1.5968510535303102e-07, "loss": 0.80712438, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 1.66992188, "router_z_loss_mlp": 0.20141602, "step": 14566, "time_per_iteration": 4.29935884475708 }, { "auxiliary_loss_clip": 0.0139438, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.23565209, "balance_loss_mlp": 1.01244068, "epoch": 0.8758154216143094, "flos": 18080964979200.0, "grad_norm": 1.6815790117725005, "language_loss": 0.71809208, "learning_rate": 1.5953264708724624e-07, "loss": 0.74234891, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18859863, "step": 14567, "time_per_iteration": 2.838391065597534 }, { "auxiliary_loss_clip": 0.01387547, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.22979569, "balance_loss_mlp": 1.0140475, "epoch": 0.8758755448669773, "flos": 25056450190080.0, "grad_norm": 1.6454278888543241, "language_loss": 0.74706745, "learning_rate": 1.5938025861216776e-07, "loss": 0.77127481, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19116211, "step": 14568, "time_per_iteration": 2.868021249771118 }, { "auxiliary_loss_clip": 0.01398832, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.24015117, "balance_loss_mlp": 1.01560974, "epoch": 0.8759356681196453, "flos": 22867092650880.0, "grad_norm": 2.0044067074477043, "language_loss": 0.87692624, "learning_rate": 1.5922793993357475e-07, "loss": 0.90125889, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18835449, "step": 14569, "time_per_iteration": 2.8624300956726074 }, { "auxiliary_loss_clip": 0.0139688, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.23741734, "balance_loss_mlp": 1.01334774, "epoch": 0.8759957913723132, "flos": 21042318862080.0, "grad_norm": 1.6172541537692908, "language_loss": 0.7441141, "learning_rate": 1.5907569105724284e-07, "loss": 0.76841044, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1940918, "step": 14570, "time_per_iteration": 2.8491714000701904 }, { "auxiliary_loss_clip": 0.01406136, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.24298334, "balance_loss_mlp": 1.0144136, "epoch": 0.8760559146249812, "flos": 20019882695040.0, "grad_norm": 1.5786310937617332, "language_loss": 0.68108803, "learning_rate": 1.5892351198894472e-07, "loss": 0.70548469, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19116211, "step": 14571, "time_per_iteration": 4.36399245262146 }, { "auxiliary_loss_clip": 0.01389846, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.23225379, "balance_loss_mlp": 1.01394784, "epoch": 0.8761160378776491, "flos": 19983614348160.0, "grad_norm": 1.7881219684113248, "language_loss": 0.63723749, "learning_rate": 1.5877140273445156e-07, "loss": 0.66146815, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19287109, "step": 14572, "time_per_iteration": 2.842728853225708 }, { "auxiliary_loss_clip": 0.01389908, "auxiliary_loss_mlp": 0.01030996, "balance_loss_clip": 1.23241735, "balance_loss_mlp": 1.01269722, "epoch": 0.8761761611303172, "flos": 28816341131520.0, "grad_norm": 1.6659248974297587, "language_loss": 0.74431419, "learning_rate": 1.5861936329953162e-07, "loss": 0.76852322, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.1829834, "step": 14573, "time_per_iteration": 2.8893232345581055 }, { "auxiliary_loss_clip": 0.01379174, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.22454512, "balance_loss_mlp": 1.01234591, "epoch": 0.8762362843829851, "flos": 18341584882560.0, "grad_norm": 2.179745025027988, "language_loss": 0.74026251, "learning_rate": 1.5846739368994966e-07, "loss": 0.76435846, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.18054199, "step": 14574, "time_per_iteration": 2.9263434410095215 }, { "auxiliary_loss_clip": 0.01397753, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.23825634, "balance_loss_mlp": 1.01347494, "epoch": 0.8762964076356531, "flos": 15787734082560.0, "grad_norm": 2.015437702081427, "language_loss": 0.7637105, "learning_rate": 1.5831549391146903e-07, "loss": 0.78801978, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19677734, "step": 14575, "time_per_iteration": 2.8207294940948486 }, { "auxiliary_loss_clip": 0.013777, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.22214651, "balance_loss_mlp": 1.01482248, "epoch": 0.8763565308883211, "flos": 33188269489920.0, "grad_norm": 1.7746192610879317, "language_loss": 0.67600274, "learning_rate": 1.5816366396984916e-07, "loss": 0.70011526, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18737793, "step": 14576, "time_per_iteration": 2.954775094985962 }, { "auxiliary_loss_clip": 0.01390121, "auxiliary_loss_mlp": 0.01029403, "balance_loss_clip": 1.23260164, "balance_loss_mlp": 1.0113194, "epoch": 0.876416654140989, "flos": 15896584368000.0, "grad_norm": 1.9809475219832893, "language_loss": 0.67364711, "learning_rate": 1.5801190387084806e-07, "loss": 0.69784236, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18078613, "step": 14577, "time_per_iteration": 2.841046094894409 }, { "auxiliary_loss_clip": 0.01409063, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.24731505, "balance_loss_mlp": 1.0138545, "epoch": 0.876476777393657, "flos": 25896323013120.0, "grad_norm": 2.7868600487990354, "language_loss": 0.71836185, "learning_rate": 1.5786021362021962e-07, "loss": 0.74278986, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19897461, "step": 14578, "time_per_iteration": 2.867885112762451 }, { "auxiliary_loss_clip": 0.01409704, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.24710107, "balance_loss_mlp": 1.01706553, "epoch": 0.876536900646325, "flos": 13597607381760.0, "grad_norm": 2.5019893296024174, "language_loss": 0.71905124, "learning_rate": 1.5770859322371676e-07, "loss": 0.7435075, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18847656, "step": 14579, "time_per_iteration": 2.86909556388855 }, { "auxiliary_loss_clip": 0.01383515, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.2280854, "balance_loss_mlp": 1.01516104, "epoch": 0.876597023898993, "flos": 12210949422720.0, "grad_norm": 1.7978800231177932, "language_loss": 0.71449172, "learning_rate": 1.5755704268708912e-07, "loss": 0.73866779, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18933105, "step": 14580, "time_per_iteration": 2.8579955101013184 }, { "auxiliary_loss_clip": 0.01390377, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.23419499, "balance_loss_mlp": 1.01202297, "epoch": 0.8766571471516609, "flos": 25347411371520.0, "grad_norm": 2.7907756847945406, "language_loss": 0.66408157, "learning_rate": 1.5740556201608256e-07, "loss": 0.68828917, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18359375, "step": 14581, "time_per_iteration": 2.8741085529327393 }, { "auxiliary_loss_clip": 0.01385503, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.22862625, "balance_loss_mlp": 1.0117451, "epoch": 0.8767172704043289, "flos": 30124354348800.0, "grad_norm": 1.4335121941418636, "language_loss": 0.73789227, "learning_rate": 1.572541512164416e-07, "loss": 0.76205099, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18615723, "step": 14582, "time_per_iteration": 2.9132890701293945 }, { "auxiliary_loss_clip": 0.01385493, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.22710896, "balance_loss_mlp": 1.01638246, "epoch": 0.8767773936569968, "flos": 19290488970240.0, "grad_norm": 2.21964689269826, "language_loss": 0.68084991, "learning_rate": 1.5710281029390826e-07, "loss": 0.70506388, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19519043, "step": 14583, "time_per_iteration": 2.8329358100891113 }, { "auxiliary_loss_clip": 0.014168, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.25446367, "balance_loss_mlp": 1.013098, "epoch": 0.8768375169096648, "flos": 21255992645760.0, "grad_norm": 1.9964576809012997, "language_loss": 0.79557854, "learning_rate": 1.5695153925422067e-07, "loss": 0.82006526, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18786621, "step": 14584, "time_per_iteration": 2.8180973529815674 }, { "auxiliary_loss_clip": 0.01410485, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.2480123, "balance_loss_mlp": 1.01074874, "epoch": 0.8768976401623327, "flos": 23306158621440.0, "grad_norm": 1.5984353444332926, "language_loss": 0.72999781, "learning_rate": 1.5680033810311555e-07, "loss": 0.75440234, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1920166, "step": 14585, "time_per_iteration": 2.9110145568847656 }, { "auxiliary_loss_clip": 0.01380807, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.22487926, "balance_loss_mlp": 1.01309419, "epoch": 0.8769577634150008, "flos": 21371222448000.0, "grad_norm": 1.7515902263188996, "language_loss": 0.74703622, "learning_rate": 1.5664920684632654e-07, "loss": 0.77117181, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.19665527, "step": 14586, "time_per_iteration": 2.8584306240081787 }, { "auxiliary_loss_clip": 0.01403366, "auxiliary_loss_mlp": 0.01034804, "balance_loss_clip": 1.24390531, "balance_loss_mlp": 1.01604033, "epoch": 0.8770178866676687, "flos": 23524628353920.0, "grad_norm": 1.7428649098725366, "language_loss": 0.79058915, "learning_rate": 1.564981454895844e-07, "loss": 0.81497085, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18786621, "step": 14587, "time_per_iteration": 2.868985414505005 }, { "auxiliary_loss_clip": 0.01396109, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.23637271, "balance_loss_mlp": 1.01357996, "epoch": 0.8770780099203367, "flos": 19728514310400.0, "grad_norm": 1.6268359934164138, "language_loss": 0.74307525, "learning_rate": 1.5634715403861697e-07, "loss": 0.76736778, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19543457, "step": 14588, "time_per_iteration": 2.8455989360809326 }, { "auxiliary_loss_clip": 0.01381422, "auxiliary_loss_mlp": 0.01032047, "balance_loss_clip": 1.22421694, "balance_loss_mlp": 1.01381946, "epoch": 0.8771381331730047, "flos": 21405409534080.0, "grad_norm": 1.9364790967196666, "language_loss": 0.67029893, "learning_rate": 1.5619623249915016e-07, "loss": 0.69443369, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18225098, "step": 14589, "time_per_iteration": 2.8480799198150635 }, { "auxiliary_loss_clip": 0.01383104, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.22532821, "balance_loss_mlp": 1.01448607, "epoch": 0.8771982564256726, "flos": 20270186784000.0, "grad_norm": 2.5196384827780807, "language_loss": 0.71562874, "learning_rate": 1.5604538087690732e-07, "loss": 0.73979521, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19055176, "step": 14590, "time_per_iteration": 2.952967882156372 }, { "auxiliary_loss_clip": 0.01408915, "auxiliary_loss_mlp": 0.01036082, "balance_loss_clip": 1.24258351, "balance_loss_mlp": 1.01563811, "epoch": 0.8772583796783406, "flos": 12495531087360.0, "grad_norm": 1.9319206192710896, "language_loss": 0.74816173, "learning_rate": 1.558945991776086e-07, "loss": 0.77261162, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 1.6640625, "router_z_loss_mlp": 0.20446777, "step": 14591, "time_per_iteration": 2.776198148727417 }, { "auxiliary_loss_clip": 0.01381547, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.22747695, "balance_loss_mlp": 1.01458931, "epoch": 0.8773185029310085, "flos": 15928690193280.0, "grad_norm": 1.9151856068137083, "language_loss": 0.81317222, "learning_rate": 1.5574388740697096e-07, "loss": 0.83731937, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18579102, "step": 14592, "time_per_iteration": 2.9132721424102783 }, { "auxiliary_loss_clip": 0.01381714, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.22614098, "balance_loss_mlp": 1.01411915, "epoch": 0.8773786261836766, "flos": 21513852616320.0, "grad_norm": 1.690862932115398, "language_loss": 0.83161139, "learning_rate": 1.5559324557071052e-07, "loss": 0.85574925, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.17956543, "step": 14593, "time_per_iteration": 2.8362715244293213 }, { "auxiliary_loss_clip": 0.01388449, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.23128688, "balance_loss_mlp": 1.01069212, "epoch": 0.8774387494363445, "flos": 26772961875840.0, "grad_norm": 1.4670691109880925, "language_loss": 0.7688396, "learning_rate": 1.5544267367453845e-07, "loss": 0.7930128, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18188477, "step": 14594, "time_per_iteration": 2.852783441543579 }, { "auxiliary_loss_clip": 0.01394728, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.23414993, "balance_loss_mlp": 1.01289654, "epoch": 0.8774988726890125, "flos": 18488060858880.0, "grad_norm": 2.1349559370271036, "language_loss": 0.78303945, "learning_rate": 1.552921717241651e-07, "loss": 0.80730295, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18713379, "step": 14595, "time_per_iteration": 2.8018088340759277 }, { "auxiliary_loss_clip": 0.01397398, "auxiliary_loss_mlp": 0.01035302, "balance_loss_clip": 1.23877704, "balance_loss_mlp": 1.01507223, "epoch": 0.8775589959416804, "flos": 24437218849920.0, "grad_norm": 1.6583380496231788, "language_loss": 0.71212029, "learning_rate": 1.5514173972529743e-07, "loss": 0.73644727, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.20227051, "step": 14596, "time_per_iteration": 2.8449621200561523 }, { "auxiliary_loss_clip": 0.01388405, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.23257017, "balance_loss_mlp": 1.01227331, "epoch": 0.8776191191943484, "flos": 23450462847360.0, "grad_norm": 2.1011887746210602, "language_loss": 0.86358106, "learning_rate": 1.5499137768364067e-07, "loss": 0.88777155, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18383789, "step": 14597, "time_per_iteration": 4.327143430709839 }, { "auxiliary_loss_clip": 0.01396384, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.2381351, "balance_loss_mlp": 1.01315987, "epoch": 0.8776792424470163, "flos": 26841426537600.0, "grad_norm": 1.6166757779020804, "language_loss": 0.73368144, "learning_rate": 1.5484108560489494e-07, "loss": 0.75796449, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18786621, "step": 14598, "time_per_iteration": 2.8756613731384277 }, { "auxiliary_loss_clip": 0.01396831, "auxiliary_loss_mlp": 0.01035327, "balance_loss_clip": 1.23661029, "balance_loss_mlp": 1.01586008, "epoch": 0.8777393656996844, "flos": 15632887818240.0, "grad_norm": 2.6212599022039225, "language_loss": 0.78098023, "learning_rate": 1.5469086349476036e-07, "loss": 0.80530179, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19470215, "step": 14599, "time_per_iteration": 2.7501723766326904 }, { "auxiliary_loss_clip": 0.0139872, "auxiliary_loss_mlp": 0.01030656, "balance_loss_clip": 1.23832238, "balance_loss_mlp": 1.01174939, "epoch": 0.8777994889523523, "flos": 18889274914560.0, "grad_norm": 6.159801646665903, "language_loss": 0.68946266, "learning_rate": 1.545407113589332e-07, "loss": 0.71375644, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18908691, "step": 14600, "time_per_iteration": 4.201138734817505 }, { "auxiliary_loss_clip": 0.01397392, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.23707318, "balance_loss_mlp": 1.01516521, "epoch": 0.8778596122050203, "flos": 48841408995840.0, "grad_norm": 2.2589993412931935, "language_loss": 0.70093369, "learning_rate": 1.543906292031072e-07, "loss": 0.72525215, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19287109, "step": 14601, "time_per_iteration": 3.061053514480591 }, { "auxiliary_loss_clip": 0.01420357, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.25425267, "balance_loss_mlp": 1.01561701, "epoch": 0.8779197354576883, "flos": 25669618727040.0, "grad_norm": 1.7615481831984088, "language_loss": 0.73564279, "learning_rate": 1.542406170329733e-07, "loss": 0.7601977, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.19506836, "step": 14602, "time_per_iteration": 2.858706474304199 }, { "auxiliary_loss_clip": 0.01387751, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.23018765, "balance_loss_mlp": 1.01332974, "epoch": 0.8779798587103562, "flos": 18852418385280.0, "grad_norm": 1.7066607159214915, "language_loss": 0.71157008, "learning_rate": 1.5409067485422056e-07, "loss": 0.73576576, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18481445, "step": 14603, "time_per_iteration": 2.8407068252563477 }, { "auxiliary_loss_clip": 0.0117706, "auxiliary_loss_mlp": 0.01012994, "balance_loss_clip": 1.08931136, "balance_loss_mlp": 0.99659103, "epoch": 0.8780399819630242, "flos": 68645554629120.0, "grad_norm": 0.7469514709765466, "language_loss": 0.54172802, "learning_rate": 1.539408026725344e-07, "loss": 0.56362855, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.1640625, "step": 14604, "time_per_iteration": 3.3702590465545654 }, { "auxiliary_loss_clip": 0.01178228, "auxiliary_loss_mlp": 0.01031862, "balance_loss_clip": 1.08956671, "balance_loss_mlp": 1.00668514, "epoch": 0.8781001052156922, "flos": 65767550947200.0, "grad_norm": 0.7239806653138193, "language_loss": 0.59339118, "learning_rate": 1.537910004935976e-07, "loss": 0.61549211, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.25195312, "step": 14605, "time_per_iteration": 3.2586140632629395 }, { "auxiliary_loss_clip": 0.0140417, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.24304247, "balance_loss_mlp": 1.01575947, "epoch": 0.8781602284683602, "flos": 22058556491520.0, "grad_norm": 8.75219506488415, "language_loss": 0.85549718, "learning_rate": 1.536412683230912e-07, "loss": 0.87989187, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19543457, "step": 14606, "time_per_iteration": 5.681826591491699 }, { "auxiliary_loss_clip": 0.01416572, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.25366354, "balance_loss_mlp": 1.01592314, "epoch": 0.8782203517210281, "flos": 17570945882880.0, "grad_norm": 1.978409700222824, "language_loss": 0.71401924, "learning_rate": 1.534916061666931e-07, "loss": 0.73854101, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19689941, "step": 14607, "time_per_iteration": 2.8328754901885986 }, { "auxiliary_loss_clip": 0.01392499, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.2358644, "balance_loss_mlp": 1.01321173, "epoch": 0.8782804749736961, "flos": 25531422549120.0, "grad_norm": 3.6225218746011856, "language_loss": 0.73087358, "learning_rate": 1.533420140300785e-07, "loss": 0.75512034, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18969727, "step": 14608, "time_per_iteration": 2.881551742553711 }, { "auxiliary_loss_clip": 0.01415448, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.25058544, "balance_loss_mlp": 1.01525187, "epoch": 0.878340598226364, "flos": 21808750095360.0, "grad_norm": 2.069137994892772, "language_loss": 0.88645566, "learning_rate": 1.5319249191891936e-07, "loss": 0.9109627, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20031738, "step": 14609, "time_per_iteration": 2.8457159996032715 }, { "auxiliary_loss_clip": 0.01395443, "auxiliary_loss_mlp": 0.01032743, "balance_loss_clip": 1.23567319, "balance_loss_mlp": 1.01203609, "epoch": 0.878400721479032, "flos": 21111416951040.0, "grad_norm": 1.5599117450577145, "language_loss": 0.7061249, "learning_rate": 1.5304303983888643e-07, "loss": 0.73040676, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.20703125, "step": 14610, "time_per_iteration": 2.903596878051758 }, { "auxiliary_loss_clip": 0.01386773, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.2306838, "balance_loss_mlp": 1.01599479, "epoch": 0.8784608447316999, "flos": 20933423331840.0, "grad_norm": 1.8702494056574528, "language_loss": 0.81922328, "learning_rate": 1.5289365779564612e-07, "loss": 0.84344536, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.19433594, "step": 14611, "time_per_iteration": 2.8051180839538574 }, { "auxiliary_loss_clip": 0.01394445, "auxiliary_loss_mlp": 0.01030941, "balance_loss_clip": 1.2344799, "balance_loss_mlp": 1.012321, "epoch": 0.878520967984368, "flos": 23340933889920.0, "grad_norm": 1.6036091875098124, "language_loss": 0.77053881, "learning_rate": 1.5274434579486338e-07, "loss": 0.79479271, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18615723, "step": 14612, "time_per_iteration": 2.832141876220703 }, { "auxiliary_loss_clip": 0.01389753, "auxiliary_loss_mlp": 0.01035448, "balance_loss_clip": 1.23250699, "balance_loss_mlp": 1.01617205, "epoch": 0.8785810912370359, "flos": 25529612757120.0, "grad_norm": 1.6176320926207248, "language_loss": 0.72766954, "learning_rate": 1.525951038422002e-07, "loss": 0.75192153, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19274902, "step": 14613, "time_per_iteration": 2.848081111907959 }, { "auxiliary_loss_clip": 0.01179751, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.09296107, "balance_loss_mlp": 1.00841904, "epoch": 0.8786412144897039, "flos": 61865961995520.0, "grad_norm": 1.0427451514799915, "language_loss": 0.64621568, "learning_rate": 1.5244593194331667e-07, "loss": 0.66830051, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.203125, "step": 14614, "time_per_iteration": 3.0853798389434814 }, { "auxiliary_loss_clip": 0.01178879, "auxiliary_loss_mlp": 0.01036877, "balance_loss_clip": 1.09280634, "balance_loss_mlp": 1.01532364, "epoch": 0.8787013377423719, "flos": 71023628805120.0, "grad_norm": 0.6636522128484623, "language_loss": 0.58651423, "learning_rate": 1.5229683010386762e-07, "loss": 0.60867178, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.21582031, "step": 14615, "time_per_iteration": 3.3477370738983154 }, { "auxiliary_loss_clip": 0.01389036, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.22949767, "balance_loss_mlp": 1.0158534, "epoch": 0.8787614609950398, "flos": 17356231468800.0, "grad_norm": 2.617062150205355, "language_loss": 0.73336446, "learning_rate": 1.5214779832950807e-07, "loss": 0.75759882, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18579102, "step": 14616, "time_per_iteration": 2.7977709770202637 }, { "auxiliary_loss_clip": 0.01175215, "auxiliary_loss_mlp": 0.01019611, "balance_loss_clip": 1.08943594, "balance_loss_mlp": 1.0046382, "epoch": 0.8788215842477078, "flos": 72543370279680.0, "grad_norm": 0.8560705821834971, "language_loss": 0.57990479, "learning_rate": 1.5199883662588953e-07, "loss": 0.60185301, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.14941406, "step": 14617, "time_per_iteration": 3.402327299118042 }, { "auxiliary_loss_clip": 0.01382969, "auxiliary_loss_mlp": 0.01032791, "balance_loss_clip": 1.22727919, "balance_loss_mlp": 1.01370549, "epoch": 0.8788817075003758, "flos": 24838342416000.0, "grad_norm": 1.763252977387195, "language_loss": 0.84048969, "learning_rate": 1.5184994499865987e-07, "loss": 0.86464727, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.19091797, "step": 14618, "time_per_iteration": 2.876068353652954 }, { "auxiliary_loss_clip": 0.01376423, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.22369206, "balance_loss_mlp": 1.00909317, "epoch": 0.8789418307530438, "flos": 22649211100800.0, "grad_norm": 1.5968210255990656, "language_loss": 0.69723642, "learning_rate": 1.5170112345346598e-07, "loss": 0.7212767, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.18505859, "step": 14619, "time_per_iteration": 2.859360456466675 }, { "auxiliary_loss_clip": 0.01403226, "auxiliary_loss_mlp": 0.01036355, "balance_loss_clip": 1.2418592, "balance_loss_mlp": 1.01772261, "epoch": 0.8790019540057117, "flos": 19793585612160.0, "grad_norm": 1.878970990706346, "language_loss": 0.776941, "learning_rate": 1.5155237199595016e-07, "loss": 0.80133682, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18615723, "step": 14620, "time_per_iteration": 2.92250919342041 }, { "auxiliary_loss_clip": 0.01401596, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.24087989, "balance_loss_mlp": 1.00953364, "epoch": 0.8790620772583797, "flos": 20239347813120.0, "grad_norm": 1.707788577652126, "language_loss": 0.80085862, "learning_rate": 1.514036906317542e-07, "loss": 0.82516849, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19873047, "step": 14621, "time_per_iteration": 2.8714306354522705 }, { "auxiliary_loss_clip": 0.01413481, "auxiliary_loss_mlp": 0.01035595, "balance_loss_clip": 1.24906647, "balance_loss_mlp": 1.01621151, "epoch": 0.8791222005110476, "flos": 24140602068480.0, "grad_norm": 1.56990054084633, "language_loss": 0.67528582, "learning_rate": 1.5125507936651506e-07, "loss": 0.69977659, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19396973, "step": 14622, "time_per_iteration": 2.8703458309173584 }, { "auxiliary_loss_clip": 0.01385225, "auxiliary_loss_mlp": 0.01033557, "balance_loss_clip": 1.22909999, "balance_loss_mlp": 1.01482975, "epoch": 0.8791823237637156, "flos": 21623607797760.0, "grad_norm": 2.116895632549563, "language_loss": 0.73523831, "learning_rate": 1.511065382058687e-07, "loss": 0.75942612, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18725586, "step": 14623, "time_per_iteration": 2.802783250808716 }, { "auxiliary_loss_clip": 0.01384386, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.22617042, "balance_loss_mlp": 1.01446986, "epoch": 0.8792424470163835, "flos": 24254067323520.0, "grad_norm": 3.6767223591398803, "language_loss": 0.79262757, "learning_rate": 1.5095806715544801e-07, "loss": 0.81680715, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19104004, "step": 14624, "time_per_iteration": 2.868772268295288 }, { "auxiliary_loss_clip": 0.01394809, "auxiliary_loss_mlp": 0.01036265, "balance_loss_clip": 1.23492205, "balance_loss_mlp": 1.01493812, "epoch": 0.8793025702690516, "flos": 24902237352960.0, "grad_norm": 1.9675887066562934, "language_loss": 0.8028627, "learning_rate": 1.5080966622088265e-07, "loss": 0.82717347, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.21325684, "step": 14625, "time_per_iteration": 2.8570480346679688 }, { "auxiliary_loss_clip": 0.01382149, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.2262882, "balance_loss_mlp": 1.01242638, "epoch": 0.8793626935217195, "flos": 25383724963200.0, "grad_norm": 1.4131729968783362, "language_loss": 0.7427336, "learning_rate": 1.5066133540779967e-07, "loss": 0.76687109, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.19165039, "step": 14626, "time_per_iteration": 2.8758046627044678 }, { "auxiliary_loss_clip": 0.01401841, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.23828506, "balance_loss_mlp": 1.01195121, "epoch": 0.8794228167743875, "flos": 34691288371200.0, "grad_norm": 1.441605071358997, "language_loss": 0.7142241, "learning_rate": 1.505130747218246e-07, "loss": 0.73855281, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19091797, "step": 14627, "time_per_iteration": 2.9727444648742676 }, { "auxiliary_loss_clip": 0.01392651, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.23369765, "balance_loss_mlp": 1.01172817, "epoch": 0.8794829400270555, "flos": 19473504762240.0, "grad_norm": 1.6657758457116463, "language_loss": 0.72853303, "learning_rate": 1.5036488416857873e-07, "loss": 0.75277817, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20141602, "step": 14628, "time_per_iteration": 2.8266279697418213 }, { "auxiliary_loss_clip": 0.01398766, "auxiliary_loss_mlp": 0.01033049, "balance_loss_clip": 1.23792553, "balance_loss_mlp": 1.01256931, "epoch": 0.8795430632797234, "flos": 15239727336960.0, "grad_norm": 2.9938388218570546, "language_loss": 0.70213294, "learning_rate": 1.5021676375368175e-07, "loss": 0.72645104, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20483398, "step": 14629, "time_per_iteration": 2.801964282989502 }, { "auxiliary_loss_clip": 0.01384133, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.22763729, "balance_loss_mlp": 1.01100588, "epoch": 0.8796031865323914, "flos": 27755962560000.0, "grad_norm": 1.5641589233544713, "language_loss": 0.69605541, "learning_rate": 1.5006871348275053e-07, "loss": 0.72019416, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18737793, "step": 14630, "time_per_iteration": 2.88619327545166 }, { "auxiliary_loss_clip": 0.01376618, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.2226311, "balance_loss_mlp": 1.01686215, "epoch": 0.8796633097850594, "flos": 31297971951360.0, "grad_norm": 1.8233736347772387, "language_loss": 0.75051999, "learning_rate": 1.499207333613999e-07, "loss": 0.77464867, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.1940918, "step": 14631, "time_per_iteration": 2.918863534927368 }, { "auxiliary_loss_clip": 0.01374494, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.22157502, "balance_loss_mlp": 1.01323009, "epoch": 0.8797234330377274, "flos": 24253750609920.0, "grad_norm": 6.619806536576093, "language_loss": 0.69976431, "learning_rate": 1.4977282339523954e-07, "loss": 0.72383773, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.19616699, "step": 14632, "time_per_iteration": 2.844898223876953 }, { "auxiliary_loss_clip": 0.01395035, "auxiliary_loss_mlp": 0.01034815, "balance_loss_clip": 1.23633075, "balance_loss_mlp": 1.01645672, "epoch": 0.8797835562903953, "flos": 24177277618560.0, "grad_norm": 1.9382144419997884, "language_loss": 0.65660071, "learning_rate": 1.4962498358987929e-07, "loss": 0.68089926, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18334961, "step": 14633, "time_per_iteration": 4.290616273880005 }, { "auxiliary_loss_clip": 0.0138691, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.22890973, "balance_loss_mlp": 1.01557112, "epoch": 0.8798436795430633, "flos": 19294922960640.0, "grad_norm": 1.4185437058910058, "language_loss": 0.8464939, "learning_rate": 1.4947721395092528e-07, "loss": 0.87070704, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18859863, "step": 14634, "time_per_iteration": 2.9137935638427734 }, { "auxiliary_loss_clip": 0.01393343, "auxiliary_loss_mlp": 0.01032556, "balance_loss_clip": 1.23377872, "balance_loss_mlp": 1.01364899, "epoch": 0.8799038027957312, "flos": 28189915868160.0, "grad_norm": 1.7244594685217904, "language_loss": 0.80747795, "learning_rate": 1.4932951448398056e-07, "loss": 0.83173692, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18920898, "step": 14635, "time_per_iteration": 4.290014266967773 }, { "auxiliary_loss_clip": 0.01392361, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.23282146, "balance_loss_mlp": 1.01489651, "epoch": 0.8799639260483992, "flos": 24655462358400.0, "grad_norm": 2.0788539888289694, "language_loss": 0.65596986, "learning_rate": 1.4918188519464648e-07, "loss": 0.68023247, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18994141, "step": 14636, "time_per_iteration": 2.8439388275146484 }, { "auxiliary_loss_clip": 0.01390131, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.23079228, "balance_loss_mlp": 1.01470566, "epoch": 0.8800240493010671, "flos": 22210688067840.0, "grad_norm": 1.5280392005664172, "language_loss": 0.70879126, "learning_rate": 1.4903432608852074e-07, "loss": 0.73303986, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.20019531, "step": 14637, "time_per_iteration": 2.8422062397003174 }, { "auxiliary_loss_clip": 0.01398891, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.24013162, "balance_loss_mlp": 1.01210284, "epoch": 0.8800841725537352, "flos": 14254102454400.0, "grad_norm": 2.6093583225675876, "language_loss": 0.67923868, "learning_rate": 1.4888683717119843e-07, "loss": 0.70354676, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19812012, "step": 14638, "time_per_iteration": 2.810321807861328 }, { "auxiliary_loss_clip": 0.01406538, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.24612379, "balance_loss_mlp": 1.01255453, "epoch": 0.8801442958064031, "flos": 37430145734400.0, "grad_norm": 2.2266169242758878, "language_loss": 0.59313339, "learning_rate": 1.4873941844827286e-07, "loss": 0.61751777, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19348145, "step": 14639, "time_per_iteration": 3.013446569442749 }, { "auxiliary_loss_clip": 0.01390442, "auxiliary_loss_mlp": 0.01033019, "balance_loss_clip": 1.23135364, "balance_loss_mlp": 1.01315844, "epoch": 0.8802044190590711, "flos": 25058531450880.0, "grad_norm": 1.45009205721012, "language_loss": 0.74961412, "learning_rate": 1.4859206992533402e-07, "loss": 0.77384871, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19848633, "step": 14640, "time_per_iteration": 2.9164555072784424 }, { "auxiliary_loss_clip": 0.01390264, "auxiliary_loss_mlp": 0.01040996, "balance_loss_clip": 1.23073173, "balance_loss_mlp": 1.01965714, "epoch": 0.8802645423117391, "flos": 24144493121280.0, "grad_norm": 2.0177159684772863, "language_loss": 0.70870179, "learning_rate": 1.4844479160796985e-07, "loss": 0.73301435, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.21337891, "step": 14641, "time_per_iteration": 5.67152214050293 }, { "auxiliary_loss_clip": 0.01402593, "auxiliary_loss_mlp": 0.01030614, "balance_loss_clip": 1.23952246, "balance_loss_mlp": 1.01070607, "epoch": 0.880324665564407, "flos": 17940189847680.0, "grad_norm": 4.169805865458593, "language_loss": 0.86125231, "learning_rate": 1.4829758350176457e-07, "loss": 0.88558441, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19897461, "step": 14642, "time_per_iteration": 2.8256890773773193 }, { "auxiliary_loss_clip": 0.01390349, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.23245859, "balance_loss_mlp": 1.01826787, "epoch": 0.880384788817075, "flos": 21297418899840.0, "grad_norm": 1.6647989203135247, "language_loss": 0.79552573, "learning_rate": 1.4815044561230038e-07, "loss": 0.81980979, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19787598, "step": 14643, "time_per_iteration": 2.895221471786499 }, { "auxiliary_loss_clip": 0.01378656, "auxiliary_loss_mlp": 0.01028412, "balance_loss_clip": 1.22411537, "balance_loss_mlp": 1.01012516, "epoch": 0.880444912069743, "flos": 12466094705280.0, "grad_norm": 1.6173723647366962, "language_loss": 0.73938197, "learning_rate": 1.4800337794515705e-07, "loss": 0.76345265, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.1829834, "step": 14644, "time_per_iteration": 2.820234775543213 }, { "auxiliary_loss_clip": 0.01406461, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.24347115, "balance_loss_mlp": 1.01000643, "epoch": 0.880505035322411, "flos": 13633830483840.0, "grad_norm": 2.0447695726905732, "language_loss": 0.80270636, "learning_rate": 1.47856380505911e-07, "loss": 0.82706308, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19213867, "step": 14645, "time_per_iteration": 2.8141956329345703 }, { "auxiliary_loss_clip": 0.01380749, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.22723866, "balance_loss_mlp": 1.00905371, "epoch": 0.8805651585750789, "flos": 23192874345600.0, "grad_norm": 2.33224410654509, "language_loss": 0.64830041, "learning_rate": 1.477094533001364e-07, "loss": 0.67239463, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.19616699, "step": 14646, "time_per_iteration": 2.875166177749634 }, { "auxiliary_loss_clip": 0.01407559, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.24293959, "balance_loss_mlp": 1.01440287, "epoch": 0.8806252818277469, "flos": 14911230954240.0, "grad_norm": 2.548596802725856, "language_loss": 0.78838444, "learning_rate": 1.475625963334055e-07, "loss": 0.81280029, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19641113, "step": 14647, "time_per_iteration": 2.814800500869751 }, { "auxiliary_loss_clip": 0.01395829, "auxiliary_loss_mlp": 0.01034057, "balance_loss_clip": 1.2377516, "balance_loss_mlp": 1.01571107, "epoch": 0.8806854050804148, "flos": 17648052301440.0, "grad_norm": 2.09306913546854, "language_loss": 0.75911963, "learning_rate": 1.4741580961128652e-07, "loss": 0.78341854, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18347168, "step": 14648, "time_per_iteration": 2.834747314453125 }, { "auxiliary_loss_clip": 0.01394679, "auxiliary_loss_mlp": 0.01030899, "balance_loss_clip": 1.23420823, "balance_loss_mlp": 1.01276755, "epoch": 0.8807455283330828, "flos": 25341891505920.0, "grad_norm": 1.8865277113238375, "language_loss": 0.65804571, "learning_rate": 1.4726909313934522e-07, "loss": 0.68230152, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18139648, "step": 14649, "time_per_iteration": 2.8540308475494385 }, { "auxiliary_loss_clip": 0.01393243, "auxiliary_loss_mlp": 0.01031231, "balance_loss_clip": 1.2354275, "balance_loss_mlp": 1.01209795, "epoch": 0.8808056515857507, "flos": 25276141532160.0, "grad_norm": 1.2582015675526097, "language_loss": 0.62888777, "learning_rate": 1.4712244692314578e-07, "loss": 0.65313256, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19140625, "step": 14650, "time_per_iteration": 2.936044216156006 }, { "auxiliary_loss_clip": 0.01389803, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.23302698, "balance_loss_mlp": 1.016729, "epoch": 0.8808657748384188, "flos": 26590217552640.0, "grad_norm": 1.4040303004930477, "language_loss": 0.72979677, "learning_rate": 1.4697587096824914e-07, "loss": 0.75405377, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19177246, "step": 14651, "time_per_iteration": 2.8755311965942383 }, { "auxiliary_loss_clip": 0.01403738, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.2420454, "balance_loss_mlp": 1.01591921, "epoch": 0.8809258980910867, "flos": 18670488468480.0, "grad_norm": 1.8098815385336255, "language_loss": 0.72145784, "learning_rate": 1.4682936528021284e-07, "loss": 0.74584538, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19091797, "step": 14652, "time_per_iteration": 2.803046226501465 }, { "auxiliary_loss_clip": 0.01385529, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.22723889, "balance_loss_mlp": 1.01193106, "epoch": 0.8809860213437547, "flos": 19801820165760.0, "grad_norm": 2.966308396807985, "language_loss": 0.75780642, "learning_rate": 1.4668292986459286e-07, "loss": 0.78196913, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18823242, "step": 14653, "time_per_iteration": 2.8150997161865234 }, { "auxiliary_loss_clip": 0.01408321, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.2457304, "balance_loss_mlp": 1.01180089, "epoch": 0.8810461445964227, "flos": 17903378563200.0, "grad_norm": 1.6879760842927332, "language_loss": 0.7205438, "learning_rate": 1.465365647269421e-07, "loss": 0.74493372, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1887207, "step": 14654, "time_per_iteration": 2.81980037689209 }, { "auxiliary_loss_clip": 0.01388394, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.23092985, "balance_loss_mlp": 1.01381946, "epoch": 0.8811062678490906, "flos": 29174545365120.0, "grad_norm": 1.4654721071532706, "language_loss": 0.72857606, "learning_rate": 1.4639026987281012e-07, "loss": 0.75278872, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19067383, "step": 14655, "time_per_iteration": 2.8943212032318115 }, { "auxiliary_loss_clip": 0.01395126, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.2370944, "balance_loss_mlp": 1.0148747, "epoch": 0.8811663911017587, "flos": 20348333832960.0, "grad_norm": 1.6513214945740973, "language_loss": 0.82102609, "learning_rate": 1.462440453077449e-07, "loss": 0.84531981, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19384766, "step": 14656, "time_per_iteration": 2.877187490463257 }, { "auxiliary_loss_clip": 0.01406719, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.24514055, "balance_loss_mlp": 1.01728749, "epoch": 0.8812265143544266, "flos": 25896911195520.0, "grad_norm": 1.654069535204542, "language_loss": 0.69418991, "learning_rate": 1.460978910372914e-07, "loss": 0.71862048, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19055176, "step": 14657, "time_per_iteration": 2.87732195854187 }, { "auxiliary_loss_clip": 0.01401016, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.2410965, "balance_loss_mlp": 1.01268744, "epoch": 0.8812866376070946, "flos": 27206100777600.0, "grad_norm": 3.537876153128997, "language_loss": 0.851008, "learning_rate": 1.4595180706699207e-07, "loss": 0.87533998, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19482422, "step": 14658, "time_per_iteration": 2.827751874923706 }, { "auxiliary_loss_clip": 0.01407404, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.24286866, "balance_loss_mlp": 1.01164556, "epoch": 0.8813467608597625, "flos": 23818168488960.0, "grad_norm": 1.9417624458216536, "language_loss": 0.78033793, "learning_rate": 1.4580579340238554e-07, "loss": 0.8047201, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19189453, "step": 14659, "time_per_iteration": 2.881575584411621 }, { "auxiliary_loss_clip": 0.01402218, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.24309742, "balance_loss_mlp": 1.01531124, "epoch": 0.8814068841124305, "flos": 21115262759040.0, "grad_norm": 2.4163419735577487, "language_loss": 0.61581939, "learning_rate": 1.4565985004900894e-07, "loss": 0.64018011, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.1854248, "step": 14660, "time_per_iteration": 2.834075450897217 }, { "auxiliary_loss_clip": 0.01386986, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.2282517, "balance_loss_mlp": 1.01232195, "epoch": 0.8814670073650984, "flos": 24727591848960.0, "grad_norm": 1.8888203124881329, "language_loss": 0.78693771, "learning_rate": 1.455139770123972e-07, "loss": 0.81112319, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19226074, "step": 14661, "time_per_iteration": 2.9403293132781982 }, { "auxiliary_loss_clip": 0.0139561, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.2351526, "balance_loss_mlp": 1.01960027, "epoch": 0.8815271306177664, "flos": 22976576363520.0, "grad_norm": 1.7448500772317281, "language_loss": 0.77149117, "learning_rate": 1.45368174298081e-07, "loss": 0.79583687, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19360352, "step": 14662, "time_per_iteration": 2.8587751388549805 }, { "auxiliary_loss_clip": 0.01373136, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 1.2190733, "balance_loss_mlp": 1.01042187, "epoch": 0.8815872538704344, "flos": 19468527834240.0, "grad_norm": 2.082861270564137, "language_loss": 0.74088538, "learning_rate": 1.4522244191158929e-07, "loss": 0.7649042, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.18322754, "step": 14663, "time_per_iteration": 2.7927353382110596 }, { "auxiliary_loss_clip": 0.01382514, "auxiliary_loss_mlp": 0.01034125, "balance_loss_clip": 1.22565079, "balance_loss_mlp": 1.01584983, "epoch": 0.8816473771231024, "flos": 32168231297280.0, "grad_norm": 2.1107612138307905, "language_loss": 0.70891547, "learning_rate": 1.450767798584489e-07, "loss": 0.73308182, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18286133, "step": 14664, "time_per_iteration": 2.9138131141662598 }, { "auxiliary_loss_clip": 0.01385274, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.22913396, "balance_loss_mlp": 1.01588047, "epoch": 0.8817075003757703, "flos": 19691657781120.0, "grad_norm": 1.4826056831422372, "language_loss": 0.82104129, "learning_rate": 1.449311881441828e-07, "loss": 0.8452388, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18603516, "step": 14665, "time_per_iteration": 2.8089053630828857 }, { "auxiliary_loss_clip": 0.01399327, "auxiliary_loss_mlp": 0.0103592, "balance_loss_clip": 1.24033082, "balance_loss_mlp": 1.01634574, "epoch": 0.8817676236284383, "flos": 15676893025920.0, "grad_norm": 2.4574365174258923, "language_loss": 0.59452116, "learning_rate": 1.447856667743117e-07, "loss": 0.61887372, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19567871, "step": 14666, "time_per_iteration": 2.829272508621216 }, { "auxiliary_loss_clip": 0.01398033, "auxiliary_loss_mlp": 0.01032377, "balance_loss_clip": 1.23801064, "balance_loss_mlp": 1.01149118, "epoch": 0.8818277468811063, "flos": 17904283459200.0, "grad_norm": 1.8884215161452804, "language_loss": 0.85036731, "learning_rate": 1.4464021575435403e-07, "loss": 0.8746714, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.20861816, "step": 14667, "time_per_iteration": 4.244984865188599 }, { "auxiliary_loss_clip": 0.01398987, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.2397809, "balance_loss_mlp": 1.01668262, "epoch": 0.8818878701337742, "flos": 18779745957120.0, "grad_norm": 1.7942799746604732, "language_loss": 0.62765533, "learning_rate": 1.4449483508982563e-07, "loss": 0.6520083, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19641113, "step": 14668, "time_per_iteration": 2.7874624729156494 }, { "auxiliary_loss_clip": 0.01393028, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.23679614, "balance_loss_mlp": 1.01409233, "epoch": 0.8819479933864423, "flos": 17721222422400.0, "grad_norm": 2.2509816766220903, "language_loss": 0.5805071, "learning_rate": 1.4434952478623918e-07, "loss": 0.60475844, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18017578, "step": 14669, "time_per_iteration": 2.788069009780884 }, { "auxiliary_loss_clip": 0.01402622, "auxiliary_loss_mlp": 0.01030695, "balance_loss_clip": 1.24343491, "balance_loss_mlp": 1.01191974, "epoch": 0.8820081166391102, "flos": 11736158042880.0, "grad_norm": 1.7452395347996843, "language_loss": 0.72886097, "learning_rate": 1.442042848491043e-07, "loss": 0.75319421, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18774414, "step": 14670, "time_per_iteration": 4.259251832962036 }, { "auxiliary_loss_clip": 0.01391087, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.23167396, "balance_loss_mlp": 1.01406932, "epoch": 0.8820682398917782, "flos": 27501812663040.0, "grad_norm": 1.9147881197651986, "language_loss": 0.74496883, "learning_rate": 1.44059115283929e-07, "loss": 0.76922119, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.20056152, "step": 14671, "time_per_iteration": 2.9013023376464844 }, { "auxiliary_loss_clip": 0.01387325, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.22688222, "balance_loss_mlp": 1.01506484, "epoch": 0.8821283631444461, "flos": 16882752188160.0, "grad_norm": 2.162455596590144, "language_loss": 0.8577444, "learning_rate": 1.43914016096218e-07, "loss": 0.88197011, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.2019043, "step": 14672, "time_per_iteration": 2.7974395751953125 }, { "auxiliary_loss_clip": 0.01386569, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.23099816, "balance_loss_mlp": 1.01214349, "epoch": 0.8821884863971141, "flos": 24291919238400.0, "grad_norm": 1.513010406770979, "language_loss": 0.7350657, "learning_rate": 1.4376898729147336e-07, "loss": 0.75924492, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.19226074, "step": 14673, "time_per_iteration": 2.862677812576294 }, { "auxiliary_loss_clip": 0.01181732, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.09359145, "balance_loss_mlp": 1.01644027, "epoch": 0.882248609649782, "flos": 59465934812160.0, "grad_norm": 0.8069653660355437, "language_loss": 0.4945522, "learning_rate": 1.4362402887519487e-07, "loss": 0.51674753, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21386719, "step": 14674, "time_per_iteration": 3.4424238204956055 }, { "auxiliary_loss_clip": 0.01390401, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.23011541, "balance_loss_mlp": 1.01284337, "epoch": 0.88230873290245, "flos": 19947255511680.0, "grad_norm": 13.531308756841925, "language_loss": 0.77637005, "learning_rate": 1.4347914085287971e-07, "loss": 0.80059922, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1965332, "step": 14675, "time_per_iteration": 2.839144468307495 }, { "auxiliary_loss_clip": 0.01382578, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.22632384, "balance_loss_mlp": 1.0109849, "epoch": 0.882368856155118, "flos": 16371466237440.0, "grad_norm": 1.8854906958969841, "language_loss": 0.79784405, "learning_rate": 1.4333432323002105e-07, "loss": 0.82197702, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.1973877, "step": 14676, "time_per_iteration": 5.695163249969482 }, { "auxiliary_loss_clip": 0.01178367, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.09150064, "balance_loss_mlp": 1.00797987, "epoch": 0.882428979407786, "flos": 70630423079040.0, "grad_norm": 0.6984338893827733, "language_loss": 0.54819661, "learning_rate": 1.431895760121109e-07, "loss": 0.57026321, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.203125, "step": 14677, "time_per_iteration": 3.3942410945892334 }, { "auxiliary_loss_clip": 0.01390343, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.23149753, "balance_loss_mlp": 1.01045465, "epoch": 0.8824891026604539, "flos": 18159383496960.0, "grad_norm": 2.4151939500506665, "language_loss": 0.65733933, "learning_rate": 1.4304489920463847e-07, "loss": 0.68153125, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18408203, "step": 14678, "time_per_iteration": 2.855454683303833 }, { "auxiliary_loss_clip": 0.01396317, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.23514509, "balance_loss_mlp": 1.01201165, "epoch": 0.8825492259131219, "flos": 27243500244480.0, "grad_norm": 2.1290948752392884, "language_loss": 0.71840727, "learning_rate": 1.4290029281308936e-07, "loss": 0.74268353, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19299316, "step": 14679, "time_per_iteration": 2.9078238010406494 }, { "auxiliary_loss_clip": 0.0139075, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.23302436, "balance_loss_mlp": 1.01003945, "epoch": 0.8826093491657898, "flos": 22284898819200.0, "grad_norm": 1.7754005530504624, "language_loss": 0.64456415, "learning_rate": 1.4275575684294694e-07, "loss": 0.66875666, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18469238, "step": 14680, "time_per_iteration": 2.9168174266815186 }, { "auxiliary_loss_clip": 0.01389194, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.23299527, "balance_loss_mlp": 1.01508772, "epoch": 0.8826694724184578, "flos": 14211499835520.0, "grad_norm": 2.52099955218005, "language_loss": 0.78151923, "learning_rate": 1.4261129129969328e-07, "loss": 0.80575919, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19726562, "step": 14681, "time_per_iteration": 2.8062493801116943 }, { "auxiliary_loss_clip": 0.01405075, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.24423003, "balance_loss_mlp": 1.01411343, "epoch": 0.8827295956711259, "flos": 20641512009600.0, "grad_norm": 1.711714890215758, "language_loss": 0.73553276, "learning_rate": 1.424668961888047e-07, "loss": 0.75991952, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19494629, "step": 14682, "time_per_iteration": 2.85417103767395 }, { "auxiliary_loss_clip": 0.01403662, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.24091315, "balance_loss_mlp": 1.01560807, "epoch": 0.8827897189237938, "flos": 18521705007360.0, "grad_norm": 1.9611074930448564, "language_loss": 0.74972653, "learning_rate": 1.4232257151575765e-07, "loss": 0.77411616, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19726562, "step": 14683, "time_per_iteration": 2.8227174282073975 }, { "auxiliary_loss_clip": 0.01397287, "auxiliary_loss_mlp": 0.0103263, "balance_loss_clip": 1.23739791, "balance_loss_mlp": 1.01340175, "epoch": 0.8828498421764618, "flos": 22757654183040.0, "grad_norm": 3.328002884025606, "language_loss": 0.65913546, "learning_rate": 1.4217831728602492e-07, "loss": 0.68343461, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19226074, "step": 14684, "time_per_iteration": 2.847733974456787 }, { "auxiliary_loss_clip": 0.01394035, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.2353121, "balance_loss_mlp": 1.00962543, "epoch": 0.8829099654291297, "flos": 15020624177280.0, "grad_norm": 1.7341622441579412, "language_loss": 0.69846249, "learning_rate": 1.4203413350507677e-07, "loss": 0.72268581, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18676758, "step": 14685, "time_per_iteration": 2.822073221206665 }, { "auxiliary_loss_clip": 0.01405412, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.24390268, "balance_loss_mlp": 1.01458955, "epoch": 0.8829700886817977, "flos": 16727498720640.0, "grad_norm": 2.1176045835643893, "language_loss": 0.75260484, "learning_rate": 1.418900201783806e-07, "loss": 0.77699989, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19506836, "step": 14686, "time_per_iteration": 2.7869625091552734 }, { "auxiliary_loss_clip": 0.01390912, "auxiliary_loss_mlp": 0.01031546, "balance_loss_clip": 1.23420882, "balance_loss_mlp": 1.01167417, "epoch": 0.8830302119344656, "flos": 15270385328640.0, "grad_norm": 1.8934848280686465, "language_loss": 0.64098388, "learning_rate": 1.417459773114007e-07, "loss": 0.66520846, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19873047, "step": 14687, "time_per_iteration": 2.781559467315674 }, { "auxiliary_loss_clip": 0.01400421, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.23841047, "balance_loss_mlp": 1.01612043, "epoch": 0.8830903351871336, "flos": 28628529390720.0, "grad_norm": 2.334643278213978, "language_loss": 0.69770777, "learning_rate": 1.4160200490959984e-07, "loss": 0.72207087, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19763184, "step": 14688, "time_per_iteration": 2.8685600757598877 }, { "auxiliary_loss_clip": 0.01391175, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 1.23665822, "balance_loss_mlp": 1.01068377, "epoch": 0.8831504584398016, "flos": 28013324837760.0, "grad_norm": 1.7447590976521856, "language_loss": 0.68046916, "learning_rate": 1.4145810297843697e-07, "loss": 0.70466852, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.18066406, "step": 14689, "time_per_iteration": 2.8704299926757812 }, { "auxiliary_loss_clip": 0.01391727, "auxiliary_loss_mlp": 0.01037967, "balance_loss_clip": 1.23535657, "balance_loss_mlp": 1.01903665, "epoch": 0.8832105816924696, "flos": 26591212938240.0, "grad_norm": 1.3095291994454632, "language_loss": 0.74950457, "learning_rate": 1.4131427152336905e-07, "loss": 0.77380145, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18920898, "step": 14690, "time_per_iteration": 2.9246413707733154 }, { "auxiliary_loss_clip": 0.01399265, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.23945773, "balance_loss_mlp": 1.01513886, "epoch": 0.8832707049451375, "flos": 24909250296960.0, "grad_norm": 1.4347491246826043, "language_loss": 0.73835397, "learning_rate": 1.4117051054985018e-07, "loss": 0.76269454, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19665527, "step": 14691, "time_per_iteration": 2.9272353649139404 }, { "auxiliary_loss_clip": 0.01420713, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.25456905, "balance_loss_mlp": 1.01457286, "epoch": 0.8833308281978055, "flos": 15459735392640.0, "grad_norm": 1.6236386308368105, "language_loss": 0.52558005, "learning_rate": 1.4102682006333243e-07, "loss": 0.5501312, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19824219, "step": 14692, "time_per_iteration": 2.8005425930023193 }, { "auxiliary_loss_clip": 0.01405618, "auxiliary_loss_mlp": 0.01034822, "balance_loss_clip": 1.2443924, "balance_loss_mlp": 1.01610625, "epoch": 0.8833909514504734, "flos": 20310798631680.0, "grad_norm": 2.238487788074904, "language_loss": 0.61443245, "learning_rate": 1.4088320006926346e-07, "loss": 0.63883686, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18713379, "step": 14693, "time_per_iteration": 2.836475133895874 }, { "auxiliary_loss_clip": 0.01393492, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.23804116, "balance_loss_mlp": 1.01050413, "epoch": 0.8834510747031414, "flos": 20383199591040.0, "grad_norm": 1.5096283413884424, "language_loss": 0.7585212, "learning_rate": 1.407396505730898e-07, "loss": 0.78274208, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18103027, "step": 14694, "time_per_iteration": 2.8115313053131104 }, { "auxiliary_loss_clip": 0.01403612, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.24080968, "balance_loss_mlp": 1.01198816, "epoch": 0.8835111979558095, "flos": 29763933120000.0, "grad_norm": 1.8851781011123867, "language_loss": 0.73455971, "learning_rate": 1.4059617158025527e-07, "loss": 0.75889552, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.17980957, "step": 14695, "time_per_iteration": 2.8939731121063232 }, { "auxiliary_loss_clip": 0.01385956, "auxiliary_loss_mlp": 0.01030463, "balance_loss_clip": 1.23157632, "balance_loss_mlp": 1.01193786, "epoch": 0.8835713212084774, "flos": 24145352772480.0, "grad_norm": 1.5898887494571001, "language_loss": 0.80644858, "learning_rate": 1.404527630961998e-07, "loss": 0.83061278, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18530273, "step": 14696, "time_per_iteration": 2.8476998805999756 }, { "auxiliary_loss_clip": 0.01402485, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.24264336, "balance_loss_mlp": 1.0126214, "epoch": 0.8836314444611454, "flos": 27683697335040.0, "grad_norm": 1.6128189820190961, "language_loss": 0.7562505, "learning_rate": 1.4030942512636236e-07, "loss": 0.78058815, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18664551, "step": 14697, "time_per_iteration": 2.888587474822998 }, { "auxiliary_loss_clip": 0.01387015, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.22941446, "balance_loss_mlp": 1.01410532, "epoch": 0.8836915677138133, "flos": 16845759924480.0, "grad_norm": 3.021325482852787, "language_loss": 0.72681469, "learning_rate": 1.401661576761779e-07, "loss": 0.75102055, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19458008, "step": 14698, "time_per_iteration": 2.823514938354492 }, { "auxiliary_loss_clip": 0.01174659, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.0892632, "balance_loss_mlp": 1.02082825, "epoch": 0.8837516909664813, "flos": 69344245117440.0, "grad_norm": 0.8058994451819829, "language_loss": 0.53791553, "learning_rate": 1.4002296075107856e-07, "loss": 0.56004113, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.17089844, "step": 14699, "time_per_iteration": 3.3409698009490967 }, { "auxiliary_loss_clip": 0.01400593, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.2381072, "balance_loss_mlp": 1.01497293, "epoch": 0.8838118142191492, "flos": 21334682632320.0, "grad_norm": 1.6449291572744014, "language_loss": 0.77377844, "learning_rate": 1.3987983435649508e-07, "loss": 0.79813302, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19885254, "step": 14700, "time_per_iteration": 2.8452768325805664 }, { "auxiliary_loss_clip": 0.01391106, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.23427463, "balance_loss_mlp": 1.01424527, "epoch": 0.8838719374718172, "flos": 21480525181440.0, "grad_norm": 1.833565775177554, "language_loss": 0.73935735, "learning_rate": 1.3973677849785494e-07, "loss": 0.76360053, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18969727, "step": 14701, "time_per_iteration": 2.850094795227051 }, { "auxiliary_loss_clip": 0.01410386, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.24657977, "balance_loss_mlp": 1.01311338, "epoch": 0.8839320607244852, "flos": 26480145657600.0, "grad_norm": 1.8152130786431204, "language_loss": 0.71832752, "learning_rate": 1.3959379318058262e-07, "loss": 0.74275148, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.18896484, "step": 14702, "time_per_iteration": 4.294740915298462 }, { "auxiliary_loss_clip": 0.01412593, "auxiliary_loss_mlp": 0.01038954, "balance_loss_clip": 1.24998045, "balance_loss_mlp": 1.0198921, "epoch": 0.8839921839771532, "flos": 45238264600320.0, "grad_norm": 1.4925866168409894, "language_loss": 0.72295594, "learning_rate": 1.3945087841010006e-07, "loss": 0.74747145, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19055176, "step": 14703, "time_per_iteration": 3.074967861175537 }, { "auxiliary_loss_clip": 0.01382735, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.22744465, "balance_loss_mlp": 1.0161407, "epoch": 0.8840523072298211, "flos": 20015358215040.0, "grad_norm": 2.031003357288446, "language_loss": 0.6734938, "learning_rate": 1.3930803419182645e-07, "loss": 0.69766676, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18432617, "step": 14704, "time_per_iteration": 2.878918409347534 }, { "auxiliary_loss_clip": 0.01383474, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.22827375, "balance_loss_mlp": 1.01416695, "epoch": 0.8841124304824891, "flos": 24436766401920.0, "grad_norm": 1.680686572695812, "language_loss": 0.71047419, "learning_rate": 1.3916526053117905e-07, "loss": 0.73463613, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18554688, "step": 14705, "time_per_iteration": 2.913623332977295 }, { "auxiliary_loss_clip": 0.01399053, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.2416544, "balance_loss_mlp": 1.01392901, "epoch": 0.884172553735157, "flos": 31296976565760.0, "grad_norm": 1.391201441425496, "language_loss": 0.7112639, "learning_rate": 1.3902255743357104e-07, "loss": 0.735578, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18432617, "step": 14706, "time_per_iteration": 4.303986072540283 }, { "auxiliary_loss_clip": 0.01388855, "auxiliary_loss_mlp": 0.0102951, "balance_loss_clip": 1.23080683, "balance_loss_mlp": 1.01069915, "epoch": 0.884232676987825, "flos": 21399437220480.0, "grad_norm": 1.6425181189454023, "language_loss": 0.7497623, "learning_rate": 1.3887992490441413e-07, "loss": 0.77394593, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18811035, "step": 14707, "time_per_iteration": 2.849536180496216 }, { "auxiliary_loss_clip": 0.01175375, "auxiliary_loss_mlp": 0.01025283, "balance_loss_clip": 1.08975935, "balance_loss_mlp": 1.00229979, "epoch": 0.8842928002404931, "flos": 57938167025280.0, "grad_norm": 0.7952078902702773, "language_loss": 0.60462129, "learning_rate": 1.387373629491173e-07, "loss": 0.62662786, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.22949219, "step": 14708, "time_per_iteration": 3.182222843170166 }, { "auxiliary_loss_clip": 0.0137189, "auxiliary_loss_mlp": 0.01030589, "balance_loss_clip": 1.21907914, "balance_loss_mlp": 1.01226652, "epoch": 0.884352923493161, "flos": 41478826106880.0, "grad_norm": 2.443848941452672, "language_loss": 0.68073821, "learning_rate": 1.3859487157308625e-07, "loss": 0.70476294, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.18322754, "step": 14709, "time_per_iteration": 3.027759552001953 }, { "auxiliary_loss_clip": 0.01412462, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.2493279, "balance_loss_mlp": 1.01751423, "epoch": 0.884413046745829, "flos": 46558991606400.0, "grad_norm": 1.5555773606010121, "language_loss": 0.63250017, "learning_rate": 1.3845245078172373e-07, "loss": 0.65700996, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.21008301, "step": 14710, "time_per_iteration": 3.100559949874878 }, { "auxiliary_loss_clip": 0.01382358, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.2277844, "balance_loss_mlp": 1.01138234, "epoch": 0.8844731699984969, "flos": 19144646421120.0, "grad_norm": 2.589008412065707, "language_loss": 0.65017414, "learning_rate": 1.38310100580431e-07, "loss": 0.6742878, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.1763916, "step": 14711, "time_per_iteration": 5.764604806900024 }, { "auxiliary_loss_clip": 0.01407502, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.24325776, "balance_loss_mlp": 1.01218736, "epoch": 0.8845332932511649, "flos": 23271654821760.0, "grad_norm": 3.190897292869493, "language_loss": 0.76333845, "learning_rate": 1.38167820974606e-07, "loss": 0.78772974, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19421387, "step": 14712, "time_per_iteration": 3.002335548400879 }, { "auxiliary_loss_clip": 0.01396775, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.23690283, "balance_loss_mlp": 1.01371038, "epoch": 0.8845934165038328, "flos": 17573027143680.0, "grad_norm": 2.2936405426944577, "language_loss": 0.81858867, "learning_rate": 1.3802561196964368e-07, "loss": 0.84289575, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.20214844, "step": 14713, "time_per_iteration": 2.7990305423736572 }, { "auxiliary_loss_clip": 0.01379601, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.22339904, "balance_loss_mlp": 1.01109028, "epoch": 0.8846535397565009, "flos": 27495976083840.0, "grad_norm": 1.6001586576420759, "language_loss": 0.56792188, "learning_rate": 1.3788347357093688e-07, "loss": 0.59201443, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.1854248, "step": 14714, "time_per_iteration": 2.8852453231811523 }, { "auxiliary_loss_clip": 0.01393842, "auxiliary_loss_mlp": 0.01032732, "balance_loss_clip": 1.23570848, "balance_loss_mlp": 1.01318121, "epoch": 0.8847136630091688, "flos": 28771657251840.0, "grad_norm": 1.964297818113568, "language_loss": 0.744798, "learning_rate": 1.377414057838755e-07, "loss": 0.76906377, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1953125, "step": 14715, "time_per_iteration": 2.8961360454559326 }, { "auxiliary_loss_clip": 0.01394494, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.23563361, "balance_loss_mlp": 1.01549816, "epoch": 0.8847737862618368, "flos": 23487319376640.0, "grad_norm": 3.082521939036507, "language_loss": 0.75456047, "learning_rate": 1.375994086138461e-07, "loss": 0.77884293, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18261719, "step": 14716, "time_per_iteration": 2.8309836387634277 }, { "auxiliary_loss_clip": 0.01392281, "auxiliary_loss_mlp": 0.01034027, "balance_loss_clip": 1.23395956, "balance_loss_mlp": 1.0145247, "epoch": 0.8848339095145047, "flos": 18670036020480.0, "grad_norm": 6.745210672580485, "language_loss": 0.71922469, "learning_rate": 1.3745748206623397e-07, "loss": 0.74348778, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19494629, "step": 14717, "time_per_iteration": 2.844132900238037 }, { "auxiliary_loss_clip": 0.01386195, "auxiliary_loss_mlp": 0.01030905, "balance_loss_clip": 1.23239923, "balance_loss_mlp": 1.01296449, "epoch": 0.8848940327671727, "flos": 32283053896320.0, "grad_norm": 2.2908160396383277, "language_loss": 0.74443233, "learning_rate": 1.373156261464208e-07, "loss": 0.76860332, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.1796875, "step": 14718, "time_per_iteration": 2.9203860759735107 }, { "auxiliary_loss_clip": 0.0140786, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.24453545, "balance_loss_mlp": 1.01376653, "epoch": 0.8849541560198406, "flos": 24031751783040.0, "grad_norm": 2.1357624346538544, "language_loss": 0.79098046, "learning_rate": 1.3717384085978602e-07, "loss": 0.81538451, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18762207, "step": 14719, "time_per_iteration": 2.903578996658325 }, { "auxiliary_loss_clip": 0.01398275, "auxiliary_loss_mlp": 0.01031116, "balance_loss_clip": 1.23850799, "balance_loss_mlp": 1.01296031, "epoch": 0.8850142792725086, "flos": 16881756802560.0, "grad_norm": 3.7429451997242262, "language_loss": 0.72620153, "learning_rate": 1.3703212621170579e-07, "loss": 0.75049543, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18151855, "step": 14720, "time_per_iteration": 2.9609827995300293 }, { "auxiliary_loss_clip": 0.01403246, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.24060464, "balance_loss_mlp": 1.01313436, "epoch": 0.8850744025251767, "flos": 24034556960640.0, "grad_norm": 2.5205157507236438, "language_loss": 0.83135426, "learning_rate": 1.3689048220755383e-07, "loss": 0.85571378, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19567871, "step": 14721, "time_per_iteration": 2.862091541290283 }, { "auxiliary_loss_clip": 0.01402295, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.24073899, "balance_loss_mlp": 1.01354623, "epoch": 0.8851345257778446, "flos": 47971466363520.0, "grad_norm": 2.2047839429568064, "language_loss": 0.63444453, "learning_rate": 1.3674890885270186e-07, "loss": 0.65879524, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19213867, "step": 14722, "time_per_iteration": 3.0744693279266357 }, { "auxiliary_loss_clip": 0.01401511, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.24097586, "balance_loss_mlp": 1.0135839, "epoch": 0.8851946490305126, "flos": 36624595731840.0, "grad_norm": 1.9042372505390657, "language_loss": 0.69264102, "learning_rate": 1.3660740615251754e-07, "loss": 0.71697807, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18603516, "step": 14723, "time_per_iteration": 3.0083343982696533 }, { "auxiliary_loss_clip": 0.01392066, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.23378015, "balance_loss_mlp": 1.0138669, "epoch": 0.8852547722831805, "flos": 21554419219200.0, "grad_norm": 1.8217480645293103, "language_loss": 0.78490889, "learning_rate": 1.3646597411236703e-07, "loss": 0.80916548, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19726562, "step": 14724, "time_per_iteration": 2.8453662395477295 }, { "auxiliary_loss_clip": 0.01175574, "auxiliary_loss_mlp": 0.01021299, "balance_loss_clip": 1.09005284, "balance_loss_mlp": 1.00079525, "epoch": 0.8853148955358485, "flos": 63088154530560.0, "grad_norm": 0.7979720175269285, "language_loss": 0.58971143, "learning_rate": 1.363246127376143e-07, "loss": 0.61168015, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.20507812, "step": 14725, "time_per_iteration": 3.2374956607818604 }, { "auxiliary_loss_clip": 0.01413285, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.24602234, "balance_loss_mlp": 1.01674652, "epoch": 0.8853750187885164, "flos": 18158795314560.0, "grad_norm": 3.6566098158840115, "language_loss": 0.70079643, "learning_rate": 1.3618332203361837e-07, "loss": 0.72529221, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19555664, "step": 14726, "time_per_iteration": 2.8083276748657227 }, { "auxiliary_loss_clip": 0.01398678, "auxiliary_loss_mlp": 0.01027803, "balance_loss_clip": 1.24100804, "balance_loss_mlp": 1.00965929, "epoch": 0.8854351420411845, "flos": 39585994859520.0, "grad_norm": 1.2500599109928099, "language_loss": 0.70328671, "learning_rate": 1.3604210200573785e-07, "loss": 0.72755158, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18151855, "step": 14727, "time_per_iteration": 3.031891345977783 }, { "auxiliary_loss_clip": 0.01403099, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.24419451, "balance_loss_mlp": 1.01318741, "epoch": 0.8854952652938524, "flos": 23779909370880.0, "grad_norm": 1.7965082292852292, "language_loss": 0.71211576, "learning_rate": 1.3590095265932733e-07, "loss": 0.73647106, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19250488, "step": 14728, "time_per_iteration": 2.8584160804748535 }, { "auxiliary_loss_clip": 0.014007, "auxiliary_loss_mlp": 0.01035327, "balance_loss_clip": 1.23950934, "balance_loss_mlp": 1.01632547, "epoch": 0.8855553885465204, "flos": 18297896388480.0, "grad_norm": 2.2549921725414093, "language_loss": 0.67313385, "learning_rate": 1.3575987399973987e-07, "loss": 0.69749415, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19006348, "step": 14729, "time_per_iteration": 2.800593852996826 }, { "auxiliary_loss_clip": 0.01391078, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.23403764, "balance_loss_mlp": 1.01219857, "epoch": 0.8856155117991883, "flos": 36881324582400.0, "grad_norm": 1.5134567813420785, "language_loss": 0.63181049, "learning_rate": 1.3561886603232453e-07, "loss": 0.65602016, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.17675781, "step": 14730, "time_per_iteration": 2.97753643989563 }, { "auxiliary_loss_clip": 0.01394305, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.23713779, "balance_loss_mlp": 1.01294565, "epoch": 0.8856756350518563, "flos": 22174057762560.0, "grad_norm": 1.3800644391154413, "language_loss": 0.80016607, "learning_rate": 1.3547792876242904e-07, "loss": 0.82443237, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19372559, "step": 14731, "time_per_iteration": 2.8784258365631104 }, { "auxiliary_loss_clip": 0.01398301, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.23774624, "balance_loss_mlp": 1.01677477, "epoch": 0.8857357583045242, "flos": 20750769498240.0, "grad_norm": 1.6417657044478902, "language_loss": 0.840707, "learning_rate": 1.3533706219539708e-07, "loss": 0.86505079, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19311523, "step": 14732, "time_per_iteration": 2.887178421020508 }, { "auxiliary_loss_clip": 0.0118, "auxiliary_loss_mlp": 0.01023608, "balance_loss_clip": 1.09200585, "balance_loss_mlp": 1.00148296, "epoch": 0.8857958815571922, "flos": 69925669787520.0, "grad_norm": 0.9343370876685249, "language_loss": 0.60044718, "learning_rate": 1.3519626633657045e-07, "loss": 0.62248325, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.22167969, "step": 14733, "time_per_iteration": 3.3171210289001465 }, { "auxiliary_loss_clip": 0.01418277, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.25712895, "balance_loss_mlp": 1.01685405, "epoch": 0.8858560048098603, "flos": 15130243624320.0, "grad_norm": 2.4033379728614728, "language_loss": 0.67487001, "learning_rate": 1.3505554119128838e-07, "loss": 0.6994158, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19433594, "step": 14734, "time_per_iteration": 2.86643123626709 }, { "auxiliary_loss_clip": 0.0139518, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.23959517, "balance_loss_mlp": 1.01292121, "epoch": 0.8859161280625282, "flos": 16617924518400.0, "grad_norm": 1.869077586948502, "language_loss": 0.76457965, "learning_rate": 1.3491488676488682e-07, "loss": 0.78884786, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18713379, "step": 14735, "time_per_iteration": 2.8493192195892334 }, { "auxiliary_loss_clip": 0.01405323, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.24402165, "balance_loss_mlp": 1.01707602, "epoch": 0.8859762513151962, "flos": 18702911007360.0, "grad_norm": 2.009181288660724, "language_loss": 0.71178502, "learning_rate": 1.3477430306270066e-07, "loss": 0.73618758, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.1784668, "step": 14736, "time_per_iteration": 2.8475871086120605 }, { "auxiliary_loss_clip": 0.01399308, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.23922539, "balance_loss_mlp": 1.01080418, "epoch": 0.8860363745678641, "flos": 19546358169600.0, "grad_norm": 2.257005972185825, "language_loss": 0.85498571, "learning_rate": 1.3463379009005892e-07, "loss": 0.87927556, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18884277, "step": 14737, "time_per_iteration": 4.309977769851685 }, { "auxiliary_loss_clip": 0.01429677, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.2638917, "balance_loss_mlp": 1.01753926, "epoch": 0.8860964978205321, "flos": 35968507862400.0, "grad_norm": 2.394788163522259, "language_loss": 0.68944013, "learning_rate": 1.3449334785229093e-07, "loss": 0.71412027, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.20812988, "step": 14738, "time_per_iteration": 2.993105888366699 }, { "auxiliary_loss_clip": 0.01413825, "auxiliary_loss_mlp": 0.0103044, "balance_loss_clip": 1.24856234, "balance_loss_mlp": 1.01069927, "epoch": 0.8861566210732, "flos": 21221805559680.0, "grad_norm": 2.011439887344891, "language_loss": 0.75287449, "learning_rate": 1.343529763547222e-07, "loss": 0.77731711, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.1973877, "step": 14739, "time_per_iteration": 2.8484365940093994 }, { "auxiliary_loss_clip": 0.01387807, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.23127067, "balance_loss_mlp": 1.01342154, "epoch": 0.886216744325868, "flos": 14616695433600.0, "grad_norm": 1.8224282368091416, "language_loss": 0.87457108, "learning_rate": 1.3421267560267559e-07, "loss": 0.89876777, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18432617, "step": 14740, "time_per_iteration": 4.256930828094482 }, { "auxiliary_loss_clip": 0.01389268, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.23192954, "balance_loss_mlp": 1.01361775, "epoch": 0.886276867578536, "flos": 26662980470400.0, "grad_norm": 2.9989338409695305, "language_loss": 0.64191556, "learning_rate": 1.34072445601471e-07, "loss": 0.6661371, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19262695, "step": 14741, "time_per_iteration": 2.8840200901031494 }, { "auxiliary_loss_clip": 0.01397362, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.23864698, "balance_loss_mlp": 1.01255393, "epoch": 0.886336990831204, "flos": 16772861272320.0, "grad_norm": 2.0326417940863726, "language_loss": 0.73292148, "learning_rate": 1.3393228635642717e-07, "loss": 0.7572059, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18530273, "step": 14742, "time_per_iteration": 2.8549904823303223 }, { "auxiliary_loss_clip": 0.01392822, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.23505402, "balance_loss_mlp": 1.01116407, "epoch": 0.8863971140838719, "flos": 25276277266560.0, "grad_norm": 2.6730731877364504, "language_loss": 0.60012996, "learning_rate": 1.3379219787285733e-07, "loss": 0.62436247, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19274902, "step": 14743, "time_per_iteration": 2.8797836303710938 }, { "auxiliary_loss_clip": 0.0139625, "auxiliary_loss_mlp": 0.01036012, "balance_loss_clip": 1.23596621, "balance_loss_mlp": 1.0158776, "epoch": 0.8864572373365399, "flos": 23414782682880.0, "grad_norm": 2.0242777445327307, "language_loss": 0.60597587, "learning_rate": 1.3365218015607437e-07, "loss": 0.63029844, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20117188, "step": 14744, "time_per_iteration": 2.8451130390167236 }, { "auxiliary_loss_clip": 0.01397011, "auxiliary_loss_mlp": 0.01035028, "balance_loss_clip": 1.23735809, "balance_loss_mlp": 1.01590657, "epoch": 0.8865173605892078, "flos": 18557204192640.0, "grad_norm": 1.5646710540414905, "language_loss": 0.77315676, "learning_rate": 1.3351223321138762e-07, "loss": 0.79747719, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19116211, "step": 14745, "time_per_iteration": 2.835024118423462 }, { "auxiliary_loss_clip": 0.01399607, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.24061835, "balance_loss_mlp": 1.01178467, "epoch": 0.8865774838418758, "flos": 19035162708480.0, "grad_norm": 1.6599917988235127, "language_loss": 0.78474176, "learning_rate": 1.3337235704410454e-07, "loss": 0.80904746, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19177246, "step": 14746, "time_per_iteration": 5.56698751449585 }, { "auxiliary_loss_clip": 0.01396469, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.23647368, "balance_loss_mlp": 1.01354074, "epoch": 0.8866376070945439, "flos": 22173152866560.0, "grad_norm": 3.9770684939385914, "language_loss": 0.76914901, "learning_rate": 1.3323255165952873e-07, "loss": 0.79343551, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18640137, "step": 14747, "time_per_iteration": 2.8566722869873047 }, { "auxiliary_loss_clip": 0.01382217, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.22644138, "balance_loss_mlp": 1.0111835, "epoch": 0.8866977303472118, "flos": 20714003458560.0, "grad_norm": 1.7302410660980432, "language_loss": 0.83657503, "learning_rate": 1.3309281706296127e-07, "loss": 0.8606962, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18713379, "step": 14748, "time_per_iteration": 2.831908702850342 }, { "auxiliary_loss_clip": 0.01398855, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.2391504, "balance_loss_mlp": 1.01476264, "epoch": 0.8867578535998798, "flos": 48810479535360.0, "grad_norm": 1.7501662849102422, "language_loss": 0.78128815, "learning_rate": 1.3295315325970148e-07, "loss": 0.80561185, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1875, "step": 14749, "time_per_iteration": 3.09627103805542 }, { "auxiliary_loss_clip": 0.01416376, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.25207222, "balance_loss_mlp": 1.01570368, "epoch": 0.8868179768525477, "flos": 21115081779840.0, "grad_norm": 2.1556390418573237, "language_loss": 0.7079975, "learning_rate": 1.328135602550451e-07, "loss": 0.73250937, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19116211, "step": 14750, "time_per_iteration": 2.8390960693359375 }, { "auxiliary_loss_clip": 0.01388912, "auxiliary_loss_mlp": 0.01031831, "balance_loss_clip": 1.23131645, "balance_loss_mlp": 1.01307917, "epoch": 0.8868781001052157, "flos": 21839815290240.0, "grad_norm": 1.8616904726454888, "language_loss": 0.59805256, "learning_rate": 1.3267403805428546e-07, "loss": 0.62225997, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1875, "step": 14751, "time_per_iteration": 2.85882568359375 }, { "auxiliary_loss_clip": 0.01396454, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.2380898, "balance_loss_mlp": 1.01236296, "epoch": 0.8869382233578836, "flos": 13524256281600.0, "grad_norm": 2.0473336505144237, "language_loss": 0.81663215, "learning_rate": 1.3253458666271344e-07, "loss": 0.84090602, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18591309, "step": 14752, "time_per_iteration": 2.835252285003662 }, { "auxiliary_loss_clip": 0.01416396, "auxiliary_loss_mlp": 0.01036256, "balance_loss_clip": 1.25077176, "balance_loss_mlp": 1.01702714, "epoch": 0.8869983466105517, "flos": 22713694220160.0, "grad_norm": 1.8336089423351132, "language_loss": 0.81159008, "learning_rate": 1.3239520608561793e-07, "loss": 0.83611655, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.19226074, "step": 14753, "time_per_iteration": 2.833667516708374 }, { "auxiliary_loss_clip": 0.01391226, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.23404181, "balance_loss_mlp": 1.01373529, "epoch": 0.8870584698632196, "flos": 15349075315200.0, "grad_norm": 1.6171843117629625, "language_loss": 0.65899289, "learning_rate": 1.3225589632828248e-07, "loss": 0.68322915, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18688965, "step": 14754, "time_per_iteration": 2.808683395385742 }, { "auxiliary_loss_clip": 0.01407497, "auxiliary_loss_mlp": 0.01038408, "balance_loss_clip": 1.24748194, "balance_loss_mlp": 1.01976335, "epoch": 0.8871185931158876, "flos": 26627074081920.0, "grad_norm": 1.8575969421827914, "language_loss": 0.75640953, "learning_rate": 1.3211665739599065e-07, "loss": 0.78086853, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18652344, "step": 14755, "time_per_iteration": 2.982423782348633 }, { "auxiliary_loss_clip": 0.01384922, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.22759891, "balance_loss_mlp": 1.01572943, "epoch": 0.8871787163685555, "flos": 21809112053760.0, "grad_norm": 1.4480773271636003, "language_loss": 0.78683627, "learning_rate": 1.3197748929402262e-07, "loss": 0.81103063, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18786621, "step": 14756, "time_per_iteration": 2.8380849361419678 }, { "auxiliary_loss_clip": 0.01398894, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.23850942, "balance_loss_mlp": 1.0115943, "epoch": 0.8872388396212235, "flos": 14911185709440.0, "grad_norm": 2.0971922420638154, "language_loss": 0.77797061, "learning_rate": 1.3183839202765535e-07, "loss": 0.80227101, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19543457, "step": 14757, "time_per_iteration": 2.82704758644104 }, { "auxiliary_loss_clip": 0.01380222, "auxiliary_loss_mlp": 0.0103295, "balance_loss_clip": 1.2253449, "balance_loss_mlp": 1.01478219, "epoch": 0.8872989628738914, "flos": 26443289128320.0, "grad_norm": 1.9124951787432027, "language_loss": 0.68626922, "learning_rate": 1.316993656021632e-07, "loss": 0.71040094, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.1817627, "step": 14758, "time_per_iteration": 2.892263650894165 }, { "auxiliary_loss_clip": 0.01408181, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.24843788, "balance_loss_mlp": 1.01507616, "epoch": 0.8873590861265594, "flos": 48159685307520.0, "grad_norm": 2.1323864599403763, "language_loss": 0.69545496, "learning_rate": 1.3156041002281915e-07, "loss": 0.71986872, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18127441, "step": 14759, "time_per_iteration": 3.111626625061035 }, { "auxiliary_loss_clip": 0.01390182, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.23118567, "balance_loss_mlp": 1.01298058, "epoch": 0.8874192093792275, "flos": 18342173064960.0, "grad_norm": 1.7036508040764797, "language_loss": 0.75124896, "learning_rate": 1.3142152529489092e-07, "loss": 0.77547133, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.1907959, "step": 14760, "time_per_iteration": 2.8150553703308105 }, { "auxiliary_loss_clip": 0.01404193, "auxiliary_loss_mlp": 0.01035147, "balance_loss_clip": 1.24202919, "balance_loss_mlp": 1.01646698, "epoch": 0.8874793326318954, "flos": 17903061849600.0, "grad_norm": 2.2391403563615846, "language_loss": 0.77485275, "learning_rate": 1.3128271142364565e-07, "loss": 0.79924619, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18688965, "step": 14761, "time_per_iteration": 2.9017693996429443 }, { "auxiliary_loss_clip": 0.01395181, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.23601437, "balance_loss_mlp": 1.01806331, "epoch": 0.8875394558845634, "flos": 31114548956160.0, "grad_norm": 4.228236028862086, "language_loss": 0.62364644, "learning_rate": 1.3114396841434717e-07, "loss": 0.6479708, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1920166, "step": 14762, "time_per_iteration": 2.9655661582946777 }, { "auxiliary_loss_clip": 0.01388727, "auxiliary_loss_mlp": 0.01036809, "balance_loss_clip": 1.23025036, "balance_loss_mlp": 1.01607895, "epoch": 0.8875995791372313, "flos": 21151893064320.0, "grad_norm": 1.9301268997996266, "language_loss": 0.65314281, "learning_rate": 1.3100529627225697e-07, "loss": 0.6773982, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.20727539, "step": 14763, "time_per_iteration": 2.9769885540008545 }, { "auxiliary_loss_clip": 0.014032, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.24286842, "balance_loss_mlp": 1.01330829, "epoch": 0.8876597023898993, "flos": 17463543431040.0, "grad_norm": 2.067845645802123, "language_loss": 0.72158712, "learning_rate": 1.3086669500263335e-07, "loss": 0.74594009, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18786621, "step": 14764, "time_per_iteration": 2.863757610321045 }, { "auxiliary_loss_clip": 0.01406121, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.2417382, "balance_loss_mlp": 1.01521599, "epoch": 0.8877198256425672, "flos": 22717404293760.0, "grad_norm": 2.1287099665450264, "language_loss": 0.66678023, "learning_rate": 1.3072816461073166e-07, "loss": 0.69117951, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.18579102, "step": 14765, "time_per_iteration": 2.882183313369751 }, { "auxiliary_loss_clip": 0.01385477, "auxiliary_loss_mlp": 0.01033298, "balance_loss_clip": 1.22919953, "balance_loss_mlp": 1.01527405, "epoch": 0.8877799488952353, "flos": 24545797666560.0, "grad_norm": 1.559400922641486, "language_loss": 0.77103519, "learning_rate": 1.3058970510180568e-07, "loss": 0.795223, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18017578, "step": 14766, "time_per_iteration": 2.858431339263916 }, { "auxiliary_loss_clip": 0.01384887, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.22903991, "balance_loss_mlp": 1.01781368, "epoch": 0.8878400721479032, "flos": 20968877272320.0, "grad_norm": 1.6403579674035713, "language_loss": 0.74072301, "learning_rate": 1.3045131648110496e-07, "loss": 0.76494539, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.19543457, "step": 14767, "time_per_iteration": 2.8486826419830322 }, { "auxiliary_loss_clip": 0.01379583, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.22541106, "balance_loss_mlp": 1.0122478, "epoch": 0.8879001954005712, "flos": 25304582528640.0, "grad_norm": 1.8676047330731924, "language_loss": 0.71290523, "learning_rate": 1.303129987538778e-07, "loss": 0.73700809, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.18457031, "step": 14768, "time_per_iteration": 2.880800247192383 }, { "auxiliary_loss_clip": 0.01397547, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.24034798, "balance_loss_mlp": 1.01388788, "epoch": 0.8879603186532391, "flos": 23195724768000.0, "grad_norm": 3.027365545877491, "language_loss": 0.71235049, "learning_rate": 1.3017475192536932e-07, "loss": 0.73666036, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19555664, "step": 14769, "time_per_iteration": 2.8483471870422363 }, { "auxiliary_loss_clip": 0.0139006, "auxiliary_loss_mlp": 0.01030219, "balance_loss_clip": 1.23176146, "balance_loss_mlp": 1.01151538, "epoch": 0.8880204419059071, "flos": 13661502318720.0, "grad_norm": 2.4022083809383274, "language_loss": 0.68323982, "learning_rate": 1.3003657600082174e-07, "loss": 0.70744258, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18713379, "step": 14770, "time_per_iteration": 2.8085718154907227 }, { "auxiliary_loss_clip": 0.01381794, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.22788846, "balance_loss_mlp": 1.01441121, "epoch": 0.888080565158575, "flos": 20641738233600.0, "grad_norm": 1.9563058970175538, "language_loss": 0.66244644, "learning_rate": 1.2989847098547424e-07, "loss": 0.68659329, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.18457031, "step": 14771, "time_per_iteration": 2.8890223503112793 }, { "auxiliary_loss_clip": 0.01388639, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.23052835, "balance_loss_mlp": 1.01177585, "epoch": 0.888140688411243, "flos": 28631379813120.0, "grad_norm": 1.7648679946304118, "language_loss": 0.83096796, "learning_rate": 1.2976043688456396e-07, "loss": 0.85516417, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19189453, "step": 14772, "time_per_iteration": 4.3276686668396 }, { "auxiliary_loss_clip": 0.01375768, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.22227693, "balance_loss_mlp": 1.01125789, "epoch": 0.8882008116639111, "flos": 25531196325120.0, "grad_norm": 1.4807914832940732, "language_loss": 0.7705121, "learning_rate": 1.296224737033258e-07, "loss": 0.79456818, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.18579102, "step": 14773, "time_per_iteration": 2.9128007888793945 }, { "auxiliary_loss_clip": 0.01401347, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.24520481, "balance_loss_mlp": 1.01186132, "epoch": 0.888260934916579, "flos": 27685416637440.0, "grad_norm": 1.841575234898447, "language_loss": 0.75690907, "learning_rate": 1.294845814469907e-07, "loss": 0.78122795, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18676758, "step": 14774, "time_per_iteration": 2.9575769901275635 }, { "auxiliary_loss_clip": 0.0140129, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.23972726, "balance_loss_mlp": 1.01431823, "epoch": 0.888321058169247, "flos": 21619671500160.0, "grad_norm": 3.156793921351749, "language_loss": 0.73423243, "learning_rate": 1.2934676012078783e-07, "loss": 0.75858772, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19909668, "step": 14775, "time_per_iteration": 4.286986351013184 }, { "auxiliary_loss_clip": 0.01389913, "auxiliary_loss_mlp": 0.01035989, "balance_loss_clip": 1.23276925, "balance_loss_mlp": 1.01723695, "epoch": 0.8883811814219149, "flos": 18157890418560.0, "grad_norm": 1.5832308900704664, "language_loss": 0.80931258, "learning_rate": 1.292090097299432e-07, "loss": 0.83357155, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18762207, "step": 14776, "time_per_iteration": 2.8827366828918457 }, { "auxiliary_loss_clip": 0.01412406, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.24737287, "balance_loss_mlp": 1.01101673, "epoch": 0.8884413046745829, "flos": 28335034500480.0, "grad_norm": 2.5697752150512296, "language_loss": 0.70094281, "learning_rate": 1.290713302796802e-07, "loss": 0.72536576, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.18847656, "step": 14777, "time_per_iteration": 2.9598469734191895 }, { "auxiliary_loss_clip": 0.01388359, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.22998631, "balance_loss_mlp": 1.01694465, "epoch": 0.8885014279272508, "flos": 15167145398400.0, "grad_norm": 1.8235069131314823, "language_loss": 0.71518111, "learning_rate": 1.2893372177522e-07, "loss": 0.73943031, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19616699, "step": 14778, "time_per_iteration": 2.8434598445892334 }, { "auxiliary_loss_clip": 0.01397858, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.23937774, "balance_loss_mlp": 1.01519966, "epoch": 0.8885615511799189, "flos": 19109101991040.0, "grad_norm": 1.6033118456894666, "language_loss": 0.78073394, "learning_rate": 1.287961842217804e-07, "loss": 0.80505395, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18933105, "step": 14779, "time_per_iteration": 2.882605791091919 }, { "auxiliary_loss_clip": 0.01182831, "auxiliary_loss_mlp": 0.01021395, "balance_loss_clip": 1.09220636, "balance_loss_mlp": 1.00089097, "epoch": 0.8886216744325868, "flos": 51208551912960.0, "grad_norm": 0.8731558610077517, "language_loss": 0.56878889, "learning_rate": 1.2865871762457747e-07, "loss": 0.59083116, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.20507812, "step": 14780, "time_per_iteration": 3.141737699508667 }, { "auxiliary_loss_clip": 0.01179823, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.08989549, "balance_loss_mlp": 1.0088582, "epoch": 0.8886817976852548, "flos": 61644931292160.0, "grad_norm": 0.7921524195841102, "language_loss": 0.6240533, "learning_rate": 1.2852132198882326e-07, "loss": 0.64613467, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.19433594, "step": 14781, "time_per_iteration": 6.080317735671997 }, { "auxiliary_loss_clip": 0.01177349, "auxiliary_loss_mlp": 0.01016626, "balance_loss_clip": 1.09162927, "balance_loss_mlp": 0.99802935, "epoch": 0.8887419209379227, "flos": 60674128721280.0, "grad_norm": 0.7954218612379419, "language_loss": 0.58138037, "learning_rate": 1.2838399731972805e-07, "loss": 0.60332012, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.18554688, "step": 14782, "time_per_iteration": 3.1010873317718506 }, { "auxiliary_loss_clip": 0.01389399, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.23336935, "balance_loss_mlp": 1.01490426, "epoch": 0.8888020441905907, "flos": 29217917145600.0, "grad_norm": 1.7644295952441622, "language_loss": 0.66631997, "learning_rate": 1.2824674362249922e-07, "loss": 0.69054377, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18078613, "step": 14783, "time_per_iteration": 2.905606985092163 }, { "auxiliary_loss_clip": 0.01404761, "auxiliary_loss_mlp": 0.0103085, "balance_loss_clip": 1.24298644, "balance_loss_mlp": 1.01172864, "epoch": 0.8888621674432586, "flos": 22172564684160.0, "grad_norm": 1.5644940305233204, "language_loss": 0.78151822, "learning_rate": 1.281095609023415e-07, "loss": 0.80587429, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19116211, "step": 14784, "time_per_iteration": 2.8618788719177246 }, { "auxiliary_loss_clip": 0.01405267, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.2438705, "balance_loss_mlp": 1.01339281, "epoch": 0.8889222906959267, "flos": 27684330762240.0, "grad_norm": 2.6230170539145496, "language_loss": 0.61253166, "learning_rate": 1.279724491644565e-07, "loss": 0.63690305, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18493652, "step": 14785, "time_per_iteration": 2.8985042572021484 }, { "auxiliary_loss_clip": 0.01395967, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.23704755, "balance_loss_mlp": 1.01417673, "epoch": 0.8889824139485947, "flos": 14175457712640.0, "grad_norm": 1.6682415632806102, "language_loss": 0.65927911, "learning_rate": 1.278354084140445e-07, "loss": 0.68357575, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19506836, "step": 14786, "time_per_iteration": 2.8045055866241455 }, { "auxiliary_loss_clip": 0.01403216, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.23878217, "balance_loss_mlp": 1.01234627, "epoch": 0.8890425372012626, "flos": 12858938472960.0, "grad_norm": 2.4405306350724008, "language_loss": 0.86405641, "learning_rate": 1.276984386563009e-07, "loss": 0.88841033, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19824219, "step": 14787, "time_per_iteration": 2.810202121734619 }, { "auxiliary_loss_clip": 0.01397254, "auxiliary_loss_mlp": 0.01029733, "balance_loss_clip": 1.23822618, "balance_loss_mlp": 1.01205468, "epoch": 0.8891026604539306, "flos": 21699266382720.0, "grad_norm": 3.995504524885821, "language_loss": 0.71071142, "learning_rate": 1.2756153989642027e-07, "loss": 0.7349813, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.17687988, "step": 14788, "time_per_iteration": 2.916994571685791 }, { "auxiliary_loss_clip": 0.01375531, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.22177505, "balance_loss_mlp": 1.0150907, "epoch": 0.8891627837065985, "flos": 21881377278720.0, "grad_norm": 1.608644428927032, "language_loss": 0.70506036, "learning_rate": 1.274247121395935e-07, "loss": 0.7291522, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18554688, "step": 14789, "time_per_iteration": 2.841963052749634 }, { "auxiliary_loss_clip": 0.01386888, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.23075235, "balance_loss_mlp": 1.01575065, "epoch": 0.8892229069592665, "flos": 21590280362880.0, "grad_norm": 1.7583989328613445, "language_loss": 0.71147251, "learning_rate": 1.2728795539100956e-07, "loss": 0.73569125, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19238281, "step": 14790, "time_per_iteration": 2.915573835372925 }, { "auxiliary_loss_clip": 0.01395647, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.23772943, "balance_loss_mlp": 1.01177025, "epoch": 0.8892830302119344, "flos": 23086376789760.0, "grad_norm": 1.6208983139777713, "language_loss": 0.73104858, "learning_rate": 1.2715126965585387e-07, "loss": 0.7553075, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18481445, "step": 14791, "time_per_iteration": 2.873307704925537 }, { "auxiliary_loss_clip": 0.01384043, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.22915375, "balance_loss_mlp": 1.01352549, "epoch": 0.8893431534646025, "flos": 23081535596160.0, "grad_norm": 1.6801908262064733, "language_loss": 0.74423254, "learning_rate": 1.2701465493931008e-07, "loss": 0.76839495, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18701172, "step": 14792, "time_per_iteration": 2.911698818206787 }, { "auxiliary_loss_clip": 0.01405753, "auxiliary_loss_mlp": 0.0103995, "balance_loss_clip": 1.24300504, "balance_loss_mlp": 1.0199703, "epoch": 0.8894032767172704, "flos": 22465018944000.0, "grad_norm": 1.9836397656014977, "language_loss": 0.6697346, "learning_rate": 1.2687811124655801e-07, "loss": 0.69419169, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.1998291, "step": 14793, "time_per_iteration": 2.8469762802124023 }, { "auxiliary_loss_clip": 0.01395506, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.23352695, "balance_loss_mlp": 1.01438856, "epoch": 0.8894633999699384, "flos": 25349266408320.0, "grad_norm": 1.8538369998312316, "language_loss": 0.72388238, "learning_rate": 1.2674163858277552e-07, "loss": 0.74818456, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20324707, "step": 14794, "time_per_iteration": 2.8777291774749756 }, { "auxiliary_loss_clip": 0.014224, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.25789189, "balance_loss_mlp": 1.01696968, "epoch": 0.8895235232226063, "flos": 21003381072000.0, "grad_norm": 1.4627923299260905, "language_loss": 0.75880158, "learning_rate": 1.2660523695313785e-07, "loss": 0.78339505, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19995117, "step": 14795, "time_per_iteration": 2.873260259628296 }, { "auxiliary_loss_clip": 0.01176423, "auxiliary_loss_mlp": 0.01018873, "balance_loss_clip": 1.09122229, "balance_loss_mlp": 0.99903661, "epoch": 0.8895836464752743, "flos": 69762769948800.0, "grad_norm": 0.7712750999078115, "language_loss": 0.56105828, "learning_rate": 1.2646890636281727e-07, "loss": 0.58301127, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.19824219, "step": 14796, "time_per_iteration": 3.26432466506958 }, { "auxiliary_loss_clip": 0.01403311, "auxiliary_loss_mlp": 0.01034445, "balance_loss_clip": 1.24266398, "balance_loss_mlp": 1.01557422, "epoch": 0.8896437697279422, "flos": 23232219338880.0, "grad_norm": 2.0145341322049046, "language_loss": 0.70703828, "learning_rate": 1.263326468169843e-07, "loss": 0.73141587, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18884277, "step": 14797, "time_per_iteration": 2.896883964538574 }, { "auxiliary_loss_clip": 0.01176924, "auxiliary_loss_mlp": 0.01012246, "balance_loss_clip": 1.09045601, "balance_loss_mlp": 0.99603349, "epoch": 0.8897038929806103, "flos": 70782310448640.0, "grad_norm": 0.7491099033150069, "language_loss": 0.58029723, "learning_rate": 1.2619645832080417e-07, "loss": 0.60218894, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.16210938, "step": 14798, "time_per_iteration": 3.341984272003174 }, { "auxiliary_loss_clip": 0.0140103, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.24087, "balance_loss_mlp": 1.01415253, "epoch": 0.8897640162332782, "flos": 19254537336960.0, "grad_norm": 1.6303523690411268, "language_loss": 0.79812479, "learning_rate": 1.2606034087944251e-07, "loss": 0.8224591, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18249512, "step": 14799, "time_per_iteration": 2.850330114364624 }, { "auxiliary_loss_clip": 0.01180152, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.09204912, "balance_loss_mlp": 1.00269914, "epoch": 0.8898241394859462, "flos": 41381124024960.0, "grad_norm": 0.9072990851573018, "language_loss": 0.58165514, "learning_rate": 1.2592429449806053e-07, "loss": 0.60369253, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20898438, "step": 14800, "time_per_iteration": 3.2400875091552734 }, { "auxiliary_loss_clip": 0.0139649, "auxiliary_loss_mlp": 0.01029095, "balance_loss_clip": 1.23773909, "balance_loss_mlp": 1.01041532, "epoch": 0.8898842627386142, "flos": 18994777084800.0, "grad_norm": 1.4694763429949533, "language_loss": 0.66807085, "learning_rate": 1.2578831918181698e-07, "loss": 0.69232678, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18676758, "step": 14801, "time_per_iteration": 2.8980751037597656 }, { "auxiliary_loss_clip": 0.01408539, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.24557757, "balance_loss_mlp": 1.01852798, "epoch": 0.8899443859912821, "flos": 13223024530560.0, "grad_norm": 6.185669304730591, "language_loss": 0.76347947, "learning_rate": 1.256524149358682e-07, "loss": 0.78795272, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20239258, "step": 14802, "time_per_iteration": 2.960324287414551 }, { "auxiliary_loss_clip": 0.01392012, "auxiliary_loss_mlp": 0.01032273, "balance_loss_clip": 1.23604739, "balance_loss_mlp": 1.01343846, "epoch": 0.8900045092439501, "flos": 22684981754880.0, "grad_norm": 1.8360796849122258, "language_loss": 0.73884594, "learning_rate": 1.2551658176536805e-07, "loss": 0.76308882, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18823242, "step": 14803, "time_per_iteration": 2.9188499450683594 }, { "auxiliary_loss_clip": 0.01387632, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.22885096, "balance_loss_mlp": 1.01654041, "epoch": 0.890064632496618, "flos": 21151350126720.0, "grad_norm": 2.0221248151939255, "language_loss": 0.73132563, "learning_rate": 1.2538081967546664e-07, "loss": 0.75555348, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18615723, "step": 14804, "time_per_iteration": 2.824906587600708 }, { "auxiliary_loss_clip": 0.013932, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.23411703, "balance_loss_mlp": 1.0125742, "epoch": 0.8901247557492861, "flos": 23405643233280.0, "grad_norm": 1.6888449310644211, "language_loss": 0.81887591, "learning_rate": 1.252451286713123e-07, "loss": 0.8431201, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18652344, "step": 14805, "time_per_iteration": 2.9111576080322266 }, { "auxiliary_loss_clip": 0.01406203, "auxiliary_loss_mlp": 0.01033471, "balance_loss_clip": 1.2440958, "balance_loss_mlp": 1.01375425, "epoch": 0.890184879001954, "flos": 29181694043520.0, "grad_norm": 3.172825397120138, "language_loss": 0.67584288, "learning_rate": 1.251095087580505e-07, "loss": 0.7002396, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19726562, "step": 14806, "time_per_iteration": 2.9290060997009277 }, { "auxiliary_loss_clip": 0.01389677, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.23000801, "balance_loss_mlp": 1.01284266, "epoch": 0.890245002254622, "flos": 14435715657600.0, "grad_norm": 1.8676538135398753, "language_loss": 0.68394488, "learning_rate": 1.2497395994082438e-07, "loss": 0.70815462, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18432617, "step": 14807, "time_per_iteration": 4.3083178997039795 }, { "auxiliary_loss_clip": 0.01390943, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.23406935, "balance_loss_mlp": 1.01422048, "epoch": 0.8903051255072899, "flos": 22392256026240.0, "grad_norm": 2.9599850613438687, "language_loss": 0.76290345, "learning_rate": 1.248384822247732e-07, "loss": 0.78714073, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18591309, "step": 14808, "time_per_iteration": 2.856003522872925 }, { "auxiliary_loss_clip": 0.01397009, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 1.23775423, "balance_loss_mlp": 1.01422095, "epoch": 0.8903652487599579, "flos": 20787173579520.0, "grad_norm": 1.8116827401831268, "language_loss": 0.82204032, "learning_rate": 1.2470307561503513e-07, "loss": 0.84633446, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18188477, "step": 14809, "time_per_iteration": 2.834502935409546 }, { "auxiliary_loss_clip": 0.01384261, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.2269119, "balance_loss_mlp": 1.01543581, "epoch": 0.8904253720126258, "flos": 24434866120320.0, "grad_norm": 1.6932647114692645, "language_loss": 0.69410086, "learning_rate": 1.2456774011674442e-07, "loss": 0.71828985, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.1920166, "step": 14810, "time_per_iteration": 2.856969118118286 }, { "auxiliary_loss_clip": 0.01396825, "auxiliary_loss_mlp": 0.0103429, "balance_loss_clip": 1.23564255, "balance_loss_mlp": 1.01527643, "epoch": 0.8904854952652939, "flos": 19473323783040.0, "grad_norm": 1.9177185888195598, "language_loss": 0.71414155, "learning_rate": 1.2443247573503257e-07, "loss": 0.73845267, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19006348, "step": 14811, "time_per_iteration": 4.304624319076538 }, { "auxiliary_loss_clip": 0.01408603, "auxiliary_loss_mlp": 0.01036948, "balance_loss_clip": 1.24660301, "balance_loss_mlp": 1.01779091, "epoch": 0.8905456185179618, "flos": 50817183240960.0, "grad_norm": 1.7975341973492116, "language_loss": 0.66308492, "learning_rate": 1.2429728247502924e-07, "loss": 0.68754041, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19152832, "step": 14812, "time_per_iteration": 3.1073338985443115 }, { "auxiliary_loss_clip": 0.01392149, "auxiliary_loss_mlp": 0.01033316, "balance_loss_clip": 1.23488355, "balance_loss_mlp": 1.01427817, "epoch": 0.8906057417706298, "flos": 17793713871360.0, "grad_norm": 1.8467808716969305, "language_loss": 0.69177675, "learning_rate": 1.24162160341861e-07, "loss": 0.71603143, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19042969, "step": 14813, "time_per_iteration": 2.945223093032837 }, { "auxiliary_loss_clip": 0.01425092, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.25787687, "balance_loss_mlp": 1.01471925, "epoch": 0.8906658650232978, "flos": 21954954602880.0, "grad_norm": 1.8228046334976127, "language_loss": 0.76899123, "learning_rate": 1.2402710934065198e-07, "loss": 0.79359466, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.20532227, "step": 14814, "time_per_iteration": 2.9189345836639404 }, { "auxiliary_loss_clip": 0.01407117, "auxiliary_loss_mlp": 0.0103281, "balance_loss_clip": 1.24440122, "balance_loss_mlp": 1.01315284, "epoch": 0.8907259882759657, "flos": 21297645123840.0, "grad_norm": 2.113040481547012, "language_loss": 0.74949306, "learning_rate": 1.2389212947652229e-07, "loss": 0.77389234, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19641113, "step": 14815, "time_per_iteration": 2.8409223556518555 }, { "auxiliary_loss_clip": 0.01383206, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.22779918, "balance_loss_mlp": 1.01339614, "epoch": 0.8907861115286337, "flos": 20129999834880.0, "grad_norm": 1.8298547515222485, "language_loss": 0.75660414, "learning_rate": 1.237572207545914e-07, "loss": 0.78075594, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18566895, "step": 14816, "time_per_iteration": 4.346631288528442 }, { "auxiliary_loss_clip": 0.01394187, "auxiliary_loss_mlp": 0.01032736, "balance_loss_clip": 1.23510504, "balance_loss_mlp": 1.01403165, "epoch": 0.8908462347813016, "flos": 20093776732800.0, "grad_norm": 1.7796978623017419, "language_loss": 0.77754253, "learning_rate": 1.2362238317997476e-07, "loss": 0.80181175, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18701172, "step": 14817, "time_per_iteration": 2.829658269882202 }, { "auxiliary_loss_clip": 0.0117558, "auxiliary_loss_mlp": 0.01022643, "balance_loss_clip": 1.09049988, "balance_loss_mlp": 1.0037607, "epoch": 0.8909063580339697, "flos": 65533155045120.0, "grad_norm": 0.7640084695616988, "language_loss": 0.5652765, "learning_rate": 1.2348761675778517e-07, "loss": 0.58725876, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.18847656, "step": 14818, "time_per_iteration": 3.3866817951202393 }, { "auxiliary_loss_clip": 0.01395061, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.23560476, "balance_loss_mlp": 1.01284087, "epoch": 0.8909664812866376, "flos": 29875407603840.0, "grad_norm": 2.0138189572472944, "language_loss": 0.65218329, "learning_rate": 1.2335292149313325e-07, "loss": 0.67644894, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18664551, "step": 14819, "time_per_iteration": 2.9420526027679443 }, { "auxiliary_loss_clip": 0.01401755, "auxiliary_loss_mlp": 0.01033145, "balance_loss_clip": 1.24109101, "balance_loss_mlp": 1.01360667, "epoch": 0.8910266045393056, "flos": 25458026204160.0, "grad_norm": 2.533814927096091, "language_loss": 0.79633743, "learning_rate": 1.2321829739112731e-07, "loss": 0.8206864, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19543457, "step": 14820, "time_per_iteration": 2.8889687061309814 }, { "auxiliary_loss_clip": 0.01393399, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.23460603, "balance_loss_mlp": 1.01725364, "epoch": 0.8910867277919735, "flos": 24509936522880.0, "grad_norm": 3.072288970763731, "language_loss": 0.7705667, "learning_rate": 1.2308374445687087e-07, "loss": 0.79485607, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18286133, "step": 14821, "time_per_iteration": 2.920757532119751 }, { "auxiliary_loss_clip": 0.01173304, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.08808899, "balance_loss_mlp": 1.01280713, "epoch": 0.8911468510446415, "flos": 60716550360960.0, "grad_norm": 0.7880436118365125, "language_loss": 0.59328085, "learning_rate": 1.2294926269546712e-07, "loss": 0.6153785, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.23632812, "step": 14822, "time_per_iteration": 3.1713337898254395 }, { "auxiliary_loss_clip": 0.01405093, "auxiliary_loss_mlp": 0.01036725, "balance_loss_clip": 1.2440989, "balance_loss_mlp": 1.01774693, "epoch": 0.8912069742973094, "flos": 25348316267520.0, "grad_norm": 1.8805742384550568, "language_loss": 0.70029575, "learning_rate": 1.2281485211201515e-07, "loss": 0.72471392, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18969727, "step": 14823, "time_per_iteration": 2.86421799659729 }, { "auxiliary_loss_clip": 0.01390193, "auxiliary_loss_mlp": 0.01029147, "balance_loss_clip": 1.23375094, "balance_loss_mlp": 1.01024044, "epoch": 0.8912670975499775, "flos": 18232689352320.0, "grad_norm": 7.08825110661804, "language_loss": 0.69672608, "learning_rate": 1.2268051271161262e-07, "loss": 0.72091949, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18896484, "step": 14824, "time_per_iteration": 2.8212883472442627 }, { "auxiliary_loss_clip": 0.01400586, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.23808384, "balance_loss_mlp": 1.01294398, "epoch": 0.8913272208026454, "flos": 26515463863680.0, "grad_norm": 2.76357497615334, "language_loss": 0.7189275, "learning_rate": 1.2254624449935303e-07, "loss": 0.74325168, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18884277, "step": 14825, "time_per_iteration": 2.8829452991485596 }, { "auxiliary_loss_clip": 0.01391751, "auxiliary_loss_mlp": 0.01029516, "balance_loss_clip": 1.23482478, "balance_loss_mlp": 1.01027596, "epoch": 0.8913873440553134, "flos": 18810358704000.0, "grad_norm": 1.8414918098706514, "language_loss": 0.72567934, "learning_rate": 1.2241204748032786e-07, "loss": 0.749892, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19226074, "step": 14826, "time_per_iteration": 2.8330910205841064 }, { "auxiliary_loss_clip": 0.01403275, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.24501777, "balance_loss_mlp": 1.01289809, "epoch": 0.8914474673079814, "flos": 20894214072960.0, "grad_norm": 2.6761845018215964, "language_loss": 0.7619822, "learning_rate": 1.2227792165962615e-07, "loss": 0.78632236, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.1784668, "step": 14827, "time_per_iteration": 2.8203821182250977 }, { "auxiliary_loss_clip": 0.01392095, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.23277152, "balance_loss_mlp": 1.01212525, "epoch": 0.8915075905606493, "flos": 20960371249920.0, "grad_norm": 1.7064370199183105, "language_loss": 0.78750956, "learning_rate": 1.221438670423336e-07, "loss": 0.81174654, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19482422, "step": 14828, "time_per_iteration": 2.8341658115386963 }, { "auxiliary_loss_clip": 0.01386507, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.22876406, "balance_loss_mlp": 1.01197016, "epoch": 0.8915677138133173, "flos": 23086783992960.0, "grad_norm": 12.4846334874325, "language_loss": 0.75954032, "learning_rate": 1.2200988363353392e-07, "loss": 0.7837199, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19494629, "step": 14829, "time_per_iteration": 2.8687570095062256 }, { "auxiliary_loss_clip": 0.01400695, "auxiliary_loss_mlp": 0.01030455, "balance_loss_clip": 1.24040544, "balance_loss_mlp": 1.01181018, "epoch": 0.8916278370659853, "flos": 23450598581760.0, "grad_norm": 1.6385935845973019, "language_loss": 0.85233957, "learning_rate": 1.2187597143830773e-07, "loss": 0.87665105, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18652344, "step": 14830, "time_per_iteration": 2.868070125579834 }, { "auxiliary_loss_clip": 0.01388996, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.23259199, "balance_loss_mlp": 1.01650262, "epoch": 0.8916879603186533, "flos": 25172675377920.0, "grad_norm": 1.6545170191282255, "language_loss": 0.7571516, "learning_rate": 1.2174213046173299e-07, "loss": 0.78139192, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.1854248, "step": 14831, "time_per_iteration": 3.0424954891204834 }, { "auxiliary_loss_clip": 0.01406614, "auxiliary_loss_mlp": 0.01029733, "balance_loss_clip": 1.24262369, "balance_loss_mlp": 1.00976551, "epoch": 0.8917480835713212, "flos": 20239438302720.0, "grad_norm": 1.8811377268405864, "language_loss": 0.7345736, "learning_rate": 1.216083607088847e-07, "loss": 0.75893706, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19970703, "step": 14832, "time_per_iteration": 2.856053590774536 }, { "auxiliary_loss_clip": 0.0139391, "auxiliary_loss_mlp": 0.01034384, "balance_loss_clip": 1.23216641, "balance_loss_mlp": 1.01591861, "epoch": 0.8918082068239892, "flos": 26112485260800.0, "grad_norm": 1.8934386084725479, "language_loss": 0.67622912, "learning_rate": 1.214746621848355e-07, "loss": 0.70051205, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18469238, "step": 14833, "time_per_iteration": 2.9080209732055664 }, { "auxiliary_loss_clip": 0.01398909, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.23787999, "balance_loss_mlp": 1.01301312, "epoch": 0.8918683300766571, "flos": 24843364588800.0, "grad_norm": 1.6215955693471722, "language_loss": 0.74810916, "learning_rate": 1.2134103489465575e-07, "loss": 0.77242362, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19519043, "step": 14834, "time_per_iteration": 2.9243602752685547 }, { "auxiliary_loss_clip": 0.01390673, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.23152995, "balance_loss_mlp": 1.0138166, "epoch": 0.8919284533293251, "flos": 22314154222080.0, "grad_norm": 2.688130993998457, "language_loss": 0.79655862, "learning_rate": 1.2120747884341188e-07, "loss": 0.82079399, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19042969, "step": 14835, "time_per_iteration": 2.8676562309265137 }, { "auxiliary_loss_clip": 0.01389736, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.23362076, "balance_loss_mlp": 1.01419294, "epoch": 0.891988576581993, "flos": 30385924392960.0, "grad_norm": 1.3822514019868815, "language_loss": 0.74631977, "learning_rate": 1.210739940361689e-07, "loss": 0.77054757, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18835449, "step": 14836, "time_per_iteration": 2.93780517578125 }, { "auxiliary_loss_clip": 0.0139326, "auxiliary_loss_mlp": 0.01031952, "balance_loss_clip": 1.23418975, "balance_loss_mlp": 1.0137006, "epoch": 0.8920486998346611, "flos": 15559581962880.0, "grad_norm": 2.0820406802794174, "language_loss": 0.69284129, "learning_rate": 1.2094058047798838e-07, "loss": 0.71709341, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18261719, "step": 14837, "time_per_iteration": 2.831418037414551 }, { "auxiliary_loss_clip": 0.0140774, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.24408484, "balance_loss_mlp": 1.01243615, "epoch": 0.892108823087329, "flos": 21224836961280.0, "grad_norm": 1.8039551479214098, "language_loss": 0.68428504, "learning_rate": 1.2080723817392913e-07, "loss": 0.70868087, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19396973, "step": 14838, "time_per_iteration": 2.8639755249023438 }, { "auxiliary_loss_clip": 0.01391234, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.23187566, "balance_loss_mlp": 1.01140571, "epoch": 0.892168946339997, "flos": 21988463016960.0, "grad_norm": 2.569774000061412, "language_loss": 0.77339095, "learning_rate": 1.2067396712904777e-07, "loss": 0.79760647, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18920898, "step": 14839, "time_per_iteration": 2.846349000930786 }, { "auxiliary_loss_clip": 0.01178631, "auxiliary_loss_mlp": 0.01024264, "balance_loss_clip": 1.08917511, "balance_loss_mlp": 1.00547695, "epoch": 0.892229069592665, "flos": 67505671664640.0, "grad_norm": 0.6806142078209703, "language_loss": 0.49524796, "learning_rate": 1.205407673483978e-07, "loss": 0.51727688, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.1875, "step": 14840, "time_per_iteration": 3.3247530460357666 }, { "auxiliary_loss_clip": 0.01416377, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.25108886, "balance_loss_mlp": 1.0184536, "epoch": 0.8922891928453329, "flos": 19467894407040.0, "grad_norm": 2.6831511479963863, "language_loss": 0.65251988, "learning_rate": 1.2040763883703074e-07, "loss": 0.67707306, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 1.65625, "router_z_loss_mlp": 0.20495605, "step": 14841, "time_per_iteration": 2.8285574913024902 }, { "auxiliary_loss_clip": 0.01389293, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.23491764, "balance_loss_mlp": 1.01327348, "epoch": 0.8923493160980009, "flos": 23377745174400.0, "grad_norm": 1.3952010303814268, "language_loss": 0.68883896, "learning_rate": 1.2027458159999438e-07, "loss": 0.71305138, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18676758, "step": 14842, "time_per_iteration": 4.272842645645142 }, { "auxiliary_loss_clip": 0.01385223, "auxiliary_loss_mlp": 0.01030925, "balance_loss_clip": 1.22847462, "balance_loss_mlp": 1.01253057, "epoch": 0.8924094393506689, "flos": 26188189090560.0, "grad_norm": 2.802043831752032, "language_loss": 0.81484532, "learning_rate": 1.2014159564233373e-07, "loss": 0.83900684, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18395996, "step": 14843, "time_per_iteration": 2.901437997817993 }, { "auxiliary_loss_clip": 0.01418276, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.25330985, "balance_loss_mlp": 1.012321, "epoch": 0.8924695626033369, "flos": 22028803395840.0, "grad_norm": 2.054040075220133, "language_loss": 0.6916967, "learning_rate": 1.2000868096909257e-07, "loss": 0.71619928, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.1965332, "step": 14844, "time_per_iteration": 2.8695871829986572 }, { "auxiliary_loss_clip": 0.01397871, "auxiliary_loss_mlp": 0.01037002, "balance_loss_clip": 1.23710918, "balance_loss_mlp": 1.01664138, "epoch": 0.8925296858560048, "flos": 14802244934400.0, "grad_norm": 2.103000500900352, "language_loss": 0.92530751, "learning_rate": 1.1987583758531038e-07, "loss": 0.94965625, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20373535, "step": 14845, "time_per_iteration": 4.2432942390441895 }, { "auxiliary_loss_clip": 0.01381041, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.22497439, "balance_loss_mlp": 1.01219797, "epoch": 0.8925898091086728, "flos": 22356937820160.0, "grad_norm": 1.678559172231663, "language_loss": 0.72783518, "learning_rate": 1.1974306549602476e-07, "loss": 0.75195777, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19042969, "step": 14846, "time_per_iteration": 2.9055559635162354 }, { "auxiliary_loss_clip": 0.0140012, "auxiliary_loss_mlp": 0.0103401, "balance_loss_clip": 1.23879659, "balance_loss_mlp": 1.01549709, "epoch": 0.8926499323613407, "flos": 45822539692800.0, "grad_norm": 1.9845605453228268, "language_loss": 0.57834542, "learning_rate": 1.1961036470627094e-07, "loss": 0.60268664, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18518066, "step": 14847, "time_per_iteration": 3.0493314266204834 }, { "auxiliary_loss_clip": 0.01395171, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.23536921, "balance_loss_mlp": 1.01451898, "epoch": 0.8927100556140087, "flos": 22137110743680.0, "grad_norm": 2.514105990303816, "language_loss": 0.77806652, "learning_rate": 1.1947773522108052e-07, "loss": 0.80234629, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1829834, "step": 14848, "time_per_iteration": 2.9532394409179688 }, { "auxiliary_loss_clip": 0.01380983, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.22452021, "balance_loss_mlp": 1.01457727, "epoch": 0.8927701788666766, "flos": 28341956954880.0, "grad_norm": 1.78540595940651, "language_loss": 0.69638795, "learning_rate": 1.1934517704548251e-07, "loss": 0.72053397, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19030762, "step": 14849, "time_per_iteration": 2.9224915504455566 }, { "auxiliary_loss_clip": 0.0140742, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.24652529, "balance_loss_mlp": 1.01316619, "epoch": 0.8928303021193447, "flos": 25304220570240.0, "grad_norm": 2.078379576998031, "language_loss": 0.81310105, "learning_rate": 1.1921269018450364e-07, "loss": 0.83749467, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18786621, "step": 14850, "time_per_iteration": 2.863088607788086 }, { "auxiliary_loss_clip": 0.01385009, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.22836554, "balance_loss_mlp": 1.01522088, "epoch": 0.8928904253720126, "flos": 22246865925120.0, "grad_norm": 1.814181351356151, "language_loss": 0.75947249, "learning_rate": 1.1908027464316872e-07, "loss": 0.78367031, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.1953125, "step": 14851, "time_per_iteration": 5.6998679637908936 }, { "auxiliary_loss_clip": 0.01396273, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.23888135, "balance_loss_mlp": 1.01798153, "epoch": 0.8929505486246806, "flos": 27104580149760.0, "grad_norm": 1.7485078838153287, "language_loss": 0.79107887, "learning_rate": 1.1894793042649775e-07, "loss": 0.81540763, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1862793, "step": 14852, "time_per_iteration": 2.9078919887542725 }, { "auxiliary_loss_clip": 0.01388538, "auxiliary_loss_mlp": 0.01028476, "balance_loss_clip": 1.23195601, "balance_loss_mlp": 1.00979567, "epoch": 0.8930106718773486, "flos": 23049701239680.0, "grad_norm": 1.3571857237596423, "language_loss": 0.69384098, "learning_rate": 1.1881565753951006e-07, "loss": 0.71801114, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18676758, "step": 14853, "time_per_iteration": 2.8777825832366943 }, { "auxiliary_loss_clip": 0.01395031, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.23683512, "balance_loss_mlp": 1.01326954, "epoch": 0.8930707951300165, "flos": 35640192458880.0, "grad_norm": 1.7704576110261916, "language_loss": 0.68032426, "learning_rate": 1.1868345598722118e-07, "loss": 0.70459545, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18811035, "step": 14854, "time_per_iteration": 2.9884860515594482 }, { "auxiliary_loss_clip": 0.01377086, "auxiliary_loss_mlp": 0.01028592, "balance_loss_clip": 1.22222674, "balance_loss_mlp": 1.00975692, "epoch": 0.8931309183826845, "flos": 23050379911680.0, "grad_norm": 1.4714071736324421, "language_loss": 0.75428641, "learning_rate": 1.1855132577464399e-07, "loss": 0.7783432, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18847656, "step": 14855, "time_per_iteration": 2.8647921085357666 }, { "auxiliary_loss_clip": 0.01386218, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.22955704, "balance_loss_mlp": 1.01263249, "epoch": 0.8931910416353525, "flos": 26515644842880.0, "grad_norm": 1.903024653105993, "language_loss": 0.65057755, "learning_rate": 1.1841926690678893e-07, "loss": 0.67475128, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18530273, "step": 14856, "time_per_iteration": 2.878382921218872 }, { "auxiliary_loss_clip": 0.01392068, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.23322415, "balance_loss_mlp": 1.01381671, "epoch": 0.8932511648880205, "flos": 24984592168320.0, "grad_norm": 8.747086676662974, "language_loss": 0.66767627, "learning_rate": 1.1828727938866378e-07, "loss": 0.69193554, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.20056152, "step": 14857, "time_per_iteration": 2.9058046340942383 }, { "auxiliary_loss_clip": 0.01396988, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.23725879, "balance_loss_mlp": 1.01539111, "epoch": 0.8933112881406884, "flos": 24471043977600.0, "grad_norm": 2.681321419401168, "language_loss": 0.76150012, "learning_rate": 1.1815536322527408e-07, "loss": 0.78581518, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19128418, "step": 14858, "time_per_iteration": 2.8919506072998047 }, { "auxiliary_loss_clip": 0.01399053, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.23913836, "balance_loss_mlp": 1.014045, "epoch": 0.8933714113933564, "flos": 28304738467200.0, "grad_norm": 1.7223950654973423, "language_loss": 0.70326257, "learning_rate": 1.1802351842162139e-07, "loss": 0.72758752, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1940918, "step": 14859, "time_per_iteration": 2.927227258682251 }, { "auxiliary_loss_clip": 0.01384002, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.23086548, "balance_loss_mlp": 1.01497531, "epoch": 0.8934315346460243, "flos": 21445025996160.0, "grad_norm": 1.6323809530731974, "language_loss": 0.75563419, "learning_rate": 1.1789174498270526e-07, "loss": 0.77980971, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.18554688, "step": 14860, "time_per_iteration": 2.893035650253296 }, { "auxiliary_loss_clip": 0.01401798, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.24118662, "balance_loss_mlp": 1.01389194, "epoch": 0.8934916578986923, "flos": 23780271329280.0, "grad_norm": 1.7366407249101758, "language_loss": 0.57906199, "learning_rate": 1.1776004291352303e-07, "loss": 0.60340095, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18212891, "step": 14861, "time_per_iteration": 2.856999158859253 }, { "auxiliary_loss_clip": 0.01390484, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.23350263, "balance_loss_mlp": 1.01351213, "epoch": 0.8935517811513602, "flos": 18925362282240.0, "grad_norm": 3.1284777741882093, "language_loss": 0.64587677, "learning_rate": 1.176284122190685e-07, "loss": 0.67011011, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.1932373, "step": 14862, "time_per_iteration": 2.8163230419158936 }, { "auxiliary_loss_clip": 0.01391738, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.23396957, "balance_loss_mlp": 1.0149827, "epoch": 0.8936119044040283, "flos": 24072092161920.0, "grad_norm": 2.1122035363941274, "language_loss": 0.78733897, "learning_rate": 1.1749685290433298e-07, "loss": 0.81159669, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19055176, "step": 14863, "time_per_iteration": 2.849299907684326 }, { "auxiliary_loss_clip": 0.01385636, "auxiliary_loss_mlp": 0.01033221, "balance_loss_clip": 1.23035169, "balance_loss_mlp": 1.01494598, "epoch": 0.8936720276566962, "flos": 21333868225920.0, "grad_norm": 2.7722192401017165, "language_loss": 0.71556503, "learning_rate": 1.1736536497430627e-07, "loss": 0.7397536, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18286133, "step": 14864, "time_per_iteration": 2.85001277923584 }, { "auxiliary_loss_clip": 0.01427558, "auxiliary_loss_mlp": 0.01035317, "balance_loss_clip": 1.26085496, "balance_loss_mlp": 1.01680446, "epoch": 0.8937321509093642, "flos": 18415388430720.0, "grad_norm": 2.184772256643288, "language_loss": 0.76592433, "learning_rate": 1.1723394843397283e-07, "loss": 0.79055309, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.18505859, "step": 14865, "time_per_iteration": 2.8527615070343018 }, { "auxiliary_loss_clip": 0.01377465, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.2233634, "balance_loss_mlp": 1.01432931, "epoch": 0.8937922741620322, "flos": 22064981253120.0, "grad_norm": 1.5288440267674976, "language_loss": 0.72575808, "learning_rate": 1.1710260328831668e-07, "loss": 0.74985862, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.18249512, "step": 14866, "time_per_iteration": 2.884913444519043 }, { "auxiliary_loss_clip": 0.01413111, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.2498517, "balance_loss_mlp": 1.01354551, "epoch": 0.8938523974147001, "flos": 25674912368640.0, "grad_norm": 1.5848300162795328, "language_loss": 0.84611136, "learning_rate": 1.1697132954231869e-07, "loss": 0.87057966, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.20166016, "step": 14867, "time_per_iteration": 2.8877944946289062 }, { "auxiliary_loss_clip": 0.0139831, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.23841572, "balance_loss_mlp": 1.01199758, "epoch": 0.8939125206673681, "flos": 25754823964800.0, "grad_norm": 1.5082726576980523, "language_loss": 0.81069756, "learning_rate": 1.168401272009567e-07, "loss": 0.83497918, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17834473, "step": 14868, "time_per_iteration": 2.888139009475708 }, { "auxiliary_loss_clip": 0.01400645, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.24023628, "balance_loss_mlp": 1.01309443, "epoch": 0.8939726439200361, "flos": 27355924869120.0, "grad_norm": 1.6247200843733187, "language_loss": 0.77889121, "learning_rate": 1.167089962692056e-07, "loss": 0.80322194, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19335938, "step": 14869, "time_per_iteration": 2.886737823486328 }, { "auxiliary_loss_clip": 0.01385818, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.22866666, "balance_loss_mlp": 1.01030231, "epoch": 0.8940327671727041, "flos": 20348333832960.0, "grad_norm": 2.5939794596570893, "language_loss": 0.66254503, "learning_rate": 1.1657793675203853e-07, "loss": 0.68669277, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18664551, "step": 14870, "time_per_iteration": 2.8719136714935303 }, { "auxiliary_loss_clip": 0.01177752, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.08925259, "balance_loss_mlp": 1.01048076, "epoch": 0.894092890425372, "flos": 58434386457600.0, "grad_norm": 0.8020235241826532, "language_loss": 0.55976874, "learning_rate": 1.1644694865442461e-07, "loss": 0.58186471, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21386719, "step": 14871, "time_per_iteration": 3.45035457611084 }, { "auxiliary_loss_clip": 0.01389001, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.23283637, "balance_loss_mlp": 1.01683235, "epoch": 0.89415301367804, "flos": 19839445856640.0, "grad_norm": 1.8690289736410954, "language_loss": 0.77589691, "learning_rate": 1.16316031981331e-07, "loss": 0.80013913, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18408203, "step": 14872, "time_per_iteration": 2.867182731628418 }, { "auxiliary_loss_clip": 0.01383923, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.22867846, "balance_loss_mlp": 1.01528931, "epoch": 0.8942131369307079, "flos": 25786839300480.0, "grad_norm": 1.5844905572421342, "language_loss": 0.67811817, "learning_rate": 1.1618518673772215e-07, "loss": 0.70229155, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18127441, "step": 14873, "time_per_iteration": 2.8889942169189453 }, { "auxiliary_loss_clip": 0.01388599, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.23164511, "balance_loss_mlp": 1.01596439, "epoch": 0.8942732601833759, "flos": 23159456421120.0, "grad_norm": 1.905727877526288, "language_loss": 0.6068821, "learning_rate": 1.1605441292856033e-07, "loss": 0.63112998, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.20214844, "step": 14874, "time_per_iteration": 3.0060155391693115 }, { "auxiliary_loss_clip": 0.01399978, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.23917818, "balance_loss_mlp": 1.01516473, "epoch": 0.8943333834360438, "flos": 27867165575040.0, "grad_norm": 1.94570299844284, "language_loss": 0.76564515, "learning_rate": 1.1592371055880356e-07, "loss": 0.78999126, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19470215, "step": 14875, "time_per_iteration": 2.951315402984619 }, { "auxiliary_loss_clip": 0.01422887, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.25708175, "balance_loss_mlp": 1.01466513, "epoch": 0.8943935066887119, "flos": 22174148252160.0, "grad_norm": 1.9138042995977387, "language_loss": 0.78199863, "learning_rate": 1.1579307963340857e-07, "loss": 0.80657566, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.20141602, "step": 14876, "time_per_iteration": 2.853595018386841 }, { "auxiliary_loss_clip": 0.01390564, "auxiliary_loss_mlp": 0.01029354, "balance_loss_clip": 1.23253131, "balance_loss_mlp": 1.01074517, "epoch": 0.8944536299413798, "flos": 21479891754240.0, "grad_norm": 1.6721103734528484, "language_loss": 0.7917977, "learning_rate": 1.156625201573287e-07, "loss": 0.81599689, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18615723, "step": 14877, "time_per_iteration": 4.2834084033966064 }, { "auxiliary_loss_clip": 0.01378151, "auxiliary_loss_mlp": 0.01029834, "balance_loss_clip": 1.22041988, "balance_loss_mlp": 1.01089132, "epoch": 0.8945137531940478, "flos": 17757716993280.0, "grad_norm": 2.0179480973376336, "language_loss": 0.76138616, "learning_rate": 1.155320321355151e-07, "loss": 0.78546607, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18920898, "step": 14878, "time_per_iteration": 2.8930141925811768 }, { "auxiliary_loss_clip": 0.01405575, "auxiliary_loss_mlp": 0.01029618, "balance_loss_clip": 1.24342918, "balance_loss_mlp": 1.00988841, "epoch": 0.8945738764467158, "flos": 21152164533120.0, "grad_norm": 1.5493044936778437, "language_loss": 0.768888, "learning_rate": 1.1540161557291539e-07, "loss": 0.79323989, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19750977, "step": 14879, "time_per_iteration": 2.841632604598999 }, { "auxiliary_loss_clip": 0.0139597, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.23679388, "balance_loss_mlp": 1.01564968, "epoch": 0.8946339996993837, "flos": 14911230954240.0, "grad_norm": 1.7249583936240358, "language_loss": 0.75621605, "learning_rate": 1.1527127047447538e-07, "loss": 0.78052223, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19006348, "step": 14880, "time_per_iteration": 4.230382442474365 }, { "auxiliary_loss_clip": 0.01393199, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.23558426, "balance_loss_mlp": 1.01756334, "epoch": 0.8946941229520518, "flos": 27393912518400.0, "grad_norm": 1.7978872943104527, "language_loss": 0.83764017, "learning_rate": 1.1514099684513822e-07, "loss": 0.861947, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19921875, "step": 14881, "time_per_iteration": 2.9411323070526123 }, { "auxiliary_loss_clip": 0.01383421, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.22706556, "balance_loss_mlp": 1.01150858, "epoch": 0.8947542462047197, "flos": 31808986433280.0, "grad_norm": 1.5609375632349982, "language_loss": 0.67684889, "learning_rate": 1.1501079468984287e-07, "loss": 0.70099843, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.20019531, "step": 14882, "time_per_iteration": 2.9044177532196045 }, { "auxiliary_loss_clip": 0.01420941, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.25680602, "balance_loss_mlp": 1.01251304, "epoch": 0.8948143694573877, "flos": 20892585260160.0, "grad_norm": 2.037925081755209, "language_loss": 0.76603061, "learning_rate": 1.1488066401352691e-07, "loss": 0.79056209, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19702148, "step": 14883, "time_per_iteration": 2.8201096057891846 }, { "auxiliary_loss_clip": 0.01382266, "auxiliary_loss_mlp": 0.0103081, "balance_loss_clip": 1.22731042, "balance_loss_mlp": 1.0131191, "epoch": 0.8948744927100556, "flos": 28226003235840.0, "grad_norm": 1.55723128940774, "language_loss": 0.73238635, "learning_rate": 1.147506048211253e-07, "loss": 0.75651705, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.17700195, "step": 14884, "time_per_iteration": 2.8887739181518555 }, { "auxiliary_loss_clip": 0.01382147, "auxiliary_loss_mlp": 0.01028794, "balance_loss_clip": 1.22615659, "balance_loss_mlp": 1.01098442, "epoch": 0.8949346159627236, "flos": 21911266108800.0, "grad_norm": 1.823817247996896, "language_loss": 0.76435524, "learning_rate": 1.1462061711756987e-07, "loss": 0.78846467, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.17810059, "step": 14885, "time_per_iteration": 2.850036144256592 }, { "auxiliary_loss_clip": 0.01408502, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.2452724, "balance_loss_mlp": 1.01573133, "epoch": 0.8949947392153915, "flos": 21368553004800.0, "grad_norm": 1.7946325939684513, "language_loss": 0.81993598, "learning_rate": 1.1449070090778911e-07, "loss": 0.84437037, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19213867, "step": 14886, "time_per_iteration": 4.310011386871338 }, { "auxiliary_loss_clip": 0.01397064, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.23751688, "balance_loss_mlp": 1.01303792, "epoch": 0.8950548624680595, "flos": 52462922780160.0, "grad_norm": 1.5954864656282428, "language_loss": 0.64617127, "learning_rate": 1.1436085619671043e-07, "loss": 0.67045587, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18359375, "step": 14887, "time_per_iteration": 3.098759889602661 }, { "auxiliary_loss_clip": 0.014104, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.24755502, "balance_loss_mlp": 1.01555037, "epoch": 0.8951149857207275, "flos": 20131130954880.0, "grad_norm": 1.6616490969100226, "language_loss": 0.61832553, "learning_rate": 1.1423108298925698e-07, "loss": 0.64277947, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19433594, "step": 14888, "time_per_iteration": 2.838829278945923 }, { "auxiliary_loss_clip": 0.01403103, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.2410233, "balance_loss_mlp": 1.01221442, "epoch": 0.8951751089733955, "flos": 29874864666240.0, "grad_norm": 1.9296884324803931, "language_loss": 0.708332, "learning_rate": 1.1410138129034952e-07, "loss": 0.73267615, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19116211, "step": 14889, "time_per_iteration": 2.9052276611328125 }, { "auxiliary_loss_clip": 0.01393827, "auxiliary_loss_mlp": 0.01031812, "balance_loss_clip": 1.23375285, "balance_loss_mlp": 1.01234508, "epoch": 0.8952352322260634, "flos": 15269797146240.0, "grad_norm": 2.301357924661621, "language_loss": 0.71986437, "learning_rate": 1.1397175110490676e-07, "loss": 0.74412078, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19458008, "step": 14890, "time_per_iteration": 2.869342088699341 }, { "auxiliary_loss_clip": 0.01399781, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.23883581, "balance_loss_mlp": 1.01006246, "epoch": 0.8952953554787314, "flos": 26809908894720.0, "grad_norm": 1.9066392773876915, "language_loss": 0.76414704, "learning_rate": 1.1384219243784454e-07, "loss": 0.78842759, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18212891, "step": 14891, "time_per_iteration": 2.931612730026245 }, { "auxiliary_loss_clip": 0.01404365, "auxiliary_loss_mlp": 0.01031491, "balance_loss_clip": 1.23990583, "balance_loss_mlp": 1.01275182, "epoch": 0.8953554787313994, "flos": 14145342658560.0, "grad_norm": 1.6721689731768619, "language_loss": 0.77319825, "learning_rate": 1.1371270529407517e-07, "loss": 0.79755676, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.1875, "step": 14892, "time_per_iteration": 2.806199789047241 }, { "auxiliary_loss_clip": 0.01386187, "auxiliary_loss_mlp": 0.01031098, "balance_loss_clip": 1.22880614, "balance_loss_mlp": 1.01259685, "epoch": 0.8954156019840673, "flos": 25714483585920.0, "grad_norm": 1.325315830610486, "language_loss": 0.82040316, "learning_rate": 1.1358328967850895e-07, "loss": 0.844576, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18518066, "step": 14893, "time_per_iteration": 2.890418529510498 }, { "auxiliary_loss_clip": 0.01382994, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.22702777, "balance_loss_mlp": 1.01511049, "epoch": 0.8954757252367354, "flos": 21917962339200.0, "grad_norm": 3.7545082245550487, "language_loss": 0.75899899, "learning_rate": 1.1345394559605348e-07, "loss": 0.78315675, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.17663574, "step": 14894, "time_per_iteration": 2.8413522243499756 }, { "auxiliary_loss_clip": 0.01404904, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.24261653, "balance_loss_mlp": 1.01716614, "epoch": 0.8955358484894033, "flos": 12978421286400.0, "grad_norm": 1.8408517318934141, "language_loss": 0.67812973, "learning_rate": 1.1332467305161352e-07, "loss": 0.70255291, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20227051, "step": 14895, "time_per_iteration": 2.8283369541168213 }, { "auxiliary_loss_clip": 0.01411843, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.24851239, "balance_loss_mlp": 1.01287985, "epoch": 0.8955959717420713, "flos": 17282654144640.0, "grad_norm": 1.5196775360065886, "language_loss": 0.67725515, "learning_rate": 1.1319547205009094e-07, "loss": 0.701702, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19958496, "step": 14896, "time_per_iteration": 2.7953009605407715 }, { "auxiliary_loss_clip": 0.0139378, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.2352699, "balance_loss_mlp": 1.01327348, "epoch": 0.8956560949947392, "flos": 14802154444800.0, "grad_norm": 1.8080017313862775, "language_loss": 0.76605976, "learning_rate": 1.1306634259638492e-07, "loss": 0.79030818, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.17797852, "step": 14897, "time_per_iteration": 2.8564233779907227 }, { "auxiliary_loss_clip": 0.01177968, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.09113955, "balance_loss_mlp": 1.00789595, "epoch": 0.8957162182474072, "flos": 63637201906560.0, "grad_norm": 0.7385006455685597, "language_loss": 0.55388683, "learning_rate": 1.129372846953931e-07, "loss": 0.57594949, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.20410156, "step": 14898, "time_per_iteration": 3.386202573776245 }, { "auxiliary_loss_clip": 0.01395199, "auxiliary_loss_mlp": 0.0103222, "balance_loss_clip": 1.23552907, "balance_loss_mlp": 1.01252604, "epoch": 0.8957763415000751, "flos": 25020770025600.0, "grad_norm": 1.4362180586618565, "language_loss": 0.7107861, "learning_rate": 1.12808298352008e-07, "loss": 0.73506033, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19702148, "step": 14899, "time_per_iteration": 2.8619983196258545 }, { "auxiliary_loss_clip": 0.01406671, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.24450707, "balance_loss_mlp": 1.01298249, "epoch": 0.8958364647527431, "flos": 19838133757440.0, "grad_norm": 1.636626987025454, "language_loss": 0.7455405, "learning_rate": 1.1267938357112106e-07, "loss": 0.7699306, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19360352, "step": 14900, "time_per_iteration": 2.856999635696411 }, { "auxiliary_loss_clip": 0.01179076, "auxiliary_loss_mlp": 0.01032499, "balance_loss_clip": 1.09018886, "balance_loss_mlp": 1.01333046, "epoch": 0.895896588005411, "flos": 65565939542400.0, "grad_norm": 0.7746626803559181, "language_loss": 0.61817622, "learning_rate": 1.1255054035762124e-07, "loss": 0.64029199, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.19140625, "step": 14901, "time_per_iteration": 3.3061769008636475 }, { "auxiliary_loss_clip": 0.01394149, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.23260975, "balance_loss_mlp": 1.01305592, "epoch": 0.8959567112580791, "flos": 25601425534080.0, "grad_norm": 2.3287152942435516, "language_loss": 0.71372098, "learning_rate": 1.1242176871639441e-07, "loss": 0.73798561, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19250488, "step": 14902, "time_per_iteration": 2.903937339782715 }, { "auxiliary_loss_clip": 0.01388267, "auxiliary_loss_mlp": 0.01034436, "balance_loss_clip": 1.23088074, "balance_loss_mlp": 1.01551723, "epoch": 0.896016834510747, "flos": 24211102746240.0, "grad_norm": 1.7676821768179656, "language_loss": 0.78718889, "learning_rate": 1.1229306865232313e-07, "loss": 0.81141597, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18908691, "step": 14903, "time_per_iteration": 2.867157220840454 }, { "auxiliary_loss_clip": 0.01398191, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.2350235, "balance_loss_mlp": 1.01296568, "epoch": 0.896076957763415, "flos": 23086331544960.0, "grad_norm": 1.6341616573086277, "language_loss": 0.73379493, "learning_rate": 1.121644401702877e-07, "loss": 0.75811183, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20532227, "step": 14904, "time_per_iteration": 2.8379688262939453 }, { "auxiliary_loss_clip": 0.0139645, "auxiliary_loss_mlp": 0.01030689, "balance_loss_clip": 1.2359159, "balance_loss_mlp": 1.0106734, "epoch": 0.8961370810160829, "flos": 22246730190720.0, "grad_norm": 1.7949229245403966, "language_loss": 0.75535077, "learning_rate": 1.12035883275166e-07, "loss": 0.77962214, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.20019531, "step": 14905, "time_per_iteration": 2.858520746231079 }, { "auxiliary_loss_clip": 0.01385885, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.22833455, "balance_loss_mlp": 1.01102519, "epoch": 0.8961972042687509, "flos": 23081761820160.0, "grad_norm": 5.3755787293201545, "language_loss": 0.77256852, "learning_rate": 1.1190739797183279e-07, "loss": 0.7967366, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19897461, "step": 14906, "time_per_iteration": 2.8435401916503906 }, { "auxiliary_loss_clip": 0.01404407, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.24331141, "balance_loss_mlp": 1.01742136, "epoch": 0.896257327521419, "flos": 18194746947840.0, "grad_norm": 2.3232191294148876, "language_loss": 0.75303197, "learning_rate": 1.1177898426515996e-07, "loss": 0.77743518, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18518066, "step": 14907, "time_per_iteration": 2.820803165435791 }, { "auxiliary_loss_clip": 0.01394845, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.23784971, "balance_loss_mlp": 1.0137738, "epoch": 0.8963174507740869, "flos": 17904645417600.0, "grad_norm": 1.8198733721957683, "language_loss": 0.83378613, "learning_rate": 1.1165064216001785e-07, "loss": 0.85805619, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18383789, "step": 14908, "time_per_iteration": 2.804722785949707 }, { "auxiliary_loss_clip": 0.01397811, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.23571122, "balance_loss_mlp": 1.01179123, "epoch": 0.8963775740267549, "flos": 21042002148480.0, "grad_norm": 2.309145626475353, "language_loss": 0.71202546, "learning_rate": 1.1152237166127232e-07, "loss": 0.73631477, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.1932373, "step": 14909, "time_per_iteration": 2.8672733306884766 }, { "auxiliary_loss_clip": 0.01413637, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.25255084, "balance_loss_mlp": 1.01498091, "epoch": 0.8964376972794228, "flos": 23188530844800.0, "grad_norm": 1.7280629106692729, "language_loss": 0.73180878, "learning_rate": 1.113941727737877e-07, "loss": 0.75629091, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19604492, "step": 14910, "time_per_iteration": 2.849710702896118 }, { "auxiliary_loss_clip": 0.01397528, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.23850727, "balance_loss_mlp": 1.00973916, "epoch": 0.8964978205320908, "flos": 24983823006720.0, "grad_norm": 2.2212249552120635, "language_loss": 0.630817, "learning_rate": 1.1126604550242502e-07, "loss": 0.65507019, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18029785, "step": 14911, "time_per_iteration": 2.87783145904541 }, { "auxiliary_loss_clip": 0.01408324, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.24801481, "balance_loss_mlp": 1.01502514, "epoch": 0.8965579437847587, "flos": 19181412460800.0, "grad_norm": 1.7527952485621654, "language_loss": 0.7599721, "learning_rate": 1.111379898520437e-07, "loss": 0.78439909, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19335938, "step": 14912, "time_per_iteration": 4.268129587173462 }, { "auxiliary_loss_clip": 0.01397739, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.23691404, "balance_loss_mlp": 1.01580858, "epoch": 0.8966180670374267, "flos": 24286535107200.0, "grad_norm": 1.8766779004103074, "language_loss": 0.82718575, "learning_rate": 1.1101000582749876e-07, "loss": 0.85152054, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19909668, "step": 14913, "time_per_iteration": 2.903698682785034 }, { "auxiliary_loss_clip": 0.0140104, "auxiliary_loss_mlp": 0.01035646, "balance_loss_clip": 1.24047494, "balance_loss_mlp": 1.01586962, "epoch": 0.8966781902900947, "flos": 13561112810880.0, "grad_norm": 56.69349409892734, "language_loss": 0.62459493, "learning_rate": 1.1088209343364407e-07, "loss": 0.64896184, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19787598, "step": 14914, "time_per_iteration": 2.8519060611724854 }, { "auxiliary_loss_clip": 0.01179211, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.09173584, "balance_loss_mlp": 1.01085782, "epoch": 0.8967383135427627, "flos": 65095582152960.0, "grad_norm": 0.7242255757736406, "language_loss": 0.55092877, "learning_rate": 1.1075425267532956e-07, "loss": 0.57302308, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.19335938, "step": 14915, "time_per_iteration": 4.791531085968018 }, { "auxiliary_loss_clip": 0.01395277, "auxiliary_loss_mlp": 0.01031981, "balance_loss_clip": 1.23725939, "balance_loss_mlp": 1.01314592, "epoch": 0.8967984367954306, "flos": 29724271413120.0, "grad_norm": 1.45635249999403, "language_loss": 0.71809006, "learning_rate": 1.1062648355740289e-07, "loss": 0.74236262, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18847656, "step": 14916, "time_per_iteration": 2.960327625274658 }, { "auxiliary_loss_clip": 0.0138529, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.22759879, "balance_loss_mlp": 1.01311469, "epoch": 0.8968585600480986, "flos": 25713442955520.0, "grad_norm": 2.0352971314126433, "language_loss": 0.78555697, "learning_rate": 1.1049878608470931e-07, "loss": 0.80972725, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.1862793, "step": 14917, "time_per_iteration": 2.939617395401001 }, { "auxiliary_loss_clip": 0.01411089, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.24739909, "balance_loss_mlp": 1.01977277, "epoch": 0.8969186833007665, "flos": 30056885072640.0, "grad_norm": 1.9108909284943532, "language_loss": 0.69075787, "learning_rate": 1.1037116026209137e-07, "loss": 0.71526426, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19787598, "step": 14918, "time_per_iteration": 2.910747528076172 }, { "auxiliary_loss_clip": 0.01397265, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.23615313, "balance_loss_mlp": 1.01392961, "epoch": 0.8969788065534345, "flos": 22827838147200.0, "grad_norm": 1.676263001795645, "language_loss": 0.84351969, "learning_rate": 1.102436060943881e-07, "loss": 0.86781627, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18457031, "step": 14919, "time_per_iteration": 2.8515143394470215 }, { "auxiliary_loss_clip": 0.01405145, "auxiliary_loss_mlp": 0.01032999, "balance_loss_clip": 1.24297309, "balance_loss_mlp": 1.01317418, "epoch": 0.8970389298061026, "flos": 13269382467840.0, "grad_norm": 3.1196640515329004, "language_loss": 0.73933429, "learning_rate": 1.1011612358643696e-07, "loss": 0.76371574, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19824219, "step": 14920, "time_per_iteration": 2.7997632026672363 }, { "auxiliary_loss_clip": 0.01405262, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.24382746, "balance_loss_mlp": 1.01464856, "epoch": 0.8970990530587705, "flos": 10272257930880.0, "grad_norm": 3.1000559566748174, "language_loss": 0.91980624, "learning_rate": 1.0998871274307164e-07, "loss": 0.94419986, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19433594, "step": 14921, "time_per_iteration": 4.2373206615448 }, { "auxiliary_loss_clip": 0.01408037, "auxiliary_loss_mlp": 0.01037504, "balance_loss_clip": 1.24475861, "balance_loss_mlp": 1.01705956, "epoch": 0.8971591763114385, "flos": 20312291710080.0, "grad_norm": 1.914666733288133, "language_loss": 0.74038523, "learning_rate": 1.0986137356912384e-07, "loss": 0.7648406, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.2043457, "step": 14922, "time_per_iteration": 4.234920263290405 }, { "auxiliary_loss_clip": 0.0139145, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.23246312, "balance_loss_mlp": 1.01384759, "epoch": 0.8972192995641064, "flos": 23267130341760.0, "grad_norm": 1.8180971742544967, "language_loss": 0.71134472, "learning_rate": 1.097341060694219e-07, "loss": 0.73559427, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19665527, "step": 14923, "time_per_iteration": 2.834177017211914 }, { "auxiliary_loss_clip": 0.01398751, "auxiliary_loss_mlp": 0.01033585, "balance_loss_clip": 1.23615098, "balance_loss_mlp": 1.01395094, "epoch": 0.8972794228167744, "flos": 18378848615040.0, "grad_norm": 1.878736373944294, "language_loss": 0.71606135, "learning_rate": 1.0960691024879221e-07, "loss": 0.74038476, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19628906, "step": 14924, "time_per_iteration": 2.8700478076934814 }, { "auxiliary_loss_clip": 0.01403662, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.24332809, "balance_loss_mlp": 1.01479495, "epoch": 0.8973395460694423, "flos": 23962789428480.0, "grad_norm": 1.4565493086420698, "language_loss": 0.73168337, "learning_rate": 1.0947978611205844e-07, "loss": 0.75604987, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18188477, "step": 14925, "time_per_iteration": 2.8877785205841064 }, { "auxiliary_loss_clip": 0.01410945, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.25035071, "balance_loss_mlp": 1.00963116, "epoch": 0.8973996693221103, "flos": 24981108318720.0, "grad_norm": 2.3559347196033196, "language_loss": 0.83450174, "learning_rate": 1.0935273366404008e-07, "loss": 0.85890305, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19555664, "step": 14926, "time_per_iteration": 2.870877981185913 }, { "auxiliary_loss_clip": 0.0140658, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.24570847, "balance_loss_mlp": 1.01918387, "epoch": 0.8974597925747783, "flos": 25749892281600.0, "grad_norm": 2.375419315999326, "language_loss": 0.79392922, "learning_rate": 1.092257529095555e-07, "loss": 0.81837463, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18774414, "step": 14927, "time_per_iteration": 2.883622884750366 }, { "auxiliary_loss_clip": 0.01398464, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.23910952, "balance_loss_mlp": 1.01229584, "epoch": 0.8975199158274463, "flos": 38086278848640.0, "grad_norm": 1.5226393584668971, "language_loss": 0.67530394, "learning_rate": 1.0909884385341994e-07, "loss": 0.6996063, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19494629, "step": 14928, "time_per_iteration": 3.0160059928894043 }, { "auxiliary_loss_clip": 0.01400528, "auxiliary_loss_mlp": 0.0103766, "balance_loss_clip": 1.23836458, "balance_loss_mlp": 1.01715565, "epoch": 0.8975800390801142, "flos": 25422843732480.0, "grad_norm": 1.939274561686016, "language_loss": 0.72040343, "learning_rate": 1.0897200650044602e-07, "loss": 0.74478531, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20507812, "step": 14929, "time_per_iteration": 2.9110107421875 }, { "auxiliary_loss_clip": 0.01407425, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.24767363, "balance_loss_mlp": 1.01254141, "epoch": 0.8976401623327822, "flos": 21768907409280.0, "grad_norm": 1.6546910748837127, "language_loss": 0.6904453, "learning_rate": 1.0884524085544256e-07, "loss": 0.71482766, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18273926, "step": 14930, "time_per_iteration": 2.868788480758667 }, { "auxiliary_loss_clip": 0.01395043, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.23618078, "balance_loss_mlp": 1.01218343, "epoch": 0.8977002855854501, "flos": 13853386091520.0, "grad_norm": 1.9916619313856292, "language_loss": 0.75960696, "learning_rate": 1.0871854692321769e-07, "loss": 0.78386438, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18518066, "step": 14931, "time_per_iteration": 2.860804796218872 }, { "auxiliary_loss_clip": 0.01393655, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.23637474, "balance_loss_mlp": 1.01558268, "epoch": 0.8977604088381181, "flos": 19437010191360.0, "grad_norm": 1.7952407939825439, "language_loss": 0.63758951, "learning_rate": 1.0859192470857492e-07, "loss": 0.66187322, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19116211, "step": 14932, "time_per_iteration": 2.861637830734253 }, { "auxiliary_loss_clip": 0.01376937, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.22258329, "balance_loss_mlp": 1.01490927, "epoch": 0.8978205320907862, "flos": 22750957952640.0, "grad_norm": 1.8246003909372621, "language_loss": 0.72853547, "learning_rate": 1.0846537421631552e-07, "loss": 0.75263411, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18017578, "step": 14933, "time_per_iteration": 2.8492212295532227 }, { "auxiliary_loss_clip": 0.01411469, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.24728, "balance_loss_mlp": 1.01334262, "epoch": 0.8978806553434541, "flos": 21370317552000.0, "grad_norm": 1.5478708347058663, "language_loss": 0.75207663, "learning_rate": 1.0833889545123898e-07, "loss": 0.77651215, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18725586, "step": 14934, "time_per_iteration": 2.954237222671509 }, { "auxiliary_loss_clip": 0.01399934, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.24106884, "balance_loss_mlp": 1.01795006, "epoch": 0.8979407785961221, "flos": 20934237738240.0, "grad_norm": 1.6427031859079393, "language_loss": 0.61309105, "learning_rate": 1.0821248841814123e-07, "loss": 0.63746166, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19177246, "step": 14935, "time_per_iteration": 2.842212200164795 }, { "auxiliary_loss_clip": 0.01387426, "auxiliary_loss_mlp": 0.01031799, "balance_loss_clip": 1.23154426, "balance_loss_mlp": 1.0125227, "epoch": 0.89800090184879, "flos": 25239963674880.0, "grad_norm": 1.9109760084499046, "language_loss": 0.77227038, "learning_rate": 1.0808615312181512e-07, "loss": 0.79646266, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.19299316, "step": 14936, "time_per_iteration": 2.951972484588623 }, { "auxiliary_loss_clip": 0.01396198, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.23737168, "balance_loss_mlp": 1.01189661, "epoch": 0.898061025101458, "flos": 22572285661440.0, "grad_norm": 1.5271420033959466, "language_loss": 0.74475533, "learning_rate": 1.0795988956705193e-07, "loss": 0.76902223, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18591309, "step": 14937, "time_per_iteration": 2.901740312576294 }, { "auxiliary_loss_clip": 0.01176301, "auxiliary_loss_mlp": 0.01016654, "balance_loss_clip": 1.09088075, "balance_loss_mlp": 0.99643594, "epoch": 0.8981211483541259, "flos": 56217764286720.0, "grad_norm": 0.8502263214230118, "language_loss": 0.63510394, "learning_rate": 1.0783369775863915e-07, "loss": 0.65703356, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 0.20214844, "step": 14938, "time_per_iteration": 3.258723497390747 }, { "auxiliary_loss_clip": 0.0137847, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.22421932, "balance_loss_mlp": 1.01256835, "epoch": 0.898181271606794, "flos": 16399726254720.0, "grad_norm": 5.4935586022674245, "language_loss": 0.81055635, "learning_rate": 1.0770757770136251e-07, "loss": 0.83465803, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.19128418, "step": 14939, "time_per_iteration": 2.825197219848633 }, { "auxiliary_loss_clip": 0.01180306, "auxiliary_loss_mlp": 0.01030069, "balance_loss_clip": 1.09201455, "balance_loss_mlp": 1.00508273, "epoch": 0.8982413948594619, "flos": 63473460399360.0, "grad_norm": 0.7213614532845988, "language_loss": 0.52839673, "learning_rate": 1.0758152940000375e-07, "loss": 0.55050051, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.25, "step": 14940, "time_per_iteration": 3.402911901473999 }, { "auxiliary_loss_clip": 0.01396773, "auxiliary_loss_mlp": 0.01030462, "balance_loss_clip": 1.23574281, "balance_loss_mlp": 1.01152003, "epoch": 0.8983015181121299, "flos": 21845380400640.0, "grad_norm": 1.9574564760962954, "language_loss": 0.785375, "learning_rate": 1.0745555285934327e-07, "loss": 0.80964744, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18969727, "step": 14941, "time_per_iteration": 2.8725717067718506 }, { "auxiliary_loss_clip": 0.01410722, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.25037003, "balance_loss_mlp": 1.01503611, "epoch": 0.8983616413647978, "flos": 28961821722240.0, "grad_norm": 2.7604447514136816, "language_loss": 0.74139184, "learning_rate": 1.0732964808415834e-07, "loss": 0.76583982, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19042969, "step": 14942, "time_per_iteration": 2.9091529846191406 }, { "auxiliary_loss_clip": 0.01410894, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.24947667, "balance_loss_mlp": 1.01615882, "epoch": 0.8984217646174658, "flos": 17793985340160.0, "grad_norm": 2.0460097342123773, "language_loss": 0.80984998, "learning_rate": 1.0720381507922205e-07, "loss": 0.83431578, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1953125, "step": 14943, "time_per_iteration": 2.801833391189575 }, { "auxiliary_loss_clip": 0.01403687, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.24107623, "balance_loss_mlp": 1.01172197, "epoch": 0.8984818878701337, "flos": 23415054151680.0, "grad_norm": 1.668510459666449, "language_loss": 0.72063822, "learning_rate": 1.0707805384930701e-07, "loss": 0.74500096, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20874023, "step": 14944, "time_per_iteration": 2.843008041381836 }, { "auxiliary_loss_clip": 0.01412357, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.24839973, "balance_loss_mlp": 1.01244724, "epoch": 0.8985420111228017, "flos": 22356213903360.0, "grad_norm": 2.1135863761534726, "language_loss": 0.77556419, "learning_rate": 1.0695236439918187e-07, "loss": 0.80000818, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19592285, "step": 14945, "time_per_iteration": 2.9376699924468994 }, { "auxiliary_loss_clip": 0.01421656, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.25363564, "balance_loss_mlp": 1.01388144, "epoch": 0.8986021343754698, "flos": 21401473236480.0, "grad_norm": 2.1223793660880603, "language_loss": 0.74150407, "learning_rate": 1.0682674673361302e-07, "loss": 0.76605928, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.20007324, "step": 14946, "time_per_iteration": 2.8550164699554443 }, { "auxiliary_loss_clip": 0.01389827, "auxiliary_loss_mlp": 0.01031711, "balance_loss_clip": 1.2307601, "balance_loss_mlp": 1.01132607, "epoch": 0.8986622576281377, "flos": 21335497038720.0, "grad_norm": 1.9311676189088844, "language_loss": 0.64494491, "learning_rate": 1.0670120085736334e-07, "loss": 0.66916025, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.20385742, "step": 14947, "time_per_iteration": 4.283062934875488 }, { "auxiliary_loss_clip": 0.01396836, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.23928738, "balance_loss_mlp": 1.01359701, "epoch": 0.8987223808808057, "flos": 23998967285760.0, "grad_norm": 1.870229896600675, "language_loss": 0.70222372, "learning_rate": 1.0657572677519411e-07, "loss": 0.72652155, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19360352, "step": 14948, "time_per_iteration": 2.8663530349731445 }, { "auxiliary_loss_clip": 0.01397467, "auxiliary_loss_mlp": 0.0103308, "balance_loss_clip": 1.23760247, "balance_loss_mlp": 1.0137682, "epoch": 0.8987825041334736, "flos": 41516044594560.0, "grad_norm": 2.1654189303297593, "language_loss": 0.75112796, "learning_rate": 1.0645032449186309e-07, "loss": 0.77543342, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19299316, "step": 14949, "time_per_iteration": 3.1202852725982666 }, { "auxiliary_loss_clip": 0.01404648, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.2420485, "balance_loss_mlp": 1.01451337, "epoch": 0.8988426273861416, "flos": 27575978169600.0, "grad_norm": 1.6265615082958633, "language_loss": 0.76284313, "learning_rate": 1.0632499401212513e-07, "loss": 0.78723538, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20068359, "step": 14950, "time_per_iteration": 4.395023345947266 }, { "auxiliary_loss_clip": 0.01400448, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.2419064, "balance_loss_mlp": 1.01240993, "epoch": 0.8989027506388095, "flos": 17101583879040.0, "grad_norm": 1.763255679302362, "language_loss": 0.67292702, "learning_rate": 1.0619973534073334e-07, "loss": 0.69724131, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18579102, "step": 14951, "time_per_iteration": 2.859564781188965 }, { "auxiliary_loss_clip": 0.01424372, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.25763297, "balance_loss_mlp": 1.01361656, "epoch": 0.8989628738914776, "flos": 20563998387840.0, "grad_norm": 3.495001125493466, "language_loss": 0.7413038, "learning_rate": 1.0607454848243769e-07, "loss": 0.76586795, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.18432617, "step": 14952, "time_per_iteration": 2.8744077682495117 }, { "auxiliary_loss_clip": 0.01393941, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.23456264, "balance_loss_mlp": 1.01392865, "epoch": 0.8990229971441455, "flos": 16259177347200.0, "grad_norm": 2.1920551993502637, "language_loss": 0.57856786, "learning_rate": 1.0594943344198481e-07, "loss": 0.60283905, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19250488, "step": 14953, "time_per_iteration": 2.8160533905029297 }, { "auxiliary_loss_clip": 0.01391016, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.23242378, "balance_loss_mlp": 1.01706636, "epoch": 0.8990831203968135, "flos": 21991403928960.0, "grad_norm": 2.1179209743516805, "language_loss": 0.82806343, "learning_rate": 1.0582439022411915e-07, "loss": 0.85233998, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19555664, "step": 14954, "time_per_iteration": 2.861647367477417 }, { "auxiliary_loss_clip": 0.01390124, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.23368645, "balance_loss_mlp": 1.01778173, "epoch": 0.8991432436494814, "flos": 27456857314560.0, "grad_norm": 1.958462507660104, "language_loss": 0.61656201, "learning_rate": 1.0569941883358224e-07, "loss": 0.64082903, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18811035, "step": 14955, "time_per_iteration": 2.9080724716186523 }, { "auxiliary_loss_clip": 0.01391442, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.23484182, "balance_loss_mlp": 1.01257515, "epoch": 0.8992033669021494, "flos": 21589782670080.0, "grad_norm": 2.221927097287631, "language_loss": 0.55849826, "learning_rate": 1.0557451927511341e-07, "loss": 0.58272755, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18908691, "step": 14956, "time_per_iteration": 4.264169216156006 }, { "auxiliary_loss_clip": 0.01408367, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.24796593, "balance_loss_mlp": 1.01582468, "epoch": 0.8992634901548173, "flos": 28595744893440.0, "grad_norm": 1.5844502074665592, "language_loss": 0.80813986, "learning_rate": 1.0544969155344863e-07, "loss": 0.83257198, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19018555, "step": 14957, "time_per_iteration": 4.424845933914185 }, { "auxiliary_loss_clip": 0.01404273, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.24193096, "balance_loss_mlp": 1.01446986, "epoch": 0.8993236134074853, "flos": 19877569240320.0, "grad_norm": 1.9226351639369275, "language_loss": 0.79816294, "learning_rate": 1.0532493567332123e-07, "loss": 0.8225435, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19299316, "step": 14958, "time_per_iteration": 2.8851354122161865 }, { "auxiliary_loss_clip": 0.01407136, "auxiliary_loss_mlp": 0.0103267, "balance_loss_clip": 1.24886763, "balance_loss_mlp": 1.01455057, "epoch": 0.8993837366601534, "flos": 19400017927680.0, "grad_norm": 7.553448122320239, "language_loss": 0.75325584, "learning_rate": 1.0520025163946277e-07, "loss": 0.77765387, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18115234, "step": 14959, "time_per_iteration": 2.999762535095215 }, { "auxiliary_loss_clip": 0.01384201, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.2286706, "balance_loss_mlp": 1.01242018, "epoch": 0.8994438599128213, "flos": 18560959511040.0, "grad_norm": 1.924872951006241, "language_loss": 0.69754761, "learning_rate": 1.0507563945660015e-07, "loss": 0.72170252, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18859863, "step": 14960, "time_per_iteration": 2.8315882682800293 }, { "auxiliary_loss_clip": 0.01396522, "auxiliary_loss_mlp": 0.01035585, "balance_loss_clip": 1.23788929, "balance_loss_mlp": 1.01659513, "epoch": 0.8995039831654893, "flos": 24438757173120.0, "grad_norm": 1.7415309111215422, "language_loss": 0.66681945, "learning_rate": 1.049510991294591e-07, "loss": 0.69114053, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18994141, "step": 14961, "time_per_iteration": 2.891075372695923 }, { "auxiliary_loss_clip": 0.01380369, "auxiliary_loss_mlp": 0.01031788, "balance_loss_clip": 1.22446966, "balance_loss_mlp": 1.01319146, "epoch": 0.8995641064181572, "flos": 21261150552960.0, "grad_norm": 1.4488278201650604, "language_loss": 0.8323943, "learning_rate": 1.0482663066276254e-07, "loss": 0.85651588, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18615723, "step": 14962, "time_per_iteration": 2.8461315631866455 }, { "auxiliary_loss_clip": 0.01411645, "auxiliary_loss_mlp": 0.01033164, "balance_loss_clip": 1.24708915, "balance_loss_mlp": 1.01379275, "epoch": 0.8996242296708252, "flos": 23524130661120.0, "grad_norm": 1.8756818355817162, "language_loss": 0.76876247, "learning_rate": 1.047022340612298e-07, "loss": 0.79321051, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19384766, "step": 14963, "time_per_iteration": 2.877379894256592 }, { "auxiliary_loss_clip": 0.01177213, "auxiliary_loss_mlp": 0.01025403, "balance_loss_clip": 1.08902001, "balance_loss_mlp": 1.00203788, "epoch": 0.8996843529234931, "flos": 62432157150720.0, "grad_norm": 0.780399617482122, "language_loss": 0.57576913, "learning_rate": 1.0457790932957867e-07, "loss": 0.59779531, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.23339844, "step": 14964, "time_per_iteration": 3.186002016067505 }, { "auxiliary_loss_clip": 0.01426476, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.25964308, "balance_loss_mlp": 1.01736379, "epoch": 0.8997444761761612, "flos": 24245787525120.0, "grad_norm": 3.2292521408412562, "language_loss": 0.68731278, "learning_rate": 1.0445365647252269e-07, "loss": 0.71195865, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.2076416, "step": 14965, "time_per_iteration": 2.857436180114746 }, { "auxiliary_loss_clip": 0.01400555, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.23933947, "balance_loss_mlp": 1.01310706, "epoch": 0.8998045994288291, "flos": 21371041468800.0, "grad_norm": 1.9060932553691323, "language_loss": 0.73060673, "learning_rate": 1.0432947549477433e-07, "loss": 0.75493193, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18859863, "step": 14966, "time_per_iteration": 2.8818886280059814 }, { "auxiliary_loss_clip": 0.01390427, "auxiliary_loss_mlp": 0.01035178, "balance_loss_clip": 1.23192787, "balance_loss_mlp": 1.0158186, "epoch": 0.8998647226814971, "flos": 28997954334720.0, "grad_norm": 1.7734973693838822, "language_loss": 0.7404778, "learning_rate": 1.0420536640104205e-07, "loss": 0.76473391, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19360352, "step": 14967, "time_per_iteration": 2.899163007736206 }, { "auxiliary_loss_clip": 0.01392466, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.23332751, "balance_loss_mlp": 1.01056385, "epoch": 0.899924845934165, "flos": 13633785239040.0, "grad_norm": 2.304703582166835, "language_loss": 0.73390651, "learning_rate": 1.040813291960323e-07, "loss": 0.75812751, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19055176, "step": 14968, "time_per_iteration": 2.8371238708496094 }, { "auxiliary_loss_clip": 0.0140617, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.24568987, "balance_loss_mlp": 1.01206529, "epoch": 0.899984969186833, "flos": 20891363650560.0, "grad_norm": 1.7352290231196592, "language_loss": 0.70982742, "learning_rate": 1.0395736388444864e-07, "loss": 0.73419976, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19006348, "step": 14969, "time_per_iteration": 2.8017430305480957 }, { "auxiliary_loss_clip": 0.01404104, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.24130201, "balance_loss_mlp": 1.01040864, "epoch": 0.9000450924395009, "flos": 20931251581440.0, "grad_norm": 1.9178957988034635, "language_loss": 0.76958382, "learning_rate": 1.0383347047099201e-07, "loss": 0.79391825, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18933105, "step": 14970, "time_per_iteration": 2.861865520477295 }, { "auxiliary_loss_clip": 0.01401644, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.24046826, "balance_loss_mlp": 1.0123713, "epoch": 0.900105215692169, "flos": 17173849104000.0, "grad_norm": 1.9662780210821396, "language_loss": 0.73737633, "learning_rate": 1.0370964896035972e-07, "loss": 0.7617054, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18896484, "step": 14971, "time_per_iteration": 2.88590669631958 }, { "auxiliary_loss_clip": 0.01394231, "auxiliary_loss_mlp": 0.01031075, "balance_loss_clip": 1.23384798, "balance_loss_mlp": 1.01137018, "epoch": 0.900165338944837, "flos": 19939880609280.0, "grad_norm": 2.296111038577219, "language_loss": 0.82553566, "learning_rate": 1.035858993572476e-07, "loss": 0.84978873, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19714355, "step": 14972, "time_per_iteration": 2.8654143810272217 }, { "auxiliary_loss_clip": 0.01418121, "auxiliary_loss_mlp": 0.01033379, "balance_loss_clip": 1.25204074, "balance_loss_mlp": 1.015378, "epoch": 0.9002254621975049, "flos": 16115642282880.0, "grad_norm": 1.980231841510622, "language_loss": 0.82392263, "learning_rate": 1.0346222166634855e-07, "loss": 0.84843761, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 1.66015625, "router_z_loss_mlp": 0.18017578, "step": 14973, "time_per_iteration": 2.8257462978363037 }, { "auxiliary_loss_clip": 0.0139526, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.23574245, "balance_loss_mlp": 1.0160954, "epoch": 0.9002855854501729, "flos": 28487980483200.0, "grad_norm": 1.6491419786867012, "language_loss": 0.59016705, "learning_rate": 1.0333861589235193e-07, "loss": 0.61447978, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19921875, "step": 14974, "time_per_iteration": 2.875247001647949 }, { "auxiliary_loss_clip": 0.0141473, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.25316072, "balance_loss_mlp": 1.01798272, "epoch": 0.9003457087028408, "flos": 25641449199360.0, "grad_norm": 1.6670736863557445, "language_loss": 0.63985741, "learning_rate": 1.0321508203994489e-07, "loss": 0.66437542, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19104004, "step": 14975, "time_per_iteration": 2.9198639392852783 }, { "auxiliary_loss_clip": 0.01411055, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.24937809, "balance_loss_mlp": 1.01517367, "epoch": 0.9004058319555088, "flos": 24400136096640.0, "grad_norm": 1.6829378313851888, "language_loss": 0.73473549, "learning_rate": 1.0309162011381257e-07, "loss": 0.75918633, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1887207, "step": 14976, "time_per_iteration": 2.8396847248077393 }, { "auxiliary_loss_clip": 0.01407502, "auxiliary_loss_mlp": 0.01036986, "balance_loss_clip": 1.2477088, "balance_loss_mlp": 1.01722121, "epoch": 0.9004659552081767, "flos": 29071214945280.0, "grad_norm": 1.7887874369945571, "language_loss": 0.70540631, "learning_rate": 1.0296823011863565e-07, "loss": 0.72985125, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19763184, "step": 14977, "time_per_iteration": 2.9585163593292236 }, { "auxiliary_loss_clip": 0.01403214, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.24240136, "balance_loss_mlp": 1.01292455, "epoch": 0.9005260784608448, "flos": 16772589803520.0, "grad_norm": 2.6954913168255126, "language_loss": 0.67419869, "learning_rate": 1.0284491205909351e-07, "loss": 0.69855917, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19909668, "step": 14978, "time_per_iteration": 2.803304672241211 }, { "auxiliary_loss_clip": 0.01415929, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.25214171, "balance_loss_mlp": 1.01665759, "epoch": 0.9005862017135127, "flos": 20385688055040.0, "grad_norm": 1.7523069480390823, "language_loss": 0.79761994, "learning_rate": 1.0272166593986286e-07, "loss": 0.822142, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19616699, "step": 14979, "time_per_iteration": 2.8385813236236572 }, { "auxiliary_loss_clip": 0.01178208, "auxiliary_loss_mlp": 0.01028488, "balance_loss_clip": 1.09115016, "balance_loss_mlp": 1.00826979, "epoch": 0.9006463249661807, "flos": 67609499777280.0, "grad_norm": 0.7275611645191706, "language_loss": 0.53740919, "learning_rate": 1.0259849176561642e-07, "loss": 0.55947614, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.20214844, "step": 14980, "time_per_iteration": 3.343348264694214 }, { "auxiliary_loss_clip": 0.01414813, "auxiliary_loss_mlp": 0.01038471, "balance_loss_clip": 1.25033975, "balance_loss_mlp": 1.01902795, "epoch": 0.9007064482188486, "flos": 28305055180800.0, "grad_norm": 1.9670491609016096, "language_loss": 0.82888305, "learning_rate": 1.0247538954102553e-07, "loss": 0.85341585, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19445801, "step": 14981, "time_per_iteration": 2.8850014209747314 }, { "auxiliary_loss_clip": 0.01383415, "auxiliary_loss_mlp": 0.01032891, "balance_loss_clip": 1.22708631, "balance_loss_mlp": 1.01303101, "epoch": 0.9007665714715166, "flos": 21626277240960.0, "grad_norm": 2.9168919393766983, "language_loss": 0.82213718, "learning_rate": 1.0235235927075758e-07, "loss": 0.84630024, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1986084, "step": 14982, "time_per_iteration": 4.35743522644043 }, { "auxiliary_loss_clip": 0.01390504, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.23480701, "balance_loss_mlp": 1.01153731, "epoch": 0.9008266947241845, "flos": 26553089554560.0, "grad_norm": 1.803294039112789, "language_loss": 0.72382247, "learning_rate": 1.0222940095947885e-07, "loss": 0.74802577, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18273926, "step": 14983, "time_per_iteration": 2.918055534362793 }, { "auxiliary_loss_clip": 0.01389704, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.231969, "balance_loss_mlp": 1.01356685, "epoch": 0.9008868179768525, "flos": 23120428141440.0, "grad_norm": 2.627502837861305, "language_loss": 0.75351715, "learning_rate": 1.0210651461185115e-07, "loss": 0.77773303, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18334961, "step": 14984, "time_per_iteration": 2.8451180458068848 }, { "auxiliary_loss_clip": 0.01383384, "auxiliary_loss_mlp": 0.01034394, "balance_loss_clip": 1.22763896, "balance_loss_mlp": 1.01441503, "epoch": 0.9009469412295206, "flos": 19069711752960.0, "grad_norm": 1.7382208170488438, "language_loss": 0.70725578, "learning_rate": 1.0198370023253456e-07, "loss": 0.73143357, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.1998291, "step": 14985, "time_per_iteration": 4.348379135131836 }, { "auxiliary_loss_clip": 0.01397534, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.23454261, "balance_loss_mlp": 1.01693201, "epoch": 0.9010070644821885, "flos": 23232807521280.0, "grad_norm": 1.9889175541751458, "language_loss": 0.71050274, "learning_rate": 1.0186095782618643e-07, "loss": 0.73483741, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19018555, "step": 14986, "time_per_iteration": 2.8853344917297363 }, { "auxiliary_loss_clip": 0.01397664, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.2365191, "balance_loss_mlp": 1.01424479, "epoch": 0.9010671877348565, "flos": 17393902404480.0, "grad_norm": 1.6212834883149296, "language_loss": 0.7752974, "learning_rate": 1.0173828739746104e-07, "loss": 0.79960859, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19213867, "step": 14987, "time_per_iteration": 2.8055732250213623 }, { "auxiliary_loss_clip": 0.01387167, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.22918677, "balance_loss_mlp": 1.01316893, "epoch": 0.9011273109875244, "flos": 21918188563200.0, "grad_norm": 1.8418701423583734, "language_loss": 0.74332774, "learning_rate": 1.0161568895100981e-07, "loss": 0.76752174, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19042969, "step": 14988, "time_per_iteration": 2.8713104724884033 }, { "auxiliary_loss_clip": 0.01411988, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.24758935, "balance_loss_mlp": 1.0127871, "epoch": 0.9011874342401924, "flos": 24071413489920.0, "grad_norm": 1.7673051977403775, "language_loss": 0.70089817, "learning_rate": 1.0149316249148188e-07, "loss": 0.72535896, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.2130127, "step": 14989, "time_per_iteration": 2.8861498832702637 }, { "auxiliary_loss_clip": 0.01403765, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.24244189, "balance_loss_mlp": 1.01346016, "epoch": 0.9012475574928603, "flos": 16766210286720.0, "grad_norm": 2.2272823174433634, "language_loss": 0.80886698, "learning_rate": 1.0137070802352376e-07, "loss": 0.83323109, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19177246, "step": 14990, "time_per_iteration": 2.819833278656006 }, { "auxiliary_loss_clip": 0.01417772, "auxiliary_loss_mlp": 0.01033847, "balance_loss_clip": 1.25280166, "balance_loss_mlp": 1.01519072, "epoch": 0.9013076807455284, "flos": 19979768540160.0, "grad_norm": 1.951679508232669, "language_loss": 0.78490353, "learning_rate": 1.0124832555177842e-07, "loss": 0.80941981, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18676758, "step": 14991, "time_per_iteration": 2.9604649543762207 }, { "auxiliary_loss_clip": 0.01176999, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 1.08870077, "balance_loss_mlp": 1.001755, "epoch": 0.9013678039981963, "flos": 65210992934400.0, "grad_norm": 0.784085663637888, "language_loss": 0.60251749, "learning_rate": 1.0112601508088726e-07, "loss": 0.62455308, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.24804688, "step": 14992, "time_per_iteration": 6.019509315490723 }, { "auxiliary_loss_clip": 0.01384536, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.22718239, "balance_loss_mlp": 1.01084828, "epoch": 0.9014279272508643, "flos": 20530761442560.0, "grad_norm": 1.8612018048384171, "language_loss": 0.8357805, "learning_rate": 1.0100377661548764e-07, "loss": 0.85992509, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19067383, "step": 14993, "time_per_iteration": 2.873408555984497 }, { "auxiliary_loss_clip": 0.01403276, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.24285293, "balance_loss_mlp": 1.01663029, "epoch": 0.9014880505035322, "flos": 17317384168320.0, "grad_norm": 1.8971110779131388, "language_loss": 0.74283946, "learning_rate": 1.0088161016021502e-07, "loss": 0.76723671, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19812012, "step": 14994, "time_per_iteration": 2.855670928955078 }, { "auxiliary_loss_clip": 0.01387416, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.22985959, "balance_loss_mlp": 1.01570857, "epoch": 0.9015481737562002, "flos": 28414629383040.0, "grad_norm": 1.6628790613384987, "language_loss": 0.65608901, "learning_rate": 1.0075951571970187e-07, "loss": 0.68030816, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18786621, "step": 14995, "time_per_iteration": 2.9220998287200928 }, { "auxiliary_loss_clip": 0.01403245, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.24146116, "balance_loss_mlp": 1.01359892, "epoch": 0.9016082970088681, "flos": 29764747526400.0, "grad_norm": 1.582819939730781, "language_loss": 0.6725412, "learning_rate": 1.0063749329857873e-07, "loss": 0.69690084, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19116211, "step": 14996, "time_per_iteration": 2.981539249420166 }, { "auxiliary_loss_clip": 0.01397168, "auxiliary_loss_mlp": 0.01030301, "balance_loss_clip": 1.23869622, "balance_loss_mlp": 1.01157355, "epoch": 0.9016684202615362, "flos": 23523587723520.0, "grad_norm": 1.6129012922050416, "language_loss": 0.66521561, "learning_rate": 1.0051554290147168e-07, "loss": 0.68949032, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18725586, "step": 14997, "time_per_iteration": 2.8603146076202393 }, { "auxiliary_loss_clip": 0.01392977, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.23483324, "balance_loss_mlp": 1.01325345, "epoch": 0.9017285435142042, "flos": 16987123238400.0, "grad_norm": 1.8555927297354655, "language_loss": 0.78992313, "learning_rate": 1.0039366453300613e-07, "loss": 0.81417596, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19055176, "step": 14998, "time_per_iteration": 2.8485498428344727 }, { "auxiliary_loss_clip": 0.01401228, "auxiliary_loss_mlp": 0.01035307, "balance_loss_clip": 1.23945761, "balance_loss_mlp": 1.01603103, "epoch": 0.9017886667668721, "flos": 21402197153280.0, "grad_norm": 2.3749138137584995, "language_loss": 0.75646943, "learning_rate": 1.0027185819780281e-07, "loss": 0.78083479, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19274902, "step": 14999, "time_per_iteration": 2.823133945465088 }, { "auxiliary_loss_clip": 0.01396407, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.23626876, "balance_loss_mlp": 1.01335788, "epoch": 0.9018487900195401, "flos": 21006502963200.0, "grad_norm": 2.3510407211616724, "language_loss": 0.76840413, "learning_rate": 1.0015012390048117e-07, "loss": 0.79269564, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19384766, "step": 15000, "time_per_iteration": 2.8289384841918945 }, { "auxiliary_loss_clip": 0.01396415, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.23801541, "balance_loss_mlp": 1.01091409, "epoch": 0.901908913272208, "flos": 53376508661760.0, "grad_norm": 2.209401656370959, "language_loss": 0.81900781, "learning_rate": 1.0002846164565704e-07, "loss": 0.8432579, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.17675781, "step": 15001, "time_per_iteration": 3.1479477882385254 }, { "auxiliary_loss_clip": 0.01396529, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.2383225, "balance_loss_mlp": 1.01415622, "epoch": 0.901969036524876, "flos": 22099711276800.0, "grad_norm": 1.401678750203868, "language_loss": 0.78901935, "learning_rate": 9.990687143794407e-08, "loss": 0.8133173, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19116211, "step": 15002, "time_per_iteration": 2.895836353302002 }, { "auxiliary_loss_clip": 0.01391644, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.23180294, "balance_loss_mlp": 1.01293254, "epoch": 0.9020291597775439, "flos": 23843759063040.0, "grad_norm": 1.8251259579086383, "language_loss": 0.69458812, "learning_rate": 9.978535328195347e-08, "loss": 0.71883351, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19970703, "step": 15003, "time_per_iteration": 2.8359463214874268 }, { "auxiliary_loss_clip": 0.0140026, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.23867273, "balance_loss_mlp": 1.01329255, "epoch": 0.902089283030212, "flos": 18333395573760.0, "grad_norm": 2.132241042124498, "language_loss": 0.86749351, "learning_rate": 9.9663907182292e-08, "loss": 0.89182216, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19311523, "step": 15004, "time_per_iteration": 2.847820997238159 }, { "auxiliary_loss_clip": 0.01404881, "auxiliary_loss_mlp": 0.01032855, "balance_loss_clip": 1.24324656, "balance_loss_mlp": 1.0139606, "epoch": 0.9021494062828799, "flos": 24180987692160.0, "grad_norm": 1.9964171982654428, "language_loss": 0.72663689, "learning_rate": 9.954253314356575e-08, "loss": 0.75101423, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18896484, "step": 15005, "time_per_iteration": 2.922121047973633 }, { "auxiliary_loss_clip": 0.01407934, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.24422526, "balance_loss_mlp": 1.01483488, "epoch": 0.9022095295355479, "flos": 21626955912960.0, "grad_norm": 1.8552956679601875, "language_loss": 0.72223759, "learning_rate": 9.942123117037748e-08, "loss": 0.74665689, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19165039, "step": 15006, "time_per_iteration": 2.895777940750122 }, { "auxiliary_loss_clip": 0.01410275, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.24738348, "balance_loss_mlp": 1.0144037, "epoch": 0.9022696527882158, "flos": 18733433264640.0, "grad_norm": 1.8689200073244538, "language_loss": 0.85644281, "learning_rate": 9.930000126732618e-08, "loss": 0.88088018, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1907959, "step": 15007, "time_per_iteration": 2.792048454284668 }, { "auxiliary_loss_clip": 0.0139065, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.2337203, "balance_loss_mlp": 1.01597166, "epoch": 0.9023297760408838, "flos": 26772599917440.0, "grad_norm": 1.6037615669652006, "language_loss": 0.79029715, "learning_rate": 9.917884343900928e-08, "loss": 0.81455684, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19348145, "step": 15008, "time_per_iteration": 2.91572904586792 }, { "auxiliary_loss_clip": 0.01382603, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.22876072, "balance_loss_mlp": 1.01499295, "epoch": 0.9023898992935517, "flos": 20532435500160.0, "grad_norm": 1.7147044535236158, "language_loss": 0.74146473, "learning_rate": 9.905775769002156e-08, "loss": 0.76563334, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.19274902, "step": 15009, "time_per_iteration": 2.831244707107544 }, { "auxiliary_loss_clip": 0.01398648, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.23994482, "balance_loss_mlp": 1.01293182, "epoch": 0.9024500225462198, "flos": 17465443712640.0, "grad_norm": 2.7128486828665785, "language_loss": 0.73880959, "learning_rate": 9.893674402495399e-08, "loss": 0.76312065, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1953125, "step": 15010, "time_per_iteration": 2.8113389015197754 }, { "auxiliary_loss_clip": 0.01407043, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.24693799, "balance_loss_mlp": 1.01532257, "epoch": 0.9025101457988878, "flos": 20823351436800.0, "grad_norm": 1.99455696976596, "language_loss": 0.75501668, "learning_rate": 9.881580244839538e-08, "loss": 0.77943575, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19543457, "step": 15011, "time_per_iteration": 2.889805793762207 }, { "auxiliary_loss_clip": 0.01411411, "auxiliary_loss_mlp": 0.01031276, "balance_loss_clip": 1.24800968, "balance_loss_mlp": 1.01198792, "epoch": 0.9025702690515557, "flos": 19035796135680.0, "grad_norm": 1.8677627298355568, "language_loss": 0.74148774, "learning_rate": 9.869493296493204e-08, "loss": 0.76591462, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19299316, "step": 15012, "time_per_iteration": 2.8074276447296143 }, { "auxiliary_loss_clip": 0.01387589, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.23061287, "balance_loss_mlp": 1.01671767, "epoch": 0.9026303923042237, "flos": 19692336453120.0, "grad_norm": 1.7717504150444503, "language_loss": 0.69703615, "learning_rate": 9.857413557914763e-08, "loss": 0.72126484, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18566895, "step": 15013, "time_per_iteration": 2.889007091522217 }, { "auxiliary_loss_clip": 0.01383092, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.22859371, "balance_loss_mlp": 1.01306581, "epoch": 0.9026905155568916, "flos": 24618832053120.0, "grad_norm": 1.3489306351923243, "language_loss": 0.73534811, "learning_rate": 9.845341029562249e-08, "loss": 0.75949991, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.19030762, "step": 15014, "time_per_iteration": 2.874812602996826 }, { "auxiliary_loss_clip": 0.01394211, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.23454189, "balance_loss_mlp": 1.01038325, "epoch": 0.9027506388095596, "flos": 20531485359360.0, "grad_norm": 1.8420466350413023, "language_loss": 0.72740126, "learning_rate": 9.833275711893474e-08, "loss": 0.75163269, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1854248, "step": 15015, "time_per_iteration": 2.8220744132995605 }, { "auxiliary_loss_clip": 0.01398633, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.23852837, "balance_loss_mlp": 1.01309645, "epoch": 0.9028107620622275, "flos": 22794917915520.0, "grad_norm": 1.9065097691585, "language_loss": 0.69909894, "learning_rate": 9.821217605365895e-08, "loss": 0.72340333, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18688965, "step": 15016, "time_per_iteration": 2.858342170715332 }, { "auxiliary_loss_clip": 0.01393332, "auxiliary_loss_mlp": 0.01030998, "balance_loss_clip": 1.23552549, "balance_loss_mlp": 1.01336646, "epoch": 0.9028708853148956, "flos": 25421395898880.0, "grad_norm": 1.6732392843849997, "language_loss": 0.71488458, "learning_rate": 9.809166710436855e-08, "loss": 0.73912787, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1763916, "step": 15017, "time_per_iteration": 4.3809874057769775 }, { "auxiliary_loss_clip": 0.014092, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.2496686, "balance_loss_mlp": 1.01642215, "epoch": 0.9029310085675635, "flos": 21881241544320.0, "grad_norm": 1.5442089545404236, "language_loss": 0.69876087, "learning_rate": 9.797123027563237e-08, "loss": 0.72320431, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18725586, "step": 15018, "time_per_iteration": 2.911013126373291 }, { "auxiliary_loss_clip": 0.01399269, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.23852718, "balance_loss_mlp": 1.01305342, "epoch": 0.9029911318202315, "flos": 26225769536640.0, "grad_norm": 1.6767336079138262, "language_loss": 0.70076513, "learning_rate": 9.785086557201782e-08, "loss": 0.7250815, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19299316, "step": 15019, "time_per_iteration": 2.960486650466919 }, { "auxiliary_loss_clip": 0.013854, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.22866464, "balance_loss_mlp": 1.01456106, "epoch": 0.9030512550728994, "flos": 15969935468160.0, "grad_norm": 2.762514776339677, "language_loss": 0.72668833, "learning_rate": 9.773057299808951e-08, "loss": 0.75088108, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19299316, "step": 15020, "time_per_iteration": 4.405425548553467 }, { "auxiliary_loss_clip": 0.01403441, "auxiliary_loss_mlp": 0.0103191, "balance_loss_clip": 1.24171424, "balance_loss_mlp": 1.01263404, "epoch": 0.9031113783255674, "flos": 23998107634560.0, "grad_norm": 2.0677393131178476, "language_loss": 0.74754089, "learning_rate": 9.7610352558408e-08, "loss": 0.77189445, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19287109, "step": 15021, "time_per_iteration": 2.8666322231292725 }, { "auxiliary_loss_clip": 0.01400379, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.23742533, "balance_loss_mlp": 1.01512122, "epoch": 0.9031715015782353, "flos": 22247499352320.0, "grad_norm": 2.2259277771936787, "language_loss": 0.73236609, "learning_rate": 9.749020425753251e-08, "loss": 0.75671399, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19299316, "step": 15022, "time_per_iteration": 2.8310277462005615 }, { "auxiliary_loss_clip": 0.0137265, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.22125196, "balance_loss_mlp": 1.01390314, "epoch": 0.9032316248309034, "flos": 26334393598080.0, "grad_norm": 1.9181883458393991, "language_loss": 0.73353249, "learning_rate": 9.737012810001943e-08, "loss": 0.75759339, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.1953125, "step": 15023, "time_per_iteration": 2.8623573780059814 }, { "auxiliary_loss_clip": 0.01402009, "auxiliary_loss_mlp": 0.01035288, "balance_loss_clip": 1.2421937, "balance_loss_mlp": 1.01582074, "epoch": 0.9032917480835713, "flos": 22646586902400.0, "grad_norm": 1.7447037284930753, "language_loss": 0.83346581, "learning_rate": 9.725012409042155e-08, "loss": 0.85783887, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19470215, "step": 15024, "time_per_iteration": 2.889345407485962 }, { "auxiliary_loss_clip": 0.0140653, "auxiliary_loss_mlp": 0.01030511, "balance_loss_clip": 1.24455285, "balance_loss_mlp": 1.01218808, "epoch": 0.9033518713362393, "flos": 23889393083520.0, "grad_norm": 1.4974614148044234, "language_loss": 0.69978881, "learning_rate": 9.713019223328966e-08, "loss": 0.72415918, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18334961, "step": 15025, "time_per_iteration": 2.89776611328125 }, { "auxiliary_loss_clip": 0.01395027, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.23648298, "balance_loss_mlp": 1.0136888, "epoch": 0.9034119945889073, "flos": 26916225471360.0, "grad_norm": 1.6217589118594695, "language_loss": 0.7773329, "learning_rate": 9.70103325331717e-08, "loss": 0.80161488, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19494629, "step": 15026, "time_per_iteration": 2.9505929946899414 }, { "auxiliary_loss_clip": 0.01395809, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.23795485, "balance_loss_mlp": 1.0141362, "epoch": 0.9034721178415752, "flos": 20859936497280.0, "grad_norm": 2.531609208181517, "language_loss": 0.69276571, "learning_rate": 9.68905449946129e-08, "loss": 0.71704423, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.17883301, "step": 15027, "time_per_iteration": 4.281956195831299 }, { "auxiliary_loss_clip": 0.01372874, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.22014272, "balance_loss_mlp": 1.01320994, "epoch": 0.9035322410942432, "flos": 22243834523520.0, "grad_norm": 1.570357919657266, "language_loss": 0.76448053, "learning_rate": 9.677082962215477e-08, "loss": 0.78852773, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.18640137, "step": 15028, "time_per_iteration": 2.8731536865234375 }, { "auxiliary_loss_clip": 0.01385492, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.22818744, "balance_loss_mlp": 1.013556, "epoch": 0.9035923643469111, "flos": 25934944089600.0, "grad_norm": 1.7318960744603689, "language_loss": 0.70051277, "learning_rate": 9.665118642033765e-08, "loss": 0.72468722, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18395996, "step": 15029, "time_per_iteration": 2.921647071838379 }, { "auxiliary_loss_clip": 0.01405163, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.24270773, "balance_loss_mlp": 1.01536453, "epoch": 0.9036524875995792, "flos": 20349555442560.0, "grad_norm": 2.1276346148857277, "language_loss": 0.750489, "learning_rate": 9.653161539369858e-08, "loss": 0.77489007, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19555664, "step": 15030, "time_per_iteration": 2.863332509994507 }, { "auxiliary_loss_clip": 0.01412097, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.24922347, "balance_loss_mlp": 1.01859736, "epoch": 0.9037126108522471, "flos": 40129386635520.0, "grad_norm": 1.7423074441692434, "language_loss": 0.68818277, "learning_rate": 9.641211654677151e-08, "loss": 0.71268308, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1932373, "step": 15031, "time_per_iteration": 3.0670485496520996 }, { "auxiliary_loss_clip": 0.01386736, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.22867918, "balance_loss_mlp": 1.01563644, "epoch": 0.9037727341049151, "flos": 23342653192320.0, "grad_norm": 1.572226001459117, "language_loss": 0.7745223, "learning_rate": 9.629268988408723e-08, "loss": 0.79873955, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19348145, "step": 15032, "time_per_iteration": 2.960700273513794 }, { "auxiliary_loss_clip": 0.01409025, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.24824607, "balance_loss_mlp": 1.01495326, "epoch": 0.903832857357583, "flos": 12830180762880.0, "grad_norm": 1.7770680348802017, "language_loss": 0.76693094, "learning_rate": 9.617333541017502e-08, "loss": 0.79135674, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18603516, "step": 15033, "time_per_iteration": 2.801617383956909 }, { "auxiliary_loss_clip": 0.01402147, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.24232554, "balance_loss_mlp": 1.01713419, "epoch": 0.903892980610251, "flos": 25714121627520.0, "grad_norm": 1.7009051357532672, "language_loss": 0.74038076, "learning_rate": 9.605405312956105e-08, "loss": 0.76476014, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18640137, "step": 15034, "time_per_iteration": 2.9046735763549805 }, { "auxiliary_loss_clip": 0.01388247, "auxiliary_loss_mlp": 0.01032655, "balance_loss_clip": 1.23117232, "balance_loss_mlp": 1.01427317, "epoch": 0.9039531038629189, "flos": 14692263528960.0, "grad_norm": 1.6553627193769427, "language_loss": 0.64903688, "learning_rate": 9.593484304676791e-08, "loss": 0.67324585, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18395996, "step": 15035, "time_per_iteration": 2.8165464401245117 }, { "auxiliary_loss_clip": 0.0139976, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.24017739, "balance_loss_mlp": 1.01385117, "epoch": 0.904013227115587, "flos": 24035642835840.0, "grad_norm": 2.041605968614431, "language_loss": 0.63463056, "learning_rate": 9.581570516631643e-08, "loss": 0.65896368, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19714355, "step": 15036, "time_per_iteration": 2.854534387588501 }, { "auxiliary_loss_clip": 0.01379152, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.22464466, "balance_loss_mlp": 1.01233518, "epoch": 0.9040733503682549, "flos": 22866640202880.0, "grad_norm": 1.5981831636041934, "language_loss": 0.82825786, "learning_rate": 9.569663949272455e-08, "loss": 0.85236168, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.18896484, "step": 15037, "time_per_iteration": 2.838592052459717 }, { "auxiliary_loss_clip": 0.01406732, "auxiliary_loss_mlp": 0.0102902, "balance_loss_clip": 1.24263358, "balance_loss_mlp": 1.01029253, "epoch": 0.9041334736209229, "flos": 19984609733760.0, "grad_norm": 1.603441645381469, "language_loss": 0.68429041, "learning_rate": 9.557764603050667e-08, "loss": 0.70864785, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18725586, "step": 15038, "time_per_iteration": 2.813791275024414 }, { "auxiliary_loss_clip": 0.01387429, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.22876024, "balance_loss_mlp": 1.0138731, "epoch": 0.9041935968735909, "flos": 17539609219200.0, "grad_norm": 1.9047258911435485, "language_loss": 0.77112275, "learning_rate": 9.545872478417494e-08, "loss": 0.79532945, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19372559, "step": 15039, "time_per_iteration": 2.7967288494110107 }, { "auxiliary_loss_clip": 0.01389396, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.23224092, "balance_loss_mlp": 1.00848103, "epoch": 0.9042537201262588, "flos": 22789443294720.0, "grad_norm": 1.482906053111994, "language_loss": 0.70704746, "learning_rate": 9.533987575823977e-08, "loss": 0.7312178, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19152832, "step": 15040, "time_per_iteration": 2.834010124206543 }, { "auxiliary_loss_clip": 0.01402737, "auxiliary_loss_mlp": 0.01031226, "balance_loss_clip": 1.24480021, "balance_loss_mlp": 1.01233089, "epoch": 0.9043138433789268, "flos": 20605288907520.0, "grad_norm": 1.7462628048577862, "language_loss": 0.68475431, "learning_rate": 9.522109895720709e-08, "loss": 0.70909393, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18896484, "step": 15041, "time_per_iteration": 2.8524210453033447 }, { "auxiliary_loss_clip": 0.01387465, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.22862077, "balance_loss_mlp": 1.01122034, "epoch": 0.9043739666315948, "flos": 32975772071040.0, "grad_norm": 1.682550221538491, "language_loss": 0.58080959, "learning_rate": 9.510239438558155e-08, "loss": 0.60499239, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19580078, "step": 15042, "time_per_iteration": 2.9535481929779053 }, { "auxiliary_loss_clip": 0.01175694, "auxiliary_loss_mlp": 0.01019903, "balance_loss_clip": 1.08997345, "balance_loss_mlp": 1.0003531, "epoch": 0.9044340898842628, "flos": 67327678045440.0, "grad_norm": 0.7786547474183138, "language_loss": 0.56979036, "learning_rate": 9.498376204786351e-08, "loss": 0.59174633, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.1953125, "step": 15043, "time_per_iteration": 3.299680233001709 }, { "auxiliary_loss_clip": 0.01395811, "auxiliary_loss_mlp": 0.01033283, "balance_loss_clip": 1.23578882, "balance_loss_mlp": 1.0136137, "epoch": 0.9044942131369307, "flos": 17722308297600.0, "grad_norm": 1.8068257662490432, "language_loss": 0.70780671, "learning_rate": 9.486520194855274e-08, "loss": 0.73209769, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19665527, "step": 15044, "time_per_iteration": 2.8622124195098877 }, { "auxiliary_loss_clip": 0.01397602, "auxiliary_loss_mlp": 0.0103162, "balance_loss_clip": 1.23629081, "balance_loss_mlp": 1.01262999, "epoch": 0.9045543363895987, "flos": 17829213056640.0, "grad_norm": 2.228365855559933, "language_loss": 0.7045598, "learning_rate": 9.474671409214407e-08, "loss": 0.72885203, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18981934, "step": 15045, "time_per_iteration": 2.805387258529663 }, { "auxiliary_loss_clip": 0.01407631, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.24655294, "balance_loss_mlp": 1.01356387, "epoch": 0.9046144596422666, "flos": 21883006091520.0, "grad_norm": 1.7345568012434123, "language_loss": 0.66614592, "learning_rate": 9.462829848313081e-08, "loss": 0.69055796, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20019531, "step": 15046, "time_per_iteration": 2.9846582412719727 }, { "auxiliary_loss_clip": 0.01406296, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.24309409, "balance_loss_mlp": 1.01543212, "epoch": 0.9046745828949346, "flos": 17680746309120.0, "grad_norm": 1.9364562854649534, "language_loss": 0.62371111, "learning_rate": 9.450995512600379e-08, "loss": 0.64812052, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19226074, "step": 15047, "time_per_iteration": 2.7966630458831787 }, { "auxiliary_loss_clip": 0.01404319, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.24677336, "balance_loss_mlp": 1.0148735, "epoch": 0.9047347061476025, "flos": 25713533445120.0, "grad_norm": 1.5395964758557645, "language_loss": 0.71326667, "learning_rate": 9.439168402525032e-08, "loss": 0.73764914, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19067383, "step": 15048, "time_per_iteration": 2.901310682296753 }, { "auxiliary_loss_clip": 0.01394735, "auxiliary_loss_mlp": 0.01035577, "balance_loss_clip": 1.23394406, "balance_loss_mlp": 1.01534748, "epoch": 0.9047948294002706, "flos": 15166376236800.0, "grad_norm": 2.037991755486922, "language_loss": 0.75628757, "learning_rate": 9.427348518535483e-08, "loss": 0.78059071, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20239258, "step": 15049, "time_per_iteration": 2.8241183757781982 }, { "auxiliary_loss_clip": 0.01393467, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.2358216, "balance_loss_mlp": 1.01390767, "epoch": 0.9048549526529385, "flos": 21882463153920.0, "grad_norm": 1.7496697335475913, "language_loss": 0.76085734, "learning_rate": 9.415535861079993e-08, "loss": 0.78512073, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.1895752, "step": 15050, "time_per_iteration": 2.8752262592315674 }, { "auxiliary_loss_clip": 0.01405042, "auxiliary_loss_mlp": 0.01036258, "balance_loss_clip": 1.24264717, "balance_loss_mlp": 1.0171845, "epoch": 0.9049150759056065, "flos": 23556191241600.0, "grad_norm": 1.829112515292373, "language_loss": 0.82857478, "learning_rate": 9.403730430606472e-08, "loss": 0.85298789, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19055176, "step": 15051, "time_per_iteration": 2.855921745300293 }, { "auxiliary_loss_clip": 0.01399292, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.23952818, "balance_loss_mlp": 1.01314831, "epoch": 0.9049751991582745, "flos": 19655298944640.0, "grad_norm": 1.979868117487741, "language_loss": 0.90088952, "learning_rate": 9.391932227562582e-08, "loss": 0.92518842, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.17456055, "step": 15052, "time_per_iteration": 4.25718355178833 }, { "auxiliary_loss_clip": 0.01409107, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.24558401, "balance_loss_mlp": 1.01448393, "epoch": 0.9050353224109424, "flos": 15604627800960.0, "grad_norm": 2.2019581099372734, "language_loss": 0.774939, "learning_rate": 9.380141252395724e-08, "loss": 0.79936743, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19262695, "step": 15053, "time_per_iteration": 2.8298351764678955 }, { "auxiliary_loss_clip": 0.01392167, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.23349404, "balance_loss_mlp": 1.01451135, "epoch": 0.9050954456636104, "flos": 28195209509760.0, "grad_norm": 2.318823445917886, "language_loss": 0.73244053, "learning_rate": 9.368357505553049e-08, "loss": 0.75669479, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18762207, "step": 15054, "time_per_iteration": 2.888201951980591 }, { "auxiliary_loss_clip": 0.01394248, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.23517895, "balance_loss_mlp": 1.01110435, "epoch": 0.9051555689162784, "flos": 25740662342400.0, "grad_norm": 1.5772345556631429, "language_loss": 0.83562714, "learning_rate": 9.356580987481333e-08, "loss": 0.85986304, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18237305, "step": 15055, "time_per_iteration": 4.282470941543579 }, { "auxiliary_loss_clip": 0.01390027, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.23247635, "balance_loss_mlp": 1.01638675, "epoch": 0.9052156921689464, "flos": 23267401810560.0, "grad_norm": 1.7522026478643404, "language_loss": 0.85495543, "learning_rate": 9.344811698627176e-08, "loss": 0.87920159, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18212891, "step": 15056, "time_per_iteration": 2.854918956756592 }, { "auxiliary_loss_clip": 0.01403617, "auxiliary_loss_mlp": 0.01031374, "balance_loss_clip": 1.24281573, "balance_loss_mlp": 1.01303959, "epoch": 0.9052758154216143, "flos": 29574764035200.0, "grad_norm": 1.84902053353987, "language_loss": 0.73212969, "learning_rate": 9.333049639436863e-08, "loss": 0.75647956, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18347168, "step": 15057, "time_per_iteration": 2.897400379180908 }, { "auxiliary_loss_clip": 0.0138427, "auxiliary_loss_mlp": 0.01030484, "balance_loss_clip": 1.22955894, "balance_loss_mlp": 1.01083887, "epoch": 0.9053359386742823, "flos": 22137925150080.0, "grad_norm": 1.4569565934955013, "language_loss": 0.81466377, "learning_rate": 9.321294810356418e-08, "loss": 0.83881128, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.19641113, "step": 15058, "time_per_iteration": 2.8499104976654053 }, { "auxiliary_loss_clip": 0.01177044, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.08957326, "balance_loss_mlp": 1.01054001, "epoch": 0.9053960619269502, "flos": 67124591544960.0, "grad_norm": 0.6699087946403369, "language_loss": 0.51354086, "learning_rate": 9.309547211831592e-08, "loss": 0.53563321, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.21679688, "step": 15059, "time_per_iteration": 3.4356815814971924 }, { "auxiliary_loss_clip": 0.01398423, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.23697972, "balance_loss_mlp": 1.01167655, "epoch": 0.9054561851796182, "flos": 15823957184640.0, "grad_norm": 1.7220357325320055, "language_loss": 0.67530572, "learning_rate": 9.297806844307831e-08, "loss": 0.69958878, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18225098, "step": 15060, "time_per_iteration": 2.826357841491699 }, { "auxiliary_loss_clip": 0.01410777, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.24877429, "balance_loss_mlp": 1.0141716, "epoch": 0.9055163084322861, "flos": 17576058545280.0, "grad_norm": 2.5026388616703183, "language_loss": 0.64950371, "learning_rate": 9.286073708230357e-08, "loss": 0.67394751, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19445801, "step": 15061, "time_per_iteration": 2.811018943786621 }, { "auxiliary_loss_clip": 0.01394047, "auxiliary_loss_mlp": 0.01032586, "balance_loss_clip": 1.23421013, "balance_loss_mlp": 1.01280963, "epoch": 0.9055764316849542, "flos": 17648549994240.0, "grad_norm": 1.6993397126738374, "language_loss": 0.72265226, "learning_rate": 9.274347804044058e-08, "loss": 0.74691856, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19775391, "step": 15062, "time_per_iteration": 4.286927938461304 }, { "auxiliary_loss_clip": 0.01395843, "auxiliary_loss_mlp": 0.01032831, "balance_loss_clip": 1.23727214, "balance_loss_mlp": 1.0138526, "epoch": 0.9056365549376221, "flos": 20130542772480.0, "grad_norm": 1.6449551996514218, "language_loss": 0.7124967, "learning_rate": 9.2626291321936e-08, "loss": 0.7367835, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18981934, "step": 15063, "time_per_iteration": 2.8913259506225586 }, { "auxiliary_loss_clip": 0.01381542, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.22672284, "balance_loss_mlp": 1.01193929, "epoch": 0.9056966781902901, "flos": 27609984276480.0, "grad_norm": 1.6383164382867228, "language_loss": 0.72884262, "learning_rate": 9.250917693123406e-08, "loss": 0.75295961, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18225098, "step": 15064, "time_per_iteration": 2.920121431350708 }, { "auxiliary_loss_clip": 0.0139818, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.23630393, "balance_loss_mlp": 1.01197267, "epoch": 0.9057568014429581, "flos": 25930781568000.0, "grad_norm": 4.6766154397594715, "language_loss": 0.71370071, "learning_rate": 9.23921348727752e-08, "loss": 0.73798859, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1862793, "step": 15065, "time_per_iteration": 2.8761324882507324 }, { "auxiliary_loss_clip": 0.01401478, "auxiliary_loss_mlp": 0.0103569, "balance_loss_clip": 1.24203408, "balance_loss_mlp": 1.01666379, "epoch": 0.905816924695626, "flos": 22940805709440.0, "grad_norm": 1.663514413521259, "language_loss": 0.64134359, "learning_rate": 9.227516515099743e-08, "loss": 0.66571534, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19030762, "step": 15066, "time_per_iteration": 2.9378576278686523 }, { "auxiliary_loss_clip": 0.01417144, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.2494204, "balance_loss_mlp": 1.01347542, "epoch": 0.905877047948294, "flos": 22165913698560.0, "grad_norm": 2.052818970766034, "language_loss": 0.80539751, "learning_rate": 9.215826777033675e-08, "loss": 0.82990289, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19909668, "step": 15067, "time_per_iteration": 2.875018358230591 }, { "auxiliary_loss_clip": 0.01403489, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.24167418, "balance_loss_mlp": 1.01251531, "epoch": 0.905937171200962, "flos": 15313259416320.0, "grad_norm": 1.700427815820377, "language_loss": 0.70498991, "learning_rate": 9.204144273522563e-08, "loss": 0.72935677, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.20690918, "step": 15068, "time_per_iteration": 2.9475185871124268 }, { "auxiliary_loss_clip": 0.01382269, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.22716057, "balance_loss_mlp": 1.01652062, "epoch": 0.90599729445363, "flos": 19472192663040.0, "grad_norm": 2.593862243588848, "language_loss": 0.8656615, "learning_rate": 9.19246900500943e-08, "loss": 0.88983411, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18481445, "step": 15069, "time_per_iteration": 2.801565647125244 }, { "auxiliary_loss_clip": 0.01410455, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.24716294, "balance_loss_mlp": 1.01164007, "epoch": 0.9060574177062979, "flos": 23743595779200.0, "grad_norm": 2.915105026426814, "language_loss": 0.60137868, "learning_rate": 9.180800971936987e-08, "loss": 0.62578809, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18835449, "step": 15070, "time_per_iteration": 2.8373055458068848 }, { "auxiliary_loss_clip": 0.0140741, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.2433883, "balance_loss_mlp": 1.00959897, "epoch": 0.9061175409589659, "flos": 17320008366720.0, "grad_norm": 5.5684998499968295, "language_loss": 0.82136333, "learning_rate": 9.169140174747724e-08, "loss": 0.84572768, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19433594, "step": 15071, "time_per_iteration": 2.8442718982696533 }, { "auxiliary_loss_clip": 0.01404383, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.241907, "balance_loss_mlp": 1.01454008, "epoch": 0.9061776642116338, "flos": 17785615052160.0, "grad_norm": 1.8525092867849862, "language_loss": 0.62575907, "learning_rate": 9.157486613883758e-08, "loss": 0.65014327, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19506836, "step": 15072, "time_per_iteration": 2.8864874839782715 }, { "auxiliary_loss_clip": 0.01397335, "auxiliary_loss_mlp": 0.01034426, "balance_loss_clip": 1.23783302, "balance_loss_mlp": 1.01488745, "epoch": 0.9062377874643018, "flos": 42793037861760.0, "grad_norm": 2.745093344131427, "language_loss": 0.73930812, "learning_rate": 9.145840289787021e-08, "loss": 0.76362568, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19555664, "step": 15073, "time_per_iteration": 3.0293798446655273 }, { "auxiliary_loss_clip": 0.01386535, "auxiliary_loss_mlp": 0.0102997, "balance_loss_clip": 1.23024511, "balance_loss_mlp": 1.01117074, "epoch": 0.9062979107169697, "flos": 16370018403840.0, "grad_norm": 1.8117981518229371, "language_loss": 0.817756, "learning_rate": 9.134201202899161e-08, "loss": 0.84192103, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18811035, "step": 15074, "time_per_iteration": 2.852825880050659 }, { "auxiliary_loss_clip": 0.01177497, "auxiliary_loss_mlp": 0.01021244, "balance_loss_clip": 1.08916712, "balance_loss_mlp": 0.99873775, "epoch": 0.9063580339696378, "flos": 69348588618240.0, "grad_norm": 0.7445175106688425, "language_loss": 0.52425981, "learning_rate": 9.122569353661513e-08, "loss": 0.54624718, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22460938, "step": 15075, "time_per_iteration": 3.3948209285736084 }, { "auxiliary_loss_clip": 0.01174469, "auxiliary_loss_mlp": 0.01017174, "balance_loss_clip": 1.08849669, "balance_loss_mlp": 0.99791002, "epoch": 0.9064181572223057, "flos": 58823673868800.0, "grad_norm": 0.7244433172132912, "language_loss": 0.6220957, "learning_rate": 9.11094474251517e-08, "loss": 0.64401209, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.19238281, "step": 15076, "time_per_iteration": 3.2045018672943115 }, { "auxiliary_loss_clip": 0.01387277, "auxiliary_loss_mlp": 0.01033273, "balance_loss_clip": 1.23003697, "balance_loss_mlp": 1.01505756, "epoch": 0.9064782804749737, "flos": 21772798462080.0, "grad_norm": 2.165120878380871, "language_loss": 0.82767075, "learning_rate": 9.09932736990091e-08, "loss": 0.85187626, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18225098, "step": 15077, "time_per_iteration": 2.8462929725646973 }, { "auxiliary_loss_clip": 0.01392999, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.23667955, "balance_loss_mlp": 1.01190591, "epoch": 0.9065384037276417, "flos": 21407264570880.0, "grad_norm": 2.0473128233237476, "language_loss": 0.84924424, "learning_rate": 9.08771723625934e-08, "loss": 0.87347329, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18005371, "step": 15078, "time_per_iteration": 2.8452298641204834 }, { "auxiliary_loss_clip": 0.01372864, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.21967745, "balance_loss_mlp": 1.01161027, "epoch": 0.9065985269803096, "flos": 38296197313920.0, "grad_norm": 1.6681643765382916, "language_loss": 0.66358525, "learning_rate": 9.076114342030617e-08, "loss": 0.68762028, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.19030762, "step": 15079, "time_per_iteration": 3.0046846866607666 }, { "auxiliary_loss_clip": 0.01387634, "auxiliary_loss_mlp": 0.01031962, "balance_loss_clip": 1.22904491, "balance_loss_mlp": 1.01357985, "epoch": 0.9066586502329776, "flos": 44833838163840.0, "grad_norm": 2.4334130044853435, "language_loss": 0.71264195, "learning_rate": 9.064518687654765e-08, "loss": 0.73683798, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18359375, "step": 15080, "time_per_iteration": 3.053250551223755 }, { "auxiliary_loss_clip": 0.01423978, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.25902438, "balance_loss_mlp": 1.01321769, "epoch": 0.9067187734856456, "flos": 18633043756800.0, "grad_norm": 2.823165874955985, "language_loss": 0.71764028, "learning_rate": 9.052930273571547e-08, "loss": 0.74220026, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18823242, "step": 15081, "time_per_iteration": 2.8097054958343506 }, { "auxiliary_loss_clip": 0.01387142, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.23059654, "balance_loss_mlp": 1.01631391, "epoch": 0.9067788967383136, "flos": 22757880407040.0, "grad_norm": 20.849941928366338, "language_loss": 0.74312508, "learning_rate": 9.04134910022032e-08, "loss": 0.76733863, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.17895508, "step": 15082, "time_per_iteration": 2.8673455715179443 }, { "auxiliary_loss_clip": 0.01390831, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.23307157, "balance_loss_mlp": 1.01703095, "epoch": 0.9068390199909815, "flos": 27681751808640.0, "grad_norm": 1.7421827828429575, "language_loss": 0.78724396, "learning_rate": 9.029775168040266e-08, "loss": 0.81151533, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19274902, "step": 15083, "time_per_iteration": 2.874119997024536 }, { "auxiliary_loss_clip": 0.01384554, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.22937846, "balance_loss_mlp": 1.01641941, "epoch": 0.9068991432436495, "flos": 24254836485120.0, "grad_norm": 1.5302329239129722, "language_loss": 0.69440472, "learning_rate": 9.01820847747028e-08, "loss": 0.7186023, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.18762207, "step": 15084, "time_per_iteration": 2.8608689308166504 }, { "auxiliary_loss_clip": 0.01398334, "auxiliary_loss_mlp": 0.01032842, "balance_loss_clip": 1.23905897, "balance_loss_mlp": 1.01442409, "epoch": 0.9069592664963174, "flos": 28043756605440.0, "grad_norm": 2.066108753794254, "language_loss": 0.6734342, "learning_rate": 9.006649028948965e-08, "loss": 0.69774598, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18408203, "step": 15085, "time_per_iteration": 2.913005828857422 }, { "auxiliary_loss_clip": 0.01176187, "auxiliary_loss_mlp": 0.01023287, "balance_loss_clip": 1.09106326, "balance_loss_mlp": 1.00516725, "epoch": 0.9070193897489854, "flos": 68806445713920.0, "grad_norm": 0.7958609430588538, "language_loss": 0.61353821, "learning_rate": 8.995096822914638e-08, "loss": 0.63553292, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.18164062, "step": 15086, "time_per_iteration": 3.350461721420288 }, { "auxiliary_loss_clip": 0.01387005, "auxiliary_loss_mlp": 0.01031422, "balance_loss_clip": 1.2295773, "balance_loss_mlp": 1.01246798, "epoch": 0.9070795130016533, "flos": 23452317884160.0, "grad_norm": 1.50717260446102, "language_loss": 0.73630655, "learning_rate": 8.983551859805416e-08, "loss": 0.76049083, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18945312, "step": 15087, "time_per_iteration": 4.326995134353638 }, { "auxiliary_loss_clip": 0.01391989, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.23322868, "balance_loss_mlp": 1.0115205, "epoch": 0.9071396362543214, "flos": 18925678995840.0, "grad_norm": 4.160169684458599, "language_loss": 0.77507055, "learning_rate": 8.972014140059058e-08, "loss": 0.7992897, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18408203, "step": 15088, "time_per_iteration": 2.8666350841522217 }, { "auxiliary_loss_clip": 0.01383067, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.22883642, "balance_loss_mlp": 1.01840675, "epoch": 0.9071997595069893, "flos": 25239601716480.0, "grad_norm": 2.2778523795424976, "language_loss": 0.74015784, "learning_rate": 8.960483664113038e-08, "loss": 0.76435691, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.18444824, "step": 15089, "time_per_iteration": 3.049724578857422 }, { "auxiliary_loss_clip": 0.01382966, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.22910237, "balance_loss_mlp": 1.01454902, "epoch": 0.9072598827596573, "flos": 24355678440960.0, "grad_norm": 2.067233158013782, "language_loss": 0.76130581, "learning_rate": 8.948960432404628e-08, "loss": 0.78546667, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.18579102, "step": 15090, "time_per_iteration": 2.869291067123413 }, { "auxiliary_loss_clip": 0.01408701, "auxiliary_loss_mlp": 0.01032758, "balance_loss_clip": 1.24615693, "balance_loss_mlp": 1.01349354, "epoch": 0.9073200060123253, "flos": 22685343713280.0, "grad_norm": 2.021321107565697, "language_loss": 0.78149235, "learning_rate": 8.93744444537079e-08, "loss": 0.80590695, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19250488, "step": 15091, "time_per_iteration": 4.293613910675049 }, { "auxiliary_loss_clip": 0.01380755, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.22708213, "balance_loss_mlp": 1.01262999, "epoch": 0.9073801292649932, "flos": 23705834353920.0, "grad_norm": 1.6038848613057661, "language_loss": 0.85968322, "learning_rate": 8.925935703448217e-08, "loss": 0.88379157, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.17456055, "step": 15092, "time_per_iteration": 2.9290308952331543 }, { "auxiliary_loss_clip": 0.01406769, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.24774182, "balance_loss_mlp": 1.01789045, "epoch": 0.9074402525176612, "flos": 25386620630400.0, "grad_norm": 1.6298829534969845, "language_loss": 0.79457545, "learning_rate": 8.914434207073296e-08, "loss": 0.8190006, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.1784668, "step": 15093, "time_per_iteration": 2.884214162826538 }, { "auxiliary_loss_clip": 0.01175474, "auxiliary_loss_mlp": 0.01019767, "balance_loss_clip": 1.08871436, "balance_loss_mlp": 1.00002599, "epoch": 0.9075003757703292, "flos": 67677195277440.0, "grad_norm": 0.7414614047336038, "language_loss": 0.57024771, "learning_rate": 8.902939956682188e-08, "loss": 0.59220016, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.19726562, "step": 15094, "time_per_iteration": 3.2655086517333984 }, { "auxiliary_loss_clip": 0.01410521, "auxiliary_loss_mlp": 0.01034946, "balance_loss_clip": 1.24817979, "balance_loss_mlp": 1.01525295, "epoch": 0.9075604990229972, "flos": 22463344886400.0, "grad_norm": 1.8834012573745251, "language_loss": 0.72232997, "learning_rate": 8.891452952710742e-08, "loss": 0.74678469, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19689941, "step": 15095, "time_per_iteration": 2.8991854190826416 }, { "auxiliary_loss_clip": 0.01394956, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.2368381, "balance_loss_mlp": 1.01545596, "epoch": 0.9076206222756651, "flos": 19546086700800.0, "grad_norm": 1.8249169883387788, "language_loss": 0.74896806, "learning_rate": 8.879973195594526e-08, "loss": 0.77326131, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18920898, "step": 15096, "time_per_iteration": 4.249268054962158 }, { "auxiliary_loss_clip": 0.01406057, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.24374008, "balance_loss_mlp": 1.01579547, "epoch": 0.9076807455283331, "flos": 30128924073600.0, "grad_norm": 1.988444705193047, "language_loss": 0.57539654, "learning_rate": 8.868500685768898e-08, "loss": 0.59981239, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19750977, "step": 15097, "time_per_iteration": 4.275999069213867 }, { "auxiliary_loss_clip": 0.01384565, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.22740507, "balance_loss_mlp": 1.00913501, "epoch": 0.907740868781001, "flos": 18706756815360.0, "grad_norm": 1.5827266778627698, "language_loss": 0.80199611, "learning_rate": 8.857035423668935e-08, "loss": 0.82610989, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.17687988, "step": 15098, "time_per_iteration": 2.8163399696350098 }, { "auxiliary_loss_clip": 0.01401728, "auxiliary_loss_mlp": 0.01029618, "balance_loss_clip": 1.2391082, "balance_loss_mlp": 1.01111627, "epoch": 0.907800992033669, "flos": 22649256345600.0, "grad_norm": 1.6406146409155598, "language_loss": 0.6653415, "learning_rate": 8.845577409729266e-08, "loss": 0.68965495, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18493652, "step": 15099, "time_per_iteration": 2.8763582706451416 }, { "auxiliary_loss_clip": 0.01409596, "auxiliary_loss_mlp": 0.01037471, "balance_loss_clip": 1.24734926, "balance_loss_mlp": 1.01792073, "epoch": 0.907861115286337, "flos": 21297464144640.0, "grad_norm": 1.918406221125474, "language_loss": 0.71257156, "learning_rate": 8.834126644384477e-08, "loss": 0.73704219, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19543457, "step": 15100, "time_per_iteration": 2.8390393257141113 }, { "auxiliary_loss_clip": 0.01177678, "auxiliary_loss_mlp": 0.01014051, "balance_loss_clip": 1.09142375, "balance_loss_mlp": 0.9963131, "epoch": 0.907921238539005, "flos": 69771004502400.0, "grad_norm": 0.6265732015261626, "language_loss": 0.53442717, "learning_rate": 8.822683128068775e-08, "loss": 0.55634451, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.17773438, "step": 15101, "time_per_iteration": 3.4344167709350586 }, { "auxiliary_loss_clip": 0.01398785, "auxiliary_loss_mlp": 0.01034941, "balance_loss_clip": 1.23967838, "balance_loss_mlp": 1.01541424, "epoch": 0.9079813617916729, "flos": 23487907559040.0, "grad_norm": 1.8665340191141773, "language_loss": 0.69185251, "learning_rate": 8.811246861216081e-08, "loss": 0.71618974, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1953125, "step": 15102, "time_per_iteration": 2.9113478660583496 }, { "auxiliary_loss_clip": 0.01388476, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.23101532, "balance_loss_mlp": 1.01331854, "epoch": 0.9080414850443409, "flos": 22940262771840.0, "grad_norm": 1.8624228031326542, "language_loss": 0.79834825, "learning_rate": 8.799817844260049e-08, "loss": 0.82255095, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18469238, "step": 15103, "time_per_iteration": 2.8403337001800537 }, { "auxiliary_loss_clip": 0.01392341, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.2320838, "balance_loss_mlp": 1.01711953, "epoch": 0.9081016082970089, "flos": 26188234335360.0, "grad_norm": 1.7893076026054535, "language_loss": 0.7281158, "learning_rate": 8.78839607763413e-08, "loss": 0.75239944, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18896484, "step": 15104, "time_per_iteration": 2.905101776123047 }, { "auxiliary_loss_clip": 0.01386617, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.22999489, "balance_loss_mlp": 1.01217794, "epoch": 0.9081617315496768, "flos": 24472853769600.0, "grad_norm": 1.7277364466417617, "language_loss": 0.77955115, "learning_rate": 8.77698156177138e-08, "loss": 0.80372471, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18554688, "step": 15105, "time_per_iteration": 2.936025619506836 }, { "auxiliary_loss_clip": 0.01397173, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.23772943, "balance_loss_mlp": 1.0164063, "epoch": 0.9082218548023449, "flos": 24755761376640.0, "grad_norm": 1.9599480610510869, "language_loss": 0.74068284, "learning_rate": 8.765574297104628e-08, "loss": 0.76500481, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18603516, "step": 15106, "time_per_iteration": 2.9089343547821045 }, { "auxiliary_loss_clip": 0.01403436, "auxiliary_loss_mlp": 0.01036096, "balance_loss_clip": 1.24262667, "balance_loss_mlp": 1.01609266, "epoch": 0.9082819780550128, "flos": 24430839333120.0, "grad_norm": 1.777705639420387, "language_loss": 0.81251121, "learning_rate": 8.754174284066462e-08, "loss": 0.83690655, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19995117, "step": 15107, "time_per_iteration": 2.8594207763671875 }, { "auxiliary_loss_clip": 0.01176287, "auxiliary_loss_mlp": 0.010159, "balance_loss_clip": 1.09005594, "balance_loss_mlp": 0.99511021, "epoch": 0.9083421013076808, "flos": 59642028149760.0, "grad_norm": 0.8099126663880823, "language_loss": 0.59637177, "learning_rate": 8.742781523089205e-08, "loss": 0.61829364, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.20800781, "step": 15108, "time_per_iteration": 3.324327230453491 }, { "auxiliary_loss_clip": 0.01397526, "auxiliary_loss_mlp": 0.01031056, "balance_loss_clip": 1.23574018, "balance_loss_mlp": 1.01235175, "epoch": 0.9084022245603487, "flos": 33633307774080.0, "grad_norm": 1.64750093921466, "language_loss": 0.74567276, "learning_rate": 8.73139601460482e-08, "loss": 0.76995862, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18725586, "step": 15109, "time_per_iteration": 2.9706268310546875 }, { "auxiliary_loss_clip": 0.01385573, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.2296524, "balance_loss_mlp": 1.01152742, "epoch": 0.9084623478130167, "flos": 24982194193920.0, "grad_norm": 1.558963460634957, "language_loss": 0.72656667, "learning_rate": 8.720017759045073e-08, "loss": 0.75072908, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.19152832, "step": 15110, "time_per_iteration": 2.893172264099121 }, { "auxiliary_loss_clip": 0.01392143, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.23514962, "balance_loss_mlp": 1.01585901, "epoch": 0.9085224710656846, "flos": 31472843679360.0, "grad_norm": 1.6951084814385649, "language_loss": 0.69459844, "learning_rate": 8.708646756841421e-08, "loss": 0.71886832, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18969727, "step": 15111, "time_per_iteration": 2.9517035484313965 }, { "auxiliary_loss_clip": 0.01176566, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.09072483, "balance_loss_mlp": 1.01012802, "epoch": 0.9085825943183526, "flos": 64945622327040.0, "grad_norm": 0.6965448566653105, "language_loss": 0.51862007, "learning_rate": 8.697283008425026e-08, "loss": 0.54073298, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.24609375, "step": 15112, "time_per_iteration": 3.357377290725708 }, { "auxiliary_loss_clip": 0.0139478, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.23540187, "balance_loss_mlp": 1.01397192, "epoch": 0.9086427175710206, "flos": 18962580769920.0, "grad_norm": 3.051126959636042, "language_loss": 0.71048641, "learning_rate": 8.685926514226837e-08, "loss": 0.73475814, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1842041, "step": 15113, "time_per_iteration": 2.8245859146118164 }, { "auxiliary_loss_clip": 0.01395313, "auxiliary_loss_mlp": 0.01030766, "balance_loss_clip": 1.23686934, "balance_loss_mlp": 1.01203823, "epoch": 0.9087028408236886, "flos": 34026332520960.0, "grad_norm": 1.9315613141334236, "language_loss": 0.79869199, "learning_rate": 8.674577274677508e-08, "loss": 0.82295281, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18725586, "step": 15114, "time_per_iteration": 2.942697048187256 }, { "auxiliary_loss_clip": 0.01416822, "auxiliary_loss_mlp": 0.01035371, "balance_loss_clip": 1.25215387, "balance_loss_mlp": 1.01543975, "epoch": 0.9087629640763565, "flos": 21954592644480.0, "grad_norm": 31.462972631070038, "language_loss": 0.71674025, "learning_rate": 8.663235290207405e-08, "loss": 0.7412622, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19934082, "step": 15115, "time_per_iteration": 2.837405204772949 }, { "auxiliary_loss_clip": 0.01417133, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.25197446, "balance_loss_mlp": 1.0172286, "epoch": 0.9088230873290245, "flos": 21773069930880.0, "grad_norm": 1.5427004004490101, "language_loss": 0.66014564, "learning_rate": 8.651900561246561e-08, "loss": 0.68468755, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19824219, "step": 15116, "time_per_iteration": 2.853137969970703 }, { "auxiliary_loss_clip": 0.01381039, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.22590947, "balance_loss_mlp": 1.01414025, "epoch": 0.9088832105816925, "flos": 21550663900800.0, "grad_norm": 1.9310747254237142, "language_loss": 0.69912398, "learning_rate": 8.640573088224812e-08, "loss": 0.72327447, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.19885254, "step": 15117, "time_per_iteration": 2.8283915519714355 }, { "auxiliary_loss_clip": 0.01392907, "auxiliary_loss_mlp": 0.01031511, "balance_loss_clip": 1.23455811, "balance_loss_mlp": 1.01271152, "epoch": 0.9089433338343604, "flos": 26008340434560.0, "grad_norm": 1.4728157815752887, "language_loss": 0.75456429, "learning_rate": 8.629252871571745e-08, "loss": 0.77880853, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18798828, "step": 15118, "time_per_iteration": 2.8804609775543213 }, { "auxiliary_loss_clip": 0.01418688, "auxiliary_loss_mlp": 0.01036089, "balance_loss_clip": 1.25196087, "balance_loss_mlp": 1.01614499, "epoch": 0.9090034570870285, "flos": 21188251900800.0, "grad_norm": 2.074008027988038, "language_loss": 0.73328334, "learning_rate": 8.617939911716554e-08, "loss": 0.7578311, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.19946289, "step": 15119, "time_per_iteration": 2.870600700378418 }, { "auxiliary_loss_clip": 0.01421046, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.25583291, "balance_loss_mlp": 1.01238382, "epoch": 0.9090635803396964, "flos": 16149919858560.0, "grad_norm": 2.3041320153256444, "language_loss": 0.71708417, "learning_rate": 8.60663420908827e-08, "loss": 0.74161196, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19335938, "step": 15120, "time_per_iteration": 2.779383659362793 }, { "auxiliary_loss_clip": 0.01407924, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.24680614, "balance_loss_mlp": 1.01200402, "epoch": 0.9091237035923644, "flos": 20600402469120.0, "grad_norm": 2.0967677456509426, "language_loss": 0.66720223, "learning_rate": 8.595335764115596e-08, "loss": 0.6916033, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.20166016, "step": 15121, "time_per_iteration": 2.8634984493255615 }, { "auxiliary_loss_clip": 0.01395472, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.23638558, "balance_loss_mlp": 1.01720548, "epoch": 0.9091838268450323, "flos": 52245629412480.0, "grad_norm": 2.0652921110401614, "language_loss": 0.71405101, "learning_rate": 8.58404457722699e-08, "loss": 0.73836935, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19140625, "step": 15122, "time_per_iteration": 4.5781755447387695 }, { "auxiliary_loss_clip": 0.01395746, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.23803425, "balance_loss_mlp": 1.01360548, "epoch": 0.9092439500977003, "flos": 20569653987840.0, "grad_norm": 1.479356186498681, "language_loss": 0.7507956, "learning_rate": 8.572760648850575e-08, "loss": 0.77507472, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18566895, "step": 15123, "time_per_iteration": 2.9025211334228516 }, { "auxiliary_loss_clip": 0.01385655, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.22939682, "balance_loss_mlp": 1.01595259, "epoch": 0.9093040733503682, "flos": 28628665125120.0, "grad_norm": 1.9464265572336512, "language_loss": 0.76597512, "learning_rate": 8.561483979414253e-08, "loss": 0.79018664, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.1953125, "step": 15124, "time_per_iteration": 2.932560443878174 }, { "auxiliary_loss_clip": 0.01394165, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.23685873, "balance_loss_mlp": 1.01339412, "epoch": 0.9093641966030362, "flos": 23450508092160.0, "grad_norm": 4.975815988473684, "language_loss": 0.73851418, "learning_rate": 8.55021456934566e-08, "loss": 0.76281607, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.22631836, "step": 15125, "time_per_iteration": 4.310215711593628 }, { "auxiliary_loss_clip": 0.01385232, "auxiliary_loss_mlp": 0.01033989, "balance_loss_clip": 1.22997487, "balance_loss_mlp": 1.01498687, "epoch": 0.9094243198557042, "flos": 16808903395200.0, "grad_norm": 1.6262684825920743, "language_loss": 0.79840219, "learning_rate": 8.538952419072143e-08, "loss": 0.8225944, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.19006348, "step": 15126, "time_per_iteration": 2.810014009475708 }, { "auxiliary_loss_clip": 0.01397978, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.24061596, "balance_loss_mlp": 1.01584148, "epoch": 0.9094844431083722, "flos": 24282236851200.0, "grad_norm": 1.677925969579953, "language_loss": 0.76517117, "learning_rate": 8.527697529020694e-08, "loss": 0.7894963, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18676758, "step": 15127, "time_per_iteration": 2.9155213832855225 }, { "auxiliary_loss_clip": 0.01397082, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.2361747, "balance_loss_mlp": 1.01727974, "epoch": 0.9095445663610401, "flos": 21954683134080.0, "grad_norm": 2.039281886024452, "language_loss": 0.63612115, "learning_rate": 8.516449899618173e-08, "loss": 0.6604628, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19799805, "step": 15128, "time_per_iteration": 2.83559250831604 }, { "auxiliary_loss_clip": 0.01386187, "auxiliary_loss_mlp": 0.01032908, "balance_loss_clip": 1.22916317, "balance_loss_mlp": 1.01371527, "epoch": 0.9096046896137081, "flos": 19802544082560.0, "grad_norm": 4.8008102315739904, "language_loss": 0.77268654, "learning_rate": 8.505209531291013e-08, "loss": 0.7968775, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.1920166, "step": 15129, "time_per_iteration": 2.8571431636810303 }, { "auxiliary_loss_clip": 0.01393431, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.23405313, "balance_loss_mlp": 1.01379204, "epoch": 0.909664812866376, "flos": 22648351449600.0, "grad_norm": 1.866536889574983, "language_loss": 0.83965182, "learning_rate": 8.49397642446552e-08, "loss": 0.86391318, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18908691, "step": 15130, "time_per_iteration": 2.8612613677978516 }, { "auxiliary_loss_clip": 0.01402034, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.2415942, "balance_loss_mlp": 1.01275563, "epoch": 0.909724936119044, "flos": 39865011413760.0, "grad_norm": 8.068537886758703, "language_loss": 0.75529552, "learning_rate": 8.482750579567644e-08, "loss": 0.77963024, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18676758, "step": 15131, "time_per_iteration": 3.097151041030884 }, { "auxiliary_loss_clip": 0.014019, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.24246025, "balance_loss_mlp": 1.01333714, "epoch": 0.9097850593717121, "flos": 35083001018880.0, "grad_norm": 2.7318293049800393, "language_loss": 0.60349196, "learning_rate": 8.471531997023085e-08, "loss": 0.62783784, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19348145, "step": 15132, "time_per_iteration": 5.710118055343628 }, { "auxiliary_loss_clip": 0.01397718, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.23977458, "balance_loss_mlp": 1.01582408, "epoch": 0.90984518262438, "flos": 23377835664000.0, "grad_norm": 1.3472164983443906, "language_loss": 0.8289814, "learning_rate": 8.460320677257193e-08, "loss": 0.85330081, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18395996, "step": 15133, "time_per_iteration": 2.8587417602539062 }, { "auxiliary_loss_clip": 0.01398015, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.23719072, "balance_loss_mlp": 1.0148747, "epoch": 0.909905305877048, "flos": 27533827998720.0, "grad_norm": 1.6954698397567738, "language_loss": 0.74595344, "learning_rate": 8.449116620695118e-08, "loss": 0.77026474, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18237305, "step": 15134, "time_per_iteration": 2.8762855529785156 }, { "auxiliary_loss_clip": 0.01423892, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.25779057, "balance_loss_mlp": 1.01690912, "epoch": 0.9099654291297159, "flos": 24357262008960.0, "grad_norm": 1.5809503843085893, "language_loss": 0.73850298, "learning_rate": 8.437919827761786e-08, "loss": 0.76310396, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19287109, "step": 15135, "time_per_iteration": 2.854510545730591 }, { "auxiliary_loss_clip": 0.01389789, "auxiliary_loss_mlp": 0.0103059, "balance_loss_clip": 1.23289144, "balance_loss_mlp": 1.01253009, "epoch": 0.9100255523823839, "flos": 21225153674880.0, "grad_norm": 1.8907806755836871, "language_loss": 0.70380354, "learning_rate": 8.426730298881702e-08, "loss": 0.72800732, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18066406, "step": 15136, "time_per_iteration": 2.8556480407714844 }, { "auxiliary_loss_clip": 0.01175971, "auxiliary_loss_mlp": 0.01025173, "balance_loss_clip": 1.09009862, "balance_loss_mlp": 1.00514627, "epoch": 0.9100856756350518, "flos": 46075576452480.0, "grad_norm": 0.8152814190759695, "language_loss": 0.59272122, "learning_rate": 8.415548034479214e-08, "loss": 0.61473268, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.20019531, "step": 15137, "time_per_iteration": 3.085944414138794 }, { "auxiliary_loss_clip": 0.01399798, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 1.23988986, "balance_loss_mlp": 1.01210332, "epoch": 0.9101457988877198, "flos": 20239528792320.0, "grad_norm": 1.5432992828819696, "language_loss": 0.83177704, "learning_rate": 8.40437303497834e-08, "loss": 0.85607755, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18151855, "step": 15138, "time_per_iteration": 2.8535947799682617 }, { "auxiliary_loss_clip": 0.01382462, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.22933483, "balance_loss_mlp": 1.01381731, "epoch": 0.9102059221403878, "flos": 26626485899520.0, "grad_norm": 5.220587164965373, "language_loss": 0.81661916, "learning_rate": 8.39320530080283e-08, "loss": 0.84077257, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.19042969, "step": 15139, "time_per_iteration": 2.9006919860839844 }, { "auxiliary_loss_clip": 0.01386382, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.22920978, "balance_loss_mlp": 1.01357603, "epoch": 0.9102660453930558, "flos": 21918686256000.0, "grad_norm": 2.3996005578753414, "language_loss": 0.78302234, "learning_rate": 8.382044832376167e-08, "loss": 0.80720574, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18383789, "step": 15140, "time_per_iteration": 2.8628859519958496 }, { "auxiliary_loss_clip": 0.01385574, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.22716081, "balance_loss_mlp": 1.01413107, "epoch": 0.9103261686457237, "flos": 36191049626880.0, "grad_norm": 1.6815110399493267, "language_loss": 0.6689955, "learning_rate": 8.370891630121569e-08, "loss": 0.6931802, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18786621, "step": 15141, "time_per_iteration": 3.0448150634765625 }, { "auxiliary_loss_clip": 0.01403432, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.24162507, "balance_loss_mlp": 1.01567185, "epoch": 0.9103862918983917, "flos": 23889121614720.0, "grad_norm": 1.841721287826934, "language_loss": 0.75770211, "learning_rate": 8.359745694462005e-08, "loss": 0.78208983, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19677734, "step": 15142, "time_per_iteration": 2.919987201690674 }, { "auxiliary_loss_clip": 0.01394847, "auxiliary_loss_mlp": 0.0103771, "balance_loss_clip": 1.23665881, "balance_loss_mlp": 1.01918483, "epoch": 0.9104464151510596, "flos": 14947770769920.0, "grad_norm": 1.9127252840448863, "language_loss": 0.65597969, "learning_rate": 8.348607025820076e-08, "loss": 0.6803053, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18518066, "step": 15143, "time_per_iteration": 2.9090960025787354 }, { "auxiliary_loss_clip": 0.01396903, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.23589921, "balance_loss_mlp": 1.01580715, "epoch": 0.9105065384037276, "flos": 33668671224960.0, "grad_norm": 2.333842782363234, "language_loss": 0.61863863, "learning_rate": 8.337475624618152e-08, "loss": 0.64295959, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19384766, "step": 15144, "time_per_iteration": 2.9157614707946777 }, { "auxiliary_loss_clip": 0.01374397, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.22300613, "balance_loss_mlp": 1.01234078, "epoch": 0.9105666616563957, "flos": 24327463668480.0, "grad_norm": 2.3736720224355716, "language_loss": 0.71624994, "learning_rate": 8.326351491278382e-08, "loss": 0.74029869, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 1.51464844, "router_z_loss_mlp": 0.18115234, "step": 15145, "time_per_iteration": 2.9644694328308105 }, { "auxiliary_loss_clip": 0.01381331, "auxiliary_loss_mlp": 0.01030124, "balance_loss_clip": 1.22749805, "balance_loss_mlp": 1.01145601, "epoch": 0.9106267849090636, "flos": 29983850686080.0, "grad_norm": 2.1503242749873257, "language_loss": 0.71506405, "learning_rate": 8.315234626222545e-08, "loss": 0.7391786, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.18664551, "step": 15146, "time_per_iteration": 2.9375815391540527 }, { "auxiliary_loss_clip": 0.01391466, "auxiliary_loss_mlp": 0.01035795, "balance_loss_clip": 1.2329644, "balance_loss_mlp": 1.01624441, "epoch": 0.9106869081617316, "flos": 25348632981120.0, "grad_norm": 2.296726091513083, "language_loss": 0.73746806, "learning_rate": 8.304125029872233e-08, "loss": 0.76174062, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19543457, "step": 15147, "time_per_iteration": 2.9135732650756836 }, { "auxiliary_loss_clip": 0.01404971, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.24228036, "balance_loss_mlp": 1.01295424, "epoch": 0.9107470314143995, "flos": 18196194781440.0, "grad_norm": 1.8365945791704876, "language_loss": 0.80950427, "learning_rate": 8.293022702648711e-08, "loss": 0.83387238, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18884277, "step": 15148, "time_per_iteration": 2.9536545276641846 }, { "auxiliary_loss_clip": 0.01403783, "auxiliary_loss_mlp": 0.01031758, "balance_loss_clip": 1.24185193, "balance_loss_mlp": 1.01324534, "epoch": 0.9108071546670675, "flos": 23561484883200.0, "grad_norm": 2.3329974114949983, "language_loss": 0.68966937, "learning_rate": 8.281927644972996e-08, "loss": 0.71402478, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18518066, "step": 15149, "time_per_iteration": 2.8825302124023438 }, { "auxiliary_loss_clip": 0.01408717, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.24721813, "balance_loss_mlp": 1.01513696, "epoch": 0.9108672779197354, "flos": 25641449199360.0, "grad_norm": 1.567015150447077, "language_loss": 0.64215642, "learning_rate": 8.270839857265776e-08, "loss": 0.6665839, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18896484, "step": 15150, "time_per_iteration": 2.907957077026367 }, { "auxiliary_loss_clip": 0.01411348, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.25093961, "balance_loss_mlp": 1.01408613, "epoch": 0.9109274011724035, "flos": 22347436412160.0, "grad_norm": 1.8308869809480104, "language_loss": 0.739815, "learning_rate": 8.259759339947514e-08, "loss": 0.76425761, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18823242, "step": 15151, "time_per_iteration": 2.861001491546631 }, { "auxiliary_loss_clip": 0.01386227, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.22897613, "balance_loss_mlp": 1.01171267, "epoch": 0.9109875244250714, "flos": 26699565530880.0, "grad_norm": 1.6347548072396723, "language_loss": 0.64770621, "learning_rate": 8.248686093438429e-08, "loss": 0.67187083, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18518066, "step": 15152, "time_per_iteration": 2.9067063331604004 }, { "auxiliary_loss_clip": 0.01390709, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 1.23134112, "balance_loss_mlp": 1.01028538, "epoch": 0.9110476476777394, "flos": 22940488995840.0, "grad_norm": 1.9150378056173416, "language_loss": 0.7373144, "learning_rate": 8.23762011815834e-08, "loss": 0.76152027, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19604492, "step": 15153, "time_per_iteration": 2.887770414352417 }, { "auxiliary_loss_clip": 0.01409233, "auxiliary_loss_mlp": 0.01032065, "balance_loss_clip": 1.24707496, "balance_loss_mlp": 1.01368284, "epoch": 0.9111077709304073, "flos": 13478848485120.0, "grad_norm": 3.549504024638522, "language_loss": 0.72767591, "learning_rate": 8.226561414526956e-08, "loss": 0.7520889, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18395996, "step": 15154, "time_per_iteration": 2.813575267791748 }, { "auxiliary_loss_clip": 0.01386149, "auxiliary_loss_mlp": 0.0103321, "balance_loss_clip": 1.23030245, "balance_loss_mlp": 1.01468539, "epoch": 0.9111678941830753, "flos": 20860434190080.0, "grad_norm": 1.7555069346544643, "language_loss": 0.82767272, "learning_rate": 8.215509982963564e-08, "loss": 0.85186636, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18530273, "step": 15155, "time_per_iteration": 2.891860008239746 }, { "auxiliary_loss_clip": 0.01392459, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.23602104, "balance_loss_mlp": 1.01327693, "epoch": 0.9112280174357432, "flos": 19691612536320.0, "grad_norm": 1.417939110633223, "language_loss": 0.60092556, "learning_rate": 8.204465823887252e-08, "loss": 0.6251716, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18884277, "step": 15156, "time_per_iteration": 2.8353707790374756 }, { "auxiliary_loss_clip": 0.01404613, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.24142075, "balance_loss_mlp": 1.0123558, "epoch": 0.9112881406884112, "flos": 25458071448960.0, "grad_norm": 2.1670765474565643, "language_loss": 0.74869698, "learning_rate": 8.193428937716796e-08, "loss": 0.77306068, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.1940918, "step": 15157, "time_per_iteration": 4.3113555908203125 }, { "auxiliary_loss_clip": 0.01406344, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.24540687, "balance_loss_mlp": 1.01504791, "epoch": 0.9113482639410793, "flos": 33078378574080.0, "grad_norm": 2.471589830470554, "language_loss": 0.59722912, "learning_rate": 8.182399324870747e-08, "loss": 0.62162471, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.1817627, "step": 15158, "time_per_iteration": 2.9585824012756348 }, { "auxiliary_loss_clip": 0.01386313, "auxiliary_loss_mlp": 0.01032712, "balance_loss_clip": 1.22926068, "balance_loss_mlp": 1.01463985, "epoch": 0.9114083871937472, "flos": 21845697114240.0, "grad_norm": 1.6050822550618948, "language_loss": 0.68107283, "learning_rate": 8.171376985767375e-08, "loss": 0.70526302, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18078613, "step": 15159, "time_per_iteration": 2.849174737930298 }, { "auxiliary_loss_clip": 0.01385599, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.22679043, "balance_loss_mlp": 1.01132894, "epoch": 0.9114685104464152, "flos": 27100327138560.0, "grad_norm": 2.6333685880415967, "language_loss": 0.78950489, "learning_rate": 8.160361920824588e-08, "loss": 0.81366587, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19177246, "step": 15160, "time_per_iteration": 4.325668811798096 }, { "auxiliary_loss_clip": 0.01410433, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.24930358, "balance_loss_mlp": 1.00940752, "epoch": 0.9115286336990831, "flos": 17975870012160.0, "grad_norm": 1.8310780616975546, "language_loss": 0.69265532, "learning_rate": 8.149354130460073e-08, "loss": 0.7170471, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1932373, "step": 15161, "time_per_iteration": 2.8188445568084717 }, { "auxiliary_loss_clip": 0.01397175, "auxiliary_loss_mlp": 0.01031228, "balance_loss_clip": 1.23775721, "balance_loss_mlp": 1.01191568, "epoch": 0.9115887569517511, "flos": 22940172282240.0, "grad_norm": 1.5670943958760184, "language_loss": 0.7690407, "learning_rate": 8.138353615091321e-08, "loss": 0.79332477, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.1932373, "step": 15162, "time_per_iteration": 2.8609020709991455 }, { "auxiliary_loss_clip": 0.01400975, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.24126077, "balance_loss_mlp": 1.01593351, "epoch": 0.911648880204419, "flos": 23999148264960.0, "grad_norm": 1.789910518853387, "language_loss": 0.67601269, "learning_rate": 8.127360375135395e-08, "loss": 0.7003758, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19396973, "step": 15163, "time_per_iteration": 2.8994369506835938 }, { "auxiliary_loss_clip": 0.01421176, "auxiliary_loss_mlp": 0.01035259, "balance_loss_clip": 1.25671768, "balance_loss_mlp": 1.01460004, "epoch": 0.911709003457087, "flos": 17064003432960.0, "grad_norm": 1.9224805025395915, "language_loss": 0.71383464, "learning_rate": 8.116374411009186e-08, "loss": 0.73839897, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20629883, "step": 15164, "time_per_iteration": 2.813257932662964 }, { "auxiliary_loss_clip": 0.01389321, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.23467577, "balance_loss_mlp": 1.01511741, "epoch": 0.911769126709755, "flos": 21663450483840.0, "grad_norm": 1.5317197331876806, "language_loss": 0.76726186, "learning_rate": 8.105395723129315e-08, "loss": 0.79148602, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.17980957, "step": 15165, "time_per_iteration": 2.843998908996582 }, { "auxiliary_loss_clip": 0.01406443, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.24507833, "balance_loss_mlp": 1.01524138, "epoch": 0.911829249962423, "flos": 24801033438720.0, "grad_norm": 3.3941007081692574, "language_loss": 0.73081261, "learning_rate": 8.094424311912074e-08, "loss": 0.75521815, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1887207, "step": 15166, "time_per_iteration": 2.8748795986175537 }, { "auxiliary_loss_clip": 0.01409055, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.24630487, "balance_loss_mlp": 1.01396942, "epoch": 0.9118893732150909, "flos": 20969148741120.0, "grad_norm": 2.382963108698475, "language_loss": 0.73546171, "learning_rate": 8.083460177773482e-08, "loss": 0.75989002, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19836426, "step": 15167, "time_per_iteration": 5.661333322525024 }, { "auxiliary_loss_clip": 0.01173809, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.087286, "balance_loss_mlp": 1.00339413, "epoch": 0.9119494964677589, "flos": 67948990646400.0, "grad_norm": 0.7705465213616485, "language_loss": 0.65559494, "learning_rate": 8.072503321129298e-08, "loss": 0.67764163, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.27539062, "step": 15168, "time_per_iteration": 3.3370957374572754 }, { "auxiliary_loss_clip": 0.01405128, "auxiliary_loss_mlp": 0.01033659, "balance_loss_clip": 1.24625087, "balance_loss_mlp": 1.01477623, "epoch": 0.9120096197204268, "flos": 18560688042240.0, "grad_norm": 1.9629559383217052, "language_loss": 0.7862072, "learning_rate": 8.061553742395033e-08, "loss": 0.81059504, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18884277, "step": 15169, "time_per_iteration": 2.8416216373443604 }, { "auxiliary_loss_clip": 0.01399793, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.2400074, "balance_loss_mlp": 1.0142293, "epoch": 0.9120697429730948, "flos": 19034981729280.0, "grad_norm": 1.501198985738292, "language_loss": 0.83178473, "learning_rate": 8.05061144198591e-08, "loss": 0.85612357, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1986084, "step": 15170, "time_per_iteration": 2.838719367980957 }, { "auxiliary_loss_clip": 0.01401273, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.24100888, "balance_loss_mlp": 1.01634407, "epoch": 0.9121298662257629, "flos": 17172129801600.0, "grad_norm": 2.223382196724005, "language_loss": 0.78149211, "learning_rate": 8.039676420316799e-08, "loss": 0.80585581, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1875, "step": 15171, "time_per_iteration": 2.8506052494049072 }, { "auxiliary_loss_clip": 0.01384867, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.22798908, "balance_loss_mlp": 1.01292527, "epoch": 0.9121899894784308, "flos": 19692200718720.0, "grad_norm": 1.552821584746152, "language_loss": 0.67570341, "learning_rate": 8.02874867780241e-08, "loss": 0.69987249, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.19128418, "step": 15172, "time_per_iteration": 2.862438201904297 }, { "auxiliary_loss_clip": 0.01402261, "auxiliary_loss_mlp": 0.01035222, "balance_loss_clip": 1.2418654, "balance_loss_mlp": 1.01670897, "epoch": 0.9122501127310988, "flos": 22245644315520.0, "grad_norm": 2.3172268348310356, "language_loss": 0.76063752, "learning_rate": 8.017828214857103e-08, "loss": 0.7850123, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18530273, "step": 15173, "time_per_iteration": 2.936774730682373 }, { "auxiliary_loss_clip": 0.01416556, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.25175059, "balance_loss_mlp": 1.01360953, "epoch": 0.9123102359837667, "flos": 15964596581760.0, "grad_norm": 2.5893646585268892, "language_loss": 0.66191173, "learning_rate": 8.00691503189499e-08, "loss": 0.6864177, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.20410156, "step": 15174, "time_per_iteration": 2.821871280670166 }, { "auxiliary_loss_clip": 0.01396526, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.23594117, "balance_loss_mlp": 1.01278603, "epoch": 0.9123703592364347, "flos": 25166748309120.0, "grad_norm": 2.001926730789998, "language_loss": 0.76128608, "learning_rate": 7.996009129329894e-08, "loss": 0.78557515, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19604492, "step": 15175, "time_per_iteration": 2.881073236465454 }, { "auxiliary_loss_clip": 0.01175796, "auxiliary_loss_mlp": 0.01024652, "balance_loss_clip": 1.08746028, "balance_loss_mlp": 1.00233638, "epoch": 0.9124304824891026, "flos": 60831146736000.0, "grad_norm": 0.9652431988079677, "language_loss": 0.58437115, "learning_rate": 7.985110507575421e-08, "loss": 0.60637563, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22363281, "step": 15176, "time_per_iteration": 3.404252290725708 }, { "auxiliary_loss_clip": 0.01404241, "auxiliary_loss_mlp": 0.01034213, "balance_loss_clip": 1.24325156, "balance_loss_mlp": 1.01579547, "epoch": 0.9124906057417707, "flos": 18160197903360.0, "grad_norm": 1.7180963967766834, "language_loss": 0.6671868, "learning_rate": 7.97421916704475e-08, "loss": 0.69157135, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1842041, "step": 15177, "time_per_iteration": 2.881483316421509 }, { "auxiliary_loss_clip": 0.01399017, "auxiliary_loss_mlp": 0.01036533, "balance_loss_clip": 1.24157751, "balance_loss_mlp": 1.01840079, "epoch": 0.9125507289944386, "flos": 11692741017600.0, "grad_norm": 1.8861226040422145, "language_loss": 0.8153131, "learning_rate": 7.963335108150926e-08, "loss": 0.83966863, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18127441, "step": 15178, "time_per_iteration": 2.802271842956543 }, { "auxiliary_loss_clip": 0.01387151, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.22932374, "balance_loss_mlp": 1.01239443, "epoch": 0.9126108522471066, "flos": 17757762238080.0, "grad_norm": 2.7087797619569987, "language_loss": 0.79802561, "learning_rate": 7.952458331306711e-08, "loss": 0.82220477, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18395996, "step": 15179, "time_per_iteration": 2.8539934158325195 }, { "auxiliary_loss_clip": 0.01392624, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.23493791, "balance_loss_mlp": 1.0145247, "epoch": 0.9126709754997745, "flos": 27647067029760.0, "grad_norm": 1.538645331325776, "language_loss": 0.68882871, "learning_rate": 7.941588836924507e-08, "loss": 0.71308029, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18005371, "step": 15180, "time_per_iteration": 2.9671337604522705 }, { "auxiliary_loss_clip": 0.01381142, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.22555375, "balance_loss_mlp": 1.01124823, "epoch": 0.9127310987524425, "flos": 15933576631680.0, "grad_norm": 1.6616686520377464, "language_loss": 0.75612593, "learning_rate": 7.930726625416495e-08, "loss": 0.78024292, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.1932373, "step": 15181, "time_per_iteration": 2.871734619140625 }, { "auxiliary_loss_clip": 0.01415282, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 1.25111008, "balance_loss_mlp": 1.01238608, "epoch": 0.9127912220051104, "flos": 21545144035200.0, "grad_norm": 1.7482004793750665, "language_loss": 0.75594628, "learning_rate": 7.919871697194614e-08, "loss": 0.78040707, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18408203, "step": 15182, "time_per_iteration": 2.9049365520477295 }, { "auxiliary_loss_clip": 0.01406156, "auxiliary_loss_mlp": 0.0103027, "balance_loss_clip": 1.24415922, "balance_loss_mlp": 1.01130414, "epoch": 0.9128513452577784, "flos": 24074852094720.0, "grad_norm": 1.8798402500355733, "language_loss": 0.77034587, "learning_rate": 7.909024052670421e-08, "loss": 0.79471016, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18981934, "step": 15183, "time_per_iteration": 2.8924717903137207 }, { "auxiliary_loss_clip": 0.01411397, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.24871826, "balance_loss_mlp": 1.01197791, "epoch": 0.9129114685104465, "flos": 16225035505920.0, "grad_norm": 2.3613370613423568, "language_loss": 0.7745406, "learning_rate": 7.898183692255256e-08, "loss": 0.79896289, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.18847656, "step": 15184, "time_per_iteration": 2.8206751346588135 }, { "auxiliary_loss_clip": 0.01404175, "auxiliary_loss_mlp": 0.01035222, "balance_loss_clip": 1.24404955, "balance_loss_mlp": 1.01574326, "epoch": 0.9129715917631144, "flos": 19392100087680.0, "grad_norm": 6.599140649747385, "language_loss": 0.74980605, "learning_rate": 7.887350616360233e-08, "loss": 0.77420002, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19494629, "step": 15185, "time_per_iteration": 2.813835382461548 }, { "auxiliary_loss_clip": 0.01389562, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.23069167, "balance_loss_mlp": 1.01018286, "epoch": 0.9130317150157824, "flos": 20599045125120.0, "grad_norm": 2.045955429393509, "language_loss": 0.68743527, "learning_rate": 7.876524825396158e-08, "loss": 0.7116195, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18664551, "step": 15186, "time_per_iteration": 2.8619987964630127 }, { "auxiliary_loss_clip": 0.01413719, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.24776852, "balance_loss_mlp": 1.01455128, "epoch": 0.9130918382684503, "flos": 20197740579840.0, "grad_norm": 1.755688224127138, "language_loss": 0.78468049, "learning_rate": 7.865706319773502e-08, "loss": 0.80915928, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19616699, "step": 15187, "time_per_iteration": 2.830235004425049 }, { "auxiliary_loss_clip": 0.01397573, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.2382679, "balance_loss_mlp": 1.01344538, "epoch": 0.9131519615211183, "flos": 25567871875200.0, "grad_norm": 5.173013383387578, "language_loss": 0.66434205, "learning_rate": 7.854895099902515e-08, "loss": 0.68864048, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18835449, "step": 15188, "time_per_iteration": 2.8934803009033203 }, { "auxiliary_loss_clip": 0.01385533, "auxiliary_loss_mlp": 0.01033234, "balance_loss_clip": 1.22760355, "balance_loss_mlp": 1.01442301, "epoch": 0.9132120847737862, "flos": 17940642295680.0, "grad_norm": 3.661693145025564, "language_loss": 0.77271092, "learning_rate": 7.844091166193157e-08, "loss": 0.7968986, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18823242, "step": 15189, "time_per_iteration": 2.821176528930664 }, { "auxiliary_loss_clip": 0.01391716, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.23452771, "balance_loss_mlp": 1.01342058, "epoch": 0.9132722080264543, "flos": 20057236917120.0, "grad_norm": 2.0012024000488067, "language_loss": 0.76359093, "learning_rate": 7.8332945190551e-08, "loss": 0.78781998, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.1776123, "step": 15190, "time_per_iteration": 2.832014799118042 }, { "auxiliary_loss_clip": 0.01179102, "auxiliary_loss_mlp": 0.01030117, "balance_loss_clip": 1.09332514, "balance_loss_mlp": 1.00675189, "epoch": 0.9133323312791222, "flos": 70473540798720.0, "grad_norm": 0.7185280936978781, "language_loss": 0.57436472, "learning_rate": 7.822505158897797e-08, "loss": 0.59645694, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.23339844, "step": 15191, "time_per_iteration": 3.42051100730896 }, { "auxiliary_loss_clip": 0.01403975, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.2422204, "balance_loss_mlp": 1.01436639, "epoch": 0.9133924545317902, "flos": 25494746999040.0, "grad_norm": 1.704292704954527, "language_loss": 0.74842942, "learning_rate": 7.81172308613034e-08, "loss": 0.77280313, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19030762, "step": 15192, "time_per_iteration": 2.924443244934082 }, { "auxiliary_loss_clip": 0.0139415, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.23739409, "balance_loss_mlp": 1.01215792, "epoch": 0.9134525777844581, "flos": 39945013499520.0, "grad_norm": 3.249399597383762, "language_loss": 0.7015965, "learning_rate": 7.800948301161647e-08, "loss": 0.72584718, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18762207, "step": 15193, "time_per_iteration": 4.403618574142456 }, { "auxiliary_loss_clip": 0.01397006, "auxiliary_loss_mlp": 0.01034621, "balance_loss_clip": 1.2395668, "balance_loss_mlp": 1.01596451, "epoch": 0.9135127010371261, "flos": 20896521557760.0, "grad_norm": 1.5579720582158325, "language_loss": 0.742311, "learning_rate": 7.790180804400215e-08, "loss": 0.76662719, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18652344, "step": 15194, "time_per_iteration": 2.8864023685455322 }, { "auxiliary_loss_clip": 0.0141438, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.25025582, "balance_loss_mlp": 1.01508737, "epoch": 0.913572824289794, "flos": 20822898988800.0, "grad_norm": 1.9459057696976252, "language_loss": 0.62887526, "learning_rate": 7.779420596254383e-08, "loss": 0.65336871, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1986084, "step": 15195, "time_per_iteration": 2.862072229385376 }, { "auxiliary_loss_clip": 0.0139755, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.23750782, "balance_loss_mlp": 1.01479292, "epoch": 0.913632947542462, "flos": 25714438341120.0, "grad_norm": 1.465303757649483, "language_loss": 0.71928024, "learning_rate": 7.768667677132201e-08, "loss": 0.74358666, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1829834, "step": 15196, "time_per_iteration": 4.385135173797607 }, { "auxiliary_loss_clip": 0.01388187, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.23030758, "balance_loss_mlp": 1.01448202, "epoch": 0.9136930707951301, "flos": 26297310844800.0, "grad_norm": 1.681796734765295, "language_loss": 0.7229411, "learning_rate": 7.757922047441411e-08, "loss": 0.7471512, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18334961, "step": 15197, "time_per_iteration": 2.910677433013916 }, { "auxiliary_loss_clip": 0.01413108, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.25041628, "balance_loss_mlp": 1.01019883, "epoch": 0.913753194047798, "flos": 22102290230400.0, "grad_norm": 3.2164674566511113, "language_loss": 0.79425061, "learning_rate": 7.747183707589489e-08, "loss": 0.81867456, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1907959, "step": 15198, "time_per_iteration": 2.845381736755371 }, { "auxiliary_loss_clip": 0.01386203, "auxiliary_loss_mlp": 0.01033093, "balance_loss_clip": 1.22945333, "balance_loss_mlp": 1.01360273, "epoch": 0.913813317300466, "flos": 23597843719680.0, "grad_norm": 2.024838788742723, "language_loss": 0.68087566, "learning_rate": 7.736452657983616e-08, "loss": 0.70506859, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19494629, "step": 15199, "time_per_iteration": 2.8685526847839355 }, { "auxiliary_loss_clip": 0.01407694, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.24567771, "balance_loss_mlp": 1.01702762, "epoch": 0.9138734405531339, "flos": 28888153908480.0, "grad_norm": 1.5648012623925622, "language_loss": 0.67937279, "learning_rate": 7.725728899030714e-08, "loss": 0.70380253, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18261719, "step": 15200, "time_per_iteration": 2.9092416763305664 }, { "auxiliary_loss_clip": 0.01384637, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.22874331, "balance_loss_mlp": 1.01395917, "epoch": 0.9139335638058019, "flos": 22831548220800.0, "grad_norm": 2.15282289461593, "language_loss": 0.7196762, "learning_rate": 7.715012431137435e-08, "loss": 0.74384785, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18579102, "step": 15201, "time_per_iteration": 4.336301565170288 }, { "auxiliary_loss_clip": 0.01389724, "auxiliary_loss_mlp": 0.01028818, "balance_loss_clip": 1.23130918, "balance_loss_mlp": 1.01153278, "epoch": 0.9139936870584698, "flos": 18013178989440.0, "grad_norm": 1.961614724687617, "language_loss": 0.71774006, "learning_rate": 7.704303254710165e-08, "loss": 0.74192542, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.17297363, "step": 15202, "time_per_iteration": 2.8878252506256104 }, { "auxiliary_loss_clip": 0.01400279, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.24008441, "balance_loss_mlp": 1.01524067, "epoch": 0.9140538103111379, "flos": 15821875923840.0, "grad_norm": 1.9310505856269153, "language_loss": 0.67265856, "learning_rate": 7.693601370155001e-08, "loss": 0.69700611, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19226074, "step": 15203, "time_per_iteration": 4.28705096244812 }, { "auxiliary_loss_clip": 0.01405572, "auxiliary_loss_mlp": 0.01030533, "balance_loss_clip": 1.24549651, "balance_loss_mlp": 1.01175797, "epoch": 0.9141139335638058, "flos": 23997383717760.0, "grad_norm": 1.8004550808078363, "language_loss": 0.69929183, "learning_rate": 7.682906777877751e-08, "loss": 0.72365284, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18774414, "step": 15204, "time_per_iteration": 2.9007856845855713 }, { "auxiliary_loss_clip": 0.01394906, "auxiliary_loss_mlp": 0.01031406, "balance_loss_clip": 1.23418355, "balance_loss_mlp": 1.01216555, "epoch": 0.9141740568164738, "flos": 24035009408640.0, "grad_norm": 2.0465285358987915, "language_loss": 0.60554314, "learning_rate": 7.672219478283915e-08, "loss": 0.62980628, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19238281, "step": 15205, "time_per_iteration": 2.8596856594085693 }, { "auxiliary_loss_clip": 0.01383698, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.22798657, "balance_loss_mlp": 1.01250243, "epoch": 0.9142341800691417, "flos": 27030188419200.0, "grad_norm": 1.6672996821432178, "language_loss": 0.81890011, "learning_rate": 7.661539471778811e-08, "loss": 0.84304833, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18603516, "step": 15206, "time_per_iteration": 2.9018874168395996 }, { "auxiliary_loss_clip": 0.01409391, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.24775708, "balance_loss_mlp": 1.01058102, "epoch": 0.9142943033218097, "flos": 20422182625920.0, "grad_norm": 4.466811883069632, "language_loss": 0.74835294, "learning_rate": 7.650866758767382e-08, "loss": 0.77273601, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18310547, "step": 15207, "time_per_iteration": 2.8425028324127197 }, { "auxiliary_loss_clip": 0.01393001, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.23381126, "balance_loss_mlp": 1.01072097, "epoch": 0.9143544265744776, "flos": 19764737412480.0, "grad_norm": 1.5785509198621657, "language_loss": 0.73545134, "learning_rate": 7.640201339654373e-08, "loss": 0.75968248, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1940918, "step": 15208, "time_per_iteration": 2.8482542037963867 }, { "auxiliary_loss_clip": 0.01399239, "auxiliary_loss_mlp": 0.010309, "balance_loss_clip": 1.24197757, "balance_loss_mlp": 1.01297104, "epoch": 0.9144145498271457, "flos": 17174346796800.0, "grad_norm": 2.181262536085906, "language_loss": 0.86402571, "learning_rate": 7.629543214844237e-08, "loss": 0.88832712, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17932129, "step": 15209, "time_per_iteration": 2.804022789001465 }, { "auxiliary_loss_clip": 0.01390396, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.23244059, "balance_loss_mlp": 1.01762617, "epoch": 0.9144746730798137, "flos": 23734908777600.0, "grad_norm": 1.6504452523624553, "language_loss": 0.75939059, "learning_rate": 7.618892384741093e-08, "loss": 0.78365469, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18395996, "step": 15210, "time_per_iteration": 2.891416072845459 }, { "auxiliary_loss_clip": 0.01403878, "auxiliary_loss_mlp": 0.01033362, "balance_loss_clip": 1.24251139, "balance_loss_mlp": 1.01444364, "epoch": 0.9145347963324816, "flos": 25858697322240.0, "grad_norm": 1.7462231235447103, "language_loss": 0.78504723, "learning_rate": 7.6082488497488e-08, "loss": 0.80941963, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18908691, "step": 15211, "time_per_iteration": 2.8979239463806152 }, { "auxiliary_loss_clip": 0.01396266, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.23714495, "balance_loss_mlp": 1.0106771, "epoch": 0.9145949195851496, "flos": 19251596424960.0, "grad_norm": 1.6649963950058662, "language_loss": 0.83566153, "learning_rate": 7.597612610270986e-08, "loss": 0.85991818, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18701172, "step": 15212, "time_per_iteration": 2.846406936645508 }, { "auxiliary_loss_clip": 0.01388855, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.2316103, "balance_loss_mlp": 1.0148797, "epoch": 0.9146550428378175, "flos": 18305542759680.0, "grad_norm": 1.678227531559026, "language_loss": 0.8443743, "learning_rate": 7.586983666711022e-08, "loss": 0.86859977, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18798828, "step": 15213, "time_per_iteration": 2.8153235912323 }, { "auxiliary_loss_clip": 0.01395739, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.23609579, "balance_loss_mlp": 1.01638484, "epoch": 0.9147151660904855, "flos": 20093912467200.0, "grad_norm": 1.5738265332531631, "language_loss": 0.72393548, "learning_rate": 7.576362019471894e-08, "loss": 0.74823916, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18249512, "step": 15214, "time_per_iteration": 2.808718681335449 }, { "auxiliary_loss_clip": 0.01413067, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.24981546, "balance_loss_mlp": 1.01636755, "epoch": 0.9147752893431534, "flos": 24399774138240.0, "grad_norm": 1.5540838822279412, "language_loss": 0.63605982, "learning_rate": 7.565747668956413e-08, "loss": 0.66054374, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18945312, "step": 15215, "time_per_iteration": 2.910959243774414 }, { "auxiliary_loss_clip": 0.01418907, "auxiliary_loss_mlp": 0.01034367, "balance_loss_clip": 1.25476289, "balance_loss_mlp": 1.01488781, "epoch": 0.9148354125958215, "flos": 18159519231360.0, "grad_norm": 2.5270075078727876, "language_loss": 0.77034914, "learning_rate": 7.555140615567058e-08, "loss": 0.79488188, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19458008, "step": 15216, "time_per_iteration": 2.847985029220581 }, { "auxiliary_loss_clip": 0.01396715, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.23734522, "balance_loss_mlp": 1.01318204, "epoch": 0.9148955358484894, "flos": 23377926153600.0, "grad_norm": 2.1406500351979334, "language_loss": 0.68845689, "learning_rate": 7.544540859706062e-08, "loss": 0.71275014, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19433594, "step": 15217, "time_per_iteration": 2.901492118835449 }, { "auxiliary_loss_clip": 0.01396954, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.23865116, "balance_loss_mlp": 1.01375914, "epoch": 0.9149556591011574, "flos": 18085353724800.0, "grad_norm": 1.8723388391471645, "language_loss": 0.80856645, "learning_rate": 7.533948401775347e-08, "loss": 0.83286029, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18664551, "step": 15218, "time_per_iteration": 2.8249731063842773 }, { "auxiliary_loss_clip": 0.01179169, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.09206176, "balance_loss_mlp": 1.00398302, "epoch": 0.9150157823538253, "flos": 54610962537600.0, "grad_norm": 0.8461120594568002, "language_loss": 0.59383631, "learning_rate": 7.523363242176595e-08, "loss": 0.6159234, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.25585938, "step": 15219, "time_per_iteration": 3.2968616485595703 }, { "auxiliary_loss_clip": 0.01387366, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.23049188, "balance_loss_mlp": 1.01516032, "epoch": 0.9150759056064933, "flos": 17901659260800.0, "grad_norm": 1.84685194628434, "language_loss": 0.79247308, "learning_rate": 7.512785381311216e-08, "loss": 0.81668305, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18469238, "step": 15220, "time_per_iteration": 2.846248149871826 }, { "auxiliary_loss_clip": 0.01402801, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.24057138, "balance_loss_mlp": 1.01438856, "epoch": 0.9151360288591612, "flos": 18081598406400.0, "grad_norm": 1.777110540378662, "language_loss": 0.66326487, "learning_rate": 7.50221481958031e-08, "loss": 0.68763137, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19482422, "step": 15221, "time_per_iteration": 2.8657407760620117 }, { "auxiliary_loss_clip": 0.01394602, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.23468399, "balance_loss_mlp": 1.01638961, "epoch": 0.9151961521118293, "flos": 19363975804800.0, "grad_norm": 1.7890905096433132, "language_loss": 0.85015649, "learning_rate": 7.491651557384692e-08, "loss": 0.8744536, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18737793, "step": 15222, "time_per_iteration": 2.9546730518341064 }, { "auxiliary_loss_clip": 0.01180534, "auxiliary_loss_mlp": 0.01038183, "balance_loss_clip": 1.09153342, "balance_loss_mlp": 1.01357794, "epoch": 0.9152562753644973, "flos": 72178515060480.0, "grad_norm": 0.7255830156947594, "language_loss": 0.49695674, "learning_rate": 7.481095595124953e-08, "loss": 0.51914394, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.24609375, "step": 15223, "time_per_iteration": 3.3283181190490723 }, { "auxiliary_loss_clip": 0.01400039, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.23942089, "balance_loss_mlp": 1.0181973, "epoch": 0.9153163986171652, "flos": 20786721131520.0, "grad_norm": 1.7284340160350067, "language_loss": 0.73071706, "learning_rate": 7.470546933201349e-08, "loss": 0.75509393, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19445801, "step": 15224, "time_per_iteration": 2.932164430618286 }, { "auxiliary_loss_clip": 0.01391663, "auxiliary_loss_mlp": 0.01029575, "balance_loss_clip": 1.23393464, "balance_loss_mlp": 1.01065624, "epoch": 0.9153765218698332, "flos": 23050651380480.0, "grad_norm": 1.7290092547202913, "language_loss": 0.82962382, "learning_rate": 7.460005572013895e-08, "loss": 0.85383618, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18920898, "step": 15225, "time_per_iteration": 2.8880815505981445 }, { "auxiliary_loss_clip": 0.01395322, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.23492122, "balance_loss_mlp": 1.01021469, "epoch": 0.9154366451225011, "flos": 29003926648320.0, "grad_norm": 1.3458988061239618, "language_loss": 0.71296322, "learning_rate": 7.44947151196238e-08, "loss": 0.7372086, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18994141, "step": 15226, "time_per_iteration": 2.9552230834960938 }, { "auxiliary_loss_clip": 0.01395599, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.23564053, "balance_loss_mlp": 1.01658916, "epoch": 0.9154967683751691, "flos": 22319628842880.0, "grad_norm": 1.836220480059665, "language_loss": 0.75689638, "learning_rate": 7.43894475344613e-08, "loss": 0.78120697, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18859863, "step": 15227, "time_per_iteration": 2.901482105255127 }, { "auxiliary_loss_clip": 0.0138992, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.23240542, "balance_loss_mlp": 1.01363873, "epoch": 0.915556891627837, "flos": 24582156503040.0, "grad_norm": 1.5241746761710058, "language_loss": 0.74552655, "learning_rate": 7.428425296864404e-08, "loss": 0.76974583, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18359375, "step": 15228, "time_per_iteration": 4.347513198852539 }, { "auxiliary_loss_clip": 0.01387475, "auxiliary_loss_mlp": 0.01035088, "balance_loss_clip": 1.2290889, "balance_loss_mlp": 1.01723015, "epoch": 0.9156170148805051, "flos": 22175053148160.0, "grad_norm": 1.6539613488754643, "language_loss": 0.73024607, "learning_rate": 7.417913142616106e-08, "loss": 0.75447166, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.1784668, "step": 15229, "time_per_iteration": 2.8836474418640137 }, { "auxiliary_loss_clip": 0.0140134, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 1.24165702, "balance_loss_mlp": 1.01469648, "epoch": 0.915677138133173, "flos": 20929803747840.0, "grad_norm": 2.6500273046771956, "language_loss": 0.83494943, "learning_rate": 7.407408291099848e-08, "loss": 0.85931075, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.2010498, "step": 15230, "time_per_iteration": 2.8780415058135986 }, { "auxiliary_loss_clip": 0.01391572, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.23446465, "balance_loss_mlp": 1.01293075, "epoch": 0.915737261385841, "flos": 24353823404160.0, "grad_norm": 5.177584408598246, "language_loss": 0.84522569, "learning_rate": 7.396910742713957e-08, "loss": 0.86945271, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18212891, "step": 15231, "time_per_iteration": 4.3860368728637695 }, { "auxiliary_loss_clip": 0.01384939, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.22785962, "balance_loss_mlp": 1.011729, "epoch": 0.9157973846385089, "flos": 26772826141440.0, "grad_norm": 1.5161218934317031, "language_loss": 0.73122156, "learning_rate": 7.386420497856516e-08, "loss": 0.75537407, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18579102, "step": 15232, "time_per_iteration": 2.914599657058716 }, { "auxiliary_loss_clip": 0.01404088, "auxiliary_loss_mlp": 0.01031844, "balance_loss_clip": 1.24310207, "balance_loss_mlp": 1.0130806, "epoch": 0.9158575078911769, "flos": 18487925124480.0, "grad_norm": 2.83464728369176, "language_loss": 0.69468558, "learning_rate": 7.375937556925338e-08, "loss": 0.71904486, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1875, "step": 15233, "time_per_iteration": 2.96437406539917 }, { "auxiliary_loss_clip": 0.01404909, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.24230838, "balance_loss_mlp": 1.01574326, "epoch": 0.9159176311438448, "flos": 21808976319360.0, "grad_norm": 2.838109132012694, "language_loss": 0.70397818, "learning_rate": 7.365461920317861e-08, "loss": 0.72837281, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.18811035, "step": 15234, "time_per_iteration": 2.8652570247650146 }, { "auxiliary_loss_clip": 0.01418599, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.25548458, "balance_loss_mlp": 1.01634955, "epoch": 0.9159777543965129, "flos": 24792753640320.0, "grad_norm": 1.6169670913879444, "language_loss": 0.88993752, "learning_rate": 7.354993588431391e-08, "loss": 0.91448325, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19628906, "step": 15235, "time_per_iteration": 2.9045631885528564 }, { "auxiliary_loss_clip": 0.01402425, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.24099731, "balance_loss_mlp": 1.01343489, "epoch": 0.9160378776491809, "flos": 26879685655680.0, "grad_norm": 1.5793573189527998, "language_loss": 0.77800471, "learning_rate": 7.344532561662853e-08, "loss": 0.80235702, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19372559, "step": 15236, "time_per_iteration": 4.329482316970825 }, { "auxiliary_loss_clip": 0.01179041, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.09066677, "balance_loss_mlp": 1.02115238, "epoch": 0.9160980009018488, "flos": 70609836695040.0, "grad_norm": 0.6798137799792019, "language_loss": 0.6226697, "learning_rate": 7.334078840409019e-08, "loss": 0.64490342, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.23144531, "step": 15237, "time_per_iteration": 3.379786968231201 }, { "auxiliary_loss_clip": 0.01402409, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.24151325, "balance_loss_mlp": 1.01383257, "epoch": 0.9161581241545168, "flos": 16297888913280.0, "grad_norm": 2.320840338126538, "language_loss": 0.75394815, "learning_rate": 7.323632425066151e-08, "loss": 0.77830648, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19604492, "step": 15238, "time_per_iteration": 4.271705627441406 }, { "auxiliary_loss_clip": 0.01399089, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.23807669, "balance_loss_mlp": 1.01193786, "epoch": 0.9162182474071847, "flos": 18446272646400.0, "grad_norm": 1.9513793748587411, "language_loss": 0.75684857, "learning_rate": 7.313193316030464e-08, "loss": 0.78114593, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18688965, "step": 15239, "time_per_iteration": 2.8524742126464844 }, { "auxiliary_loss_clip": 0.01404081, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.24317431, "balance_loss_mlp": 1.01044679, "epoch": 0.9162783706598527, "flos": 19175394902400.0, "grad_norm": 2.173450935259786, "language_loss": 0.64508617, "learning_rate": 7.302761513697819e-08, "loss": 0.66942823, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19665527, "step": 15240, "time_per_iteration": 2.8394808769226074 }, { "auxiliary_loss_clip": 0.01397683, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.2392664, "balance_loss_mlp": 1.01118183, "epoch": 0.9163384939125206, "flos": 20422816053120.0, "grad_norm": 1.8914261036838191, "language_loss": 0.76891494, "learning_rate": 7.292337018463746e-08, "loss": 0.79318941, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18591309, "step": 15241, "time_per_iteration": 2.879179000854492 }, { "auxiliary_loss_clip": 0.01446968, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.27562284, "balance_loss_mlp": 1.01524651, "epoch": 0.9163986171651887, "flos": 19655298944640.0, "grad_norm": 2.1484523815354515, "language_loss": 0.68929458, "learning_rate": 7.281919830723549e-08, "loss": 0.71411133, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 1.71386719, "router_z_loss_mlp": 0.19458008, "step": 15242, "time_per_iteration": 2.828090190887451 }, { "auxiliary_loss_clip": 0.01394308, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.23405612, "balance_loss_mlp": 1.01721263, "epoch": 0.9164587404178566, "flos": 12830678455680.0, "grad_norm": 1.9119668292084162, "language_loss": 0.8164196, "learning_rate": 7.271509950872334e-08, "loss": 0.84072411, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18933105, "step": 15243, "time_per_iteration": 2.8173904418945312 }, { "auxiliary_loss_clip": 0.01416003, "auxiliary_loss_mlp": 0.01037731, "balance_loss_clip": 1.2515794, "balance_loss_mlp": 1.01832378, "epoch": 0.9165188636705246, "flos": 22319493108480.0, "grad_norm": 1.749825684113621, "language_loss": 0.82132745, "learning_rate": 7.261107379304721e-08, "loss": 0.84586477, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19396973, "step": 15244, "time_per_iteration": 2.8434932231903076 }, { "auxiliary_loss_clip": 0.01405627, "auxiliary_loss_mlp": 0.01035123, "balance_loss_clip": 1.2409997, "balance_loss_mlp": 1.01556039, "epoch": 0.9165789869231925, "flos": 18232463128320.0, "grad_norm": 2.3362215547208622, "language_loss": 0.74076176, "learning_rate": 7.250712116415214e-08, "loss": 0.76516926, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19555664, "step": 15245, "time_per_iteration": 2.8322644233703613 }, { "auxiliary_loss_clip": 0.01399145, "auxiliary_loss_mlp": 0.01037729, "balance_loss_clip": 1.24039054, "balance_loss_mlp": 1.01858425, "epoch": 0.9166391101758605, "flos": 13697544441600.0, "grad_norm": 1.6092439558897738, "language_loss": 0.75119019, "learning_rate": 7.240324162598033e-08, "loss": 0.77555889, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19152832, "step": 15246, "time_per_iteration": 2.8385536670684814 }, { "auxiliary_loss_clip": 0.01409548, "auxiliary_loss_mlp": 0.01033334, "balance_loss_clip": 1.24962354, "balance_loss_mlp": 1.01371169, "epoch": 0.9166992334285284, "flos": 17355462307200.0, "grad_norm": 1.9507248774964434, "language_loss": 0.76316291, "learning_rate": 7.229943518247106e-08, "loss": 0.78759181, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19616699, "step": 15247, "time_per_iteration": 2.8454172611236572 }, { "auxiliary_loss_clip": 0.01410561, "auxiliary_loss_mlp": 0.01032381, "balance_loss_clip": 1.24789476, "balance_loss_mlp": 1.01376009, "epoch": 0.9167593566811965, "flos": 23741288294400.0, "grad_norm": 2.183755052310745, "language_loss": 0.77202791, "learning_rate": 7.219570183756052e-08, "loss": 0.79645729, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1862793, "step": 15248, "time_per_iteration": 2.894878625869751 }, { "auxiliary_loss_clip": 0.01397584, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.23762751, "balance_loss_mlp": 1.0124594, "epoch": 0.9168194799338644, "flos": 27829585128960.0, "grad_norm": 4.531161110362229, "language_loss": 0.75153553, "learning_rate": 7.209204159518178e-08, "loss": 0.77582705, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19104004, "step": 15249, "time_per_iteration": 2.919281244277954 }, { "auxiliary_loss_clip": 0.01393674, "auxiliary_loss_mlp": 0.01033824, "balance_loss_clip": 1.23410094, "balance_loss_mlp": 1.01492965, "epoch": 0.9168796031865324, "flos": 21725399894400.0, "grad_norm": 2.3967380342444784, "language_loss": 0.7710495, "learning_rate": 7.198845445926616e-08, "loss": 0.79532444, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18896484, "step": 15250, "time_per_iteration": 2.834374189376831 }, { "auxiliary_loss_clip": 0.01401145, "auxiliary_loss_mlp": 0.01035296, "balance_loss_clip": 1.24319077, "balance_loss_mlp": 1.01553082, "epoch": 0.9169397264392004, "flos": 23414918417280.0, "grad_norm": 1.7774202356039672, "language_loss": 0.76405716, "learning_rate": 7.188494043374138e-08, "loss": 0.78842157, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19763184, "step": 15251, "time_per_iteration": 2.8456451892852783 }, { "auxiliary_loss_clip": 0.01397698, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.23895752, "balance_loss_mlp": 1.01355958, "epoch": 0.9169998496918683, "flos": 23961432084480.0, "grad_norm": 3.009396563248502, "language_loss": 0.81657326, "learning_rate": 7.178149952253298e-08, "loss": 0.84087813, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19238281, "step": 15252, "time_per_iteration": 2.892542839050293 }, { "auxiliary_loss_clip": 0.01402147, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.24323201, "balance_loss_mlp": 1.0128274, "epoch": 0.9170599729445363, "flos": 18341539637760.0, "grad_norm": 1.5961268986143755, "language_loss": 0.77709126, "learning_rate": 7.167813172956316e-08, "loss": 0.80142057, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.17956543, "step": 15253, "time_per_iteration": 2.8378188610076904 }, { "auxiliary_loss_clip": 0.01398627, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.23789823, "balance_loss_mlp": 1.01203513, "epoch": 0.9171200961972042, "flos": 22685207978880.0, "grad_norm": 1.6807375618102127, "language_loss": 0.73455888, "learning_rate": 7.157483705875256e-08, "loss": 0.75884932, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18383789, "step": 15254, "time_per_iteration": 2.8554773330688477 }, { "auxiliary_loss_clip": 0.01383032, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.22743821, "balance_loss_mlp": 1.01166439, "epoch": 0.9171802194498723, "flos": 26729273381760.0, "grad_norm": 1.6414741254078207, "language_loss": 0.79448116, "learning_rate": 7.14716155140167e-08, "loss": 0.81860936, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18115234, "step": 15255, "time_per_iteration": 2.8666257858276367 }, { "auxiliary_loss_clip": 0.01398523, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.23806071, "balance_loss_mlp": 1.0162518, "epoch": 0.9172403427025402, "flos": 37903489280640.0, "grad_norm": 2.085737333673275, "language_loss": 0.68951499, "learning_rate": 7.136846709927047e-08, "loss": 0.71385157, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18884277, "step": 15256, "time_per_iteration": 2.983759641647339 }, { "auxiliary_loss_clip": 0.01398414, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.24101782, "balance_loss_mlp": 1.01552498, "epoch": 0.9173004659552082, "flos": 17063822453760.0, "grad_norm": 1.6006284203809102, "language_loss": 0.84563029, "learning_rate": 7.126539181842561e-08, "loss": 0.86995345, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18383789, "step": 15257, "time_per_iteration": 2.8749446868896484 }, { "auxiliary_loss_clip": 0.01385674, "auxiliary_loss_mlp": 0.01033805, "balance_loss_clip": 1.22854161, "balance_loss_mlp": 1.0164721, "epoch": 0.9173605892078761, "flos": 22212090656640.0, "grad_norm": 1.6319944138628661, "language_loss": 0.77818882, "learning_rate": 7.116238967539012e-08, "loss": 0.80238366, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17333984, "step": 15258, "time_per_iteration": 2.9538776874542236 }, { "auxiliary_loss_clip": 0.01389882, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.23217082, "balance_loss_mlp": 1.0153724, "epoch": 0.9174207124605441, "flos": 16516268156160.0, "grad_norm": 1.891269149287408, "language_loss": 0.79738224, "learning_rate": 7.105946067406999e-08, "loss": 0.82162869, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19372559, "step": 15259, "time_per_iteration": 2.874805212020874 }, { "auxiliary_loss_clip": 0.01396367, "auxiliary_loss_mlp": 0.01035396, "balance_loss_clip": 1.23864317, "balance_loss_mlp": 1.01658475, "epoch": 0.917480835713212, "flos": 24546431093760.0, "grad_norm": 1.5500452088547994, "language_loss": 0.77056825, "learning_rate": 7.095660481836895e-08, "loss": 0.79488587, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18811035, "step": 15260, "time_per_iteration": 2.907242774963379 }, { "auxiliary_loss_clip": 0.01396218, "auxiliary_loss_mlp": 0.01032196, "balance_loss_clip": 1.23721576, "balance_loss_mlp": 1.01313448, "epoch": 0.9175409589658801, "flos": 20888965676160.0, "grad_norm": 1.5740382774989974, "language_loss": 0.61427039, "learning_rate": 7.085382211218637e-08, "loss": 0.63855457, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19067383, "step": 15261, "time_per_iteration": 2.9197559356689453 }, { "auxiliary_loss_clip": 0.01389561, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.23164701, "balance_loss_mlp": 1.01252961, "epoch": 0.917601082218548, "flos": 14282362471680.0, "grad_norm": 2.235580347357797, "language_loss": 0.74440044, "learning_rate": 7.075111255942002e-08, "loss": 0.76860458, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18322754, "step": 15262, "time_per_iteration": 2.881556987762451 }, { "auxiliary_loss_clip": 0.01408114, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.24440193, "balance_loss_mlp": 1.01651502, "epoch": 0.917661205471216, "flos": 19108649543040.0, "grad_norm": 1.7819490562048943, "language_loss": 0.78428102, "learning_rate": 7.064847616396496e-08, "loss": 0.8087194, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19213867, "step": 15263, "time_per_iteration": 4.351459741592407 }, { "auxiliary_loss_clip": 0.0140837, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.24366415, "balance_loss_mlp": 1.01452374, "epoch": 0.917721328723884, "flos": 21116665347840.0, "grad_norm": 1.9015875950070904, "language_loss": 0.7616564, "learning_rate": 7.054591292971324e-08, "loss": 0.78608, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19458008, "step": 15264, "time_per_iteration": 2.833146333694458 }, { "auxiliary_loss_clip": 0.01411487, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.25110173, "balance_loss_mlp": 1.01673853, "epoch": 0.9177814519765519, "flos": 21952873342080.0, "grad_norm": 2.214711130837956, "language_loss": 0.837551, "learning_rate": 7.044342286055394e-08, "loss": 0.86202139, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18798828, "step": 15265, "time_per_iteration": 2.8300843238830566 }, { "auxiliary_loss_clip": 0.01406411, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.2418642, "balance_loss_mlp": 1.02022529, "epoch": 0.9178415752292199, "flos": 24216396387840.0, "grad_norm": 1.6972012248836394, "language_loss": 0.73735523, "learning_rate": 7.034100596037306e-08, "loss": 0.76182282, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20117188, "step": 15266, "time_per_iteration": 4.385447025299072 }, { "auxiliary_loss_clip": 0.01400246, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.2395885, "balance_loss_mlp": 1.01304257, "epoch": 0.9179016984818879, "flos": 20050223973120.0, "grad_norm": 1.942922273062093, "language_loss": 0.78596312, "learning_rate": 7.023866223305486e-08, "loss": 0.81027836, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18249512, "step": 15267, "time_per_iteration": 2.8551549911499023 }, { "auxiliary_loss_clip": 0.01180928, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.09148073, "balance_loss_mlp": 1.01353765, "epoch": 0.9179618217345559, "flos": 65589512083200.0, "grad_norm": 0.7379390295632065, "language_loss": 0.5631333, "learning_rate": 7.013639168247975e-08, "loss": 0.5853011, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.22363281, "step": 15268, "time_per_iteration": 3.3699913024902344 }, { "auxiliary_loss_clip": 0.01406443, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.24629223, "balance_loss_mlp": 1.01076424, "epoch": 0.9180219449872238, "flos": 21334546897920.0, "grad_norm": 2.0181412730517976, "language_loss": 0.77724314, "learning_rate": 7.0034194312526e-08, "loss": 0.80160725, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19189453, "step": 15269, "time_per_iteration": 2.85247540473938 }, { "auxiliary_loss_clip": 0.01392252, "auxiliary_loss_mlp": 0.01036137, "balance_loss_clip": 1.23407793, "balance_loss_mlp": 1.01676559, "epoch": 0.9180820682398918, "flos": 41074173446400.0, "grad_norm": 2.0372050232013366, "language_loss": 0.73161149, "learning_rate": 6.993207012706936e-08, "loss": 0.75589538, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19384766, "step": 15270, "time_per_iteration": 3.012798309326172 }, { "auxiliary_loss_clip": 0.01387745, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.23022485, "balance_loss_mlp": 1.01138937, "epoch": 0.9181421914925597, "flos": 28084187473920.0, "grad_norm": 1.6114798640713504, "language_loss": 0.80779159, "learning_rate": 6.98300191299821e-08, "loss": 0.831985, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.20214844, "step": 15271, "time_per_iteration": 2.9160687923431396 }, { "auxiliary_loss_clip": 0.01394313, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.23366344, "balance_loss_mlp": 1.0128262, "epoch": 0.9182023147452277, "flos": 29181467819520.0, "grad_norm": 1.84591993919902, "language_loss": 0.73084104, "learning_rate": 6.972804132513355e-08, "loss": 0.75509596, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18359375, "step": 15272, "time_per_iteration": 4.304319381713867 }, { "auxiliary_loss_clip": 0.01391826, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.23236036, "balance_loss_mlp": 1.0179708, "epoch": 0.9182624379978956, "flos": 24071413489920.0, "grad_norm": 2.1113965447273326, "language_loss": 0.7304498, "learning_rate": 6.962613671639105e-08, "loss": 0.75472713, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.17944336, "step": 15273, "time_per_iteration": 4.430824279785156 }, { "auxiliary_loss_clip": 0.01376931, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.22235179, "balance_loss_mlp": 1.00937581, "epoch": 0.9183225612505637, "flos": 23303624912640.0, "grad_norm": 1.4521223189260974, "language_loss": 0.748649, "learning_rate": 6.952430530761933e-08, "loss": 0.77268839, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.17626953, "step": 15274, "time_per_iteration": 2.88554048538208 }, { "auxiliary_loss_clip": 0.01402697, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.24138105, "balance_loss_mlp": 1.01367164, "epoch": 0.9183826845032316, "flos": 19618578149760.0, "grad_norm": 2.0244418396424284, "language_loss": 0.69935906, "learning_rate": 6.942254710267902e-08, "loss": 0.72370768, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18493652, "step": 15275, "time_per_iteration": 2.8554248809814453 }, { "auxiliary_loss_clip": 0.0139319, "auxiliary_loss_mlp": 0.01026372, "balance_loss_clip": 1.23456454, "balance_loss_mlp": 1.00782311, "epoch": 0.9184428077558996, "flos": 18488332327680.0, "grad_norm": 2.005094974271042, "language_loss": 0.73402941, "learning_rate": 6.932086210542953e-08, "loss": 0.75822502, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18554688, "step": 15276, "time_per_iteration": 2.8442280292510986 }, { "auxiliary_loss_clip": 0.01402173, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.24266255, "balance_loss_mlp": 1.01602221, "epoch": 0.9185029310085676, "flos": 20750859987840.0, "grad_norm": 1.6455978030338418, "language_loss": 0.74247682, "learning_rate": 6.921925031972642e-08, "loss": 0.76684153, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18273926, "step": 15277, "time_per_iteration": 2.844045877456665 }, { "auxiliary_loss_clip": 0.01179418, "auxiliary_loss_mlp": 0.01045979, "balance_loss_clip": 1.09206343, "balance_loss_mlp": 1.02499866, "epoch": 0.9185630542612355, "flos": 68240901968640.0, "grad_norm": 0.7199013596637794, "language_loss": 0.59216583, "learning_rate": 6.91177117494226e-08, "loss": 0.61441976, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.20996094, "step": 15278, "time_per_iteration": 3.428173065185547 }, { "auxiliary_loss_clip": 0.01378799, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.22299218, "balance_loss_mlp": 1.01515722, "epoch": 0.9186231775139035, "flos": 12246991545600.0, "grad_norm": 1.7742288904641985, "language_loss": 0.64697689, "learning_rate": 6.901624639836879e-08, "loss": 0.67109644, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18005371, "step": 15279, "time_per_iteration": 2.8651609420776367 }, { "auxiliary_loss_clip": 0.01174464, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.0893631, "balance_loss_mlp": 1.0081557, "epoch": 0.9186833007665715, "flos": 63969498852480.0, "grad_norm": 0.8634388105783173, "language_loss": 0.60188365, "learning_rate": 6.891485427041211e-08, "loss": 0.62395012, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.24023438, "step": 15280, "time_per_iteration": 3.250854969024658 }, { "auxiliary_loss_clip": 0.01404718, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.24344516, "balance_loss_mlp": 1.01442814, "epoch": 0.9187434240192395, "flos": 19984293020160.0, "grad_norm": 1.6837797886922972, "language_loss": 0.7071318, "learning_rate": 6.881353536939815e-08, "loss": 0.73152328, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.20007324, "step": 15281, "time_per_iteration": 2.8851330280303955 }, { "auxiliary_loss_clip": 0.01395424, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.23548627, "balance_loss_mlp": 1.01482201, "epoch": 0.9188035472719074, "flos": 25238561086080.0, "grad_norm": 1.927065472783901, "language_loss": 0.8534739, "learning_rate": 6.871228969916831e-08, "loss": 0.87777066, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19433594, "step": 15282, "time_per_iteration": 2.8666088581085205 }, { "auxiliary_loss_clip": 0.01400906, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.24289918, "balance_loss_mlp": 1.01485467, "epoch": 0.9188636705245754, "flos": 18414664513920.0, "grad_norm": 1.8234078124676185, "language_loss": 0.6075381, "learning_rate": 6.861111726356194e-08, "loss": 0.63188744, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19165039, "step": 15283, "time_per_iteration": 2.823317766189575 }, { "auxiliary_loss_clip": 0.01414616, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.2499404, "balance_loss_mlp": 1.01256013, "epoch": 0.9189237937772433, "flos": 23779683146880.0, "grad_norm": 1.4933247987533482, "language_loss": 0.6619274, "learning_rate": 6.851001806641554e-08, "loss": 0.68638611, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18688965, "step": 15284, "time_per_iteration": 2.8839762210845947 }, { "auxiliary_loss_clip": 0.01396213, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.23690844, "balance_loss_mlp": 1.01359105, "epoch": 0.9189839170299113, "flos": 21224384513280.0, "grad_norm": 1.8281059050128534, "language_loss": 0.74020463, "learning_rate": 6.840899211156292e-08, "loss": 0.76449829, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19567871, "step": 15285, "time_per_iteration": 2.85089373588562 }, { "auxiliary_loss_clip": 0.01384522, "auxiliary_loss_mlp": 0.01034428, "balance_loss_clip": 1.22667122, "balance_loss_mlp": 1.01533043, "epoch": 0.9190440402825792, "flos": 16736095232640.0, "grad_norm": 2.54085811877475, "language_loss": 0.72536325, "learning_rate": 6.830803940283458e-08, "loss": 0.74955273, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19091797, "step": 15286, "time_per_iteration": 2.834083080291748 }, { "auxiliary_loss_clip": 0.01397118, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.23796725, "balance_loss_mlp": 1.01683378, "epoch": 0.9191041635352473, "flos": 23451774946560.0, "grad_norm": 1.6281771963018994, "language_loss": 0.73922282, "learning_rate": 6.820715994405945e-08, "loss": 0.7635538, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19152832, "step": 15287, "time_per_iteration": 2.8573148250579834 }, { "auxiliary_loss_clip": 0.01402624, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.24133515, "balance_loss_mlp": 1.01315379, "epoch": 0.9191642867879152, "flos": 18816919200000.0, "grad_norm": 2.3162578518735604, "language_loss": 0.65993357, "learning_rate": 6.810635373906226e-08, "loss": 0.68430078, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20935059, "step": 15288, "time_per_iteration": 2.84517240524292 }, { "auxiliary_loss_clip": 0.01399247, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.23905325, "balance_loss_mlp": 1.01584935, "epoch": 0.9192244100405832, "flos": 32173705918080.0, "grad_norm": 1.8014855117846167, "language_loss": 0.71313477, "learning_rate": 6.800562079166549e-08, "loss": 0.73748124, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19555664, "step": 15289, "time_per_iteration": 2.9161102771759033 }, { "auxiliary_loss_clip": 0.01402855, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.24335933, "balance_loss_mlp": 1.0163573, "epoch": 0.9192845332932512, "flos": 16365041475840.0, "grad_norm": 2.0209914166034193, "language_loss": 0.75622904, "learning_rate": 6.790496110568921e-08, "loss": 0.78061414, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19287109, "step": 15290, "time_per_iteration": 2.813507556915283 }, { "auxiliary_loss_clip": 0.01389215, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.23207998, "balance_loss_mlp": 1.01093459, "epoch": 0.9193446565459191, "flos": 26625626248320.0, "grad_norm": 2.3585322575054963, "language_loss": 0.7278735, "learning_rate": 6.78043746849506e-08, "loss": 0.75206184, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18664551, "step": 15291, "time_per_iteration": 2.8639259338378906 }, { "auxiliary_loss_clip": 0.01390961, "auxiliary_loss_mlp": 0.01031087, "balance_loss_clip": 1.23389137, "balance_loss_mlp": 1.0130384, "epoch": 0.9194047797985871, "flos": 22502418410880.0, "grad_norm": 1.7731809954041826, "language_loss": 0.71079147, "learning_rate": 6.770386153326346e-08, "loss": 0.73501194, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18066406, "step": 15292, "time_per_iteration": 2.9565138816833496 }, { "auxiliary_loss_clip": 0.0140137, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.23963523, "balance_loss_mlp": 1.01401806, "epoch": 0.9194649030512551, "flos": 25088691749760.0, "grad_norm": 1.8251744884155028, "language_loss": 0.7383492, "learning_rate": 6.760342165443988e-08, "loss": 0.76270121, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19812012, "step": 15293, "time_per_iteration": 2.857969045639038 }, { "auxiliary_loss_clip": 0.01400829, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.24204659, "balance_loss_mlp": 1.01503956, "epoch": 0.9195250263039231, "flos": 11918585652480.0, "grad_norm": 1.9706265175302111, "language_loss": 0.78872132, "learning_rate": 6.750305505228837e-08, "loss": 0.81306463, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18481445, "step": 15294, "time_per_iteration": 2.8027405738830566 }, { "auxiliary_loss_clip": 0.01406238, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.2435472, "balance_loss_mlp": 1.01747787, "epoch": 0.919585149556591, "flos": 21843887322240.0, "grad_norm": 1.5268534553284165, "language_loss": 0.77810574, "learning_rate": 6.74027617306141e-08, "loss": 0.80254686, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20397949, "step": 15295, "time_per_iteration": 2.8774046897888184 }, { "auxiliary_loss_clip": 0.01390831, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.23718131, "balance_loss_mlp": 1.0119493, "epoch": 0.919645272809259, "flos": 28195797692160.0, "grad_norm": 2.023914655681007, "language_loss": 0.72710109, "learning_rate": 6.730254169322114e-08, "loss": 0.75130397, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.17504883, "step": 15296, "time_per_iteration": 2.907451629638672 }, { "auxiliary_loss_clip": 0.01383508, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.22660422, "balance_loss_mlp": 1.01641011, "epoch": 0.9197053960619269, "flos": 18341992085760.0, "grad_norm": 4.000792272020894, "language_loss": 0.7533083, "learning_rate": 6.720239494390912e-08, "loss": 0.77750224, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19482422, "step": 15297, "time_per_iteration": 2.8440535068511963 }, { "auxiliary_loss_clip": 0.01388015, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.23008966, "balance_loss_mlp": 1.01478195, "epoch": 0.9197655193145949, "flos": 28195028530560.0, "grad_norm": 5.777005437320414, "language_loss": 0.74556583, "learning_rate": 6.710232148647676e-08, "loss": 0.76978558, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19189453, "step": 15298, "time_per_iteration": 4.380703926086426 }, { "auxiliary_loss_clip": 0.01407376, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.24527979, "balance_loss_mlp": 1.0152657, "epoch": 0.9198256425672628, "flos": 17313945563520.0, "grad_norm": 1.9270295545195613, "language_loss": 0.80304468, "learning_rate": 6.70023213247175e-08, "loss": 0.82746637, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1953125, "step": 15299, "time_per_iteration": 2.795534133911133 }, { "auxiliary_loss_clip": 0.0139699, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.23807442, "balance_loss_mlp": 1.01423764, "epoch": 0.9198857658199309, "flos": 17867562664320.0, "grad_norm": 2.1494109768106493, "language_loss": 0.64817882, "learning_rate": 6.690239446242385e-08, "loss": 0.67248249, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19152832, "step": 15300, "time_per_iteration": 2.828676223754883 }, { "auxiliary_loss_clip": 0.01373378, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.22077, "balance_loss_mlp": 1.01170313, "epoch": 0.9199458890725988, "flos": 22137744170880.0, "grad_norm": 1.7115546422966972, "language_loss": 0.70440352, "learning_rate": 6.680254090338545e-08, "loss": 0.72842467, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.17041016, "step": 15301, "time_per_iteration": 4.321869850158691 }, { "auxiliary_loss_clip": 0.01406558, "auxiliary_loss_mlp": 0.01034371, "balance_loss_clip": 1.24580359, "balance_loss_mlp": 1.01464176, "epoch": 0.9200060123252668, "flos": 16042562651520.0, "grad_norm": 1.6865449254505391, "language_loss": 0.71326482, "learning_rate": 6.670276065138814e-08, "loss": 0.73767412, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19726562, "step": 15302, "time_per_iteration": 2.9079232215881348 }, { "auxiliary_loss_clip": 0.01404836, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.24285793, "balance_loss_mlp": 1.01481497, "epoch": 0.9200661355779348, "flos": 26874346769280.0, "grad_norm": 1.634503676530838, "language_loss": 0.77603257, "learning_rate": 6.660305371021579e-08, "loss": 0.80040526, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.17614746, "step": 15303, "time_per_iteration": 2.8875041007995605 }, { "auxiliary_loss_clip": 0.01406949, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.24882007, "balance_loss_mlp": 1.01572311, "epoch": 0.9201262588306027, "flos": 12794410108800.0, "grad_norm": 1.978585718842766, "language_loss": 0.88868046, "learning_rate": 6.650342008365006e-08, "loss": 0.91309345, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18640137, "step": 15304, "time_per_iteration": 2.7883200645446777 }, { "auxiliary_loss_clip": 0.01413675, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.25063491, "balance_loss_mlp": 1.01493096, "epoch": 0.9201863820832707, "flos": 20641421520000.0, "grad_norm": 1.9700693556251907, "language_loss": 0.78336072, "learning_rate": 6.64038597754677e-08, "loss": 0.80784965, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.20288086, "step": 15305, "time_per_iteration": 2.8257133960723877 }, { "auxiliary_loss_clip": 0.0139016, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.23075187, "balance_loss_mlp": 1.01411128, "epoch": 0.9202465053359387, "flos": 26406975536640.0, "grad_norm": 2.0912529366472334, "language_loss": 0.82770836, "learning_rate": 6.630437278944501e-08, "loss": 0.85193646, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.1854248, "step": 15306, "time_per_iteration": 2.8687961101531982 }, { "auxiliary_loss_clip": 0.01393717, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.23629308, "balance_loss_mlp": 1.01165378, "epoch": 0.9203066285886067, "flos": 10495252143360.0, "grad_norm": 2.187629368673581, "language_loss": 0.72916347, "learning_rate": 6.62049591293541e-08, "loss": 0.75339198, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.17492676, "step": 15307, "time_per_iteration": 4.2179882526397705 }, { "auxiliary_loss_clip": 0.01410148, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.2479744, "balance_loss_mlp": 1.01193106, "epoch": 0.9203667518412746, "flos": 19400108417280.0, "grad_norm": 1.8215912320509062, "language_loss": 0.79377413, "learning_rate": 6.610561879896526e-08, "loss": 0.81821704, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.2220459, "step": 15308, "time_per_iteration": 4.203775405883789 }, { "auxiliary_loss_clip": 0.01393303, "auxiliary_loss_mlp": 0.01032377, "balance_loss_clip": 1.23462212, "balance_loss_mlp": 1.0133512, "epoch": 0.9204268750939426, "flos": 15933576631680.0, "grad_norm": 1.809850349557736, "language_loss": 0.79196882, "learning_rate": 6.600635180204484e-08, "loss": 0.81622571, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19018555, "step": 15309, "time_per_iteration": 2.7997732162475586 }, { "auxiliary_loss_clip": 0.01396466, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.23608994, "balance_loss_mlp": 1.01203251, "epoch": 0.9204869983466105, "flos": 16480361767680.0, "grad_norm": 2.9117239600965066, "language_loss": 0.66949254, "learning_rate": 6.590715814235781e-08, "loss": 0.69376493, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18762207, "step": 15310, "time_per_iteration": 2.8129992485046387 }, { "auxiliary_loss_clip": 0.01395815, "auxiliary_loss_mlp": 0.01030835, "balance_loss_clip": 1.23536754, "balance_loss_mlp": 1.01155865, "epoch": 0.9205471215992785, "flos": 21548084947200.0, "grad_norm": 2.532308133573014, "language_loss": 0.66495693, "learning_rate": 6.580803782366495e-08, "loss": 0.68922341, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19274902, "step": 15311, "time_per_iteration": 2.886121988296509 }, { "auxiliary_loss_clip": 0.01394537, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.23428428, "balance_loss_mlp": 1.01600623, "epoch": 0.9206072448519464, "flos": 25015793097600.0, "grad_norm": 1.9256586188559217, "language_loss": 0.7695421, "learning_rate": 6.570899084972503e-08, "loss": 0.79383349, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18603516, "step": 15312, "time_per_iteration": 2.8597910404205322 }, { "auxiliary_loss_clip": 0.01389649, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.23451793, "balance_loss_mlp": 1.01632452, "epoch": 0.9206673681046145, "flos": 20532571234560.0, "grad_norm": 1.669277366692225, "language_loss": 0.79150736, "learning_rate": 6.561001722429394e-08, "loss": 0.81575358, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18640137, "step": 15313, "time_per_iteration": 2.8319849967956543 }, { "auxiliary_loss_clip": 0.01399445, "auxiliary_loss_mlp": 0.01032914, "balance_loss_clip": 1.23893785, "balance_loss_mlp": 1.01318479, "epoch": 0.9207274913572824, "flos": 20892404280960.0, "grad_norm": 1.8155330192538022, "language_loss": 0.79307544, "learning_rate": 6.55111169511251e-08, "loss": 0.81739897, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19726562, "step": 15314, "time_per_iteration": 2.8369131088256836 }, { "auxiliary_loss_clip": 0.01421863, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.25551295, "balance_loss_mlp": 1.01462817, "epoch": 0.9207876146099504, "flos": 22717811496960.0, "grad_norm": 1.67903664927146, "language_loss": 0.79631811, "learning_rate": 6.541229003396864e-08, "loss": 0.82088041, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 1.66503906, "router_z_loss_mlp": 0.1973877, "step": 15315, "time_per_iteration": 2.8352620601654053 }, { "auxiliary_loss_clip": 0.01419116, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.25421119, "balance_loss_mlp": 1.01667094, "epoch": 0.9208477378626184, "flos": 18514511084160.0, "grad_norm": 1.8000234294543411, "language_loss": 0.77446598, "learning_rate": 6.531353647657156e-08, "loss": 0.79901928, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19543457, "step": 15316, "time_per_iteration": 2.8204267024993896 }, { "auxiliary_loss_clip": 0.01406089, "auxiliary_loss_mlp": 0.01037294, "balance_loss_clip": 1.24477434, "balance_loss_mlp": 1.01688528, "epoch": 0.9209078611152863, "flos": 23009134636800.0, "grad_norm": 1.6798810912931423, "language_loss": 0.69579732, "learning_rate": 6.521485628267931e-08, "loss": 0.72023118, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20397949, "step": 15317, "time_per_iteration": 2.955392360687256 }, { "auxiliary_loss_clip": 0.01395646, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.2366749, "balance_loss_mlp": 1.01387322, "epoch": 0.9209679843679544, "flos": 24072273141120.0, "grad_norm": 1.6508630409571223, "language_loss": 0.84230673, "learning_rate": 6.511624945603378e-08, "loss": 0.8665961, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19421387, "step": 15318, "time_per_iteration": 2.843653678894043 }, { "auxiliary_loss_clip": 0.01394256, "auxiliary_loss_mlp": 0.01034419, "balance_loss_clip": 1.23512042, "balance_loss_mlp": 1.01495218, "epoch": 0.9210281076206223, "flos": 13561339034880.0, "grad_norm": 2.4305603902380173, "language_loss": 0.86611187, "learning_rate": 6.501771600037354e-08, "loss": 0.89039862, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19470215, "step": 15319, "time_per_iteration": 2.7987968921661377 }, { "auxiliary_loss_clip": 0.01174701, "auxiliary_loss_mlp": 0.01020339, "balance_loss_clip": 1.08829403, "balance_loss_mlp": 1.00260031, "epoch": 0.9210882308732903, "flos": 71460432535680.0, "grad_norm": 0.769243309131967, "language_loss": 0.5623067, "learning_rate": 6.491925591943559e-08, "loss": 0.58425707, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.17773438, "step": 15320, "time_per_iteration": 3.373079299926758 }, { "auxiliary_loss_clip": 0.01403359, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.23866343, "balance_loss_mlp": 1.0136106, "epoch": 0.9211483541259582, "flos": 18516728079360.0, "grad_norm": 2.1756345412580926, "language_loss": 0.64986277, "learning_rate": 6.482086921695384e-08, "loss": 0.67423815, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20568848, "step": 15321, "time_per_iteration": 2.8464059829711914 }, { "auxiliary_loss_clip": 0.01379786, "auxiliary_loss_mlp": 0.01029262, "balance_loss_clip": 1.2266494, "balance_loss_mlp": 1.01078475, "epoch": 0.9212084773786262, "flos": 23268623420160.0, "grad_norm": 1.4266917066165652, "language_loss": 0.71874309, "learning_rate": 6.47225558966582e-08, "loss": 0.74283361, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 0.18469238, "step": 15322, "time_per_iteration": 2.895540952682495 }, { "auxiliary_loss_clip": 0.01390329, "auxiliary_loss_mlp": 0.01034702, "balance_loss_clip": 1.23228836, "balance_loss_mlp": 1.01605761, "epoch": 0.9212686006312941, "flos": 16298115137280.0, "grad_norm": 3.0229059573195167, "language_loss": 0.70716405, "learning_rate": 6.462431596227725e-08, "loss": 0.73141432, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18640137, "step": 15323, "time_per_iteration": 2.805449962615967 }, { "auxiliary_loss_clip": 0.01404946, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.24183798, "balance_loss_mlp": 1.01647198, "epoch": 0.9213287238839621, "flos": 19793630856960.0, "grad_norm": 1.8895014902475942, "language_loss": 0.75549746, "learning_rate": 6.452614941753597e-08, "loss": 0.77991736, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.20544434, "step": 15324, "time_per_iteration": 2.821956157684326 }, { "auxiliary_loss_clip": 0.01389038, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.23073983, "balance_loss_mlp": 1.01694012, "epoch": 0.92138884713663, "flos": 21039649418880.0, "grad_norm": 1.7855358775431383, "language_loss": 0.71364719, "learning_rate": 6.442805626615744e-08, "loss": 0.73790371, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19677734, "step": 15325, "time_per_iteration": 2.852393627166748 }, { "auxiliary_loss_clip": 0.01394928, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.23585725, "balance_loss_mlp": 1.01371229, "epoch": 0.9214489703892981, "flos": 28599590701440.0, "grad_norm": 1.7234948251457534, "language_loss": 0.78594351, "learning_rate": 6.433003651186109e-08, "loss": 0.81022549, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19543457, "step": 15326, "time_per_iteration": 2.8858280181884766 }, { "auxiliary_loss_clip": 0.01399774, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.23838758, "balance_loss_mlp": 1.01545215, "epoch": 0.921509093641966, "flos": 16369837424640.0, "grad_norm": 2.551779633885365, "language_loss": 0.72104686, "learning_rate": 6.42320901583635e-08, "loss": 0.74539119, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1920166, "step": 15327, "time_per_iteration": 2.782902240753174 }, { "auxiliary_loss_clip": 0.01414103, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.25116479, "balance_loss_mlp": 1.01945949, "epoch": 0.921569216894634, "flos": 26841381292800.0, "grad_norm": 2.1425972750356688, "language_loss": 0.78149194, "learning_rate": 6.413421720937906e-08, "loss": 0.80601919, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19152832, "step": 15328, "time_per_iteration": 2.849195957183838 }, { "auxiliary_loss_clip": 0.01385225, "auxiliary_loss_mlp": 0.01030379, "balance_loss_clip": 1.22811997, "balance_loss_mlp": 1.01204443, "epoch": 0.921629340147302, "flos": 24656005296000.0, "grad_norm": 2.927058594993416, "language_loss": 0.72058135, "learning_rate": 6.4036417668619e-08, "loss": 0.74473733, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18322754, "step": 15329, "time_per_iteration": 2.8509557247161865 }, { "auxiliary_loss_clip": 0.01389993, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.23239136, "balance_loss_mlp": 1.01232266, "epoch": 0.9216894633999699, "flos": 15095061152640.0, "grad_norm": 3.0027311975654514, "language_loss": 0.87399179, "learning_rate": 6.393869153979192e-08, "loss": 0.89819503, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18005371, "step": 15330, "time_per_iteration": 2.8531572818756104 }, { "auxiliary_loss_clip": 0.01399177, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.23838329, "balance_loss_mlp": 1.01155186, "epoch": 0.921749586652638, "flos": 19212884858880.0, "grad_norm": 1.9176839548145632, "language_loss": 0.7672748, "learning_rate": 6.384103882660397e-08, "loss": 0.79157329, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19116211, "step": 15331, "time_per_iteration": 2.857767343521118 }, { "auxiliary_loss_clip": 0.01393544, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.23400116, "balance_loss_mlp": 1.01386738, "epoch": 0.9218097099053059, "flos": 20532299765760.0, "grad_norm": 1.8946470075362702, "language_loss": 0.76976365, "learning_rate": 6.374345953275794e-08, "loss": 0.79402637, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18859863, "step": 15332, "time_per_iteration": 2.8147265911102295 }, { "auxiliary_loss_clip": 0.01387851, "auxiliary_loss_mlp": 0.01032223, "balance_loss_clip": 1.2297287, "balance_loss_mlp": 1.01410294, "epoch": 0.9218698331579739, "flos": 17357769792000.0, "grad_norm": 2.1905504084626277, "language_loss": 0.74964464, "learning_rate": 6.364595366195358e-08, "loss": 0.77384537, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18139648, "step": 15333, "time_per_iteration": 4.2445502281188965 }, { "auxiliary_loss_clip": 0.01178279, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.09072447, "balance_loss_mlp": 1.00654948, "epoch": 0.9219299564106418, "flos": 61985309074560.0, "grad_norm": 0.8047373640313156, "language_loss": 0.52925825, "learning_rate": 6.354852121788879e-08, "loss": 0.55135256, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.24609375, "step": 15334, "time_per_iteration": 3.318010091781616 }, { "auxiliary_loss_clip": 0.01390178, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.23383939, "balance_loss_mlp": 1.01170254, "epoch": 0.9219900796633098, "flos": 15709179830400.0, "grad_norm": 2.190086271923082, "language_loss": 0.63060045, "learning_rate": 6.345116220425839e-08, "loss": 0.65481359, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19433594, "step": 15335, "time_per_iteration": 2.829707145690918 }, { "auxiliary_loss_clip": 0.0140276, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.2431531, "balance_loss_mlp": 1.01494801, "epoch": 0.9220502029159777, "flos": 24942487242240.0, "grad_norm": 1.6502912404919745, "language_loss": 0.72544599, "learning_rate": 6.335387662475366e-08, "loss": 0.74981225, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18920898, "step": 15336, "time_per_iteration": 4.393793106079102 }, { "auxiliary_loss_clip": 0.01391626, "auxiliary_loss_mlp": 0.01030974, "balance_loss_clip": 1.23512197, "balance_loss_mlp": 1.01397443, "epoch": 0.9221103261686457, "flos": 15675128478720.0, "grad_norm": 1.803001884291993, "language_loss": 0.72593945, "learning_rate": 6.325666448306433e-08, "loss": 0.75016546, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.16992188, "step": 15337, "time_per_iteration": 2.7989416122436523 }, { "auxiliary_loss_clip": 0.01181731, "auxiliary_loss_mlp": 0.0104344, "balance_loss_clip": 1.0913707, "balance_loss_mlp": 1.01692772, "epoch": 0.9221704494213137, "flos": 67547867080320.0, "grad_norm": 0.8836295383695084, "language_loss": 0.65383911, "learning_rate": 6.31595257828763e-08, "loss": 0.67609084, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 0.265625, "step": 15338, "time_per_iteration": 3.235442876815796 }, { "auxiliary_loss_clip": 0.01404147, "auxiliary_loss_mlp": 0.01034896, "balance_loss_clip": 1.24412668, "balance_loss_mlp": 1.01554835, "epoch": 0.9222305726739817, "flos": 30239131703040.0, "grad_norm": 1.6808689760019158, "language_loss": 0.67595446, "learning_rate": 6.306246052787289e-08, "loss": 0.70034492, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19348145, "step": 15339, "time_per_iteration": 2.941366195678711 }, { "auxiliary_loss_clip": 0.01392424, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.233042, "balance_loss_mlp": 1.01210237, "epoch": 0.9222906959266496, "flos": 25347954309120.0, "grad_norm": 2.067827232934132, "language_loss": 0.7289077, "learning_rate": 6.296546872173513e-08, "loss": 0.75314426, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19140625, "step": 15340, "time_per_iteration": 2.862283945083618 }, { "auxiliary_loss_clip": 0.01397845, "auxiliary_loss_mlp": 0.01034205, "balance_loss_clip": 1.2403276, "balance_loss_mlp": 1.01573932, "epoch": 0.9223508191793176, "flos": 27611341620480.0, "grad_norm": 1.5390161180509006, "language_loss": 0.70785582, "learning_rate": 6.286855036814098e-08, "loss": 0.7321763, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18469238, "step": 15341, "time_per_iteration": 2.9116036891937256 }, { "auxiliary_loss_clip": 0.01381889, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.2283082, "balance_loss_mlp": 1.0161469, "epoch": 0.9224109424319856, "flos": 27318887360640.0, "grad_norm": 1.7400149760509922, "language_loss": 0.67988372, "learning_rate": 6.277170547076571e-08, "loss": 0.70404458, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.18041992, "step": 15342, "time_per_iteration": 4.282375335693359 }, { "auxiliary_loss_clip": 0.01401001, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.24077296, "balance_loss_mlp": 1.01421857, "epoch": 0.9224710656846535, "flos": 48223580244480.0, "grad_norm": 1.9717361764336299, "language_loss": 0.70310253, "learning_rate": 6.26749340332815e-08, "loss": 0.7274335, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.17895508, "step": 15343, "time_per_iteration": 4.432230472564697 }, { "auxiliary_loss_clip": 0.01179557, "auxiliary_loss_mlp": 0.01026447, "balance_loss_clip": 1.09212303, "balance_loss_mlp": 1.00126958, "epoch": 0.9225311889373216, "flos": 66755754783360.0, "grad_norm": 0.9877778365909519, "language_loss": 0.52096355, "learning_rate": 6.257823605935786e-08, "loss": 0.54302359, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.25195312, "step": 15344, "time_per_iteration": 3.537489414215088 }, { "auxiliary_loss_clip": 0.01387051, "auxiliary_loss_mlp": 0.01032527, "balance_loss_clip": 1.23305869, "balance_loss_mlp": 1.01393008, "epoch": 0.9225913121899895, "flos": 22280962521600.0, "grad_norm": 1.5879415604495823, "language_loss": 0.70942545, "learning_rate": 6.248161155266162e-08, "loss": 0.73362118, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.18579102, "step": 15345, "time_per_iteration": 2.8364100456237793 }, { "auxiliary_loss_clip": 0.01402134, "auxiliary_loss_mlp": 0.01039971, "balance_loss_clip": 1.24299097, "balance_loss_mlp": 1.02088547, "epoch": 0.9226514354426575, "flos": 20091966940800.0, "grad_norm": 1.700504777171097, "language_loss": 0.7808609, "learning_rate": 6.238506051685677e-08, "loss": 0.80528188, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1907959, "step": 15346, "time_per_iteration": 2.857239246368408 }, { "auxiliary_loss_clip": 0.01417677, "auxiliary_loss_mlp": 0.01035157, "balance_loss_clip": 1.25290322, "balance_loss_mlp": 1.01565409, "epoch": 0.9227115586953254, "flos": 16079464425600.0, "grad_norm": 1.760423133367011, "language_loss": 0.77786767, "learning_rate": 6.228858295560457e-08, "loss": 0.80239606, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19519043, "step": 15347, "time_per_iteration": 2.806594133377075 }, { "auxiliary_loss_clip": 0.01380892, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.22654819, "balance_loss_mlp": 1.01039696, "epoch": 0.9227716819479934, "flos": 20454967123200.0, "grad_norm": 1.4695805987386088, "language_loss": 0.76987785, "learning_rate": 6.219217887256367e-08, "loss": 0.79397184, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18127441, "step": 15348, "time_per_iteration": 2.9550981521606445 }, { "auxiliary_loss_clip": 0.01397979, "auxiliary_loss_mlp": 0.0103465, "balance_loss_clip": 1.23580396, "balance_loss_mlp": 1.0149802, "epoch": 0.9228318052006613, "flos": 25017693379200.0, "grad_norm": 2.3074366504644064, "language_loss": 0.68477339, "learning_rate": 6.209584827138959e-08, "loss": 0.70909965, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19677734, "step": 15349, "time_per_iteration": 2.8561244010925293 }, { "auxiliary_loss_clip": 0.01403818, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.24198067, "balance_loss_mlp": 1.01519227, "epoch": 0.9228919284533293, "flos": 12684881151360.0, "grad_norm": 2.8995115457269853, "language_loss": 0.87869346, "learning_rate": 6.199959115573495e-08, "loss": 0.90307885, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1953125, "step": 15350, "time_per_iteration": 2.836699962615967 }, { "auxiliary_loss_clip": 0.0117973, "auxiliary_loss_mlp": 0.01046402, "balance_loss_clip": 1.09257865, "balance_loss_mlp": 1.01912713, "epoch": 0.9229520517059973, "flos": 70015472012160.0, "grad_norm": 0.7710484731428903, "language_loss": 0.60393149, "learning_rate": 6.190340752924994e-08, "loss": 0.62619287, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.2734375, "step": 15351, "time_per_iteration": 3.322584629058838 }, { "auxiliary_loss_clip": 0.01410127, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.24642801, "balance_loss_mlp": 1.01195347, "epoch": 0.9230121749586653, "flos": 14802109200000.0, "grad_norm": 1.9129608629482966, "language_loss": 0.78831851, "learning_rate": 6.180729739558233e-08, "loss": 0.81271994, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.18054199, "step": 15352, "time_per_iteration": 2.8323557376861572 }, { "auxiliary_loss_clip": 0.01416466, "auxiliary_loss_mlp": 0.01037185, "balance_loss_clip": 1.25172102, "balance_loss_mlp": 1.01700258, "epoch": 0.9230722982113332, "flos": 22977752728320.0, "grad_norm": 2.137089888184517, "language_loss": 0.60047692, "learning_rate": 6.171126075837585e-08, "loss": 0.62501341, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.20166016, "step": 15353, "time_per_iteration": 2.9186341762542725 }, { "auxiliary_loss_clip": 0.01393868, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.23641336, "balance_loss_mlp": 1.01250064, "epoch": 0.9231324214640012, "flos": 18560597552640.0, "grad_norm": 12.782841877349565, "language_loss": 0.74940014, "learning_rate": 6.161529762127293e-08, "loss": 0.77364612, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18237305, "step": 15354, "time_per_iteration": 2.829019546508789 }, { "auxiliary_loss_clip": 0.01405862, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.24056721, "balance_loss_mlp": 1.01219749, "epoch": 0.9231925447166691, "flos": 22090752806400.0, "grad_norm": 2.2222156322561766, "language_loss": 0.6616869, "learning_rate": 6.1519407987912e-08, "loss": 0.68607217, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.20446777, "step": 15355, "time_per_iteration": 2.803534507751465 }, { "auxiliary_loss_clip": 0.01388796, "auxiliary_loss_mlp": 0.01031781, "balance_loss_clip": 1.23241222, "balance_loss_mlp": 1.01368499, "epoch": 0.9232526679693371, "flos": 26552184658560.0, "grad_norm": 1.6780826912140967, "language_loss": 0.7495774, "learning_rate": 6.142359186192947e-08, "loss": 0.77378321, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18115234, "step": 15356, "time_per_iteration": 2.8956570625305176 }, { "auxiliary_loss_clip": 0.01396218, "auxiliary_loss_mlp": 0.01030966, "balance_loss_clip": 1.23669982, "balance_loss_mlp": 1.01118898, "epoch": 0.9233127912220052, "flos": 14764754977920.0, "grad_norm": 1.8475532748650503, "language_loss": 0.61681724, "learning_rate": 6.132784924695844e-08, "loss": 0.64108908, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19763184, "step": 15357, "time_per_iteration": 2.804544687271118 }, { "auxiliary_loss_clip": 0.01404158, "auxiliary_loss_mlp": 0.01033373, "balance_loss_clip": 1.24044204, "balance_loss_mlp": 1.01341736, "epoch": 0.9233729144746731, "flos": 25271888520960.0, "grad_norm": 1.3859654387999516, "language_loss": 0.7018708, "learning_rate": 6.123218014662956e-08, "loss": 0.72624612, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19934082, "step": 15358, "time_per_iteration": 2.8624107837677 }, { "auxiliary_loss_clip": 0.01389561, "auxiliary_loss_mlp": 0.01033085, "balance_loss_clip": 1.23140526, "balance_loss_mlp": 1.01405907, "epoch": 0.9234330377273411, "flos": 27859971651840.0, "grad_norm": 4.671304278896834, "language_loss": 0.74422187, "learning_rate": 6.113658456457104e-08, "loss": 0.76844835, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19030762, "step": 15359, "time_per_iteration": 2.8641719818115234 }, { "auxiliary_loss_clip": 0.01403131, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.24300313, "balance_loss_mlp": 1.01335955, "epoch": 0.923493160980009, "flos": 24619148766720.0, "grad_norm": 2.4298393484471905, "language_loss": 0.6570648, "learning_rate": 6.104106250440732e-08, "loss": 0.68142188, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19213867, "step": 15360, "time_per_iteration": 2.8536829948425293 }, { "auxiliary_loss_clip": 0.01179808, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.09236121, "balance_loss_mlp": 1.00505161, "epoch": 0.923553284232677, "flos": 67733099867520.0, "grad_norm": 0.7680831528932573, "language_loss": 0.5523603, "learning_rate": 6.094561396976083e-08, "loss": 0.57443011, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.22167969, "step": 15361, "time_per_iteration": 3.255507707595825 }, { "auxiliary_loss_clip": 0.01412602, "auxiliary_loss_mlp": 0.01033071, "balance_loss_clip": 1.24795961, "balance_loss_mlp": 1.01467705, "epoch": 0.9236134074853449, "flos": 18816557241600.0, "grad_norm": 2.0299892907349535, "language_loss": 0.70792627, "learning_rate": 6.085023896425112e-08, "loss": 0.73238301, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18395996, "step": 15362, "time_per_iteration": 2.9089877605438232 }, { "auxiliary_loss_clip": 0.01413214, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.24894297, "balance_loss_mlp": 1.01408947, "epoch": 0.923673530738013, "flos": 27793904964480.0, "grad_norm": 1.5710097991348202, "language_loss": 0.76482123, "learning_rate": 6.075493749149463e-08, "loss": 0.78929859, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.2043457, "step": 15363, "time_per_iteration": 2.9085824489593506 }, { "auxiliary_loss_clip": 0.01404188, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.24460483, "balance_loss_mlp": 1.01225066, "epoch": 0.9237336539906809, "flos": 26808370571520.0, "grad_norm": 2.030296818179137, "language_loss": 0.84107423, "learning_rate": 6.065970955510514e-08, "loss": 0.86543143, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19287109, "step": 15364, "time_per_iteration": 2.8963844776153564 }, { "auxiliary_loss_clip": 0.01398808, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.24005318, "balance_loss_mlp": 1.01430535, "epoch": 0.9237937772433489, "flos": 23598477146880.0, "grad_norm": 1.7178121904351211, "language_loss": 0.68330765, "learning_rate": 6.056455515869419e-08, "loss": 0.70761698, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.17822266, "step": 15365, "time_per_iteration": 2.896214485168457 }, { "auxiliary_loss_clip": 0.01398532, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.23932719, "balance_loss_mlp": 1.01569414, "epoch": 0.9238539004960168, "flos": 26151694519680.0, "grad_norm": 2.161479666263676, "language_loss": 0.63483047, "learning_rate": 6.046947430586913e-08, "loss": 0.65916371, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19091797, "step": 15366, "time_per_iteration": 2.8952066898345947 }, { "auxiliary_loss_clip": 0.01389805, "auxiliary_loss_mlp": 0.01031204, "balance_loss_clip": 1.23292136, "balance_loss_mlp": 1.0118444, "epoch": 0.9239140237486848, "flos": 21077546578560.0, "grad_norm": 1.389385536408186, "language_loss": 0.75401133, "learning_rate": 6.037446700023619e-08, "loss": 0.77822137, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19360352, "step": 15367, "time_per_iteration": 2.8543620109558105 }, { "auxiliary_loss_clip": 0.0137744, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.22554314, "balance_loss_mlp": 1.01435876, "epoch": 0.9239741470013527, "flos": 24618605829120.0, "grad_norm": 1.8308677880189417, "language_loss": 0.6531288, "learning_rate": 6.027953324539759e-08, "loss": 0.67723501, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 1.51953125, "router_z_loss_mlp": 0.18823242, "step": 15368, "time_per_iteration": 4.290413856506348 }, { "auxiliary_loss_clip": 0.01414714, "auxiliary_loss_mlp": 0.01034479, "balance_loss_clip": 1.2517395, "balance_loss_mlp": 1.01533377, "epoch": 0.9240342702540207, "flos": 24729175416960.0, "grad_norm": 1.712310540463667, "language_loss": 0.75642776, "learning_rate": 6.018467304495401e-08, "loss": 0.78091967, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19152832, "step": 15369, "time_per_iteration": 2.853825569152832 }, { "auxiliary_loss_clip": 0.01417391, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.25198853, "balance_loss_mlp": 1.02089119, "epoch": 0.9240943935066888, "flos": 20859710273280.0, "grad_norm": 1.8400534618230138, "language_loss": 0.77378118, "learning_rate": 6.008988640250145e-08, "loss": 0.79836261, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.1986084, "step": 15370, "time_per_iteration": 2.8508455753326416 }, { "auxiliary_loss_clip": 0.01399374, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.24011981, "balance_loss_mlp": 1.01617682, "epoch": 0.9241545167593567, "flos": 24473034748800.0, "grad_norm": 1.9666054559392503, "language_loss": 0.67093456, "learning_rate": 5.999517332163528e-08, "loss": 0.69527233, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18212891, "step": 15371, "time_per_iteration": 4.3189496994018555 }, { "auxiliary_loss_clip": 0.01177843, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.09150779, "balance_loss_mlp": 1.00839698, "epoch": 0.9242146400120247, "flos": 61858062138240.0, "grad_norm": 0.7392553713229377, "language_loss": 0.57740307, "learning_rate": 5.99005338059464e-08, "loss": 0.59951341, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.24804688, "step": 15372, "time_per_iteration": 3.298266649246216 }, { "auxiliary_loss_clip": 0.01384841, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.23016834, "balance_loss_mlp": 1.01222444, "epoch": 0.9242747632646926, "flos": 22056972923520.0, "grad_norm": 1.722602784454251, "language_loss": 0.70518386, "learning_rate": 5.98059678590237e-08, "loss": 0.72933459, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18017578, "step": 15373, "time_per_iteration": 2.8402249813079834 }, { "auxiliary_loss_clip": 0.01399603, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.23994279, "balance_loss_mlp": 1.01598489, "epoch": 0.9243348865173606, "flos": 18487427431680.0, "grad_norm": 3.302380146821489, "language_loss": 0.76091635, "learning_rate": 5.971147548445299e-08, "loss": 0.78525805, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18579102, "step": 15374, "time_per_iteration": 2.833189010620117 }, { "auxiliary_loss_clip": 0.0139622, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.2372402, "balance_loss_mlp": 1.01367283, "epoch": 0.9243950097700285, "flos": 23269302092160.0, "grad_norm": 1.6104188438905056, "language_loss": 0.65554804, "learning_rate": 5.961705668581784e-08, "loss": 0.67983508, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18811035, "step": 15375, "time_per_iteration": 2.865429639816284 }, { "auxiliary_loss_clip": 0.01395713, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.23878407, "balance_loss_mlp": 1.01472604, "epoch": 0.9244551330226966, "flos": 29760223046400.0, "grad_norm": 2.1023233929341485, "language_loss": 0.67654598, "learning_rate": 5.952271146669829e-08, "loss": 0.70083678, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18652344, "step": 15376, "time_per_iteration": 2.9252164363861084 }, { "auxiliary_loss_clip": 0.01180713, "auxiliary_loss_mlp": 0.01037186, "balance_loss_clip": 1.09322047, "balance_loss_mlp": 1.01363015, "epoch": 0.9245152562753645, "flos": 68896537390080.0, "grad_norm": 0.6510989781460519, "language_loss": 0.61173582, "learning_rate": 5.94284398306717e-08, "loss": 0.63391483, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.23535156, "step": 15377, "time_per_iteration": 3.395092248916626 }, { "auxiliary_loss_clip": 0.01396702, "auxiliary_loss_mlp": 0.01031982, "balance_loss_clip": 1.23782086, "balance_loss_mlp": 1.01353991, "epoch": 0.9245753795280325, "flos": 21589058753280.0, "grad_norm": 1.6047680216222764, "language_loss": 0.74676275, "learning_rate": 5.933424178131341e-08, "loss": 0.77104962, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18457031, "step": 15378, "time_per_iteration": 5.628398180007935 }, { "auxiliary_loss_clip": 0.01401724, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.24133039, "balance_loss_mlp": 1.01364839, "epoch": 0.9246355027807004, "flos": 34509448944000.0, "grad_norm": 4.366674835744232, "language_loss": 0.63546932, "learning_rate": 5.924011732219503e-08, "loss": 0.65981293, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18994141, "step": 15379, "time_per_iteration": 2.9438424110412598 }, { "auxiliary_loss_clip": 0.01390628, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.23439431, "balance_loss_mlp": 1.01625443, "epoch": 0.9246956260333684, "flos": 15960479304960.0, "grad_norm": 1.9345295429167837, "language_loss": 0.85116267, "learning_rate": 5.914606645688591e-08, "loss": 0.87542272, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19128418, "step": 15380, "time_per_iteration": 2.8681411743164062 }, { "auxiliary_loss_clip": 0.01401206, "auxiliary_loss_mlp": 0.01032493, "balance_loss_clip": 1.2385242, "balance_loss_mlp": 1.01310956, "epoch": 0.9247557492860363, "flos": 23378966784000.0, "grad_norm": 1.906807415582918, "language_loss": 0.74125904, "learning_rate": 5.905208918895233e-08, "loss": 0.76559603, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19384766, "step": 15381, "time_per_iteration": 2.972817897796631 }, { "auxiliary_loss_clip": 0.01399532, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.24068189, "balance_loss_mlp": 1.01630187, "epoch": 0.9248158725387043, "flos": 23050334666880.0, "grad_norm": 1.979077482593437, "language_loss": 0.79152942, "learning_rate": 5.8958185521958524e-08, "loss": 0.81587398, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1862793, "step": 15382, "time_per_iteration": 2.8791143894195557 }, { "auxiliary_loss_clip": 0.01401857, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.24026549, "balance_loss_mlp": 1.01340175, "epoch": 0.9248759957913724, "flos": 22531040386560.0, "grad_norm": 1.524587271053467, "language_loss": 0.75127017, "learning_rate": 5.886435545946455e-08, "loss": 0.77561873, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19592285, "step": 15383, "time_per_iteration": 2.8699631690979004 }, { "auxiliary_loss_clip": 0.01394371, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.23630071, "balance_loss_mlp": 1.01470256, "epoch": 0.9249361190440403, "flos": 25458207183360.0, "grad_norm": 1.7354771387759966, "language_loss": 0.76046753, "learning_rate": 5.8770599005028456e-08, "loss": 0.78474122, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18310547, "step": 15384, "time_per_iteration": 2.9247310161590576 }, { "auxiliary_loss_clip": 0.01384829, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.22895908, "balance_loss_mlp": 1.01565051, "epoch": 0.9249962422967083, "flos": 12384418561920.0, "grad_norm": 3.7755490660835034, "language_loss": 0.67348057, "learning_rate": 5.8676916162206045e-08, "loss": 0.69767749, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19226074, "step": 15385, "time_per_iteration": 2.8124303817749023 }, { "auxiliary_loss_clip": 0.01392957, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.23551738, "balance_loss_mlp": 1.01202619, "epoch": 0.9250563655493762, "flos": 22939629344640.0, "grad_norm": 1.8316392536887356, "language_loss": 0.81366527, "learning_rate": 5.85833069345496e-08, "loss": 0.83790112, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18579102, "step": 15386, "time_per_iteration": 2.8352081775665283 }, { "auxiliary_loss_clip": 0.01394347, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 1.23812008, "balance_loss_mlp": 1.01436663, "epoch": 0.9251164888020442, "flos": 18487834634880.0, "grad_norm": 4.2238303899838705, "language_loss": 0.76131105, "learning_rate": 5.8489771325608504e-08, "loss": 0.78558433, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18603516, "step": 15387, "time_per_iteration": 2.9227821826934814 }, { "auxiliary_loss_clip": 0.01387014, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.23070669, "balance_loss_mlp": 1.01633191, "epoch": 0.9251766120547121, "flos": 33050163801600.0, "grad_norm": 1.2563008386865662, "language_loss": 0.70667022, "learning_rate": 5.839630933893014e-08, "loss": 0.73089206, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18847656, "step": 15388, "time_per_iteration": 2.952864170074463 }, { "auxiliary_loss_clip": 0.01403363, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.24157178, "balance_loss_mlp": 1.01408494, "epoch": 0.9252367353073802, "flos": 24398235815040.0, "grad_norm": 2.161792832276638, "language_loss": 0.8250975, "learning_rate": 5.8302920978058115e-08, "loss": 0.84946096, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18884277, "step": 15389, "time_per_iteration": 2.868638753890991 }, { "auxiliary_loss_clip": 0.01430691, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.26235938, "balance_loss_mlp": 1.01655591, "epoch": 0.9252968585600481, "flos": 18926221933440.0, "grad_norm": 1.7331879159378432, "language_loss": 0.80058289, "learning_rate": 5.820960624653381e-08, "loss": 0.82525766, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 1.68066406, "router_z_loss_mlp": 0.20227051, "step": 15390, "time_per_iteration": 2.8247954845428467 }, { "auxiliary_loss_clip": 0.01406622, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.24420142, "balance_loss_mlp": 1.0131644, "epoch": 0.9253569818127161, "flos": 21735218016000.0, "grad_norm": 1.8592709098460587, "language_loss": 0.76446152, "learning_rate": 5.811636514789597e-08, "loss": 0.78885382, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19445801, "step": 15391, "time_per_iteration": 2.8217084407806396 }, { "auxiliary_loss_clip": 0.0139972, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.23878908, "balance_loss_mlp": 1.01349366, "epoch": 0.925417105065384, "flos": 34253579744640.0, "grad_norm": 2.2966719320860816, "language_loss": 0.53147322, "learning_rate": 5.80231976856802e-08, "loss": 0.55579937, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19396973, "step": 15392, "time_per_iteration": 3.0000407695770264 }, { "auxiliary_loss_clip": 0.01396226, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.23571956, "balance_loss_mlp": 1.00946689, "epoch": 0.925477228318052, "flos": 25970986212480.0, "grad_norm": 1.7952875798868602, "language_loss": 0.77764893, "learning_rate": 5.7930103863419454e-08, "loss": 0.80189013, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18432617, "step": 15393, "time_per_iteration": 2.8743226528167725 }, { "auxiliary_loss_clip": 0.01398006, "auxiliary_loss_mlp": 0.01029404, "balance_loss_clip": 1.23876154, "balance_loss_mlp": 1.01060486, "epoch": 0.9255373515707199, "flos": 11845687000320.0, "grad_norm": 1.852032016558179, "language_loss": 0.69995618, "learning_rate": 5.783708368464357e-08, "loss": 0.72423029, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18811035, "step": 15394, "time_per_iteration": 2.8122100830078125 }, { "auxiliary_loss_clip": 0.01412287, "auxiliary_loss_mlp": 0.01031812, "balance_loss_clip": 1.25180006, "balance_loss_mlp": 1.01279759, "epoch": 0.925597474823388, "flos": 21444347324160.0, "grad_norm": 2.281987195007043, "language_loss": 0.73343527, "learning_rate": 5.7744137152879956e-08, "loss": 0.75787622, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19018555, "step": 15395, "time_per_iteration": 2.8177051544189453 }, { "auxiliary_loss_clip": 0.01384436, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.22671437, "balance_loss_mlp": 1.01031637, "epoch": 0.925657598076056, "flos": 22868042791680.0, "grad_norm": 2.2387384160435375, "language_loss": 0.73078996, "learning_rate": 5.7651264271653785e-08, "loss": 0.75491679, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.17932129, "step": 15396, "time_per_iteration": 2.8439695835113525 }, { "auxiliary_loss_clip": 0.01395191, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.23715258, "balance_loss_mlp": 1.01214457, "epoch": 0.9257177213287239, "flos": 25715388481920.0, "grad_norm": 1.6606145415107847, "language_loss": 0.87630427, "learning_rate": 5.755846504448603e-08, "loss": 0.90056372, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18615723, "step": 15397, "time_per_iteration": 2.875885009765625 }, { "auxiliary_loss_clip": 0.01180878, "auxiliary_loss_mlp": 0.01021929, "balance_loss_clip": 1.09284472, "balance_loss_mlp": 1.00199747, "epoch": 0.9257778445813919, "flos": 59620898828160.0, "grad_norm": 0.8010465012553011, "language_loss": 0.55136049, "learning_rate": 5.746573947489586e-08, "loss": 0.57338858, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.19921875, "step": 15398, "time_per_iteration": 3.2037580013275146 }, { "auxiliary_loss_clip": 0.0141119, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.24687755, "balance_loss_mlp": 1.01366234, "epoch": 0.9258379678340598, "flos": 27720961067520.0, "grad_norm": 1.9486914966108844, "language_loss": 0.77663314, "learning_rate": 5.7373087566400025e-08, "loss": 0.80107915, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1973877, "step": 15399, "time_per_iteration": 2.9123294353485107 }, { "auxiliary_loss_clip": 0.01369299, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.2152307, "balance_loss_mlp": 1.01342392, "epoch": 0.9258980910867278, "flos": 24874339294080.0, "grad_norm": 1.4684713352904821, "language_loss": 0.79023874, "learning_rate": 5.7280509322510826e-08, "loss": 0.81424224, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.1763916, "step": 15400, "time_per_iteration": 2.859886646270752 }, { "auxiliary_loss_clip": 0.01179594, "auxiliary_loss_mlp": 0.01018499, "balance_loss_clip": 1.09091794, "balance_loss_mlp": 0.99789965, "epoch": 0.9259582143393957, "flos": 63164129829120.0, "grad_norm": 0.7488689176818936, "language_loss": 0.51346791, "learning_rate": 5.718800474673946e-08, "loss": 0.53544885, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.20605469, "step": 15401, "time_per_iteration": 3.243189811706543 }, { "auxiliary_loss_clip": 0.01376231, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.22252917, "balance_loss_mlp": 1.01701832, "epoch": 0.9260183375920638, "flos": 24135987098880.0, "grad_norm": 1.981483072038302, "language_loss": 0.82771575, "learning_rate": 5.709557384259378e-08, "loss": 0.85183072, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.18249512, "step": 15402, "time_per_iteration": 2.8693394660949707 }, { "auxiliary_loss_clip": 0.01177714, "auxiliary_loss_mlp": 0.01016763, "balance_loss_clip": 1.09106874, "balance_loss_mlp": 0.9975937, "epoch": 0.9260784608447317, "flos": 63076997047680.0, "grad_norm": 6.6063425167505905, "language_loss": 0.51145327, "learning_rate": 5.700321661357876e-08, "loss": 0.53339803, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.19140625, "step": 15403, "time_per_iteration": 4.727404356002808 }, { "auxiliary_loss_clip": 0.0117753, "auxiliary_loss_mlp": 0.01019956, "balance_loss_clip": 1.09217381, "balance_loss_mlp": 1.00002432, "epoch": 0.9261385840973997, "flos": 70619609571840.0, "grad_norm": 0.6843885729580603, "language_loss": 0.58756483, "learning_rate": 5.69109330631965e-08, "loss": 0.60953963, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 0.19921875, "step": 15404, "time_per_iteration": 3.2892258167266846 }, { "auxiliary_loss_clip": 0.01400371, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.23958814, "balance_loss_mlp": 1.0114553, "epoch": 0.9261987073500676, "flos": 20239528792320.0, "grad_norm": 1.9524668823766087, "language_loss": 0.727072, "learning_rate": 5.681872319494596e-08, "loss": 0.75138718, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19714355, "step": 15405, "time_per_iteration": 2.858870029449463 }, { "auxiliary_loss_clip": 0.01412797, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.25079918, "balance_loss_mlp": 1.01618171, "epoch": 0.9262588306027356, "flos": 20962543000320.0, "grad_norm": 1.667722031048722, "language_loss": 0.69416606, "learning_rate": 5.672658701232458e-08, "loss": 0.71865159, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19555664, "step": 15406, "time_per_iteration": 2.831692695617676 }, { "auxiliary_loss_clip": 0.01407221, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.24546778, "balance_loss_mlp": 1.01614749, "epoch": 0.9263189538554035, "flos": 22167497266560.0, "grad_norm": 2.8088653825929493, "language_loss": 0.77165115, "learning_rate": 5.663452451882555e-08, "loss": 0.79607487, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18994141, "step": 15407, "time_per_iteration": 4.302434682846069 }, { "auxiliary_loss_clip": 0.0142139, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.25338101, "balance_loss_mlp": 1.01619792, "epoch": 0.9263790771080715, "flos": 18196421005440.0, "grad_norm": 1.8497815815524838, "language_loss": 0.73178673, "learning_rate": 5.6542535717940096e-08, "loss": 0.75636542, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 1.68164062, "router_z_loss_mlp": 0.20275879, "step": 15408, "time_per_iteration": 2.8828623294830322 }, { "auxiliary_loss_clip": 0.01383832, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.2276988, "balance_loss_mlp": 1.01425076, "epoch": 0.9264392003607396, "flos": 48195546451200.0, "grad_norm": 1.7536748269212328, "language_loss": 0.69251943, "learning_rate": 5.645062061315675e-08, "loss": 0.71667671, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.17651367, "step": 15409, "time_per_iteration": 3.1143994331359863 }, { "auxiliary_loss_clip": 0.01410377, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.24958932, "balance_loss_mlp": 1.01724279, "epoch": 0.9264993236134075, "flos": 26398967207040.0, "grad_norm": 2.1462337988597575, "language_loss": 0.76771098, "learning_rate": 5.6358779207960506e-08, "loss": 0.79217982, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19262695, "step": 15410, "time_per_iteration": 2.905673027038574 }, { "auxiliary_loss_clip": 0.01405048, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.24362683, "balance_loss_mlp": 1.01316285, "epoch": 0.9265594468660755, "flos": 20929079831040.0, "grad_norm": 1.6157170114984925, "language_loss": 0.83035028, "learning_rate": 5.6267011505833905e-08, "loss": 0.85471886, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1862793, "step": 15411, "time_per_iteration": 2.8602707386016846 }, { "auxiliary_loss_clip": 0.01414327, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.25288904, "balance_loss_mlp": 1.01618505, "epoch": 0.9266195701187434, "flos": 17532641520000.0, "grad_norm": 1.6500069655131637, "language_loss": 0.7599147, "learning_rate": 5.617531751025728e-08, "loss": 0.78440058, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18066406, "step": 15412, "time_per_iteration": 2.8743443489074707 }, { "auxiliary_loss_clip": 0.01396235, "auxiliary_loss_mlp": 0.01030384, "balance_loss_clip": 1.23672462, "balance_loss_mlp": 1.01225257, "epoch": 0.9266796933714114, "flos": 33700686560640.0, "grad_norm": 1.651224767801993, "language_loss": 0.67921132, "learning_rate": 5.6083697224707406e-08, "loss": 0.7034775, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18139648, "step": 15413, "time_per_iteration": 5.8645100593566895 }, { "auxiliary_loss_clip": 0.01404187, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.24304605, "balance_loss_mlp": 1.01435769, "epoch": 0.9267398166240793, "flos": 18925814730240.0, "grad_norm": 1.7297126711430377, "language_loss": 0.76199102, "learning_rate": 5.5992150652658167e-08, "loss": 0.78637356, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19702148, "step": 15414, "time_per_iteration": 2.8258206844329834 }, { "auxiliary_loss_clip": 0.01390734, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.23341632, "balance_loss_mlp": 1.01077545, "epoch": 0.9267999398767474, "flos": 20486982458880.0, "grad_norm": 1.9630219058933642, "language_loss": 0.82516348, "learning_rate": 5.59006777975819e-08, "loss": 0.84936881, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19042969, "step": 15415, "time_per_iteration": 2.8239364624023438 }, { "auxiliary_loss_clip": 0.01403255, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.24084115, "balance_loss_mlp": 1.01362252, "epoch": 0.9268600631294153, "flos": 24799811829120.0, "grad_norm": 1.6768369428768497, "language_loss": 0.54799104, "learning_rate": 5.580927866294671e-08, "loss": 0.57235342, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.19372559, "step": 15416, "time_per_iteration": 2.875593662261963 }, { "auxiliary_loss_clip": 0.01392709, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.23636413, "balance_loss_mlp": 1.01205611, "epoch": 0.9269201863820833, "flos": 18706304367360.0, "grad_norm": 1.5013577471267507, "language_loss": 0.7324543, "learning_rate": 5.571795325221807e-08, "loss": 0.75668538, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18359375, "step": 15417, "time_per_iteration": 2.8136303424835205 }, { "auxiliary_loss_clip": 0.01396632, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.23713303, "balance_loss_mlp": 1.01238227, "epoch": 0.9269803096347512, "flos": 20934011514240.0, "grad_norm": 2.5063021282623987, "language_loss": 0.76214749, "learning_rate": 5.5626701568859624e-08, "loss": 0.78643316, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19555664, "step": 15418, "time_per_iteration": 2.821510076522827 }, { "auxiliary_loss_clip": 0.0139513, "auxiliary_loss_mlp": 0.01034015, "balance_loss_clip": 1.23709917, "balance_loss_mlp": 1.01541805, "epoch": 0.9270404328874192, "flos": 28014682181760.0, "grad_norm": 1.4366616919612427, "language_loss": 0.77286446, "learning_rate": 5.553552361633174e-08, "loss": 0.79715586, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18579102, "step": 15419, "time_per_iteration": 2.9277560710906982 }, { "auxiliary_loss_clip": 0.0137459, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.22054279, "balance_loss_mlp": 1.01215696, "epoch": 0.9271005561400871, "flos": 25902159592320.0, "grad_norm": 1.605783427351983, "language_loss": 0.75880808, "learning_rate": 5.5444419398091636e-08, "loss": 0.78284872, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.17321777, "step": 15420, "time_per_iteration": 2.8773722648620605 }, { "auxiliary_loss_clip": 0.0140362, "auxiliary_loss_mlp": 0.01035403, "balance_loss_clip": 1.24185359, "balance_loss_mlp": 1.0160557, "epoch": 0.9271606793927551, "flos": 27065597114880.0, "grad_norm": 4.034143324091733, "language_loss": 0.77128834, "learning_rate": 5.535338891759389e-08, "loss": 0.79567862, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19348145, "step": 15421, "time_per_iteration": 2.9718427658081055 }, { "auxiliary_loss_clip": 0.01392301, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.23290122, "balance_loss_mlp": 1.01368856, "epoch": 0.9272208026454232, "flos": 26220068691840.0, "grad_norm": 2.018817800138314, "language_loss": 0.73626482, "learning_rate": 5.526243217829041e-08, "loss": 0.76051331, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.1887207, "step": 15422, "time_per_iteration": 3.018231153488159 }, { "auxiliary_loss_clip": 0.01395099, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.23365188, "balance_loss_mlp": 1.02095151, "epoch": 0.9272809258980911, "flos": 12466049460480.0, "grad_norm": 5.825333033894933, "language_loss": 0.78618526, "learning_rate": 5.517154918363065e-08, "loss": 0.8105514, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20556641, "step": 15423, "time_per_iteration": 3.0010883808135986 }, { "auxiliary_loss_clip": 0.01409359, "auxiliary_loss_mlp": 0.01030689, "balance_loss_clip": 1.24657357, "balance_loss_mlp": 1.01119852, "epoch": 0.9273410491507591, "flos": 22867183140480.0, "grad_norm": 8.889624381136501, "language_loss": 0.76023126, "learning_rate": 5.508073993706053e-08, "loss": 0.78463179, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19494629, "step": 15424, "time_per_iteration": 2.9217419624328613 }, { "auxiliary_loss_clip": 0.01178065, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.0904603, "balance_loss_mlp": 1.01244795, "epoch": 0.927401172403427, "flos": 47691110448000.0, "grad_norm": 0.7801516692567665, "language_loss": 0.60751653, "learning_rate": 5.499000444202351e-08, "loss": 0.62963247, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.2109375, "step": 15425, "time_per_iteration": 3.1258013248443604 }, { "auxiliary_loss_clip": 0.01394204, "auxiliary_loss_mlp": 0.01031493, "balance_loss_clip": 1.23586595, "balance_loss_mlp": 1.01255095, "epoch": 0.927461295656095, "flos": 29984710337280.0, "grad_norm": 1.703763798715788, "language_loss": 0.71538353, "learning_rate": 5.489934270196106e-08, "loss": 0.73964047, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18933105, "step": 15426, "time_per_iteration": 2.9189705848693848 }, { "auxiliary_loss_clip": 0.01406467, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 1.24772787, "balance_loss_mlp": 1.01269686, "epoch": 0.9275214189087629, "flos": 20384964138240.0, "grad_norm": 1.7307038911390453, "language_loss": 0.83313549, "learning_rate": 5.480875472030977e-08, "loss": 0.85750973, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18261719, "step": 15427, "time_per_iteration": 2.8471221923828125 }, { "auxiliary_loss_clip": 0.0140167, "auxiliary_loss_mlp": 0.01030577, "balance_loss_clip": 1.24243498, "balance_loss_mlp": 1.01233828, "epoch": 0.927581542161431, "flos": 22393522880640.0, "grad_norm": 1.5718315870045299, "language_loss": 0.77391124, "learning_rate": 5.471824050050555e-08, "loss": 0.79823369, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18237305, "step": 15428, "time_per_iteration": 2.868229389190674 }, { "auxiliary_loss_clip": 0.01390253, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.23199916, "balance_loss_mlp": 1.01465547, "epoch": 0.9276416654140989, "flos": 23963287121280.0, "grad_norm": 1.7220557332613533, "language_loss": 0.74346983, "learning_rate": 5.4627800045980555e-08, "loss": 0.76771653, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19763184, "step": 15429, "time_per_iteration": 2.8907630443573 }, { "auxiliary_loss_clip": 0.0138078, "auxiliary_loss_mlp": 0.01030195, "balance_loss_clip": 1.22474575, "balance_loss_mlp": 1.01125252, "epoch": 0.9277017886667669, "flos": 13925651316480.0, "grad_norm": 1.8609064816622367, "language_loss": 0.75755256, "learning_rate": 5.45374333601647e-08, "loss": 0.78166234, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18945312, "step": 15430, "time_per_iteration": 2.9311399459838867 }, { "auxiliary_loss_clip": 0.01407898, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.2467382, "balance_loss_mlp": 1.01097369, "epoch": 0.9277619119194348, "flos": 35680894796160.0, "grad_norm": 1.4726563937669495, "language_loss": 0.77292871, "learning_rate": 5.444714044648391e-08, "loss": 0.7973187, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20129395, "step": 15431, "time_per_iteration": 3.0472373962402344 }, { "auxiliary_loss_clip": 0.01388868, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.23188937, "balance_loss_mlp": 1.01390159, "epoch": 0.9278220351721028, "flos": 23851586413440.0, "grad_norm": 1.9745414072805318, "language_loss": 0.71642464, "learning_rate": 5.4356921308363e-08, "loss": 0.74063724, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18493652, "step": 15432, "time_per_iteration": 2.9828691482543945 }, { "auxiliary_loss_clip": 0.01404617, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.24287105, "balance_loss_mlp": 1.01352966, "epoch": 0.9278821584247707, "flos": 15235610060160.0, "grad_norm": 2.205155149380228, "language_loss": 0.83941466, "learning_rate": 5.4266775949222354e-08, "loss": 0.86377782, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18151855, "step": 15433, "time_per_iteration": 2.8203232288360596 }, { "auxiliary_loss_clip": 0.01377371, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.2235781, "balance_loss_mlp": 1.01193714, "epoch": 0.9279422816774388, "flos": 24692228398080.0, "grad_norm": 4.274568609144984, "language_loss": 0.67971778, "learning_rate": 5.417670437248056e-08, "loss": 0.70379418, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.18334961, "step": 15434, "time_per_iteration": 2.860714912414551 }, { "auxiliary_loss_clip": 0.01375274, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.22335672, "balance_loss_mlp": 1.01654792, "epoch": 0.9280024049301068, "flos": 19177838121600.0, "grad_norm": 2.677407243313559, "language_loss": 0.69528139, "learning_rate": 5.40867065815529e-08, "loss": 0.71938825, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 1.51855469, "router_z_loss_mlp": 0.18859863, "step": 15435, "time_per_iteration": 2.8710451126098633 }, { "auxiliary_loss_clip": 0.01395099, "auxiliary_loss_mlp": 0.01032398, "balance_loss_clip": 1.2360518, "balance_loss_mlp": 1.01316929, "epoch": 0.9280625281827747, "flos": 11399291372160.0, "grad_norm": 1.9992657638309568, "language_loss": 0.72712463, "learning_rate": 5.399678257985263e-08, "loss": 0.75139958, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19238281, "step": 15436, "time_per_iteration": 2.8025665283203125 }, { "auxiliary_loss_clip": 0.01394964, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.23707891, "balance_loss_mlp": 1.01345205, "epoch": 0.9281226514354427, "flos": 24795106369920.0, "grad_norm": 1.926569264431376, "language_loss": 0.67787218, "learning_rate": 5.390693237078925e-08, "loss": 0.7021414, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18493652, "step": 15437, "time_per_iteration": 2.861618995666504 }, { "auxiliary_loss_clip": 0.01402198, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.23909152, "balance_loss_mlp": 1.01677132, "epoch": 0.9281827746881106, "flos": 15090808141440.0, "grad_norm": 2.1577432218025563, "language_loss": 0.71880764, "learning_rate": 5.3817155957770254e-08, "loss": 0.7432124, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.21520996, "step": 15438, "time_per_iteration": 4.19848895072937 }, { "auxiliary_loss_clip": 0.01407486, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.24594975, "balance_loss_mlp": 1.01385355, "epoch": 0.9282428979407786, "flos": 24145986199680.0, "grad_norm": 1.7651335255279244, "language_loss": 0.65561461, "learning_rate": 5.3727453344199366e-08, "loss": 0.68000847, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18054199, "step": 15439, "time_per_iteration": 2.859358549118042 }, { "auxiliary_loss_clip": 0.01399769, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.24003053, "balance_loss_mlp": 1.01501989, "epoch": 0.9283030211934465, "flos": 24833003529600.0, "grad_norm": 1.6950998679595688, "language_loss": 0.70792651, "learning_rate": 5.363782453347876e-08, "loss": 0.7322638, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18933105, "step": 15440, "time_per_iteration": 2.8510501384735107 }, { "auxiliary_loss_clip": 0.01406499, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.24241102, "balance_loss_mlp": 1.01644003, "epoch": 0.9283631444461146, "flos": 23989827836160.0, "grad_norm": 1.669068908184784, "language_loss": 0.77068555, "learning_rate": 5.354826952900682e-08, "loss": 0.79510581, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19104004, "step": 15441, "time_per_iteration": 4.413456678390503 }, { "auxiliary_loss_clip": 0.01383355, "auxiliary_loss_mlp": 0.01027803, "balance_loss_clip": 1.22852802, "balance_loss_mlp": 1.01039863, "epoch": 0.9284232676987825, "flos": 22794963160320.0, "grad_norm": 1.69040443915942, "language_loss": 0.64514256, "learning_rate": 5.345878833417949e-08, "loss": 0.66925406, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.17407227, "step": 15442, "time_per_iteration": 2.843022346496582 }, { "auxiliary_loss_clip": 0.01407439, "auxiliary_loss_mlp": 0.01032534, "balance_loss_clip": 1.24361372, "balance_loss_mlp": 1.01379395, "epoch": 0.9284833909514505, "flos": 19509999333120.0, "grad_norm": 2.438023358481136, "language_loss": 0.81480992, "learning_rate": 5.3369380952390295e-08, "loss": 0.83920968, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18762207, "step": 15443, "time_per_iteration": 2.8598039150238037 }, { "auxiliary_loss_clip": 0.01408403, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.24766147, "balance_loss_mlp": 1.01305151, "epoch": 0.9285435142041184, "flos": 23196539174400.0, "grad_norm": 1.8353384305732534, "language_loss": 0.65761244, "learning_rate": 5.328004738702896e-08, "loss": 0.68201721, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19006348, "step": 15444, "time_per_iteration": 2.873081922531128 }, { "auxiliary_loss_clip": 0.01395371, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.23503506, "balance_loss_mlp": 1.01329184, "epoch": 0.9286036374567864, "flos": 17684365893120.0, "grad_norm": 2.042333716497454, "language_loss": 0.74084258, "learning_rate": 5.3190787641483215e-08, "loss": 0.76511896, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18969727, "step": 15445, "time_per_iteration": 2.801954746246338 }, { "auxiliary_loss_clip": 0.01395901, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.23682487, "balance_loss_mlp": 1.01328063, "epoch": 0.9286637607094543, "flos": 20896476312960.0, "grad_norm": 1.7982940718524525, "language_loss": 0.71815288, "learning_rate": 5.3101601719138135e-08, "loss": 0.74243736, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19262695, "step": 15446, "time_per_iteration": 2.886275053024292 }, { "auxiliary_loss_clip": 0.01427791, "auxiliary_loss_mlp": 0.01029968, "balance_loss_clip": 1.26107788, "balance_loss_mlp": 1.01178885, "epoch": 0.9287238839621224, "flos": 19035343687680.0, "grad_norm": 1.848944680981843, "language_loss": 0.69696689, "learning_rate": 5.301248962337523e-08, "loss": 0.72154444, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 1.66601562, "router_z_loss_mlp": 0.1817627, "step": 15447, "time_per_iteration": 2.8518166542053223 }, { "auxiliary_loss_clip": 0.01376706, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.22422266, "balance_loss_mlp": 1.01439309, "epoch": 0.9287840072147904, "flos": 20566396362240.0, "grad_norm": 2.3289081071173747, "language_loss": 0.72867674, "learning_rate": 5.292345135757403e-08, "loss": 0.75276542, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.17785645, "step": 15448, "time_per_iteration": 5.764143228530884 }, { "auxiliary_loss_clip": 0.01391461, "auxiliary_loss_mlp": 0.01030891, "balance_loss_clip": 1.23326683, "balance_loss_mlp": 1.011603, "epoch": 0.9288441304674583, "flos": 21260517125760.0, "grad_norm": 1.7473995488421628, "language_loss": 0.74996889, "learning_rate": 5.283448692511072e-08, "loss": 0.77419239, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19274902, "step": 15449, "time_per_iteration": 2.811849355697632 }, { "auxiliary_loss_clip": 0.01398114, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.23903275, "balance_loss_mlp": 1.01120317, "epoch": 0.9289042537201263, "flos": 27680575443840.0, "grad_norm": 1.961665688874903, "language_loss": 0.68557924, "learning_rate": 5.27455963293586e-08, "loss": 0.70986676, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19445801, "step": 15450, "time_per_iteration": 2.865870714187622 }, { "auxiliary_loss_clip": 0.01395814, "auxiliary_loss_mlp": 0.01029026, "balance_loss_clip": 1.23522043, "balance_loss_mlp": 1.00986874, "epoch": 0.9289643769727942, "flos": 19327119275520.0, "grad_norm": 5.120934890009868, "language_loss": 0.72858953, "learning_rate": 5.265677957368875e-08, "loss": 0.7528379, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19165039, "step": 15451, "time_per_iteration": 2.8286430835723877 }, { "auxiliary_loss_clip": 0.01400193, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.2401464, "balance_loss_mlp": 1.01724863, "epoch": 0.9290245002254622, "flos": 14064571411200.0, "grad_norm": 2.074166530762542, "language_loss": 0.74226093, "learning_rate": 5.25680366614687e-08, "loss": 0.76662058, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18530273, "step": 15452, "time_per_iteration": 2.8131699562072754 }, { "auxiliary_loss_clip": 0.01398242, "auxiliary_loss_mlp": 0.01036588, "balance_loss_clip": 1.24115133, "balance_loss_mlp": 1.01724029, "epoch": 0.9290846234781301, "flos": 20056603489920.0, "grad_norm": 1.8531168929229556, "language_loss": 0.74603724, "learning_rate": 5.2479367596064196e-08, "loss": 0.7703855, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19360352, "step": 15453, "time_per_iteration": 2.8120665550231934 }, { "auxiliary_loss_clip": 0.01177317, "auxiliary_loss_mlp": 0.01036021, "balance_loss_clip": 1.09123957, "balance_loss_mlp": 1.01284683, "epoch": 0.9291447467307982, "flos": 61254015068160.0, "grad_norm": 0.8343949688529111, "language_loss": 0.6066829, "learning_rate": 5.2390772380837226e-08, "loss": 0.62881631, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.23144531, "step": 15454, "time_per_iteration": 3.249325752258301 }, { "auxiliary_loss_clip": 0.01411259, "auxiliary_loss_mlp": 0.0103659, "balance_loss_clip": 1.25009418, "balance_loss_mlp": 1.01750422, "epoch": 0.9292048699834661, "flos": 20561736147840.0, "grad_norm": 1.5635289844004099, "language_loss": 0.70058358, "learning_rate": 5.230225101914709e-08, "loss": 0.72506201, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.1907959, "step": 15455, "time_per_iteration": 2.8329641819000244 }, { "auxiliary_loss_clip": 0.01405066, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.24497938, "balance_loss_mlp": 1.01347125, "epoch": 0.9292649932361341, "flos": 23634293045760.0, "grad_norm": 1.6249263218817902, "language_loss": 0.65462375, "learning_rate": 5.22138035143509e-08, "loss": 0.67900574, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19641113, "step": 15456, "time_per_iteration": 2.846841335296631 }, { "auxiliary_loss_clip": 0.01396224, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.23922014, "balance_loss_mlp": 1.01247144, "epoch": 0.929325116488802, "flos": 15017547530880.0, "grad_norm": 1.6880542702097234, "language_loss": 0.6881209, "learning_rate": 5.2125429869802615e-08, "loss": 0.71240699, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19909668, "step": 15457, "time_per_iteration": 3.008776903152466 }, { "auxiliary_loss_clip": 0.01404239, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.24268103, "balance_loss_mlp": 1.01310897, "epoch": 0.92938523974147, "flos": 17976277215360.0, "grad_norm": 8.031203657693178, "language_loss": 0.81766844, "learning_rate": 5.203713008885291e-08, "loss": 0.84202832, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1862793, "step": 15458, "time_per_iteration": 2.8365681171417236 }, { "auxiliary_loss_clip": 0.01404733, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.24479854, "balance_loss_mlp": 1.0172987, "epoch": 0.9294453629941379, "flos": 23013432892800.0, "grad_norm": 1.720808273458055, "language_loss": 0.72796905, "learning_rate": 5.194890417485065e-08, "loss": 0.75237918, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18969727, "step": 15459, "time_per_iteration": 2.9145281314849854 }, { "auxiliary_loss_clip": 0.01402706, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.24308395, "balance_loss_mlp": 1.01206422, "epoch": 0.929505486246806, "flos": 17063912943360.0, "grad_norm": 2.5833687990375878, "language_loss": 0.59963191, "learning_rate": 5.1860752131141384e-08, "loss": 0.62397194, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19238281, "step": 15460, "time_per_iteration": 2.80780029296875 }, { "auxiliary_loss_clip": 0.01408734, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.24647164, "balance_loss_mlp": 1.01400805, "epoch": 0.9295656094994739, "flos": 27350766961920.0, "grad_norm": 2.0615734432473336, "language_loss": 0.81050146, "learning_rate": 5.177267396106733e-08, "loss": 0.83491945, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19067383, "step": 15461, "time_per_iteration": 2.963458776473999 }, { "auxiliary_loss_clip": 0.01397822, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.23940372, "balance_loss_mlp": 1.01364708, "epoch": 0.9296257327521419, "flos": 21481022874240.0, "grad_norm": 2.5567888995192996, "language_loss": 0.7872315, "learning_rate": 5.168466966796869e-08, "loss": 0.81153125, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18505859, "step": 15462, "time_per_iteration": 2.899139165878296 }, { "auxiliary_loss_clip": 0.0139372, "auxiliary_loss_mlp": 0.01030305, "balance_loss_clip": 1.23453164, "balance_loss_mlp": 1.0124352, "epoch": 0.9296858560048099, "flos": 16371330503040.0, "grad_norm": 3.3537462363934614, "language_loss": 0.63932645, "learning_rate": 5.159673925518282e-08, "loss": 0.66356665, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17858887, "step": 15463, "time_per_iteration": 2.8409464359283447 }, { "auxiliary_loss_clip": 0.01398692, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.23957491, "balance_loss_mlp": 1.01127172, "epoch": 0.9297459792574778, "flos": 29869797248640.0, "grad_norm": 1.5026908600929316, "language_loss": 0.71338594, "learning_rate": 5.15088827260437e-08, "loss": 0.73766565, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18029785, "step": 15464, "time_per_iteration": 2.9600491523742676 }, { "auxiliary_loss_clip": 0.01413519, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.2517333, "balance_loss_mlp": 1.0143162, "epoch": 0.9298061025101458, "flos": 15933395652480.0, "grad_norm": 1.7867679505395258, "language_loss": 0.78061175, "learning_rate": 5.1421100083883115e-08, "loss": 0.8050788, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.1887207, "step": 15465, "time_per_iteration": 2.810683250427246 }, { "auxiliary_loss_clip": 0.01180507, "auxiliary_loss_mlp": 0.01026328, "balance_loss_clip": 1.09312689, "balance_loss_mlp": 1.00344014, "epoch": 0.9298662257628137, "flos": 64130362675200.0, "grad_norm": 1.0041184608424722, "language_loss": 0.56475514, "learning_rate": 5.133339133202952e-08, "loss": 0.58682346, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.22851562, "step": 15466, "time_per_iteration": 3.4860854148864746 }, { "auxiliary_loss_clip": 0.01412578, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.24971497, "balance_loss_mlp": 1.01503229, "epoch": 0.9299263490154818, "flos": 24290969097600.0, "grad_norm": 1.4595564802218013, "language_loss": 0.73509037, "learning_rate": 5.1245756473809355e-08, "loss": 0.75956821, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.2019043, "step": 15467, "time_per_iteration": 2.9434194564819336 }, { "auxiliary_loss_clip": 0.01405987, "auxiliary_loss_mlp": 0.01032659, "balance_loss_clip": 1.24556065, "balance_loss_mlp": 1.01416969, "epoch": 0.9299864722681497, "flos": 23304665543040.0, "grad_norm": 1.7138567186543188, "language_loss": 0.72202504, "learning_rate": 5.1158195512545076e-08, "loss": 0.7464115, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18493652, "step": 15468, "time_per_iteration": 2.869332790374756 }, { "auxiliary_loss_clip": 0.01410022, "auxiliary_loss_mlp": 0.01031098, "balance_loss_clip": 1.24655104, "balance_loss_mlp": 1.01196527, "epoch": 0.9300465955208177, "flos": 21405500023680.0, "grad_norm": 2.2263819183914255, "language_loss": 0.75439489, "learning_rate": 5.107070845155737e-08, "loss": 0.77880603, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19140625, "step": 15469, "time_per_iteration": 2.8619065284729004 }, { "auxiliary_loss_clip": 0.01409384, "auxiliary_loss_mlp": 0.01035358, "balance_loss_clip": 1.24725449, "balance_loss_mlp": 1.01635551, "epoch": 0.9301067187734856, "flos": 24582111258240.0, "grad_norm": 1.9070902894506503, "language_loss": 0.76470065, "learning_rate": 5.098329529416379e-08, "loss": 0.78914803, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19006348, "step": 15470, "time_per_iteration": 2.882906913757324 }, { "auxiliary_loss_clip": 0.01395548, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.23822999, "balance_loss_mlp": 1.01289332, "epoch": 0.9301668420261536, "flos": 22206254077440.0, "grad_norm": 1.7248033246758643, "language_loss": 0.75034904, "learning_rate": 5.089595604367902e-08, "loss": 0.77460933, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.17590332, "step": 15471, "time_per_iteration": 2.863020420074463 }, { "auxiliary_loss_clip": 0.01391468, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.23336983, "balance_loss_mlp": 1.01685238, "epoch": 0.9302269652788215, "flos": 17756269159680.0, "grad_norm": 3.0694294444238888, "language_loss": 0.69885314, "learning_rate": 5.080869070341487e-08, "loss": 0.72312135, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18505859, "step": 15472, "time_per_iteration": 2.848362922668457 }, { "auxiliary_loss_clip": 0.0137463, "auxiliary_loss_mlp": 0.01030854, "balance_loss_clip": 1.22090125, "balance_loss_mlp": 1.01211452, "epoch": 0.9302870885314896, "flos": 19400198906880.0, "grad_norm": 1.6682812068819237, "language_loss": 0.89358014, "learning_rate": 5.0721499276680233e-08, "loss": 0.91763496, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.18737793, "step": 15473, "time_per_iteration": 4.369073390960693 }, { "auxiliary_loss_clip": 0.01405179, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.24310911, "balance_loss_mlp": 1.0145843, "epoch": 0.9303472117841575, "flos": 21769767060480.0, "grad_norm": 1.843197473469873, "language_loss": 0.65752774, "learning_rate": 5.063438176678203e-08, "loss": 0.68192351, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19799805, "step": 15474, "time_per_iteration": 2.8484296798706055 }, { "auxiliary_loss_clip": 0.01401723, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.24253392, "balance_loss_mlp": 1.0170548, "epoch": 0.9304073350368255, "flos": 19619211576960.0, "grad_norm": 1.835652518903361, "language_loss": 0.75263071, "learning_rate": 5.054733817702339e-08, "loss": 0.7770074, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18896484, "step": 15475, "time_per_iteration": 2.8758738040924072 }, { "auxiliary_loss_clip": 0.01392948, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.23388088, "balance_loss_mlp": 1.00956702, "epoch": 0.9304674582894935, "flos": 30452714997120.0, "grad_norm": 1.9980120870498477, "language_loss": 0.6694622, "learning_rate": 5.0460368510704786e-08, "loss": 0.69366401, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17663574, "step": 15476, "time_per_iteration": 4.457249879837036 }, { "auxiliary_loss_clip": 0.01404283, "auxiliary_loss_mlp": 0.01034367, "balance_loss_clip": 1.24361491, "balance_loss_mlp": 1.01470923, "epoch": 0.9305275815421614, "flos": 17794754501760.0, "grad_norm": 2.3515241115700327, "language_loss": 0.69696951, "learning_rate": 5.0373472771124914e-08, "loss": 0.72135597, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19665527, "step": 15477, "time_per_iteration": 2.8219234943389893 }, { "auxiliary_loss_clip": 0.01387924, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.2319169, "balance_loss_mlp": 1.01338959, "epoch": 0.9305877047948294, "flos": 25308609315840.0, "grad_norm": 1.6345233383513778, "language_loss": 0.58830214, "learning_rate": 5.0286650961578027e-08, "loss": 0.61248857, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.17333984, "step": 15478, "time_per_iteration": 2.899062395095825 }, { "auxiliary_loss_clip": 0.01412842, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.2475971, "balance_loss_mlp": 1.01293182, "epoch": 0.9306478280474973, "flos": 16984589529600.0, "grad_norm": 2.03581271570634, "language_loss": 0.7995832, "learning_rate": 5.01999030853566e-08, "loss": 0.82403165, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19067383, "step": 15479, "time_per_iteration": 2.8235297203063965 }, { "auxiliary_loss_clip": 0.0139886, "auxiliary_loss_mlp": 0.01031372, "balance_loss_clip": 1.2388916, "balance_loss_mlp": 1.01337147, "epoch": 0.9307079513001654, "flos": 35676958498560.0, "grad_norm": 24.500826176437645, "language_loss": 0.69193089, "learning_rate": 5.0113229145750445e-08, "loss": 0.71623319, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18005371, "step": 15480, "time_per_iteration": 2.9760823249816895 }, { "auxiliary_loss_clip": 0.01406072, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.24577832, "balance_loss_mlp": 1.01406527, "epoch": 0.9307680745528333, "flos": 19217454583680.0, "grad_norm": 1.9198016412630379, "language_loss": 0.68475646, "learning_rate": 5.002662914604583e-08, "loss": 0.70913547, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.1776123, "step": 15481, "time_per_iteration": 2.8418664932250977 }, { "auxiliary_loss_clip": 0.01399473, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.24092627, "balance_loss_mlp": 1.01337051, "epoch": 0.9308281978055013, "flos": 19072019237760.0, "grad_norm": 3.3141222292463564, "language_loss": 0.75843906, "learning_rate": 4.994010308952701e-08, "loss": 0.78275156, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18408203, "step": 15482, "time_per_iteration": 2.899949312210083 }, { "auxiliary_loss_clip": 0.01388164, "auxiliary_loss_mlp": 0.01030333, "balance_loss_clip": 1.23194551, "balance_loss_mlp": 1.01184404, "epoch": 0.9308883210581692, "flos": 20531168645760.0, "grad_norm": 1.6905379955966857, "language_loss": 0.80030274, "learning_rate": 4.985365097947469e-08, "loss": 0.82448775, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18481445, "step": 15483, "time_per_iteration": 5.622990369796753 }, { "auxiliary_loss_clip": 0.01394523, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.23564303, "balance_loss_mlp": 1.01128232, "epoch": 0.9309484443108372, "flos": 13008038647680.0, "grad_norm": 2.015901487020223, "language_loss": 0.75239956, "learning_rate": 4.976727281916782e-08, "loss": 0.77664673, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18920898, "step": 15484, "time_per_iteration": 2.8016207218170166 }, { "auxiliary_loss_clip": 0.01404834, "auxiliary_loss_mlp": 0.01036553, "balance_loss_clip": 1.24339843, "balance_loss_mlp": 1.01752734, "epoch": 0.9310085675635051, "flos": 12575352193920.0, "grad_norm": 2.3175244887908253, "language_loss": 0.77351135, "learning_rate": 4.968096861188087e-08, "loss": 0.79792523, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19018555, "step": 15485, "time_per_iteration": 2.8489465713500977 }, { "auxiliary_loss_clip": 0.01408545, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.24544239, "balance_loss_mlp": 1.01541924, "epoch": 0.9310686908161732, "flos": 23488133783040.0, "grad_norm": 2.1994976877598202, "language_loss": 0.79084432, "learning_rate": 4.959473836088723e-08, "loss": 0.8152805, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19665527, "step": 15486, "time_per_iteration": 2.876664638519287 }, { "auxiliary_loss_clip": 0.01405718, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.24368465, "balance_loss_mlp": 1.01126838, "epoch": 0.9311288140688411, "flos": 24180897202560.0, "grad_norm": 2.1062959573629776, "language_loss": 0.78004003, "learning_rate": 4.950858206945674e-08, "loss": 0.80439913, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18920898, "step": 15487, "time_per_iteration": 2.881012201309204 }, { "auxiliary_loss_clip": 0.01394743, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.23548198, "balance_loss_mlp": 1.01295733, "epoch": 0.9311889373215091, "flos": 35603743132800.0, "grad_norm": 2.413693513029208, "language_loss": 0.68221402, "learning_rate": 4.942249974085633e-08, "loss": 0.70648295, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19189453, "step": 15488, "time_per_iteration": 2.964520215988159 }, { "auxiliary_loss_clip": 0.01382354, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.22689104, "balance_loss_mlp": 1.01039732, "epoch": 0.9312490605741771, "flos": 20240297953920.0, "grad_norm": 2.0224055311726845, "language_loss": 0.75728869, "learning_rate": 4.933649137834983e-08, "loss": 0.78139162, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.17553711, "step": 15489, "time_per_iteration": 2.8474299907684326 }, { "auxiliary_loss_clip": 0.0141314, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.24912345, "balance_loss_mlp": 1.01348901, "epoch": 0.931309183826845, "flos": 13957576162560.0, "grad_norm": 2.105668953530822, "language_loss": 0.81723392, "learning_rate": 4.925055698519931e-08, "loss": 0.84170401, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.20385742, "step": 15490, "time_per_iteration": 2.940305471420288 }, { "auxiliary_loss_clip": 0.01400414, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.23912501, "balance_loss_mlp": 1.01153755, "epoch": 0.931369307079513, "flos": 20166268181760.0, "grad_norm": 1.6912054180218208, "language_loss": 0.72774243, "learning_rate": 4.9164696564663264e-08, "loss": 0.7520541, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1920166, "step": 15491, "time_per_iteration": 2.9710636138916016 }, { "auxiliary_loss_clip": 0.01384309, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.22814667, "balance_loss_mlp": 1.01217699, "epoch": 0.931429430332181, "flos": 25349673611520.0, "grad_norm": 1.7583584433213046, "language_loss": 0.75425017, "learning_rate": 4.9078910119997096e-08, "loss": 0.77839959, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18444824, "step": 15492, "time_per_iteration": 2.865389108657837 }, { "auxiliary_loss_clip": 0.01186909, "auxiliary_loss_mlp": 0.0103972, "balance_loss_clip": 1.09701061, "balance_loss_mlp": 1.01635504, "epoch": 0.931489553584849, "flos": 71255699239680.0, "grad_norm": 0.7131006879813684, "language_loss": 0.53454566, "learning_rate": 4.899319765445442e-08, "loss": 0.55681193, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.23339844, "step": 15493, "time_per_iteration": 3.229370594024658 }, { "auxiliary_loss_clip": 0.01404392, "auxiliary_loss_mlp": 0.0103242, "balance_loss_clip": 1.2448647, "balance_loss_mlp": 1.0136683, "epoch": 0.9315496768375169, "flos": 14650792030080.0, "grad_norm": 2.989405385540274, "language_loss": 0.71822214, "learning_rate": 4.890755917128531e-08, "loss": 0.74259031, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18774414, "step": 15494, "time_per_iteration": 2.830939531326294 }, { "auxiliary_loss_clip": 0.01404025, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.24137282, "balance_loss_mlp": 1.01159143, "epoch": 0.9316098000901849, "flos": 28341685486080.0, "grad_norm": 1.524988114132579, "language_loss": 0.68876576, "learning_rate": 4.882199467373671e-08, "loss": 0.71311665, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19458008, "step": 15495, "time_per_iteration": 2.951896905899048 }, { "auxiliary_loss_clip": 0.01387917, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.23134255, "balance_loss_mlp": 1.01559114, "epoch": 0.9316699233428528, "flos": 28524565543680.0, "grad_norm": 1.796601263248502, "language_loss": 0.62562013, "learning_rate": 4.8736504165053815e-08, "loss": 0.64982927, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.17407227, "step": 15496, "time_per_iteration": 2.97820782661438 }, { "auxiliary_loss_clip": 0.01393479, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.23363328, "balance_loss_mlp": 1.01581323, "epoch": 0.9317300465955208, "flos": 33706568384640.0, "grad_norm": 1.5969162434435715, "language_loss": 0.77744097, "learning_rate": 4.865108764847825e-08, "loss": 0.80172098, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18713379, "step": 15497, "time_per_iteration": 2.9990978240966797 }, { "auxiliary_loss_clip": 0.0141234, "auxiliary_loss_mlp": 0.01037254, "balance_loss_clip": 1.24879622, "balance_loss_mlp": 1.01775169, "epoch": 0.9317901698481887, "flos": 23668389642240.0, "grad_norm": 1.610980878514427, "language_loss": 0.66951978, "learning_rate": 4.856574512724898e-08, "loss": 0.6940158, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19494629, "step": 15498, "time_per_iteration": 2.872574806213379 }, { "auxiliary_loss_clip": 0.01400455, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.24047172, "balance_loss_mlp": 1.01540947, "epoch": 0.9318502931008568, "flos": 20969917902720.0, "grad_norm": 1.7433947083530605, "language_loss": 0.80240792, "learning_rate": 4.8480476604602305e-08, "loss": 0.82675683, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19030762, "step": 15499, "time_per_iteration": 2.904355764389038 }, { "auxiliary_loss_clip": 0.01382558, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.2283076, "balance_loss_mlp": 1.01305652, "epoch": 0.9319104163535247, "flos": 23451593967360.0, "grad_norm": 1.6866448625865487, "language_loss": 0.77328813, "learning_rate": 4.8395282083771196e-08, "loss": 0.79743814, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.1940918, "step": 15500, "time_per_iteration": 2.8918533325195312 }, { "auxiliary_loss_clip": 0.01388127, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.23203373, "balance_loss_mlp": 1.01052928, "epoch": 0.9319705396061927, "flos": 22357933205760.0, "grad_norm": 1.5980520414634642, "language_loss": 0.72939521, "learning_rate": 4.8310161567987064e-08, "loss": 0.75356811, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18652344, "step": 15501, "time_per_iteration": 2.9042775630950928 }, { "auxiliary_loss_clip": 0.01398313, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.2368443, "balance_loss_mlp": 1.01029384, "epoch": 0.9320306628588607, "flos": 21002702400000.0, "grad_norm": 2.5065456392803522, "language_loss": 0.672454, "learning_rate": 4.822511506047666e-08, "loss": 0.69672227, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18212891, "step": 15502, "time_per_iteration": 2.9093074798583984 }, { "auxiliary_loss_clip": 0.0140846, "auxiliary_loss_mlp": 0.01036124, "balance_loss_clip": 1.24635577, "balance_loss_mlp": 1.01705098, "epoch": 0.9320907861115286, "flos": 24549824453760.0, "grad_norm": 1.4922832727814672, "language_loss": 0.6618911, "learning_rate": 4.814014256446586e-08, "loss": 0.68633687, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19067383, "step": 15503, "time_per_iteration": 2.9178342819213867 }, { "auxiliary_loss_clip": 0.01403777, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.24126768, "balance_loss_mlp": 1.01058125, "epoch": 0.9321509093641966, "flos": 19793178408960.0, "grad_norm": 1.5282282575169222, "language_loss": 0.75753772, "learning_rate": 4.805524408317652e-08, "loss": 0.78187186, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19055176, "step": 15504, "time_per_iteration": 2.8795325756073 }, { "auxiliary_loss_clip": 0.01412172, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.25141978, "balance_loss_mlp": 1.01471817, "epoch": 0.9322110326168646, "flos": 24983415803520.0, "grad_norm": 4.794244815377157, "language_loss": 0.72093689, "learning_rate": 4.797041961982762e-08, "loss": 0.74540639, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20043945, "step": 15505, "time_per_iteration": 2.923491954803467 }, { "auxiliary_loss_clip": 0.0139794, "auxiliary_loss_mlp": 0.01035732, "balance_loss_clip": 1.23620033, "balance_loss_mlp": 1.01606286, "epoch": 0.9322711558695326, "flos": 16152227343360.0, "grad_norm": 1.8128670908932207, "language_loss": 0.76193595, "learning_rate": 4.788566917763614e-08, "loss": 0.78627264, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19665527, "step": 15506, "time_per_iteration": 2.8324222564697266 }, { "auxiliary_loss_clip": 0.01384288, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.22940707, "balance_loss_mlp": 1.01032424, "epoch": 0.9323312791222005, "flos": 23742917107200.0, "grad_norm": 2.449768793748549, "language_loss": 0.83756793, "learning_rate": 4.780099275981597e-08, "loss": 0.86169809, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18395996, "step": 15507, "time_per_iteration": 2.8786239624023438 }, { "auxiliary_loss_clip": 0.0139821, "auxiliary_loss_mlp": 0.0103563, "balance_loss_clip": 1.23676038, "balance_loss_mlp": 1.01591265, "epoch": 0.9323914023748685, "flos": 20787852251520.0, "grad_norm": 1.602206790039189, "language_loss": 0.68255544, "learning_rate": 4.771639036957742e-08, "loss": 0.70689386, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19726562, "step": 15508, "time_per_iteration": 4.287375211715698 }, { "auxiliary_loss_clip": 0.01393071, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.23574543, "balance_loss_mlp": 1.01398778, "epoch": 0.9324515256275364, "flos": 23925797164800.0, "grad_norm": 1.721947269409349, "language_loss": 0.73215258, "learning_rate": 4.7631862010129033e-08, "loss": 0.7564196, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.19641113, "step": 15509, "time_per_iteration": 2.905116319656372 }, { "auxiliary_loss_clip": 0.01395568, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.23584318, "balance_loss_mlp": 1.01206589, "epoch": 0.9325116488802044, "flos": 18014491088640.0, "grad_norm": 1.8185875024737304, "language_loss": 0.74653953, "learning_rate": 4.754740768467624e-08, "loss": 0.770805, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18920898, "step": 15510, "time_per_iteration": 2.816983461380005 }, { "auxiliary_loss_clip": 0.01415647, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.25136852, "balance_loss_mlp": 1.01647425, "epoch": 0.9325717721328723, "flos": 29033136806400.0, "grad_norm": 2.1543788971977813, "language_loss": 0.70921445, "learning_rate": 4.746302739642161e-08, "loss": 0.7337265, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19067383, "step": 15511, "time_per_iteration": 4.370168685913086 }, { "auxiliary_loss_clip": 0.01398515, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.23927069, "balance_loss_mlp": 1.01792336, "epoch": 0.9326318953855404, "flos": 21654627747840.0, "grad_norm": 2.1571795446524322, "language_loss": 0.78686076, "learning_rate": 4.737872114856412e-08, "loss": 0.81121254, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18725586, "step": 15512, "time_per_iteration": 2.8278722763061523 }, { "auxiliary_loss_clip": 0.01395483, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.23745549, "balance_loss_mlp": 1.01179171, "epoch": 0.9326920186382083, "flos": 26076352648320.0, "grad_norm": 1.4466495654822484, "language_loss": 0.80880153, "learning_rate": 4.7294488944301436e-08, "loss": 0.83306468, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19042969, "step": 15513, "time_per_iteration": 2.852989435195923 }, { "auxiliary_loss_clip": 0.01417324, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.25308442, "balance_loss_mlp": 1.01629889, "epoch": 0.9327521418908763, "flos": 12064609180800.0, "grad_norm": 2.1499748871037165, "language_loss": 0.81602192, "learning_rate": 4.721033078682768e-08, "loss": 0.84057212, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.21411133, "step": 15514, "time_per_iteration": 2.82910418510437 }, { "auxiliary_loss_clip": 0.01391015, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.23455834, "balance_loss_mlp": 1.01779413, "epoch": 0.9328122651435443, "flos": 43849661114880.0, "grad_norm": 2.2540383642343644, "language_loss": 0.72434092, "learning_rate": 4.7126246679333626e-08, "loss": 0.74861604, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18701172, "step": 15515, "time_per_iteration": 3.1750433444976807 }, { "auxiliary_loss_clip": 0.01420662, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.2554121, "balance_loss_mlp": 1.02089155, "epoch": 0.9328723883962122, "flos": 15203278010880.0, "grad_norm": 2.474715698899694, "language_loss": 0.81592816, "learning_rate": 4.704223662500806e-08, "loss": 0.84053755, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19384766, "step": 15516, "time_per_iteration": 2.86899733543396 }, { "auxiliary_loss_clip": 0.01404159, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.24157965, "balance_loss_mlp": 1.01601791, "epoch": 0.9329325116488802, "flos": 20270186784000.0, "grad_norm": 1.809479818349552, "language_loss": 0.81659985, "learning_rate": 4.695830062703643e-08, "loss": 0.84098947, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18798828, "step": 15517, "time_per_iteration": 2.8763108253479004 }, { "auxiliary_loss_clip": 0.01406029, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.24486351, "balance_loss_mlp": 1.0124507, "epoch": 0.9329926349015482, "flos": 13122725512320.0, "grad_norm": 1.9559774054271344, "language_loss": 0.75743866, "learning_rate": 4.687443868860219e-08, "loss": 0.78182077, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19702148, "step": 15518, "time_per_iteration": 5.769965171813965 }, { "auxiliary_loss_clip": 0.01402235, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.24348509, "balance_loss_mlp": 1.01762438, "epoch": 0.9330527581542162, "flos": 23050832359680.0, "grad_norm": 2.074152970682481, "language_loss": 0.76739913, "learning_rate": 4.679065081288458e-08, "loss": 0.79179192, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19421387, "step": 15519, "time_per_iteration": 2.8666892051696777 }, { "auxiliary_loss_clip": 0.01392346, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.23390603, "balance_loss_mlp": 1.0141865, "epoch": 0.9331128814068841, "flos": 15567997495680.0, "grad_norm": 1.8846013303014395, "language_loss": 0.83949375, "learning_rate": 4.6706937003061275e-08, "loss": 0.86375052, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19152832, "step": 15520, "time_per_iteration": 2.820233106613159 }, { "auxiliary_loss_clip": 0.01384564, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.22702193, "balance_loss_mlp": 1.01225114, "epoch": 0.9331730046595521, "flos": 22281550704000.0, "grad_norm": 1.6922956985972841, "language_loss": 0.76697099, "learning_rate": 4.6623297262306846e-08, "loss": 0.79112381, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18457031, "step": 15521, "time_per_iteration": 2.92683482170105 }, { "auxiliary_loss_clip": 0.01407752, "auxiliary_loss_mlp": 0.01031979, "balance_loss_clip": 1.24924374, "balance_loss_mlp": 1.01341844, "epoch": 0.93323312791222, "flos": 15785833800960.0, "grad_norm": 1.5737427967463962, "language_loss": 0.78213996, "learning_rate": 4.6539731593792545e-08, "loss": 0.80653727, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18554688, "step": 15522, "time_per_iteration": 2.85965633392334 }, { "auxiliary_loss_clip": 0.01396464, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.23708272, "balance_loss_mlp": 1.01304138, "epoch": 0.933293251164888, "flos": 22019301987840.0, "grad_norm": 1.9525539800648035, "language_loss": 0.63748342, "learning_rate": 4.6456240000687373e-08, "loss": 0.66177094, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19250488, "step": 15523, "time_per_iteration": 2.824198007583618 }, { "auxiliary_loss_clip": 0.01400613, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.24215794, "balance_loss_mlp": 1.01689839, "epoch": 0.933353374417556, "flos": 26042527520640.0, "grad_norm": 1.6019940144560039, "language_loss": 0.69019413, "learning_rate": 4.63728224861577e-08, "loss": 0.71455556, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.1862793, "step": 15524, "time_per_iteration": 2.883239269256592 }, { "auxiliary_loss_clip": 0.01403912, "auxiliary_loss_mlp": 0.01035577, "balance_loss_clip": 1.24254, "balance_loss_mlp": 1.01589549, "epoch": 0.933413497670224, "flos": 24910924354560.0, "grad_norm": 1.6545781087448683, "language_loss": 0.74585158, "learning_rate": 4.628947905336589e-08, "loss": 0.77024651, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19677734, "step": 15525, "time_per_iteration": 2.8922815322875977 }, { "auxiliary_loss_clip": 0.01388445, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.23200583, "balance_loss_mlp": 1.01473391, "epoch": 0.9334736209228919, "flos": 23697283086720.0, "grad_norm": 1.9667043249449998, "language_loss": 0.84389007, "learning_rate": 4.6206209705473175e-08, "loss": 0.8681134, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19152832, "step": 15526, "time_per_iteration": 2.8421835899353027 }, { "auxiliary_loss_clip": 0.01402018, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.24012947, "balance_loss_mlp": 1.0106504, "epoch": 0.9335337441755599, "flos": 15385705620480.0, "grad_norm": 1.72922197481918, "language_loss": 0.69753528, "learning_rate": 4.61230144456366e-08, "loss": 0.72185194, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18994141, "step": 15527, "time_per_iteration": 2.81846284866333 }, { "auxiliary_loss_clip": 0.01404569, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.24148107, "balance_loss_mlp": 1.0129354, "epoch": 0.9335938674282279, "flos": 16115189834880.0, "grad_norm": 2.316801211349661, "language_loss": 0.66278559, "learning_rate": 4.603989327701141e-08, "loss": 0.68716007, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19946289, "step": 15528, "time_per_iteration": 2.8125429153442383 }, { "auxiliary_loss_clip": 0.01402696, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.24080801, "balance_loss_mlp": 1.01446962, "epoch": 0.9336539906808958, "flos": 18962173566720.0, "grad_norm": 2.022948114664363, "language_loss": 0.7602731, "learning_rate": 4.5956846202748867e-08, "loss": 0.78463912, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19421387, "step": 15529, "time_per_iteration": 2.8415586948394775 }, { "auxiliary_loss_clip": 0.01396939, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.23782504, "balance_loss_mlp": 1.01392412, "epoch": 0.9337141139335638, "flos": 18116961857280.0, "grad_norm": 3.281671061384607, "language_loss": 0.6344955, "learning_rate": 4.5873873225998674e-08, "loss": 0.65878993, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18579102, "step": 15530, "time_per_iteration": 2.8474481105804443 }, { "auxiliary_loss_clip": 0.01377255, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.22257853, "balance_loss_mlp": 1.01401091, "epoch": 0.9337742371862318, "flos": 17354104963200.0, "grad_norm": 1.908940206300086, "language_loss": 0.72863519, "learning_rate": 4.5790974349907194e-08, "loss": 0.75273836, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.19067383, "step": 15531, "time_per_iteration": 2.8562252521514893 }, { "auxiliary_loss_clip": 0.01397147, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.23799646, "balance_loss_mlp": 1.0119009, "epoch": 0.9338343604388998, "flos": 29070762497280.0, "grad_norm": 1.6091420699236476, "language_loss": 0.71731067, "learning_rate": 4.5708149577617925e-08, "loss": 0.74159777, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19665527, "step": 15532, "time_per_iteration": 2.900383472442627 }, { "auxiliary_loss_clip": 0.01400032, "auxiliary_loss_mlp": 0.01033013, "balance_loss_clip": 1.23891091, "balance_loss_mlp": 1.01355839, "epoch": 0.9338944836915677, "flos": 18670081265280.0, "grad_norm": 1.4947229987298272, "language_loss": 0.73796171, "learning_rate": 4.5625398912271016e-08, "loss": 0.76229215, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19445801, "step": 15533, "time_per_iteration": 2.832350015640259 }, { "auxiliary_loss_clip": 0.01393664, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.23646927, "balance_loss_mlp": 1.01481533, "epoch": 0.9339546069442357, "flos": 16626385296000.0, "grad_norm": 1.7924302237219938, "language_loss": 0.80884051, "learning_rate": 4.554272235700507e-08, "loss": 0.83310926, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18408203, "step": 15534, "time_per_iteration": 2.8324015140533447 }, { "auxiliary_loss_clip": 0.01374645, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.22365284, "balance_loss_mlp": 1.01290238, "epoch": 0.9340147301969036, "flos": 23702848197120.0, "grad_norm": 1.7718300959999087, "language_loss": 0.75039494, "learning_rate": 4.546011991495513e-08, "loss": 0.774459, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 1.51074219, "router_z_loss_mlp": 0.18847656, "step": 15535, "time_per_iteration": 2.86625599861145 }, { "auxiliary_loss_clip": 0.01403515, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.24338746, "balance_loss_mlp": 1.0114857, "epoch": 0.9340748534495716, "flos": 28665295430400.0, "grad_norm": 1.9528418821346103, "language_loss": 0.78338492, "learning_rate": 4.537759158925292e-08, "loss": 0.80772471, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18969727, "step": 15536, "time_per_iteration": 2.896688461303711 }, { "auxiliary_loss_clip": 0.01398586, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.23802876, "balance_loss_mlp": 1.01244175, "epoch": 0.9341349767022396, "flos": 24910336172160.0, "grad_norm": 1.467517994044706, "language_loss": 0.81143749, "learning_rate": 4.5295137383028593e-08, "loss": 0.83574045, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19262695, "step": 15537, "time_per_iteration": 2.9201860427856445 }, { "auxiliary_loss_clip": 0.01413089, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.25073779, "balance_loss_mlp": 1.0136894, "epoch": 0.9341950999549076, "flos": 29071441169280.0, "grad_norm": 1.7331553638721642, "language_loss": 0.78790957, "learning_rate": 4.5212757299408764e-08, "loss": 0.81236213, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18469238, "step": 15538, "time_per_iteration": 2.913827657699585 }, { "auxiliary_loss_clip": 0.01385924, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.22829616, "balance_loss_mlp": 1.01326537, "epoch": 0.9342552232075755, "flos": 23597662740480.0, "grad_norm": 1.6398652049345188, "language_loss": 0.73962355, "learning_rate": 4.513045134151672e-08, "loss": 0.76380396, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18859863, "step": 15539, "time_per_iteration": 2.8836214542388916 }, { "auxiliary_loss_clip": 0.0138744, "auxiliary_loss_mlp": 0.01033688, "balance_loss_clip": 1.2312969, "balance_loss_mlp": 1.0158422, "epoch": 0.9343153464602435, "flos": 36735255809280.0, "grad_norm": 1.4767676506708085, "language_loss": 0.6545285, "learning_rate": 4.504821951247373e-08, "loss": 0.67873973, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.17858887, "step": 15540, "time_per_iteration": 3.0191965103149414 }, { "auxiliary_loss_clip": 0.01394607, "auxiliary_loss_mlp": 0.01030988, "balance_loss_clip": 1.2355547, "balance_loss_mlp": 1.01211715, "epoch": 0.9343754697129115, "flos": 22246594456320.0, "grad_norm": 2.881702445961392, "language_loss": 0.76760346, "learning_rate": 4.496606181539864e-08, "loss": 0.79185945, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18859863, "step": 15541, "time_per_iteration": 2.894556760787964 }, { "auxiliary_loss_clip": 0.01395304, "auxiliary_loss_mlp": 0.01031871, "balance_loss_clip": 1.237396, "balance_loss_mlp": 1.01278591, "epoch": 0.9344355929655794, "flos": 29720878053120.0, "grad_norm": 1.9214964556906093, "language_loss": 0.6778962, "learning_rate": 4.4883978253406066e-08, "loss": 0.70216799, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19067383, "step": 15542, "time_per_iteration": 2.9487991333007812 }, { "auxiliary_loss_clip": 0.01396121, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.23825598, "balance_loss_mlp": 1.00900304, "epoch": 0.9344957162182475, "flos": 18889772607360.0, "grad_norm": 1.7890967742813073, "language_loss": 0.70534372, "learning_rate": 4.480196882960907e-08, "loss": 0.7295928, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19787598, "step": 15543, "time_per_iteration": 4.296624422073364 }, { "auxiliary_loss_clip": 0.01414063, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.25006652, "balance_loss_mlp": 1.01483226, "epoch": 0.9345558394709154, "flos": 27429275969280.0, "grad_norm": 1.9226675916395377, "language_loss": 0.70552075, "learning_rate": 4.4720033547117394e-08, "loss": 0.73002946, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.21984863, "step": 15544, "time_per_iteration": 2.909769296646118 }, { "auxiliary_loss_clip": 0.0140513, "auxiliary_loss_mlp": 0.01029466, "balance_loss_clip": 1.24372578, "balance_loss_mlp": 1.01096475, "epoch": 0.9346159627235834, "flos": 20751176701440.0, "grad_norm": 2.365760803643456, "language_loss": 0.78244817, "learning_rate": 4.463817240903789e-08, "loss": 0.80679417, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18493652, "step": 15545, "time_per_iteration": 2.8744499683380127 }, { "auxiliary_loss_clip": 0.01405817, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 1.24287701, "balance_loss_mlp": 1.01087189, "epoch": 0.9346760859762513, "flos": 21079084901760.0, "grad_norm": 1.6009421487637157, "language_loss": 0.69753408, "learning_rate": 4.455638541847495e-08, "loss": 0.72189009, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18920898, "step": 15546, "time_per_iteration": 2.880429267883301 }, { "auxiliary_loss_clip": 0.0139604, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.24097347, "balance_loss_mlp": 1.0135392, "epoch": 0.9347362092289193, "flos": 29216740780800.0, "grad_norm": 1.7222280192593653, "language_loss": 0.83387172, "learning_rate": 4.447467257852966e-08, "loss": 0.8581503, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.18273926, "step": 15547, "time_per_iteration": 4.318403244018555 }, { "auxiliary_loss_clip": 0.01387244, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.23179746, "balance_loss_mlp": 1.01206684, "epoch": 0.9347963324815872, "flos": 19436783967360.0, "grad_norm": 1.7404132689911553, "language_loss": 0.84262055, "learning_rate": 4.439303389230087e-08, "loss": 0.86679405, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18029785, "step": 15548, "time_per_iteration": 2.807861804962158 }, { "auxiliary_loss_clip": 0.01411563, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.24684095, "balance_loss_mlp": 1.01492524, "epoch": 0.9348564557342552, "flos": 36915240199680.0, "grad_norm": 1.7678605164075185, "language_loss": 0.66120124, "learning_rate": 4.4311469362884326e-08, "loss": 0.68566394, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19787598, "step": 15549, "time_per_iteration": 2.9554319381713867 }, { "auxiliary_loss_clip": 0.01408665, "auxiliary_loss_mlp": 0.01035771, "balance_loss_clip": 1.24840546, "balance_loss_mlp": 1.01572013, "epoch": 0.9349165789869232, "flos": 21700307013120.0, "grad_norm": 1.7107477847247619, "language_loss": 0.80494797, "learning_rate": 4.4229978993372665e-08, "loss": 0.82939231, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.20056152, "step": 15550, "time_per_iteration": 2.8479788303375244 }, { "auxiliary_loss_clip": 0.01396412, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.23805225, "balance_loss_mlp": 1.01544809, "epoch": 0.9349767022395912, "flos": 18853459015680.0, "grad_norm": 1.7715649438077197, "language_loss": 0.76539409, "learning_rate": 4.4148562786856524e-08, "loss": 0.78970599, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19335938, "step": 15551, "time_per_iteration": 2.863986015319824 }, { "auxiliary_loss_clip": 0.01389296, "auxiliary_loss_mlp": 0.01031168, "balance_loss_clip": 1.23419404, "balance_loss_mlp": 1.01334655, "epoch": 0.9350368254922591, "flos": 24984365944320.0, "grad_norm": 1.6626048545321366, "language_loss": 0.73791707, "learning_rate": 4.406722074642255e-08, "loss": 0.76212174, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.17834473, "step": 15552, "time_per_iteration": 2.9901864528656006 }, { "auxiliary_loss_clip": 0.01400187, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.24076581, "balance_loss_mlp": 1.01251292, "epoch": 0.9350969487449271, "flos": 23079590069760.0, "grad_norm": 1.712687664754558, "language_loss": 0.78124905, "learning_rate": 4.3985952875155386e-08, "loss": 0.80555475, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17883301, "step": 15553, "time_per_iteration": 5.580382585525513 }, { "auxiliary_loss_clip": 0.01401781, "auxiliary_loss_mlp": 0.01037307, "balance_loss_clip": 1.24107766, "balance_loss_mlp": 1.01741099, "epoch": 0.9351570719975951, "flos": 18634536835200.0, "grad_norm": 2.4448154641405146, "language_loss": 0.79192197, "learning_rate": 4.390475917613723e-08, "loss": 0.81631291, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19897461, "step": 15554, "time_per_iteration": 2.82419753074646 }, { "auxiliary_loss_clip": 0.01381645, "auxiliary_loss_mlp": 0.01030245, "balance_loss_clip": 1.22655249, "balance_loss_mlp": 1.01245904, "epoch": 0.935217195250263, "flos": 15896901081600.0, "grad_norm": 1.8328274831235163, "language_loss": 0.69627661, "learning_rate": 4.382363965244695e-08, "loss": 0.72039545, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.17797852, "step": 15555, "time_per_iteration": 2.856189250946045 }, { "auxiliary_loss_clip": 0.01388933, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.23144579, "balance_loss_mlp": 1.01292109, "epoch": 0.935277318502931, "flos": 24401267216640.0, "grad_norm": 1.5878715864064672, "language_loss": 0.75880569, "learning_rate": 4.374259430715965e-08, "loss": 0.78301227, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18798828, "step": 15556, "time_per_iteration": 2.9026272296905518 }, { "auxiliary_loss_clip": 0.01395014, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.23645318, "balance_loss_mlp": 1.01289582, "epoch": 0.935337441755599, "flos": 27611839313280.0, "grad_norm": 1.4763172747621789, "language_loss": 0.7357884, "learning_rate": 4.366162314334953e-08, "loss": 0.76005977, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19238281, "step": 15557, "time_per_iteration": 2.940505266189575 }, { "auxiliary_loss_clip": 0.01400401, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.2405858, "balance_loss_mlp": 1.01273847, "epoch": 0.935397565008267, "flos": 20491914142080.0, "grad_norm": 1.5638769152602652, "language_loss": 0.63703585, "learning_rate": 4.358072616408681e-08, "loss": 0.66135979, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19274902, "step": 15558, "time_per_iteration": 2.8757224082946777 }, { "auxiliary_loss_clip": 0.01403513, "auxiliary_loss_mlp": 0.01034998, "balance_loss_clip": 1.244241, "balance_loss_mlp": 1.01498246, "epoch": 0.9354576882609349, "flos": 23663548448640.0, "grad_norm": 1.8951485304941278, "language_loss": 0.74455488, "learning_rate": 4.34999033724388e-08, "loss": 0.76893997, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.20007324, "step": 15559, "time_per_iteration": 2.8501529693603516 }, { "auxiliary_loss_clip": 0.01398426, "auxiliary_loss_mlp": 0.01028088, "balance_loss_clip": 1.24033451, "balance_loss_mlp": 1.01088643, "epoch": 0.9355178115136029, "flos": 36698761238400.0, "grad_norm": 1.5636549476693904, "language_loss": 0.64529705, "learning_rate": 4.341915477147062e-08, "loss": 0.66956216, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.17199707, "step": 15560, "time_per_iteration": 2.9709084033966064 }, { "auxiliary_loss_clip": 0.01427793, "auxiliary_loss_mlp": 0.01037494, "balance_loss_clip": 1.25798368, "balance_loss_mlp": 1.01622653, "epoch": 0.9355779347662708, "flos": 14467278545280.0, "grad_norm": 1.9927432104178116, "language_loss": 0.64983219, "learning_rate": 4.3338480364244034e-08, "loss": 0.67448509, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 1.69824219, "router_z_loss_mlp": 0.21264648, "step": 15561, "time_per_iteration": 2.800750732421875 }, { "auxiliary_loss_clip": 0.01389961, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.23259854, "balance_loss_mlp": 1.01452529, "epoch": 0.9356380580189388, "flos": 23196539174400.0, "grad_norm": 1.8351018270596962, "language_loss": 0.75941736, "learning_rate": 4.325788015381859e-08, "loss": 0.78365779, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19580078, "step": 15562, "time_per_iteration": 2.9363183975219727 }, { "auxiliary_loss_clip": 0.01179786, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.09280348, "balance_loss_mlp": 1.01183403, "epoch": 0.9356981812716068, "flos": 67501056695040.0, "grad_norm": 0.9412912110840377, "language_loss": 0.62397099, "learning_rate": 4.31773541432503e-08, "loss": 0.64610279, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.21582031, "step": 15563, "time_per_iteration": 3.206998109817505 }, { "auxiliary_loss_clip": 0.01385723, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.2302227, "balance_loss_mlp": 1.01363564, "epoch": 0.9357583045242748, "flos": 24692499866880.0, "grad_norm": 1.5438989481067655, "language_loss": 0.78656834, "learning_rate": 4.3096902335592714e-08, "loss": 0.81076103, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.19897461, "step": 15564, "time_per_iteration": 2.8773157596588135 }, { "auxiliary_loss_clip": 0.01407308, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.24400699, "balance_loss_mlp": 1.0141623, "epoch": 0.9358184277769427, "flos": 19473142803840.0, "grad_norm": 2.353952734640925, "language_loss": 0.78959119, "learning_rate": 4.301652473389694e-08, "loss": 0.81399894, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19311523, "step": 15565, "time_per_iteration": 2.8308422565460205 }, { "auxiliary_loss_clip": 0.01386867, "auxiliary_loss_mlp": 0.0103021, "balance_loss_clip": 1.23035729, "balance_loss_mlp": 1.01266289, "epoch": 0.9358785510296107, "flos": 18926402912640.0, "grad_norm": 1.8237209961151926, "language_loss": 0.72280133, "learning_rate": 4.2936221341210774e-08, "loss": 0.74697208, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.17553711, "step": 15566, "time_per_iteration": 2.815019130706787 }, { "auxiliary_loss_clip": 0.01399829, "auxiliary_loss_mlp": 0.01031854, "balance_loss_clip": 1.23797882, "balance_loss_mlp": 1.01304305, "epoch": 0.9359386742822787, "flos": 23451865436160.0, "grad_norm": 5.1628753450131795, "language_loss": 0.6839478, "learning_rate": 4.285599216057889e-08, "loss": 0.70826465, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18823242, "step": 15567, "time_per_iteration": 2.8701229095458984 }, { "auxiliary_loss_clip": 0.01401898, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.24266505, "balance_loss_mlp": 1.01281357, "epoch": 0.9359987975349466, "flos": 32756804645760.0, "grad_norm": 2.0529335103127466, "language_loss": 0.63119531, "learning_rate": 4.277583719504418e-08, "loss": 0.65552402, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18164062, "step": 15568, "time_per_iteration": 2.9120895862579346 }, { "auxiliary_loss_clip": 0.01390623, "auxiliary_loss_mlp": 0.01034696, "balance_loss_clip": 1.23237491, "balance_loss_mlp": 1.01656461, "epoch": 0.9360589207876147, "flos": 22829738428800.0, "grad_norm": 2.0309980266612238, "language_loss": 0.79501373, "learning_rate": 4.269575644764556e-08, "loss": 0.81926692, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18139648, "step": 15569, "time_per_iteration": 2.837771415710449 }, { "auxiliary_loss_clip": 0.01397753, "auxiliary_loss_mlp": 0.01031871, "balance_loss_clip": 1.23608685, "balance_loss_mlp": 1.01260638, "epoch": 0.9361190440402826, "flos": 20894666520960.0, "grad_norm": 5.150432230626565, "language_loss": 0.70384824, "learning_rate": 4.261574992142014e-08, "loss": 0.72814441, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19262695, "step": 15570, "time_per_iteration": 2.849252223968506 }, { "auxiliary_loss_clip": 0.01398438, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.23839617, "balance_loss_mlp": 1.00974715, "epoch": 0.9361791672929506, "flos": 19327209765120.0, "grad_norm": 1.6947087464207835, "language_loss": 0.79659462, "learning_rate": 4.2535817619401726e-08, "loss": 0.82086438, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18798828, "step": 15571, "time_per_iteration": 2.82601261138916 }, { "auxiliary_loss_clip": 0.01399652, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 1.23990917, "balance_loss_mlp": 1.01067388, "epoch": 0.9362392905456185, "flos": 15166421481600.0, "grad_norm": 2.201512508415923, "language_loss": 0.78491724, "learning_rate": 4.2455959544621224e-08, "loss": 0.80920947, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18896484, "step": 15572, "time_per_iteration": 2.844223737716675 }, { "auxiliary_loss_clip": 0.01381833, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.22641253, "balance_loss_mlp": 1.01298594, "epoch": 0.9362994137982865, "flos": 22094734348800.0, "grad_norm": 1.6279461400042996, "language_loss": 0.78571594, "learning_rate": 4.237617570010688e-08, "loss": 0.80984658, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18261719, "step": 15573, "time_per_iteration": 2.823782205581665 }, { "auxiliary_loss_clip": 0.01385026, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.22994423, "balance_loss_mlp": 1.01168036, "epoch": 0.9363595370509544, "flos": 23522366113920.0, "grad_norm": 1.5516864680127629, "language_loss": 0.74739647, "learning_rate": 4.2296466088884044e-08, "loss": 0.77154899, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18554688, "step": 15574, "time_per_iteration": 2.8705625534057617 }, { "auxiliary_loss_clip": 0.01393301, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.23709929, "balance_loss_mlp": 1.01168263, "epoch": 0.9364196603036224, "flos": 27134288000640.0, "grad_norm": 1.9655918587353216, "language_loss": 0.69340789, "learning_rate": 4.221683071397564e-08, "loss": 0.71764821, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.19067383, "step": 15575, "time_per_iteration": 2.909931182861328 }, { "auxiliary_loss_clip": 0.01380113, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.22650242, "balance_loss_mlp": 1.01362491, "epoch": 0.9364797835562904, "flos": 18488332327680.0, "grad_norm": 1.5671891680311858, "language_loss": 0.66393751, "learning_rate": 4.2137269578401026e-08, "loss": 0.68806529, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.19042969, "step": 15576, "time_per_iteration": 2.8272457122802734 }, { "auxiliary_loss_clip": 0.01399948, "auxiliary_loss_mlp": 0.01029198, "balance_loss_clip": 1.23837256, "balance_loss_mlp": 1.01066113, "epoch": 0.9365399068089584, "flos": 13013241799680.0, "grad_norm": 2.3751027552936175, "language_loss": 0.77323395, "learning_rate": 4.2057782685177566e-08, "loss": 0.79752541, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1854248, "step": 15577, "time_per_iteration": 2.814401388168335 }, { "auxiliary_loss_clip": 0.01402131, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.24126959, "balance_loss_mlp": 1.01632965, "epoch": 0.9366000300616263, "flos": 25677491322240.0, "grad_norm": 1.9432758964432237, "language_loss": 0.53434741, "learning_rate": 4.1978370037318855e-08, "loss": 0.55872583, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19384766, "step": 15578, "time_per_iteration": 4.387479543685913 }, { "auxiliary_loss_clip": 0.01371328, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.21730804, "balance_loss_mlp": 1.01174426, "epoch": 0.9366601533142943, "flos": 21443125714560.0, "grad_norm": 1.6697283682582675, "language_loss": 0.70965576, "learning_rate": 4.189903163783692e-08, "loss": 0.73367155, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18493652, "step": 15579, "time_per_iteration": 2.866657257080078 }, { "auxiliary_loss_clip": 0.01395307, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.23685074, "balance_loss_mlp": 1.01206994, "epoch": 0.9367202765669622, "flos": 24102750153600.0, "grad_norm": 2.142095399152322, "language_loss": 0.77339637, "learning_rate": 4.181976748973959e-08, "loss": 0.79764956, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.17932129, "step": 15580, "time_per_iteration": 2.8751673698425293 }, { "auxiliary_loss_clip": 0.01408916, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.24648321, "balance_loss_mlp": 1.01452065, "epoch": 0.9367803998196302, "flos": 20899191000960.0, "grad_norm": 1.8743423567282866, "language_loss": 0.67383599, "learning_rate": 4.1740577596033114e-08, "loss": 0.69827271, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20202637, "step": 15581, "time_per_iteration": 2.8453528881073 }, { "auxiliary_loss_clip": 0.01390997, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.2338469, "balance_loss_mlp": 1.01076734, "epoch": 0.9368405230722983, "flos": 22574412167040.0, "grad_norm": 2.0485340247921973, "language_loss": 0.76538444, "learning_rate": 4.166146195972042e-08, "loss": 0.78958488, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18273926, "step": 15582, "time_per_iteration": 4.385905027389526 }, { "auxiliary_loss_clip": 0.01394612, "auxiliary_loss_mlp": 0.01035232, "balance_loss_clip": 1.23648143, "balance_loss_mlp": 1.01614618, "epoch": 0.9369006463249662, "flos": 18889546383360.0, "grad_norm": 1.6218225136245776, "language_loss": 0.74711621, "learning_rate": 4.1582420583800905e-08, "loss": 0.77141464, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1907959, "step": 15583, "time_per_iteration": 2.8466501235961914 }, { "auxiliary_loss_clip": 0.01411898, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.248559, "balance_loss_mlp": 1.01394582, "epoch": 0.9369607695776342, "flos": 26443741576320.0, "grad_norm": 2.165010940520297, "language_loss": 0.853773, "learning_rate": 4.1503453471272376e-08, "loss": 0.87822551, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.1940918, "step": 15584, "time_per_iteration": 2.8633909225463867 }, { "auxiliary_loss_clip": 0.01420834, "auxiliary_loss_mlp": 0.01033865, "balance_loss_clip": 1.25544286, "balance_loss_mlp": 1.01402819, "epoch": 0.9370208928303021, "flos": 39581968072320.0, "grad_norm": 1.5572466371479246, "language_loss": 0.72744983, "learning_rate": 4.1424560625129334e-08, "loss": 0.75199687, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19824219, "step": 15585, "time_per_iteration": 3.0002529621124268 }, { "auxiliary_loss_clip": 0.01383211, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.22786403, "balance_loss_mlp": 1.009431, "epoch": 0.9370810160829701, "flos": 22972594821120.0, "grad_norm": 1.7734803051365042, "language_loss": 0.81087565, "learning_rate": 4.134574204836316e-08, "loss": 0.83498466, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18249512, "step": 15586, "time_per_iteration": 2.8816869258880615 }, { "auxiliary_loss_clip": 0.01391025, "auxiliary_loss_mlp": 0.01034076, "balance_loss_clip": 1.23328447, "balance_loss_mlp": 1.01513338, "epoch": 0.937141139335638, "flos": 23085200424960.0, "grad_norm": 1.6082296425172466, "language_loss": 0.77166939, "learning_rate": 4.126699774396258e-08, "loss": 0.79592037, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1895752, "step": 15587, "time_per_iteration": 2.910634756088257 }, { "auxiliary_loss_clip": 0.01408804, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.24547136, "balance_loss_mlp": 1.01556039, "epoch": 0.937201262588306, "flos": 16363774621440.0, "grad_norm": 2.1532447252120632, "language_loss": 0.8801465, "learning_rate": 4.118832771491387e-08, "loss": 0.90458715, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.19702148, "step": 15588, "time_per_iteration": 5.5915868282318115 }, { "auxiliary_loss_clip": 0.01386727, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.2321527, "balance_loss_mlp": 1.01284885, "epoch": 0.937261385840974, "flos": 20203848627840.0, "grad_norm": 1.7799875177970732, "language_loss": 0.79017729, "learning_rate": 4.11097319642002e-08, "loss": 0.81435508, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.18188477, "step": 15589, "time_per_iteration": 2.833195924758911 }, { "auxiliary_loss_clip": 0.01384671, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.22881746, "balance_loss_mlp": 1.01571417, "epoch": 0.937321509093642, "flos": 18304999822080.0, "grad_norm": 2.122253628165616, "language_loss": 0.78005683, "learning_rate": 4.103121049480163e-08, "loss": 0.80425823, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.1973877, "step": 15590, "time_per_iteration": 2.787128210067749 }, { "auxiliary_loss_clip": 0.01406664, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.24367535, "balance_loss_mlp": 1.01323128, "epoch": 0.9373816323463099, "flos": 25895599096320.0, "grad_norm": 1.8073955535254873, "language_loss": 0.71997482, "learning_rate": 4.095276330969577e-08, "loss": 0.74437082, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19702148, "step": 15591, "time_per_iteration": 2.8492865562438965 }, { "auxiliary_loss_clip": 0.01416041, "auxiliary_loss_mlp": 0.01033327, "balance_loss_clip": 1.25156665, "balance_loss_mlp": 1.01360965, "epoch": 0.9374417555989779, "flos": 27210308544000.0, "grad_norm": 1.9018842955238968, "language_loss": 0.54752171, "learning_rate": 4.0874390411857804e-08, "loss": 0.5720154, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19714355, "step": 15592, "time_per_iteration": 2.8730247020721436 }, { "auxiliary_loss_clip": 0.01395289, "auxiliary_loss_mlp": 0.01029686, "balance_loss_clip": 1.23677504, "balance_loss_mlp": 1.01170909, "epoch": 0.9375018788516458, "flos": 23631261644160.0, "grad_norm": 1.4993081523272904, "language_loss": 0.67606604, "learning_rate": 4.0796091804259136e-08, "loss": 0.70031571, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.1796875, "step": 15593, "time_per_iteration": 2.850207567214966 }, { "auxiliary_loss_clip": 0.01398386, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.23785949, "balance_loss_mlp": 1.01346338, "epoch": 0.9375620021043138, "flos": 22690139662080.0, "grad_norm": 1.628406830187153, "language_loss": 0.74392271, "learning_rate": 4.0717867489868715e-08, "loss": 0.76822031, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.17907715, "step": 15594, "time_per_iteration": 2.8662147521972656 }, { "auxiliary_loss_clip": 0.01399768, "auxiliary_loss_mlp": 0.01033602, "balance_loss_clip": 1.24237823, "balance_loss_mlp": 1.01434934, "epoch": 0.9376221253569819, "flos": 27570910752000.0, "grad_norm": 1.5773255543802072, "language_loss": 0.74056441, "learning_rate": 4.063971747165351e-08, "loss": 0.76489806, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19250488, "step": 15595, "time_per_iteration": 2.992496967315674 }, { "auxiliary_loss_clip": 0.01396744, "auxiliary_loss_mlp": 0.01033071, "balance_loss_clip": 1.23622572, "balance_loss_mlp": 1.01343703, "epoch": 0.9376822486096498, "flos": 24139470948480.0, "grad_norm": 1.7234356207336765, "language_loss": 0.7669487, "learning_rate": 4.056164175257626e-08, "loss": 0.79124683, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19641113, "step": 15596, "time_per_iteration": 2.927608013153076 }, { "auxiliary_loss_clip": 0.01398221, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.23861122, "balance_loss_mlp": 1.01483417, "epoch": 0.9377423718623178, "flos": 22794329733120.0, "grad_norm": 1.7765609404557516, "language_loss": 0.79270959, "learning_rate": 4.0483640335597926e-08, "loss": 0.81704015, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.20007324, "step": 15597, "time_per_iteration": 2.8876144886016846 }, { "auxiliary_loss_clip": 0.01410863, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.24665976, "balance_loss_mlp": 1.01500654, "epoch": 0.9378024951149857, "flos": 19177476163200.0, "grad_norm": 1.4583549335933, "language_loss": 0.81330681, "learning_rate": 4.0405713223676363e-08, "loss": 0.83775836, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19287109, "step": 15598, "time_per_iteration": 2.9666144847869873 }, { "auxiliary_loss_clip": 0.01422599, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.25583243, "balance_loss_mlp": 1.01120341, "epoch": 0.9378626183676537, "flos": 23513860091520.0, "grad_norm": 2.173806659457754, "language_loss": 0.63841927, "learning_rate": 4.0327860419766994e-08, "loss": 0.66295207, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19482422, "step": 15599, "time_per_iteration": 2.8800790309906006 }, { "auxiliary_loss_clip": 0.01404347, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.24272728, "balance_loss_mlp": 1.0138011, "epoch": 0.9379227416203216, "flos": 18414800248320.0, "grad_norm": 1.9382906118775722, "language_loss": 0.74069482, "learning_rate": 4.0250081926821e-08, "loss": 0.76506412, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18786621, "step": 15600, "time_per_iteration": 2.812058448791504 }, { "auxiliary_loss_clip": 0.0139765, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.23951221, "balance_loss_mlp": 1.01493287, "epoch": 0.9379828648729897, "flos": 17830841869440.0, "grad_norm": 1.8943926470971277, "language_loss": 0.70528078, "learning_rate": 4.0172377747788474e-08, "loss": 0.72958726, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18078613, "step": 15601, "time_per_iteration": 2.930206298828125 }, { "auxiliary_loss_clip": 0.01183182, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 1.09479594, "balance_loss_mlp": 1.00164175, "epoch": 0.9380429881256576, "flos": 68055397712640.0, "grad_norm": 0.7435471256833117, "language_loss": 0.58099306, "learning_rate": 4.009474788561573e-08, "loss": 0.60311222, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.27148438, "step": 15602, "time_per_iteration": 3.5120227336883545 }, { "auxiliary_loss_clip": 0.01409143, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.24698389, "balance_loss_mlp": 1.01652527, "epoch": 0.9381031113783256, "flos": 20786675886720.0, "grad_norm": 2.156156286309213, "language_loss": 0.72703141, "learning_rate": 4.001719234324663e-08, "loss": 0.75146961, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18127441, "step": 15603, "time_per_iteration": 2.900451183319092 }, { "auxiliary_loss_clip": 0.01373327, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.22108054, "balance_loss_mlp": 1.01281524, "epoch": 0.9381632346309935, "flos": 19034167322880.0, "grad_norm": 1.6892087973838246, "language_loss": 0.76429832, "learning_rate": 3.993971112362171e-08, "loss": 0.7883507, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 1.5234375, "router_z_loss_mlp": 0.19091797, "step": 15604, "time_per_iteration": 2.845463275909424 }, { "auxiliary_loss_clip": 0.01401526, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.24092126, "balance_loss_mlp": 1.0139395, "epoch": 0.9382233578836615, "flos": 23524356885120.0, "grad_norm": 2.2975359464392078, "language_loss": 0.65822339, "learning_rate": 3.9862304229679734e-08, "loss": 0.68257463, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1965332, "step": 15605, "time_per_iteration": 2.836155414581299 }, { "auxiliary_loss_clip": 0.01416978, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.25161815, "balance_loss_mlp": 1.01467609, "epoch": 0.9382834811363294, "flos": 43082867923200.0, "grad_norm": 1.83811450221809, "language_loss": 0.68095326, "learning_rate": 3.9784971664355683e-08, "loss": 0.70546901, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19934082, "step": 15606, "time_per_iteration": 3.0473427772521973 }, { "auxiliary_loss_clip": 0.01382238, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.22800553, "balance_loss_mlp": 1.0145762, "epoch": 0.9383436043889974, "flos": 16445088806400.0, "grad_norm": 1.812233648348569, "language_loss": 0.78327239, "learning_rate": 3.970771343058166e-08, "loss": 0.80741584, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.17553711, "step": 15607, "time_per_iteration": 2.819307327270508 }, { "auxiliary_loss_clip": 0.01397044, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.23770046, "balance_loss_mlp": 1.01737928, "epoch": 0.9384037276416655, "flos": 20750317050240.0, "grad_norm": 2.1298856359174265, "language_loss": 0.83714253, "learning_rate": 3.963052953128776e-08, "loss": 0.86147082, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18408203, "step": 15608, "time_per_iteration": 2.8339216709136963 }, { "auxiliary_loss_clip": 0.0139937, "auxiliary_loss_mlp": 0.01034702, "balance_loss_clip": 1.24119759, "balance_loss_mlp": 1.01587868, "epoch": 0.9384638508943334, "flos": 19072064482560.0, "grad_norm": 3.800067413404078, "language_loss": 0.69783109, "learning_rate": 3.9553419969400536e-08, "loss": 0.72217184, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18823242, "step": 15609, "time_per_iteration": 2.8217291831970215 }, { "auxiliary_loss_clip": 0.01404482, "auxiliary_loss_mlp": 0.01030585, "balance_loss_clip": 1.24203777, "balance_loss_mlp": 1.01132059, "epoch": 0.9385239741470014, "flos": 23415506599680.0, "grad_norm": 1.8340449223792288, "language_loss": 0.76562989, "learning_rate": 3.9476384747844316e-08, "loss": 0.78998059, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19274902, "step": 15610, "time_per_iteration": 2.8732151985168457 }, { "auxiliary_loss_clip": 0.01399723, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.24039912, "balance_loss_mlp": 1.01395762, "epoch": 0.9385840973996693, "flos": 12832804961280.0, "grad_norm": 1.9251622983496737, "language_loss": 0.76033556, "learning_rate": 3.939942386953987e-08, "loss": 0.78465271, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18029785, "step": 15611, "time_per_iteration": 2.827209949493408 }, { "auxiliary_loss_clip": 0.01398508, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.23954165, "balance_loss_mlp": 1.016608, "epoch": 0.9386442206523373, "flos": 15495415557120.0, "grad_norm": 1.731999646637399, "language_loss": 0.66431999, "learning_rate": 3.9322537337405756e-08, "loss": 0.68866295, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19165039, "step": 15612, "time_per_iteration": 2.8208634853363037 }, { "auxiliary_loss_clip": 0.01386004, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.22979021, "balance_loss_mlp": 1.01384878, "epoch": 0.9387043439050052, "flos": 21188749593600.0, "grad_norm": 1.8240787888625594, "language_loss": 0.58332765, "learning_rate": 3.924572515435742e-08, "loss": 0.60750771, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18164062, "step": 15613, "time_per_iteration": 4.299671173095703 }, { "auxiliary_loss_clip": 0.01404279, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.24308097, "balance_loss_mlp": 1.01700985, "epoch": 0.9387644671576733, "flos": 27678584672640.0, "grad_norm": 2.129621601567192, "language_loss": 0.71297008, "learning_rate": 3.916898732330764e-08, "loss": 0.73736084, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.17773438, "step": 15614, "time_per_iteration": 2.907902956008911 }, { "auxiliary_loss_clip": 0.01402033, "auxiliary_loss_mlp": 0.01035069, "balance_loss_clip": 1.23948169, "balance_loss_mlp": 1.01537561, "epoch": 0.9388245904103412, "flos": 18843731383680.0, "grad_norm": 1.853990515311715, "language_loss": 0.81988335, "learning_rate": 3.9092323847166544e-08, "loss": 0.84425437, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19702148, "step": 15615, "time_per_iteration": 2.828937292098999 }, { "auxiliary_loss_clip": 0.01382299, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.22568011, "balance_loss_mlp": 1.01333416, "epoch": 0.9388847136630092, "flos": 25494792243840.0, "grad_norm": 1.6098860080050859, "language_loss": 0.72336292, "learning_rate": 3.901573472884134e-08, "loss": 0.74751043, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.19128418, "step": 15616, "time_per_iteration": 4.311228275299072 }, { "auxiliary_loss_clip": 0.0140057, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.24193752, "balance_loss_mlp": 1.01453829, "epoch": 0.9389448369156771, "flos": 18744472995840.0, "grad_norm": 1.7229232777041865, "language_loss": 0.66641665, "learning_rate": 3.89392199712355e-08, "loss": 0.69076568, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.19787598, "step": 15617, "time_per_iteration": 2.8747506141662598 }, { "auxiliary_loss_clip": 0.01404723, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 1.24226916, "balance_loss_mlp": 1.01285148, "epoch": 0.9390049601683451, "flos": 21725761852800.0, "grad_norm": 1.964378448222992, "language_loss": 0.74064517, "learning_rate": 3.886277957725092e-08, "loss": 0.76500845, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1875, "step": 15618, "time_per_iteration": 2.839963436126709 }, { "auxiliary_loss_clip": 0.01414563, "auxiliary_loss_mlp": 0.01035324, "balance_loss_clip": 1.24930835, "balance_loss_mlp": 1.01642942, "epoch": 0.939065083421013, "flos": 19400741844480.0, "grad_norm": 2.1919855485047135, "language_loss": 0.70717055, "learning_rate": 3.878641354978662e-08, "loss": 0.73166943, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.18896484, "step": 15619, "time_per_iteration": 2.8600919246673584 }, { "auxiliary_loss_clip": 0.01398979, "auxiliary_loss_mlp": 0.01031543, "balance_loss_clip": 1.23910499, "balance_loss_mlp": 1.01246965, "epoch": 0.939125206673681, "flos": 24692364132480.0, "grad_norm": 3.0792791732144176, "language_loss": 0.7837162, "learning_rate": 3.8710121891737834e-08, "loss": 0.80802143, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1907959, "step": 15620, "time_per_iteration": 2.8636281490325928 }, { "auxiliary_loss_clip": 0.01390739, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.23367715, "balance_loss_mlp": 1.01271617, "epoch": 0.9391853299263491, "flos": 16334564463360.0, "grad_norm": 1.9330543528435413, "language_loss": 0.7491163, "learning_rate": 3.8633904605998025e-08, "loss": 0.77333242, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18151855, "step": 15621, "time_per_iteration": 2.926732301712036 }, { "auxiliary_loss_clip": 0.01411128, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.24695778, "balance_loss_mlp": 1.01398432, "epoch": 0.939245453179017, "flos": 11663168901120.0, "grad_norm": 2.432880745956629, "language_loss": 0.67719626, "learning_rate": 3.855776169545688e-08, "loss": 0.70163846, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19116211, "step": 15622, "time_per_iteration": 2.863950729370117 }, { "auxiliary_loss_clip": 0.01389278, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.23245227, "balance_loss_mlp": 1.01635277, "epoch": 0.939305576431685, "flos": 23159049217920.0, "grad_norm": 1.6302037902840594, "language_loss": 0.72275114, "learning_rate": 3.848169316300209e-08, "loss": 0.74699348, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.1862793, "step": 15623, "time_per_iteration": 5.823200225830078 }, { "auxiliary_loss_clip": 0.01398018, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.23972344, "balance_loss_mlp": 1.01315665, "epoch": 0.9393656996843529, "flos": 33299743973760.0, "grad_norm": 1.75319437753113, "language_loss": 0.73419297, "learning_rate": 3.84056990115178e-08, "loss": 0.75849783, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19287109, "step": 15624, "time_per_iteration": 2.960040807723999 }, { "auxiliary_loss_clip": 0.01389909, "auxiliary_loss_mlp": 0.01031788, "balance_loss_clip": 1.23233283, "balance_loss_mlp": 1.01366806, "epoch": 0.9394258229370209, "flos": 21699447361920.0, "grad_norm": 1.840002443587124, "language_loss": 0.90440512, "learning_rate": 3.832977924388614e-08, "loss": 0.92862201, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18127441, "step": 15625, "time_per_iteration": 2.8558897972106934 }, { "auxiliary_loss_clip": 0.01391311, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.23302913, "balance_loss_mlp": 1.01426375, "epoch": 0.9394859461896888, "flos": 23883646993920.0, "grad_norm": 1.9318991858878543, "language_loss": 0.84593487, "learning_rate": 3.825393386298592e-08, "loss": 0.87018394, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1932373, "step": 15626, "time_per_iteration": 2.8637161254882812 }, { "auxiliary_loss_clip": 0.0117793, "auxiliary_loss_mlp": 0.0101976, "balance_loss_clip": 1.09065938, "balance_loss_mlp": 1.0018307, "epoch": 0.9395460694423569, "flos": 61595360974080.0, "grad_norm": 0.7781650295314795, "language_loss": 0.56128883, "learning_rate": 3.8178162871693284e-08, "loss": 0.58326578, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.1796875, "step": 15627, "time_per_iteration": 3.3093039989471436 }, { "auxiliary_loss_clip": 0.01401089, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.24187481, "balance_loss_mlp": 1.01315308, "epoch": 0.9396061926950248, "flos": 21005417088000.0, "grad_norm": 3.0438516745996425, "language_loss": 0.70700079, "learning_rate": 3.810246627288105e-08, "loss": 0.7313326, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18933105, "step": 15628, "time_per_iteration": 2.8497703075408936 }, { "auxiliary_loss_clip": 0.01398838, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.24116158, "balance_loss_mlp": 1.01186967, "epoch": 0.9396663159476928, "flos": 27498555037440.0, "grad_norm": 6.44455212782039, "language_loss": 0.75787491, "learning_rate": 3.8026844069420025e-08, "loss": 0.78216308, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18115234, "step": 15629, "time_per_iteration": 2.894747495651245 }, { "auxiliary_loss_clip": 0.01386645, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.23233294, "balance_loss_mlp": 1.01243186, "epoch": 0.9397264392003607, "flos": 19436919701760.0, "grad_norm": 2.5054209357138992, "language_loss": 0.74588817, "learning_rate": 3.795129626417748e-08, "loss": 0.77006209, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18322754, "step": 15630, "time_per_iteration": 2.8149120807647705 }, { "auxiliary_loss_clip": 0.01384811, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.23023987, "balance_loss_mlp": 1.01178789, "epoch": 0.9397865624530287, "flos": 18013993395840.0, "grad_norm": 1.8472200364025315, "language_loss": 0.70088738, "learning_rate": 3.787582286001845e-08, "loss": 0.72503191, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.1784668, "step": 15631, "time_per_iteration": 2.806218147277832 }, { "auxiliary_loss_clip": 0.01394212, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.23845875, "balance_loss_mlp": 1.01441956, "epoch": 0.9398466857056966, "flos": 22574728880640.0, "grad_norm": 1.4924255988296533, "language_loss": 0.75239599, "learning_rate": 3.7800423859805086e-08, "loss": 0.77665228, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.17004395, "step": 15632, "time_per_iteration": 2.9303033351898193 }, { "auxiliary_loss_clip": 0.01420706, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.2562387, "balance_loss_mlp": 1.01585197, "epoch": 0.9399068089583646, "flos": 24546521583360.0, "grad_norm": 1.887505414941001, "language_loss": 0.75313091, "learning_rate": 3.772509926639622e-08, "loss": 0.77770692, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.21020508, "step": 15633, "time_per_iteration": 2.8765904903411865 }, { "auxiliary_loss_clip": 0.01404576, "auxiliary_loss_mlp": 0.01034304, "balance_loss_clip": 1.24222708, "balance_loss_mlp": 1.01364458, "epoch": 0.9399669322110327, "flos": 25641494444160.0, "grad_norm": 1.7776589159109235, "language_loss": 0.73327899, "learning_rate": 3.764984908264823e-08, "loss": 0.75766778, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.20666504, "step": 15634, "time_per_iteration": 2.9478559494018555 }, { "auxiliary_loss_clip": 0.01408704, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.24496925, "balance_loss_mlp": 1.01451433, "epoch": 0.9400270554637006, "flos": 17097466602240.0, "grad_norm": 2.0833059418836615, "language_loss": 0.69888747, "learning_rate": 3.75746733114144e-08, "loss": 0.72330678, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18713379, "step": 15635, "time_per_iteration": 2.831812858581543 }, { "auxiliary_loss_clip": 0.01377371, "auxiliary_loss_mlp": 0.01033161, "balance_loss_clip": 1.22224855, "balance_loss_mlp": 1.01386058, "epoch": 0.9400871787163686, "flos": 22065433701120.0, "grad_norm": 1.5477413525728871, "language_loss": 0.74697316, "learning_rate": 3.7499571955545985e-08, "loss": 0.77107847, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.19287109, "step": 15636, "time_per_iteration": 2.8760297298431396 }, { "auxiliary_loss_clip": 0.01402024, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.24283266, "balance_loss_mlp": 1.01483762, "epoch": 0.9401473019690365, "flos": 16991557228800.0, "grad_norm": 2.1873011835981577, "language_loss": 0.84086919, "learning_rate": 3.7424545017890054e-08, "loss": 0.86523616, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19836426, "step": 15637, "time_per_iteration": 2.8801279067993164 }, { "auxiliary_loss_clip": 0.01403015, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.24274337, "balance_loss_mlp": 1.01574516, "epoch": 0.9402074252217045, "flos": 19691250577920.0, "grad_norm": 2.1664106426837018, "language_loss": 0.69813925, "learning_rate": 3.7349592501292325e-08, "loss": 0.72251946, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19262695, "step": 15638, "time_per_iteration": 2.8621532917022705 }, { "auxiliary_loss_clip": 0.01382582, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.22807598, "balance_loss_mlp": 1.01290011, "epoch": 0.9402675484743724, "flos": 24765127050240.0, "grad_norm": 1.7432558631564368, "language_loss": 0.8538326, "learning_rate": 3.727471440859498e-08, "loss": 0.87796545, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.17822266, "step": 15639, "time_per_iteration": 2.910263776779175 }, { "auxiliary_loss_clip": 0.01397382, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.2368983, "balance_loss_mlp": 1.01322031, "epoch": 0.9403276717270405, "flos": 25570088870400.0, "grad_norm": 1.4719015132262792, "language_loss": 0.78809094, "learning_rate": 3.719991074263662e-08, "loss": 0.81238967, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19274902, "step": 15640, "time_per_iteration": 3.04253888130188 }, { "auxiliary_loss_clip": 0.01410496, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.24843431, "balance_loss_mlp": 1.01556158, "epoch": 0.9403877949797084, "flos": 26701646791680.0, "grad_norm": 1.5257969979657229, "language_loss": 0.74373502, "learning_rate": 3.7125181506254544e-08, "loss": 0.76817763, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18188477, "step": 15641, "time_per_iteration": 2.9261746406555176 }, { "auxiliary_loss_clip": 0.01410181, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.2450043, "balance_loss_mlp": 1.01638961, "epoch": 0.9404479182323764, "flos": 15019719281280.0, "grad_norm": 2.1759405770681957, "language_loss": 0.83426774, "learning_rate": 3.7050526702282256e-08, "loss": 0.85873115, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 1.65136719, "router_z_loss_mlp": 0.19787598, "step": 15642, "time_per_iteration": 2.82086181640625 }, { "auxiliary_loss_clip": 0.01391201, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.23433769, "balance_loss_mlp": 1.01083922, "epoch": 0.9405080414850443, "flos": 24984546923520.0, "grad_norm": 2.1521182416071163, "language_loss": 0.69332576, "learning_rate": 3.697594633355084e-08, "loss": 0.7175343, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18798828, "step": 15643, "time_per_iteration": 2.9383535385131836 }, { "auxiliary_loss_clip": 0.0140477, "auxiliary_loss_mlp": 0.01037862, "balance_loss_clip": 1.24272835, "balance_loss_mlp": 1.0185864, "epoch": 0.9405681647377123, "flos": 20853104532480.0, "grad_norm": 1.8672346765696008, "language_loss": 0.77261263, "learning_rate": 3.6901440402888226e-08, "loss": 0.79703897, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19262695, "step": 15644, "time_per_iteration": 3.0102365016937256 }, { "auxiliary_loss_clip": 0.01386114, "auxiliary_loss_mlp": 0.01031273, "balance_loss_clip": 1.23099399, "balance_loss_mlp": 1.01296258, "epoch": 0.9406282879903802, "flos": 23816132472960.0, "grad_norm": 2.267101137421202, "language_loss": 0.68722391, "learning_rate": 3.682700891311974e-08, "loss": 0.71139777, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18322754, "step": 15645, "time_per_iteration": 2.8829092979431152 }, { "auxiliary_loss_clip": 0.01381856, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.22720623, "balance_loss_mlp": 1.01514626, "epoch": 0.9406884112430483, "flos": 27687000205440.0, "grad_norm": 1.6319333045375188, "language_loss": 0.70790982, "learning_rate": 3.6752651867067774e-08, "loss": 0.73206198, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.18212891, "step": 15646, "time_per_iteration": 2.9393808841705322 }, { "auxiliary_loss_clip": 0.01393666, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.235098, "balance_loss_mlp": 1.01252973, "epoch": 0.9407485344957163, "flos": 23085200424960.0, "grad_norm": 1.5958653959384281, "language_loss": 0.74568939, "learning_rate": 3.667836926755208e-08, "loss": 0.76993513, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18395996, "step": 15647, "time_per_iteration": 4.29063868522644 }, { "auxiliary_loss_clip": 0.01181409, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.0924902, "balance_loss_mlp": 1.00767183, "epoch": 0.9408086577483842, "flos": 71045237836800.0, "grad_norm": 0.8850170833601647, "language_loss": 0.63559508, "learning_rate": 3.660416111738907e-08, "loss": 0.65766704, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.18164062, "step": 15648, "time_per_iteration": 3.433637857437134 }, { "auxiliary_loss_clip": 0.01387461, "auxiliary_loss_mlp": 0.01034374, "balance_loss_clip": 1.23282933, "balance_loss_mlp": 1.01633763, "epoch": 0.9408687810010522, "flos": 23741062070400.0, "grad_norm": 1.8803048284562656, "language_loss": 0.66863072, "learning_rate": 3.653002741939337e-08, "loss": 0.6928491, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18041992, "step": 15649, "time_per_iteration": 2.854145050048828 }, { "auxiliary_loss_clip": 0.01402228, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.24091148, "balance_loss_mlp": 1.0162642, "epoch": 0.9409289042537201, "flos": 18378486656640.0, "grad_norm": 3.2587737465198594, "language_loss": 0.7856971, "learning_rate": 3.645596817637586e-08, "loss": 0.81006521, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18322754, "step": 15650, "time_per_iteration": 2.8250887393951416 }, { "auxiliary_loss_clip": 0.01401474, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.24329507, "balance_loss_mlp": 1.01334858, "epoch": 0.9409890275063881, "flos": 23888850145920.0, "grad_norm": 1.6468161856774448, "language_loss": 0.74777329, "learning_rate": 3.638198339114451e-08, "loss": 0.77212209, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.20056152, "step": 15651, "time_per_iteration": 2.9264655113220215 }, { "auxiliary_loss_clip": 0.0139639, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.23928547, "balance_loss_mlp": 1.0136497, "epoch": 0.941049150759056, "flos": 16553803357440.0, "grad_norm": 1.8292360599221005, "language_loss": 0.73133796, "learning_rate": 3.630807306650507e-08, "loss": 0.75563204, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19372559, "step": 15652, "time_per_iteration": 4.279458522796631 }, { "auxiliary_loss_clip": 0.01418277, "auxiliary_loss_mlp": 0.01037846, "balance_loss_clip": 1.25129843, "balance_loss_mlp": 1.01757991, "epoch": 0.9411092740117241, "flos": 25129122618240.0, "grad_norm": 2.028144023178358, "language_loss": 0.67136669, "learning_rate": 3.6234237205260645e-08, "loss": 0.69592798, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 1.67089844, "router_z_loss_mlp": 0.20251465, "step": 15653, "time_per_iteration": 2.882615566253662 }, { "auxiliary_loss_clip": 0.01414326, "auxiliary_loss_mlp": 0.01036688, "balance_loss_clip": 1.25192261, "balance_loss_mlp": 1.01645803, "epoch": 0.941169397264392, "flos": 21152164533120.0, "grad_norm": 1.8915747359642483, "language_loss": 0.78373563, "learning_rate": 3.6160475810210536e-08, "loss": 0.80824578, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20214844, "step": 15654, "time_per_iteration": 2.8326432704925537 }, { "auxiliary_loss_clip": 0.01423362, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.2564683, "balance_loss_mlp": 1.01671433, "epoch": 0.94122952051706, "flos": 38523173068800.0, "grad_norm": 1.930367415843534, "language_loss": 0.71114516, "learning_rate": 3.6086788884152065e-08, "loss": 0.73572356, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.1776123, "step": 15655, "time_per_iteration": 3.0381829738616943 }, { "auxiliary_loss_clip": 0.01389759, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 1.23138237, "balance_loss_mlp": 1.01308823, "epoch": 0.9412896437697279, "flos": 18378350922240.0, "grad_norm": 1.888385973393771, "language_loss": 0.73097765, "learning_rate": 3.601317642987944e-08, "loss": 0.75520444, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19836426, "step": 15656, "time_per_iteration": 2.8174080848693848 }, { "auxiliary_loss_clip": 0.01394756, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.23617065, "balance_loss_mlp": 1.0135473, "epoch": 0.9413497670223959, "flos": 25895961054720.0, "grad_norm": 2.0256985428616403, "language_loss": 0.79247963, "learning_rate": 3.593963845018377e-08, "loss": 0.81674838, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18579102, "step": 15657, "time_per_iteration": 2.891327381134033 }, { "auxiliary_loss_clip": 0.01395761, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.23573923, "balance_loss_mlp": 1.01465154, "epoch": 0.9414098902750638, "flos": 16627109212800.0, "grad_norm": 2.2185269401537573, "language_loss": 0.85291553, "learning_rate": 3.586617494785371e-08, "loss": 0.87721908, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19934082, "step": 15658, "time_per_iteration": 5.752011299133301 }, { "auxiliary_loss_clip": 0.01406412, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.24256873, "balance_loss_mlp": 1.01352453, "epoch": 0.9414700135277319, "flos": 18634355856000.0, "grad_norm": 2.1529502468778245, "language_loss": 0.71575183, "learning_rate": 3.5792785925675254e-08, "loss": 0.740152, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.20080566, "step": 15659, "time_per_iteration": 2.8107247352600098 }, { "auxiliary_loss_clip": 0.01397477, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.24000561, "balance_loss_mlp": 1.01733255, "epoch": 0.9415301367803999, "flos": 26289664473600.0, "grad_norm": 1.654388232949391, "language_loss": 0.80294919, "learning_rate": 3.571947138643172e-08, "loss": 0.82728434, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18701172, "step": 15660, "time_per_iteration": 2.850080728530884 }, { "auxiliary_loss_clip": 0.01373679, "auxiliary_loss_mlp": 0.01031751, "balance_loss_clip": 1.21888018, "balance_loss_mlp": 1.01319027, "epoch": 0.9415902600330678, "flos": 23272604962560.0, "grad_norm": 1.4585268165919931, "language_loss": 0.68497109, "learning_rate": 3.564623133290201e-08, "loss": 0.70902538, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18566895, "step": 15661, "time_per_iteration": 2.875058650970459 }, { "auxiliary_loss_clip": 0.01380636, "auxiliary_loss_mlp": 0.01032213, "balance_loss_clip": 1.22215676, "balance_loss_mlp": 1.01445079, "epoch": 0.9416503832857358, "flos": 14726948307840.0, "grad_norm": 1.9585791643017243, "language_loss": 0.67133683, "learning_rate": 3.557306576786434e-08, "loss": 0.69546533, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17773438, "step": 15662, "time_per_iteration": 2.806626081466675 }, { "auxiliary_loss_clip": 0.01178134, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.09101915, "balance_loss_mlp": 1.01254833, "epoch": 0.9417105065384037, "flos": 70341615665280.0, "grad_norm": 0.7610550651232125, "language_loss": 0.59319687, "learning_rate": 3.5499974694092935e-08, "loss": 0.61532497, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.22167969, "step": 15663, "time_per_iteration": 3.401543378829956 }, { "auxiliary_loss_clip": 0.01409041, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.24496055, "balance_loss_mlp": 1.01392615, "epoch": 0.9417706297910717, "flos": 34071061645440.0, "grad_norm": 2.1015670800763386, "language_loss": 0.67819905, "learning_rate": 3.542695811435914e-08, "loss": 0.70262539, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19677734, "step": 15664, "time_per_iteration": 2.9625699520111084 }, { "auxiliary_loss_clip": 0.01403861, "auxiliary_loss_mlp": 0.01034231, "balance_loss_clip": 1.24705625, "balance_loss_mlp": 1.01549184, "epoch": 0.9418307530437396, "flos": 16480180788480.0, "grad_norm": 1.989977390935584, "language_loss": 0.74042165, "learning_rate": 3.535401603143207e-08, "loss": 0.76480258, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18725586, "step": 15665, "time_per_iteration": 2.8203487396240234 }, { "auxiliary_loss_clip": 0.01391301, "auxiliary_loss_mlp": 0.01031182, "balance_loss_clip": 1.23446298, "balance_loss_mlp": 1.01294327, "epoch": 0.9418908762964077, "flos": 11260597501440.0, "grad_norm": 1.8591854408247217, "language_loss": 0.64290643, "learning_rate": 3.528114844807773e-08, "loss": 0.6671313, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18225098, "step": 15666, "time_per_iteration": 2.851552963256836 }, { "auxiliary_loss_clip": 0.01394325, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.2356112, "balance_loss_mlp": 1.01237869, "epoch": 0.9419509995490756, "flos": 18447132297600.0, "grad_norm": 1.8132323595045707, "language_loss": 0.79213738, "learning_rate": 3.520835536705902e-08, "loss": 0.81639647, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1920166, "step": 15667, "time_per_iteration": 2.834063768386841 }, { "auxiliary_loss_clip": 0.01394402, "auxiliary_loss_mlp": 0.01034803, "balance_loss_clip": 1.23568225, "balance_loss_mlp": 1.01655221, "epoch": 0.9420111228017436, "flos": 20746968935040.0, "grad_norm": 2.896256304967013, "language_loss": 0.75989211, "learning_rate": 3.5135636791136404e-08, "loss": 0.78418422, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18261719, "step": 15668, "time_per_iteration": 2.848926305770874 }, { "auxiliary_loss_clip": 0.01397688, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.23690629, "balance_loss_mlp": 1.01504076, "epoch": 0.9420712460544115, "flos": 21151757329920.0, "grad_norm": 3.8388476949595995, "language_loss": 0.6019758, "learning_rate": 3.506299272306723e-08, "loss": 0.62628961, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18664551, "step": 15669, "time_per_iteration": 2.824575662612915 }, { "auxiliary_loss_clip": 0.01385775, "auxiliary_loss_mlp": 0.01030243, "balance_loss_clip": 1.23051012, "balance_loss_mlp": 1.01163411, "epoch": 0.9421313693070795, "flos": 15860270776320.0, "grad_norm": 1.866353376994672, "language_loss": 0.77454704, "learning_rate": 3.4990423165606406e-08, "loss": 0.79870725, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.18615723, "step": 15670, "time_per_iteration": 2.8121819496154785 }, { "auxiliary_loss_clip": 0.01399901, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.24163067, "balance_loss_mlp": 1.01006794, "epoch": 0.9421914925597474, "flos": 32428624976640.0, "grad_norm": 2.5716449882354344, "language_loss": 0.65721297, "learning_rate": 3.491792812150574e-08, "loss": 0.68150592, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.1932373, "step": 15671, "time_per_iteration": 2.935765504837036 }, { "auxiliary_loss_clip": 0.01404062, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.24340868, "balance_loss_mlp": 1.01228237, "epoch": 0.9422516158124155, "flos": 19727835638400.0, "grad_norm": 1.5209562211660483, "language_loss": 0.80288988, "learning_rate": 3.48455075935139e-08, "loss": 0.82723659, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18334961, "step": 15672, "time_per_iteration": 2.852769136428833 }, { "auxiliary_loss_clip": 0.01410415, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.24599266, "balance_loss_mlp": 1.01937294, "epoch": 0.9423117390650835, "flos": 16261937280000.0, "grad_norm": 2.015830313043843, "language_loss": 0.74565917, "learning_rate": 3.47731615843776e-08, "loss": 0.77014893, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.1920166, "step": 15673, "time_per_iteration": 2.8180108070373535 }, { "auxiliary_loss_clip": 0.01397001, "auxiliary_loss_mlp": 0.01032061, "balance_loss_clip": 1.23789537, "balance_loss_mlp": 1.01214123, "epoch": 0.9423718623177514, "flos": 31809348391680.0, "grad_norm": 1.9662117483941726, "language_loss": 0.70814025, "learning_rate": 3.470089009683974e-08, "loss": 0.73243093, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19921875, "step": 15674, "time_per_iteration": 2.9343671798706055 }, { "auxiliary_loss_clip": 0.01394829, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.23599052, "balance_loss_mlp": 1.01267803, "epoch": 0.9424319855704194, "flos": 23342381723520.0, "grad_norm": 2.276967829846908, "language_loss": 0.81787306, "learning_rate": 3.462869313364125e-08, "loss": 0.84212887, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18078613, "step": 15675, "time_per_iteration": 2.8766908645629883 }, { "auxiliary_loss_clip": 0.01393324, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.2351687, "balance_loss_mlp": 1.01184225, "epoch": 0.9424921088230873, "flos": 20787490293120.0, "grad_norm": 1.6666372517385821, "language_loss": 0.63230336, "learning_rate": 3.4556570697519494e-08, "loss": 0.65653837, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18334961, "step": 15676, "time_per_iteration": 2.8637049198150635 }, { "auxiliary_loss_clip": 0.0139981, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.23923016, "balance_loss_mlp": 1.0140413, "epoch": 0.9425522320757553, "flos": 19036158094080.0, "grad_norm": 2.051223507624855, "language_loss": 0.67392898, "learning_rate": 3.448452279120984e-08, "loss": 0.6982649, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19726562, "step": 15677, "time_per_iteration": 2.8687174320220947 }, { "auxiliary_loss_clip": 0.0140356, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.24094272, "balance_loss_mlp": 1.01373029, "epoch": 0.9426123553284232, "flos": 25166205371520.0, "grad_norm": 2.226929498074642, "language_loss": 0.65127647, "learning_rate": 3.441254941744387e-08, "loss": 0.67564327, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1940918, "step": 15678, "time_per_iteration": 2.9398910999298096 }, { "auxiliary_loss_clip": 0.01393196, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.2354995, "balance_loss_mlp": 1.01327085, "epoch": 0.9426724785810913, "flos": 21189428265600.0, "grad_norm": 1.9097580584196483, "language_loss": 0.75229084, "learning_rate": 3.434065057895097e-08, "loss": 0.77654046, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18481445, "step": 15679, "time_per_iteration": 2.8485448360443115 }, { "auxiliary_loss_clip": 0.01409023, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.2478261, "balance_loss_mlp": 1.0169487, "epoch": 0.9427326018337592, "flos": 14765071691520.0, "grad_norm": 2.17612102971205, "language_loss": 0.77999723, "learning_rate": 3.426882627845762e-08, "loss": 0.8044529, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19592285, "step": 15680, "time_per_iteration": 2.813602924346924 }, { "auxiliary_loss_clip": 0.01398016, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.24068785, "balance_loss_mlp": 1.0136106, "epoch": 0.9427927250864272, "flos": 20933559066240.0, "grad_norm": 1.761321680522249, "language_loss": 0.76607269, "learning_rate": 3.419707651868742e-08, "loss": 0.7903744, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18530273, "step": 15681, "time_per_iteration": 2.8398356437683105 }, { "auxiliary_loss_clip": 0.01410226, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.24851203, "balance_loss_mlp": 1.01653075, "epoch": 0.9428528483390951, "flos": 19761208318080.0, "grad_norm": 2.282523016609095, "language_loss": 0.67048436, "learning_rate": 3.412540130236086e-08, "loss": 0.69494182, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18994141, "step": 15682, "time_per_iteration": 4.263850688934326 }, { "auxiliary_loss_clip": 0.01387845, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.22918463, "balance_loss_mlp": 1.01376224, "epoch": 0.9429129715917631, "flos": 24545661932160.0, "grad_norm": 1.7205219610216753, "language_loss": 0.77031171, "learning_rate": 3.405380063219665e-08, "loss": 0.79451632, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18859863, "step": 15683, "time_per_iteration": 2.922215223312378 }, { "auxiliary_loss_clip": 0.01409008, "auxiliary_loss_mlp": 0.01038821, "balance_loss_clip": 1.24776077, "balance_loss_mlp": 1.01903224, "epoch": 0.942973094844431, "flos": 17966775807360.0, "grad_norm": 3.117109281623423, "language_loss": 0.76917231, "learning_rate": 3.398227451090885e-08, "loss": 0.79365057, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19775391, "step": 15684, "time_per_iteration": 2.780897855758667 }, { "auxiliary_loss_clip": 0.01390507, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 1.23384154, "balance_loss_mlp": 1.0117445, "epoch": 0.9430332180970991, "flos": 26148074935680.0, "grad_norm": 1.6416565686685076, "language_loss": 0.77855551, "learning_rate": 3.391082294121017e-08, "loss": 0.80275536, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.17736816, "step": 15685, "time_per_iteration": 2.887619972229004 }, { "auxiliary_loss_clip": 0.01394507, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.23777008, "balance_loss_mlp": 1.0118928, "epoch": 0.943093341349767, "flos": 23961929777280.0, "grad_norm": 1.8303506579814757, "language_loss": 0.76471257, "learning_rate": 3.383944592581023e-08, "loss": 0.78895736, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18078613, "step": 15686, "time_per_iteration": 2.852933645248413 }, { "auxiliary_loss_clip": 0.01405125, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.24275017, "balance_loss_mlp": 1.01222825, "epoch": 0.943153464602435, "flos": 17977589314560.0, "grad_norm": 3.1028940971994756, "language_loss": 0.81629348, "learning_rate": 3.376814346741575e-08, "loss": 0.8406657, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.1986084, "step": 15687, "time_per_iteration": 4.356295108795166 }, { "auxiliary_loss_clip": 0.01412451, "auxiliary_loss_mlp": 0.01035864, "balance_loss_clip": 1.24996376, "balance_loss_mlp": 1.01600373, "epoch": 0.943213587855103, "flos": 14509609695360.0, "grad_norm": 2.0658099255524602, "language_loss": 0.77124178, "learning_rate": 3.369691556873011e-08, "loss": 0.79572493, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19873047, "step": 15688, "time_per_iteration": 2.7963361740112305 }, { "auxiliary_loss_clip": 0.01383864, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.2302587, "balance_loss_mlp": 1.01126456, "epoch": 0.9432737111077709, "flos": 28998271048320.0, "grad_norm": 1.7259325209897614, "language_loss": 0.69119036, "learning_rate": 3.3625762232454504e-08, "loss": 0.71533024, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.18847656, "step": 15689, "time_per_iteration": 2.890425682067871 }, { "auxiliary_loss_clip": 0.01382395, "auxiliary_loss_mlp": 0.0103576, "balance_loss_clip": 1.22734535, "balance_loss_mlp": 1.0184505, "epoch": 0.9433338343604389, "flos": 21617137791360.0, "grad_norm": 1.9122396029522821, "language_loss": 0.80722857, "learning_rate": 3.35546834612872e-08, "loss": 0.83141017, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.17321777, "step": 15690, "time_per_iteration": 2.8511645793914795 }, { "auxiliary_loss_clip": 0.01389523, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.23264551, "balance_loss_mlp": 1.01205337, "epoch": 0.9433939576131068, "flos": 33195237189120.0, "grad_norm": 1.8928355971207866, "language_loss": 0.61323082, "learning_rate": 3.348367925792317e-08, "loss": 0.63742304, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.17663574, "step": 15691, "time_per_iteration": 2.997518301010132 }, { "auxiliary_loss_clip": 0.01403486, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.2420373, "balance_loss_mlp": 1.01232076, "epoch": 0.9434540808657749, "flos": 20496348132480.0, "grad_norm": 1.93696047475537, "language_loss": 0.66764754, "learning_rate": 3.341274962505514e-08, "loss": 0.69199204, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18652344, "step": 15692, "time_per_iteration": 2.8448970317840576 }, { "auxiliary_loss_clip": 0.01396209, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.23590207, "balance_loss_mlp": 1.01252997, "epoch": 0.9435142041184428, "flos": 21552790406400.0, "grad_norm": 2.8440184193732154, "language_loss": 0.75747979, "learning_rate": 3.334189456537251e-08, "loss": 0.78174978, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18261719, "step": 15693, "time_per_iteration": 5.736707925796509 }, { "auxiliary_loss_clip": 0.01392666, "auxiliary_loss_mlp": 0.01033226, "balance_loss_clip": 1.2348485, "balance_loss_mlp": 1.01452184, "epoch": 0.9435743273711108, "flos": 25019819884800.0, "grad_norm": 1.609723174764035, "language_loss": 0.73240507, "learning_rate": 3.327111408156291e-08, "loss": 0.75666404, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18701172, "step": 15694, "time_per_iteration": 2.8831520080566406 }, { "auxiliary_loss_clip": 0.0118015, "auxiliary_loss_mlp": 0.01024723, "balance_loss_clip": 1.09209788, "balance_loss_mlp": 1.00336099, "epoch": 0.9436344506237787, "flos": 60191618463360.0, "grad_norm": 0.7390174750703838, "language_loss": 0.50644445, "learning_rate": 3.3200408176309316e-08, "loss": 0.52849317, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21386719, "step": 15695, "time_per_iteration": 3.4074082374572754 }, { "auxiliary_loss_clip": 0.01382747, "auxiliary_loss_mlp": 0.01036074, "balance_loss_clip": 1.22845197, "balance_loss_mlp": 1.01757276, "epoch": 0.9436945738764467, "flos": 22247544597120.0, "grad_norm": 1.640641200779351, "language_loss": 0.66058731, "learning_rate": 3.312977685229335e-08, "loss": 0.68477547, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.18493652, "step": 15696, "time_per_iteration": 2.8858556747436523 }, { "auxiliary_loss_clip": 0.01412953, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 1.25249708, "balance_loss_mlp": 1.0128479, "epoch": 0.9437546971291146, "flos": 25055816762880.0, "grad_norm": 1.663483412313059, "language_loss": 0.66924, "learning_rate": 3.305922011219353e-08, "loss": 0.6936841, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18603516, "step": 15697, "time_per_iteration": 2.908749580383301 }, { "auxiliary_loss_clip": 0.01181108, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.09339428, "balance_loss_mlp": 1.01328075, "epoch": 0.9438148203817827, "flos": 56819911075200.0, "grad_norm": 3.9270824662890447, "language_loss": 0.63358736, "learning_rate": 3.298873795868506e-08, "loss": 0.65577537, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.24414062, "step": 15698, "time_per_iteration": 3.2012455463409424 }, { "auxiliary_loss_clip": 0.01398918, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.23669171, "balance_loss_mlp": 1.01538157, "epoch": 0.9438749436344506, "flos": 22356621106560.0, "grad_norm": 1.6403660359318228, "language_loss": 0.70271236, "learning_rate": 3.291833039444092e-08, "loss": 0.72704822, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19299316, "step": 15699, "time_per_iteration": 2.839911699295044 }, { "auxiliary_loss_clip": 0.01386789, "auxiliary_loss_mlp": 0.01032404, "balance_loss_clip": 1.23131239, "balance_loss_mlp": 1.01439142, "epoch": 0.9439350668871186, "flos": 13378685201280.0, "grad_norm": 2.029029581551905, "language_loss": 0.7507298, "learning_rate": 3.2847997422130734e-08, "loss": 0.77492177, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18041992, "step": 15700, "time_per_iteration": 2.88627290725708 }, { "auxiliary_loss_clip": 0.01402855, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.24605834, "balance_loss_mlp": 1.01327074, "epoch": 0.9439951901397866, "flos": 17794302053760.0, "grad_norm": 4.222971801620476, "language_loss": 0.71432209, "learning_rate": 3.2777739044421495e-08, "loss": 0.73866814, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18481445, "step": 15701, "time_per_iteration": 2.880390167236328 }, { "auxiliary_loss_clip": 0.01423092, "auxiliary_loss_mlp": 0.01030207, "balance_loss_clip": 1.25665998, "balance_loss_mlp": 1.01177764, "epoch": 0.9440553133924545, "flos": 18888731976960.0, "grad_norm": 2.118810548087412, "language_loss": 0.78468156, "learning_rate": 3.2707555263977505e-08, "loss": 0.80921447, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 1.66308594, "router_z_loss_mlp": 0.1842041, "step": 15702, "time_per_iteration": 2.8772659301757812 }, { "auxiliary_loss_clip": 0.01410587, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.24755454, "balance_loss_mlp": 1.01245999, "epoch": 0.9441154366451225, "flos": 19582355047680.0, "grad_norm": 1.6336649476605354, "language_loss": 0.67217714, "learning_rate": 3.2637446083460194e-08, "loss": 0.6965856, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.17797852, "step": 15703, "time_per_iteration": 3.078662395477295 }, { "auxiliary_loss_clip": 0.0140589, "auxiliary_loss_mlp": 0.01031359, "balance_loss_clip": 1.24384809, "balance_loss_mlp": 1.01204729, "epoch": 0.9441755598977905, "flos": 30305922307200.0, "grad_norm": 2.2827924117850573, "language_loss": 0.73942876, "learning_rate": 3.256741150552833e-08, "loss": 0.76380128, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19311523, "step": 15704, "time_per_iteration": 2.9371068477630615 }, { "auxiliary_loss_clip": 0.013827, "auxiliary_loss_mlp": 0.0103109, "balance_loss_clip": 1.22580183, "balance_loss_mlp": 1.01192093, "epoch": 0.9442356831504585, "flos": 20677644622080.0, "grad_norm": 2.7830588472081166, "language_loss": 0.75640488, "learning_rate": 3.2497451532837336e-08, "loss": 0.78054285, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.19177246, "step": 15705, "time_per_iteration": 2.852957010269165 }, { "auxiliary_loss_clip": 0.01392859, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.23633361, "balance_loss_mlp": 1.01446366, "epoch": 0.9442958064031264, "flos": 16115868506880.0, "grad_norm": 1.906471639502226, "language_loss": 0.7752434, "learning_rate": 3.2427566168039986e-08, "loss": 0.7994976, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.1809082, "step": 15706, "time_per_iteration": 2.8056094646453857 }, { "auxiliary_loss_clip": 0.01373538, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.22021186, "balance_loss_mlp": 1.00926709, "epoch": 0.9443559296557944, "flos": 20456686425600.0, "grad_norm": 1.4372574297989535, "language_loss": 0.69647396, "learning_rate": 3.23577554137866e-08, "loss": 0.72049487, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.19274902, "step": 15707, "time_per_iteration": 2.8777594566345215 }, { "auxiliary_loss_clip": 0.01389661, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 1.2347374, "balance_loss_mlp": 1.01333332, "epoch": 0.9444160529084623, "flos": 21619445276160.0, "grad_norm": 2.055947421919738, "language_loss": 0.70416296, "learning_rate": 3.22880192727244e-08, "loss": 0.72837222, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.17907715, "step": 15708, "time_per_iteration": 2.845287561416626 }, { "auxiliary_loss_clip": 0.01398786, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.24171233, "balance_loss_mlp": 1.01365781, "epoch": 0.9444761761611303, "flos": 18450842371200.0, "grad_norm": 2.5613461554254875, "language_loss": 0.72442973, "learning_rate": 3.221835774749748e-08, "loss": 0.74873209, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17810059, "step": 15709, "time_per_iteration": 2.809617519378662 }, { "auxiliary_loss_clip": 0.01385728, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.23043692, "balance_loss_mlp": 1.01206422, "epoch": 0.9445362994137982, "flos": 20966298318720.0, "grad_norm": 2.0200578332026757, "language_loss": 0.85598689, "learning_rate": 3.214877084074774e-08, "loss": 0.88014889, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18395996, "step": 15710, "time_per_iteration": 2.832777261734009 }, { "auxiliary_loss_clip": 0.0140543, "auxiliary_loss_mlp": 0.01035244, "balance_loss_clip": 1.2423923, "balance_loss_mlp": 1.01524115, "epoch": 0.9445964226664663, "flos": 20313015626880.0, "grad_norm": 1.5952675174206652, "language_loss": 0.72137749, "learning_rate": 3.2079258555113956e-08, "loss": 0.74578416, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19995117, "step": 15711, "time_per_iteration": 2.8389997482299805 }, { "auxiliary_loss_clip": 0.01402936, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.24380982, "balance_loss_mlp": 1.01341665, "epoch": 0.9446565459191342, "flos": 26407247005440.0, "grad_norm": 2.714388225478444, "language_loss": 0.70557606, "learning_rate": 3.200982089323179e-08, "loss": 0.7299304, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1907959, "step": 15712, "time_per_iteration": 2.9374778270721436 }, { "auxiliary_loss_clip": 0.01417587, "auxiliary_loss_mlp": 0.01035416, "balance_loss_clip": 1.25301051, "balance_loss_mlp": 1.01578212, "epoch": 0.9447166691718022, "flos": 16553396154240.0, "grad_norm": 2.2342025939172876, "language_loss": 0.7187472, "learning_rate": 3.1940457857734246e-08, "loss": 0.74327725, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19628906, "step": 15713, "time_per_iteration": 2.8547704219818115 }, { "auxiliary_loss_clip": 0.01385509, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.23062098, "balance_loss_mlp": 1.01641846, "epoch": 0.9447767924244702, "flos": 29175450261120.0, "grad_norm": 1.6293927671270205, "language_loss": 0.77385557, "learning_rate": 3.187116945125212e-08, "loss": 0.79807818, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.20336914, "step": 15714, "time_per_iteration": 2.9928553104400635 }, { "auxiliary_loss_clip": 0.01403835, "auxiliary_loss_mlp": 0.01033025, "balance_loss_clip": 1.24248755, "balance_loss_mlp": 1.0146668, "epoch": 0.9448369156771381, "flos": 19282616375040.0, "grad_norm": 2.2234870920061747, "language_loss": 0.68210506, "learning_rate": 3.1801955676412194e-08, "loss": 0.70647365, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18359375, "step": 15715, "time_per_iteration": 2.812911033630371 }, { "auxiliary_loss_clip": 0.01402762, "auxiliary_loss_mlp": 0.01034384, "balance_loss_clip": 1.24192929, "balance_loss_mlp": 1.01444006, "epoch": 0.9448970389298061, "flos": 23851495923840.0, "grad_norm": 1.7595507723091628, "language_loss": 0.7526654, "learning_rate": 3.173281653583948e-08, "loss": 0.77703691, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19946289, "step": 15716, "time_per_iteration": 2.8818581104278564 }, { "auxiliary_loss_clip": 0.01400217, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.24024391, "balance_loss_mlp": 1.0085144, "epoch": 0.944957162182474, "flos": 22392346515840.0, "grad_norm": 1.7600417277335711, "language_loss": 0.62511015, "learning_rate": 3.166375203215565e-08, "loss": 0.64938617, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1887207, "step": 15717, "time_per_iteration": 4.313234567642212 }, { "auxiliary_loss_clip": 0.0139746, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.23932445, "balance_loss_mlp": 1.0144732, "epoch": 0.9450172854351421, "flos": 17392726039680.0, "grad_norm": 1.6325025820002794, "language_loss": 0.79917562, "learning_rate": 3.1594762167979514e-08, "loss": 0.82347614, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18139648, "step": 15718, "time_per_iteration": 2.8022913932800293 }, { "auxiliary_loss_clip": 0.01180515, "auxiliary_loss_mlp": 0.01022943, "balance_loss_clip": 1.09249794, "balance_loss_mlp": 1.00482368, "epoch": 0.94507740868781, "flos": 68498535715200.0, "grad_norm": 0.7095110987787475, "language_loss": 0.57864815, "learning_rate": 3.152584694592719e-08, "loss": 0.60068274, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.18164062, "step": 15719, "time_per_iteration": 3.37575101852417 }, { "auxiliary_loss_clip": 0.01398945, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.23801208, "balance_loss_mlp": 1.01342559, "epoch": 0.945137531940478, "flos": 21152436001920.0, "grad_norm": 2.303150938332756, "language_loss": 0.7651211, "learning_rate": 3.145700636861193e-08, "loss": 0.78943968, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.19494629, "step": 15720, "time_per_iteration": 2.875047206878662 }, { "auxiliary_loss_clip": 0.01380539, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.22344351, "balance_loss_mlp": 1.01400256, "epoch": 0.9451976551931459, "flos": 24544304588160.0, "grad_norm": 2.1343738470573608, "language_loss": 0.72979808, "learning_rate": 3.138824043864452e-08, "loss": 0.75392956, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18603516, "step": 15721, "time_per_iteration": 2.872805595397949 }, { "auxiliary_loss_clip": 0.01400052, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.24019706, "balance_loss_mlp": 1.01638091, "epoch": 0.9452577784458139, "flos": 23451005784960.0, "grad_norm": 1.9823838324766136, "language_loss": 0.86084712, "learning_rate": 3.131954915863244e-08, "loss": 0.88520646, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19482422, "step": 15722, "time_per_iteration": 4.3119072914123535 }, { "auxiliary_loss_clip": 0.01181207, "auxiliary_loss_mlp": 0.01022012, "balance_loss_clip": 1.09352493, "balance_loss_mlp": 1.00570381, "epoch": 0.9453179016984818, "flos": 52047266353920.0, "grad_norm": 0.9031642058713492, "language_loss": 0.64490509, "learning_rate": 3.125093253118005e-08, "loss": 0.66693735, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.16308594, "step": 15723, "time_per_iteration": 3.242908477783203 }, { "auxiliary_loss_clip": 0.01405667, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.24458873, "balance_loss_mlp": 1.01478875, "epoch": 0.9453780249511499, "flos": 13479029464320.0, "grad_norm": 2.048261267733471, "language_loss": 0.73708874, "learning_rate": 3.1182390558889715e-08, "loss": 0.76148391, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19067383, "step": 15724, "time_per_iteration": 2.920581579208374 }, { "auxiliary_loss_clip": 0.01395383, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.23713136, "balance_loss_mlp": 1.01170278, "epoch": 0.9454381482038178, "flos": 23268985378560.0, "grad_norm": 2.184050879488074, "language_loss": 0.856359, "learning_rate": 3.111392324436024e-08, "loss": 0.88062304, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19335938, "step": 15725, "time_per_iteration": 2.865981340408325 }, { "auxiliary_loss_clip": 0.01401324, "auxiliary_loss_mlp": 0.01029187, "balance_loss_clip": 1.24236071, "balance_loss_mlp": 1.01085222, "epoch": 0.9454982714564858, "flos": 19505112894720.0, "grad_norm": 1.8904404301700122, "language_loss": 0.71599001, "learning_rate": 3.104553059018822e-08, "loss": 0.74029511, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18322754, "step": 15726, "time_per_iteration": 2.859386920928955 }, { "auxiliary_loss_clip": 0.01401521, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.24231267, "balance_loss_mlp": 1.01603603, "epoch": 0.9455583947091538, "flos": 23268487685760.0, "grad_norm": 1.808752986305152, "language_loss": 0.61536562, "learning_rate": 3.097721259896735e-08, "loss": 0.63973582, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19470215, "step": 15727, "time_per_iteration": 2.880540370941162 }, { "auxiliary_loss_clip": 0.01387511, "auxiliary_loss_mlp": 0.01031734, "balance_loss_clip": 1.23147154, "balance_loss_mlp": 1.01261246, "epoch": 0.9456185179618217, "flos": 17681244001920.0, "grad_norm": 1.6941768114150166, "language_loss": 0.82491529, "learning_rate": 3.0908969273287566e-08, "loss": 0.84910774, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.19104004, "step": 15728, "time_per_iteration": 4.22817587852478 }, { "auxiliary_loss_clip": 0.01180012, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.09334874, "balance_loss_mlp": 1.0119673, "epoch": 0.9456786412144897, "flos": 61442912684160.0, "grad_norm": 0.737347826883757, "language_loss": 0.59101403, "learning_rate": 3.08408006157368e-08, "loss": 0.61309969, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.16601562, "step": 15729, "time_per_iteration": 4.684915065765381 }, { "auxiliary_loss_clip": 0.01382959, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.22584105, "balance_loss_mlp": 1.01210868, "epoch": 0.9457387644671577, "flos": 18597635061120.0, "grad_norm": 1.9449291941082627, "language_loss": 0.77846646, "learning_rate": 3.077270662890052e-08, "loss": 0.80261338, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19616699, "step": 15730, "time_per_iteration": 2.836890459060669 }, { "auxiliary_loss_clip": 0.01402468, "auxiliary_loss_mlp": 0.01034298, "balance_loss_clip": 1.24227846, "balance_loss_mlp": 1.01498652, "epoch": 0.9457988877198257, "flos": 21119153811840.0, "grad_norm": 1.3770720710316837, "language_loss": 0.63489658, "learning_rate": 3.070468731536047e-08, "loss": 0.65926421, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19311523, "step": 15731, "time_per_iteration": 2.9405975341796875 }, { "auxiliary_loss_clip": 0.01395459, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.23498976, "balance_loss_mlp": 1.01123953, "epoch": 0.9458590109724936, "flos": 26699791754880.0, "grad_norm": 3.0028800225445256, "language_loss": 0.64643884, "learning_rate": 3.063674267769589e-08, "loss": 0.67070913, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20336914, "step": 15732, "time_per_iteration": 2.925813674926758 }, { "auxiliary_loss_clip": 0.01413664, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.2491678, "balance_loss_mlp": 1.01117253, "epoch": 0.9459191342251616, "flos": 18670940916480.0, "grad_norm": 1.9657147377884634, "language_loss": 0.8516835, "learning_rate": 3.056887271848363e-08, "loss": 0.876122, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.19030762, "step": 15733, "time_per_iteration": 2.8026206493377686 }, { "auxiliary_loss_clip": 0.01386864, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.23283935, "balance_loss_mlp": 1.01212835, "epoch": 0.9459792574778295, "flos": 23407407780480.0, "grad_norm": 1.4485004388529126, "language_loss": 0.7290619, "learning_rate": 3.0501077440297173e-08, "loss": 0.75322831, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.17663574, "step": 15734, "time_per_iteration": 2.877760171890259 }, { "auxiliary_loss_clip": 0.01379308, "auxiliary_loss_mlp": 0.01028002, "balance_loss_clip": 1.2245084, "balance_loss_mlp": 1.01037097, "epoch": 0.9460393807304975, "flos": 24404615331840.0, "grad_norm": 13.520151814134735, "language_loss": 0.87095284, "learning_rate": 3.043335684570692e-08, "loss": 0.89502597, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.1763916, "step": 15735, "time_per_iteration": 2.914188861846924 }, { "auxiliary_loss_clip": 0.01403861, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.24413681, "balance_loss_mlp": 1.01354504, "epoch": 0.9460995039831654, "flos": 21948077393280.0, "grad_norm": 1.9999020475359695, "language_loss": 0.68061072, "learning_rate": 3.036571093728102e-08, "loss": 0.70496523, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18054199, "step": 15736, "time_per_iteration": 2.855769157409668 }, { "auxiliary_loss_clip": 0.0117594, "auxiliary_loss_mlp": 0.01017784, "balance_loss_clip": 1.09002066, "balance_loss_mlp": 0.99918747, "epoch": 0.9461596272358335, "flos": 70353922250880.0, "grad_norm": 0.8693790193733918, "language_loss": 0.65304625, "learning_rate": 3.029813971758499e-08, "loss": 0.6749835, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.18554688, "step": 15737, "time_per_iteration": 3.348862648010254 }, { "auxiliary_loss_clip": 0.01174321, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.08957088, "balance_loss_mlp": 1.00657701, "epoch": 0.9462197504885014, "flos": 58624116462720.0, "grad_norm": 0.8099701742835257, "language_loss": 0.5894472, "learning_rate": 3.0230643189181225e-08, "loss": 0.61149651, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 0.24023438, "step": 15738, "time_per_iteration": 3.302438259124756 }, { "auxiliary_loss_clip": 0.01377995, "auxiliary_loss_mlp": 0.01032624, "balance_loss_clip": 1.22340894, "balance_loss_mlp": 1.01358652, "epoch": 0.9462798737411694, "flos": 23442409272960.0, "grad_norm": 1.878194724246135, "language_loss": 0.71959782, "learning_rate": 3.016322135462834e-08, "loss": 0.74370402, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.19055176, "step": 15739, "time_per_iteration": 2.85922908782959 }, { "auxiliary_loss_clip": 0.01406721, "auxiliary_loss_mlp": 0.01032838, "balance_loss_clip": 1.24559093, "balance_loss_mlp": 1.01383638, "epoch": 0.9463399969938374, "flos": 25056947882880.0, "grad_norm": 2.5244123235446714, "language_loss": 0.64885396, "learning_rate": 3.009587421648363e-08, "loss": 0.67324954, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19006348, "step": 15740, "time_per_iteration": 2.8383712768554688 }, { "auxiliary_loss_clip": 0.01385785, "auxiliary_loss_mlp": 0.0103498, "balance_loss_clip": 1.23039365, "balance_loss_mlp": 1.01534581, "epoch": 0.9464001202465053, "flos": 24363234322560.0, "grad_norm": 1.723577229975546, "language_loss": 0.67379045, "learning_rate": 3.0028601777301045e-08, "loss": 0.69799805, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.19641113, "step": 15741, "time_per_iteration": 2.865550994873047 }, { "auxiliary_loss_clip": 0.0140169, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.24207783, "balance_loss_mlp": 1.01507664, "epoch": 0.9464602434991733, "flos": 17174346796800.0, "grad_norm": 2.364179052279632, "language_loss": 0.76452076, "learning_rate": 2.9961404039630987e-08, "loss": 0.78887713, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1887207, "step": 15742, "time_per_iteration": 2.8343098163604736 }, { "auxiliary_loss_clip": 0.01385571, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.22939515, "balance_loss_mlp": 1.01172769, "epoch": 0.9465203667518413, "flos": 19947662714880.0, "grad_norm": 2.6438642321854475, "language_loss": 0.73206103, "learning_rate": 2.989428100602187e-08, "loss": 0.75621808, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.1842041, "step": 15743, "time_per_iteration": 2.946211576461792 }, { "auxiliary_loss_clip": 0.01397013, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.23563015, "balance_loss_mlp": 1.01228166, "epoch": 0.9465804900045093, "flos": 20129864100480.0, "grad_norm": 2.136463010203987, "language_loss": 0.80655074, "learning_rate": 2.982723267901943e-08, "loss": 0.83083159, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18786621, "step": 15744, "time_per_iteration": 2.8593456745147705 }, { "auxiliary_loss_clip": 0.01400515, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 1.23910093, "balance_loss_mlp": 1.01517415, "epoch": 0.9466406132571772, "flos": 23921317929600.0, "grad_norm": 1.6946146879387574, "language_loss": 0.78808087, "learning_rate": 2.9760259061165417e-08, "loss": 0.81243253, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19470215, "step": 15745, "time_per_iteration": 2.8633596897125244 }, { "auxiliary_loss_clip": 0.01406032, "auxiliary_loss_mlp": 0.01031445, "balance_loss_clip": 1.24312449, "balance_loss_mlp": 1.0123837, "epoch": 0.9467007365098452, "flos": 19941509422080.0, "grad_norm": 2.3672414154130217, "language_loss": 0.70904249, "learning_rate": 2.9693360155000014e-08, "loss": 0.73341727, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19055176, "step": 15746, "time_per_iteration": 2.8703107833862305 }, { "auxiliary_loss_clip": 0.0139636, "auxiliary_loss_mlp": 0.01032475, "balance_loss_clip": 1.23763919, "balance_loss_mlp": 1.0135448, "epoch": 0.9467608597625131, "flos": 19317979825920.0, "grad_norm": 1.8504150373339785, "language_loss": 0.57108778, "learning_rate": 2.962653596305964e-08, "loss": 0.59537613, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18933105, "step": 15747, "time_per_iteration": 2.7869908809661865 }, { "auxiliary_loss_clip": 0.01177906, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.09203506, "balance_loss_mlp": 1.015028, "epoch": 0.9468209830151811, "flos": 69661792258560.0, "grad_norm": 0.6700392142477466, "language_loss": 0.53288186, "learning_rate": 2.955978648787871e-08, "loss": 0.55504298, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.23144531, "step": 15748, "time_per_iteration": 3.5311431884765625 }, { "auxiliary_loss_clip": 0.01401776, "auxiliary_loss_mlp": 0.01035479, "balance_loss_clip": 1.24107289, "balance_loss_mlp": 1.01592851, "epoch": 0.946881106267849, "flos": 27028107158400.0, "grad_norm": 1.611726546872284, "language_loss": 0.67064095, "learning_rate": 2.9493111731988096e-08, "loss": 0.69501346, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19555664, "step": 15749, "time_per_iteration": 2.90354061126709 }, { "auxiliary_loss_clip": 0.01395499, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.23526788, "balance_loss_mlp": 1.01463723, "epoch": 0.9469412295205171, "flos": 20198826455040.0, "grad_norm": 1.9729554667103444, "language_loss": 0.77180886, "learning_rate": 2.942651169791621e-08, "loss": 0.79610419, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.1940918, "step": 15750, "time_per_iteration": 2.8536465167999268 }, { "auxiliary_loss_clip": 0.01393544, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.23606682, "balance_loss_mlp": 1.01068926, "epoch": 0.947001352773185, "flos": 21334908856320.0, "grad_norm": 1.6377370319713205, "language_loss": 0.68525589, "learning_rate": 2.9359986388188372e-08, "loss": 0.70949298, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.19470215, "step": 15751, "time_per_iteration": 2.884308099746704 }, { "auxiliary_loss_clip": 0.01404431, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.2430582, "balance_loss_mlp": 1.01418686, "epoch": 0.947061476025853, "flos": 21953913972480.0, "grad_norm": 2.581568927232058, "language_loss": 0.66142124, "learning_rate": 2.929353580532723e-08, "loss": 0.68579841, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19091797, "step": 15752, "time_per_iteration": 2.8411879539489746 }, { "auxiliary_loss_clip": 0.01393, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.23468065, "balance_loss_mlp": 1.01125932, "epoch": 0.947121599278521, "flos": 21403916455680.0, "grad_norm": 1.8541845574484335, "language_loss": 0.72094238, "learning_rate": 2.9227159951852764e-08, "loss": 0.74517727, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.19213867, "step": 15753, "time_per_iteration": 4.2640626430511475 }, { "auxiliary_loss_clip": 0.01411461, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.24797535, "balance_loss_mlp": 1.01774645, "epoch": 0.9471817225311889, "flos": 23086060076160.0, "grad_norm": 1.8216298975229803, "language_loss": 0.71347249, "learning_rate": 2.9160858830281855e-08, "loss": 0.73796457, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.20007324, "step": 15754, "time_per_iteration": 2.8627097606658936 }, { "auxiliary_loss_clip": 0.01395821, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.23439288, "balance_loss_mlp": 1.01256227, "epoch": 0.947241845783857, "flos": 11918087959680.0, "grad_norm": 2.2229130899783724, "language_loss": 0.79637361, "learning_rate": 2.9094632443129153e-08, "loss": 0.82063401, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.17651367, "step": 15755, "time_per_iteration": 2.803941011428833 }, { "auxiliary_loss_clip": 0.01415313, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.24870276, "balance_loss_mlp": 1.01619935, "epoch": 0.9473019690365249, "flos": 20750407539840.0, "grad_norm": 2.223633994387574, "language_loss": 0.76015782, "learning_rate": 2.9028480792904876e-08, "loss": 0.78466427, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19128418, "step": 15756, "time_per_iteration": 4.248815298080444 }, { "auxiliary_loss_clip": 0.01397444, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.23779464, "balance_loss_mlp": 1.011608, "epoch": 0.9473620922891929, "flos": 17648957197440.0, "grad_norm": 2.391702943088744, "language_loss": 0.75972533, "learning_rate": 2.8962403882118347e-08, "loss": 0.78399992, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18408203, "step": 15757, "time_per_iteration": 2.851130247116089 }, { "auxiliary_loss_clip": 0.01398249, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.23666596, "balance_loss_mlp": 1.01194167, "epoch": 0.9474222155418608, "flos": 23560444252800.0, "grad_norm": 2.4404414147231517, "language_loss": 0.80579758, "learning_rate": 2.889640171327512e-08, "loss": 0.8300916, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.1920166, "step": 15758, "time_per_iteration": 2.822061061859131 }, { "auxiliary_loss_clip": 0.01384485, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.22729111, "balance_loss_mlp": 1.01504946, "epoch": 0.9474823387945288, "flos": 27101277279360.0, "grad_norm": 1.5281963497547446, "language_loss": 0.7247898, "learning_rate": 2.8830474288877638e-08, "loss": 0.74898088, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19592285, "step": 15759, "time_per_iteration": 2.9126381874084473 }, { "auxiliary_loss_clip": 0.0138684, "auxiliary_loss_mlp": 0.01029169, "balance_loss_clip": 1.23359597, "balance_loss_mlp": 1.01098967, "epoch": 0.9475424620471967, "flos": 22976893077120.0, "grad_norm": 1.5901159313326272, "language_loss": 0.76475561, "learning_rate": 2.8764621611426344e-08, "loss": 0.78891563, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.1817627, "step": 15760, "time_per_iteration": 2.839984893798828 }, { "auxiliary_loss_clip": 0.01397933, "auxiliary_loss_mlp": 0.01031918, "balance_loss_clip": 1.23920298, "balance_loss_mlp": 1.01302338, "epoch": 0.9476025852998647, "flos": 20057282161920.0, "grad_norm": 1.8035871762148665, "language_loss": 0.73398268, "learning_rate": 2.8698843683418128e-08, "loss": 0.75828117, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18884277, "step": 15761, "time_per_iteration": 2.8317642211914062 }, { "auxiliary_loss_clip": 0.01398967, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.24178565, "balance_loss_mlp": 1.01987672, "epoch": 0.9476627085525327, "flos": 14983586668800.0, "grad_norm": 1.9453223892309939, "language_loss": 0.73020256, "learning_rate": 2.863314050734722e-08, "loss": 0.75458062, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18981934, "step": 15762, "time_per_iteration": 2.8151895999908447 }, { "auxiliary_loss_clip": 0.01421153, "auxiliary_loss_mlp": 0.01037236, "balance_loss_clip": 1.25549376, "balance_loss_mlp": 1.01749539, "epoch": 0.9477228318052007, "flos": 18707073528960.0, "grad_norm": 1.8751145951343542, "language_loss": 0.67644703, "learning_rate": 2.856751208570518e-08, "loss": 0.70103085, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.1973877, "step": 15763, "time_per_iteration": 4.241209030151367 }, { "auxiliary_loss_clip": 0.01403615, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.24297142, "balance_loss_mlp": 1.01546896, "epoch": 0.9477829550578686, "flos": 23884732869120.0, "grad_norm": 1.7014201153292459, "language_loss": 0.71214592, "learning_rate": 2.8501958420980466e-08, "loss": 0.7365272, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19030762, "step": 15764, "time_per_iteration": 4.19794225692749 }, { "auxiliary_loss_clip": 0.01382525, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.23120761, "balance_loss_mlp": 1.01037312, "epoch": 0.9478430783105366, "flos": 22572557130240.0, "grad_norm": 1.63488240519942, "language_loss": 0.71902966, "learning_rate": 2.8436479515659306e-08, "loss": 0.74313998, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 1.51269531, "router_z_loss_mlp": 0.18139648, "step": 15765, "time_per_iteration": 2.8548567295074463 }, { "auxiliary_loss_clip": 0.01177731, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.09149528, "balance_loss_mlp": 1.01322174, "epoch": 0.9479032015632046, "flos": 60883051800960.0, "grad_norm": 0.8174986119513635, "language_loss": 0.59207821, "learning_rate": 2.8371075372224384e-08, "loss": 0.61418033, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.19238281, "step": 15766, "time_per_iteration": 3.156442165374756 }, { "auxiliary_loss_clip": 0.01393696, "auxiliary_loss_mlp": 0.01037911, "balance_loss_clip": 1.23517704, "balance_loss_mlp": 1.01915967, "epoch": 0.9479633248158725, "flos": 14691992060160.0, "grad_norm": 1.780997234906845, "language_loss": 0.75163603, "learning_rate": 2.8305745993155938e-08, "loss": 0.7759521, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18762207, "step": 15767, "time_per_iteration": 2.8255720138549805 }, { "auxiliary_loss_clip": 0.01409258, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.24615407, "balance_loss_mlp": 1.01249409, "epoch": 0.9480234480685406, "flos": 20342090050560.0, "grad_norm": 1.96443628141305, "language_loss": 0.73548818, "learning_rate": 2.8240491380931096e-08, "loss": 0.7598896, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.18383789, "step": 15768, "time_per_iteration": 2.846597909927368 }, { "auxiliary_loss_clip": 0.0118261, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.0933274, "balance_loss_mlp": 1.01079988, "epoch": 0.9480835713212085, "flos": 70326386150400.0, "grad_norm": 0.7320028041834035, "language_loss": 0.5530504, "learning_rate": 2.8175311538024326e-08, "loss": 0.57515424, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.16992188, "step": 15769, "time_per_iteration": 3.2899396419525146 }, { "auxiliary_loss_clip": 0.01397106, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 1.23633838, "balance_loss_mlp": 1.01229143, "epoch": 0.9481436945738765, "flos": 25461329074560.0, "grad_norm": 1.3479992376320484, "language_loss": 0.7776159, "learning_rate": 2.8110206466907428e-08, "loss": 0.8018899, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18005371, "step": 15770, "time_per_iteration": 2.940720319747925 }, { "auxiliary_loss_clip": 0.01402614, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.24353433, "balance_loss_mlp": 1.01349688, "epoch": 0.9482038178265444, "flos": 26991295873920.0, "grad_norm": 1.8934317623970707, "language_loss": 0.80599856, "learning_rate": 2.8045176170049313e-08, "loss": 0.83035505, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19555664, "step": 15771, "time_per_iteration": 2.9939322471618652 }, { "auxiliary_loss_clip": 0.01398891, "auxiliary_loss_mlp": 0.01031143, "balance_loss_clip": 1.24174428, "balance_loss_mlp": 1.01231956, "epoch": 0.9482639410792124, "flos": 17794437788160.0, "grad_norm": 2.067358629902965, "language_loss": 0.70740634, "learning_rate": 2.7980220649915566e-08, "loss": 0.73170668, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18823242, "step": 15772, "time_per_iteration": 2.9077634811401367 }, { "auxiliary_loss_clip": 0.01400146, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.24129844, "balance_loss_mlp": 1.01306224, "epoch": 0.9483240643318803, "flos": 21006638697600.0, "grad_norm": 1.5366063191493367, "language_loss": 0.74435246, "learning_rate": 2.7915339908969327e-08, "loss": 0.76867133, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18676758, "step": 15773, "time_per_iteration": 2.8676867485046387 }, { "auxiliary_loss_clip": 0.01407691, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.24465799, "balance_loss_mlp": 1.01787138, "epoch": 0.9483841875845483, "flos": 20092690857600.0, "grad_norm": 1.9779610025387027, "language_loss": 0.63633311, "learning_rate": 2.7850533949671072e-08, "loss": 0.66078162, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19287109, "step": 15774, "time_per_iteration": 2.8057992458343506 }, { "auxiliary_loss_clip": 0.01398622, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.2380352, "balance_loss_mlp": 1.01231694, "epoch": 0.9484443108372163, "flos": 20823396681600.0, "grad_norm": 3.0798775825444613, "language_loss": 0.61039037, "learning_rate": 2.7785802774478396e-08, "loss": 0.63469464, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19482422, "step": 15775, "time_per_iteration": 2.8179807662963867 }, { "auxiliary_loss_clip": 0.0139823, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.23803401, "balance_loss_mlp": 1.01360822, "epoch": 0.9485044340898843, "flos": 36442892039040.0, "grad_norm": 1.8022802857631814, "language_loss": 0.6229167, "learning_rate": 2.772114638584555e-08, "loss": 0.64722443, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18945312, "step": 15776, "time_per_iteration": 2.9442999362945557 }, { "auxiliary_loss_clip": 0.01404762, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.24242353, "balance_loss_mlp": 1.01292288, "epoch": 0.9485645573425522, "flos": 22613033243520.0, "grad_norm": 1.874860089152283, "language_loss": 0.74459827, "learning_rate": 2.765656478622458e-08, "loss": 0.7689715, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19641113, "step": 15777, "time_per_iteration": 2.8325893878936768 }, { "auxiliary_loss_clip": 0.0143676, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.26717639, "balance_loss_mlp": 1.01331663, "epoch": 0.9486246805952202, "flos": 22027853255040.0, "grad_norm": 2.7600869119629645, "language_loss": 0.73097014, "learning_rate": 2.759205797806441e-08, "loss": 0.75566524, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 1.69628906, "router_z_loss_mlp": 0.19433594, "step": 15778, "time_per_iteration": 2.8414289951324463 }, { "auxiliary_loss_clip": 0.01378298, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.22673368, "balance_loss_mlp": 1.01529086, "epoch": 0.9486848038478882, "flos": 16517399276160.0, "grad_norm": 1.7439916659147217, "language_loss": 0.7050333, "learning_rate": 2.7527625963810865e-08, "loss": 0.72915071, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 1.51757812, "router_z_loss_mlp": 0.18151855, "step": 15779, "time_per_iteration": 2.8414008617401123 }, { "auxiliary_loss_clip": 0.01405534, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.24533057, "balance_loss_mlp": 1.01457345, "epoch": 0.9487449271005561, "flos": 19253632440960.0, "grad_norm": 1.9790834061069276, "language_loss": 0.79624867, "learning_rate": 2.7463268745907542e-08, "loss": 0.82064509, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19543457, "step": 15780, "time_per_iteration": 2.7985169887542725 }, { "auxiliary_loss_clip": 0.01396657, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.23795354, "balance_loss_mlp": 1.0196892, "epoch": 0.9488050503532242, "flos": 21772798462080.0, "grad_norm": 1.7590676331994415, "language_loss": 0.66827118, "learning_rate": 2.7398986326794494e-08, "loss": 0.69261742, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18286133, "step": 15781, "time_per_iteration": 2.8563005924224854 }, { "auxiliary_loss_clip": 0.01387523, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 1.23090696, "balance_loss_mlp": 1.01212907, "epoch": 0.9488651736058921, "flos": 18377988963840.0, "grad_norm": 2.1839346005472042, "language_loss": 0.80937147, "learning_rate": 2.733477870890999e-08, "loss": 0.83355498, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18713379, "step": 15782, "time_per_iteration": 2.861647844314575 }, { "auxiliary_loss_clip": 0.01180522, "auxiliary_loss_mlp": 0.01019518, "balance_loss_clip": 1.09265041, "balance_loss_mlp": 1.00063539, "epoch": 0.9489252968585601, "flos": 70119861045120.0, "grad_norm": 0.7373543638753183, "language_loss": 0.59768558, "learning_rate": 2.7270645894688082e-08, "loss": 0.61968601, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.18847656, "step": 15783, "time_per_iteration": 3.426539897918701 }, { "auxiliary_loss_clip": 0.01401402, "auxiliary_loss_mlp": 0.01035304, "balance_loss_clip": 1.24111176, "balance_loss_mlp": 1.0156703, "epoch": 0.948985420111228, "flos": 27867029840640.0, "grad_norm": 1.6217456847434788, "language_loss": 0.74158055, "learning_rate": 2.720658788656105e-08, "loss": 0.76594758, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19641113, "step": 15784, "time_per_iteration": 2.938406467437744 }, { "auxiliary_loss_clip": 0.0140472, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.24263847, "balance_loss_mlp": 1.01401806, "epoch": 0.949045543363896, "flos": 24326649262080.0, "grad_norm": 1.8602291714428074, "language_loss": 0.70511377, "learning_rate": 2.714260468695806e-08, "loss": 0.72949952, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.19836426, "step": 15785, "time_per_iteration": 2.8837757110595703 }, { "auxiliary_loss_clip": 0.01410767, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.24749851, "balance_loss_mlp": 1.01628029, "epoch": 0.9491056666165639, "flos": 24251895573120.0, "grad_norm": 1.8085407489955077, "language_loss": 0.76761508, "learning_rate": 2.707869629830495e-08, "loss": 0.79207504, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.18933105, "step": 15786, "time_per_iteration": 3.060150623321533 }, { "auxiliary_loss_clip": 0.01393808, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.23595786, "balance_loss_mlp": 1.01511574, "epoch": 0.949165789869232, "flos": 24540956472960.0, "grad_norm": 1.790760280641059, "language_loss": 0.80036819, "learning_rate": 2.7014862723025335e-08, "loss": 0.82464314, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18591309, "step": 15787, "time_per_iteration": 2.9672884941101074 }, { "auxiliary_loss_clip": 0.01401674, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.24535918, "balance_loss_mlp": 1.01582682, "epoch": 0.9492259131218999, "flos": 22245146622720.0, "grad_norm": 1.5514380728509336, "language_loss": 0.76673585, "learning_rate": 2.6951103963540388e-08, "loss": 0.7910918, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.1809082, "step": 15788, "time_per_iteration": 4.391999006271362 }, { "auxiliary_loss_clip": 0.01406888, "auxiliary_loss_mlp": 0.01035734, "balance_loss_clip": 1.24559879, "balance_loss_mlp": 1.01675653, "epoch": 0.9492860363745679, "flos": 22976621608320.0, "grad_norm": 2.1244555136726424, "language_loss": 0.72606969, "learning_rate": 2.6887420022266848e-08, "loss": 0.75049585, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18994141, "step": 15789, "time_per_iteration": 2.8260793685913086 }, { "auxiliary_loss_clip": 0.01399814, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.24037433, "balance_loss_mlp": 1.01166165, "epoch": 0.9493461596272358, "flos": 18379708266240.0, "grad_norm": 1.738112218492504, "language_loss": 0.73820436, "learning_rate": 2.682381090161989e-08, "loss": 0.76253247, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.21313477, "step": 15790, "time_per_iteration": 2.865224599838257 }, { "auxiliary_loss_clip": 0.0142051, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.25652993, "balance_loss_mlp": 1.0141052, "epoch": 0.9494062828799038, "flos": 20021466263040.0, "grad_norm": 1.714648973119075, "language_loss": 0.78281081, "learning_rate": 2.6760276604012033e-08, "loss": 0.8073442, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.18737793, "step": 15791, "time_per_iteration": 4.27386212348938 }, { "auxiliary_loss_clip": 0.01432423, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.26607025, "balance_loss_mlp": 1.01758897, "epoch": 0.9494664061325718, "flos": 27238297092480.0, "grad_norm": 2.115245603811823, "language_loss": 0.74898154, "learning_rate": 2.6696817131852234e-08, "loss": 0.77367026, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.1887207, "step": 15792, "time_per_iteration": 2.9051811695098877 }, { "auxiliary_loss_clip": 0.01390478, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.23183441, "balance_loss_mlp": 1.01398146, "epoch": 0.9495265293852397, "flos": 18379120083840.0, "grad_norm": 1.9113605786553118, "language_loss": 0.78926134, "learning_rate": 2.663343248754679e-08, "loss": 0.81348884, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18286133, "step": 15793, "time_per_iteration": 2.8375725746154785 }, { "auxiliary_loss_clip": 0.01384663, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.22645855, "balance_loss_mlp": 1.01458788, "epoch": 0.9495866526379078, "flos": 23086105320960.0, "grad_norm": 1.6862000774852333, "language_loss": 0.7862463, "learning_rate": 2.6570122673499562e-08, "loss": 0.81041932, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18041992, "step": 15794, "time_per_iteration": 2.8928539752960205 }, { "auxiliary_loss_clip": 0.01406856, "auxiliary_loss_mlp": 0.01034405, "balance_loss_clip": 1.2444458, "balance_loss_mlp": 1.01498556, "epoch": 0.9496467758905757, "flos": 17538794812800.0, "grad_norm": 2.3107987342278435, "language_loss": 0.61636692, "learning_rate": 2.650688769211107e-08, "loss": 0.64077955, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19445801, "step": 15795, "time_per_iteration": 2.818861722946167 }, { "auxiliary_loss_clip": 0.01386085, "auxiliary_loss_mlp": 0.010382, "balance_loss_clip": 1.23121226, "balance_loss_mlp": 1.01757693, "epoch": 0.9497068991432437, "flos": 24144538366080.0, "grad_norm": 1.7650589457316432, "language_loss": 0.80157, "learning_rate": 2.644372754577895e-08, "loss": 0.82581282, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.20617676, "step": 15796, "time_per_iteration": 2.8484439849853516 }, { "auxiliary_loss_clip": 0.01396259, "auxiliary_loss_mlp": 0.01033074, "balance_loss_clip": 1.23653817, "balance_loss_mlp": 1.01428664, "epoch": 0.9497670223959116, "flos": 20313060871680.0, "grad_norm": 3.491004559911205, "language_loss": 0.76256061, "learning_rate": 2.6380642236898398e-08, "loss": 0.78685391, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18774414, "step": 15797, "time_per_iteration": 2.830131769180298 }, { "auxiliary_loss_clip": 0.01404984, "auxiliary_loss_mlp": 0.01033192, "balance_loss_clip": 1.24518085, "balance_loss_mlp": 1.01410663, "epoch": 0.9498271456485796, "flos": 13706819625600.0, "grad_norm": 2.4190522789368725, "language_loss": 0.67297417, "learning_rate": 2.6317631767861727e-08, "loss": 0.69735593, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19104004, "step": 15798, "time_per_iteration": 4.251521587371826 }, { "auxiliary_loss_clip": 0.01418381, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.25496316, "balance_loss_mlp": 1.01731217, "epoch": 0.9498872689012475, "flos": 20823849129600.0, "grad_norm": 3.751364691808705, "language_loss": 0.77673006, "learning_rate": 2.6254696141058575e-08, "loss": 0.80127633, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18920898, "step": 15799, "time_per_iteration": 4.258679628372192 }, { "auxiliary_loss_clip": 0.01390174, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.23573565, "balance_loss_mlp": 1.01234293, "epoch": 0.9499473921539155, "flos": 21042545086080.0, "grad_norm": 1.7562194181987858, "language_loss": 0.72060138, "learning_rate": 2.6191835358874814e-08, "loss": 0.7448101, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18347168, "step": 15800, "time_per_iteration": 2.8042502403259277 }, { "auxiliary_loss_clip": 0.01402869, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.24371457, "balance_loss_mlp": 1.01478529, "epoch": 0.9500075154065835, "flos": 21009172406400.0, "grad_norm": 1.569945636456981, "language_loss": 0.7235446, "learning_rate": 2.6129049423694315e-08, "loss": 0.74790585, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18493652, "step": 15801, "time_per_iteration": 2.859426736831665 }, { "auxiliary_loss_clip": 0.01391765, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.23288178, "balance_loss_mlp": 1.0143137, "epoch": 0.9500676386592515, "flos": 25133601853440.0, "grad_norm": 1.807566186072682, "language_loss": 0.81541741, "learning_rate": 2.6066338337898508e-08, "loss": 0.83966637, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18811035, "step": 15802, "time_per_iteration": 2.937368869781494 }, { "auxiliary_loss_clip": 0.01417215, "auxiliary_loss_mlp": 0.01037488, "balance_loss_clip": 1.25452459, "balance_loss_mlp": 1.01809287, "epoch": 0.9501277619119194, "flos": 27534235201920.0, "grad_norm": 1.8013964980122885, "language_loss": 0.6893121, "learning_rate": 2.60037021038646e-08, "loss": 0.71385908, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.1940918, "step": 15803, "time_per_iteration": 2.8811376094818115 }, { "auxiliary_loss_clip": 0.01399101, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.24028385, "balance_loss_mlp": 1.01550484, "epoch": 0.9501878851645874, "flos": 20823803884800.0, "grad_norm": 1.6107051053520052, "language_loss": 0.76467121, "learning_rate": 2.5941140723968247e-08, "loss": 0.7890116, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19433594, "step": 15804, "time_per_iteration": 2.833137273788452 }, { "auxiliary_loss_clip": 0.01414545, "auxiliary_loss_mlp": 0.01037288, "balance_loss_clip": 1.25272918, "balance_loss_mlp": 1.01853621, "epoch": 0.9502480084172553, "flos": 18378577146240.0, "grad_norm": 3.1194717956273603, "language_loss": 0.73713839, "learning_rate": 2.5878654200581775e-08, "loss": 0.76165676, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18737793, "step": 15805, "time_per_iteration": 2.8037145137786865 }, { "auxiliary_loss_clip": 0.01408403, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.24830508, "balance_loss_mlp": 1.01801491, "epoch": 0.9503081316699233, "flos": 23559946560000.0, "grad_norm": 1.9202977215113957, "language_loss": 0.80663365, "learning_rate": 2.5816242536074618e-08, "loss": 0.83108377, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18615723, "step": 15806, "time_per_iteration": 2.9057319164276123 }, { "auxiliary_loss_clip": 0.01409, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.24723291, "balance_loss_mlp": 1.01346684, "epoch": 0.9503682549225914, "flos": 18049402091520.0, "grad_norm": 2.1290884176423415, "language_loss": 0.83554029, "learning_rate": 2.5753905732813108e-08, "loss": 0.8599506, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18554688, "step": 15807, "time_per_iteration": 2.8105506896972656 }, { "auxiliary_loss_clip": 0.01395249, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.2371484, "balance_loss_mlp": 1.01116693, "epoch": 0.9504283781752593, "flos": 25897454133120.0, "grad_norm": 1.7701684829867035, "language_loss": 0.72542846, "learning_rate": 2.5691643793161355e-08, "loss": 0.74968415, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19165039, "step": 15808, "time_per_iteration": 2.888897657394409 }, { "auxiliary_loss_clip": 0.01393211, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.23576629, "balance_loss_mlp": 1.01044679, "epoch": 0.9504885014279273, "flos": 22133310180480.0, "grad_norm": 1.6088710638316248, "language_loss": 0.70138514, "learning_rate": 2.562945671948058e-08, "loss": 0.72561467, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1932373, "step": 15809, "time_per_iteration": 2.808933973312378 }, { "auxiliary_loss_clip": 0.01390886, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.23234797, "balance_loss_mlp": 1.01208901, "epoch": 0.9505486246805952, "flos": 21625824792960.0, "grad_norm": 2.2153647544438644, "language_loss": 0.75765753, "learning_rate": 2.5567344514128452e-08, "loss": 0.78188443, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19714355, "step": 15810, "time_per_iteration": 2.868792772293091 }, { "auxiliary_loss_clip": 0.01397203, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.23667502, "balance_loss_mlp": 1.02000976, "epoch": 0.9506087479332632, "flos": 22538370044160.0, "grad_norm": 1.3750164478914175, "language_loss": 0.80469942, "learning_rate": 2.5505307179460643e-08, "loss": 0.8290565, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18493652, "step": 15811, "time_per_iteration": 2.8515336513519287 }, { "auxiliary_loss_clip": 0.01404647, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.2451005, "balance_loss_mlp": 1.01383948, "epoch": 0.9506688711859311, "flos": 27538940661120.0, "grad_norm": 1.9825853193609115, "language_loss": 0.70809698, "learning_rate": 2.5443344717829495e-08, "loss": 0.73247147, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18969727, "step": 15812, "time_per_iteration": 2.9683945178985596 }, { "auxiliary_loss_clip": 0.01412201, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.24973083, "balance_loss_mlp": 1.01694107, "epoch": 0.9507289944385992, "flos": 19875487979520.0, "grad_norm": 1.705879315730569, "language_loss": 0.66300708, "learning_rate": 2.538145713158446e-08, "loss": 0.68749034, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19165039, "step": 15813, "time_per_iteration": 2.835587978363037 }, { "auxiliary_loss_clip": 0.01403776, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.2430234, "balance_loss_mlp": 1.01651978, "epoch": 0.9507891176912671, "flos": 25204509734400.0, "grad_norm": 1.4205683952538237, "language_loss": 0.70815432, "learning_rate": 2.5319644423072327e-08, "loss": 0.73254937, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.1920166, "step": 15814, "time_per_iteration": 2.8905835151672363 }, { "auxiliary_loss_clip": 0.01391398, "auxiliary_loss_mlp": 0.01029528, "balance_loss_clip": 1.23441291, "balance_loss_mlp": 1.011253, "epoch": 0.9508492409439351, "flos": 24910245682560.0, "grad_norm": 2.3435473507056117, "language_loss": 0.64183295, "learning_rate": 2.5257906594637445e-08, "loss": 0.66604227, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18286133, "step": 15815, "time_per_iteration": 2.9033846855163574 }, { "auxiliary_loss_clip": 0.01402004, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.24176466, "balance_loss_mlp": 1.01368356, "epoch": 0.950909364196603, "flos": 29794862580480.0, "grad_norm": 3.293243324781059, "language_loss": 0.59508169, "learning_rate": 2.519624364862061e-08, "loss": 0.61942226, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18383789, "step": 15816, "time_per_iteration": 2.907811164855957 }, { "auxiliary_loss_clip": 0.01408328, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.24882472, "balance_loss_mlp": 1.01917291, "epoch": 0.950969487449271, "flos": 24728587234560.0, "grad_norm": 1.4257737992236463, "language_loss": 0.74179173, "learning_rate": 2.513465558735994e-08, "loss": 0.76626056, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19396973, "step": 15817, "time_per_iteration": 2.888936996459961 }, { "auxiliary_loss_clip": 0.01398219, "auxiliary_loss_mlp": 0.01038477, "balance_loss_clip": 1.23649979, "balance_loss_mlp": 1.01743615, "epoch": 0.9510296107019389, "flos": 13707136339200.0, "grad_norm": 1.8279991964036522, "language_loss": 0.60439467, "learning_rate": 2.5073142413190918e-08, "loss": 0.62876159, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.21044922, "step": 15818, "time_per_iteration": 2.8409926891326904 }, { "auxiliary_loss_clip": 0.01400437, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.24108624, "balance_loss_mlp": 1.0154078, "epoch": 0.9510897339546069, "flos": 17320913262720.0, "grad_norm": 1.7200299650161295, "language_loss": 0.70512557, "learning_rate": 2.5011704128446552e-08, "loss": 0.72947091, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18676758, "step": 15819, "time_per_iteration": 2.798652172088623 }, { "auxiliary_loss_clip": 0.01404004, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.24230027, "balance_loss_mlp": 1.01376712, "epoch": 0.951149857207275, "flos": 14802244934400.0, "grad_norm": 3.7549307142888084, "language_loss": 0.7460345, "learning_rate": 2.49503407354561e-08, "loss": 0.77039748, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18530273, "step": 15820, "time_per_iteration": 2.8260083198547363 }, { "auxiliary_loss_clip": 0.01418916, "auxiliary_loss_mlp": 0.01041475, "balance_loss_clip": 1.25651503, "balance_loss_mlp": 1.02229381, "epoch": 0.9512099804599429, "flos": 19400651354880.0, "grad_norm": 1.723660158467129, "language_loss": 0.78877544, "learning_rate": 2.4889052236546804e-08, "loss": 0.81337941, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19189453, "step": 15821, "time_per_iteration": 2.795001745223999 }, { "auxiliary_loss_clip": 0.01392737, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.23459148, "balance_loss_mlp": 1.01258147, "epoch": 0.9512701037126109, "flos": 36771750380160.0, "grad_norm": 1.3756033746789327, "language_loss": 0.71290839, "learning_rate": 2.4827838634042586e-08, "loss": 0.7371577, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19616699, "step": 15822, "time_per_iteration": 2.958003282546997 }, { "auxiliary_loss_clip": 0.01395571, "auxiliary_loss_mlp": 0.0102979, "balance_loss_clip": 1.23737264, "balance_loss_mlp": 1.01132429, "epoch": 0.9513302269652788, "flos": 22648577673600.0, "grad_norm": 1.7415098651582226, "language_loss": 0.66867238, "learning_rate": 2.47666999302647e-08, "loss": 0.69292593, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18481445, "step": 15823, "time_per_iteration": 4.274961948394775 }, { "auxiliary_loss_clip": 0.01393681, "auxiliary_loss_mlp": 0.01035477, "balance_loss_clip": 1.23577714, "balance_loss_mlp": 1.01666594, "epoch": 0.9513903502179468, "flos": 22903496732160.0, "grad_norm": 1.7267961411417065, "language_loss": 0.77947271, "learning_rate": 2.4705636127531292e-08, "loss": 0.80376428, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18811035, "step": 15824, "time_per_iteration": 2.825993537902832 }, { "auxiliary_loss_clip": 0.0141107, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.2469039, "balance_loss_mlp": 1.01213932, "epoch": 0.9514504734706147, "flos": 27940290451200.0, "grad_norm": 2.335071347555368, "language_loss": 0.74802017, "learning_rate": 2.4644647228158065e-08, "loss": 0.77244604, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.19396973, "step": 15825, "time_per_iteration": 2.903411865234375 }, { "auxiliary_loss_clip": 0.01181852, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.09236121, "balance_loss_mlp": 1.02059102, "epoch": 0.9515105967232828, "flos": 67397364316800.0, "grad_norm": 0.8184035734412601, "language_loss": 0.53480101, "learning_rate": 2.458373323445806e-08, "loss": 0.55705625, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.23046875, "step": 15826, "time_per_iteration": 4.726433277130127 }, { "auxiliary_loss_clip": 0.01403398, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.24225616, "balance_loss_mlp": 1.01229906, "epoch": 0.9515707199759507, "flos": 25857023264640.0, "grad_norm": 1.7716214275134703, "language_loss": 0.74123257, "learning_rate": 2.452289414874076e-08, "loss": 0.76557529, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18566895, "step": 15827, "time_per_iteration": 2.8961915969848633 }, { "auxiliary_loss_clip": 0.01396782, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.23704314, "balance_loss_mlp": 1.01896942, "epoch": 0.9516308432286187, "flos": 21837372071040.0, "grad_norm": 2.217928880338013, "language_loss": 0.75614274, "learning_rate": 2.4462129973313207e-08, "loss": 0.7804985, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19812012, "step": 15828, "time_per_iteration": 2.8720791339874268 }, { "auxiliary_loss_clip": 0.01391073, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.23453641, "balance_loss_mlp": 1.01497197, "epoch": 0.9516909664812866, "flos": 27280492508160.0, "grad_norm": 1.764120127527729, "language_loss": 0.74227762, "learning_rate": 2.440144071047978e-08, "loss": 0.76651871, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18066406, "step": 15829, "time_per_iteration": 3.081106662750244 }, { "auxiliary_loss_clip": 0.01404548, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.2446965, "balance_loss_mlp": 1.01577091, "epoch": 0.9517510897339546, "flos": 21225289409280.0, "grad_norm": 1.9739716390437276, "language_loss": 0.62214696, "learning_rate": 2.4340826362541533e-08, "loss": 0.6465261, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.17602539, "step": 15830, "time_per_iteration": 2.861830711364746 }, { "auxiliary_loss_clip": 0.01402904, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.24137306, "balance_loss_mlp": 1.01584113, "epoch": 0.9518112129866225, "flos": 18743206141440.0, "grad_norm": 2.1865024992478674, "language_loss": 0.73704255, "learning_rate": 2.428028693179729e-08, "loss": 0.76143312, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.203125, "step": 15831, "time_per_iteration": 2.8334529399871826 }, { "auxiliary_loss_clip": 0.01389597, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 1.23222792, "balance_loss_mlp": 1.00974107, "epoch": 0.9518713362392905, "flos": 16772363579520.0, "grad_norm": 1.6592494589716154, "language_loss": 0.66626954, "learning_rate": 2.4219822420542545e-08, "loss": 0.69044232, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.17956543, "step": 15832, "time_per_iteration": 2.809551954269409 }, { "auxiliary_loss_clip": 0.01389001, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.23421383, "balance_loss_mlp": 1.01668477, "epoch": 0.9519314594919586, "flos": 15238777196160.0, "grad_norm": 1.7656075427158238, "language_loss": 0.78643501, "learning_rate": 2.4159432831070135e-08, "loss": 0.81067657, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18469238, "step": 15833, "time_per_iteration": 4.205423593521118 }, { "auxiliary_loss_clip": 0.01384171, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.22747827, "balance_loss_mlp": 1.01396847, "epoch": 0.9519915827446265, "flos": 19361985033600.0, "grad_norm": 1.9809653312821383, "language_loss": 0.75855625, "learning_rate": 2.4099118165670007e-08, "loss": 0.78272372, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.18615723, "step": 15834, "time_per_iteration": 4.206353664398193 }, { "auxiliary_loss_clip": 0.01410976, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.24670386, "balance_loss_mlp": 1.0146606, "epoch": 0.9520517059972945, "flos": 22274673494400.0, "grad_norm": 3.6831644204758764, "language_loss": 0.76891243, "learning_rate": 2.4038878426629216e-08, "loss": 0.79336643, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.19750977, "step": 15835, "time_per_iteration": 2.828402519226074 }, { "auxiliary_loss_clip": 0.01399833, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.23851514, "balance_loss_mlp": 1.015728, "epoch": 0.9521118292499624, "flos": 14869442741760.0, "grad_norm": 1.9665536170314728, "language_loss": 0.67058933, "learning_rate": 2.397871361623238e-08, "loss": 0.69494343, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19836426, "step": 15836, "time_per_iteration": 2.8135793209075928 }, { "auxiliary_loss_clip": 0.01394407, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.23798418, "balance_loss_mlp": 1.01239967, "epoch": 0.9521719525026304, "flos": 23518022613120.0, "grad_norm": 1.8206042837961223, "language_loss": 0.70516396, "learning_rate": 2.391862373676057e-08, "loss": 0.72942412, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.1920166, "step": 15837, "time_per_iteration": 2.869736671447754 }, { "auxiliary_loss_clip": 0.01388649, "auxiliary_loss_mlp": 0.01032919, "balance_loss_clip": 1.22859156, "balance_loss_mlp": 1.01279664, "epoch": 0.9522320757552983, "flos": 19723311158400.0, "grad_norm": 1.8070168510197444, "language_loss": 0.73867226, "learning_rate": 2.3858608790492617e-08, "loss": 0.76288795, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.20129395, "step": 15838, "time_per_iteration": 2.8493638038635254 }, { "auxiliary_loss_clip": 0.01395604, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.23667979, "balance_loss_mlp": 1.01399362, "epoch": 0.9522921990079664, "flos": 25932274646400.0, "grad_norm": 2.1051847073750327, "language_loss": 0.7846902, "learning_rate": 2.379866877970449e-08, "loss": 0.80897111, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18481445, "step": 15839, "time_per_iteration": 2.894648313522339 }, { "auxiliary_loss_clip": 0.01403517, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.24257481, "balance_loss_mlp": 1.02062106, "epoch": 0.9523523222606343, "flos": 19217409338880.0, "grad_norm": 1.6627950181055369, "language_loss": 0.80526948, "learning_rate": 2.3738803706668585e-08, "loss": 0.82968843, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1776123, "step": 15840, "time_per_iteration": 2.8819143772125244 }, { "auxiliary_loss_clip": 0.01383773, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.22774398, "balance_loss_mlp": 1.01244712, "epoch": 0.9524124455133023, "flos": 20930572909440.0, "grad_norm": 2.3001645434840494, "language_loss": 0.74039638, "learning_rate": 2.3679013573655314e-08, "loss": 0.76452905, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.17041016, "step": 15841, "time_per_iteration": 2.8201746940612793 }, { "auxiliary_loss_clip": 0.01390405, "auxiliary_loss_mlp": 0.01032783, "balance_loss_clip": 1.2357496, "balance_loss_mlp": 1.01419842, "epoch": 0.9524725687659702, "flos": 18852825588480.0, "grad_norm": 1.7616392570715682, "language_loss": 0.79587424, "learning_rate": 2.3619298382931972e-08, "loss": 0.82010615, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.18566895, "step": 15842, "time_per_iteration": 2.8824925422668457 }, { "auxiliary_loss_clip": 0.01401764, "auxiliary_loss_mlp": 0.01037346, "balance_loss_clip": 1.24313951, "balance_loss_mlp": 1.01780736, "epoch": 0.9525326920186382, "flos": 22684981754880.0, "grad_norm": 1.6498454966399745, "language_loss": 0.73002362, "learning_rate": 2.3559658136762973e-08, "loss": 0.75441474, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1953125, "step": 15843, "time_per_iteration": 2.8462376594543457 }, { "auxiliary_loss_clip": 0.01399173, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.23800015, "balance_loss_mlp": 1.01426029, "epoch": 0.9525928152713061, "flos": 22095684489600.0, "grad_norm": 1.6185058095164766, "language_loss": 0.78854185, "learning_rate": 2.3500092837409612e-08, "loss": 0.81287265, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1965332, "step": 15844, "time_per_iteration": 2.8755979537963867 }, { "auxiliary_loss_clip": 0.01401243, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.23824084, "balance_loss_mlp": 1.0154829, "epoch": 0.9526529385239741, "flos": 20714591640960.0, "grad_norm": 3.7671344324570604, "language_loss": 0.71276975, "learning_rate": 2.3440602487130977e-08, "loss": 0.73714948, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.21240234, "step": 15845, "time_per_iteration": 2.8293771743774414 }, { "auxiliary_loss_clip": 0.01405552, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.24296498, "balance_loss_mlp": 1.01819611, "epoch": 0.9527130617766422, "flos": 23378559580800.0, "grad_norm": 1.4128378466578928, "language_loss": 0.75774336, "learning_rate": 2.338118708818282e-08, "loss": 0.7821703, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18945312, "step": 15846, "time_per_iteration": 2.8706579208374023 }, { "auxiliary_loss_clip": 0.01392297, "auxiliary_loss_mlp": 0.0103024, "balance_loss_clip": 1.23292136, "balance_loss_mlp": 1.0120368, "epoch": 0.9527731850293101, "flos": 18994324636800.0, "grad_norm": 1.779746147458878, "language_loss": 0.79106748, "learning_rate": 2.3321846642817998e-08, "loss": 0.81529284, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18212891, "step": 15847, "time_per_iteration": 2.864745616912842 }, { "auxiliary_loss_clip": 0.01386835, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.23019195, "balance_loss_mlp": 1.01526737, "epoch": 0.9528333082819781, "flos": 19327481233920.0, "grad_norm": 1.711001791839283, "language_loss": 0.78402835, "learning_rate": 2.326258115328672e-08, "loss": 0.80823624, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.18688965, "step": 15848, "time_per_iteration": 2.8164916038513184 }, { "auxiliary_loss_clip": 0.01411394, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.24762917, "balance_loss_mlp": 1.01832259, "epoch": 0.952893431534646, "flos": 23961884532480.0, "grad_norm": 1.7550925466356884, "language_loss": 0.72996259, "learning_rate": 2.320339062183674e-08, "loss": 0.7544474, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18762207, "step": 15849, "time_per_iteration": 2.8822686672210693 }, { "auxiliary_loss_clip": 0.01418626, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.25387025, "balance_loss_mlp": 1.01770771, "epoch": 0.952953554787314, "flos": 21039830398080.0, "grad_norm": 1.5098453610117843, "language_loss": 0.76005483, "learning_rate": 2.314427505071226e-08, "loss": 0.78460521, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18713379, "step": 15850, "time_per_iteration": 2.828068256378174 }, { "auxiliary_loss_clip": 0.01397882, "auxiliary_loss_mlp": 0.0103476, "balance_loss_clip": 1.23832369, "balance_loss_mlp": 1.01680708, "epoch": 0.9530136780399819, "flos": 22393115677440.0, "grad_norm": 3.0411330371860292, "language_loss": 0.7333231, "learning_rate": 2.308523444215482e-08, "loss": 0.75764954, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17944336, "step": 15851, "time_per_iteration": 2.909142017364502 }, { "auxiliary_loss_clip": 0.01396554, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.23753262, "balance_loss_mlp": 1.01329803, "epoch": 0.95307380129265, "flos": 22168673631360.0, "grad_norm": 2.4796112619253754, "language_loss": 0.80360329, "learning_rate": 2.3026268798403525e-08, "loss": 0.82789016, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18847656, "step": 15852, "time_per_iteration": 2.831716537475586 }, { "auxiliary_loss_clip": 0.01401864, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.24156606, "balance_loss_mlp": 1.01889038, "epoch": 0.9531339245453179, "flos": 44039599361280.0, "grad_norm": 1.6346945477542283, "language_loss": 0.60878903, "learning_rate": 2.2967378121694138e-08, "loss": 0.63318574, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18908691, "step": 15853, "time_per_iteration": 3.1342570781707764 }, { "auxiliary_loss_clip": 0.01373562, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.21872437, "balance_loss_mlp": 1.01581717, "epoch": 0.9531940477979859, "flos": 20276385321600.0, "grad_norm": 4.960787403956611, "language_loss": 0.72953123, "learning_rate": 2.290856241425998e-08, "loss": 0.75360775, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18273926, "step": 15854, "time_per_iteration": 2.842256784439087 }, { "auxiliary_loss_clip": 0.01406482, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.24564135, "balance_loss_mlp": 1.01326394, "epoch": 0.9532541710506538, "flos": 25346099272320.0, "grad_norm": 1.9728021402115516, "language_loss": 0.69007266, "learning_rate": 2.284982167833127e-08, "loss": 0.71446168, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19165039, "step": 15855, "time_per_iteration": 2.8700456619262695 }, { "auxiliary_loss_clip": 0.01387976, "auxiliary_loss_mlp": 0.01031234, "balance_loss_clip": 1.22954166, "balance_loss_mlp": 1.01303113, "epoch": 0.9533142943033218, "flos": 26480598105600.0, "grad_norm": 1.6558309682016235, "language_loss": 0.77379066, "learning_rate": 2.279115591613556e-08, "loss": 0.79798281, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18212891, "step": 15856, "time_per_iteration": 2.8652262687683105 }, { "auxiliary_loss_clip": 0.01396076, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.23777831, "balance_loss_mlp": 1.01295292, "epoch": 0.9533744175559897, "flos": 23666760829440.0, "grad_norm": 1.6880225201583716, "language_loss": 0.78347957, "learning_rate": 2.2732565129897075e-08, "loss": 0.80775249, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18273926, "step": 15857, "time_per_iteration": 2.848681926727295 }, { "auxiliary_loss_clip": 0.01179669, "auxiliary_loss_mlp": 0.01021778, "balance_loss_clip": 1.09272265, "balance_loss_mlp": 1.00337172, "epoch": 0.9534345408086577, "flos": 61080319704960.0, "grad_norm": 0.707449838416559, "language_loss": 0.62671852, "learning_rate": 2.267404932183803e-08, "loss": 0.64873302, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.18359375, "step": 15858, "time_per_iteration": 4.750405550003052 }, { "auxiliary_loss_clip": 0.0138986, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.23232436, "balance_loss_mlp": 1.01599193, "epoch": 0.9534946640613258, "flos": 18960635243520.0, "grad_norm": 1.6898571591385187, "language_loss": 0.5762378, "learning_rate": 2.2615608494177097e-08, "loss": 0.6004833, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18701172, "step": 15859, "time_per_iteration": 2.8759772777557373 }, { "auxiliary_loss_clip": 0.01381308, "auxiliary_loss_mlp": 0.0103103, "balance_loss_clip": 1.22729206, "balance_loss_mlp": 1.01325619, "epoch": 0.9535547873139937, "flos": 16662970356480.0, "grad_norm": 2.11819030642061, "language_loss": 0.82721043, "learning_rate": 2.2557242649130504e-08, "loss": 0.8513338, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.17773438, "step": 15860, "time_per_iteration": 2.8158483505249023 }, { "auxiliary_loss_clip": 0.01405244, "auxiliary_loss_mlp": 0.01032154, "balance_loss_clip": 1.24383092, "balance_loss_mlp": 1.01372457, "epoch": 0.9536149105666617, "flos": 20677508887680.0, "grad_norm": 1.8281519004467497, "language_loss": 0.67025012, "learning_rate": 2.249895178891159e-08, "loss": 0.69462407, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18444824, "step": 15861, "time_per_iteration": 4.2691521644592285 }, { "auxiliary_loss_clip": 0.0140867, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.24768555, "balance_loss_mlp": 1.01379287, "epoch": 0.9536750338193296, "flos": 30712610983680.0, "grad_norm": 2.555519967214081, "language_loss": 0.66251743, "learning_rate": 2.244073591573037e-08, "loss": 0.68693161, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18945312, "step": 15862, "time_per_iteration": 2.922919273376465 }, { "auxiliary_loss_clip": 0.0138757, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.23332739, "balance_loss_mlp": 1.01275611, "epoch": 0.9537351570719976, "flos": 20413359889920.0, "grad_norm": 1.5090424569901963, "language_loss": 0.6865226, "learning_rate": 2.238259503179485e-08, "loss": 0.71071303, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18725586, "step": 15863, "time_per_iteration": 2.849214553833008 }, { "auxiliary_loss_clip": 0.01400595, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.24158835, "balance_loss_mlp": 1.01447797, "epoch": 0.9537952803246655, "flos": 29939845478400.0, "grad_norm": 1.7785916815490503, "language_loss": 0.79125869, "learning_rate": 2.2324529139309267e-08, "loss": 0.81559914, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18969727, "step": 15864, "time_per_iteration": 2.9445626735687256 }, { "auxiliary_loss_clip": 0.01389486, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.23339915, "balance_loss_mlp": 1.0141716, "epoch": 0.9538554035773336, "flos": 20530987666560.0, "grad_norm": 2.9700435384055743, "language_loss": 0.60170639, "learning_rate": 2.226653824047586e-08, "loss": 0.6259231, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18005371, "step": 15865, "time_per_iteration": 2.8739097118377686 }, { "auxiliary_loss_clip": 0.01394806, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.23521757, "balance_loss_mlp": 1.01352119, "epoch": 0.9539155268300015, "flos": 18415886123520.0, "grad_norm": 1.7476345161124003, "language_loss": 0.71298218, "learning_rate": 2.2208622337493765e-08, "loss": 0.73725665, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19104004, "step": 15866, "time_per_iteration": 2.8247647285461426 }, { "auxiliary_loss_clip": 0.01406831, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.24698305, "balance_loss_mlp": 1.01629102, "epoch": 0.9539756500826695, "flos": 26224366947840.0, "grad_norm": 2.452044262528461, "language_loss": 0.8577137, "learning_rate": 2.215078143255855e-08, "loss": 0.88214004, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1953125, "step": 15867, "time_per_iteration": 2.9008405208587646 }, { "auxiliary_loss_clip": 0.01181084, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.0931251, "balance_loss_mlp": 1.00586259, "epoch": 0.9540357733353374, "flos": 68322578112000.0, "grad_norm": 0.7576184641944876, "language_loss": 0.61811602, "learning_rate": 2.2093015527864024e-08, "loss": 0.64024872, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.26367188, "step": 15868, "time_per_iteration": 4.815367937088013 }, { "auxiliary_loss_clip": 0.01396548, "auxiliary_loss_mlp": 0.01033723, "balance_loss_clip": 1.23690403, "balance_loss_mlp": 1.01491189, "epoch": 0.9540958965880054, "flos": 21298233306240.0, "grad_norm": 1.9207224596212447, "language_loss": 0.61131704, "learning_rate": 2.2035324625600425e-08, "loss": 0.6356197, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18798828, "step": 15869, "time_per_iteration": 4.2893900871276855 }, { "auxiliary_loss_clip": 0.01402326, "auxiliary_loss_mlp": 0.01035811, "balance_loss_clip": 1.2433064, "balance_loss_mlp": 1.01857376, "epoch": 0.9541560198406733, "flos": 19759805729280.0, "grad_norm": 2.1062487875905265, "language_loss": 0.71989089, "learning_rate": 2.197770872795579e-08, "loss": 0.74427223, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.17248535, "step": 15870, "time_per_iteration": 2.8657803535461426 }, { "auxiliary_loss_clip": 0.01398742, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.24047995, "balance_loss_mlp": 1.01281857, "epoch": 0.9542161430933414, "flos": 24725872546560.0, "grad_norm": 1.868446974620835, "language_loss": 0.77609062, "learning_rate": 2.1920167837114368e-08, "loss": 0.80039668, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19042969, "step": 15871, "time_per_iteration": 2.9891533851623535 }, { "auxiliary_loss_clip": 0.01400104, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.24089837, "balance_loss_mlp": 1.01748109, "epoch": 0.9542762663460094, "flos": 31078144874880.0, "grad_norm": 1.8158997744210454, "language_loss": 0.5890249, "learning_rate": 2.1862701955258634e-08, "loss": 0.61338973, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18896484, "step": 15872, "time_per_iteration": 2.959434986114502 }, { "auxiliary_loss_clip": 0.01405536, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.24323893, "balance_loss_mlp": 1.01817989, "epoch": 0.9543363895986773, "flos": 20786359173120.0, "grad_norm": 1.8143159291535895, "language_loss": 0.75694871, "learning_rate": 2.1805311084567514e-08, "loss": 0.78138006, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19433594, "step": 15873, "time_per_iteration": 2.8488261699676514 }, { "auxiliary_loss_clip": 0.01399105, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.23792398, "balance_loss_mlp": 1.01246393, "epoch": 0.9543965128513453, "flos": 24473577686400.0, "grad_norm": 6.029658357416853, "language_loss": 0.62882364, "learning_rate": 2.1747995227217265e-08, "loss": 0.6531347, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1953125, "step": 15874, "time_per_iteration": 2.8589696884155273 }, { "auxiliary_loss_clip": 0.01383961, "auxiliary_loss_mlp": 0.0103528, "balance_loss_clip": 1.22771561, "balance_loss_mlp": 1.01596808, "epoch": 0.9544566361040132, "flos": 15268213578240.0, "grad_norm": 1.934679503874201, "language_loss": 0.90013587, "learning_rate": 2.169075438538104e-08, "loss": 0.92432833, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19311523, "step": 15875, "time_per_iteration": 2.847459316253662 }, { "auxiliary_loss_clip": 0.01413104, "auxiliary_loss_mlp": 0.01033128, "balance_loss_clip": 1.24942029, "balance_loss_mlp": 1.01332712, "epoch": 0.9545167593566812, "flos": 25929107510400.0, "grad_norm": 1.6522967240216369, "language_loss": 0.68559206, "learning_rate": 2.1633588561229765e-08, "loss": 0.71005434, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19799805, "step": 15876, "time_per_iteration": 2.8786566257476807 }, { "auxiliary_loss_clip": 0.01409826, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.24729729, "balance_loss_mlp": 1.01363182, "epoch": 0.9545768826093491, "flos": 25638779756160.0, "grad_norm": 1.8570534509737187, "language_loss": 0.70344943, "learning_rate": 2.1576497756931267e-08, "loss": 0.72787333, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18920898, "step": 15877, "time_per_iteration": 2.901604413986206 }, { "auxiliary_loss_clip": 0.01410607, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.24965453, "balance_loss_mlp": 1.01132131, "epoch": 0.9546370058620172, "flos": 22501558759680.0, "grad_norm": 1.6254123941452787, "language_loss": 0.71379542, "learning_rate": 2.1519481974650035e-08, "loss": 0.73820662, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19177246, "step": 15878, "time_per_iteration": 2.870429277420044 }, { "auxiliary_loss_clip": 0.01392309, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.23436975, "balance_loss_mlp": 1.01392293, "epoch": 0.9546971291146851, "flos": 24620687089920.0, "grad_norm": 1.2839911303183849, "language_loss": 0.68647313, "learning_rate": 2.1462541216548335e-08, "loss": 0.71072155, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18615723, "step": 15879, "time_per_iteration": 2.9011917114257812 }, { "auxiliary_loss_clip": 0.01390249, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 1.23346448, "balance_loss_mlp": 1.01126862, "epoch": 0.9547572523673531, "flos": 28669457952000.0, "grad_norm": 1.88028076060204, "language_loss": 0.85623682, "learning_rate": 2.1405675484785334e-08, "loss": 0.88043857, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18640137, "step": 15880, "time_per_iteration": 2.913128614425659 }, { "auxiliary_loss_clip": 0.01394798, "auxiliary_loss_mlp": 0.0103366, "balance_loss_clip": 1.23480177, "balance_loss_mlp": 1.01464641, "epoch": 0.954817375620021, "flos": 33815735383680.0, "grad_norm": 1.976869660491038, "language_loss": 0.72489411, "learning_rate": 2.134888478151753e-08, "loss": 0.74917865, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19006348, "step": 15881, "time_per_iteration": 2.9521148204803467 }, { "auxiliary_loss_clip": 0.01397973, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.24083328, "balance_loss_mlp": 1.01739049, "epoch": 0.954877498872689, "flos": 14436213350400.0, "grad_norm": 2.2614401133578395, "language_loss": 0.72647333, "learning_rate": 2.1292169108898083e-08, "loss": 0.75081533, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18835449, "step": 15882, "time_per_iteration": 2.822021007537842 }, { "auxiliary_loss_clip": 0.01409776, "auxiliary_loss_mlp": 0.01036682, "balance_loss_clip": 1.24819458, "balance_loss_mlp": 1.01746583, "epoch": 0.9549376221253569, "flos": 59289126837120.0, "grad_norm": 1.6343625639803911, "language_loss": 0.66972554, "learning_rate": 2.1235528469078168e-08, "loss": 0.69419014, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19213867, "step": 15883, "time_per_iteration": 3.209477424621582 }, { "auxiliary_loss_clip": 0.01402918, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.24280405, "balance_loss_mlp": 1.01724744, "epoch": 0.954997745378025, "flos": 17283197082240.0, "grad_norm": 4.170039004690641, "language_loss": 0.78890288, "learning_rate": 2.1178962864205175e-08, "loss": 0.81330252, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19812012, "step": 15884, "time_per_iteration": 2.7932369709014893 }, { "auxiliary_loss_clip": 0.01410589, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.24755156, "balance_loss_mlp": 1.0125885, "epoch": 0.955057868630693, "flos": 13014282430080.0, "grad_norm": 1.6362507025605773, "language_loss": 0.78353941, "learning_rate": 2.1122472296424054e-08, "loss": 0.8079679, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19641113, "step": 15885, "time_per_iteration": 2.826247453689575 }, { "auxiliary_loss_clip": 0.0140108, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.2411449, "balance_loss_mlp": 1.01197577, "epoch": 0.9551179918833609, "flos": 22647853756800.0, "grad_norm": 2.1301254857593714, "language_loss": 0.70891702, "learning_rate": 2.1066056767877317e-08, "loss": 0.73323202, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18444824, "step": 15886, "time_per_iteration": 2.8400673866271973 }, { "auxiliary_loss_clip": 0.01409128, "auxiliary_loss_mlp": 0.01037965, "balance_loss_clip": 1.24548864, "balance_loss_mlp": 1.01854599, "epoch": 0.9551781151360289, "flos": 21552609427200.0, "grad_norm": 1.6900708472766617, "language_loss": 0.73697108, "learning_rate": 2.1009716280703916e-08, "loss": 0.76144195, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19421387, "step": 15887, "time_per_iteration": 2.9240052700042725 }, { "auxiliary_loss_clip": 0.01377792, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.22266269, "balance_loss_mlp": 1.01160932, "epoch": 0.9552382383886968, "flos": 20711469749760.0, "grad_norm": 6.862932880003928, "language_loss": 0.57441646, "learning_rate": 2.0953450837040364e-08, "loss": 0.59849656, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18603516, "step": 15888, "time_per_iteration": 2.8672680854797363 }, { "auxiliary_loss_clip": 0.01182454, "auxiliary_loss_mlp": 0.01020011, "balance_loss_clip": 1.09457421, "balance_loss_mlp": 0.99893492, "epoch": 0.9552983616413648, "flos": 67800931102080.0, "grad_norm": 0.7101981580259676, "language_loss": 0.57902008, "learning_rate": 2.0897260439020514e-08, "loss": 0.60104471, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.2109375, "step": 15889, "time_per_iteration": 3.4000484943389893 }, { "auxiliary_loss_clip": 0.01406596, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.24331808, "balance_loss_mlp": 1.01253545, "epoch": 0.9553584848940327, "flos": 21589963649280.0, "grad_norm": 1.3936224899378897, "language_loss": 0.67988598, "learning_rate": 2.084114508877466e-08, "loss": 0.70427364, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19628906, "step": 15890, "time_per_iteration": 2.8923583030700684 }, { "auxiliary_loss_clip": 0.01393251, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.23614001, "balance_loss_mlp": 1.01225758, "epoch": 0.9554186081467008, "flos": 24219156320640.0, "grad_norm": 1.4654916007782082, "language_loss": 0.74561226, "learning_rate": 2.0785104788430874e-08, "loss": 0.76984036, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17297363, "step": 15891, "time_per_iteration": 2.8617002964019775 }, { "auxiliary_loss_clip": 0.01387799, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.23370886, "balance_loss_mlp": 1.01152372, "epoch": 0.9554787313993687, "flos": 16259765529600.0, "grad_norm": 1.9842574461789808, "language_loss": 0.78733993, "learning_rate": 2.072913954011435e-08, "loss": 0.81151277, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.17956543, "step": 15892, "time_per_iteration": 2.861238479614258 }, { "auxiliary_loss_clip": 0.01403395, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.2449832, "balance_loss_mlp": 1.01872551, "epoch": 0.9555388546520367, "flos": 23414556458880.0, "grad_norm": 1.4626697580634043, "language_loss": 0.70621395, "learning_rate": 2.0673249345947386e-08, "loss": 0.73063421, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19921875, "step": 15893, "time_per_iteration": 4.308905601501465 }, { "auxiliary_loss_clip": 0.01391286, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.23428226, "balance_loss_mlp": 1.01546812, "epoch": 0.9555989779047046, "flos": 14802380668800.0, "grad_norm": 2.001226949858358, "language_loss": 0.66305614, "learning_rate": 2.0617434208048955e-08, "loss": 0.68733132, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.2076416, "step": 15894, "time_per_iteration": 2.8000314235687256 }, { "auxiliary_loss_clip": 0.01400653, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.23822999, "balance_loss_mlp": 1.01421833, "epoch": 0.9556591011573726, "flos": 22246911169920.0, "grad_norm": 1.8250761635407013, "language_loss": 0.82334942, "learning_rate": 2.056169412853581e-08, "loss": 0.8476882, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19018555, "step": 15895, "time_per_iteration": 2.8413095474243164 }, { "auxiliary_loss_clip": 0.01394551, "auxiliary_loss_mlp": 0.01030182, "balance_loss_clip": 1.23384762, "balance_loss_mlp": 1.01060796, "epoch": 0.9557192244100405, "flos": 27866848861440.0, "grad_norm": 1.5514381710825422, "language_loss": 0.73076057, "learning_rate": 2.0506029109521593e-08, "loss": 0.75500786, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19567871, "step": 15896, "time_per_iteration": 4.343982458114624 }, { "auxiliary_loss_clip": 0.01388182, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.23079884, "balance_loss_mlp": 1.01280546, "epoch": 0.9557793476627086, "flos": 17611693464960.0, "grad_norm": 2.4316381135997718, "language_loss": 0.80515492, "learning_rate": 2.045043915311706e-08, "loss": 0.82934785, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18286133, "step": 15897, "time_per_iteration": 2.7978665828704834 }, { "auxiliary_loss_clip": 0.01393696, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.2341367, "balance_loss_mlp": 1.0138272, "epoch": 0.9558394709153766, "flos": 23885275806720.0, "grad_norm": 2.307662601744314, "language_loss": 0.73452127, "learning_rate": 2.03949242614303e-08, "loss": 0.75878954, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19311523, "step": 15898, "time_per_iteration": 2.9069690704345703 }, { "auxiliary_loss_clip": 0.01183108, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.0945462, "balance_loss_mlp": 1.0096823, "epoch": 0.9558995941680445, "flos": 53708371142400.0, "grad_norm": 0.9118544183450444, "language_loss": 0.5241133, "learning_rate": 2.033948443656652e-08, "loss": 0.54630822, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.26757812, "step": 15899, "time_per_iteration": 3.2890727519989014 }, { "auxiliary_loss_clip": 0.01426617, "auxiliary_loss_mlp": 0.01035887, "balance_loss_clip": 1.26141846, "balance_loss_mlp": 1.01606202, "epoch": 0.9559597174207125, "flos": 13770488338560.0, "grad_norm": 2.0689336657193773, "language_loss": 0.69098878, "learning_rate": 2.028411968062782e-08, "loss": 0.71561378, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.19824219, "step": 15900, "time_per_iteration": 2.8987224102020264 }, { "auxiliary_loss_clip": 0.01397829, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.23650563, "balance_loss_mlp": 1.01135731, "epoch": 0.9560198406733804, "flos": 19945445719680.0, "grad_norm": 1.9944706489616386, "language_loss": 0.83425725, "learning_rate": 2.0228829995713627e-08, "loss": 0.85854959, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.20031738, "step": 15901, "time_per_iteration": 2.8445117473602295 }, { "auxiliary_loss_clip": 0.01186245, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.09758437, "balance_loss_mlp": 1.00866449, "epoch": 0.9560799639260484, "flos": 57315678059520.0, "grad_norm": 0.7101445764899141, "language_loss": 0.54384083, "learning_rate": 2.0173615383920485e-08, "loss": 0.56600738, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.21777344, "step": 15902, "time_per_iteration": 3.3556182384490967 }, { "auxiliary_loss_clip": 0.01382513, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.22898197, "balance_loss_mlp": 1.01306832, "epoch": 0.9561400871787163, "flos": 18926629136640.0, "grad_norm": 1.6322659766613274, "language_loss": 0.8524884, "learning_rate": 2.01184758473425e-08, "loss": 0.8766247, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.18041992, "step": 15903, "time_per_iteration": 4.26308274269104 }, { "auxiliary_loss_clip": 0.01397699, "auxiliary_loss_mlp": 0.01028308, "balance_loss_clip": 1.23766565, "balance_loss_mlp": 1.01074862, "epoch": 0.9562002104313844, "flos": 18047049361920.0, "grad_norm": 2.2340023302275522, "language_loss": 0.81101918, "learning_rate": 2.0063411388070217e-08, "loss": 0.83527923, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.17553711, "step": 15904, "time_per_iteration": 4.240625858306885 }, { "auxiliary_loss_clip": 0.01406775, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.24506521, "balance_loss_mlp": 1.01176798, "epoch": 0.9562603336840523, "flos": 24728451500160.0, "grad_norm": 2.101843115390627, "language_loss": 0.60522354, "learning_rate": 2.0008422008191972e-08, "loss": 0.62959862, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18969727, "step": 15905, "time_per_iteration": 2.92730712890625 }, { "auxiliary_loss_clip": 0.01396968, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.23869002, "balance_loss_mlp": 1.01243651, "epoch": 0.9563204569367203, "flos": 21186713577600.0, "grad_norm": 2.037146351835429, "language_loss": 0.71417058, "learning_rate": 1.995350770979254e-08, "loss": 0.73844391, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.17919922, "step": 15906, "time_per_iteration": 2.8485374450683594 }, { "auxiliary_loss_clip": 0.01414479, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.25092053, "balance_loss_mlp": 1.01468122, "epoch": 0.9563805801893882, "flos": 20239302568320.0, "grad_norm": 1.6636157776593996, "language_loss": 0.71975827, "learning_rate": 1.9898668494954473e-08, "loss": 0.74423838, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.18835449, "step": 15907, "time_per_iteration": 2.8828699588775635 }, { "auxiliary_loss_clip": 0.0137969, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.22367084, "balance_loss_mlp": 1.01484823, "epoch": 0.9564407034420562, "flos": 25421984081280.0, "grad_norm": 2.0041098637770327, "language_loss": 0.7092272, "learning_rate": 1.9843904365757447e-08, "loss": 0.733356, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18347168, "step": 15908, "time_per_iteration": 2.8588943481445312 }, { "auxiliary_loss_clip": 0.01396499, "auxiliary_loss_mlp": 0.01031581, "balance_loss_clip": 1.23777831, "balance_loss_mlp": 1.01328254, "epoch": 0.9565008266947241, "flos": 18632908022400.0, "grad_norm": 2.0101121520559073, "language_loss": 0.83753681, "learning_rate": 1.978921532427802e-08, "loss": 0.86181754, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1829834, "step": 15909, "time_per_iteration": 2.837265729904175 }, { "auxiliary_loss_clip": 0.01392089, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 1.23393071, "balance_loss_mlp": 1.01481628, "epoch": 0.9565609499473922, "flos": 24872574746880.0, "grad_norm": 2.671253757447132, "language_loss": 0.69059706, "learning_rate": 1.9734601372590086e-08, "loss": 0.71485233, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18640137, "step": 15910, "time_per_iteration": 2.895603895187378 }, { "auxiliary_loss_clip": 0.01414818, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.2507689, "balance_loss_mlp": 1.01604295, "epoch": 0.9566210732000601, "flos": 21808388136960.0, "grad_norm": 1.885149761328476, "language_loss": 0.75551373, "learning_rate": 1.968006251276444e-08, "loss": 0.78001428, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1920166, "step": 15911, "time_per_iteration": 2.8970866203308105 }, { "auxiliary_loss_clip": 0.01397158, "auxiliary_loss_mlp": 0.01034083, "balance_loss_clip": 1.23740411, "balance_loss_mlp": 1.01543832, "epoch": 0.9566811964527281, "flos": 18706847304960.0, "grad_norm": 2.1654944345425133, "language_loss": 0.70007735, "learning_rate": 1.9625598746869198e-08, "loss": 0.72438973, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18664551, "step": 15912, "time_per_iteration": 2.875248670578003 }, { "auxiliary_loss_clip": 0.0140795, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.24821234, "balance_loss_mlp": 1.0167706, "epoch": 0.9567413197053961, "flos": 13007043262080.0, "grad_norm": 2.3164393436225943, "language_loss": 0.7330451, "learning_rate": 1.95712100769696e-08, "loss": 0.75747383, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18164062, "step": 15913, "time_per_iteration": 2.7933006286621094 }, { "auxiliary_loss_clip": 0.01395637, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.23695731, "balance_loss_mlp": 1.01239085, "epoch": 0.956801442958064, "flos": 19728785779200.0, "grad_norm": 1.7103164058743772, "language_loss": 0.7361275, "learning_rate": 1.9516896505128444e-08, "loss": 0.76038992, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18212891, "step": 15914, "time_per_iteration": 2.914193630218506 }, { "auxiliary_loss_clip": 0.01394114, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.23613596, "balance_loss_mlp": 1.01117206, "epoch": 0.956861566210732, "flos": 18231829701120.0, "grad_norm": 1.6059951152801648, "language_loss": 0.67772067, "learning_rate": 1.9462658033404965e-08, "loss": 0.70195466, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18115234, "step": 15915, "time_per_iteration": 2.844977617263794 }, { "auxiliary_loss_clip": 0.01387934, "auxiliary_loss_mlp": 0.0103145, "balance_loss_clip": 1.23226702, "balance_loss_mlp": 1.01267481, "epoch": 0.9569216894634, "flos": 22206254077440.0, "grad_norm": 1.7405566212283887, "language_loss": 0.64775282, "learning_rate": 1.9408494663855967e-08, "loss": 0.67194664, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.18762207, "step": 15916, "time_per_iteration": 2.8971574306488037 }, { "auxiliary_loss_clip": 0.01383544, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.23026597, "balance_loss_mlp": 1.01344967, "epoch": 0.956981812716068, "flos": 21699266382720.0, "grad_norm": 2.211904581265226, "language_loss": 0.81215155, "learning_rate": 1.935440639853536e-08, "loss": 0.83630753, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.18579102, "step": 15917, "time_per_iteration": 2.8474903106689453 }, { "auxiliary_loss_clip": 0.01391793, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.2345835, "balance_loss_mlp": 1.01574445, "epoch": 0.9570419359687359, "flos": 13998911927040.0, "grad_norm": 2.3907464067577164, "language_loss": 0.73953831, "learning_rate": 1.9300393239494172e-08, "loss": 0.76380157, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18798828, "step": 15918, "time_per_iteration": 2.8160483837127686 }, { "auxiliary_loss_clip": 0.01182809, "auxiliary_loss_mlp": 0.01032522, "balance_loss_clip": 1.09442639, "balance_loss_mlp": 1.01039696, "epoch": 0.9571020592214039, "flos": 65231986521600.0, "grad_norm": 0.6622437235541858, "language_loss": 0.53172708, "learning_rate": 1.924645518878032e-08, "loss": 0.55388039, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22167969, "step": 15919, "time_per_iteration": 3.482100009918213 }, { "auxiliary_loss_clip": 0.01404293, "auxiliary_loss_mlp": 0.01034823, "balance_loss_clip": 1.24132168, "balance_loss_mlp": 1.01529682, "epoch": 0.9571621824740718, "flos": 17392952263680.0, "grad_norm": 2.4026765964196604, "language_loss": 0.7709893, "learning_rate": 1.919259224843972e-08, "loss": 0.79538047, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.1953125, "step": 15920, "time_per_iteration": 2.826139450073242 }, { "auxiliary_loss_clip": 0.01411195, "auxiliary_loss_mlp": 0.01031162, "balance_loss_clip": 1.24931574, "balance_loss_mlp": 1.01248217, "epoch": 0.9572223057267398, "flos": 14546330490240.0, "grad_norm": 1.7082478410590913, "language_loss": 0.80244678, "learning_rate": 1.9138804420514298e-08, "loss": 0.82687032, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18701172, "step": 15921, "time_per_iteration": 2.878636598587036 }, { "auxiliary_loss_clip": 0.01408382, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.24338651, "balance_loss_mlp": 1.01284218, "epoch": 0.9572824289794077, "flos": 33960763526400.0, "grad_norm": 1.8985959038551643, "language_loss": 0.51774931, "learning_rate": 1.9085091707044197e-08, "loss": 0.54214942, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18786621, "step": 15922, "time_per_iteration": 2.98093843460083 }, { "auxiliary_loss_clip": 0.01407375, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.24690211, "balance_loss_mlp": 1.01637423, "epoch": 0.9573425522320758, "flos": 18703453944960.0, "grad_norm": 1.9618971157845593, "language_loss": 0.84303558, "learning_rate": 1.903145411006557e-08, "loss": 0.86746192, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18884277, "step": 15923, "time_per_iteration": 2.868478298187256 }, { "auxiliary_loss_clip": 0.01393947, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.23502922, "balance_loss_mlp": 1.01370955, "epoch": 0.9574026754847437, "flos": 28521579386880.0, "grad_norm": 1.8861003329535757, "language_loss": 0.75826204, "learning_rate": 1.8977891631613008e-08, "loss": 0.78252292, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18432617, "step": 15924, "time_per_iteration": 2.90383243560791 }, { "auxiliary_loss_clip": 0.01396227, "auxiliary_loss_mlp": 0.01036992, "balance_loss_clip": 1.23566914, "balance_loss_mlp": 1.01742959, "epoch": 0.9574627987374117, "flos": 24362962853760.0, "grad_norm": 1.866327912443195, "language_loss": 0.86801052, "learning_rate": 1.892440427371711e-08, "loss": 0.89234269, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19567871, "step": 15925, "time_per_iteration": 2.863368272781372 }, { "auxiliary_loss_clip": 0.01407749, "auxiliary_loss_mlp": 0.01030646, "balance_loss_clip": 1.24531174, "balance_loss_mlp": 1.01095295, "epoch": 0.9575229219900797, "flos": 23520375342720.0, "grad_norm": 1.672995618428822, "language_loss": 0.76187968, "learning_rate": 1.8870992038406474e-08, "loss": 0.78626364, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.19702148, "step": 15926, "time_per_iteration": 2.871328115463257 }, { "auxiliary_loss_clip": 0.01401797, "auxiliary_loss_mlp": 0.01032614, "balance_loss_clip": 1.2417661, "balance_loss_mlp": 1.01457775, "epoch": 0.9575830452427476, "flos": 22685162734080.0, "grad_norm": 2.1373921311083284, "language_loss": 0.78319335, "learning_rate": 1.8817654927706373e-08, "loss": 0.80753738, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18041992, "step": 15927, "time_per_iteration": 2.867051362991333 }, { "auxiliary_loss_clip": 0.01411456, "auxiliary_loss_mlp": 0.01034958, "balance_loss_clip": 1.24881935, "balance_loss_mlp": 1.01452541, "epoch": 0.9576431684954156, "flos": 30498258528000.0, "grad_norm": 1.5949741542920641, "language_loss": 0.69831818, "learning_rate": 1.8764392943639183e-08, "loss": 0.72278231, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 1.62597656, "router_z_loss_mlp": 0.2043457, "step": 15928, "time_per_iteration": 4.338857173919678 }, { "auxiliary_loss_clip": 0.01413673, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.25203776, "balance_loss_mlp": 1.01390767, "epoch": 0.9577032917480836, "flos": 21696868408320.0, "grad_norm": 2.7080583133661835, "language_loss": 0.82586855, "learning_rate": 1.871120608822485e-08, "loss": 0.85033727, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19287109, "step": 15929, "time_per_iteration": 2.831333875656128 }, { "auxiliary_loss_clip": 0.01416215, "auxiliary_loss_mlp": 0.01040525, "balance_loss_clip": 1.25052547, "balance_loss_mlp": 1.02153492, "epoch": 0.9577634150007516, "flos": 29035082332800.0, "grad_norm": 1.386325477089829, "language_loss": 0.72703505, "learning_rate": 1.8658094363480202e-08, "loss": 0.75160247, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.18994141, "step": 15930, "time_per_iteration": 2.8823630809783936 }, { "auxiliary_loss_clip": 0.01394673, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.23662257, "balance_loss_mlp": 1.01216757, "epoch": 0.9578235382534195, "flos": 19291846314240.0, "grad_norm": 1.4323384932987289, "language_loss": 0.63404047, "learning_rate": 1.8605057771419185e-08, "loss": 0.65829313, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1842041, "step": 15931, "time_per_iteration": 2.8245046138763428 }, { "auxiliary_loss_clip": 0.01398722, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.24339294, "balance_loss_mlp": 1.01222146, "epoch": 0.9578836615060875, "flos": 13707272073600.0, "grad_norm": 1.7747104208642654, "language_loss": 0.70464432, "learning_rate": 1.8552096314052633e-08, "loss": 0.72893941, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18591309, "step": 15932, "time_per_iteration": 4.391408920288086 }, { "auxiliary_loss_clip": 0.01414632, "auxiliary_loss_mlp": 0.01039932, "balance_loss_clip": 1.25004125, "balance_loss_mlp": 1.02082288, "epoch": 0.9579437847587554, "flos": 17063008047360.0, "grad_norm": 1.7337414930143524, "language_loss": 0.7610358, "learning_rate": 1.849920999338961e-08, "loss": 0.78558147, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.19104004, "step": 15933, "time_per_iteration": 2.8614706993103027 }, { "auxiliary_loss_clip": 0.01184144, "auxiliary_loss_mlp": 0.0103566, "balance_loss_clip": 1.09506178, "balance_loss_mlp": 1.01029193, "epoch": 0.9580039080114234, "flos": 60597248526720.0, "grad_norm": 0.6953348197317655, "language_loss": 0.57300317, "learning_rate": 1.8446398811434948e-08, "loss": 0.59520125, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.25390625, "step": 15934, "time_per_iteration": 3.4744327068328857 }, { "auxiliary_loss_clip": 0.01182072, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.09280562, "balance_loss_mlp": 1.01408792, "epoch": 0.9580640312640913, "flos": 66265580171520.0, "grad_norm": 0.9104009963431124, "language_loss": 0.66008008, "learning_rate": 1.8393662770191277e-08, "loss": 0.6822753, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.23339844, "step": 15935, "time_per_iteration": 3.2301127910614014 }, { "auxiliary_loss_clip": 0.01181466, "auxiliary_loss_mlp": 0.01027187, "balance_loss_clip": 1.09412766, "balance_loss_mlp": 1.00582504, "epoch": 0.9581241545167594, "flos": 62246155201920.0, "grad_norm": 0.7738931572379062, "language_loss": 0.57114339, "learning_rate": 1.8341001871658546e-08, "loss": 0.59322989, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.21386719, "step": 15936, "time_per_iteration": 3.2709760665893555 }, { "auxiliary_loss_clip": 0.01400376, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.23915136, "balance_loss_mlp": 1.01545835, "epoch": 0.9581842777694273, "flos": 23778371047680.0, "grad_norm": 1.9338743362812587, "language_loss": 0.78869581, "learning_rate": 1.8288416117833825e-08, "loss": 0.81305242, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19836426, "step": 15937, "time_per_iteration": 2.8671751022338867 }, { "auxiliary_loss_clip": 0.01403604, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.24439406, "balance_loss_mlp": 1.01866138, "epoch": 0.9582444010220953, "flos": 21222438986880.0, "grad_norm": 1.55967268721307, "language_loss": 0.68627918, "learning_rate": 1.8235905510710636e-08, "loss": 0.71069211, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19042969, "step": 15938, "time_per_iteration": 4.316569805145264 }, { "auxiliary_loss_clip": 0.01404027, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.24284661, "balance_loss_mlp": 1.01490927, "epoch": 0.9583045242747633, "flos": 23815589535360.0, "grad_norm": 2.442810033534897, "language_loss": 0.67877328, "learning_rate": 1.8183470052280712e-08, "loss": 0.70314968, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18701172, "step": 15939, "time_per_iteration": 4.309950590133667 }, { "auxiliary_loss_clip": 0.0139138, "auxiliary_loss_mlp": 0.01030321, "balance_loss_clip": 1.23267221, "balance_loss_mlp": 1.01179576, "epoch": 0.9583646475274312, "flos": 24141461719680.0, "grad_norm": 1.5979306433430995, "language_loss": 0.74492502, "learning_rate": 1.8131109744532025e-08, "loss": 0.76914203, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18505859, "step": 15940, "time_per_iteration": 2.883789539337158 }, { "auxiliary_loss_clip": 0.01420263, "auxiliary_loss_mlp": 0.01035705, "balance_loss_clip": 1.25852203, "balance_loss_mlp": 1.01677489, "epoch": 0.9584247707800992, "flos": 20896431068160.0, "grad_norm": 1.8366086333890226, "language_loss": 0.7332601, "learning_rate": 1.8078824589450535e-08, "loss": 0.75781977, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18920898, "step": 15941, "time_per_iteration": 2.8464715480804443 }, { "auxiliary_loss_clip": 0.01398089, "auxiliary_loss_mlp": 0.01032022, "balance_loss_clip": 1.23833036, "balance_loss_mlp": 1.01319861, "epoch": 0.9584848940327672, "flos": 26078705377920.0, "grad_norm": 1.592446167468445, "language_loss": 0.72714889, "learning_rate": 1.8026614589018442e-08, "loss": 0.75144994, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18823242, "step": 15942, "time_per_iteration": 3.0324931144714355 }, { "auxiliary_loss_clip": 0.01408431, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.24639702, "balance_loss_mlp": 1.01501036, "epoch": 0.9585450172854352, "flos": 34505557891200.0, "grad_norm": 1.4885761095876966, "language_loss": 0.72401917, "learning_rate": 1.797447974521571e-08, "loss": 0.74844456, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.19104004, "step": 15943, "time_per_iteration": 2.936403751373291 }, { "auxiliary_loss_clip": 0.01409588, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.24766779, "balance_loss_mlp": 1.01432598, "epoch": 0.9586051405381031, "flos": 23120699610240.0, "grad_norm": 1.6621538680556884, "language_loss": 0.6951552, "learning_rate": 1.792242006001965e-08, "loss": 0.7195856, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19116211, "step": 15944, "time_per_iteration": 2.884065628051758 }, { "auxiliary_loss_clip": 0.0140076, "auxiliary_loss_mlp": 0.01034318, "balance_loss_clip": 1.23932624, "balance_loss_mlp": 1.01616299, "epoch": 0.9586652637907711, "flos": 19611972408960.0, "grad_norm": 2.2198086291913905, "language_loss": 0.66675007, "learning_rate": 1.7870435535403795e-08, "loss": 0.69110084, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18164062, "step": 15945, "time_per_iteration": 2.85005521774292 }, { "auxiliary_loss_clip": 0.0118102, "auxiliary_loss_mlp": 0.01038627, "balance_loss_clip": 1.09343112, "balance_loss_mlp": 1.01077986, "epoch": 0.958725387043439, "flos": 72105118715520.0, "grad_norm": 0.7454506615052396, "language_loss": 0.61904407, "learning_rate": 1.7818526173339678e-08, "loss": 0.64124048, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.27929688, "step": 15946, "time_per_iteration": 3.4205920696258545 }, { "auxiliary_loss_clip": 0.01387702, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.23210859, "balance_loss_mlp": 1.01336396, "epoch": 0.958785510296107, "flos": 28923381624960.0, "grad_norm": 1.6881506503338013, "language_loss": 0.76116586, "learning_rate": 1.7766691975795723e-08, "loss": 0.78535974, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18347168, "step": 15947, "time_per_iteration": 2.897128105163574 }, { "auxiliary_loss_clip": 0.01389611, "auxiliary_loss_mlp": 0.01032479, "balance_loss_clip": 1.23168349, "balance_loss_mlp": 1.01296401, "epoch": 0.958845633548775, "flos": 18485979598080.0, "grad_norm": 2.512468760970459, "language_loss": 0.70596319, "learning_rate": 1.771493294473747e-08, "loss": 0.73018402, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1953125, "step": 15948, "time_per_iteration": 2.8244049549102783 }, { "auxiliary_loss_clip": 0.01393967, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.23464656, "balance_loss_mlp": 1.01615393, "epoch": 0.958905756801443, "flos": 24217572752640.0, "grad_norm": 2.0803339232198454, "language_loss": 0.79563522, "learning_rate": 1.7663249082127574e-08, "loss": 0.8199228, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.18640137, "step": 15949, "time_per_iteration": 2.859264373779297 }, { "auxiliary_loss_clip": 0.01401422, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.24024856, "balance_loss_mlp": 1.01465833, "epoch": 0.9589658800541109, "flos": 25018372051200.0, "grad_norm": 1.803976312416274, "language_loss": 0.69472629, "learning_rate": 1.761164038992602e-08, "loss": 0.71908712, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.20007324, "step": 15950, "time_per_iteration": 2.851726531982422 }, { "auxiliary_loss_clip": 0.01391801, "auxiliary_loss_mlp": 0.01031166, "balance_loss_clip": 1.23188996, "balance_loss_mlp": 1.0130465, "epoch": 0.9590260033067789, "flos": 23525261781120.0, "grad_norm": 1.9939726331259033, "language_loss": 0.86792141, "learning_rate": 1.7560106870089687e-08, "loss": 0.89215112, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18127441, "step": 15951, "time_per_iteration": 2.8604893684387207 }, { "auxiliary_loss_clip": 0.01418709, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.25431406, "balance_loss_mlp": 1.01931763, "epoch": 0.9590861265594469, "flos": 25531332059520.0, "grad_norm": 2.5258364043318497, "language_loss": 0.8100276, "learning_rate": 1.7508648524572568e-08, "loss": 0.83459729, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.18933105, "step": 15952, "time_per_iteration": 2.8647520542144775 }, { "auxiliary_loss_clip": 0.01398734, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.2397604, "balance_loss_mlp": 1.01170802, "epoch": 0.9591462498121148, "flos": 21189383020800.0, "grad_norm": 1.6014553577876367, "language_loss": 0.70063311, "learning_rate": 1.7457265355326434e-08, "loss": 0.72492838, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19104004, "step": 15953, "time_per_iteration": 2.8547143936157227 }, { "auxiliary_loss_clip": 0.01392893, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.23391533, "balance_loss_mlp": 1.01402688, "epoch": 0.9592063730647828, "flos": 21732322348800.0, "grad_norm": 2.5279392940826577, "language_loss": 0.59876215, "learning_rate": 1.7405957364299285e-08, "loss": 0.62301701, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18554688, "step": 15954, "time_per_iteration": 2.87784743309021 }, { "auxiliary_loss_clip": 0.01405579, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.24349678, "balance_loss_mlp": 1.01441658, "epoch": 0.9592664963174508, "flos": 29901631605120.0, "grad_norm": 2.2290566881046723, "language_loss": 0.74644601, "learning_rate": 1.7354724553437117e-08, "loss": 0.77083755, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19152832, "step": 15955, "time_per_iteration": 2.893137216567993 }, { "auxiliary_loss_clip": 0.01396901, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.23566556, "balance_loss_mlp": 1.01284289, "epoch": 0.9593266195701188, "flos": 18007432899840.0, "grad_norm": 1.7866900059613064, "language_loss": 0.63345206, "learning_rate": 1.7303566924682378e-08, "loss": 0.65775734, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20788574, "step": 15956, "time_per_iteration": 2.8754665851593018 }, { "auxiliary_loss_clip": 0.01402352, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.24095881, "balance_loss_mlp": 1.01192141, "epoch": 0.9593867428227867, "flos": 18846174602880.0, "grad_norm": 3.849557730505381, "language_loss": 0.6084286, "learning_rate": 1.725248447997507e-08, "loss": 0.63276196, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19055176, "step": 15957, "time_per_iteration": 2.969870090484619 }, { "auxiliary_loss_clip": 0.01404801, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.24323189, "balance_loss_mlp": 1.01791, "epoch": 0.9594468660754547, "flos": 29578247884800.0, "grad_norm": 1.7694798450621323, "language_loss": 0.74425328, "learning_rate": 1.7201477221252314e-08, "loss": 0.76867193, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19152832, "step": 15958, "time_per_iteration": 2.919109344482422 }, { "auxiliary_loss_clip": 0.01393213, "auxiliary_loss_mlp": 0.01029837, "balance_loss_clip": 1.23486781, "balance_loss_mlp": 1.01088274, "epoch": 0.9595069893281226, "flos": 20712736604160.0, "grad_norm": 1.6711337902717764, "language_loss": 0.75625789, "learning_rate": 1.7150545150448116e-08, "loss": 0.78048837, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.1895752, "step": 15959, "time_per_iteration": 2.8641583919525146 }, { "auxiliary_loss_clip": 0.01409748, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.2464242, "balance_loss_mlp": 1.01322842, "epoch": 0.9595671125807906, "flos": 22463299641600.0, "grad_norm": 2.0699529468620925, "language_loss": 0.66343296, "learning_rate": 1.7099688269493816e-08, "loss": 0.68785477, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19213867, "step": 15960, "time_per_iteration": 2.8574483394622803 }, { "auxiliary_loss_clip": 0.01388265, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.23245227, "balance_loss_mlp": 1.01853371, "epoch": 0.9596272358334585, "flos": 23925842409600.0, "grad_norm": 1.7701016185738538, "language_loss": 0.78398645, "learning_rate": 1.7048906580318544e-08, "loss": 0.80824995, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.19555664, "step": 15961, "time_per_iteration": 2.858928680419922 }, { "auxiliary_loss_clip": 0.01391831, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.23468435, "balance_loss_mlp": 1.01053667, "epoch": 0.9596873590861266, "flos": 17680610574720.0, "grad_norm": 1.8887806422533424, "language_loss": 0.76267207, "learning_rate": 1.699820008484698e-08, "loss": 0.78687668, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.1809082, "step": 15962, "time_per_iteration": 4.3035361766815186 }, { "auxiliary_loss_clip": 0.0140293, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.24148118, "balance_loss_mlp": 1.01207137, "epoch": 0.9597474823387945, "flos": 25818628412160.0, "grad_norm": 2.183847415857662, "language_loss": 0.72265053, "learning_rate": 1.6947568785002698e-08, "loss": 0.74699438, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19396973, "step": 15963, "time_per_iteration": 2.8858559131622314 }, { "auxiliary_loss_clip": 0.01374335, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.22170353, "balance_loss_mlp": 1.01383257, "epoch": 0.9598076055914625, "flos": 23778913985280.0, "grad_norm": 1.481896721075402, "language_loss": 0.75072271, "learning_rate": 1.689701268270527e-08, "loss": 0.77479917, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.19470215, "step": 15964, "time_per_iteration": 2.8512539863586426 }, { "auxiliary_loss_clip": 0.01178844, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.09113884, "balance_loss_mlp": 1.00671768, "epoch": 0.9598677288441305, "flos": 56539971642240.0, "grad_norm": 0.8710990217012085, "language_loss": 0.57619071, "learning_rate": 1.684653177987161e-08, "loss": 0.59826469, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.21875, "step": 15965, "time_per_iteration": 3.2855350971221924 }, { "auxiliary_loss_clip": 0.01403334, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.2432059, "balance_loss_mlp": 1.01525867, "epoch": 0.9599278520967984, "flos": 23006419948800.0, "grad_norm": 1.8572832731289972, "language_loss": 0.79721653, "learning_rate": 1.6796126078416627e-08, "loss": 0.82159007, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1875, "step": 15966, "time_per_iteration": 4.299907684326172 }, { "auxiliary_loss_clip": 0.01397244, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.23802936, "balance_loss_mlp": 1.01169825, "epoch": 0.9599879753494664, "flos": 23050108442880.0, "grad_norm": 1.5012240842705795, "language_loss": 0.80380636, "learning_rate": 1.674579558025102e-08, "loss": 0.82808292, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18737793, "step": 15967, "time_per_iteration": 2.852555513381958 }, { "auxiliary_loss_clip": 0.01404695, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.2426753, "balance_loss_mlp": 1.01508808, "epoch": 0.9600480986021344, "flos": 16399183317120.0, "grad_norm": 2.46502102878593, "language_loss": 0.81535822, "learning_rate": 1.669554028728348e-08, "loss": 0.83975893, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20288086, "step": 15968, "time_per_iteration": 2.7934648990631104 }, { "auxiliary_loss_clip": 0.01410795, "auxiliary_loss_mlp": 0.01038572, "balance_loss_clip": 1.24650359, "balance_loss_mlp": 1.01786506, "epoch": 0.9601082218548024, "flos": 24286218393600.0, "grad_norm": 2.2344310254419444, "language_loss": 0.6811446, "learning_rate": 1.6645360201420044e-08, "loss": 0.70563829, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 1.64453125, "router_z_loss_mlp": 0.20703125, "step": 15969, "time_per_iteration": 2.8926937580108643 }, { "auxiliary_loss_clip": 0.0140053, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.24131739, "balance_loss_mlp": 1.01419377, "epoch": 0.9601683451074703, "flos": 19619528290560.0, "grad_norm": 3.0912404825478172, "language_loss": 0.80122375, "learning_rate": 1.6595255324563186e-08, "loss": 0.82555854, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1875, "step": 15970, "time_per_iteration": 2.81402850151062 }, { "auxiliary_loss_clip": 0.01386909, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.23114932, "balance_loss_mlp": 1.01022863, "epoch": 0.9602284683601383, "flos": 26662301798400.0, "grad_norm": 1.4866225334588137, "language_loss": 0.78130984, "learning_rate": 1.654522565861316e-08, "loss": 0.80548555, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.20422363, "step": 15971, "time_per_iteration": 2.8932201862335205 }, { "auxiliary_loss_clip": 0.01400068, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.23644066, "balance_loss_mlp": 1.01592255, "epoch": 0.9602885916128062, "flos": 15561029796480.0, "grad_norm": 1.955736514098828, "language_loss": 0.68345261, "learning_rate": 1.64952712054669e-08, "loss": 0.70781463, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20227051, "step": 15972, "time_per_iteration": 2.82707142829895 }, { "auxiliary_loss_clip": 0.01397022, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.23848403, "balance_loss_mlp": 1.01387608, "epoch": 0.9603487148654742, "flos": 16509436191360.0, "grad_norm": 2.1569287544145817, "language_loss": 0.77221322, "learning_rate": 1.644539196701844e-08, "loss": 0.79650354, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18127441, "step": 15973, "time_per_iteration": 4.229241132736206 }, { "auxiliary_loss_clip": 0.014009, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.24378216, "balance_loss_mlp": 1.01514685, "epoch": 0.9604088381181421, "flos": 20853873694080.0, "grad_norm": 1.6048251477629363, "language_loss": 0.69508505, "learning_rate": 1.639558794515983e-08, "loss": 0.71943927, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19372559, "step": 15974, "time_per_iteration": 2.8519973754882812 }, { "auxiliary_loss_clip": 0.0140661, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.24479866, "balance_loss_mlp": 1.01158869, "epoch": 0.9604689613708102, "flos": 19692698411520.0, "grad_norm": 3.5610303733479025, "language_loss": 0.68930328, "learning_rate": 1.6345859141779105e-08, "loss": 0.71367848, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1932373, "step": 15975, "time_per_iteration": 4.398802042007446 }, { "auxiliary_loss_clip": 0.0137718, "auxiliary_loss_mlp": 0.01028685, "balance_loss_clip": 1.22415066, "balance_loss_mlp": 1.01025486, "epoch": 0.9605290846234781, "flos": 24108224774400.0, "grad_norm": 2.5586393148955056, "language_loss": 0.57039666, "learning_rate": 1.6296205558762322e-08, "loss": 0.59445536, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.18432617, "step": 15976, "time_per_iteration": 2.85956072807312 }, { "auxiliary_loss_clip": 0.01382521, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.22730196, "balance_loss_mlp": 1.0096637, "epoch": 0.9605892078761461, "flos": 27133156880640.0, "grad_norm": 1.641260426351821, "language_loss": 0.68752092, "learning_rate": 1.624662719799219e-08, "loss": 0.71163082, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.18798828, "step": 15977, "time_per_iteration": 2.986905336380005 }, { "auxiliary_loss_clip": 0.01392457, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.23271763, "balance_loss_mlp": 1.01504529, "epoch": 0.9606493311288141, "flos": 14144980700160.0, "grad_norm": 1.8260555952152562, "language_loss": 0.82995707, "learning_rate": 1.6197124061348766e-08, "loss": 0.8542285, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1965332, "step": 15978, "time_per_iteration": 2.9257116317749023 }, { "auxiliary_loss_clip": 0.01406212, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.24303102, "balance_loss_mlp": 1.01325703, "epoch": 0.960709454381482, "flos": 15821332986240.0, "grad_norm": 2.1624252069353407, "language_loss": 0.84053147, "learning_rate": 1.614769615070921e-08, "loss": 0.86493099, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.20483398, "step": 15979, "time_per_iteration": 2.9144721031188965 }, { "auxiliary_loss_clip": 0.01406714, "auxiliary_loss_mlp": 0.010344, "balance_loss_clip": 1.24407423, "balance_loss_mlp": 1.01596999, "epoch": 0.96076957763415, "flos": 22575724266240.0, "grad_norm": 4.470369402352826, "language_loss": 0.79661286, "learning_rate": 1.6098343467947805e-08, "loss": 0.821024, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.1842041, "step": 15980, "time_per_iteration": 2.9412145614624023 }, { "auxiliary_loss_clip": 0.01403375, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.2410351, "balance_loss_mlp": 1.0165782, "epoch": 0.960829700886818, "flos": 24692047418880.0, "grad_norm": 1.9901399290245663, "language_loss": 0.68785155, "learning_rate": 1.6049066014935942e-08, "loss": 0.71223855, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18762207, "step": 15981, "time_per_iteration": 2.8771586418151855 }, { "auxiliary_loss_clip": 0.01390632, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.23234844, "balance_loss_mlp": 1.01356411, "epoch": 0.960889824139486, "flos": 26553542002560.0, "grad_norm": 1.526048710064427, "language_loss": 0.69958884, "learning_rate": 1.5999863793542344e-08, "loss": 0.72382152, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.19067383, "step": 15982, "time_per_iteration": 2.9642622470855713 }, { "auxiliary_loss_clip": 0.01180969, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.09285784, "balance_loss_mlp": 1.00811791, "epoch": 0.9609499473921539, "flos": 71143953287040.0, "grad_norm": 0.6672956639404675, "language_loss": 0.53278017, "learning_rate": 1.595073680563286e-08, "loss": 0.55488181, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.2109375, "step": 15983, "time_per_iteration": 3.4245212078094482 }, { "auxiliary_loss_clip": 0.01396644, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.23756957, "balance_loss_mlp": 1.01689887, "epoch": 0.9610100706448219, "flos": 20560650272640.0, "grad_norm": 2.410711465134529, "language_loss": 0.69204432, "learning_rate": 1.5901685053070212e-08, "loss": 0.71637022, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19042969, "step": 15984, "time_per_iteration": 2.8337159156799316 }, { "auxiliary_loss_clip": 0.01377243, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.22361732, "balance_loss_mlp": 1.01436234, "epoch": 0.9610701938974898, "flos": 14072308272000.0, "grad_norm": 1.5827946436066016, "language_loss": 0.68228281, "learning_rate": 1.5852708537714477e-08, "loss": 0.70639247, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.19372559, "step": 15985, "time_per_iteration": 2.9058902263641357 }, { "auxiliary_loss_clip": 0.01407051, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.24598026, "balance_loss_mlp": 1.015908, "epoch": 0.9611303171501578, "flos": 20239619281920.0, "grad_norm": 1.7758498990169005, "language_loss": 0.79142952, "learning_rate": 1.580380726142283e-08, "loss": 0.81584656, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1875, "step": 15986, "time_per_iteration": 2.8711938858032227 }, { "auxiliary_loss_clip": 0.01399686, "auxiliary_loss_mlp": 0.01037644, "balance_loss_clip": 1.24084091, "balance_loss_mlp": 1.01713967, "epoch": 0.9611904404028258, "flos": 20958651947520.0, "grad_norm": 2.03413228771132, "language_loss": 0.64612526, "learning_rate": 1.5754981226049792e-08, "loss": 0.67049855, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.20507812, "step": 15987, "time_per_iteration": 2.8308238983154297 }, { "auxiliary_loss_clip": 0.01384306, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.22988915, "balance_loss_mlp": 1.01591897, "epoch": 0.9612505636554938, "flos": 24838704374400.0, "grad_norm": 1.7213289184450784, "language_loss": 0.67827511, "learning_rate": 1.5706230433446544e-08, "loss": 0.70245451, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.17724609, "step": 15988, "time_per_iteration": 2.866257667541504 }, { "auxiliary_loss_clip": 0.01400171, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.24018681, "balance_loss_mlp": 1.01660156, "epoch": 0.9613106869081617, "flos": 17173758614400.0, "grad_norm": 1.7861436296946636, "language_loss": 0.74731576, "learning_rate": 1.5657554885462055e-08, "loss": 0.77166504, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18151855, "step": 15989, "time_per_iteration": 2.809638500213623 }, { "auxiliary_loss_clip": 0.01182175, "auxiliary_loss_mlp": 0.01037227, "balance_loss_clip": 1.09305155, "balance_loss_mlp": 1.01281261, "epoch": 0.9613708101608297, "flos": 61592465306880.0, "grad_norm": 0.8042047945905217, "language_loss": 0.63164431, "learning_rate": 1.5608954583941737e-08, "loss": 0.65383834, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.24414062, "step": 15990, "time_per_iteration": 3.2551937103271484 }, { "auxiliary_loss_clip": 0.01398663, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.23808396, "balance_loss_mlp": 1.01303911, "epoch": 0.9614309334134977, "flos": 27429456948480.0, "grad_norm": 2.227518834361908, "language_loss": 0.78560829, "learning_rate": 1.5560429530729003e-08, "loss": 0.80990243, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.17724609, "step": 15991, "time_per_iteration": 2.9074718952178955 }, { "auxiliary_loss_clip": 0.01406594, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.24179506, "balance_loss_mlp": 1.01455855, "epoch": 0.9614910566661656, "flos": 22829105001600.0, "grad_norm": 2.3416633439933277, "language_loss": 0.8637175, "learning_rate": 1.5511979727663493e-08, "loss": 0.88813078, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20178223, "step": 15992, "time_per_iteration": 2.9261627197265625 }, { "auxiliary_loss_clip": 0.01400616, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.240273, "balance_loss_mlp": 1.01380658, "epoch": 0.9615511799188337, "flos": 20677554132480.0, "grad_norm": 2.2591688381544666, "language_loss": 0.73345906, "learning_rate": 1.5463605176582406e-08, "loss": 0.75779235, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18908691, "step": 15993, "time_per_iteration": 2.841919422149658 }, { "auxiliary_loss_clip": 0.01406368, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.24461031, "balance_loss_mlp": 1.01586986, "epoch": 0.9616113031715016, "flos": 33163040874240.0, "grad_norm": 1.5413374031714386, "language_loss": 0.68559992, "learning_rate": 1.5415305879320716e-08, "loss": 0.71000946, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18713379, "step": 15994, "time_per_iteration": 2.9672930240631104 }, { "auxiliary_loss_clip": 0.01396994, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.23891163, "balance_loss_mlp": 1.0149858, "epoch": 0.9616714264241696, "flos": 25020724780800.0, "grad_norm": 5.098119938757766, "language_loss": 0.84931862, "learning_rate": 1.5367081837709183e-08, "loss": 0.8736307, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19226074, "step": 15995, "time_per_iteration": 2.8672475814819336 }, { "auxiliary_loss_clip": 0.01419722, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.25600839, "balance_loss_mlp": 1.01459241, "epoch": 0.9617315496768375, "flos": 13553918887680.0, "grad_norm": 1.9063716280793492, "language_loss": 0.76525986, "learning_rate": 1.5318933053576788e-08, "loss": 0.78979254, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.1895752, "step": 15996, "time_per_iteration": 2.910810708999634 }, { "auxiliary_loss_clip": 0.01395587, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.23539066, "balance_loss_mlp": 1.01290536, "epoch": 0.9617916729295055, "flos": 11261819111040.0, "grad_norm": 24.072643676263425, "language_loss": 0.78075838, "learning_rate": 1.52708595287494e-08, "loss": 0.80503511, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19177246, "step": 15997, "time_per_iteration": 4.235259294509888 }, { "auxiliary_loss_clip": 0.01383344, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.22746539, "balance_loss_mlp": 1.01241648, "epoch": 0.9618517961821734, "flos": 22829828918400.0, "grad_norm": 1.6169604607343846, "language_loss": 0.67969465, "learning_rate": 1.522286126505001e-08, "loss": 0.70383763, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18554688, "step": 15998, "time_per_iteration": 2.8584611415863037 }, { "auxiliary_loss_clip": 0.01390346, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.23241305, "balance_loss_mlp": 1.01058793, "epoch": 0.9619119194348414, "flos": 16625887603200.0, "grad_norm": 1.6734145679292511, "language_loss": 0.73078656, "learning_rate": 1.5174938264298498e-08, "loss": 0.75498378, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18798828, "step": 15999, "time_per_iteration": 2.959156036376953 }, { "auxiliary_loss_clip": 0.01385244, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.22968173, "balance_loss_mlp": 1.01416612, "epoch": 0.9619720426875094, "flos": 24545842911360.0, "grad_norm": 1.6895435310920486, "language_loss": 0.65791494, "learning_rate": 1.5127090528312514e-08, "loss": 0.68208766, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.1784668, "step": 16000, "time_per_iteration": 2.951401948928833 }, { "auxiliary_loss_clip": 0.01402939, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.24252486, "balance_loss_mlp": 1.01350641, "epoch": 0.9620321659401774, "flos": 20642190681600.0, "grad_norm": 1.5221211518010416, "language_loss": 0.75490326, "learning_rate": 1.5079318058905723e-08, "loss": 0.77925861, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19091797, "step": 16001, "time_per_iteration": 4.2572386264801025 }, { "auxiliary_loss_clip": 0.0139844, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.23865616, "balance_loss_mlp": 1.01550484, "epoch": 0.9620922891928453, "flos": 18524510184960.0, "grad_norm": 1.7311813682608408, "language_loss": 0.68933237, "learning_rate": 1.5031620857890447e-08, "loss": 0.71366072, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18884277, "step": 16002, "time_per_iteration": 2.9166886806488037 }, { "auxiliary_loss_clip": 0.01389924, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.23237109, "balance_loss_mlp": 1.01531625, "epoch": 0.9621524124455133, "flos": 28779167888640.0, "grad_norm": 1.2774157547709442, "language_loss": 0.65430784, "learning_rate": 1.4983998927074804e-08, "loss": 0.67855561, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.1953125, "step": 16003, "time_per_iteration": 2.9828546047210693 }, { "auxiliary_loss_clip": 0.01405118, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.24460697, "balance_loss_mlp": 1.01610422, "epoch": 0.9622125356981813, "flos": 19108197095040.0, "grad_norm": 3.9925021330368673, "language_loss": 0.76384306, "learning_rate": 1.493645226826512e-08, "loss": 0.78824556, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19018555, "step": 16004, "time_per_iteration": 2.894174098968506 }, { "auxiliary_loss_clip": 0.01392852, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.23506188, "balance_loss_mlp": 1.0123477, "epoch": 0.9622726589508492, "flos": 20312427444480.0, "grad_norm": 2.53832814683334, "language_loss": 0.80449772, "learning_rate": 1.4888980883263958e-08, "loss": 0.82873833, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18859863, "step": 16005, "time_per_iteration": 2.81652569770813 }, { "auxiliary_loss_clip": 0.01391363, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.23442054, "balance_loss_mlp": 1.01188827, "epoch": 0.9623327822035173, "flos": 54948851856000.0, "grad_norm": 2.4166621132818884, "language_loss": 0.68462819, "learning_rate": 1.4841584773871652e-08, "loss": 0.70884842, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18786621, "step": 16006, "time_per_iteration": 3.1488258838653564 }, { "auxiliary_loss_clip": 0.01385694, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.23150992, "balance_loss_mlp": 1.01646543, "epoch": 0.9623929054561852, "flos": 21768183492480.0, "grad_norm": 2.143214790351003, "language_loss": 0.78400731, "learning_rate": 1.479426394188521e-08, "loss": 0.80820847, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.17956543, "step": 16007, "time_per_iteration": 2.8847358226776123 }, { "auxiliary_loss_clip": 0.0141274, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.25064039, "balance_loss_mlp": 1.01393592, "epoch": 0.9624530287088532, "flos": 17940642295680.0, "grad_norm": 1.9084261105175964, "language_loss": 0.68418062, "learning_rate": 1.4747018389099198e-08, "loss": 0.70865321, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20581055, "step": 16008, "time_per_iteration": 4.1960060596466064 }, { "auxiliary_loss_clip": 0.01398112, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.23739696, "balance_loss_mlp": 1.01583171, "epoch": 0.9625131519615211, "flos": 23262967820160.0, "grad_norm": 2.647211146549221, "language_loss": 0.74487531, "learning_rate": 1.469984811730529e-08, "loss": 0.76920545, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19067383, "step": 16009, "time_per_iteration": 2.859495162963867 }, { "auxiliary_loss_clip": 0.01396088, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 1.23799062, "balance_loss_mlp": 1.01267838, "epoch": 0.9625732752141891, "flos": 18925950464640.0, "grad_norm": 1.8007479504161117, "language_loss": 0.76022148, "learning_rate": 1.4652753128292061e-08, "loss": 0.78448808, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.17883301, "step": 16010, "time_per_iteration": 4.194318056106567 }, { "auxiliary_loss_clip": 0.01422641, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.2565341, "balance_loss_mlp": 1.01853991, "epoch": 0.962633398466857, "flos": 16261484832000.0, "grad_norm": 1.956551726351416, "language_loss": 0.70148027, "learning_rate": 1.4605733423845635e-08, "loss": 0.72610593, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.21374512, "step": 16011, "time_per_iteration": 2.83772611618042 }, { "auxiliary_loss_clip": 0.01392448, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.23439968, "balance_loss_mlp": 1.01398087, "epoch": 0.962693521719525, "flos": 54215657568000.0, "grad_norm": 1.74628570331771, "language_loss": 0.69141024, "learning_rate": 1.4558789005748585e-08, "loss": 0.71565658, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18200684, "step": 16012, "time_per_iteration": 3.1629652976989746 }, { "auxiliary_loss_clip": 0.01416274, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.25133395, "balance_loss_mlp": 1.01557338, "epoch": 0.962753644972193, "flos": 33117814056960.0, "grad_norm": 2.096593727016843, "language_loss": 0.73315084, "learning_rate": 1.4511919875781264e-08, "loss": 0.75767541, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.20605469, "step": 16013, "time_per_iteration": 2.9411020278930664 }, { "auxiliary_loss_clip": 0.01398287, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.23883057, "balance_loss_mlp": 1.01046705, "epoch": 0.962813768224861, "flos": 42245936012160.0, "grad_norm": 2.105317744420225, "language_loss": 0.65139604, "learning_rate": 1.4465126035720698e-08, "loss": 0.67567635, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19287109, "step": 16014, "time_per_iteration": 3.0462684631347656 }, { "auxiliary_loss_clip": 0.01379889, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.22626615, "balance_loss_mlp": 1.01566958, "epoch": 0.9628738914775289, "flos": 43960185457920.0, "grad_norm": 1.491104231154019, "language_loss": 0.72766364, "learning_rate": 1.4418407487341688e-08, "loss": 0.75179678, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.17749023, "step": 16015, "time_per_iteration": 3.030846118927002 }, { "auxiliary_loss_clip": 0.01391022, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 1.23265839, "balance_loss_mlp": 1.00976014, "epoch": 0.9629340147301969, "flos": 15604582556160.0, "grad_norm": 2.1645476463263744, "language_loss": 0.78356373, "learning_rate": 1.4371764232415707e-08, "loss": 0.80775708, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.1854248, "step": 16016, "time_per_iteration": 2.8288121223449707 }, { "auxiliary_loss_clip": 0.01183048, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.09356427, "balance_loss_mlp": 1.00533283, "epoch": 0.9629941379828649, "flos": 62980209141120.0, "grad_norm": 0.8192636355727563, "language_loss": 0.63149297, "learning_rate": 1.4325196272711337e-08, "loss": 0.65361238, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.23535156, "step": 16017, "time_per_iteration": 3.245300769805908 }, { "auxiliary_loss_clip": 0.01400219, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.24014378, "balance_loss_mlp": 1.01290536, "epoch": 0.9630542612355328, "flos": 29911359237120.0, "grad_norm": 1.8683551748974374, "language_loss": 0.67502987, "learning_rate": 1.4278703609994502e-08, "loss": 0.6993463, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1854248, "step": 16018, "time_per_iteration": 2.911592960357666 }, { "auxiliary_loss_clip": 0.01404479, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.24415267, "balance_loss_mlp": 1.01931739, "epoch": 0.9631143844882009, "flos": 17903514297600.0, "grad_norm": 2.2267148029316743, "language_loss": 0.80340934, "learning_rate": 1.4232286246028457e-08, "loss": 0.82783335, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.1862793, "step": 16019, "time_per_iteration": 2.8536434173583984 }, { "auxiliary_loss_clip": 0.01383649, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.22733104, "balance_loss_mlp": 1.01226878, "epoch": 0.9631745077408688, "flos": 26149929972480.0, "grad_norm": 1.3892395474262773, "language_loss": 0.71898079, "learning_rate": 1.4185944182572907e-08, "loss": 0.74312758, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.1875, "step": 16020, "time_per_iteration": 2.8727340698242188 }, { "auxiliary_loss_clip": 0.01397817, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.23729444, "balance_loss_mlp": 1.01166463, "epoch": 0.9632346309935368, "flos": 24984637413120.0, "grad_norm": 24.189682681961685, "language_loss": 0.78207594, "learning_rate": 1.4139677421385331e-08, "loss": 0.8063482, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.17749023, "step": 16021, "time_per_iteration": 2.8867135047912598 }, { "auxiliary_loss_clip": 0.0142381, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.25880527, "balance_loss_mlp": 1.01658392, "epoch": 0.9632947542462047, "flos": 23626194226560.0, "grad_norm": 1.8261671278683849, "language_loss": 0.66148818, "learning_rate": 1.4093485964220331e-08, "loss": 0.68608785, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.19567871, "step": 16022, "time_per_iteration": 2.8831191062927246 }, { "auxiliary_loss_clip": 0.01392211, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.23591518, "balance_loss_mlp": 1.01710725, "epoch": 0.9633548774988727, "flos": 26406704067840.0, "grad_norm": 2.1038859693408676, "language_loss": 0.74477017, "learning_rate": 1.4047369812829168e-08, "loss": 0.76903856, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.17529297, "step": 16023, "time_per_iteration": 2.867810010910034 }, { "auxiliary_loss_clip": 0.01384859, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.22798777, "balance_loss_mlp": 1.01033473, "epoch": 0.9634150007515406, "flos": 23777918599680.0, "grad_norm": 1.4493365009877741, "language_loss": 0.82034123, "learning_rate": 1.4001328968960891e-08, "loss": 0.84448177, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18859863, "step": 16024, "time_per_iteration": 2.852630138397217 }, { "auxiliary_loss_clip": 0.01416079, "auxiliary_loss_mlp": 0.01032527, "balance_loss_clip": 1.25140285, "balance_loss_mlp": 1.01284564, "epoch": 0.9634751240042086, "flos": 24146348158080.0, "grad_norm": 1.364178515386651, "language_loss": 0.81780016, "learning_rate": 1.3955363434361212e-08, "loss": 0.84228623, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19665527, "step": 16025, "time_per_iteration": 2.862863063812256 }, { "auxiliary_loss_clip": 0.01410437, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.24749243, "balance_loss_mlp": 1.01366174, "epoch": 0.9635352472568766, "flos": 24358800332160.0, "grad_norm": 1.711063748084618, "language_loss": 0.77226341, "learning_rate": 1.3909473210773181e-08, "loss": 0.79669666, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19226074, "step": 16026, "time_per_iteration": 2.888176918029785 }, { "auxiliary_loss_clip": 0.0139464, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.23463392, "balance_loss_mlp": 1.0165273, "epoch": 0.9635953705095446, "flos": 23994533295360.0, "grad_norm": 3.3421535207488295, "language_loss": 0.63884413, "learning_rate": 1.3863658299936965e-08, "loss": 0.66314679, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.1907959, "step": 16027, "time_per_iteration": 2.876574993133545 }, { "auxiliary_loss_clip": 0.01406912, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.24527872, "balance_loss_mlp": 1.01456654, "epoch": 0.9636554937622125, "flos": 19837907533440.0, "grad_norm": 2.9488264736675465, "language_loss": 0.87677842, "learning_rate": 1.3817918703589837e-08, "loss": 0.90117639, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18322754, "step": 16028, "time_per_iteration": 2.9124183654785156 }, { "auxiliary_loss_clip": 0.01183832, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.09450841, "balance_loss_mlp": 1.01423502, "epoch": 0.9637156170148805, "flos": 67465014572160.0, "grad_norm": 0.7052144500943459, "language_loss": 0.53126049, "learning_rate": 1.3772254423466412e-08, "loss": 0.55342335, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.18261719, "step": 16029, "time_per_iteration": 3.3107709884643555 }, { "auxiliary_loss_clip": 0.01400981, "auxiliary_loss_mlp": 0.01030966, "balance_loss_clip": 1.23931754, "balance_loss_mlp": 1.01264381, "epoch": 0.9637757402675484, "flos": 20310029470080.0, "grad_norm": 1.6249489933846686, "language_loss": 0.74431372, "learning_rate": 1.372666546129797e-08, "loss": 0.76863319, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18310547, "step": 16030, "time_per_iteration": 2.8293304443359375 }, { "auxiliary_loss_clip": 0.01389369, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.23320341, "balance_loss_mlp": 1.01349413, "epoch": 0.9638358635202164, "flos": 27245807729280.0, "grad_norm": 2.093787792652627, "language_loss": 0.66680384, "learning_rate": 1.3681151818813575e-08, "loss": 0.69101155, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.17907715, "step": 16031, "time_per_iteration": 2.992018699645996 }, { "auxiliary_loss_clip": 0.01176513, "auxiliary_loss_mlp": 0.01028731, "balance_loss_clip": 1.09013486, "balance_loss_mlp": 1.00584245, "epoch": 0.9638959867728845, "flos": 70321228243200.0, "grad_norm": 0.8371232373477131, "language_loss": 0.60727292, "learning_rate": 1.3635713497738955e-08, "loss": 0.62932533, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.22851562, "step": 16032, "time_per_iteration": 3.3319199085235596 }, { "auxiliary_loss_clip": 0.01374783, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.22204709, "balance_loss_mlp": 1.01520848, "epoch": 0.9639561100255524, "flos": 25417595335680.0, "grad_norm": 1.6691500056667177, "language_loss": 0.67084432, "learning_rate": 1.3590350499796954e-08, "loss": 0.6949265, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.18225098, "step": 16033, "time_per_iteration": 4.330109119415283 }, { "auxiliary_loss_clip": 0.01397, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.23902178, "balance_loss_mlp": 1.01143217, "epoch": 0.9640162332782204, "flos": 18122979415680.0, "grad_norm": 1.8773509454065906, "language_loss": 0.66662908, "learning_rate": 1.3545062826707976e-08, "loss": 0.69090843, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.19494629, "step": 16034, "time_per_iteration": 2.865243434906006 }, { "auxiliary_loss_clip": 0.01392839, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.23549557, "balance_loss_mlp": 1.01493144, "epoch": 0.9640763565308883, "flos": 23450643826560.0, "grad_norm": 2.027984328479228, "language_loss": 0.75185227, "learning_rate": 1.3499850480189313e-08, "loss": 0.77611661, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18664551, "step": 16035, "time_per_iteration": 2.866504669189453 }, { "auxiliary_loss_clip": 0.01406614, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.24893212, "balance_loss_mlp": 1.01516831, "epoch": 0.9641364797835563, "flos": 22429610248320.0, "grad_norm": 2.1962156040267815, "language_loss": 0.82991862, "learning_rate": 1.3454713461955591e-08, "loss": 0.85432708, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19067383, "step": 16036, "time_per_iteration": 4.267319202423096 }, { "auxiliary_loss_clip": 0.01397198, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.23895216, "balance_loss_mlp": 1.01148963, "epoch": 0.9641966030362242, "flos": 30633106590720.0, "grad_norm": 1.8232519018781073, "language_loss": 0.70932561, "learning_rate": 1.340965177371789e-08, "loss": 0.73359668, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.1842041, "step": 16037, "time_per_iteration": 2.8957197666168213 }, { "auxiliary_loss_clip": 0.01404201, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.24238455, "balance_loss_mlp": 1.01222074, "epoch": 0.9642567262888923, "flos": 20961230901120.0, "grad_norm": 1.5938753831062864, "language_loss": 0.63400072, "learning_rate": 1.3364665417185506e-08, "loss": 0.65835673, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19177246, "step": 16038, "time_per_iteration": 2.889617443084717 }, { "auxiliary_loss_clip": 0.01418694, "auxiliary_loss_mlp": 0.01035539, "balance_loss_clip": 1.2559526, "balance_loss_mlp": 1.01749015, "epoch": 0.9643168495415602, "flos": 22650025507200.0, "grad_norm": 1.7415228426359832, "language_loss": 0.71563911, "learning_rate": 1.3319754394064187e-08, "loss": 0.74018139, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18054199, "step": 16039, "time_per_iteration": 2.930497884750366 }, { "auxiliary_loss_clip": 0.01395917, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.236606, "balance_loss_mlp": 1.00923526, "epoch": 0.9643769727942282, "flos": 20275842384000.0, "grad_norm": 2.1608363944778404, "language_loss": 0.74465489, "learning_rate": 1.327491870605657e-08, "loss": 0.76889682, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19042969, "step": 16040, "time_per_iteration": 2.8745458126068115 }, { "auxiliary_loss_clip": 0.01405566, "auxiliary_loss_mlp": 0.01035087, "balance_loss_clip": 1.24380219, "balance_loss_mlp": 1.01682377, "epoch": 0.9644370960468961, "flos": 13889925907200.0, "grad_norm": 2.321486562992873, "language_loss": 0.74187231, "learning_rate": 1.3230158354863296e-08, "loss": 0.7662788, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18273926, "step": 16041, "time_per_iteration": 2.8038291931152344 }, { "auxiliary_loss_clip": 0.01382062, "auxiliary_loss_mlp": 0.01031056, "balance_loss_clip": 1.22944748, "balance_loss_mlp": 1.01242328, "epoch": 0.9644972192995641, "flos": 17247064469760.0, "grad_norm": 3.07677314917852, "language_loss": 0.72835428, "learning_rate": 1.3185473342181674e-08, "loss": 0.75248545, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.18640137, "step": 16042, "time_per_iteration": 2.8355727195739746 }, { "auxiliary_loss_clip": 0.01397532, "auxiliary_loss_mlp": 0.01028629, "balance_loss_clip": 1.23556674, "balance_loss_mlp": 1.0117017, "epoch": 0.964557342552232, "flos": 23850455293440.0, "grad_norm": 1.7072749632147377, "language_loss": 0.81947196, "learning_rate": 1.3140863669705683e-08, "loss": 0.84373361, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.16931152, "step": 16043, "time_per_iteration": 2.848682165145874 }, { "auxiliary_loss_clip": 0.0140122, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.24226213, "balance_loss_mlp": 1.01180303, "epoch": 0.9646174658049, "flos": 21663179015040.0, "grad_norm": 1.5557709309004166, "language_loss": 0.72031879, "learning_rate": 1.3096329339127522e-08, "loss": 0.74462843, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.17956543, "step": 16044, "time_per_iteration": 4.191645383834839 }, { "auxiliary_loss_clip": 0.01394936, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.23714876, "balance_loss_mlp": 1.01092708, "epoch": 0.9646775890575681, "flos": 17138621387520.0, "grad_norm": 1.8663212364405923, "language_loss": 0.70350218, "learning_rate": 1.3051870352135397e-08, "loss": 0.7277503, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.1895752, "step": 16045, "time_per_iteration": 4.241312026977539 }, { "auxiliary_loss_clip": 0.01398139, "auxiliary_loss_mlp": 0.0103299, "balance_loss_clip": 1.23923552, "balance_loss_mlp": 1.01417875, "epoch": 0.964737712310236, "flos": 13013468023680.0, "grad_norm": 2.2766953623574255, "language_loss": 0.76677215, "learning_rate": 1.3007486710415737e-08, "loss": 0.79108346, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18823242, "step": 16046, "time_per_iteration": 2.812091827392578 }, { "auxiliary_loss_clip": 0.01404237, "auxiliary_loss_mlp": 0.01032184, "balance_loss_clip": 1.24176741, "balance_loss_mlp": 1.01293159, "epoch": 0.964797835562904, "flos": 24289747488000.0, "grad_norm": 1.7060753024147461, "language_loss": 0.63180441, "learning_rate": 1.2963178415651199e-08, "loss": 0.65616864, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19250488, "step": 16047, "time_per_iteration": 2.8381309509277344 }, { "auxiliary_loss_clip": 0.01397387, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.23840761, "balance_loss_mlp": 1.01763296, "epoch": 0.9648579588155719, "flos": 20532616479360.0, "grad_norm": 2.4484657861196175, "language_loss": 0.70019633, "learning_rate": 1.2918945469521992e-08, "loss": 0.72453713, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19067383, "step": 16048, "time_per_iteration": 2.8083038330078125 }, { "auxiliary_loss_clip": 0.01403948, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.24228299, "balance_loss_mlp": 1.01243019, "epoch": 0.9649180820682399, "flos": 32166014302080.0, "grad_norm": 1.7247879888080815, "language_loss": 0.64505494, "learning_rate": 1.2874787873705662e-08, "loss": 0.6694144, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19567871, "step": 16049, "time_per_iteration": 2.892364263534546 }, { "auxiliary_loss_clip": 0.01410476, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 1.25082731, "balance_loss_mlp": 1.01394558, "epoch": 0.9649782053209078, "flos": 20532480744960.0, "grad_norm": 1.6338836085244222, "language_loss": 0.71921837, "learning_rate": 1.2830705629876427e-08, "loss": 0.7436502, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18762207, "step": 16050, "time_per_iteration": 2.862340211868286 }, { "auxiliary_loss_clip": 0.01414291, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.25006008, "balance_loss_mlp": 1.01211905, "epoch": 0.9650383285735759, "flos": 43082958412800.0, "grad_norm": 2.374355767803438, "language_loss": 0.70185375, "learning_rate": 1.278669873970606e-08, "loss": 0.72631001, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19226074, "step": 16051, "time_per_iteration": 3.0700838565826416 }, { "auxiliary_loss_clip": 0.01180881, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.09315455, "balance_loss_mlp": 1.0027163, "epoch": 0.9650984518262438, "flos": 61777471870080.0, "grad_norm": 0.8511159233821218, "language_loss": 0.5909369, "learning_rate": 1.2742767204863004e-08, "loss": 0.61304373, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.27148438, "step": 16052, "time_per_iteration": 3.4554381370544434 }, { "auxiliary_loss_clip": 0.01396351, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.23880386, "balance_loss_mlp": 1.01226854, "epoch": 0.9651585750789118, "flos": 29801468321280.0, "grad_norm": 1.5560824135024054, "language_loss": 0.75258124, "learning_rate": 1.2698911027013482e-08, "loss": 0.7768572, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18981934, "step": 16053, "time_per_iteration": 2.963743209838867 }, { "auxiliary_loss_clip": 0.01411749, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.2507689, "balance_loss_mlp": 1.01371741, "epoch": 0.9652186983315797, "flos": 16881756802560.0, "grad_norm": 3.049320886176013, "language_loss": 0.68906248, "learning_rate": 1.2655130207820386e-08, "loss": 0.7135098, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.19262695, "step": 16054, "time_per_iteration": 2.818467140197754 }, { "auxiliary_loss_clip": 0.01395446, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.23772156, "balance_loss_mlp": 1.01337147, "epoch": 0.9652788215842477, "flos": 31662284232960.0, "grad_norm": 1.5507393783315537, "language_loss": 0.63151813, "learning_rate": 1.2611424748943944e-08, "loss": 0.655788, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1817627, "step": 16055, "time_per_iteration": 2.948437452316284 }, { "auxiliary_loss_clip": 0.01397028, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.24019289, "balance_loss_mlp": 1.01325893, "epoch": 0.9653389448369156, "flos": 24765081805440.0, "grad_norm": 1.9315706217738529, "language_loss": 0.77519083, "learning_rate": 1.2567794652041719e-08, "loss": 0.79947907, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.1854248, "step": 16056, "time_per_iteration": 2.868941068649292 }, { "auxiliary_loss_clip": 0.01396874, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.23691416, "balance_loss_mlp": 1.0155108, "epoch": 0.9653990680895836, "flos": 20305414500480.0, "grad_norm": 2.1749772143858124, "language_loss": 0.72682083, "learning_rate": 1.2524239918767498e-08, "loss": 0.75113201, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.1875, "step": 16057, "time_per_iteration": 2.8377623558044434 }, { "auxiliary_loss_clip": 0.01403132, "auxiliary_loss_mlp": 0.01033764, "balance_loss_clip": 1.24479079, "balance_loss_mlp": 1.01564407, "epoch": 0.9654591913422517, "flos": 22539048716160.0, "grad_norm": 2.1404854692958617, "language_loss": 0.72196496, "learning_rate": 1.2480760550773295e-08, "loss": 0.74633396, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18103027, "step": 16058, "time_per_iteration": 2.896942377090454 }, { "auxiliary_loss_clip": 0.01380624, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.22551751, "balance_loss_mlp": 1.01349044, "epoch": 0.9655193145949196, "flos": 26774862157440.0, "grad_norm": 1.4073953808185482, "language_loss": 0.74392498, "learning_rate": 1.2437356549708011e-08, "loss": 0.76804936, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18322754, "step": 16059, "time_per_iteration": 2.8779640197753906 }, { "auxiliary_loss_clip": 0.01412924, "auxiliary_loss_mlp": 0.01037993, "balance_loss_clip": 1.25029981, "balance_loss_mlp": 1.01981342, "epoch": 0.9655794378475876, "flos": 41984727926400.0, "grad_norm": 1.803246955502041, "language_loss": 0.7408154, "learning_rate": 1.239402791721722e-08, "loss": 0.76532459, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.1817627, "step": 16060, "time_per_iteration": 3.008495330810547 }, { "auxiliary_loss_clip": 0.01384403, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.23052382, "balance_loss_mlp": 1.01460266, "epoch": 0.9656395611002555, "flos": 27720961067520.0, "grad_norm": 1.881596441117576, "language_loss": 0.76975977, "learning_rate": 1.2350774654944273e-08, "loss": 0.79393423, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 1.53808594, "router_z_loss_mlp": 0.18444824, "step": 16061, "time_per_iteration": 2.927568197250366 }, { "auxiliary_loss_clip": 0.01185495, "auxiliary_loss_mlp": 0.01020096, "balance_loss_clip": 1.09585142, "balance_loss_mlp": 0.99644536, "epoch": 0.9656996843529235, "flos": 68998646200320.0, "grad_norm": 0.727033092487765, "language_loss": 0.64126086, "learning_rate": 1.2307596764528749e-08, "loss": 0.66331679, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.23632812, "step": 16062, "time_per_iteration": 3.408754348754883 }, { "auxiliary_loss_clip": 0.0137369, "auxiliary_loss_mlp": 0.01031112, "balance_loss_clip": 1.22000718, "balance_loss_mlp": 1.01262236, "epoch": 0.9657598076055914, "flos": 20641059561600.0, "grad_norm": 2.3631622365421867, "language_loss": 0.94178599, "learning_rate": 1.226449424760867e-08, "loss": 0.96583396, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.18493652, "step": 16063, "time_per_iteration": 2.8354737758636475 }, { "auxiliary_loss_clip": 0.01399106, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.23937798, "balance_loss_mlp": 1.01323009, "epoch": 0.9658199308582595, "flos": 20458179504000.0, "grad_norm": 1.690817006508775, "language_loss": 0.8282131, "learning_rate": 1.2221467105818062e-08, "loss": 0.85251749, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18127441, "step": 16064, "time_per_iteration": 2.7917416095733643 }, { "auxiliary_loss_clip": 0.01401103, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.24466538, "balance_loss_mlp": 1.01623976, "epoch": 0.9658800541109274, "flos": 24728903948160.0, "grad_norm": 1.567842774870903, "language_loss": 0.84299988, "learning_rate": 1.2178515340788731e-08, "loss": 0.86734819, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.17504883, "step": 16065, "time_per_iteration": 2.8806357383728027 }, { "auxiliary_loss_clip": 0.01393394, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.23435438, "balance_loss_mlp": 1.01355922, "epoch": 0.9659401773635954, "flos": 21618359400960.0, "grad_norm": 1.5445005614496579, "language_loss": 0.68508101, "learning_rate": 1.2135638954149151e-08, "loss": 0.70934951, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19897461, "step": 16066, "time_per_iteration": 2.8352322578430176 }, { "auxiliary_loss_clip": 0.01394899, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.23746753, "balance_loss_mlp": 1.01525831, "epoch": 0.9660003006162633, "flos": 20310391428480.0, "grad_norm": 1.7659189597314102, "language_loss": 0.82363594, "learning_rate": 1.209283794752558e-08, "loss": 0.8479265, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18884277, "step": 16067, "time_per_iteration": 4.2677271366119385 }, { "auxiliary_loss_clip": 0.013969, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.23878217, "balance_loss_mlp": 1.01696074, "epoch": 0.9660604238689313, "flos": 24472582300800.0, "grad_norm": 3.146586954365524, "language_loss": 0.70140421, "learning_rate": 1.2050112322540496e-08, "loss": 0.72572756, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18481445, "step": 16068, "time_per_iteration": 2.8764517307281494 }, { "auxiliary_loss_clip": 0.01367164, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.2166903, "balance_loss_mlp": 1.01377845, "epoch": 0.9661205471215992, "flos": 19873270984320.0, "grad_norm": 1.6426073384449833, "language_loss": 0.68628967, "learning_rate": 1.20074620808146e-08, "loss": 0.71027428, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 1.5078125, "router_z_loss_mlp": 0.1751709, "step": 16069, "time_per_iteration": 2.83490252494812 }, { "auxiliary_loss_clip": 0.01391617, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.23392177, "balance_loss_mlp": 1.01327729, "epoch": 0.9661806703742672, "flos": 20567482237440.0, "grad_norm": 1.7967375478551182, "language_loss": 0.89603812, "learning_rate": 1.1964887223964826e-08, "loss": 0.92027426, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18725586, "step": 16070, "time_per_iteration": 2.912762403488159 }, { "auxiliary_loss_clip": 0.01406899, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.24670386, "balance_loss_mlp": 1.01892972, "epoch": 0.9662407936269353, "flos": 21440094312960.0, "grad_norm": 2.866960382325993, "language_loss": 0.78016722, "learning_rate": 1.1922387753605878e-08, "loss": 0.8046174, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19213867, "step": 16071, "time_per_iteration": 4.310982704162598 }, { "auxiliary_loss_clip": 0.01396944, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.23986697, "balance_loss_mlp": 1.01804304, "epoch": 0.9663009168796032, "flos": 14911095219840.0, "grad_norm": 7.891129304577265, "language_loss": 0.66860777, "learning_rate": 1.1879963671349137e-08, "loss": 0.69296092, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.20324707, "step": 16072, "time_per_iteration": 2.8181912899017334 }, { "auxiliary_loss_clip": 0.01412186, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.25047922, "balance_loss_mlp": 1.01745343, "epoch": 0.9663610401322712, "flos": 24320134010880.0, "grad_norm": 3.5441121813183813, "language_loss": 0.78116304, "learning_rate": 1.1837614978803534e-08, "loss": 0.80564928, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19006348, "step": 16073, "time_per_iteration": 2.9015071392059326 }, { "auxiliary_loss_clip": 0.01414311, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.25173163, "balance_loss_mlp": 1.01953316, "epoch": 0.9664211633849391, "flos": 17646378243840.0, "grad_norm": 2.8967082570350016, "language_loss": 0.76206237, "learning_rate": 1.1795341677574677e-08, "loss": 0.78659713, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.19641113, "step": 16074, "time_per_iteration": 2.8201541900634766 }, { "auxiliary_loss_clip": 0.01398503, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.240448, "balance_loss_mlp": 1.01465559, "epoch": 0.9664812866376071, "flos": 29801558810880.0, "grad_norm": 5.190174682403722, "language_loss": 0.76045108, "learning_rate": 1.1753143769265728e-08, "loss": 0.78477675, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.1940918, "step": 16075, "time_per_iteration": 2.8968615531921387 }, { "auxiliary_loss_clip": 0.01401851, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.24146771, "balance_loss_mlp": 1.01240182, "epoch": 0.966541409890275, "flos": 14290280311680.0, "grad_norm": 2.6137444300275634, "language_loss": 0.79649419, "learning_rate": 1.171102125547696e-08, "loss": 0.82081664, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.17993164, "step": 16076, "time_per_iteration": 2.842754602432251 }, { "auxiliary_loss_clip": 0.01392258, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.23238659, "balance_loss_mlp": 1.01721978, "epoch": 0.9666015331429431, "flos": 19868746504320.0, "grad_norm": 1.6638666795591646, "language_loss": 0.72288322, "learning_rate": 1.166897413780532e-08, "loss": 0.74717414, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.19604492, "step": 16077, "time_per_iteration": 2.864861249923706 }, { "auxiliary_loss_clip": 0.01391806, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.23314345, "balance_loss_mlp": 1.01396728, "epoch": 0.966661656395611, "flos": 27137409891840.0, "grad_norm": 2.397937271229835, "language_loss": 0.60043252, "learning_rate": 1.1627002417845533e-08, "loss": 0.6246897, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19934082, "step": 16078, "time_per_iteration": 4.27131462097168 }, { "auxiliary_loss_clip": 0.01410467, "auxiliary_loss_mlp": 0.01035643, "balance_loss_clip": 1.24875784, "balance_loss_mlp": 1.01683211, "epoch": 0.966721779648279, "flos": 21518512830720.0, "grad_norm": 1.8714424519790533, "language_loss": 0.72992164, "learning_rate": 1.158510609718899e-08, "loss": 0.75438273, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.18798828, "step": 16079, "time_per_iteration": 2.8669281005859375 }, { "auxiliary_loss_clip": 0.01384069, "auxiliary_loss_mlp": 0.01030619, "balance_loss_clip": 1.22903657, "balance_loss_mlp": 1.01238036, "epoch": 0.9667819029009469, "flos": 23888578677120.0, "grad_norm": 1.6065279740342007, "language_loss": 0.72426254, "learning_rate": 1.1543285177424644e-08, "loss": 0.74840939, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18249512, "step": 16080, "time_per_iteration": 4.300507545471191 }, { "auxiliary_loss_clip": 0.01389961, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.23128414, "balance_loss_mlp": 1.01069069, "epoch": 0.9668420261536149, "flos": 21517245976320.0, "grad_norm": 1.9575334703326828, "language_loss": 0.74683875, "learning_rate": 1.1501539660138115e-08, "loss": 0.7710278, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18261719, "step": 16081, "time_per_iteration": 2.8782808780670166 }, { "auxiliary_loss_clip": 0.01394003, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.2350806, "balance_loss_mlp": 1.01180768, "epoch": 0.9669021494062828, "flos": 26698389166080.0, "grad_norm": 1.7711745995963897, "language_loss": 0.67920214, "learning_rate": 1.145986954691236e-08, "loss": 0.70345175, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.19165039, "step": 16082, "time_per_iteration": 2.9073731899261475 }, { "auxiliary_loss_clip": 0.0139184, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.23406208, "balance_loss_mlp": 1.00938284, "epoch": 0.9669622726589508, "flos": 29836967506560.0, "grad_norm": 1.5678568485635798, "language_loss": 0.77284157, "learning_rate": 1.141827483932789e-08, "loss": 0.79705149, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.19763184, "step": 16083, "time_per_iteration": 2.8845417499542236 }, { "auxiliary_loss_clip": 0.01402087, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.24211657, "balance_loss_mlp": 1.01668096, "epoch": 0.9670223959116189, "flos": 22930897098240.0, "grad_norm": 1.9185206390232092, "language_loss": 0.80190635, "learning_rate": 1.1376755538961669e-08, "loss": 0.82628369, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18969727, "step": 16084, "time_per_iteration": 2.92252254486084 }, { "auxiliary_loss_clip": 0.01403231, "auxiliary_loss_mlp": 0.01030454, "balance_loss_clip": 1.24102437, "balance_loss_mlp": 1.01195228, "epoch": 0.9670825191642868, "flos": 18633677184000.0, "grad_norm": 3.047341761112542, "language_loss": 0.69247562, "learning_rate": 1.1335311647387991e-08, "loss": 0.71681243, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18493652, "step": 16085, "time_per_iteration": 2.837934732437134 }, { "auxiliary_loss_clip": 0.01419796, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.25535727, "balance_loss_mlp": 1.01602435, "epoch": 0.9671426424169548, "flos": 24508443444480.0, "grad_norm": 1.794054034540556, "language_loss": 0.69164264, "learning_rate": 1.1293943166178709e-08, "loss": 0.71619362, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 1.64550781, "router_z_loss_mlp": 0.19274902, "step": 16086, "time_per_iteration": 2.9011118412017822 }, { "auxiliary_loss_clip": 0.01407803, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.2481966, "balance_loss_mlp": 1.01324487, "epoch": 0.9672027656696227, "flos": 20379625251840.0, "grad_norm": 2.1726257373206925, "language_loss": 0.7907986, "learning_rate": 1.125265009690235e-08, "loss": 0.81520903, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19970703, "step": 16087, "time_per_iteration": 2.8651180267333984 }, { "auxiliary_loss_clip": 0.01389397, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.23081517, "balance_loss_mlp": 1.01495337, "epoch": 0.9672628889222907, "flos": 18889501138560.0, "grad_norm": 1.9089972818311673, "language_loss": 0.72262824, "learning_rate": 1.1211432441124769e-08, "loss": 0.74684697, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.17529297, "step": 16088, "time_per_iteration": 2.8264026641845703 }, { "auxiliary_loss_clip": 0.01392544, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.23721957, "balance_loss_mlp": 1.01357532, "epoch": 0.9673230121749586, "flos": 28707219377280.0, "grad_norm": 1.4205636674899553, "language_loss": 0.7136848, "learning_rate": 1.117029020040916e-08, "loss": 0.73793197, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.1862793, "step": 16089, "time_per_iteration": 2.9089529514312744 }, { "auxiliary_loss_clip": 0.01409302, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.24848247, "balance_loss_mlp": 1.01310563, "epoch": 0.9673831354276267, "flos": 20493904913280.0, "grad_norm": 2.714491278621544, "language_loss": 0.75332355, "learning_rate": 1.1129223376315167e-08, "loss": 0.77773184, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18432617, "step": 16090, "time_per_iteration": 2.8405590057373047 }, { "auxiliary_loss_clip": 0.01417124, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.25173104, "balance_loss_mlp": 1.01081753, "epoch": 0.9674432586802946, "flos": 26808913509120.0, "grad_norm": 1.7070007585083067, "language_loss": 0.69380897, "learning_rate": 1.1088231970400653e-08, "loss": 0.71827734, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 1.65039062, "router_z_loss_mlp": 0.18908691, "step": 16091, "time_per_iteration": 2.8920960426330566 }, { "auxiliary_loss_clip": 0.01390514, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.23389161, "balance_loss_mlp": 1.01673734, "epoch": 0.9675033819329626, "flos": 22321438634880.0, "grad_norm": 2.4619670106862106, "language_loss": 0.77802789, "learning_rate": 1.1047315984219484e-08, "loss": 0.80230343, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.20300293, "step": 16092, "time_per_iteration": 2.927084445953369 }, { "auxiliary_loss_clip": 0.01391954, "auxiliary_loss_mlp": 0.01032287, "balance_loss_clip": 1.23437786, "balance_loss_mlp": 1.0143342, "epoch": 0.9675635051856305, "flos": 12682935624960.0, "grad_norm": 2.7871743926161265, "language_loss": 0.77777374, "learning_rate": 1.1006475419323313e-08, "loss": 0.8020162, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.1796875, "step": 16093, "time_per_iteration": 2.8511133193969727 }, { "auxiliary_loss_clip": 0.01394537, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.23539352, "balance_loss_mlp": 1.0098567, "epoch": 0.9676236284382985, "flos": 24619284501120.0, "grad_norm": 2.7401319165621247, "language_loss": 0.69869143, "learning_rate": 1.096571027726112e-08, "loss": 0.72292888, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19348145, "step": 16094, "time_per_iteration": 2.871044397354126 }, { "auxiliary_loss_clip": 0.01404664, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.24201417, "balance_loss_mlp": 1.01481533, "epoch": 0.9676837516909664, "flos": 23377428460800.0, "grad_norm": 1.741388968864354, "language_loss": 0.76560444, "learning_rate": 1.0925020559578557e-08, "loss": 0.78998649, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.18725586, "step": 16095, "time_per_iteration": 2.8633015155792236 }, { "auxiliary_loss_clip": 0.01406385, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.24272156, "balance_loss_mlp": 1.01418293, "epoch": 0.9677438749436345, "flos": 20496981559680.0, "grad_norm": 1.8779131269138423, "language_loss": 0.7188561, "learning_rate": 1.0884406267818392e-08, "loss": 0.74325049, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.1887207, "step": 16096, "time_per_iteration": 2.887155055999756 }, { "auxiliary_loss_clip": 0.0141363, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.25219941, "balance_loss_mlp": 1.01642299, "epoch": 0.9678039981963025, "flos": 47572107344640.0, "grad_norm": 1.7124421337653342, "language_loss": 0.72476614, "learning_rate": 1.0843867403520946e-08, "loss": 0.74925959, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19287109, "step": 16097, "time_per_iteration": 3.060173749923706 }, { "auxiliary_loss_clip": 0.01395796, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.23867369, "balance_loss_mlp": 1.01403642, "epoch": 0.9678641214489704, "flos": 25049889694080.0, "grad_norm": 1.7947219427114232, "language_loss": 0.78669643, "learning_rate": 1.0803403968223434e-08, "loss": 0.81098521, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19055176, "step": 16098, "time_per_iteration": 2.875985860824585 }, { "auxiliary_loss_clip": 0.01381474, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.22628915, "balance_loss_mlp": 1.01575017, "epoch": 0.9679242447016384, "flos": 19249288940160.0, "grad_norm": 1.8230476040076373, "language_loss": 0.90509903, "learning_rate": 1.0763015963459965e-08, "loss": 0.92924899, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.17785645, "step": 16099, "time_per_iteration": 2.8328845500946045 }, { "auxiliary_loss_clip": 0.0141464, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.25151038, "balance_loss_mlp": 1.01434779, "epoch": 0.9679843679543063, "flos": 33267502414080.0, "grad_norm": 1.8034124267981217, "language_loss": 0.66840422, "learning_rate": 1.0722703390762643e-08, "loss": 0.69289708, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.20300293, "step": 16100, "time_per_iteration": 2.952063798904419 }, { "auxiliary_loss_clip": 0.01390149, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.2314775, "balance_loss_mlp": 1.01059389, "epoch": 0.9680444912069743, "flos": 22793832040320.0, "grad_norm": 1.808082566413832, "language_loss": 0.73791677, "learning_rate": 1.0682466251659584e-08, "loss": 0.76210773, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18371582, "step": 16101, "time_per_iteration": 2.852747917175293 }, { "auxiliary_loss_clip": 0.01393444, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.23463738, "balance_loss_mlp": 1.01257122, "epoch": 0.9681046144596422, "flos": 24034602205440.0, "grad_norm": 3.02758435764318, "language_loss": 0.74028927, "learning_rate": 1.0642304547676672e-08, "loss": 0.76453829, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.1887207, "step": 16102, "time_per_iteration": 4.311879873275757 }, { "auxiliary_loss_clip": 0.01407473, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.24674416, "balance_loss_mlp": 1.01664543, "epoch": 0.9681647377123103, "flos": 23451548722560.0, "grad_norm": 1.8157065613753776, "language_loss": 0.78513151, "learning_rate": 1.0602218280337139e-08, "loss": 0.80957419, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.20129395, "step": 16103, "time_per_iteration": 2.8557939529418945 }, { "auxiliary_loss_clip": 0.01406784, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.24689305, "balance_loss_mlp": 1.01443434, "epoch": 0.9682248609649782, "flos": 22685479447680.0, "grad_norm": 1.5325443979707467, "language_loss": 0.80476439, "learning_rate": 1.0562207451160655e-08, "loss": 0.82916617, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18969727, "step": 16104, "time_per_iteration": 2.8791444301605225 }, { "auxiliary_loss_clip": 0.0138878, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.23126054, "balance_loss_mlp": 1.01429939, "epoch": 0.9682849842176462, "flos": 24439797803520.0, "grad_norm": 1.53026226268362, "language_loss": 0.78326106, "learning_rate": 1.0522272061664672e-08, "loss": 0.80747664, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18481445, "step": 16105, "time_per_iteration": 2.9193274974823 }, { "auxiliary_loss_clip": 0.01183552, "auxiliary_loss_mlp": 0.0104321, "balance_loss_clip": 1.09482217, "balance_loss_mlp": 1.01383686, "epoch": 0.9683451074703141, "flos": 60023424983040.0, "grad_norm": 0.8211507215538079, "language_loss": 0.56731719, "learning_rate": 1.0482412113363536e-08, "loss": 0.58958477, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.29296875, "step": 16106, "time_per_iteration": 4.7999937534332275 }, { "auxiliary_loss_clip": 0.01182267, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.09502363, "balance_loss_mlp": 1.0196991, "epoch": 0.9684052307229821, "flos": 52722067587840.0, "grad_norm": 0.8824386913191324, "language_loss": 0.6166116, "learning_rate": 1.0442627607768707e-08, "loss": 0.63883054, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.19921875, "step": 16107, "time_per_iteration": 3.1996593475341797 }, { "auxiliary_loss_clip": 0.01399733, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.2394917, "balance_loss_mlp": 1.01877189, "epoch": 0.96846535397565, "flos": 22794103509120.0, "grad_norm": 3.012271564964, "language_loss": 0.74582839, "learning_rate": 1.040291854638875e-08, "loss": 0.77019787, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18457031, "step": 16108, "time_per_iteration": 2.8605902194976807 }, { "auxiliary_loss_clip": 0.01404369, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.24251032, "balance_loss_mlp": 1.01263022, "epoch": 0.968525477228318, "flos": 23332518357120.0, "grad_norm": 3.8223461836127783, "language_loss": 0.58252043, "learning_rate": 1.0363284930729576e-08, "loss": 0.60688484, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19433594, "step": 16109, "time_per_iteration": 2.857447862625122 }, { "auxiliary_loss_clip": 0.01179933, "auxiliary_loss_mlp": 0.01028911, "balance_loss_clip": 1.0928483, "balance_loss_mlp": 1.00611782, "epoch": 0.9685856004809861, "flos": 67915075029120.0, "grad_norm": 0.6689177113830288, "language_loss": 0.54293251, "learning_rate": 1.0323726762294205e-08, "loss": 0.56502104, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.22753906, "step": 16110, "time_per_iteration": 3.2365925312042236 }, { "auxiliary_loss_clip": 0.01412373, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.24893773, "balance_loss_mlp": 1.01398838, "epoch": 0.968645723733654, "flos": 33960899260800.0, "grad_norm": 1.4633542249169573, "language_loss": 0.63075864, "learning_rate": 1.0284244042582325e-08, "loss": 0.65522242, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20007324, "step": 16111, "time_per_iteration": 2.9124701023101807 }, { "auxiliary_loss_clip": 0.01396243, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.2381264, "balance_loss_mlp": 1.01334381, "epoch": 0.968705846986322, "flos": 18560642797440.0, "grad_norm": 1.8129688968730455, "language_loss": 0.75007284, "learning_rate": 1.024483677309118e-08, "loss": 0.77435696, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18811035, "step": 16112, "time_per_iteration": 2.9002394676208496 }, { "auxiliary_loss_clip": 0.01384106, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.22871792, "balance_loss_mlp": 1.01564729, "epoch": 0.9687659702389899, "flos": 17429899282560.0, "grad_norm": 1.8416067503416018, "language_loss": 0.67547351, "learning_rate": 1.020550495531558e-08, "loss": 0.69965339, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18249512, "step": 16113, "time_per_iteration": 4.313498020172119 }, { "auxiliary_loss_clip": 0.01177266, "auxiliary_loss_mlp": 0.01019765, "balance_loss_clip": 1.09033465, "balance_loss_mlp": 0.99859315, "epoch": 0.9688260934916579, "flos": 62078296417920.0, "grad_norm": 0.6960541649119036, "language_loss": 0.56644547, "learning_rate": 1.0166248590746329e-08, "loss": 0.5884158, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.21191406, "step": 16114, "time_per_iteration": 3.281243085861206 }, { "auxiliary_loss_clip": 0.01398876, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.23995805, "balance_loss_mlp": 1.01582074, "epoch": 0.9688862167443258, "flos": 15082483098240.0, "grad_norm": 1.9554561667894377, "language_loss": 0.82984507, "learning_rate": 1.0127067680872458e-08, "loss": 0.85418248, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.19055176, "step": 16115, "time_per_iteration": 4.160271644592285 }, { "auxiliary_loss_clip": 0.01379058, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.22554719, "balance_loss_mlp": 1.01127243, "epoch": 0.9689463399969939, "flos": 19947798449280.0, "grad_norm": 1.574532342687924, "language_loss": 0.72517997, "learning_rate": 1.0087962227179448e-08, "loss": 0.74926639, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 1.53515625, "router_z_loss_mlp": 0.18322754, "step": 16116, "time_per_iteration": 2.8470888137817383 }, { "auxiliary_loss_clip": 0.01403556, "auxiliary_loss_mlp": 0.0102817, "balance_loss_clip": 1.24248838, "balance_loss_mlp": 1.00981164, "epoch": 0.9690064632496618, "flos": 19582445537280.0, "grad_norm": 1.9864058054616247, "language_loss": 0.76725149, "learning_rate": 1.0048932231150553e-08, "loss": 0.79156876, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18371582, "step": 16117, "time_per_iteration": 2.801406145095825 }, { "auxiliary_loss_clip": 0.01397075, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.23622835, "balance_loss_mlp": 1.01506805, "epoch": 0.9690665865023298, "flos": 21882463153920.0, "grad_norm": 2.3239879576729674, "language_loss": 0.77942777, "learning_rate": 1.000997769426548e-08, "loss": 0.80373245, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18322754, "step": 16118, "time_per_iteration": 2.841005802154541 }, { "auxiliary_loss_clip": 0.01414636, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.25415361, "balance_loss_mlp": 1.01453269, "epoch": 0.9691267097549977, "flos": 21003878764800.0, "grad_norm": 1.674769416682097, "language_loss": 0.78506756, "learning_rate": 9.971098618001272e-09, "loss": 0.80954552, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1862793, "step": 16119, "time_per_iteration": 2.849072217941284 }, { "auxiliary_loss_clip": 0.01385104, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.23083854, "balance_loss_mlp": 1.01444268, "epoch": 0.9691868330076657, "flos": 24289611753600.0, "grad_norm": 1.3802769080867991, "language_loss": 0.76099694, "learning_rate": 9.932295003832747e-09, "loss": 0.78517354, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.18115234, "step": 16120, "time_per_iteration": 2.900120973587036 }, { "auxiliary_loss_clip": 0.01410677, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.25086176, "balance_loss_mlp": 1.0127387, "epoch": 0.9692469562603336, "flos": 17684592117120.0, "grad_norm": 1.7856261502677935, "language_loss": 0.70341176, "learning_rate": 9.89356685323095e-09, "loss": 0.72782892, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.1829834, "step": 16121, "time_per_iteration": 2.8468096256256104 }, { "auxiliary_loss_clip": 0.01389538, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.23152781, "balance_loss_mlp": 1.01064229, "epoch": 0.9693070795130017, "flos": 26845860528000.0, "grad_norm": 1.8274096114105445, "language_loss": 0.69463056, "learning_rate": 9.854914167664486e-09, "loss": 0.71882081, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18835449, "step": 16122, "time_per_iteration": 2.882077217102051 }, { "auxiliary_loss_clip": 0.01403914, "auxiliary_loss_mlp": 0.01029646, "balance_loss_clip": 1.24356437, "balance_loss_mlp": 1.01193142, "epoch": 0.9693672027656697, "flos": 18086349110400.0, "grad_norm": 1.743857138804114, "language_loss": 0.76238042, "learning_rate": 9.81633694859907e-09, "loss": 0.7867161, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.17724609, "step": 16123, "time_per_iteration": 2.8360776901245117 }, { "auxiliary_loss_clip": 0.01397069, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.23545039, "balance_loss_mlp": 1.01201177, "epoch": 0.9694273260183376, "flos": 21773205665280.0, "grad_norm": 1.5874243530975334, "language_loss": 0.75206399, "learning_rate": 9.777835197497753e-09, "loss": 0.77634716, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19250488, "step": 16124, "time_per_iteration": 2.8346760272979736 }, { "auxiliary_loss_clip": 0.01403048, "auxiliary_loss_mlp": 0.01033994, "balance_loss_clip": 1.24231815, "balance_loss_mlp": 1.01588643, "epoch": 0.9694874492710056, "flos": 24436902136320.0, "grad_norm": 2.5709807418418706, "language_loss": 0.75441802, "learning_rate": 9.739408915820258e-09, "loss": 0.77878845, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18115234, "step": 16125, "time_per_iteration": 2.8585658073425293 }, { "auxiliary_loss_clip": 0.01183467, "auxiliary_loss_mlp": 0.01028926, "balance_loss_clip": 1.09442663, "balance_loss_mlp": 1.00527525, "epoch": 0.9695475725236735, "flos": 67679367027840.0, "grad_norm": 0.919609345773292, "language_loss": 0.61539143, "learning_rate": 9.70105810502364e-09, "loss": 0.63751537, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.23632812, "step": 16126, "time_per_iteration": 3.289154291152954 }, { "auxiliary_loss_clip": 0.01381031, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.22632694, "balance_loss_mlp": 1.01450992, "epoch": 0.9696076957763415, "flos": 19137090539520.0, "grad_norm": 1.6480419698716133, "language_loss": 0.75184226, "learning_rate": 9.662782766562738e-09, "loss": 0.77599937, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 1.54589844, "router_z_loss_mlp": 0.20153809, "step": 16127, "time_per_iteration": 2.831860303878784 }, { "auxiliary_loss_clip": 0.01406961, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.24321866, "balance_loss_mlp": 1.01385784, "epoch": 0.9696678190290094, "flos": 15495234577920.0, "grad_norm": 1.908256089925537, "language_loss": 0.69864392, "learning_rate": 9.62458290188839e-09, "loss": 0.72304416, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19226074, "step": 16128, "time_per_iteration": 2.8462777137756348 }, { "auxiliary_loss_clip": 0.01394924, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.23734891, "balance_loss_mlp": 1.01477027, "epoch": 0.9697279422816775, "flos": 36221933842560.0, "grad_norm": 1.5783371409266866, "language_loss": 0.65741873, "learning_rate": 9.586458512449213e-09, "loss": 0.68170905, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19335938, "step": 16129, "time_per_iteration": 2.953537940979004 }, { "auxiliary_loss_clip": 0.01421164, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.25727057, "balance_loss_mlp": 1.0136497, "epoch": 0.9697880655343454, "flos": 25495018467840.0, "grad_norm": 1.9727734268645263, "language_loss": 0.63628972, "learning_rate": 9.548409599691166e-09, "loss": 0.66082966, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 1.63964844, "router_z_loss_mlp": 0.19165039, "step": 16130, "time_per_iteration": 2.9029581546783447 }, { "auxiliary_loss_clip": 0.01417108, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.25224972, "balance_loss_mlp": 1.0148232, "epoch": 0.9698481887870134, "flos": 15339981110400.0, "grad_norm": 2.6441444161847203, "language_loss": 0.71769357, "learning_rate": 9.510436165056867e-09, "loss": 0.74220395, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.1907959, "step": 16131, "time_per_iteration": 2.8254919052124023 }, { "auxiliary_loss_clip": 0.01417085, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.25430155, "balance_loss_mlp": 1.01898623, "epoch": 0.9699083120396813, "flos": 21992580293760.0, "grad_norm": 1.6359206513231148, "language_loss": 0.77282333, "learning_rate": 9.472538209986058e-09, "loss": 0.79738176, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19763184, "step": 16132, "time_per_iteration": 2.9412343502044678 }, { "auxiliary_loss_clip": 0.01400854, "auxiliary_loss_mlp": 0.01038095, "balance_loss_clip": 1.24147415, "balance_loss_mlp": 1.0193553, "epoch": 0.9699684352923493, "flos": 15671146936320.0, "grad_norm": 2.8994340530886795, "language_loss": 0.79560006, "learning_rate": 9.434715735916477e-09, "loss": 0.81998956, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18737793, "step": 16133, "time_per_iteration": 2.815883159637451 }, { "auxiliary_loss_clip": 0.01393352, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.23781514, "balance_loss_mlp": 1.01572108, "epoch": 0.9700285585450172, "flos": 21918460032000.0, "grad_norm": 1.5773113009625812, "language_loss": 0.65481627, "learning_rate": 9.396968744281863e-09, "loss": 0.67909193, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18493652, "step": 16134, "time_per_iteration": 2.8525664806365967 }, { "auxiliary_loss_clip": 0.01402447, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.24238205, "balance_loss_mlp": 1.01346993, "epoch": 0.9700886817976853, "flos": 23925435206400.0, "grad_norm": 2.207490908421783, "language_loss": 0.81528616, "learning_rate": 9.359297236513519e-09, "loss": 0.8396312, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18591309, "step": 16135, "time_per_iteration": 2.9292211532592773 }, { "auxiliary_loss_clip": 0.01407368, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.24484921, "balance_loss_mlp": 1.01432252, "epoch": 0.9701488050503532, "flos": 25458931100160.0, "grad_norm": 2.046627198563364, "language_loss": 0.7422033, "learning_rate": 9.321701214040079e-09, "loss": 0.76662052, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.20031738, "step": 16136, "time_per_iteration": 2.871307134628296 }, { "auxiliary_loss_clip": 0.01385396, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.22839832, "balance_loss_mlp": 1.01426053, "epoch": 0.9702089283030212, "flos": 20599542817920.0, "grad_norm": 1.7169481804383706, "language_loss": 0.77227473, "learning_rate": 9.28418067828729e-09, "loss": 0.79645419, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18273926, "step": 16137, "time_per_iteration": 2.8432352542877197 }, { "auxiliary_loss_clip": 0.01179202, "auxiliary_loss_mlp": 0.01021802, "balance_loss_clip": 1.0921309, "balance_loss_mlp": 1.00377798, "epoch": 0.9702690515556892, "flos": 70683233040000.0, "grad_norm": 0.8191662807832778, "language_loss": 0.54930115, "learning_rate": 9.246735630678015e-09, "loss": 0.57131118, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.18066406, "step": 16138, "time_per_iteration": 4.854604721069336 }, { "auxiliary_loss_clip": 0.01407801, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.2469399, "balance_loss_mlp": 1.01032889, "epoch": 0.9703291748083571, "flos": 35894885293440.0, "grad_norm": 1.6580180868063579, "language_loss": 0.71960104, "learning_rate": 9.209366072632007e-09, "loss": 0.74395961, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.17736816, "step": 16139, "time_per_iteration": 2.9524242877960205 }, { "auxiliary_loss_clip": 0.0140734, "auxiliary_loss_mlp": 0.01033029, "balance_loss_clip": 1.24648309, "balance_loss_mlp": 1.01316893, "epoch": 0.9703892980610251, "flos": 24327192199680.0, "grad_norm": 1.4788404097581702, "language_loss": 0.72954118, "learning_rate": 9.172072005566134e-09, "loss": 0.75394487, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1986084, "step": 16140, "time_per_iteration": 2.887711524963379 }, { "auxiliary_loss_clip": 0.01396881, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.23664236, "balance_loss_mlp": 1.01763225, "epoch": 0.970449421313693, "flos": 18012636051840.0, "grad_norm": 2.202862269899796, "language_loss": 0.69261819, "learning_rate": 9.13485343089504e-09, "loss": 0.71696573, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.20239258, "step": 16141, "time_per_iteration": 4.231178045272827 }, { "auxiliary_loss_clip": 0.01388179, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.23304176, "balance_loss_mlp": 1.01387811, "epoch": 0.9705095445663611, "flos": 25348949694720.0, "grad_norm": 1.9367402813633896, "language_loss": 0.69335878, "learning_rate": 9.097710350029597e-09, "loss": 0.71756899, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.1895752, "step": 16142, "time_per_iteration": 2.8643553256988525 }, { "auxiliary_loss_clip": 0.01398158, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.23768139, "balance_loss_mlp": 1.0124265, "epoch": 0.970569667819029, "flos": 26844865142400.0, "grad_norm": 1.8066971458512866, "language_loss": 0.56038988, "learning_rate": 9.060642764378457e-09, "loss": 0.58468348, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18774414, "step": 16143, "time_per_iteration": 2.9029433727264404 }, { "auxiliary_loss_clip": 0.01406651, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.24456954, "balance_loss_mlp": 1.0159452, "epoch": 0.970629791071697, "flos": 25859692707840.0, "grad_norm": 10.353866906179231, "language_loss": 0.68830967, "learning_rate": 9.023650675347382e-09, "loss": 0.71272194, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.18640137, "step": 16144, "time_per_iteration": 2.8985683917999268 }, { "auxiliary_loss_clip": 0.01386336, "auxiliary_loss_mlp": 0.01032929, "balance_loss_clip": 1.22983944, "balance_loss_mlp": 1.01310444, "epoch": 0.9706899143243649, "flos": 36554321278080.0, "grad_norm": 3.0737535430619163, "language_loss": 0.7219826, "learning_rate": 8.986734084339253e-09, "loss": 0.74617529, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.19836426, "step": 16145, "time_per_iteration": 2.9813120365142822 }, { "auxiliary_loss_clip": 0.01399379, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.23782706, "balance_loss_mlp": 1.01472855, "epoch": 0.9707500375770329, "flos": 12273758484480.0, "grad_norm": 3.0910936234215103, "language_loss": 0.80543458, "learning_rate": 8.949892992753395e-09, "loss": 0.82976562, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18994141, "step": 16146, "time_per_iteration": 2.8732059001922607 }, { "auxiliary_loss_clip": 0.01181166, "auxiliary_loss_mlp": 0.01036012, "balance_loss_clip": 1.0917027, "balance_loss_mlp": 1.00778353, "epoch": 0.9708101608297008, "flos": 60886417898880.0, "grad_norm": 0.7701317981547353, "language_loss": 0.54704314, "learning_rate": 8.91312740198713e-09, "loss": 0.56921494, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 0.28320312, "step": 16147, "time_per_iteration": 3.4482955932617188 }, { "auxiliary_loss_clip": 0.01407818, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.2457298, "balance_loss_mlp": 1.0172683, "epoch": 0.9708702840823689, "flos": 27135916813440.0, "grad_norm": 3.274700762556912, "language_loss": 0.62400049, "learning_rate": 8.876437313434682e-09, "loss": 0.64843941, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18811035, "step": 16148, "time_per_iteration": 4.28760552406311 }, { "auxiliary_loss_clip": 0.01398654, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.24106729, "balance_loss_mlp": 1.01379323, "epoch": 0.9709304073350368, "flos": 20787580782720.0, "grad_norm": 1.7725379314285796, "language_loss": 0.74760616, "learning_rate": 8.839822728487155e-09, "loss": 0.77191675, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18615723, "step": 16149, "time_per_iteration": 2.8908510208129883 }, { "auxiliary_loss_clip": 0.01390795, "auxiliary_loss_mlp": 0.01039129, "balance_loss_clip": 1.23239422, "balance_loss_mlp": 1.02024579, "epoch": 0.9709905305877048, "flos": 41948640558720.0, "grad_norm": 2.184690880535293, "language_loss": 0.76594621, "learning_rate": 8.803283648533222e-09, "loss": 0.79024541, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18884277, "step": 16150, "time_per_iteration": 4.370849370956421 }, { "auxiliary_loss_clip": 0.01420061, "auxiliary_loss_mlp": 0.01037856, "balance_loss_clip": 1.25405979, "balance_loss_mlp": 1.01636231, "epoch": 0.9710506538403728, "flos": 17174527776000.0, "grad_norm": 2.0281235289193464, "language_loss": 0.74495029, "learning_rate": 8.766820074958214e-09, "loss": 0.76952946, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.21508789, "step": 16151, "time_per_iteration": 2.901977300643921 }, { "auxiliary_loss_clip": 0.01387558, "auxiliary_loss_mlp": 0.01031316, "balance_loss_clip": 1.23293078, "balance_loss_mlp": 1.01171803, "epoch": 0.9711107770930407, "flos": 21182008118400.0, "grad_norm": 2.3745417442155614, "language_loss": 0.75615978, "learning_rate": 8.730432009145027e-09, "loss": 0.78034854, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.19604492, "step": 16152, "time_per_iteration": 2.894869804382324 }, { "auxiliary_loss_clip": 0.01407166, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.24797797, "balance_loss_mlp": 1.01501369, "epoch": 0.9711709003457087, "flos": 22247318373120.0, "grad_norm": 1.6008248734439288, "language_loss": 0.67648774, "learning_rate": 8.694119452473448e-09, "loss": 0.70090127, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19177246, "step": 16153, "time_per_iteration": 2.8772382736206055 }, { "auxiliary_loss_clip": 0.01399873, "auxiliary_loss_mlp": 0.0103241, "balance_loss_clip": 1.2403096, "balance_loss_mlp": 1.01277685, "epoch": 0.9712310235983767, "flos": 26225090864640.0, "grad_norm": 1.748182695136443, "language_loss": 0.71511364, "learning_rate": 8.65788240632037e-09, "loss": 0.73943651, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.19628906, "step": 16154, "time_per_iteration": 2.942561149597168 }, { "auxiliary_loss_clip": 0.014095, "auxiliary_loss_mlp": 0.01037983, "balance_loss_clip": 1.24635613, "balance_loss_mlp": 1.01894534, "epoch": 0.9712911468510447, "flos": 20678006580480.0, "grad_norm": 1.8528823554855698, "language_loss": 0.81855738, "learning_rate": 8.621720872059812e-09, "loss": 0.84303218, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.19042969, "step": 16155, "time_per_iteration": 2.863548755645752 }, { "auxiliary_loss_clip": 0.01406141, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.24223876, "balance_loss_mlp": 1.01290476, "epoch": 0.9713512701037126, "flos": 13560750852480.0, "grad_norm": 11.38890371699834, "language_loss": 0.67992437, "learning_rate": 8.58563485106334e-09, "loss": 0.70432246, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.20788574, "step": 16156, "time_per_iteration": 2.7993052005767822 }, { "auxiliary_loss_clip": 0.0140316, "auxiliary_loss_mlp": 0.01036659, "balance_loss_clip": 1.23986244, "balance_loss_mlp": 1.01789546, "epoch": 0.9714113933563806, "flos": 25859828442240.0, "grad_norm": 2.9592552297067627, "language_loss": 0.91744024, "learning_rate": 8.54962434469919e-09, "loss": 0.94183838, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.18774414, "step": 16157, "time_per_iteration": 2.8777596950531006 }, { "auxiliary_loss_clip": 0.01412301, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.25053835, "balance_loss_mlp": 1.01489949, "epoch": 0.9714715166090485, "flos": 12748866577920.0, "grad_norm": 2.016914392239951, "language_loss": 0.73006833, "learning_rate": 8.513689354332721e-09, "loss": 0.75451958, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.17932129, "step": 16158, "time_per_iteration": 2.8247272968292236 }, { "auxiliary_loss_clip": 0.01403615, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.24500358, "balance_loss_mlp": 1.01913476, "epoch": 0.9715316398617165, "flos": 18414664513920.0, "grad_norm": 1.9648794728798589, "language_loss": 0.61782014, "learning_rate": 8.477829881326836e-09, "loss": 0.64223182, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.1842041, "step": 16159, "time_per_iteration": 2.848052501678467 }, { "auxiliary_loss_clip": 0.01392013, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.23718333, "balance_loss_mlp": 1.01342916, "epoch": 0.9715917631143844, "flos": 28925734354560.0, "grad_norm": 1.6737759484025454, "language_loss": 0.79725099, "learning_rate": 8.44204592704112e-09, "loss": 0.82148641, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 1.54882812, "router_z_loss_mlp": 0.18115234, "step": 16160, "time_per_iteration": 2.935985803604126 }, { "auxiliary_loss_clip": 0.01177131, "auxiliary_loss_mlp": 0.0102432, "balance_loss_clip": 1.09064627, "balance_loss_mlp": 1.00295734, "epoch": 0.9716518863670525, "flos": 65968872900480.0, "grad_norm": 0.7710650595495714, "language_loss": 0.54338998, "learning_rate": 8.406337492832704e-09, "loss": 0.56540447, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.21386719, "step": 16161, "time_per_iteration": 3.372110605239868 }, { "auxiliary_loss_clip": 0.01387921, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.23148799, "balance_loss_mlp": 1.01351786, "epoch": 0.9717120096197204, "flos": 17721810604800.0, "grad_norm": 1.8500687523991368, "language_loss": 0.72437346, "learning_rate": 8.3707045800554e-09, "loss": 0.7485795, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.19177246, "step": 16162, "time_per_iteration": 2.8383305072784424 }, { "auxiliary_loss_clip": 0.01391795, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 1.23374653, "balance_loss_mlp": 1.01222146, "epoch": 0.9717721328723884, "flos": 24474437337600.0, "grad_norm": 4.972124470773038, "language_loss": 0.79969335, "learning_rate": 8.335147190060787e-09, "loss": 0.82392001, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18664551, "step": 16163, "time_per_iteration": 2.878923177719116 }, { "auxiliary_loss_clip": 0.01393493, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.23625863, "balance_loss_mlp": 1.01122582, "epoch": 0.9718322561250564, "flos": 20786042459520.0, "grad_norm": 1.5062141666143194, "language_loss": 0.73581594, "learning_rate": 8.299665324196903e-09, "loss": 0.76004201, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.17907715, "step": 16164, "time_per_iteration": 2.9071035385131836 }, { "auxiliary_loss_clip": 0.01405978, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.24471903, "balance_loss_mlp": 1.01749718, "epoch": 0.9718923793777243, "flos": 19035207953280.0, "grad_norm": 2.2859496888682056, "language_loss": 0.85102439, "learning_rate": 8.264258983809114e-09, "loss": 0.87544703, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18798828, "step": 16165, "time_per_iteration": 2.873432159423828 }, { "auxiliary_loss_clip": 0.01400748, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.2422173, "balance_loss_mlp": 1.01492453, "epoch": 0.9719525026303923, "flos": 21881739237120.0, "grad_norm": 2.444406364067279, "language_loss": 0.80002278, "learning_rate": 8.228928170240345e-09, "loss": 0.82434773, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.16821289, "step": 16166, "time_per_iteration": 2.9322948455810547 }, { "auxiliary_loss_clip": 0.0139746, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 1.23839879, "balance_loss_mlp": 1.01165032, "epoch": 0.9720126258830603, "flos": 14437570694400.0, "grad_norm": 1.7064912762865116, "language_loss": 0.71911716, "learning_rate": 8.193672884830195e-09, "loss": 0.74339104, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18273926, "step": 16167, "time_per_iteration": 2.919938802719116 }, { "auxiliary_loss_clip": 0.01396609, "auxiliary_loss_mlp": 0.01038008, "balance_loss_clip": 1.23905206, "balance_loss_mlp": 1.01718211, "epoch": 0.9720727491357283, "flos": 26262626065920.0, "grad_norm": 1.438060051156298, "language_loss": 0.76344848, "learning_rate": 8.158493128915812e-09, "loss": 0.78779465, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.20825195, "step": 16168, "time_per_iteration": 2.92417049407959 }, { "auxiliary_loss_clip": 0.01407037, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.2452122, "balance_loss_mlp": 1.01679134, "epoch": 0.9721328723883962, "flos": 22684257838080.0, "grad_norm": 2.1096614632035418, "language_loss": 0.74741149, "learning_rate": 8.123388903830797e-09, "loss": 0.77184415, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19421387, "step": 16169, "time_per_iteration": 2.8938534259796143 }, { "auxiliary_loss_clip": 0.01416715, "auxiliary_loss_mlp": 0.01030978, "balance_loss_clip": 1.2526319, "balance_loss_mlp": 1.01161814, "epoch": 0.9721929956410642, "flos": 28085771041920.0, "grad_norm": 2.0189384694925256, "language_loss": 0.58227628, "learning_rate": 8.088360210906309e-09, "loss": 0.60675323, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.19360352, "step": 16170, "time_per_iteration": 3.005073308944702 }, { "auxiliary_loss_clip": 0.01391393, "auxiliary_loss_mlp": 0.0103314, "balance_loss_clip": 1.23195815, "balance_loss_mlp": 1.0136131, "epoch": 0.9722531188937321, "flos": 21006095760000.0, "grad_norm": 1.7957431193786442, "language_loss": 0.7191757, "learning_rate": 8.053407051471062e-09, "loss": 0.74342096, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19519043, "step": 16171, "time_per_iteration": 2.9933855533599854 }, { "auxiliary_loss_clip": 0.01399601, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.23979509, "balance_loss_mlp": 1.01878405, "epoch": 0.9723132421464001, "flos": 16079147712000.0, "grad_norm": 1.7035127794905218, "language_loss": 0.69373339, "learning_rate": 8.018529426850218e-09, "loss": 0.71810222, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18493652, "step": 16172, "time_per_iteration": 2.8956363201141357 }, { "auxiliary_loss_clip": 0.01392457, "auxiliary_loss_mlp": 0.0103113, "balance_loss_clip": 1.2361145, "balance_loss_mlp": 1.0119493, "epoch": 0.972373365399068, "flos": 27757274659200.0, "grad_norm": 1.7455273221982708, "language_loss": 0.87024099, "learning_rate": 7.983727338366274e-09, "loss": 0.89447689, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19177246, "step": 16173, "time_per_iteration": 4.394887208938599 }, { "auxiliary_loss_clip": 0.01423447, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.25771356, "balance_loss_mlp": 1.018659, "epoch": 0.9724334886517361, "flos": 23013297158400.0, "grad_norm": 1.8232491244186737, "language_loss": 0.65366638, "learning_rate": 7.949000787339289e-09, "loss": 0.67827839, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19104004, "step": 16174, "time_per_iteration": 2.871086835861206 }, { "auxiliary_loss_clip": 0.0139429, "auxiliary_loss_mlp": 0.0103521, "balance_loss_clip": 1.23761308, "balance_loss_mlp": 1.0166254, "epoch": 0.972493611904404, "flos": 25457935714560.0, "grad_norm": 1.4266629316949673, "language_loss": 0.78825581, "learning_rate": 7.914349775085538e-09, "loss": 0.81255078, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18591309, "step": 16175, "time_per_iteration": 2.9415509700775146 }, { "auxiliary_loss_clip": 0.0139138, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.23313189, "balance_loss_mlp": 1.01549196, "epoch": 0.972553735157072, "flos": 16991919187200.0, "grad_norm": 2.337708969734513, "language_loss": 0.58365387, "learning_rate": 7.879774302919307e-09, "loss": 0.60791153, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18896484, "step": 16176, "time_per_iteration": 2.8378376960754395 }, { "auxiliary_loss_clip": 0.01410109, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.25174236, "balance_loss_mlp": 1.0109601, "epoch": 0.97261385840974, "flos": 26115923865600.0, "grad_norm": 2.398149540032479, "language_loss": 0.72868901, "learning_rate": 7.845274372151545e-09, "loss": 0.75308174, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18188477, "step": 16177, "time_per_iteration": 4.339169025421143 }, { "auxiliary_loss_clip": 0.01399962, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.23931813, "balance_loss_mlp": 1.01301301, "epoch": 0.9726739816624079, "flos": 25458976344960.0, "grad_norm": 1.688287123052307, "language_loss": 0.6960299, "learning_rate": 7.810849984090984e-09, "loss": 0.72034287, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18322754, "step": 16178, "time_per_iteration": 2.9026010036468506 }, { "auxiliary_loss_clip": 0.0140856, "auxiliary_loss_mlp": 0.01030696, "balance_loss_clip": 1.24605691, "balance_loss_mlp": 1.01218247, "epoch": 0.972734104915076, "flos": 29024721273600.0, "grad_norm": 2.912708929601516, "language_loss": 0.6825428, "learning_rate": 7.776501140042358e-09, "loss": 0.70693535, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.18530273, "step": 16179, "time_per_iteration": 2.8754777908325195 }, { "auxiliary_loss_clip": 0.01386497, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.23008192, "balance_loss_mlp": 1.01590347, "epoch": 0.9727942281677439, "flos": 23447295711360.0, "grad_norm": 1.9384759626842905, "language_loss": 0.78078777, "learning_rate": 7.742227841308624e-09, "loss": 0.80499578, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18408203, "step": 16180, "time_per_iteration": 2.8491365909576416 }, { "auxiliary_loss_clip": 0.01409768, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.2443186, "balance_loss_mlp": 1.01292467, "epoch": 0.9728543514204119, "flos": 31737626104320.0, "grad_norm": 1.6002077579190386, "language_loss": 0.77564657, "learning_rate": 7.708030089189188e-09, "loss": 0.80006993, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 1.65429688, "router_z_loss_mlp": 0.1965332, "step": 16181, "time_per_iteration": 2.9964916706085205 }, { "auxiliary_loss_clip": 0.01397109, "auxiliary_loss_mlp": 0.01036364, "balance_loss_clip": 1.23837626, "balance_loss_mlp": 1.01755261, "epoch": 0.9729144746730798, "flos": 16297934158080.0, "grad_norm": 1.4442251740758172, "language_loss": 0.64438868, "learning_rate": 7.67390788498079e-09, "loss": 0.6687234, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18798828, "step": 16182, "time_per_iteration": 2.870171070098877 }, { "auxiliary_loss_clip": 0.0140339, "auxiliary_loss_mlp": 0.01031173, "balance_loss_clip": 1.24295688, "balance_loss_mlp": 1.01251698, "epoch": 0.9729745979257478, "flos": 25051066058880.0, "grad_norm": 4.186218822970891, "language_loss": 0.6372323, "learning_rate": 7.639861229977507e-09, "loss": 0.66157794, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18664551, "step": 16183, "time_per_iteration": 4.293447732925415 }, { "auxiliary_loss_clip": 0.01390044, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.23275256, "balance_loss_mlp": 1.01531875, "epoch": 0.9730347211784157, "flos": 22649165856000.0, "grad_norm": 1.605978408181502, "language_loss": 0.78499818, "learning_rate": 7.605890125470527e-09, "loss": 0.80924892, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19714355, "step": 16184, "time_per_iteration": 2.884580135345459 }, { "auxiliary_loss_clip": 0.01393506, "auxiliary_loss_mlp": 0.01030422, "balance_loss_clip": 1.23500478, "balance_loss_mlp": 1.0125649, "epoch": 0.9730948444310837, "flos": 11005723687680.0, "grad_norm": 2.192501169555215, "language_loss": 0.80106372, "learning_rate": 7.571994572747709e-09, "loss": 0.82530302, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.17858887, "step": 16185, "time_per_iteration": 4.429924964904785 }, { "auxiliary_loss_clip": 0.01413406, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.25150871, "balance_loss_mlp": 1.01757264, "epoch": 0.9731549676837516, "flos": 16807998499200.0, "grad_norm": 1.6620523291253477, "language_loss": 0.78685713, "learning_rate": 7.538174573094469e-09, "loss": 0.81135845, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19140625, "step": 16186, "time_per_iteration": 2.887716293334961 }, { "auxiliary_loss_clip": 0.01399059, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.24032736, "balance_loss_mlp": 1.01533055, "epoch": 0.9732150909364197, "flos": 21151531105920.0, "grad_norm": 1.7076633108454966, "language_loss": 0.66242653, "learning_rate": 7.504430127793337e-09, "loss": 0.68675828, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18774414, "step": 16187, "time_per_iteration": 2.878744602203369 }, { "auxiliary_loss_clip": 0.01396341, "auxiliary_loss_mlp": 0.01030536, "balance_loss_clip": 1.23905444, "balance_loss_mlp": 1.01177239, "epoch": 0.9732752141890876, "flos": 33739850574720.0, "grad_norm": 1.6480616104857082, "language_loss": 0.80716914, "learning_rate": 7.47076123812418e-09, "loss": 0.83143795, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18737793, "step": 16188, "time_per_iteration": 3.018159866333008 }, { "auxiliary_loss_clip": 0.01387847, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.23287892, "balance_loss_mlp": 1.01278472, "epoch": 0.9733353374417556, "flos": 23414873172480.0, "grad_norm": 1.846467092536917, "language_loss": 0.79017508, "learning_rate": 7.437167905363084e-09, "loss": 0.81436121, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.17980957, "step": 16189, "time_per_iteration": 2.9806342124938965 }, { "auxiliary_loss_clip": 0.0138764, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.22873425, "balance_loss_mlp": 1.01155591, "epoch": 0.9733954606944236, "flos": 39180256323840.0, "grad_norm": 1.9181958403720214, "language_loss": 0.51775908, "learning_rate": 7.403650130784367e-09, "loss": 0.54193425, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18322754, "step": 16190, "time_per_iteration": 2.9808578491210938 }, { "auxiliary_loss_clip": 0.01396786, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.23768115, "balance_loss_mlp": 1.01460946, "epoch": 0.9734555839470915, "flos": 21992037356160.0, "grad_norm": 1.6935672236447623, "language_loss": 0.81525922, "learning_rate": 7.3702079156590105e-09, "loss": 0.83956474, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19165039, "step": 16191, "time_per_iteration": 2.888597249984741 }, { "auxiliary_loss_clip": 0.01405075, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.24598646, "balance_loss_mlp": 1.01305008, "epoch": 0.9735157071997596, "flos": 16582696801920.0, "grad_norm": 1.6776807692401956, "language_loss": 0.83014822, "learning_rate": 7.336841261255111e-09, "loss": 0.8545121, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18249512, "step": 16192, "time_per_iteration": 2.7913994789123535 }, { "auxiliary_loss_clip": 0.01423379, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.26255071, "balance_loss_mlp": 1.01419616, "epoch": 0.9735758304524275, "flos": 20231113259520.0, "grad_norm": 8.688198350150344, "language_loss": 0.75586408, "learning_rate": 7.303550168837658e-09, "loss": 0.78042293, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1829834, "step": 16193, "time_per_iteration": 2.8233678340911865 }, { "auxiliary_loss_clip": 0.01386178, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.23112178, "balance_loss_mlp": 1.01203442, "epoch": 0.9736359537050955, "flos": 23662236349440.0, "grad_norm": 2.471811740770899, "language_loss": 0.85558796, "learning_rate": 7.270334639669417e-09, "loss": 0.8797425, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.17248535, "step": 16194, "time_per_iteration": 2.8938090801239014 }, { "auxiliary_loss_clip": 0.01378681, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.22568142, "balance_loss_mlp": 1.01363635, "epoch": 0.9736960769577634, "flos": 15568721412480.0, "grad_norm": 1.706143341946828, "language_loss": 0.76520085, "learning_rate": 7.237194675009828e-09, "loss": 0.78931117, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 1.53027344, "router_z_loss_mlp": 0.18701172, "step": 16195, "time_per_iteration": 2.8149304389953613 }, { "auxiliary_loss_clip": 0.01181568, "auxiliary_loss_mlp": 0.01029609, "balance_loss_clip": 1.09322906, "balance_loss_mlp": 1.00548124, "epoch": 0.9737562002104314, "flos": 65379711369600.0, "grad_norm": 0.7074616399574122, "language_loss": 0.52466005, "learning_rate": 7.204130276115439e-09, "loss": 0.54677182, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.24121094, "step": 16196, "time_per_iteration": 3.28844952583313 }, { "auxiliary_loss_clip": 0.01406918, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.24657178, "balance_loss_mlp": 1.01041186, "epoch": 0.9738163234630993, "flos": 27207639100800.0, "grad_norm": 1.8495417814219874, "language_loss": 0.76825392, "learning_rate": 7.171141444240136e-09, "loss": 0.79260755, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18041992, "step": 16197, "time_per_iteration": 2.933150291442871 }, { "auxiliary_loss_clip": 0.01415841, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.25017273, "balance_loss_mlp": 1.01235175, "epoch": 0.9738764467157673, "flos": 21079265880960.0, "grad_norm": 2.2631821713131135, "language_loss": 0.68064821, "learning_rate": 7.13822818063492e-09, "loss": 0.70512092, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 1.65820312, "router_z_loss_mlp": 0.19067383, "step": 16198, "time_per_iteration": 2.8958942890167236 }, { "auxiliary_loss_clip": 0.01396805, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.23626709, "balance_loss_mlp": 1.01417303, "epoch": 0.9739365699684353, "flos": 21371177203200.0, "grad_norm": 1.7695226381433766, "language_loss": 0.78979659, "learning_rate": 7.10539048654768e-09, "loss": 0.81409454, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18811035, "step": 16199, "time_per_iteration": 2.8626868724823 }, { "auxiliary_loss_clip": 0.01401144, "auxiliary_loss_mlp": 0.01034395, "balance_loss_clip": 1.24122453, "balance_loss_mlp": 1.01564324, "epoch": 0.9739966932211033, "flos": 21910496947200.0, "grad_norm": 1.6931131618083486, "language_loss": 0.79610562, "learning_rate": 7.072628363223865e-09, "loss": 0.82046103, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18737793, "step": 16200, "time_per_iteration": 2.8704047203063965 }, { "auxiliary_loss_clip": 0.01425846, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.25831926, "balance_loss_mlp": 1.01434135, "epoch": 0.9740568164737712, "flos": 24838206681600.0, "grad_norm": 2.4207356307047596, "language_loss": 0.69446558, "learning_rate": 7.039941811905592e-09, "loss": 0.71906108, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 1.67675781, "router_z_loss_mlp": 0.19360352, "step": 16201, "time_per_iteration": 2.9265036582946777 }, { "auxiliary_loss_clip": 0.01403276, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.24258566, "balance_loss_mlp": 1.01595306, "epoch": 0.9741169397264392, "flos": 23634202556160.0, "grad_norm": 1.472758522532452, "language_loss": 0.73378354, "learning_rate": 7.0073308338325364e-09, "loss": 0.75816476, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18908691, "step": 16202, "time_per_iteration": 2.8630495071411133 }, { "auxiliary_loss_clip": 0.01403997, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.24271309, "balance_loss_mlp": 1.01327658, "epoch": 0.9741770629791072, "flos": 18849794186880.0, "grad_norm": 1.9187673095500724, "language_loss": 0.73372418, "learning_rate": 6.974795430241265e-09, "loss": 0.75808704, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.19018555, "step": 16203, "time_per_iteration": 2.9457216262817383 }, { "auxiliary_loss_clip": 0.01396368, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.23582304, "balance_loss_mlp": 1.01400304, "epoch": 0.9742371862317751, "flos": 22356440127360.0, "grad_norm": 1.6933981274254448, "language_loss": 0.7791431, "learning_rate": 6.942335602365235e-09, "loss": 0.80343443, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18762207, "step": 16204, "time_per_iteration": 2.852281093597412 }, { "auxiliary_loss_clip": 0.01399442, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.23916149, "balance_loss_mlp": 1.01388788, "epoch": 0.9742973094844432, "flos": 21772888951680.0, "grad_norm": 1.9565868481430708, "language_loss": 0.80493987, "learning_rate": 6.909951351435905e-09, "loss": 0.82927155, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19812012, "step": 16205, "time_per_iteration": 2.868943452835083 }, { "auxiliary_loss_clip": 0.01381985, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.2244494, "balance_loss_mlp": 1.01480746, "epoch": 0.9743574327371111, "flos": 26259549419520.0, "grad_norm": 1.6140809808490482, "language_loss": 0.75416082, "learning_rate": 6.87764267868074e-09, "loss": 0.77831113, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18225098, "step": 16206, "time_per_iteration": 2.890082836151123 }, { "auxiliary_loss_clip": 0.01396937, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.23538339, "balance_loss_mlp": 1.01293325, "epoch": 0.9744175559897791, "flos": 12356972951040.0, "grad_norm": 2.616995658724922, "language_loss": 0.85339075, "learning_rate": 6.8454095853252015e-09, "loss": 0.87767923, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.18981934, "step": 16207, "time_per_iteration": 2.8434536457061768 }, { "auxiliary_loss_clip": 0.01387325, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.23129988, "balance_loss_mlp": 1.01638842, "epoch": 0.974477679242447, "flos": 28408295111040.0, "grad_norm": 1.8354607390285151, "language_loss": 0.71170712, "learning_rate": 6.813252072591425e-09, "loss": 0.7359271, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.18286133, "step": 16208, "time_per_iteration": 2.980886459350586 }, { "auxiliary_loss_clip": 0.01387756, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.23510623, "balance_loss_mlp": 1.01200497, "epoch": 0.974537802495115, "flos": 17794347298560.0, "grad_norm": 1.8465179202310829, "language_loss": 0.77952158, "learning_rate": 6.781170141698878e-09, "loss": 0.80369914, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 1.52539062, "router_z_loss_mlp": 0.18017578, "step": 16209, "time_per_iteration": 4.369826316833496 }, { "auxiliary_loss_clip": 0.01401165, "auxiliary_loss_mlp": 0.01033295, "balance_loss_clip": 1.2389369, "balance_loss_mlp": 1.01363707, "epoch": 0.9745979257477829, "flos": 23852943757440.0, "grad_norm": 1.7628109002935457, "language_loss": 0.80297321, "learning_rate": 6.749163793864144e-09, "loss": 0.82731783, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19665527, "step": 16210, "time_per_iteration": 2.9210939407348633 }, { "auxiliary_loss_clip": 0.01395293, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.23562253, "balance_loss_mlp": 1.01677001, "epoch": 0.9746580490004509, "flos": 27027518976000.0, "grad_norm": 5.833557764098546, "language_loss": 0.7836495, "learning_rate": 6.7172330303009176e-09, "loss": 0.80795455, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18444824, "step": 16211, "time_per_iteration": 4.362037420272827 }, { "auxiliary_loss_clip": 0.01419458, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.25431347, "balance_loss_mlp": 1.01786065, "epoch": 0.9747181722531189, "flos": 19801503452160.0, "grad_norm": 2.656654433560433, "language_loss": 0.78689891, "learning_rate": 6.685377852219787e-09, "loss": 0.81146944, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 1.64941406, "router_z_loss_mlp": 0.19726562, "step": 16212, "time_per_iteration": 2.8346025943756104 }, { "auxiliary_loss_clip": 0.01385796, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.23035717, "balance_loss_mlp": 1.01333702, "epoch": 0.9747782955057869, "flos": 31443000094080.0, "grad_norm": 5.171784234346739, "language_loss": 0.81228447, "learning_rate": 6.653598260829118e-09, "loss": 0.83646214, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18640137, "step": 16213, "time_per_iteration": 2.9361579418182373 }, { "auxiliary_loss_clip": 0.01387757, "auxiliary_loss_mlp": 0.01028312, "balance_loss_clip": 1.22946405, "balance_loss_mlp": 1.00948846, "epoch": 0.9748384187584548, "flos": 15969844978560.0, "grad_norm": 2.125745941676953, "language_loss": 0.67558849, "learning_rate": 6.6218942573335044e-09, "loss": 0.69974911, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18823242, "step": 16214, "time_per_iteration": 2.84736967086792 }, { "auxiliary_loss_clip": 0.01407096, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.24332762, "balance_loss_mlp": 1.0158143, "epoch": 0.9748985420111228, "flos": 20568522867840.0, "grad_norm": 1.6699869323972791, "language_loss": 0.74999416, "learning_rate": 6.5902658429355386e-09, "loss": 0.77441764, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19433594, "step": 16215, "time_per_iteration": 2.8607492446899414 }, { "auxiliary_loss_clip": 0.01393204, "auxiliary_loss_mlp": 0.01031189, "balance_loss_clip": 1.23356605, "balance_loss_mlp": 1.01224637, "epoch": 0.9749586652637908, "flos": 36735527278080.0, "grad_norm": 1.8037485293047941, "language_loss": 0.6773867, "learning_rate": 6.558713018834483e-09, "loss": 0.70163065, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18945312, "step": 16216, "time_per_iteration": 2.9663803577423096 }, { "auxiliary_loss_clip": 0.01411899, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.24894452, "balance_loss_mlp": 1.01586819, "epoch": 0.9750187885164587, "flos": 11006492849280.0, "grad_norm": 2.4189545047443866, "language_loss": 0.72628176, "learning_rate": 6.527235786226937e-09, "loss": 0.75075281, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19335938, "step": 16217, "time_per_iteration": 2.847182512283325 }, { "auxiliary_loss_clip": 0.01387582, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.22993314, "balance_loss_mlp": 1.01212835, "epoch": 0.9750789117691268, "flos": 25750616198400.0, "grad_norm": 1.5486590404623812, "language_loss": 0.78813589, "learning_rate": 6.495834146306167e-09, "loss": 0.81231606, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18310547, "step": 16218, "time_per_iteration": 2.899005174636841 }, { "auxiliary_loss_clip": 0.01381324, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.22496283, "balance_loss_mlp": 1.01542664, "epoch": 0.9751390350217947, "flos": 13341738182400.0, "grad_norm": 1.965379575628166, "language_loss": 0.78334588, "learning_rate": 6.464508100263222e-09, "loss": 0.80750328, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18981934, "step": 16219, "time_per_iteration": 4.202998876571655 }, { "auxiliary_loss_clip": 0.0140014, "auxiliary_loss_mlp": 0.0103413, "balance_loss_clip": 1.23970747, "balance_loss_mlp": 1.01580787, "epoch": 0.9751991582744627, "flos": 22830960038400.0, "grad_norm": 1.7172042899904458, "language_loss": 0.82175243, "learning_rate": 6.433257649285817e-09, "loss": 0.84609509, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18334961, "step": 16220, "time_per_iteration": 4.2648303508758545 }, { "auxiliary_loss_clip": 0.01386374, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.22951591, "balance_loss_mlp": 1.01503992, "epoch": 0.9752592815271306, "flos": 19655660903040.0, "grad_norm": 2.3936098096802625, "language_loss": 0.7584582, "learning_rate": 6.402082794559227e-09, "loss": 0.78265691, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18457031, "step": 16221, "time_per_iteration": 2.858649253845215 }, { "auxiliary_loss_clip": 0.01388027, "auxiliary_loss_mlp": 0.01032488, "balance_loss_clip": 1.23152876, "balance_loss_mlp": 1.012079, "epoch": 0.9753194047797986, "flos": 26702415953280.0, "grad_norm": 1.5485150335561055, "language_loss": 0.66974056, "learning_rate": 6.370983537265395e-09, "loss": 0.69394565, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.20422363, "step": 16222, "time_per_iteration": 2.9350554943084717 }, { "auxiliary_loss_clip": 0.01395326, "auxiliary_loss_mlp": 0.01031294, "balance_loss_clip": 1.23777139, "balance_loss_mlp": 1.01309037, "epoch": 0.9753795280324665, "flos": 23232174094080.0, "grad_norm": 2.4871438006927558, "language_loss": 0.89199436, "learning_rate": 6.3399598785836004e-09, "loss": 0.91626048, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18200684, "step": 16223, "time_per_iteration": 2.919252872467041 }, { "auxiliary_loss_clip": 0.01392785, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 1.23706627, "balance_loss_mlp": 1.01851213, "epoch": 0.9754396512851345, "flos": 19473278538240.0, "grad_norm": 1.7031893695150015, "language_loss": 0.75100362, "learning_rate": 6.309011819690457e-09, "loss": 0.7753033, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 1.55859375, "router_z_loss_mlp": 0.18664551, "step": 16224, "time_per_iteration": 2.8274567127227783 }, { "auxiliary_loss_clip": 0.01181938, "auxiliary_loss_mlp": 0.01022529, "balance_loss_clip": 1.0939827, "balance_loss_mlp": 1.00030851, "epoch": 0.9754997745378025, "flos": 68489667734400.0, "grad_norm": 0.809790010652841, "language_loss": 0.59106421, "learning_rate": 6.278139361759249e-09, "loss": 0.61310887, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.22265625, "step": 16225, "time_per_iteration": 3.369819402694702 }, { "auxiliary_loss_clip": 0.01393989, "auxiliary_loss_mlp": 0.01035795, "balance_loss_clip": 1.23639035, "balance_loss_mlp": 1.0163641, "epoch": 0.9755598977904705, "flos": 26406161130240.0, "grad_norm": 1.660901904940292, "language_loss": 0.69417632, "learning_rate": 6.247342505960818e-09, "loss": 0.71847409, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.1940918, "step": 16226, "time_per_iteration": 2.9045512676239014 }, { "auxiliary_loss_clip": 0.01393198, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.23428881, "balance_loss_mlp": 1.01486015, "epoch": 0.9756200210431384, "flos": 16626294806400.0, "grad_norm": 1.6553515189316224, "language_loss": 0.83610439, "learning_rate": 6.216621253462894e-09, "loss": 0.86036968, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18481445, "step": 16227, "time_per_iteration": 2.8254475593566895 }, { "auxiliary_loss_clip": 0.01395432, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.23790789, "balance_loss_mlp": 1.01254702, "epoch": 0.9756801442958064, "flos": 23633523884160.0, "grad_norm": 2.1377362337290817, "language_loss": 0.78233647, "learning_rate": 6.185975605430549e-09, "loss": 0.80660903, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.19262695, "step": 16228, "time_per_iteration": 2.894794225692749 }, { "auxiliary_loss_clip": 0.01183604, "auxiliary_loss_mlp": 0.01047832, "balance_loss_clip": 1.09400022, "balance_loss_mlp": 1.01884079, "epoch": 0.9757402675484744, "flos": 61651609539840.0, "grad_norm": 0.8433921415933072, "language_loss": 0.55795377, "learning_rate": 6.155405563025962e-09, "loss": 0.5802682, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.2890625, "step": 16229, "time_per_iteration": 3.2694520950317383 }, { "auxiliary_loss_clip": 0.0140099, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.24201107, "balance_loss_mlp": 1.01656318, "epoch": 0.9758003908011423, "flos": 24069060760320.0, "grad_norm": 1.9137128941278372, "language_loss": 0.75584525, "learning_rate": 6.124911127407984e-09, "loss": 0.78021032, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.1895752, "step": 16230, "time_per_iteration": 2.872739315032959 }, { "auxiliary_loss_clip": 0.0138069, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.22612679, "balance_loss_mlp": 1.01385438, "epoch": 0.9758605140538104, "flos": 17501893038720.0, "grad_norm": 1.9213139971288582, "language_loss": 0.72959417, "learning_rate": 6.094492299733245e-09, "loss": 0.75371623, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.17663574, "step": 16231, "time_per_iteration": 2.8657314777374268 }, { "auxiliary_loss_clip": 0.01417613, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.25340331, "balance_loss_mlp": 1.01210928, "epoch": 0.9759206373064783, "flos": 24837708988800.0, "grad_norm": 1.7597077618921848, "language_loss": 0.77006012, "learning_rate": 6.064149081155267e-09, "loss": 0.79454488, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 1.64257812, "router_z_loss_mlp": 0.1875, "step": 16232, "time_per_iteration": 2.920801877975464 }, { "auxiliary_loss_clip": 0.01179753, "auxiliary_loss_mlp": 0.01024766, "balance_loss_clip": 1.09231985, "balance_loss_mlp": 1.00750434, "epoch": 0.9759807605591463, "flos": 68189838572160.0, "grad_norm": 0.7413605561446295, "language_loss": 0.53795445, "learning_rate": 6.033881472824465e-09, "loss": 0.5599997, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.17285156, "step": 16233, "time_per_iteration": 3.132338285446167 }, { "auxiliary_loss_clip": 0.01390204, "auxiliary_loss_mlp": 0.01030215, "balance_loss_clip": 1.23274112, "balance_loss_mlp": 1.01171327, "epoch": 0.9760408838118142, "flos": 18998487158400.0, "grad_norm": 1.727365289369547, "language_loss": 0.72425121, "learning_rate": 6.003689475888807e-09, "loss": 0.74845541, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18493652, "step": 16234, "time_per_iteration": 2.863232135772705 }, { "auxiliary_loss_clip": 0.01407126, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.24245358, "balance_loss_mlp": 1.0134728, "epoch": 0.9761010070644822, "flos": 17134096907520.0, "grad_norm": 2.4193343479413674, "language_loss": 0.80037463, "learning_rate": 5.973573091493156e-09, "loss": 0.82476836, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.18774414, "step": 16235, "time_per_iteration": 2.818143844604492 }, { "auxiliary_loss_clip": 0.01393661, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.23678589, "balance_loss_mlp": 1.01379573, "epoch": 0.9761611303171501, "flos": 22062266565120.0, "grad_norm": 1.840192077337459, "language_loss": 0.7751019, "learning_rate": 5.943532320779265e-09, "loss": 0.79936862, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.19226074, "step": 16236, "time_per_iteration": 2.8439910411834717 }, { "auxiliary_loss_clip": 0.01399687, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.24014604, "balance_loss_mlp": 1.01518774, "epoch": 0.9762212535698181, "flos": 21766283210880.0, "grad_norm": 1.8844866378489251, "language_loss": 0.76393127, "learning_rate": 5.913567164886446e-09, "loss": 0.78825957, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17944336, "step": 16237, "time_per_iteration": 2.8670175075531006 }, { "auxiliary_loss_clip": 0.01396019, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.23503792, "balance_loss_mlp": 1.01617074, "epoch": 0.9762813768224861, "flos": 25932681849600.0, "grad_norm": 1.825793769446491, "language_loss": 0.73374444, "learning_rate": 5.8836776249509e-09, "loss": 0.75806034, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19384766, "step": 16238, "time_per_iteration": 2.8664515018463135 }, { "auxiliary_loss_clip": 0.01397738, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.23776472, "balance_loss_mlp": 1.01537311, "epoch": 0.9763415000751541, "flos": 24060102289920.0, "grad_norm": 2.093123273615315, "language_loss": 0.85013598, "learning_rate": 5.8538637021063875e-09, "loss": 0.87445295, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18591309, "step": 16239, "time_per_iteration": 2.8302385807037354 }, { "auxiliary_loss_clip": 0.01402226, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.24071336, "balance_loss_mlp": 1.01392126, "epoch": 0.976401623327822, "flos": 17027327882880.0, "grad_norm": 2.5934576512047225, "language_loss": 0.60408556, "learning_rate": 5.824125397483115e-09, "loss": 0.62843168, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18457031, "step": 16240, "time_per_iteration": 2.8622426986694336 }, { "auxiliary_loss_clip": 0.01396649, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.24004614, "balance_loss_mlp": 1.01287413, "epoch": 0.97646174658049, "flos": 16115461303680.0, "grad_norm": 1.7835460985714808, "language_loss": 0.83519226, "learning_rate": 5.7944627122088474e-09, "loss": 0.85947275, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18505859, "step": 16241, "time_per_iteration": 2.9623098373413086 }, { "auxiliary_loss_clip": 0.01391964, "auxiliary_loss_mlp": 0.01039349, "balance_loss_clip": 1.23174107, "balance_loss_mlp": 1.02059793, "epoch": 0.9765218698331579, "flos": 21262915100160.0, "grad_norm": 8.495715279383578, "language_loss": 0.84197247, "learning_rate": 5.764875647408463e-09, "loss": 0.86628556, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18762207, "step": 16242, "time_per_iteration": 2.9112656116485596 }, { "auxiliary_loss_clip": 0.01400583, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.24086618, "balance_loss_mlp": 1.01208282, "epoch": 0.9765819930858259, "flos": 18597227857920.0, "grad_norm": 1.5390580955374455, "language_loss": 0.76907206, "learning_rate": 5.7353642042037294e-09, "loss": 0.79338229, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18359375, "step": 16243, "time_per_iteration": 2.865661859512329 }, { "auxiliary_loss_clip": 0.01389975, "auxiliary_loss_mlp": 0.01036998, "balance_loss_clip": 1.23138714, "balance_loss_mlp": 1.01751935, "epoch": 0.976642116338494, "flos": 20276566300800.0, "grad_norm": 1.6996965514701532, "language_loss": 0.70855284, "learning_rate": 5.705928383713754e-09, "loss": 0.73282254, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.19482422, "step": 16244, "time_per_iteration": 4.36298680305481 }, { "auxiliary_loss_clip": 0.0140427, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.24185324, "balance_loss_mlp": 1.01501894, "epoch": 0.9767022395911619, "flos": 25559818300800.0, "grad_norm": 1.8130657932466356, "language_loss": 0.84038353, "learning_rate": 5.676568187055197e-09, "loss": 0.86476755, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19116211, "step": 16245, "time_per_iteration": 2.90073561668396 }, { "auxiliary_loss_clip": 0.01394927, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.23677707, "balance_loss_mlp": 1.01370478, "epoch": 0.9767623628438299, "flos": 21772753217280.0, "grad_norm": 1.304184589534147, "language_loss": 0.79026985, "learning_rate": 5.647283615340726e-09, "loss": 0.81454009, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18395996, "step": 16246, "time_per_iteration": 4.307973384857178 }, { "auxiliary_loss_clip": 0.01371437, "auxiliary_loss_mlp": 0.01030135, "balance_loss_clip": 1.22127235, "balance_loss_mlp": 1.01280165, "epoch": 0.9768224860964978, "flos": 15859456369920.0, "grad_norm": 1.3224820187807351, "language_loss": 0.74769616, "learning_rate": 5.6180746696812275e-09, "loss": 0.77171189, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 1.50195312, "router_z_loss_mlp": 0.17346191, "step": 16247, "time_per_iteration": 2.8672025203704834 }, { "auxiliary_loss_clip": 0.01400293, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.24097741, "balance_loss_mlp": 1.01257455, "epoch": 0.9768826093491658, "flos": 25160911729920.0, "grad_norm": 1.5486620970180436, "language_loss": 0.80441558, "learning_rate": 5.58894135118404e-09, "loss": 0.82872832, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.18408203, "step": 16248, "time_per_iteration": 2.9788942337036133 }, { "auxiliary_loss_clip": 0.01410111, "auxiliary_loss_mlp": 0.01045071, "balance_loss_clip": 1.2480197, "balance_loss_mlp": 1.02443624, "epoch": 0.9769427326018337, "flos": 22977526504320.0, "grad_norm": 3.9066961865791074, "language_loss": 0.79958737, "learning_rate": 5.559883660954278e-09, "loss": 0.82413918, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.2064209, "step": 16249, "time_per_iteration": 2.9217536449432373 }, { "auxiliary_loss_clip": 0.01391741, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.23632061, "balance_loss_mlp": 1.01560783, "epoch": 0.9770028558545018, "flos": 15271697427840.0, "grad_norm": 2.4152044783732194, "language_loss": 0.67622209, "learning_rate": 5.530901600093507e-09, "loss": 0.70048332, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18786621, "step": 16250, "time_per_iteration": 2.823637008666992 }, { "auxiliary_loss_clip": 0.01182215, "auxiliary_loss_mlp": 0.01026509, "balance_loss_clip": 1.09327328, "balance_loss_mlp": 1.00171304, "epoch": 0.9770629791071697, "flos": 71481272405760.0, "grad_norm": 0.7854921579859312, "language_loss": 0.59936517, "learning_rate": 5.501995169700846e-09, "loss": 0.62145245, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.24707031, "step": 16251, "time_per_iteration": 3.383244752883911 }, { "auxiliary_loss_clip": 0.01401337, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.2419765, "balance_loss_mlp": 1.01641989, "epoch": 0.9771231023598377, "flos": 22421420939520.0, "grad_norm": 1.728371907105927, "language_loss": 0.79088134, "learning_rate": 5.473164370872307e-09, "loss": 0.81525666, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19775391, "step": 16252, "time_per_iteration": 2.846653699874878 }, { "auxiliary_loss_clip": 0.01388662, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.23144317, "balance_loss_mlp": 1.01363349, "epoch": 0.9771832256125056, "flos": 19035253198080.0, "grad_norm": 3.8430409177887026, "language_loss": 0.65023601, "learning_rate": 5.444409204701461e-09, "loss": 0.67445147, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19250488, "step": 16253, "time_per_iteration": 2.8675317764282227 }, { "auxiliary_loss_clip": 0.014093, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.24931467, "balance_loss_mlp": 1.01464713, "epoch": 0.9772433488651736, "flos": 17831203827840.0, "grad_norm": 3.3532893645595387, "language_loss": 0.77566469, "learning_rate": 5.415729672278324e-09, "loss": 0.8001073, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.203125, "step": 16254, "time_per_iteration": 4.289041996002197 }, { "auxiliary_loss_clip": 0.01401249, "auxiliary_loss_mlp": 0.01035115, "balance_loss_clip": 1.23959076, "balance_loss_mlp": 1.01624405, "epoch": 0.9773034721178415, "flos": 37642914622080.0, "grad_norm": 3.074352354795752, "language_loss": 0.64868402, "learning_rate": 5.387125774690471e-09, "loss": 0.67304766, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.1887207, "step": 16255, "time_per_iteration": 4.5104851722717285 }, { "auxiliary_loss_clip": 0.01405521, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.24209261, "balance_loss_mlp": 1.01281917, "epoch": 0.9773635953705095, "flos": 20311974996480.0, "grad_norm": 1.5835759663280684, "language_loss": 0.75949782, "learning_rate": 5.358597513023033e-09, "loss": 0.78389055, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.20922852, "step": 16256, "time_per_iteration": 2.835880756378174 }, { "auxiliary_loss_clip": 0.01393416, "auxiliary_loss_mlp": 0.01039481, "balance_loss_clip": 1.23732936, "balance_loss_mlp": 1.01957285, "epoch": 0.9774237186231776, "flos": 22319312129280.0, "grad_norm": 2.34766087713655, "language_loss": 0.78350526, "learning_rate": 5.330144888357369e-09, "loss": 0.80783415, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 1.56054688, "router_z_loss_mlp": 0.19909668, "step": 16257, "time_per_iteration": 2.8585243225097656 }, { "auxiliary_loss_clip": 0.01391353, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.23297632, "balance_loss_mlp": 1.01433265, "epoch": 0.9774838418758455, "flos": 24215039043840.0, "grad_norm": 1.6719429121673341, "language_loss": 0.75457954, "learning_rate": 5.301767901772391e-09, "loss": 0.77882421, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18774414, "step": 16258, "time_per_iteration": 2.8921289443969727 }, { "auxiliary_loss_clip": 0.01182335, "auxiliary_loss_mlp": 0.01039249, "balance_loss_clip": 1.09375525, "balance_loss_mlp": 1.01617038, "epoch": 0.9775439651285135, "flos": 66390582850560.0, "grad_norm": 0.6827868734239715, "language_loss": 0.59880227, "learning_rate": 5.273466554344353e-09, "loss": 0.62101811, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 0.23046875, "step": 16259, "time_per_iteration": 3.444063186645508 }, { "auxiliary_loss_clip": 0.01412937, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.25008583, "balance_loss_mlp": 1.01696563, "epoch": 0.9776040883811814, "flos": 22611811633920.0, "grad_norm": 1.72763536980597, "language_loss": 0.74273479, "learning_rate": 5.2452408471461705e-09, "loss": 0.76722753, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19372559, "step": 16260, "time_per_iteration": 2.901092290878296 }, { "auxiliary_loss_clip": 0.01393638, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.23596776, "balance_loss_mlp": 1.01399422, "epoch": 0.9776642116338494, "flos": 18451340064000.0, "grad_norm": 4.705948366504652, "language_loss": 0.79720318, "learning_rate": 5.2170907812485456e-09, "loss": 0.82147932, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.1998291, "step": 16261, "time_per_iteration": 2.8695197105407715 }, { "auxiliary_loss_clip": 0.01415709, "auxiliary_loss_mlp": 0.01031404, "balance_loss_clip": 1.25335109, "balance_loss_mlp": 1.01306987, "epoch": 0.9777243348865173, "flos": 22648803897600.0, "grad_norm": 2.603208981775957, "language_loss": 0.75115401, "learning_rate": 5.189016357718845e-09, "loss": 0.77562511, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18334961, "step": 16262, "time_per_iteration": 2.882128953933716 }, { "auxiliary_loss_clip": 0.01407047, "auxiliary_loss_mlp": 0.0103282, "balance_loss_clip": 1.24638486, "balance_loss_mlp": 1.01307917, "epoch": 0.9777844581391854, "flos": 31333833095040.0, "grad_norm": 2.039669284673078, "language_loss": 0.70707583, "learning_rate": 5.16101757762133e-09, "loss": 0.73147446, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.19750977, "step": 16263, "time_per_iteration": 2.926440477371216 }, { "auxiliary_loss_clip": 0.01408276, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.24738753, "balance_loss_mlp": 1.01499629, "epoch": 0.9778445813918533, "flos": 23049384526080.0, "grad_norm": 1.5599774349677595, "language_loss": 0.66591328, "learning_rate": 5.133094442018038e-09, "loss": 0.69032913, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.18310547, "step": 16264, "time_per_iteration": 2.869438648223877 }, { "auxiliary_loss_clip": 0.01411716, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.24635291, "balance_loss_mlp": 1.01215708, "epoch": 0.9779047046445213, "flos": 17575244138880.0, "grad_norm": 7.776731454926342, "language_loss": 0.74027693, "learning_rate": 5.105246951967679e-09, "loss": 0.7647028, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 1.65527344, "router_z_loss_mlp": 0.18725586, "step": 16265, "time_per_iteration": 2.8278932571411133 }, { "auxiliary_loss_clip": 0.01388681, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.23267794, "balance_loss_mlp": 1.01687205, "epoch": 0.9779648278971892, "flos": 20750995722240.0, "grad_norm": 1.7885565132865664, "language_loss": 0.69299114, "learning_rate": 5.077475108526297e-09, "loss": 0.71723515, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18859863, "step": 16266, "time_per_iteration": 2.895054817199707 }, { "auxiliary_loss_clip": 0.01382117, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.22856843, "balance_loss_mlp": 1.0114429, "epoch": 0.9780249511498572, "flos": 21035305918080.0, "grad_norm": 1.5747832703190197, "language_loss": 0.87301457, "learning_rate": 5.049778912747049e-09, "loss": 0.89712834, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 1.53613281, "router_z_loss_mlp": 0.17822266, "step": 16267, "time_per_iteration": 2.8854143619537354 }, { "auxiliary_loss_clip": 0.01405007, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.24290586, "balance_loss_mlp": 1.01410842, "epoch": 0.9780850744025251, "flos": 30786550266240.0, "grad_norm": 2.2147559568329256, "language_loss": 0.710051, "learning_rate": 5.022158365679985e-09, "loss": 0.73444217, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20019531, "step": 16268, "time_per_iteration": 2.9728877544403076 }, { "auxiliary_loss_clip": 0.01402904, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.2433989, "balance_loss_mlp": 1.01049244, "epoch": 0.9781451976551931, "flos": 20312653668480.0, "grad_norm": 2.419186640616248, "language_loss": 0.74268174, "learning_rate": 4.994613468372711e-09, "loss": 0.76700389, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18823242, "step": 16269, "time_per_iteration": 2.872086524963379 }, { "auxiliary_loss_clip": 0.01400546, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.23992479, "balance_loss_mlp": 1.01777339, "epoch": 0.9782053209078612, "flos": 24327192199680.0, "grad_norm": 1.7754473883981208, "language_loss": 0.71359897, "learning_rate": 4.967144221869501e-09, "loss": 0.73798436, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.20214844, "step": 16270, "time_per_iteration": 2.877346992492676 }, { "auxiliary_loss_clip": 0.01400866, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.24066472, "balance_loss_mlp": 1.01314068, "epoch": 0.9782654441605291, "flos": 32502292790400.0, "grad_norm": 1.7363722386286664, "language_loss": 0.6492542, "learning_rate": 4.939750627212191e-09, "loss": 0.6735816, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18725586, "step": 16271, "time_per_iteration": 2.9763550758361816 }, { "auxiliary_loss_clip": 0.01388013, "auxiliary_loss_mlp": 0.01035262, "balance_loss_clip": 1.23356962, "balance_loss_mlp": 1.01676047, "epoch": 0.9783255674131971, "flos": 26990255243520.0, "grad_norm": 1.4493076697749112, "language_loss": 0.70685893, "learning_rate": 4.912432685439505e-09, "loss": 0.73109168, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.18505859, "step": 16272, "time_per_iteration": 2.8863015174865723 }, { "auxiliary_loss_clip": 0.01400979, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.23959529, "balance_loss_mlp": 1.0157516, "epoch": 0.978385690665865, "flos": 23122599891840.0, "grad_norm": 1.9122153088387575, "language_loss": 0.67346054, "learning_rate": 4.88519039758728e-09, "loss": 0.69781095, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18322754, "step": 16273, "time_per_iteration": 2.8705954551696777 }, { "auxiliary_loss_clip": 0.01399683, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.23985147, "balance_loss_mlp": 1.01221299, "epoch": 0.978445813918533, "flos": 25420310023680.0, "grad_norm": 1.6421877777147773, "language_loss": 0.74522758, "learning_rate": 4.85802376468869e-09, "loss": 0.76953673, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19006348, "step": 16274, "time_per_iteration": 3.0399856567382812 }, { "auxiliary_loss_clip": 0.01396188, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.23755348, "balance_loss_mlp": 1.01737905, "epoch": 0.9785059371712009, "flos": 23560715721600.0, "grad_norm": 1.5648886168141092, "language_loss": 0.78297043, "learning_rate": 4.830932787773579e-09, "loss": 0.80728883, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.18286133, "step": 16275, "time_per_iteration": 2.9320003986358643 }, { "auxiliary_loss_clip": 0.01400434, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.24000216, "balance_loss_mlp": 1.01199675, "epoch": 0.978566060423869, "flos": 34364782759680.0, "grad_norm": 1.5556579106374215, "language_loss": 0.71560961, "learning_rate": 4.803917467869567e-09, "loss": 0.73992199, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18798828, "step": 16276, "time_per_iteration": 2.9695611000061035 }, { "auxiliary_loss_clip": 0.0137354, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.21849656, "balance_loss_mlp": 1.01378024, "epoch": 0.9786261836765369, "flos": 11626131392640.0, "grad_norm": 2.001994431329105, "language_loss": 0.86880094, "learning_rate": 4.776977806000726e-09, "loss": 0.89286178, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18762207, "step": 16277, "time_per_iteration": 2.800727367401123 }, { "auxiliary_loss_clip": 0.01392511, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.23637843, "balance_loss_mlp": 1.01443458, "epoch": 0.9786863069292049, "flos": 17429944527360.0, "grad_norm": 1.6643018985640836, "language_loss": 0.71468437, "learning_rate": 4.7501138031891264e-09, "loss": 0.73894489, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.19091797, "step": 16278, "time_per_iteration": 2.850175142288208 }, { "auxiliary_loss_clip": 0.01395832, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.23797011, "balance_loss_mlp": 1.01299644, "epoch": 0.9787464301818728, "flos": 20853330756480.0, "grad_norm": 2.1174342178591363, "language_loss": 0.84468228, "learning_rate": 4.723325460453065e-09, "loss": 0.86896133, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19091797, "step": 16279, "time_per_iteration": 4.308953046798706 }, { "auxiliary_loss_clip": 0.01393878, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.23454273, "balance_loss_mlp": 1.01178932, "epoch": 0.9788065534345408, "flos": 18231965435520.0, "grad_norm": 1.8716598543840146, "language_loss": 0.79555768, "learning_rate": 4.696612778808395e-09, "loss": 0.81979978, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18530273, "step": 16280, "time_per_iteration": 2.856933355331421 }, { "auxiliary_loss_clip": 0.01378991, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.22493112, "balance_loss_mlp": 1.01735044, "epoch": 0.9788666766872087, "flos": 21587746654080.0, "grad_norm": 1.7199167863160605, "language_loss": 0.79863888, "learning_rate": 4.669975759268085e-09, "loss": 0.82278812, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 1.54101562, "router_z_loss_mlp": 0.18591309, "step": 16281, "time_per_iteration": 4.477910757064819 }, { "auxiliary_loss_clip": 0.01400145, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.23970044, "balance_loss_mlp": 1.01387262, "epoch": 0.9789267999398767, "flos": 24911422047360.0, "grad_norm": 1.6273026971620754, "language_loss": 0.81003582, "learning_rate": 4.643414402842216e-09, "loss": 0.83435917, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18334961, "step": 16282, "time_per_iteration": 3.020148754119873 }, { "auxiliary_loss_clip": 0.01385973, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.22814369, "balance_loss_mlp": 1.017735, "epoch": 0.9789869231925448, "flos": 19582536026880.0, "grad_norm": 3.3881743940661178, "language_loss": 0.83797395, "learning_rate": 4.616928710538204e-09, "loss": 0.86219627, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18530273, "step": 16283, "time_per_iteration": 2.8674137592315674 }, { "auxiliary_loss_clip": 0.01398831, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.23967314, "balance_loss_mlp": 1.01676643, "epoch": 0.9790470464452127, "flos": 16804424160000.0, "grad_norm": 1.8438729125356759, "language_loss": 0.72703159, "learning_rate": 4.590518683360134e-09, "loss": 0.75137359, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18591309, "step": 16284, "time_per_iteration": 2.8400633335113525 }, { "auxiliary_loss_clip": 0.01389368, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.23391747, "balance_loss_mlp": 1.01551163, "epoch": 0.9791071696978807, "flos": 18378531901440.0, "grad_norm": 1.620209670331508, "language_loss": 0.65028942, "learning_rate": 4.56418432230965e-09, "loss": 0.67452645, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18847656, "step": 16285, "time_per_iteration": 2.8428094387054443 }, { "auxiliary_loss_clip": 0.01391727, "auxiliary_loss_mlp": 0.01031379, "balance_loss_clip": 1.23452771, "balance_loss_mlp": 1.01287806, "epoch": 0.9791672929505486, "flos": 24180942447360.0, "grad_norm": 1.4773658128214413, "language_loss": 0.71537113, "learning_rate": 4.537925628385286e-09, "loss": 0.73960221, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18493652, "step": 16286, "time_per_iteration": 2.8957693576812744 }, { "auxiliary_loss_clip": 0.01380172, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.22505486, "balance_loss_mlp": 1.01553917, "epoch": 0.9792274162032166, "flos": 24365134604160.0, "grad_norm": 1.4819171197318963, "language_loss": 0.59570032, "learning_rate": 4.511742602582691e-09, "loss": 0.61983871, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 1.54980469, "router_z_loss_mlp": 0.18151855, "step": 16287, "time_per_iteration": 2.9106900691986084 }, { "auxiliary_loss_clip": 0.01391968, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.23482251, "balance_loss_mlp": 1.01314747, "epoch": 0.9792875394558845, "flos": 26406930291840.0, "grad_norm": 1.7182501958454888, "language_loss": 0.82415587, "learning_rate": 4.485635245894626e-09, "loss": 0.84840596, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.19885254, "step": 16288, "time_per_iteration": 2.8720617294311523 }, { "auxiliary_loss_clip": 0.01397198, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.23821235, "balance_loss_mlp": 1.01527715, "epoch": 0.9793476627085526, "flos": 28159846058880.0, "grad_norm": 1.392899788630742, "language_loss": 0.72237962, "learning_rate": 4.459603559311631e-09, "loss": 0.74669641, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.1920166, "step": 16289, "time_per_iteration": 2.9140384197235107 }, { "auxiliary_loss_clip": 0.01399165, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.24110365, "balance_loss_mlp": 1.01533318, "epoch": 0.9794077859612205, "flos": 16772680293120.0, "grad_norm": 2.186455798757513, "language_loss": 0.76440865, "learning_rate": 4.43364754382003e-09, "loss": 0.78873646, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18286133, "step": 16290, "time_per_iteration": 5.616847276687622 }, { "auxiliary_loss_clip": 0.01402242, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.23894954, "balance_loss_mlp": 1.01312518, "epoch": 0.9794679092138885, "flos": 19290081767040.0, "grad_norm": 1.5662991233947143, "language_loss": 0.67486227, "learning_rate": 4.4077672004048105e-09, "loss": 0.69921279, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.19702148, "step": 16291, "time_per_iteration": 2.8482697010040283 }, { "auxiliary_loss_clip": 0.0140756, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.24499488, "balance_loss_mlp": 1.01435947, "epoch": 0.9795280324665564, "flos": 32168276542080.0, "grad_norm": 1.6048190983064579, "language_loss": 0.62354088, "learning_rate": 4.3819625300467456e-09, "loss": 0.6479522, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 1.625, "router_z_loss_mlp": 0.19213867, "step": 16292, "time_per_iteration": 2.9441723823547363 }, { "auxiliary_loss_clip": 0.01391748, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.23321724, "balance_loss_mlp": 1.01174593, "epoch": 0.9795881557192244, "flos": 19069892732160.0, "grad_norm": 2.027644088307165, "language_loss": 0.74357748, "learning_rate": 4.356233533724829e-09, "loss": 0.7678014, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18908691, "step": 16293, "time_per_iteration": 2.8586339950561523 }, { "auxiliary_loss_clip": 0.01415149, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.25321293, "balance_loss_mlp": 1.01644373, "epoch": 0.9796482789718923, "flos": 28341685486080.0, "grad_norm": 1.6874131800020478, "language_loss": 0.85017979, "learning_rate": 4.330580212414503e-09, "loss": 0.87468231, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18676758, "step": 16294, "time_per_iteration": 2.8975167274475098 }, { "auxiliary_loss_clip": 0.01377776, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.22491884, "balance_loss_mlp": 1.01625562, "epoch": 0.9797084022245603, "flos": 17977046376960.0, "grad_norm": 2.284729616993463, "language_loss": 0.72993577, "learning_rate": 4.305002567088767e-09, "loss": 0.75406832, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.19213867, "step": 16295, "time_per_iteration": 2.8519980907440186 }, { "auxiliary_loss_clip": 0.01408836, "auxiliary_loss_mlp": 0.01038595, "balance_loss_clip": 1.24683952, "balance_loss_mlp": 1.0190686, "epoch": 0.9797685254772284, "flos": 20276430566400.0, "grad_norm": 1.6793868298773194, "language_loss": 0.80939591, "learning_rate": 4.2795005987170674e-09, "loss": 0.83387017, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 1.61816406, "router_z_loss_mlp": 0.19519043, "step": 16296, "time_per_iteration": 2.8413708209991455 }, { "auxiliary_loss_clip": 0.0139827, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.24080193, "balance_loss_mlp": 1.01404762, "epoch": 0.9798286487298963, "flos": 26918578200960.0, "grad_norm": 1.7431190160309271, "language_loss": 0.75663209, "learning_rate": 4.254074308266853e-09, "loss": 0.78094065, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.18530273, "step": 16297, "time_per_iteration": 2.9021573066711426 }, { "auxiliary_loss_clip": 0.01401074, "auxiliary_loss_mlp": 0.01034545, "balance_loss_clip": 1.2385236, "balance_loss_mlp": 1.01585293, "epoch": 0.9798887719825643, "flos": 27172185160320.0, "grad_norm": 2.736842327065104, "language_loss": 0.78904635, "learning_rate": 4.228723696702019e-09, "loss": 0.81340253, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.18676758, "step": 16298, "time_per_iteration": 2.937523126602173 }, { "auxiliary_loss_clip": 0.01389679, "auxiliary_loss_mlp": 0.01028399, "balance_loss_clip": 1.23440218, "balance_loss_mlp": 1.00977886, "epoch": 0.9799488952352322, "flos": 20678323294080.0, "grad_norm": 1.6884137906118983, "language_loss": 0.7355001, "learning_rate": 4.203448764984019e-09, "loss": 0.75968087, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 1.55175781, "router_z_loss_mlp": 0.1862793, "step": 16299, "time_per_iteration": 2.832148313522339 }, { "auxiliary_loss_clip": 0.01399617, "auxiliary_loss_mlp": 0.0103148, "balance_loss_clip": 1.23832512, "balance_loss_mlp": 1.01288378, "epoch": 0.9800090184879002, "flos": 21991268194560.0, "grad_norm": 2.0957449246246074, "language_loss": 0.90152174, "learning_rate": 4.178249514071419e-09, "loss": 0.92583275, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.18591309, "step": 16300, "time_per_iteration": 2.8857336044311523 }, { "auxiliary_loss_clip": 0.01405069, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.24176288, "balance_loss_mlp": 1.01253021, "epoch": 0.9800691417405681, "flos": 21298414285440.0, "grad_norm": 2.0932251788503846, "language_loss": 0.7942636, "learning_rate": 4.1531259449194555e-09, "loss": 0.81863058, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 1.6328125, "router_z_loss_mlp": 0.19091797, "step": 16301, "time_per_iteration": 2.879556894302368 }, { "auxiliary_loss_clip": 0.01401244, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.2425729, "balance_loss_mlp": 1.01770067, "epoch": 0.9801292649932362, "flos": 18448444396800.0, "grad_norm": 1.87574907858267, "language_loss": 0.76540327, "learning_rate": 4.128078058480921e-09, "loss": 0.78978711, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19421387, "step": 16302, "time_per_iteration": 2.857126235961914 }, { "auxiliary_loss_clip": 0.01402291, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.24362671, "balance_loss_mlp": 1.01518726, "epoch": 0.9801893882459041, "flos": 25057309841280.0, "grad_norm": 3.349300968912559, "language_loss": 0.80016255, "learning_rate": 4.103105855705724e-09, "loss": 0.82452738, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 1.58691406, "router_z_loss_mlp": 0.18994141, "step": 16303, "time_per_iteration": 2.900641918182373 }, { "auxiliary_loss_clip": 0.0140967, "auxiliary_loss_mlp": 0.01033912, "balance_loss_clip": 1.24669874, "balance_loss_mlp": 1.01537466, "epoch": 0.9802495114985721, "flos": 18519714236160.0, "grad_norm": 2.3117070308959518, "language_loss": 0.83624637, "learning_rate": 4.078209337540883e-09, "loss": 0.86068213, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.1854248, "step": 16304, "time_per_iteration": 2.8329830169677734 }, { "auxiliary_loss_clip": 0.01387626, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.23341894, "balance_loss_mlp": 1.01392829, "epoch": 0.98030963475124, "flos": 21479620285440.0, "grad_norm": 1.7510404733696563, "language_loss": 0.71509999, "learning_rate": 4.053388504930089e-09, "loss": 0.7392959, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 1.54199219, "router_z_loss_mlp": 0.18017578, "step": 16305, "time_per_iteration": 2.9039056301116943 }, { "auxiliary_loss_clip": 0.01411074, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.24917269, "balance_loss_mlp": 1.01484561, "epoch": 0.980369758003908, "flos": 20421911157120.0, "grad_norm": 2.645134043452221, "language_loss": 0.7271477, "learning_rate": 4.028643358815032e-09, "loss": 0.75160277, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19580078, "step": 16306, "time_per_iteration": 2.840883493423462 }, { "auxiliary_loss_clip": 0.01378388, "auxiliary_loss_mlp": 0.0103134, "balance_loss_clip": 1.2228663, "balance_loss_mlp": 1.0131135, "epoch": 0.9804298812565759, "flos": 23408312676480.0, "grad_norm": 1.9032003177788999, "language_loss": 0.74548829, "learning_rate": 4.00397390013385e-09, "loss": 0.76958561, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18225098, "step": 16307, "time_per_iteration": 2.9179160594940186 }, { "auxiliary_loss_clip": 0.013737, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.22109962, "balance_loss_mlp": 1.01336288, "epoch": 0.980490004509244, "flos": 23302539037440.0, "grad_norm": 1.7652662549700575, "language_loss": 0.74874175, "learning_rate": 3.979380129822018e-09, "loss": 0.7727837, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 1.52832031, "router_z_loss_mlp": 0.17138672, "step": 16308, "time_per_iteration": 2.8911659717559814 }, { "auxiliary_loss_clip": 0.01180885, "auxiliary_loss_mlp": 0.01039228, "balance_loss_clip": 1.09327066, "balance_loss_mlp": 1.00890124, "epoch": 0.980550127761912, "flos": 56077351113600.0, "grad_norm": 0.7601874219662592, "language_loss": 0.57846069, "learning_rate": 3.954862048811902e-09, "loss": 0.60066187, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.30273438, "step": 16309, "time_per_iteration": 3.24973464012146 }, { "auxiliary_loss_clip": 0.01394804, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.23478055, "balance_loss_mlp": 1.01338887, "epoch": 0.9806102510145799, "flos": 25343022625920.0, "grad_norm": 1.7445918279657047, "language_loss": 0.67349899, "learning_rate": 3.930419658033646e-09, "loss": 0.69776863, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18762207, "step": 16310, "time_per_iteration": 2.8780481815338135 }, { "auxiliary_loss_clip": 0.0117898, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.09201503, "balance_loss_mlp": 1.01071513, "epoch": 0.9806703742672479, "flos": 67309960066560.0, "grad_norm": 0.8210806659183517, "language_loss": 0.54602456, "learning_rate": 3.906052958413841e-09, "loss": 0.56811702, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.1953125, "step": 16311, "time_per_iteration": 3.335137128829956 }, { "auxiliary_loss_clip": 0.01401789, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.24327064, "balance_loss_mlp": 1.01394713, "epoch": 0.9807304975199158, "flos": 25240008919680.0, "grad_norm": 2.7528148001981356, "language_loss": 0.80360031, "learning_rate": 3.881761950876638e-09, "loss": 0.82794696, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18933105, "step": 16312, "time_per_iteration": 2.92051362991333 }, { "auxiliary_loss_clip": 0.01390162, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.23369527, "balance_loss_mlp": 1.00930643, "epoch": 0.9807906207725838, "flos": 17465126999040.0, "grad_norm": 2.560431084668054, "language_loss": 0.63889706, "learning_rate": 3.8575466363430785e-09, "loss": 0.66306788, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.17602539, "step": 16313, "time_per_iteration": 2.8344950675964355 }, { "auxiliary_loss_clip": 0.01390689, "auxiliary_loss_mlp": 0.01035957, "balance_loss_clip": 1.23382485, "balance_loss_mlp": 1.01652575, "epoch": 0.9808507440252517, "flos": 21042183127680.0, "grad_norm": 1.9037445823970505, "language_loss": 0.73120725, "learning_rate": 3.833407015731316e-09, "loss": 0.75547373, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 1.56835938, "router_z_loss_mlp": 0.19433594, "step": 16314, "time_per_iteration": 4.307093143463135 }, { "auxiliary_loss_clip": 0.01178, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 1.09043956, "balance_loss_mlp": 1.01045096, "epoch": 0.9809108672779198, "flos": 64073688923520.0, "grad_norm": 0.6964784399449659, "language_loss": 0.51786762, "learning_rate": 3.80934308995684e-09, "loss": 0.53993905, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.18652344, "step": 16315, "time_per_iteration": 3.298262357711792 }, { "auxiliary_loss_clip": 0.01398186, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.23735845, "balance_loss_mlp": 1.01357865, "epoch": 0.9809709905305877, "flos": 22790031477120.0, "grad_norm": 1.7125208300800534, "language_loss": 0.70373702, "learning_rate": 3.785354859932033e-09, "loss": 0.72803891, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.1842041, "step": 16316, "time_per_iteration": 4.401776313781738 }, { "auxiliary_loss_clip": 0.01393144, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.23213053, "balance_loss_mlp": 1.01174021, "epoch": 0.9810311137832557, "flos": 37027393355520.0, "grad_norm": 2.1005508088311005, "language_loss": 0.55808771, "learning_rate": 3.76144232656661e-09, "loss": 0.58232176, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.1854248, "step": 16317, "time_per_iteration": 3.016195774078369 }, { "auxiliary_loss_clip": 0.01387057, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.23080909, "balance_loss_mlp": 1.01348996, "epoch": 0.9810912370359236, "flos": 18925543261440.0, "grad_norm": 2.5096305686801106, "language_loss": 0.73968148, "learning_rate": 3.737605490767404e-09, "loss": 0.76387393, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18713379, "step": 16318, "time_per_iteration": 2.857858180999756 }, { "auxiliary_loss_clip": 0.0138169, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.22618926, "balance_loss_mlp": 1.01701093, "epoch": 0.9811513602885916, "flos": 18450616147200.0, "grad_norm": 2.1048247137796134, "language_loss": 0.82531714, "learning_rate": 3.7138443534383555e-09, "loss": 0.84948635, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18225098, "step": 16319, "time_per_iteration": 2.8692994117736816 }, { "auxiliary_loss_clip": 0.01177028, "auxiliary_loss_mlp": 0.01019687, "balance_loss_clip": 1.09129691, "balance_loss_mlp": 1.00261641, "epoch": 0.9812114835412595, "flos": 68089449064320.0, "grad_norm": 0.7176720519565054, "language_loss": 0.53626126, "learning_rate": 3.6901589154803014e-09, "loss": 0.55822843, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.17089844, "step": 16320, "time_per_iteration": 3.265028953552246 }, { "auxiliary_loss_clip": 0.01401918, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.24187636, "balance_loss_mlp": 1.01452088, "epoch": 0.9812716067939276, "flos": 25383589228800.0, "grad_norm": 1.9872695036795303, "language_loss": 0.73973334, "learning_rate": 3.6665491777914116e-09, "loss": 0.76408237, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.18469238, "step": 16321, "time_per_iteration": 2.9811341762542725 }, { "auxiliary_loss_clip": 0.01389594, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.23382437, "balance_loss_mlp": 1.01390481, "epoch": 0.9813317300465956, "flos": 22867092650880.0, "grad_norm": 1.6440586777003658, "language_loss": 0.79545784, "learning_rate": 3.6430151412669698e-09, "loss": 0.81967252, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.1796875, "step": 16322, "time_per_iteration": 2.8677475452423096 }, { "auxiliary_loss_clip": 0.01390874, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.23178005, "balance_loss_mlp": 1.01609039, "epoch": 0.9813918532992635, "flos": 23597527006080.0, "grad_norm": 1.8544480247367832, "language_loss": 0.81633532, "learning_rate": 3.619556806799595e-09, "loss": 0.84059596, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.19091797, "step": 16323, "time_per_iteration": 2.8989927768707275 }, { "auxiliary_loss_clip": 0.01405556, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.24270999, "balance_loss_mlp": 1.01482272, "epoch": 0.9814519765519315, "flos": 19614913320960.0, "grad_norm": 6.298577167984119, "language_loss": 0.85500985, "learning_rate": 3.596174175278799e-09, "loss": 0.87940764, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.19396973, "step": 16324, "time_per_iteration": 4.24781346321106 }, { "auxiliary_loss_clip": 0.01396656, "auxiliary_loss_mlp": 0.01031398, "balance_loss_clip": 1.23775065, "balance_loss_mlp": 1.01297975, "epoch": 0.9815120998045994, "flos": 33958863244800.0, "grad_norm": 1.353887722845431, "language_loss": 0.75175053, "learning_rate": 3.5728672475909827e-09, "loss": 0.77603108, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18408203, "step": 16325, "time_per_iteration": 4.434406757354736 }, { "auxiliary_loss_clip": 0.01383666, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.2302556, "balance_loss_mlp": 1.01609325, "epoch": 0.9815722230572674, "flos": 20859755518080.0, "grad_norm": 1.6261691932796574, "language_loss": 0.76900005, "learning_rate": 3.5496360246201063e-09, "loss": 0.79318333, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.18579102, "step": 16326, "time_per_iteration": 2.855483293533325 }, { "auxiliary_loss_clip": 0.01407485, "auxiliary_loss_mlp": 0.01033773, "balance_loss_clip": 1.24645972, "balance_loss_mlp": 1.01450849, "epoch": 0.9816323463099353, "flos": 22905125544960.0, "grad_norm": 1.7289096153173702, "language_loss": 0.6859405, "learning_rate": 3.5264805072470205e-09, "loss": 0.71035308, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.19250488, "step": 16327, "time_per_iteration": 2.861799955368042 }, { "auxiliary_loss_clip": 0.01413633, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.24842799, "balance_loss_mlp": 1.01474786, "epoch": 0.9816924695626034, "flos": 31551714645120.0, "grad_norm": 1.4978965196358367, "language_loss": 0.73972952, "learning_rate": 3.5034006963501337e-09, "loss": 0.76421046, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 1.65234375, "router_z_loss_mlp": 0.19726562, "step": 16328, "time_per_iteration": 2.927619218826294 }, { "auxiliary_loss_clip": 0.01435894, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.26747417, "balance_loss_mlp": 1.01930737, "epoch": 0.9817525928152713, "flos": 21516838773120.0, "grad_norm": 1.778962727529593, "language_loss": 0.82399046, "learning_rate": 3.4803965928040802e-09, "loss": 0.84872901, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 1.68261719, "router_z_loss_mlp": 0.18664551, "step": 16329, "time_per_iteration": 2.8557119369506836 }, { "auxiliary_loss_clip": 0.01412318, "auxiliary_loss_mlp": 0.01033117, "balance_loss_clip": 1.24899113, "balance_loss_mlp": 1.01420999, "epoch": 0.9818127160679393, "flos": 25559863545600.0, "grad_norm": 1.8466984731399236, "language_loss": 0.77100396, "learning_rate": 3.4574681974817168e-09, "loss": 0.79545832, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 1.63183594, "router_z_loss_mlp": 0.18920898, "step": 16330, "time_per_iteration": 2.9195094108581543 }, { "auxiliary_loss_clip": 0.01430675, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.25955737, "balance_loss_mlp": 1.01302314, "epoch": 0.9818728393206072, "flos": 28815390990720.0, "grad_norm": 2.2335169401631267, "language_loss": 0.66965783, "learning_rate": 3.434615511252126e-09, "loss": 0.69429767, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 1.71289062, "router_z_loss_mlp": 0.20288086, "step": 16331, "time_per_iteration": 2.885392427444458 }, { "auxiliary_loss_clip": 0.01395363, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 1.23685288, "balance_loss_mlp": 1.01321447, "epoch": 0.9819329625732752, "flos": 23232762276480.0, "grad_norm": 1.6956179259388642, "language_loss": 0.74645454, "learning_rate": 3.411838534981948e-09, "loss": 0.77072704, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18652344, "step": 16332, "time_per_iteration": 2.8638341426849365 }, { "auxiliary_loss_clip": 0.01390591, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.23213017, "balance_loss_mlp": 1.01103091, "epoch": 0.9819930858259431, "flos": 17539473484800.0, "grad_norm": 2.74879588440532, "language_loss": 0.77271914, "learning_rate": 3.389137269534936e-09, "loss": 0.79690945, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.17407227, "step": 16333, "time_per_iteration": 2.807950496673584 }, { "auxiliary_loss_clip": 0.0139534, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.23599482, "balance_loss_mlp": 1.01264524, "epoch": 0.9820532090786112, "flos": 12537138320640.0, "grad_norm": 2.0958041519038164, "language_loss": 0.74219966, "learning_rate": 3.366511715771958e-09, "loss": 0.76646334, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.18383789, "step": 16334, "time_per_iteration": 2.817504644393921 }, { "auxiliary_loss_clip": 0.0140219, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.24142075, "balance_loss_mlp": 1.01798677, "epoch": 0.9821133323312792, "flos": 18848572577280.0, "grad_norm": 2.779586204860118, "language_loss": 0.79093903, "learning_rate": 3.3439618745509934e-09, "loss": 0.81532896, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18798828, "step": 16335, "time_per_iteration": 2.849583387374878 }, { "auxiliary_loss_clip": 0.01405568, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.24272227, "balance_loss_mlp": 1.01850319, "epoch": 0.9821734555839471, "flos": 34837357144320.0, "grad_norm": 36.7807906505841, "language_loss": 0.65588474, "learning_rate": 3.3214877467271362e-09, "loss": 0.68032199, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19665527, "step": 16336, "time_per_iteration": 2.9764039516448975 }, { "auxiliary_loss_clip": 0.01404347, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.23972666, "balance_loss_mlp": 1.01747704, "epoch": 0.9822335788366151, "flos": 17136856840320.0, "grad_norm": 1.798625558581924, "language_loss": 0.74381649, "learning_rate": 3.299089333152372e-09, "loss": 0.76823378, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19885254, "step": 16337, "time_per_iteration": 2.8278439044952393 }, { "auxiliary_loss_clip": 0.01394576, "auxiliary_loss_mlp": 0.01037897, "balance_loss_clip": 1.23308909, "balance_loss_mlp": 1.01769078, "epoch": 0.982293702089283, "flos": 20822898988800.0, "grad_norm": 1.594928490681411, "language_loss": 0.73806381, "learning_rate": 3.2767666346764645e-09, "loss": 0.76238853, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.20214844, "step": 16338, "time_per_iteration": 2.867870569229126 }, { "auxiliary_loss_clip": 0.01399886, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.23954272, "balance_loss_mlp": 1.02140856, "epoch": 0.982353825341951, "flos": 24691504481280.0, "grad_norm": 1.7318630898245948, "language_loss": 0.82135737, "learning_rate": 3.2545196521454045e-09, "loss": 0.84576118, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.1907959, "step": 16339, "time_per_iteration": 2.9090890884399414 }, { "auxiliary_loss_clip": 0.01386984, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.23142767, "balance_loss_mlp": 1.0121665, "epoch": 0.982413948594619, "flos": 20860343700480.0, "grad_norm": 1.6831310070160217, "language_loss": 0.62820965, "learning_rate": 3.232348386403405e-09, "loss": 0.6523906, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.1895752, "step": 16340, "time_per_iteration": 2.8368945121765137 }, { "auxiliary_loss_clip": 0.01407641, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.24622583, "balance_loss_mlp": 1.01109552, "epoch": 0.982474071847287, "flos": 15385750865280.0, "grad_norm": 2.980301633031861, "language_loss": 0.86782217, "learning_rate": 3.2102528382904613e-09, "loss": 0.89220321, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19372559, "step": 16341, "time_per_iteration": 2.9089434146881104 }, { "auxiliary_loss_clip": 0.01380327, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.22555661, "balance_loss_mlp": 1.01084125, "epoch": 0.9825341950999549, "flos": 23786741335680.0, "grad_norm": 2.9144734842749567, "language_loss": 0.67235738, "learning_rate": 3.188233008645014e-09, "loss": 0.69645607, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.18701172, "step": 16342, "time_per_iteration": 2.9485669136047363 }, { "auxiliary_loss_clip": 0.01402154, "auxiliary_loss_mlp": 0.01029666, "balance_loss_clip": 1.2407583, "balance_loss_mlp": 1.01022315, "epoch": 0.9825943183526229, "flos": 22756251594240.0, "grad_norm": 1.7507930586633487, "language_loss": 0.77544034, "learning_rate": 3.16628889830195e-09, "loss": 0.79975855, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19445801, "step": 16343, "time_per_iteration": 2.8317768573760986 }, { "auxiliary_loss_clip": 0.01397975, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.24077928, "balance_loss_mlp": 1.01368237, "epoch": 0.9826544416052908, "flos": 27721639739520.0, "grad_norm": 1.5365773974449652, "language_loss": 0.75830626, "learning_rate": 3.1444205080932707e-09, "loss": 0.78260481, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18188477, "step": 16344, "time_per_iteration": 2.887556791305542 }, { "auxiliary_loss_clip": 0.01412385, "auxiliary_loss_mlp": 0.01035542, "balance_loss_clip": 1.25207484, "balance_loss_mlp": 1.01726723, "epoch": 0.9827145648579588, "flos": 26952584307840.0, "grad_norm": 2.122693524862932, "language_loss": 0.67261279, "learning_rate": 3.122627838848313e-09, "loss": 0.69709206, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18286133, "step": 16345, "time_per_iteration": 2.8643970489501953 }, { "auxiliary_loss_clip": 0.01385296, "auxiliary_loss_mlp": 0.01030047, "balance_loss_clip": 1.22998023, "balance_loss_mlp": 1.01168847, "epoch": 0.9827746881106267, "flos": 21875223985920.0, "grad_norm": 2.2036909212656606, "language_loss": 0.80049175, "learning_rate": 3.1009108913933045e-09, "loss": 0.82464516, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18347168, "step": 16346, "time_per_iteration": 2.870227336883545 }, { "auxiliary_loss_clip": 0.01423699, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.25728202, "balance_loss_mlp": 1.01381326, "epoch": 0.9828348113632948, "flos": 20860705658880.0, "grad_norm": 2.021880516125609, "language_loss": 0.76154137, "learning_rate": 3.079269666552031e-09, "loss": 0.78611028, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19384766, "step": 16347, "time_per_iteration": 2.8142223358154297 }, { "auxiliary_loss_clip": 0.01379999, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.2249949, "balance_loss_mlp": 1.01809525, "epoch": 0.9828949346159628, "flos": 34582664309760.0, "grad_norm": 1.708137141615956, "language_loss": 0.68188095, "learning_rate": 3.0577041651449474e-09, "loss": 0.70604777, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.18591309, "step": 16348, "time_per_iteration": 2.9624292850494385 }, { "auxiliary_loss_clip": 0.01393618, "auxiliary_loss_mlp": 0.01034029, "balance_loss_clip": 1.23579335, "balance_loss_mlp": 1.01422811, "epoch": 0.9829550578686307, "flos": 24466926700800.0, "grad_norm": 3.5809937518734167, "language_loss": 0.70032036, "learning_rate": 3.0362143879898437e-09, "loss": 0.72459674, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19812012, "step": 16349, "time_per_iteration": 4.2957799434661865 }, { "auxiliary_loss_clip": 0.01369055, "auxiliary_loss_mlp": 0.0103025, "balance_loss_clip": 1.21643162, "balance_loss_mlp": 1.01173651, "epoch": 0.9830151811212987, "flos": 16918930045440.0, "grad_norm": 1.8589675914643207, "language_loss": 0.75835854, "learning_rate": 3.0148003359014018e-09, "loss": 0.78235161, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 1.52636719, "router_z_loss_mlp": 0.18518066, "step": 16350, "time_per_iteration": 2.8139514923095703 }, { "auxiliary_loss_clip": 0.0140304, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.24248648, "balance_loss_mlp": 1.01557326, "epoch": 0.9830753043739666, "flos": 21298278551040.0, "grad_norm": 2.516635311641066, "language_loss": 0.8511833, "learning_rate": 2.9934620096920826e-09, "loss": 0.87556893, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19934082, "step": 16351, "time_per_iteration": 4.262692213058472 }, { "auxiliary_loss_clip": 0.01393776, "auxiliary_loss_mlp": 0.01030232, "balance_loss_clip": 1.23487878, "balance_loss_mlp": 1.01144421, "epoch": 0.9831354276266346, "flos": 31736268760320.0, "grad_norm": 1.5599381867491418, "language_loss": 0.68913257, "learning_rate": 2.972199410170795e-09, "loss": 0.71337265, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18786621, "step": 16352, "time_per_iteration": 2.9057857990264893 }, { "auxiliary_loss_clip": 0.01394509, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.23653221, "balance_loss_mlp": 1.01609898, "epoch": 0.9831955508793025, "flos": 21629715845760.0, "grad_norm": 1.35473531271265, "language_loss": 0.67110133, "learning_rate": 2.951012538143782e-09, "loss": 0.69539773, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19030762, "step": 16353, "time_per_iteration": 2.845715045928955 }, { "auxiliary_loss_clip": 0.01380957, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.22571826, "balance_loss_mlp": 1.01553202, "epoch": 0.9832556741319706, "flos": 22979019582720.0, "grad_norm": 1.500469136580036, "language_loss": 0.75142503, "learning_rate": 2.9299013944144025e-09, "loss": 0.7755748, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.18505859, "step": 16354, "time_per_iteration": 2.904158353805542 }, { "auxiliary_loss_clip": 0.01388183, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.23123801, "balance_loss_mlp": 1.01439595, "epoch": 0.9833157973846385, "flos": 21333642001920.0, "grad_norm": 2.4553363274230398, "language_loss": 0.78386176, "learning_rate": 2.9088659797835702e-09, "loss": 0.80806804, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18066406, "step": 16355, "time_per_iteration": 2.8519859313964844 }, { "auxiliary_loss_clip": 0.01395761, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.23781109, "balance_loss_mlp": 1.01573634, "epoch": 0.9833759206373065, "flos": 21078451474560.0, "grad_norm": 4.805718133736267, "language_loss": 0.73910356, "learning_rate": 2.8879062950484256e-09, "loss": 0.76340461, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18615723, "step": 16356, "time_per_iteration": 2.8277242183685303 }, { "auxiliary_loss_clip": 0.01396647, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.23926568, "balance_loss_mlp": 1.01123023, "epoch": 0.9834360438899744, "flos": 18706485346560.0, "grad_norm": 2.0552353896871205, "language_loss": 0.76903188, "learning_rate": 2.8670223410041104e-09, "loss": 0.79330111, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19042969, "step": 16357, "time_per_iteration": 2.824057102203369 }, { "auxiliary_loss_clip": 0.01394216, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.23582339, "balance_loss_mlp": 1.01082444, "epoch": 0.9834961671426424, "flos": 21114991290240.0, "grad_norm": 2.125172688605168, "language_loss": 0.8134706, "learning_rate": 2.846214118442436e-09, "loss": 0.83771765, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19665527, "step": 16358, "time_per_iteration": 2.8243868350982666 }, { "auxiliary_loss_clip": 0.01403278, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.24454403, "balance_loss_mlp": 1.01217651, "epoch": 0.9835562903953103, "flos": 26698841614080.0, "grad_norm": 2.1338256830293956, "language_loss": 0.68725592, "learning_rate": 2.8254816281523263e-09, "loss": 0.71159214, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.1817627, "step": 16359, "time_per_iteration": 4.44164776802063 }, { "auxiliary_loss_clip": 0.01389552, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.23226833, "balance_loss_mlp": 1.01293278, "epoch": 0.9836164136479784, "flos": 22100118480000.0, "grad_norm": 2.284236987211495, "language_loss": 0.7000975, "learning_rate": 2.804824870920264e-09, "loss": 0.72430223, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.17980957, "step": 16360, "time_per_iteration": 4.3412346839904785 }, { "auxiliary_loss_clip": 0.01403123, "auxiliary_loss_mlp": 0.01034994, "balance_loss_clip": 1.24288607, "balance_loss_mlp": 1.01479983, "epoch": 0.9836765369006463, "flos": 23888940635520.0, "grad_norm": 1.6961491125983499, "language_loss": 0.84604245, "learning_rate": 2.7842438475293996e-09, "loss": 0.87042356, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.2019043, "step": 16361, "time_per_iteration": 3.0159943103790283 }, { "auxiliary_loss_clip": 0.01392089, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.23346114, "balance_loss_mlp": 1.0115943, "epoch": 0.9837366601533143, "flos": 25855122983040.0, "grad_norm": 1.6044526467163782, "language_loss": 0.76481867, "learning_rate": 2.76373855876022e-09, "loss": 0.78903604, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18054199, "step": 16362, "time_per_iteration": 2.928985595703125 }, { "auxiliary_loss_clip": 0.01399211, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.23991418, "balance_loss_mlp": 1.01541543, "epoch": 0.9837967834059823, "flos": 21367150416000.0, "grad_norm": 1.7239013868153286, "language_loss": 0.71955121, "learning_rate": 2.7433090053901043e-09, "loss": 0.74388403, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18676758, "step": 16363, "time_per_iteration": 2.8677234649658203 }, { "auxiliary_loss_clip": 0.01381781, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.22852707, "balance_loss_mlp": 1.01343012, "epoch": 0.9838569066586502, "flos": 18525143612160.0, "grad_norm": 1.6162151351248686, "language_loss": 0.63508821, "learning_rate": 2.7229551881937653e-09, "loss": 0.65921849, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 1.53320312, "router_z_loss_mlp": 0.17822266, "step": 16364, "time_per_iteration": 2.808624029159546 }, { "auxiliary_loss_clip": 0.01398042, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.23892069, "balance_loss_mlp": 1.01218867, "epoch": 0.9839170299113182, "flos": 22461761318400.0, "grad_norm": 1.6708738058559187, "language_loss": 0.75448823, "learning_rate": 2.702677107943252e-09, "loss": 0.77876347, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17297363, "step": 16365, "time_per_iteration": 2.877235174179077 }, { "auxiliary_loss_clip": 0.01389255, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.23222971, "balance_loss_mlp": 1.01326513, "epoch": 0.9839771531639862, "flos": 27904022104320.0, "grad_norm": 1.604459488239463, "language_loss": 0.76975167, "learning_rate": 2.6824747654072832e-09, "loss": 0.79395252, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17590332, "step": 16366, "time_per_iteration": 2.9044857025146484 }, { "auxiliary_loss_clip": 0.01385916, "auxiliary_loss_mlp": 0.01032984, "balance_loss_clip": 1.22960329, "balance_loss_mlp": 1.01464963, "epoch": 0.9840372764166542, "flos": 28224691136640.0, "grad_norm": 1.7194003972455227, "language_loss": 0.77541733, "learning_rate": 2.662348161352357e-09, "loss": 0.79960632, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18334961, "step": 16367, "time_per_iteration": 2.872946262359619 }, { "auxiliary_loss_clip": 0.01398219, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.24057591, "balance_loss_mlp": 1.01676273, "epoch": 0.9840973996693221, "flos": 23414375479680.0, "grad_norm": 2.696641913280829, "language_loss": 0.61740375, "learning_rate": 2.642297296540974e-09, "loss": 0.64174175, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18835449, "step": 16368, "time_per_iteration": 2.904874563217163 }, { "auxiliary_loss_clip": 0.01382313, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.22839928, "balance_loss_mlp": 1.01613498, "epoch": 0.9841575229219901, "flos": 21405364289280.0, "grad_norm": 2.4962731170714374, "language_loss": 0.66281772, "learning_rate": 2.6223221717340816e-09, "loss": 0.68697852, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.17651367, "step": 16369, "time_per_iteration": 2.8288822174072266 }, { "auxiliary_loss_clip": 0.01393065, "auxiliary_loss_mlp": 0.01032102, "balance_loss_clip": 1.23325992, "balance_loss_mlp": 1.01321971, "epoch": 0.984217646174658, "flos": 24475251744000.0, "grad_norm": 1.6031460700587432, "language_loss": 0.68616229, "learning_rate": 2.6024227876886295e-09, "loss": 0.71041393, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18884277, "step": 16370, "time_per_iteration": 2.8937289714813232 }, { "auxiliary_loss_clip": 0.01404385, "auxiliary_loss_mlp": 0.01034008, "balance_loss_clip": 1.24368763, "balance_loss_mlp": 1.01427865, "epoch": 0.984277769427326, "flos": 16443595728000.0, "grad_norm": 2.087007185534773, "language_loss": 0.74735892, "learning_rate": 2.582599145159792e-09, "loss": 0.77174282, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.1973877, "step": 16371, "time_per_iteration": 2.8109121322631836 }, { "auxiliary_loss_clip": 0.01183102, "auxiliary_loss_mlp": 0.01053289, "balance_loss_clip": 1.09419525, "balance_loss_mlp": 1.02467859, "epoch": 0.9843378926799939, "flos": 64563003884160.0, "grad_norm": 0.781229367967472, "language_loss": 0.65256512, "learning_rate": 2.562851244898745e-09, "loss": 0.67492902, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 0.890625, "router_z_loss_mlp": 0.28515625, "step": 16372, "time_per_iteration": 3.348527669906616 }, { "auxiliary_loss_clip": 0.01386593, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.22959805, "balance_loss_mlp": 1.010849, "epoch": 0.984398015932662, "flos": 17391368695680.0, "grad_norm": 1.7264321663506492, "language_loss": 0.71189058, "learning_rate": 2.5431790876544456e-09, "loss": 0.73604369, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.17871094, "step": 16373, "time_per_iteration": 2.8333990573883057 }, { "auxiliary_loss_clip": 0.0138492, "auxiliary_loss_mlp": 0.01030509, "balance_loss_clip": 1.22872269, "balance_loss_mlp": 1.01180482, "epoch": 0.9844581391853299, "flos": 23889800286720.0, "grad_norm": 1.6993230314119452, "language_loss": 0.82217801, "learning_rate": 2.523582674173186e-09, "loss": 0.84633231, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18713379, "step": 16374, "time_per_iteration": 2.94132137298584 }, { "auxiliary_loss_clip": 0.01390964, "auxiliary_loss_mlp": 0.01033234, "balance_loss_clip": 1.23189664, "balance_loss_mlp": 1.01433945, "epoch": 0.9845182624379979, "flos": 19874945041920.0, "grad_norm": 1.7374782127016828, "language_loss": 0.69914925, "learning_rate": 2.504062005197927e-09, "loss": 0.72339123, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.18896484, "step": 16375, "time_per_iteration": 2.922139883041382 }, { "auxiliary_loss_clip": 0.0141248, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.25130475, "balance_loss_mlp": 1.01791394, "epoch": 0.9845783856906659, "flos": 28265800677120.0, "grad_norm": 1.8013628094840444, "language_loss": 0.81601644, "learning_rate": 2.484617081468521e-09, "loss": 0.84051436, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19372559, "step": 16376, "time_per_iteration": 2.9252190589904785 }, { "auxiliary_loss_clip": 0.0139016, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.23375154, "balance_loss_mlp": 1.01452506, "epoch": 0.9846385089433338, "flos": 28339739959680.0, "grad_norm": 1.4930984954441606, "language_loss": 0.63119018, "learning_rate": 2.4652479037228224e-09, "loss": 0.65542233, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18530273, "step": 16377, "time_per_iteration": 2.9014320373535156 }, { "auxiliary_loss_clip": 0.01403505, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.24304891, "balance_loss_mlp": 1.0147543, "epoch": 0.9846986321960018, "flos": 24327554158080.0, "grad_norm": 1.7073433586366624, "language_loss": 0.74081969, "learning_rate": 2.445954472695133e-09, "loss": 0.76518875, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18640137, "step": 16378, "time_per_iteration": 2.8990511894226074 }, { "auxiliary_loss_clip": 0.01397814, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.23867548, "balance_loss_mlp": 1.01396322, "epoch": 0.9847587554486698, "flos": 27283523909760.0, "grad_norm": 1.6439877443565571, "language_loss": 0.71049571, "learning_rate": 2.426736789116868e-09, "loss": 0.73479688, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18322754, "step": 16379, "time_per_iteration": 2.9230332374572754 }, { "auxiliary_loss_clip": 0.0140022, "auxiliary_loss_mlp": 0.01032217, "balance_loss_clip": 1.23968482, "balance_loss_mlp": 1.01339364, "epoch": 0.9848188787013378, "flos": 16550817200640.0, "grad_norm": 1.819288466970461, "language_loss": 0.6921615, "learning_rate": 2.407594853716999e-09, "loss": 0.71648586, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18811035, "step": 16380, "time_per_iteration": 2.8730764389038086 }, { "auxiliary_loss_clip": 0.01408244, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.24382401, "balance_loss_mlp": 1.01386571, "epoch": 0.9848790019540057, "flos": 20203305690240.0, "grad_norm": 1.9020148737699714, "language_loss": 0.79697436, "learning_rate": 2.38852866722139e-09, "loss": 0.82139087, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 1.64355469, "router_z_loss_mlp": 0.1953125, "step": 16381, "time_per_iteration": 2.8677265644073486 }, { "auxiliary_loss_clip": 0.01402657, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.2415123, "balance_loss_mlp": 1.01274383, "epoch": 0.9849391252066737, "flos": 28272180193920.0, "grad_norm": 1.4761322258319163, "language_loss": 0.83083123, "learning_rate": 2.3695382303527965e-09, "loss": 0.85517442, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.18920898, "step": 16382, "time_per_iteration": 2.9514949321746826 }, { "auxiliary_loss_clip": 0.01411039, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.24681437, "balance_loss_mlp": 1.01194251, "epoch": 0.9849992484593416, "flos": 22464973699200.0, "grad_norm": 1.8069972822303069, "language_loss": 0.74937087, "learning_rate": 2.3506235438315316e-09, "loss": 0.77379322, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.19250488, "step": 16383, "time_per_iteration": 2.8254432678222656 }, { "auxiliary_loss_clip": 0.0140342, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.24305797, "balance_loss_mlp": 1.01377344, "epoch": 0.9850593717120096, "flos": 34510851532800.0, "grad_norm": 1.7567342576549105, "language_loss": 0.67192638, "learning_rate": 2.3317846083750203e-09, "loss": 0.69629109, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19274902, "step": 16384, "time_per_iteration": 4.360185623168945 }, { "auxiliary_loss_clip": 0.01406865, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.24487042, "balance_loss_mlp": 1.01675141, "epoch": 0.9851194949646775, "flos": 38851624206720.0, "grad_norm": 1.747995428822308, "language_loss": 0.71622294, "learning_rate": 2.313021424697359e-09, "loss": 0.74065107, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.1920166, "step": 16385, "time_per_iteration": 2.983834981918335 }, { "auxiliary_loss_clip": 0.01409948, "auxiliary_loss_mlp": 0.01032779, "balance_loss_clip": 1.24913669, "balance_loss_mlp": 1.01388419, "epoch": 0.9851796182173456, "flos": 17721403401600.0, "grad_norm": 1.8843443149609969, "language_loss": 0.82167351, "learning_rate": 2.294333993509978e-09, "loss": 0.84610087, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.18896484, "step": 16386, "time_per_iteration": 4.316385269165039 }, { "auxiliary_loss_clip": 0.0140934, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.2483269, "balance_loss_mlp": 1.01717424, "epoch": 0.9852397414700135, "flos": 27465861029760.0, "grad_norm": 2.0028294475262167, "language_loss": 0.68844473, "learning_rate": 2.2757223155216442e-09, "loss": 0.71290493, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.19519043, "step": 16387, "time_per_iteration": 2.8710286617279053 }, { "auxiliary_loss_clip": 0.01379441, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.22604513, "balance_loss_mlp": 1.0144937, "epoch": 0.9852998647226815, "flos": 18305949962880.0, "grad_norm": 2.530714127502147, "language_loss": 0.75028926, "learning_rate": 2.257186391438237e-09, "loss": 0.77440894, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.18041992, "step": 16388, "time_per_iteration": 2.824612855911255 }, { "auxiliary_loss_clip": 0.0139284, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.23440742, "balance_loss_mlp": 1.01335168, "epoch": 0.9853599879753495, "flos": 19651091178240.0, "grad_norm": 2.836748547456495, "language_loss": 0.82969445, "learning_rate": 2.238726221962528e-09, "loss": 0.85393667, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.18041992, "step": 16389, "time_per_iteration": 2.824563503265381 }, { "auxiliary_loss_clip": 0.01390799, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.23236752, "balance_loss_mlp": 1.00969088, "epoch": 0.9854201112280174, "flos": 23852491309440.0, "grad_norm": 4.504824147853173, "language_loss": 0.67813128, "learning_rate": 2.2203418077946234e-09, "loss": 0.7023285, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 1.58398438, "router_z_loss_mlp": 0.19226074, "step": 16390, "time_per_iteration": 2.8358211517333984 }, { "auxiliary_loss_clip": 0.01417146, "auxiliary_loss_mlp": 0.01038451, "balance_loss_clip": 1.25562644, "balance_loss_mlp": 1.01929426, "epoch": 0.9854802344806854, "flos": 30092610481920.0, "grad_norm": 1.767111895931276, "language_loss": 0.77645439, "learning_rate": 2.2020331496312994e-09, "loss": 0.80101037, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 1.61621094, "router_z_loss_mlp": 0.19165039, "step": 16391, "time_per_iteration": 2.9367012977600098 }, { "auxiliary_loss_clip": 0.0137723, "auxiliary_loss_mlp": 0.01035527, "balance_loss_clip": 1.22404671, "balance_loss_mlp": 1.01677513, "epoch": 0.9855403577333534, "flos": 21917238422400.0, "grad_norm": 3.719820029087447, "language_loss": 0.69277596, "learning_rate": 2.1838002481673333e-09, "loss": 0.71690357, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.18762207, "step": 16392, "time_per_iteration": 2.8416285514831543 }, { "auxiliary_loss_clip": 0.01419583, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.25276935, "balance_loss_mlp": 1.01278389, "epoch": 0.9856004809860214, "flos": 15422154946560.0, "grad_norm": 7.0796075759043156, "language_loss": 0.57301009, "learning_rate": 2.1656431040937286e-09, "loss": 0.59753799, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 1.66894531, "router_z_loss_mlp": 0.2043457, "step": 16393, "time_per_iteration": 2.8223114013671875 }, { "auxiliary_loss_clip": 0.01399772, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.2355479, "balance_loss_mlp": 1.01385808, "epoch": 0.9856606042386893, "flos": 13658697141120.0, "grad_norm": 3.411685874541053, "language_loss": 0.79819244, "learning_rate": 2.1475617180990444e-09, "loss": 0.82251513, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18615723, "step": 16394, "time_per_iteration": 2.779979944229126 }, { "auxiliary_loss_clip": 0.01402366, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.24078119, "balance_loss_mlp": 1.0136342, "epoch": 0.9857207274913573, "flos": 23490124554240.0, "grad_norm": 1.4687254398454892, "language_loss": 0.7671209, "learning_rate": 2.129556090869178e-09, "loss": 0.79146796, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.18701172, "step": 16395, "time_per_iteration": 4.313923120498657 }, { "auxiliary_loss_clip": 0.01400235, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.24263501, "balance_loss_mlp": 1.01274538, "epoch": 0.9857808507440252, "flos": 21074379442560.0, "grad_norm": 1.8700402348234162, "language_loss": 0.7590099, "learning_rate": 2.1116262230866933e-09, "loss": 0.78332961, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18994141, "step": 16396, "time_per_iteration": 4.341885328292847 }, { "auxiliary_loss_clip": 0.0139445, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.23705721, "balance_loss_mlp": 1.01344526, "epoch": 0.9858409739966932, "flos": 25312274144640.0, "grad_norm": 1.4407267834413362, "language_loss": 0.71642005, "learning_rate": 2.0937721154317133e-09, "loss": 0.74068719, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18835449, "step": 16397, "time_per_iteration": 2.9244003295898438 }, { "auxiliary_loss_clip": 0.01386336, "auxiliary_loss_mlp": 0.01032595, "balance_loss_clip": 1.23293519, "balance_loss_mlp": 1.01464248, "epoch": 0.9859010972493611, "flos": 20568568112640.0, "grad_norm": 1.6054343834615512, "language_loss": 0.7226454, "learning_rate": 2.0759937685810304e-09, "loss": 0.7468347, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 1.53417969, "router_z_loss_mlp": 0.17944336, "step": 16398, "time_per_iteration": 2.880894660949707 }, { "auxiliary_loss_clip": 0.01401069, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.24321866, "balance_loss_mlp": 1.01409674, "epoch": 0.9859612205020292, "flos": 24765986701440.0, "grad_norm": 1.5097557565261428, "language_loss": 0.74801075, "learning_rate": 2.058291183208771e-09, "loss": 0.77234828, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18591309, "step": 16399, "time_per_iteration": 2.921234369277954 }, { "auxiliary_loss_clip": 0.0139886, "auxiliary_loss_mlp": 0.01035005, "balance_loss_clip": 1.23905742, "balance_loss_mlp": 1.01489472, "epoch": 0.9860213437546971, "flos": 21115760451840.0, "grad_norm": 1.8494235383744735, "language_loss": 0.58158511, "learning_rate": 2.0406643599863993e-09, "loss": 0.60592377, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.20117188, "step": 16400, "time_per_iteration": 2.8620755672454834 }, { "auxiliary_loss_clip": 0.01422207, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.25506163, "balance_loss_mlp": 1.01473093, "epoch": 0.9860814670073651, "flos": 19145596561920.0, "grad_norm": 2.1286804973289697, "language_loss": 0.81499112, "learning_rate": 2.023113299582491e-09, "loss": 0.83955681, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 1.67285156, "router_z_loss_mlp": 0.19641113, "step": 16401, "time_per_iteration": 2.914907693862915 }, { "auxiliary_loss_clip": 0.01386358, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.22993231, "balance_loss_mlp": 1.01233459, "epoch": 0.9861415902600331, "flos": 17245616636160.0, "grad_norm": 1.7411081070866785, "language_loss": 0.79070008, "learning_rate": 2.005638002662069e-09, "loss": 0.81487978, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.19262695, "step": 16402, "time_per_iteration": 2.902890205383301 }, { "auxiliary_loss_clip": 0.01395688, "auxiliary_loss_mlp": 0.01033933, "balance_loss_clip": 1.23590302, "balance_loss_mlp": 1.01474071, "epoch": 0.986201713512701, "flos": 27794176433280.0, "grad_norm": 1.775030176844202, "language_loss": 0.7114929, "learning_rate": 1.9882384698881596e-09, "loss": 0.73578906, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1920166, "step": 16403, "time_per_iteration": 2.9123497009277344 }, { "auxiliary_loss_clip": 0.01398904, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.24031365, "balance_loss_mlp": 1.01048601, "epoch": 0.986261836765369, "flos": 28742039890560.0, "grad_norm": 1.7582831087140398, "language_loss": 0.75262415, "learning_rate": 1.9709147019204566e-09, "loss": 0.77689898, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.18103027, "step": 16404, "time_per_iteration": 2.896562337875366 }, { "auxiliary_loss_clip": 0.01404024, "auxiliary_loss_mlp": 0.01035969, "balance_loss_clip": 1.24263072, "balance_loss_mlp": 1.01740813, "epoch": 0.986321960018037, "flos": 34326749865600.0, "grad_norm": 1.8570842492651563, "language_loss": 0.70821142, "learning_rate": 1.953666699415768e-09, "loss": 0.73261136, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.1854248, "step": 16405, "time_per_iteration": 2.9488749504089355 }, { "auxiliary_loss_clip": 0.01393794, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.2380873, "balance_loss_mlp": 1.01303256, "epoch": 0.986382083270705, "flos": 25200075744000.0, "grad_norm": 1.7805660764713538, "language_loss": 0.70745975, "learning_rate": 1.93649446302846e-09, "loss": 0.73170865, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18066406, "step": 16406, "time_per_iteration": 2.8927483558654785 }, { "auxiliary_loss_clip": 0.013874, "auxiliary_loss_mlp": 0.01032396, "balance_loss_clip": 1.23113787, "balance_loss_mlp": 1.0138948, "epoch": 0.9864422065233729, "flos": 11030635589760.0, "grad_norm": 11.122142521354993, "language_loss": 0.74835587, "learning_rate": 1.9193979934095663e-09, "loss": 0.7725538, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18505859, "step": 16407, "time_per_iteration": 2.7870326042175293 }, { "auxiliary_loss_clip": 0.01387695, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.23030782, "balance_loss_mlp": 1.01171517, "epoch": 0.9865023297760409, "flos": 16554436784640.0, "grad_norm": 5.132109332118595, "language_loss": 0.77706742, "learning_rate": 1.9023772912072357e-09, "loss": 0.80124879, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18725586, "step": 16408, "time_per_iteration": 2.8352224826812744 }, { "auxiliary_loss_clip": 0.01416348, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.25211167, "balance_loss_mlp": 1.01324081, "epoch": 0.9865624530287088, "flos": 18889501138560.0, "grad_norm": 1.7703462783372068, "language_loss": 0.69054627, "learning_rate": 1.8854323570669515e-09, "loss": 0.71503001, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18811035, "step": 16409, "time_per_iteration": 2.8402485847473145 }, { "auxiliary_loss_clip": 0.01180813, "auxiliary_loss_mlp": 0.01028396, "balance_loss_clip": 1.09420919, "balance_loss_mlp": 1.00941837, "epoch": 0.9866225762813768, "flos": 68915838936960.0, "grad_norm": 0.8048945630231783, "language_loss": 0.61097121, "learning_rate": 1.8685631916313118e-09, "loss": 0.63306332, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.18945312, "step": 16410, "time_per_iteration": 3.405085325241089 }, { "auxiliary_loss_clip": 0.01397049, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.23698974, "balance_loss_mlp": 1.01289773, "epoch": 0.9866826995340447, "flos": 29035037088000.0, "grad_norm": 2.0900844003183536, "language_loss": 0.668715, "learning_rate": 1.8517697955400258e-09, "loss": 0.693012, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.19750977, "step": 16411, "time_per_iteration": 3.032343626022339 }, { "auxiliary_loss_clip": 0.01181314, "auxiliary_loss_mlp": 0.01038858, "balance_loss_clip": 1.09372211, "balance_loss_mlp": 1.01549268, "epoch": 0.9867428227867128, "flos": 65411246995200.0, "grad_norm": 0.7232335389489192, "language_loss": 0.56308901, "learning_rate": 1.8350521694299182e-09, "loss": 0.58529079, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.23339844, "step": 16412, "time_per_iteration": 3.370803117752075 }, { "auxiliary_loss_clip": 0.01418032, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.2531724, "balance_loss_mlp": 1.01243305, "epoch": 0.9868029460393807, "flos": 26517771348480.0, "grad_norm": 1.9019440255109836, "language_loss": 0.73458463, "learning_rate": 1.818410313934926e-09, "loss": 0.75908792, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.1986084, "step": 16413, "time_per_iteration": 2.9063198566436768 }, { "auxiliary_loss_clip": 0.0139342, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.23302853, "balance_loss_mlp": 1.01159334, "epoch": 0.9868630692920487, "flos": 22977843217920.0, "grad_norm": 1.7042336294599283, "language_loss": 0.72071773, "learning_rate": 1.8018442296858782e-09, "loss": 0.74495083, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.1829834, "step": 16414, "time_per_iteration": 2.892103433609009 }, { "auxiliary_loss_clip": 0.01384885, "auxiliary_loss_mlp": 0.0103234, "balance_loss_clip": 1.23120296, "balance_loss_mlp": 1.01346922, "epoch": 0.9869231925447167, "flos": 19838224247040.0, "grad_norm": 1.8028580724795036, "language_loss": 0.71473396, "learning_rate": 1.7853539173111608e-09, "loss": 0.73890626, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 1.53710938, "router_z_loss_mlp": 0.1887207, "step": 16415, "time_per_iteration": 2.826204776763916 }, { "auxiliary_loss_clip": 0.01374679, "auxiliary_loss_mlp": 0.0102856, "balance_loss_clip": 1.22148752, "balance_loss_mlp": 1.01095319, "epoch": 0.9869833157973846, "flos": 20205296461440.0, "grad_norm": 1.6140461473247256, "language_loss": 0.76107764, "learning_rate": 1.7689393774362737e-09, "loss": 0.78511006, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.17614746, "step": 16416, "time_per_iteration": 2.931623935699463 }, { "auxiliary_loss_clip": 0.01392706, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.23634624, "balance_loss_mlp": 1.00947666, "epoch": 0.9870434390500527, "flos": 16106095630080.0, "grad_norm": 1.8242613570742483, "language_loss": 0.71594393, "learning_rate": 1.7526006106833858e-09, "loss": 0.74015701, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.19116211, "step": 16417, "time_per_iteration": 2.8050966262817383 }, { "auxiliary_loss_clip": 0.01419023, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.25330114, "balance_loss_mlp": 1.01158464, "epoch": 0.9871035623027206, "flos": 21770355242880.0, "grad_norm": 2.162598400949832, "language_loss": 0.71517164, "learning_rate": 1.7363376176720013e-09, "loss": 0.73967385, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 1.65722656, "router_z_loss_mlp": 0.19604492, "step": 16418, "time_per_iteration": 2.880129098892212 }, { "auxiliary_loss_clip": 0.01181082, "auxiliary_loss_mlp": 0.01038478, "balance_loss_clip": 1.09310055, "balance_loss_mlp": 1.01482701, "epoch": 0.9871636855553886, "flos": 70252718336640.0, "grad_norm": 0.6621746989465043, "language_loss": 0.53711224, "learning_rate": 1.7201503990189603e-09, "loss": 0.55930781, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 0.23632812, "step": 16419, "time_per_iteration": 5.01503586769104 }, { "auxiliary_loss_clip": 0.01399927, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.23767471, "balance_loss_mlp": 1.01758528, "epoch": 0.9872238088080565, "flos": 25056631169280.0, "grad_norm": 1.7688479174860716, "language_loss": 0.78800833, "learning_rate": 1.7040389553382162e-09, "loss": 0.81238562, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20227051, "step": 16420, "time_per_iteration": 2.8838114738464355 }, { "auxiliary_loss_clip": 0.01391352, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.23584116, "balance_loss_mlp": 1.0140202, "epoch": 0.9872839320607245, "flos": 19475812247040.0, "grad_norm": 1.8500805285590918, "language_loss": 0.71105075, "learning_rate": 1.6880032872403916e-09, "loss": 0.73529595, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.19152832, "step": 16421, "time_per_iteration": 4.341374635696411 }, { "auxiliary_loss_clip": 0.01411209, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.24770212, "balance_loss_mlp": 1.01401067, "epoch": 0.9873440553133924, "flos": 26954167875840.0, "grad_norm": 1.876757470214039, "language_loss": 0.82905984, "learning_rate": 1.6720433953338886e-09, "loss": 0.85350633, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19433594, "step": 16422, "time_per_iteration": 2.945460557937622 }, { "auxiliary_loss_clip": 0.0139971, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.24330842, "balance_loss_mlp": 1.01519239, "epoch": 0.9874041785660604, "flos": 19071476300160.0, "grad_norm": 1.767852906874787, "language_loss": 0.87122798, "learning_rate": 1.656159280223779e-09, "loss": 0.89556456, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.1875, "step": 16423, "time_per_iteration": 2.849529504776001 }, { "auxiliary_loss_clip": 0.01401092, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.24025822, "balance_loss_mlp": 1.01348543, "epoch": 0.9874643018187284, "flos": 21115715207040.0, "grad_norm": 3.1053685746657527, "language_loss": 0.71658301, "learning_rate": 1.6403509425122475e-09, "loss": 0.740933, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.20397949, "step": 16424, "time_per_iteration": 2.8494362831115723 }, { "auxiliary_loss_clip": 0.01402442, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.24115956, "balance_loss_mlp": 1.01337731, "epoch": 0.9875244250713964, "flos": 24436811646720.0, "grad_norm": 2.343569889449802, "language_loss": 0.81452322, "learning_rate": 1.6246183827990366e-09, "loss": 0.83887249, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.19116211, "step": 16425, "time_per_iteration": 2.894390821456909 }, { "auxiliary_loss_clip": 0.01392291, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.23306966, "balance_loss_mlp": 1.01086855, "epoch": 0.9875845483240643, "flos": 25128127232640.0, "grad_norm": 2.0917833937464385, "language_loss": 0.80520761, "learning_rate": 1.6089616016803364e-09, "loss": 0.82943773, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19848633, "step": 16426, "time_per_iteration": 2.855056047439575 }, { "auxiliary_loss_clip": 0.01389911, "auxiliary_loss_mlp": 0.01035739, "balance_loss_clip": 1.23250341, "balance_loss_mlp": 1.01617718, "epoch": 0.9876446715767323, "flos": 16590569397120.0, "grad_norm": 1.8828727462905686, "language_loss": 0.85659063, "learning_rate": 1.593380599750338e-09, "loss": 0.88084716, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19555664, "step": 16427, "time_per_iteration": 2.8632853031158447 }, { "auxiliary_loss_clip": 0.01407055, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.24988639, "balance_loss_mlp": 1.01455998, "epoch": 0.9877047948294003, "flos": 21626051016960.0, "grad_norm": 1.6868841474262293, "language_loss": 0.71219558, "learning_rate": 1.577875377599458e-09, "loss": 0.73659629, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.18444824, "step": 16428, "time_per_iteration": 2.8641908168792725 }, { "auxiliary_loss_clip": 0.01390004, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.23307657, "balance_loss_mlp": 1.01136732, "epoch": 0.9877649180820682, "flos": 21188342390400.0, "grad_norm": 2.869681706012376, "language_loss": 0.81181949, "learning_rate": 1.5624459358158926e-09, "loss": 0.83601713, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.18395996, "step": 16429, "time_per_iteration": 2.876523971557617 }, { "auxiliary_loss_clip": 0.01403493, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.24455023, "balance_loss_mlp": 1.01095068, "epoch": 0.9878250413347363, "flos": 39763762254720.0, "grad_norm": 1.5745816376620567, "language_loss": 0.63061202, "learning_rate": 1.5470922749845073e-09, "loss": 0.65493333, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.17700195, "step": 16430, "time_per_iteration": 4.526405334472656 }, { "auxiliary_loss_clip": 0.01392664, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.2333566, "balance_loss_mlp": 1.01385903, "epoch": 0.9878851645874042, "flos": 29437970446080.0, "grad_norm": 1.2603608363570549, "language_loss": 0.73316455, "learning_rate": 1.531814395687725e-09, "loss": 0.75742114, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19128418, "step": 16431, "time_per_iteration": 4.360586166381836 }, { "auxiliary_loss_clip": 0.01398731, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.23906994, "balance_loss_mlp": 1.01305485, "epoch": 0.9879452878400722, "flos": 15812419760640.0, "grad_norm": 2.19100921825087, "language_loss": 0.81681073, "learning_rate": 1.5166122985048602e-09, "loss": 0.84111464, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18615723, "step": 16432, "time_per_iteration": 2.826491117477417 }, { "auxiliary_loss_clip": 0.01385667, "auxiliary_loss_mlp": 0.01030803, "balance_loss_clip": 1.22922778, "balance_loss_mlp": 1.01313591, "epoch": 0.9880054110927401, "flos": 22243336830720.0, "grad_norm": 1.4329380653035315, "language_loss": 0.80867302, "learning_rate": 1.5014859840123405e-09, "loss": 0.8328377, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 1.56738281, "router_z_loss_mlp": 0.17651367, "step": 16433, "time_per_iteration": 2.893446445465088 }, { "auxiliary_loss_clip": 0.01391285, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.23467445, "balance_loss_mlp": 1.01260972, "epoch": 0.9880655343454081, "flos": 28774688653440.0, "grad_norm": 2.263975873388379, "language_loss": 0.65694475, "learning_rate": 1.4864354527837075e-09, "loss": 0.68117332, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18969727, "step": 16434, "time_per_iteration": 2.8816497325897217 }, { "auxiliary_loss_clip": 0.01404736, "auxiliary_loss_mlp": 0.01030894, "balance_loss_clip": 1.24238777, "balance_loss_mlp": 1.01253581, "epoch": 0.988125657598076, "flos": 32866107379200.0, "grad_norm": 1.7228588235009938, "language_loss": 0.69990021, "learning_rate": 1.4714607053896154e-09, "loss": 0.72425652, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18347168, "step": 16435, "time_per_iteration": 2.9447977542877197 }, { "auxiliary_loss_clip": 0.01393156, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.23494029, "balance_loss_mlp": 1.01324892, "epoch": 0.988185780850744, "flos": 19400063172480.0, "grad_norm": 1.7985508229044551, "language_loss": 0.76087976, "learning_rate": 1.4565617423980548e-09, "loss": 0.78513348, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18969727, "step": 16436, "time_per_iteration": 2.864062786102295 }, { "auxiliary_loss_clip": 0.01402619, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.24296403, "balance_loss_mlp": 1.01139593, "epoch": 0.988245904103412, "flos": 22538596268160.0, "grad_norm": 2.108242118773648, "language_loss": 0.75229537, "learning_rate": 1.4417385643741286e-09, "loss": 0.77663231, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19677734, "step": 16437, "time_per_iteration": 2.8416621685028076 }, { "auxiliary_loss_clip": 0.01374109, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.22071767, "balance_loss_mlp": 1.01597691, "epoch": 0.98830602735608, "flos": 28671267744000.0, "grad_norm": 1.6034908463391344, "language_loss": 0.60430837, "learning_rate": 1.4269911718796103e-09, "loss": 0.62839109, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 1.53222656, "router_z_loss_mlp": 0.18188477, "step": 16438, "time_per_iteration": 2.9804015159606934 }, { "auxiliary_loss_clip": 0.01392746, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.23495388, "balance_loss_mlp": 1.01264405, "epoch": 0.9883661506087479, "flos": 21005824291200.0, "grad_norm": 3.280548525338781, "language_loss": 0.7300126, "learning_rate": 1.4123195654738295e-09, "loss": 0.75425768, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19116211, "step": 16439, "time_per_iteration": 2.8490633964538574 }, { "auxiliary_loss_clip": 0.01383008, "auxiliary_loss_mlp": 0.01034723, "balance_loss_clip": 1.2276144, "balance_loss_mlp": 1.01595998, "epoch": 0.9884262738614159, "flos": 32718455038080.0, "grad_norm": 1.622422454468921, "language_loss": 0.60882771, "learning_rate": 1.3977237457134528e-09, "loss": 0.63300502, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 1.5546875, "router_z_loss_mlp": 0.18774414, "step": 16440, "time_per_iteration": 2.960610866546631 }, { "auxiliary_loss_clip": 0.01402042, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.24079323, "balance_loss_mlp": 1.01615226, "epoch": 0.9884863971140839, "flos": 17573162878080.0, "grad_norm": 2.5844537566043315, "language_loss": 0.7728591, "learning_rate": 1.3832037131513707e-09, "loss": 0.7972303, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18908691, "step": 16441, "time_per_iteration": 2.898624897003174 }, { "auxiliary_loss_clip": 0.01404912, "auxiliary_loss_mlp": 0.01035993, "balance_loss_clip": 1.24498153, "balance_loss_mlp": 1.0170629, "epoch": 0.9885465203667518, "flos": 40567411975680.0, "grad_norm": 2.1140195426228128, "language_loss": 0.68566346, "learning_rate": 1.3687594683386982e-09, "loss": 0.71007252, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.18933105, "step": 16442, "time_per_iteration": 3.069356918334961 }, { "auxiliary_loss_clip": 0.01382653, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.22685969, "balance_loss_mlp": 1.01140141, "epoch": 0.9886066436194199, "flos": 13815805645440.0, "grad_norm": 2.6738773239888247, "language_loss": 0.75557512, "learning_rate": 1.3543910118227753e-09, "loss": 0.7797004, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.18457031, "step": 16443, "time_per_iteration": 2.842747449874878 }, { "auxiliary_loss_clip": 0.01401267, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.24202538, "balance_loss_mlp": 1.01444137, "epoch": 0.9886667668720878, "flos": 23333966190720.0, "grad_norm": 1.6541528053125119, "language_loss": 0.74732929, "learning_rate": 1.3400983441487213e-09, "loss": 0.77167165, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.18530273, "step": 16444, "time_per_iteration": 2.865488290786743 }, { "auxiliary_loss_clip": 0.01397773, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.24002981, "balance_loss_mlp": 1.01327372, "epoch": 0.9887268901247558, "flos": 22715006319360.0, "grad_norm": 1.8009080877633203, "language_loss": 0.69415748, "learning_rate": 1.325881465858547e-09, "loss": 0.71845818, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19018555, "step": 16445, "time_per_iteration": 2.938490390777588 }, { "auxiliary_loss_clip": 0.01387474, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.2296648, "balance_loss_mlp": 1.01328564, "epoch": 0.9887870133774237, "flos": 13048786229760.0, "grad_norm": 3.745076057305797, "language_loss": 0.61128354, "learning_rate": 1.311740377491155e-09, "loss": 0.63547963, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18859863, "step": 16446, "time_per_iteration": 2.858058214187622 }, { "auxiliary_loss_clip": 0.01382388, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.22605908, "balance_loss_mlp": 1.01516891, "epoch": 0.9888471366300917, "flos": 15167281132800.0, "grad_norm": 1.9250539361817742, "language_loss": 0.71721661, "learning_rate": 1.297675079582783e-09, "loss": 0.74137282, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 1.56445312, "router_z_loss_mlp": 0.18066406, "step": 16447, "time_per_iteration": 2.8022687435150146 }, { "auxiliary_loss_clip": 0.01387822, "auxiliary_loss_mlp": 0.0103107, "balance_loss_clip": 1.23129034, "balance_loss_mlp": 1.01378465, "epoch": 0.9889072598827596, "flos": 25129620311040.0, "grad_norm": 1.9978546576799325, "language_loss": 0.84051061, "learning_rate": 1.2836855726667818e-09, "loss": 0.86469948, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.17297363, "step": 16448, "time_per_iteration": 2.8684353828430176 }, { "auxiliary_loss_clip": 0.01383265, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.22688961, "balance_loss_mlp": 1.01147032, "epoch": 0.9889673831354276, "flos": 16737724045440.0, "grad_norm": 1.6317993720618438, "language_loss": 0.71057594, "learning_rate": 1.26977185727406e-09, "loss": 0.73469746, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.17407227, "step": 16449, "time_per_iteration": 2.8257665634155273 }, { "auxiliary_loss_clip": 0.01414778, "auxiliary_loss_mlp": 0.01029614, "balance_loss_clip": 1.25260293, "balance_loss_mlp": 1.01092148, "epoch": 0.9890275063880956, "flos": 35597408860800.0, "grad_norm": 2.0699314500929042, "language_loss": 0.74887818, "learning_rate": 1.25593393393153e-09, "loss": 0.77332217, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.18713379, "step": 16450, "time_per_iteration": 3.007493734359741 }, { "auxiliary_loss_clip": 0.01400375, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.23737669, "balance_loss_mlp": 1.0135355, "epoch": 0.9890876296407636, "flos": 18961494894720.0, "grad_norm": 2.404630838383014, "language_loss": 0.79718059, "learning_rate": 1.242171803164549e-09, "loss": 0.82149994, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 1.62890625, "router_z_loss_mlp": 0.18017578, "step": 16451, "time_per_iteration": 2.872495651245117 }, { "auxiliary_loss_clip": 0.01409139, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.2453289, "balance_loss_mlp": 1.01551056, "epoch": 0.9891477528934315, "flos": 23779909370880.0, "grad_norm": 1.9179648386048196, "language_loss": 0.7119683, "learning_rate": 1.2284854654946996e-09, "loss": 0.73640549, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 1.63769531, "router_z_loss_mlp": 0.19067383, "step": 16452, "time_per_iteration": 2.8751890659332275 }, { "auxiliary_loss_clip": 0.0138509, "auxiliary_loss_mlp": 0.01027635, "balance_loss_clip": 1.23113811, "balance_loss_mlp": 1.00986123, "epoch": 0.9892078761460995, "flos": 20781563224320.0, "grad_norm": 1.698320310580232, "language_loss": 0.74397731, "learning_rate": 1.2148749214409004e-09, "loss": 0.76810461, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 1.5390625, "router_z_loss_mlp": 0.17773438, "step": 16453, "time_per_iteration": 2.85638165473938 }, { "auxiliary_loss_clip": 0.01402047, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.23991764, "balance_loss_mlp": 1.01898539, "epoch": 0.9892679993987675, "flos": 23378288112000.0, "grad_norm": 2.760773229420982, "language_loss": 0.71657538, "learning_rate": 1.2013401715191828e-09, "loss": 0.74096918, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18334961, "step": 16454, "time_per_iteration": 4.341542959213257 }, { "auxiliary_loss_clip": 0.01379647, "auxiliary_loss_mlp": 0.01030187, "balance_loss_clip": 1.22476852, "balance_loss_mlp": 1.01119685, "epoch": 0.9893281226514354, "flos": 22714237157760.0, "grad_norm": 2.003869021726763, "language_loss": 0.75248092, "learning_rate": 1.1878812162433583e-09, "loss": 0.77657926, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 1.54785156, "router_z_loss_mlp": 0.18981934, "step": 16455, "time_per_iteration": 2.886852502822876 }, { "auxiliary_loss_clip": 0.01378936, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.2237978, "balance_loss_mlp": 1.0135982, "epoch": 0.9893882459041035, "flos": 21806125896960.0, "grad_norm": 1.7557437973468615, "language_loss": 0.65884215, "learning_rate": 1.1744980561230188e-09, "loss": 0.68296385, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 1.55078125, "router_z_loss_mlp": 0.19641113, "step": 16456, "time_per_iteration": 4.329211950302124 }, { "auxiliary_loss_clip": 0.01416976, "auxiliary_loss_mlp": 0.01035619, "balance_loss_clip": 1.25480092, "balance_loss_mlp": 1.01717687, "epoch": 0.9894483691567714, "flos": 18122843681280.0, "grad_norm": 2.036478350333511, "language_loss": 0.74480867, "learning_rate": 1.161190691666203e-09, "loss": 0.76933467, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.18444824, "step": 16457, "time_per_iteration": 2.8383498191833496 }, { "auxiliary_loss_clip": 0.01401614, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.24011433, "balance_loss_mlp": 1.01540744, "epoch": 0.9895084924094394, "flos": 31223942179200.0, "grad_norm": 2.093606851004012, "language_loss": 0.69823956, "learning_rate": 1.1479591233773954e-09, "loss": 0.72259808, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18847656, "step": 16458, "time_per_iteration": 2.9327542781829834 }, { "auxiliary_loss_clip": 0.01379539, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.2230705, "balance_loss_mlp": 1.01210451, "epoch": 0.9895686156621073, "flos": 19686318894720.0, "grad_norm": 1.7901048071240322, "language_loss": 0.8027972, "learning_rate": 1.1348033517581956e-09, "loss": 0.82689708, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 1.56640625, "router_z_loss_mlp": 0.18334961, "step": 16459, "time_per_iteration": 2.8448216915130615 }, { "auxiliary_loss_clip": 0.0140258, "auxiliary_loss_mlp": 0.01031269, "balance_loss_clip": 1.2410562, "balance_loss_mlp": 1.01250577, "epoch": 0.9896287389147753, "flos": 23590876020480.0, "grad_norm": 2.1433783289280077, "language_loss": 0.71858865, "learning_rate": 1.1217233773075373e-09, "loss": 0.74292719, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.18774414, "step": 16460, "time_per_iteration": 2.865595579147339 }, { "auxiliary_loss_clip": 0.01397539, "auxiliary_loss_mlp": 0.01035608, "balance_loss_clip": 1.23684239, "balance_loss_mlp": 1.01660657, "epoch": 0.9896888621674432, "flos": 29617004695680.0, "grad_norm": 1.586384198624689, "language_loss": 0.87840587, "learning_rate": 1.1087192005214685e-09, "loss": 0.90273732, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 1.60644531, "router_z_loss_mlp": 0.18994141, "step": 16461, "time_per_iteration": 2.9141829013824463 }, { "auxiliary_loss_clip": 0.01404078, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.24478531, "balance_loss_mlp": 1.01573527, "epoch": 0.9897489854201112, "flos": 23705291416320.0, "grad_norm": 1.5701458460334479, "language_loss": 0.63970709, "learning_rate": 1.09579082189315e-09, "loss": 0.66409743, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19226074, "step": 16462, "time_per_iteration": 2.8678526878356934 }, { "auxiliary_loss_clip": 0.01399647, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.24181867, "balance_loss_mlp": 1.01192212, "epoch": 0.9898091086727792, "flos": 13233068876160.0, "grad_norm": 1.7279517559795192, "language_loss": 0.73509479, "learning_rate": 1.0829382419126343e-09, "loss": 0.75940597, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19543457, "step": 16463, "time_per_iteration": 2.8295462131500244 }, { "auxiliary_loss_clip": 0.01407236, "auxiliary_loss_mlp": 0.01031939, "balance_loss_clip": 1.24681115, "balance_loss_mlp": 1.01246023, "epoch": 0.9898692319254472, "flos": 22940850954240.0, "grad_norm": 1.653460971191012, "language_loss": 0.70744181, "learning_rate": 1.0701614610675314e-09, "loss": 0.73183358, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.19470215, "step": 16464, "time_per_iteration": 2.850431203842163 }, { "auxiliary_loss_clip": 0.0140841, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.24684548, "balance_loss_mlp": 1.01479042, "epoch": 0.9899293551781151, "flos": 12465913726080.0, "grad_norm": 1.9898458384307425, "language_loss": 0.73888767, "learning_rate": 1.0574604798421204e-09, "loss": 0.7633121, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19226074, "step": 16465, "time_per_iteration": 4.209473133087158 }, { "auxiliary_loss_clip": 0.01384509, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.22909737, "balance_loss_mlp": 1.01494896, "epoch": 0.9899894784307831, "flos": 26882671812480.0, "grad_norm": 1.607359367118123, "language_loss": 0.87545979, "learning_rate": 1.0448352987182386e-09, "loss": 0.89962518, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 1.55371094, "router_z_loss_mlp": 0.17102051, "step": 16466, "time_per_iteration": 4.298904180526733 }, { "auxiliary_loss_clip": 0.01398971, "auxiliary_loss_mlp": 0.01029934, "balance_loss_clip": 1.23993695, "balance_loss_mlp": 1.01089668, "epoch": 0.990049601683451, "flos": 21551704531200.0, "grad_norm": 1.6326239016675728, "language_loss": 0.7244764, "learning_rate": 1.0322859181743915e-09, "loss": 0.74876547, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19042969, "step": 16467, "time_per_iteration": 2.8738081455230713 }, { "auxiliary_loss_clip": 0.0138917, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.23118985, "balance_loss_mlp": 1.01689136, "epoch": 0.990109724936119, "flos": 28784732999040.0, "grad_norm": 1.3341094880140978, "language_loss": 0.65413356, "learning_rate": 1.019812338686643e-09, "loss": 0.67837489, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.18054199, "step": 16468, "time_per_iteration": 2.9437413215637207 }, { "auxiliary_loss_clip": 0.01403145, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.23946595, "balance_loss_mlp": 1.01256657, "epoch": 0.9901698481887871, "flos": 29285160197760.0, "grad_norm": 1.8267849045015225, "language_loss": 0.62419486, "learning_rate": 1.0074145607281704e-09, "loss": 0.64854133, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.18908691, "step": 16469, "time_per_iteration": 2.929666519165039 }, { "auxiliary_loss_clip": 0.01396111, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.23722196, "balance_loss_mlp": 1.01364589, "epoch": 0.990229971441455, "flos": 15966994556160.0, "grad_norm": 3.323835233958882, "language_loss": 0.730088, "learning_rate": 9.950925847685976e-10, "loss": 0.7543726, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18701172, "step": 16470, "time_per_iteration": 2.829166889190674 }, { "auxiliary_loss_clip": 0.01178066, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.08966398, "balance_loss_mlp": 1.01690471, "epoch": 0.990290094694123, "flos": 69812928449280.0, "grad_norm": 0.6650812193149377, "language_loss": 0.55529261, "learning_rate": 9.828464112755509e-10, "loss": 0.57744348, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.20117188, "step": 16471, "time_per_iteration": 3.547701835632324 }, { "auxiliary_loss_clip": 0.01388742, "auxiliary_loss_mlp": 0.01034296, "balance_loss_clip": 1.23070765, "balance_loss_mlp": 1.01453066, "epoch": 0.9903502179467909, "flos": 16260896649600.0, "grad_norm": 2.139720554473198, "language_loss": 0.85772669, "learning_rate": 9.706760407131032e-10, "loss": 0.88195705, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19750977, "step": 16472, "time_per_iteration": 2.8637189865112305 }, { "auxiliary_loss_clip": 0.01408928, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.24744034, "balance_loss_mlp": 1.0113349, "epoch": 0.9904103411994589, "flos": 21698135262720.0, "grad_norm": 2.0223331569882794, "language_loss": 0.86558306, "learning_rate": 9.585814735431075e-10, "loss": 0.88997114, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1854248, "step": 16473, "time_per_iteration": 2.8790690898895264 }, { "auxiliary_loss_clip": 0.01404886, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.24535537, "balance_loss_mlp": 1.0151391, "epoch": 0.9904704644521268, "flos": 25750525708800.0, "grad_norm": 1.6207762338520475, "language_loss": 0.85330868, "learning_rate": 9.465627102240859e-10, "loss": 0.87768823, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 1.59570312, "router_z_loss_mlp": 0.17956543, "step": 16474, "time_per_iteration": 2.8954622745513916 }, { "auxiliary_loss_clip": 0.01386321, "auxiliary_loss_mlp": 0.01036473, "balance_loss_clip": 1.22899103, "balance_loss_mlp": 1.0178169, "epoch": 0.9905305877047949, "flos": 21918414787200.0, "grad_norm": 1.645497899949222, "language_loss": 0.77228153, "learning_rate": 9.346197512116738e-10, "loss": 0.79650944, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.18640137, "step": 16475, "time_per_iteration": 2.8597652912139893 }, { "auxiliary_loss_clip": 0.0139722, "auxiliary_loss_mlp": 0.01031382, "balance_loss_clip": 1.23778069, "balance_loss_mlp": 1.0123204, "epoch": 0.9905907109574628, "flos": 21401020788480.0, "grad_norm": 1.6839590429351796, "language_loss": 0.75867951, "learning_rate": 9.227525969588423e-10, "loss": 0.78296554, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19042969, "step": 16476, "time_per_iteration": 2.898207664489746 }, { "auxiliary_loss_clip": 0.01423458, "auxiliary_loss_mlp": 0.01034328, "balance_loss_clip": 1.25669169, "balance_loss_mlp": 1.01414585, "epoch": 0.9906508342101308, "flos": 20531259135360.0, "grad_norm": 2.8645962914513667, "language_loss": 0.68827063, "learning_rate": 9.109612479154538e-10, "loss": 0.71284848, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 1.66796875, "router_z_loss_mlp": 0.20178223, "step": 16477, "time_per_iteration": 2.8452680110931396 }, { "auxiliary_loss_clip": 0.01411565, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.24863696, "balance_loss_mlp": 1.01286626, "epoch": 0.9907109574627987, "flos": 21371177203200.0, "grad_norm": 1.903865867995358, "language_loss": 0.73229289, "learning_rate": 8.992457045289282e-10, "loss": 0.75673401, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 1.62988281, "router_z_loss_mlp": 0.19677734, "step": 16478, "time_per_iteration": 2.857356071472168 }, { "auxiliary_loss_clip": 0.01409177, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.24802005, "balance_loss_mlp": 1.01833844, "epoch": 0.9907710807154667, "flos": 17345146492800.0, "grad_norm": 2.577267954066346, "language_loss": 0.8196044, "learning_rate": 8.876059672433545e-10, "loss": 0.8440789, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.19921875, "step": 16479, "time_per_iteration": 2.820232391357422 }, { "auxiliary_loss_clip": 0.01407593, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.2464112, "balance_loss_mlp": 1.01306164, "epoch": 0.9908312039681346, "flos": 28633732542720.0, "grad_norm": 1.5398043999805664, "language_loss": 0.67177993, "learning_rate": 8.760420364999355e-10, "loss": 0.69616997, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 1.61230469, "router_z_loss_mlp": 0.18334961, "step": 16480, "time_per_iteration": 2.8810153007507324 }, { "auxiliary_loss_clip": 0.01385061, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.22792697, "balance_loss_mlp": 1.01250887, "epoch": 0.9908913272208026, "flos": 35783727523200.0, "grad_norm": 1.7812081684523362, "language_loss": 0.72949809, "learning_rate": 8.645539127374313e-10, "loss": 0.75366473, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.19091797, "step": 16481, "time_per_iteration": 3.0511796474456787 }, { "auxiliary_loss_clip": 0.01394311, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.23707116, "balance_loss_mlp": 1.01146901, "epoch": 0.9909514504734707, "flos": 19911892060800.0, "grad_norm": 1.8811230247096633, "language_loss": 0.78440297, "learning_rate": 8.531415963912713e-10, "loss": 0.80864352, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.1829834, "step": 16482, "time_per_iteration": 2.841183662414551 }, { "auxiliary_loss_clip": 0.01399591, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.23884821, "balance_loss_mlp": 1.01170802, "epoch": 0.9910115737261386, "flos": 20012734016640.0, "grad_norm": 1.7590114459948827, "language_loss": 0.76143408, "learning_rate": 8.418050878944427e-10, "loss": 0.78573644, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18933105, "step": 16483, "time_per_iteration": 2.840317487716675 }, { "auxiliary_loss_clip": 0.01176153, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.08978796, "balance_loss_mlp": 1.00631285, "epoch": 0.9910716969788066, "flos": 70720270548480.0, "grad_norm": 0.679474959086763, "language_loss": 0.53681815, "learning_rate": 8.305443876768237e-10, "loss": 0.55881453, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 0.171875, "step": 16484, "time_per_iteration": 3.488264322280884 }, { "auxiliary_loss_clip": 0.01385447, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.22988439, "balance_loss_mlp": 1.01236165, "epoch": 0.9911318202314745, "flos": 21443985365760.0, "grad_norm": 2.3040782270122313, "language_loss": 0.82507575, "learning_rate": 8.19359496165184e-10, "loss": 0.84923989, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 1.55566406, "router_z_loss_mlp": 0.18615723, "step": 16485, "time_per_iteration": 2.834007740020752 }, { "auxiliary_loss_clip": 0.01388744, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.23362505, "balance_loss_mlp": 1.0161432, "epoch": 0.9911919434841425, "flos": 19835871517440.0, "grad_norm": 1.6110321341730882, "language_loss": 0.82295537, "learning_rate": 8.082504137836288e-10, "loss": 0.84719861, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.19433594, "step": 16486, "time_per_iteration": 2.845529556274414 }, { "auxiliary_loss_clip": 0.01411214, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.2489965, "balance_loss_mlp": 1.01521552, "epoch": 0.9912520667368104, "flos": 41734378592640.0, "grad_norm": 1.3190888344923042, "language_loss": 0.66571963, "learning_rate": 7.972171409538209e-10, "loss": 0.69017041, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18652344, "step": 16487, "time_per_iteration": 3.059370279312134 }, { "auxiliary_loss_clip": 0.01383994, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.22863436, "balance_loss_mlp": 1.01289916, "epoch": 0.9913121899894785, "flos": 23780361818880.0, "grad_norm": 1.6049698259702623, "language_loss": 0.77607274, "learning_rate": 7.862596780936481e-10, "loss": 0.80022514, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 1.55273438, "router_z_loss_mlp": 0.18359375, "step": 16488, "time_per_iteration": 2.963336706161499 }, { "auxiliary_loss_clip": 0.0142136, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.25554657, "balance_loss_mlp": 1.01310027, "epoch": 0.9913723132421464, "flos": 23780361818880.0, "grad_norm": 2.13476345238607, "language_loss": 0.69987619, "learning_rate": 7.753780256190001e-10, "loss": 0.72441131, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 1.65917969, "router_z_loss_mlp": 0.19042969, "step": 16489, "time_per_iteration": 4.311330795288086 }, { "auxiliary_loss_clip": 0.01176003, "auxiliary_loss_mlp": 0.01017812, "balance_loss_clip": 1.08860373, "balance_loss_mlp": 0.99902481, "epoch": 0.9914324364948144, "flos": 71298166124160.0, "grad_norm": 0.6069470711894912, "language_loss": 0.5259797, "learning_rate": 7.645721839424357e-10, "loss": 0.5479179, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.1875, "step": 16490, "time_per_iteration": 3.4537580013275146 }, { "auxiliary_loss_clip": 0.01419717, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.25509071, "balance_loss_mlp": 1.0141871, "epoch": 0.9914925597474823, "flos": 23705789109120.0, "grad_norm": 1.8880501379519958, "language_loss": 0.76428998, "learning_rate": 7.538421534734052e-10, "loss": 0.78882742, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.19848633, "step": 16491, "time_per_iteration": 4.393556118011475 }, { "auxiliary_loss_clip": 0.01417968, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.25507259, "balance_loss_mlp": 1.01298785, "epoch": 0.9915526830001503, "flos": 13439141533440.0, "grad_norm": 2.328640926333991, "language_loss": 0.70915425, "learning_rate": 7.431879346191383e-10, "loss": 0.73365921, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 1.63085938, "router_z_loss_mlp": 0.19543457, "step": 16492, "time_per_iteration": 2.7919111251831055 }, { "auxiliary_loss_clip": 0.01391072, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.23310721, "balance_loss_mlp": 1.01304746, "epoch": 0.9916128062528182, "flos": 20750769498240.0, "grad_norm": 2.2430412454193216, "language_loss": 0.69184566, "learning_rate": 7.326095277837563e-10, "loss": 0.71608174, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.19494629, "step": 16493, "time_per_iteration": 2.8960349559783936 }, { "auxiliary_loss_clip": 0.01418285, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.25453639, "balance_loss_mlp": 1.01520181, "epoch": 0.9916729295054862, "flos": 22495993649280.0, "grad_norm": 3.1657004917122684, "language_loss": 0.71964538, "learning_rate": 7.221069333678276e-10, "loss": 0.7441774, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.19714355, "step": 16494, "time_per_iteration": 2.8948168754577637 }, { "auxiliary_loss_clip": 0.01401996, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.24029148, "balance_loss_mlp": 1.01268959, "epoch": 0.9917330527581543, "flos": 14800389897600.0, "grad_norm": 2.062521391693983, "language_loss": 0.68706441, "learning_rate": 7.116801517701443e-10, "loss": 0.7114042, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 1.6171875, "router_z_loss_mlp": 0.19287109, "step": 16495, "time_per_iteration": 2.8999719619750977 }, { "auxiliary_loss_clip": 0.01177333, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.09052444, "balance_loss_mlp": 1.01537502, "epoch": 0.9917931760108222, "flos": 59219458548480.0, "grad_norm": 0.7214530020154722, "language_loss": 0.53546178, "learning_rate": 7.013291833859458e-10, "loss": 0.55761969, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 0.23046875, "step": 16496, "time_per_iteration": 3.497404098510742 }, { "auxiliary_loss_clip": 0.01394402, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.23400974, "balance_loss_mlp": 1.0130949, "epoch": 0.9918532992634902, "flos": 26773142855040.0, "grad_norm": 1.5957293644959956, "language_loss": 0.72271848, "learning_rate": 6.91054028607585e-10, "loss": 0.74698132, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.18786621, "step": 16497, "time_per_iteration": 2.90838885307312 }, { "auxiliary_loss_clip": 0.01419059, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.25439215, "balance_loss_mlp": 1.01505661, "epoch": 0.9919134225161581, "flos": 14983903382400.0, "grad_norm": 2.0803313713119294, "language_loss": 0.82872969, "learning_rate": 6.808546878249721e-10, "loss": 0.85326648, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19555664, "step": 16498, "time_per_iteration": 2.8506102561950684 }, { "auxiliary_loss_clip": 0.0140043, "auxiliary_loss_mlp": 0.01035918, "balance_loss_clip": 1.24064124, "balance_loss_mlp": 1.01637959, "epoch": 0.9919735457688261, "flos": 27829585128960.0, "grad_norm": 1.749854798797655, "language_loss": 0.68954647, "learning_rate": 6.707311614246869e-10, "loss": 0.71390992, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 1.60058594, "router_z_loss_mlp": 0.1953125, "step": 16499, "time_per_iteration": 4.499104976654053 }, { "auxiliary_loss_clip": 0.01420257, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.25667357, "balance_loss_mlp": 1.01546526, "epoch": 0.992033669021494, "flos": 22572421395840.0, "grad_norm": 3.000668238451244, "language_loss": 0.82858789, "learning_rate": 6.606834497904223e-10, "loss": 0.85312474, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.17956543, "step": 16500, "time_per_iteration": 4.324865341186523 }, { "auxiliary_loss_clip": 0.01394538, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.23536587, "balance_loss_mlp": 1.01587522, "epoch": 0.9920937922741621, "flos": 25385580000000.0, "grad_norm": 1.758729680230694, "language_loss": 0.82542479, "learning_rate": 6.507115533036511e-10, "loss": 0.84971952, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.1907959, "step": 16501, "time_per_iteration": 2.8914644718170166 }, { "auxiliary_loss_clip": 0.01397375, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.23737943, "balance_loss_mlp": 1.01379764, "epoch": 0.99215391552683, "flos": 22064619294720.0, "grad_norm": 2.2470686268854663, "language_loss": 0.78135002, "learning_rate": 6.408154723420711e-10, "loss": 0.80564809, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18615723, "step": 16502, "time_per_iteration": 2.8514018058776855 }, { "auxiliary_loss_clip": 0.01434712, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.27188778, "balance_loss_mlp": 1.01636696, "epoch": 0.992214038779498, "flos": 15422335925760.0, "grad_norm": 2.1818542820195654, "language_loss": 0.72347337, "learning_rate": 6.309952072811597e-10, "loss": 0.74817932, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 1.62792969, "router_z_loss_mlp": 0.19519043, "step": 16503, "time_per_iteration": 2.829491376876831 }, { "auxiliary_loss_clip": 0.01179298, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.08943892, "balance_loss_mlp": 1.00656593, "epoch": 0.9922741620321659, "flos": 62046597795840.0, "grad_norm": 0.6338681167257824, "language_loss": 0.55120569, "learning_rate": 6.212507584932858e-10, "loss": 0.57330465, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 0.24023438, "step": 16504, "time_per_iteration": 3.4515397548675537 }, { "auxiliary_loss_clip": 0.013981, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.23816514, "balance_loss_mlp": 1.01072073, "epoch": 0.9923342852848339, "flos": 17174392041600.0, "grad_norm": 1.7033882690323705, "language_loss": 0.70095128, "learning_rate": 6.115821263481536e-10, "loss": 0.72521889, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.17944336, "step": 16505, "time_per_iteration": 2.8334648609161377 }, { "auxiliary_loss_clip": 0.01402717, "auxiliary_loss_mlp": 0.01031412, "balance_loss_clip": 1.23855162, "balance_loss_mlp": 1.01088405, "epoch": 0.9923944085375018, "flos": 23193055324800.0, "grad_norm": 1.9164489564145357, "language_loss": 0.6610375, "learning_rate": 6.019893112119146e-10, "loss": 0.68537879, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 1.640625, "router_z_loss_mlp": 0.2052002, "step": 16506, "time_per_iteration": 2.8972232341766357 }, { "auxiliary_loss_clip": 0.01387218, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.22835624, "balance_loss_mlp": 1.01281071, "epoch": 0.9924545317901698, "flos": 20823668150400.0, "grad_norm": 19.864694976176192, "language_loss": 0.63956892, "learning_rate": 5.924723134487219e-10, "loss": 0.66376287, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19372559, "step": 16507, "time_per_iteration": 2.8689470291137695 }, { "auxiliary_loss_clip": 0.01400924, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.24098682, "balance_loss_mlp": 1.01428628, "epoch": 0.9925146550428379, "flos": 20092871836800.0, "grad_norm": 1.9666558464110555, "language_loss": 0.7361086, "learning_rate": 5.830311334193983e-10, "loss": 0.76046014, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 1.59863281, "router_z_loss_mlp": 0.19946289, "step": 16508, "time_per_iteration": 2.842477321624756 }, { "auxiliary_loss_clip": 0.01394687, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.23469424, "balance_loss_mlp": 1.01268697, "epoch": 0.9925747782955058, "flos": 24984727902720.0, "grad_norm": 1.4782602042961164, "language_loss": 0.71069813, "learning_rate": 5.736657714818793e-10, "loss": 0.73496377, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.1920166, "step": 16509, "time_per_iteration": 2.8958420753479004 }, { "auxiliary_loss_clip": 0.01399668, "auxiliary_loss_mlp": 0.01033146, "balance_loss_clip": 1.23820674, "balance_loss_mlp": 1.01372671, "epoch": 0.9926349015481738, "flos": 60492271311360.0, "grad_norm": 1.6978387685509295, "language_loss": 0.6861819, "learning_rate": 5.643762279912146e-10, "loss": 0.71051002, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.1940918, "step": 16510, "time_per_iteration": 3.2113988399505615 }, { "auxiliary_loss_clip": 0.01410033, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.24769092, "balance_loss_mlp": 1.01496863, "epoch": 0.9926950248008417, "flos": 20751719639040.0, "grad_norm": 2.146345013020012, "language_loss": 0.82617652, "learning_rate": 5.551625032997886e-10, "loss": 0.85062242, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19592285, "step": 16511, "time_per_iteration": 2.863591432571411 }, { "auxiliary_loss_clip": 0.01387466, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.23033953, "balance_loss_mlp": 1.01405728, "epoch": 0.9927551480535097, "flos": 24363686770560.0, "grad_norm": 2.8046324104572484, "language_loss": 0.92675954, "learning_rate": 5.460245977570998e-10, "loss": 0.95096087, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.18615723, "step": 16512, "time_per_iteration": 2.90809965133667 }, { "auxiliary_loss_clip": 0.01176633, "auxiliary_loss_mlp": 0.01020354, "balance_loss_clip": 1.0903877, "balance_loss_mlp": 0.99899215, "epoch": 0.9928152713061776, "flos": 71308436693760.0, "grad_norm": 0.6957966975924443, "language_loss": 0.55233419, "learning_rate": 5.369625117095378e-10, "loss": 0.57430398, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 0.21386719, "step": 16513, "time_per_iteration": 3.543395757675171 }, { "auxiliary_loss_clip": 0.0139687, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.23847234, "balance_loss_mlp": 1.01230931, "epoch": 0.9928753945588457, "flos": 57828665329920.0, "grad_norm": 1.571783858715082, "language_loss": 0.65665495, "learning_rate": 5.279762455006054e-10, "loss": 0.68093646, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 1.58496094, "router_z_loss_mlp": 0.18969727, "step": 16514, "time_per_iteration": 3.189713716506958 }, { "auxiliary_loss_clip": 0.01403666, "auxiliary_loss_mlp": 0.0103398, "balance_loss_clip": 1.24184728, "balance_loss_mlp": 1.01355958, "epoch": 0.9929355178115136, "flos": 19577332874880.0, "grad_norm": 1.7857685573101947, "language_loss": 0.73790419, "learning_rate": 5.190657994713632e-10, "loss": 0.7622807, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 1.62011719, "router_z_loss_mlp": 0.20410156, "step": 16515, "time_per_iteration": 2.8217029571533203 }, { "auxiliary_loss_clip": 0.0140158, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.24170542, "balance_loss_mlp": 1.01571703, "epoch": 0.9929956410641816, "flos": 22974540347520.0, "grad_norm": 1.9306838252089873, "language_loss": 0.78132933, "learning_rate": 5.102311739593191e-10, "loss": 0.80570126, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.19909668, "step": 16516, "time_per_iteration": 2.961782217025757 }, { "auxiliary_loss_clip": 0.01392465, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.23428917, "balance_loss_mlp": 1.01432419, "epoch": 0.9930557643168495, "flos": 22576991120640.0, "grad_norm": 1.8202685866577524, "language_loss": 0.78752828, "learning_rate": 5.014723692997602e-10, "loss": 0.81178159, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.1854248, "step": 16517, "time_per_iteration": 2.8611419200897217 }, { "auxiliary_loss_clip": 0.01417403, "auxiliary_loss_mlp": 0.01037054, "balance_loss_clip": 1.25249112, "balance_loss_mlp": 1.01664567, "epoch": 0.9931158875695175, "flos": 17209710247680.0, "grad_norm": 2.4135607643736523, "language_loss": 0.68739825, "learning_rate": 4.927893858248655e-10, "loss": 0.71194279, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 1.6484375, "router_z_loss_mlp": 0.20397949, "step": 16518, "time_per_iteration": 2.81862473487854 }, { "auxiliary_loss_clip": 0.0117825, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.09114695, "balance_loss_mlp": 1.02028263, "epoch": 0.9931760108221854, "flos": 63739039248000.0, "grad_norm": 0.747765155854921, "language_loss": 0.53420138, "learning_rate": 4.84182223863483e-10, "loss": 0.55636024, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.17382812, "step": 16519, "time_per_iteration": 3.273587465286255 }, { "auxiliary_loss_clip": 0.01393179, "auxiliary_loss_mlp": 0.010313, "balance_loss_clip": 1.23513544, "balance_loss_mlp": 1.01295424, "epoch": 0.9932361340748534, "flos": 15313259416320.0, "grad_norm": 1.7024020232275632, "language_loss": 0.60703689, "learning_rate": 4.756508837426842e-10, "loss": 0.63128173, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.18347168, "step": 16520, "time_per_iteration": 2.8711400032043457 }, { "auxiliary_loss_clip": 0.0140051, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.24095845, "balance_loss_mlp": 1.01616895, "epoch": 0.9932962573275215, "flos": 36078670247040.0, "grad_norm": 1.644182003131036, "language_loss": 0.62296605, "learning_rate": 4.671953657853223e-10, "loss": 0.64732659, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 1.59472656, "router_z_loss_mlp": 0.19360352, "step": 16521, "time_per_iteration": 3.0150258541107178 }, { "auxiliary_loss_clip": 0.0140811, "auxiliary_loss_mlp": 0.01030996, "balance_loss_clip": 1.24604917, "balance_loss_mlp": 1.01084995, "epoch": 0.9933563805801894, "flos": 21480479936640.0, "grad_norm": 5.014743160059916, "language_loss": 0.75039792, "learning_rate": 4.5881567031225145e-10, "loss": 0.77478892, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.20141602, "step": 16522, "time_per_iteration": 2.8488998413085938 }, { "auxiliary_loss_clip": 0.01392194, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.23597908, "balance_loss_mlp": 1.01559806, "epoch": 0.9934165038328574, "flos": 23996750290560.0, "grad_norm": 1.6409053270800151, "language_loss": 0.74109495, "learning_rate": 4.5051179764143964e-10, "loss": 0.76536345, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.19067383, "step": 16523, "time_per_iteration": 2.8624582290649414 }, { "auxiliary_loss_clip": 0.01395485, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.23693252, "balance_loss_mlp": 1.01198769, "epoch": 0.9934766270855253, "flos": 21917736115200.0, "grad_norm": 3.3505942369369377, "language_loss": 0.71684539, "learning_rate": 4.422837480875241e-10, "loss": 0.74110651, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 1.58886719, "router_z_loss_mlp": 0.18640137, "step": 16524, "time_per_iteration": 4.320645093917847 }, { "auxiliary_loss_clip": 0.01406824, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.24593532, "balance_loss_mlp": 1.01396573, "epoch": 0.9935367503381933, "flos": 17138078449920.0, "grad_norm": 2.10892538392591, "language_loss": 0.80006212, "learning_rate": 4.341315219624775e-10, "loss": 0.82445866, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18847656, "step": 16525, "time_per_iteration": 2.8166589736938477 }, { "auxiliary_loss_clip": 0.01396361, "auxiliary_loss_mlp": 0.01029637, "balance_loss_clip": 1.23867989, "balance_loss_mlp": 1.01082623, "epoch": 0.9935968735908612, "flos": 22356440127360.0, "grad_norm": 1.661834005953681, "language_loss": 0.75919795, "learning_rate": 4.2605511957582995e-10, "loss": 0.78345799, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18823242, "step": 16526, "time_per_iteration": 4.30532693862915 }, { "auxiliary_loss_clip": 0.0138303, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.22880554, "balance_loss_mlp": 1.01599383, "epoch": 0.9936569968435293, "flos": 29472474245760.0, "grad_norm": 1.4781374473587419, "language_loss": 0.73051143, "learning_rate": 4.180545412333369e-10, "loss": 0.75467688, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.1751709, "step": 16527, "time_per_iteration": 2.917006015777588 }, { "auxiliary_loss_clip": 0.01396405, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.23486137, "balance_loss_mlp": 1.01362395, "epoch": 0.9937171200961972, "flos": 16552491258240.0, "grad_norm": 2.017788879356323, "language_loss": 0.76140904, "learning_rate": 4.1012978723875547e-10, "loss": 0.78569865, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 1.61425781, "router_z_loss_mlp": 0.18933105, "step": 16528, "time_per_iteration": 2.8136661052703857 }, { "auxiliary_loss_clip": 0.01407881, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.24581707, "balance_loss_mlp": 1.01295114, "epoch": 0.9937772433488652, "flos": 24401221971840.0, "grad_norm": 2.3106751846310747, "language_loss": 0.68531692, "learning_rate": 4.022808578922898e-10, "loss": 0.70971847, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.1932373, "step": 16529, "time_per_iteration": 2.971472978591919 }, { "auxiliary_loss_clip": 0.01407296, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.24378955, "balance_loss_mlp": 1.01469457, "epoch": 0.9938373666015331, "flos": 15678521838720.0, "grad_norm": 2.443467747913877, "language_loss": 0.65763223, "learning_rate": 3.9450775349170186e-10, "loss": 0.68204534, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 1.63574219, "router_z_loss_mlp": 0.19335938, "step": 16530, "time_per_iteration": 2.8498618602752686 }, { "auxiliary_loss_clip": 0.01413041, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.25200319, "balance_loss_mlp": 1.01282108, "epoch": 0.9938974898542011, "flos": 19504569957120.0, "grad_norm": 11.542830933465353, "language_loss": 0.71749127, "learning_rate": 3.8681047433186676e-10, "loss": 0.74193102, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 1.61035156, "router_z_loss_mlp": 0.18115234, "step": 16531, "time_per_iteration": 2.7993905544281006 }, { "auxiliary_loss_clip": 0.01394957, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.23459375, "balance_loss_mlp": 1.01420522, "epoch": 0.993957613106869, "flos": 26918261487360.0, "grad_norm": 1.5497651055396993, "language_loss": 0.74781752, "learning_rate": 3.791890207045512e-10, "loss": 0.77209246, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18334961, "step": 16532, "time_per_iteration": 2.9319751262664795 }, { "auxiliary_loss_clip": 0.01378712, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.22603512, "balance_loss_mlp": 1.01372528, "epoch": 0.994017736359537, "flos": 14947861259520.0, "grad_norm": 1.731568180411468, "language_loss": 0.71301651, "learning_rate": 3.7164339289885717e-10, "loss": 0.73712599, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 1.52734375, "router_z_loss_mlp": 0.18518066, "step": 16533, "time_per_iteration": 2.8201966285705566 }, { "auxiliary_loss_clip": 0.01407913, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.24653554, "balance_loss_mlp": 1.01537323, "epoch": 0.9940778596122051, "flos": 15386384292480.0, "grad_norm": 3.583756756501591, "language_loss": 0.85090053, "learning_rate": 3.641735912007782e-10, "loss": 0.87533057, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 1.61523438, "router_z_loss_mlp": 0.19714355, "step": 16534, "time_per_iteration": 4.2584614753723145 }, { "auxiliary_loss_clip": 0.01376684, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.22428846, "balance_loss_mlp": 1.01318944, "epoch": 0.994137982864873, "flos": 25238923044480.0, "grad_norm": 2.3466282904023372, "language_loss": 0.66988671, "learning_rate": 3.567796158934211e-10, "loss": 0.69397449, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 1.52441406, "router_z_loss_mlp": 0.18896484, "step": 16535, "time_per_iteration": 4.344003200531006 }, { "auxiliary_loss_clip": 0.0140103, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.24277771, "balance_loss_mlp": 1.01352406, "epoch": 0.994198106117541, "flos": 18451294819200.0, "grad_norm": 1.524482674255288, "language_loss": 0.65694427, "learning_rate": 3.4946146725767235e-10, "loss": 0.68127698, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 1.58300781, "router_z_loss_mlp": 0.18725586, "step": 16536, "time_per_iteration": 2.8118183612823486 }, { "auxiliary_loss_clip": 0.01396418, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.2385689, "balance_loss_mlp": 1.01301503, "epoch": 0.9942582293702089, "flos": 16662744132480.0, "grad_norm": 1.9814036551872931, "language_loss": 0.79632115, "learning_rate": 3.4221914557064357e-10, "loss": 0.82060653, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.19104004, "step": 16537, "time_per_iteration": 2.8669991493225098 }, { "auxiliary_loss_clip": 0.01420741, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.25438643, "balance_loss_mlp": 1.01275134, "epoch": 0.9943183526228769, "flos": 21954592644480.0, "grad_norm": 1.8344150213236963, "language_loss": 0.69376463, "learning_rate": 3.35052651107004e-10, "loss": 0.71829534, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 1.66210938, "router_z_loss_mlp": 0.19580078, "step": 16538, "time_per_iteration": 2.852799654006958 }, { "auxiliary_loss_clip": 0.01377108, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.22151411, "balance_loss_mlp": 1.01220751, "epoch": 0.9943784758755448, "flos": 23853260471040.0, "grad_norm": 2.2013402973233265, "language_loss": 0.76024354, "learning_rate": 3.2796198413853614e-10, "loss": 0.7843284, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 1.55761719, "router_z_loss_mlp": 0.19165039, "step": 16539, "time_per_iteration": 2.8497817516326904 }, { "auxiliary_loss_clip": 0.01407007, "auxiliary_loss_mlp": 0.01032187, "balance_loss_clip": 1.24459803, "balance_loss_mlp": 1.01328039, "epoch": 0.9944385991282129, "flos": 21479801264640.0, "grad_norm": 2.3366954031232274, "language_loss": 0.71337545, "learning_rate": 3.209471449341361e-10, "loss": 0.7377674, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.18896484, "step": 16540, "time_per_iteration": 2.8357419967651367 }, { "auxiliary_loss_clip": 0.01381318, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.22563505, "balance_loss_mlp": 1.00953639, "epoch": 0.9944987223808808, "flos": 22936597943040.0, "grad_norm": 2.1395800030861327, "language_loss": 0.76214004, "learning_rate": 3.140081337600353e-10, "loss": 0.78622246, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 1.55957031, "router_z_loss_mlp": 0.17370605, "step": 16541, "time_per_iteration": 2.8595893383026123 }, { "auxiliary_loss_clip": 0.01393737, "auxiliary_loss_mlp": 0.01035314, "balance_loss_clip": 1.2342937, "balance_loss_mlp": 1.0157516, "epoch": 0.9945588456335488, "flos": 22393296656640.0, "grad_norm": 1.6260254176434317, "language_loss": 0.77466238, "learning_rate": 3.0714495087891255e-10, "loss": 0.79895294, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19567871, "step": 16542, "time_per_iteration": 2.8494691848754883 }, { "auxiliary_loss_clip": 0.01403848, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.24286032, "balance_loss_mlp": 1.01076388, "epoch": 0.9946189688862167, "flos": 21407445550080.0, "grad_norm": 2.2253198757971435, "language_loss": 0.75790095, "learning_rate": 3.0035759655122615e-10, "loss": 0.78224277, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 1.609375, "router_z_loss_mlp": 0.19580078, "step": 16543, "time_per_iteration": 2.8462002277374268 }, { "auxiliary_loss_clip": 0.01426161, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.26197004, "balance_loss_mlp": 1.01293063, "epoch": 0.9946790921388847, "flos": 12422722924800.0, "grad_norm": 2.1362217633574323, "language_loss": 0.82573128, "learning_rate": 2.9364607103454785e-10, "loss": 0.85030955, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 1.64160156, "router_z_loss_mlp": 0.18737793, "step": 16544, "time_per_iteration": 2.806331157684326 }, { "auxiliary_loss_clip": 0.01404687, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.24503183, "balance_loss_mlp": 1.01259434, "epoch": 0.9947392153915526, "flos": 19066544616960.0, "grad_norm": 13.504331585187186, "language_loss": 0.79688418, "learning_rate": 2.870103745831187e-10, "loss": 0.82124364, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.18676758, "step": 16545, "time_per_iteration": 2.8460733890533447 }, { "auxiliary_loss_clip": 0.01399431, "auxiliary_loss_mlp": 0.01033941, "balance_loss_clip": 1.23758841, "balance_loss_mlp": 1.01490295, "epoch": 0.9947993386442207, "flos": 27320425683840.0, "grad_norm": 1.6076735466552663, "language_loss": 0.72715509, "learning_rate": 2.8045050744873733e-10, "loss": 0.7514888, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 1.61914062, "router_z_loss_mlp": 0.19042969, "step": 16546, "time_per_iteration": 2.9006974697113037 }, { "auxiliary_loss_clip": 0.01387393, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.23037589, "balance_loss_mlp": 1.01309824, "epoch": 0.9948594618968887, "flos": 20814392966400.0, "grad_norm": 4.968101643896007, "language_loss": 0.77799022, "learning_rate": 2.739664698798716e-10, "loss": 0.80217075, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17578125, "step": 16547, "time_per_iteration": 2.877582550048828 }, { "auxiliary_loss_clip": 0.01405144, "auxiliary_loss_mlp": 0.01032465, "balance_loss_clip": 1.2442528, "balance_loss_mlp": 1.01500118, "epoch": 0.9949195851495566, "flos": 23302900995840.0, "grad_norm": 2.2761940095884055, "language_loss": 0.70516878, "learning_rate": 2.67558262122769e-10, "loss": 0.72954488, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.17456055, "step": 16548, "time_per_iteration": 2.8586854934692383 }, { "auxiliary_loss_clip": 0.01399428, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.24195158, "balance_loss_mlp": 1.00993884, "epoch": 0.9949797084022246, "flos": 18524736408960.0, "grad_norm": 2.4604662499845253, "language_loss": 0.76166636, "learning_rate": 2.6122588442012427e-10, "loss": 0.78594226, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 1.57421875, "router_z_loss_mlp": 0.18225098, "step": 16549, "time_per_iteration": 2.8484036922454834 }, { "auxiliary_loss_clip": 0.01406805, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.24459398, "balance_loss_mlp": 1.01348031, "epoch": 0.9950398316548925, "flos": 30419523296640.0, "grad_norm": 1.6203421638132818, "language_loss": 0.75008857, "learning_rate": 2.5496933701241177e-10, "loss": 0.7744925, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 1.62109375, "router_z_loss_mlp": 0.20092773, "step": 16550, "time_per_iteration": 3.0100960731506348 }, { "auxiliary_loss_clip": 0.01396902, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.23761773, "balance_loss_mlp": 1.01647866, "epoch": 0.9950999549075605, "flos": 19910218003200.0, "grad_norm": 31.58311427129161, "language_loss": 0.78467917, "learning_rate": 2.4878862013655297e-10, "loss": 0.80899036, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.17724609, "step": 16551, "time_per_iteration": 2.824270248413086 }, { "auxiliary_loss_clip": 0.01368453, "auxiliary_loss_mlp": 0.01031678, "balance_loss_clip": 1.21710515, "balance_loss_mlp": 1.01446462, "epoch": 0.9951600781602284, "flos": 17612643605760.0, "grad_norm": 1.3660177939649691, "language_loss": 0.67153275, "learning_rate": 2.426837340270271e-10, "loss": 0.69553405, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 1.51367188, "router_z_loss_mlp": 0.17224121, "step": 16552, "time_per_iteration": 2.871899366378784 }, { "auxiliary_loss_clip": 0.01390036, "auxiliary_loss_mlp": 0.0103134, "balance_loss_clip": 1.23126912, "balance_loss_mlp": 1.01227832, "epoch": 0.9952202014128965, "flos": 28962771863040.0, "grad_norm": 1.3483168737158358, "language_loss": 0.81571096, "learning_rate": 2.3665467891520465e-10, "loss": 0.83992481, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 1.5859375, "router_z_loss_mlp": 0.19055176, "step": 16553, "time_per_iteration": 2.9152684211730957 }, { "auxiliary_loss_clip": 0.01178448, "auxiliary_loss_mlp": 0.01020555, "balance_loss_clip": 1.09008861, "balance_loss_mlp": 0.99871588, "epoch": 0.9952803246655644, "flos": 70845499451520.0, "grad_norm": 0.7164974794391568, "language_loss": 0.57347679, "learning_rate": 2.3070145503001348e-10, "loss": 0.59546685, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.21875, "step": 16554, "time_per_iteration": 3.439408779144287 }, { "auxiliary_loss_clip": 0.01400658, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.23995447, "balance_loss_mlp": 1.01540852, "epoch": 0.9953404479182324, "flos": 21809157298560.0, "grad_norm": 1.629216354804764, "language_loss": 0.77565455, "learning_rate": 2.24824062597051e-10, "loss": 0.79999113, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.17578125, "step": 16555, "time_per_iteration": 2.872360944747925 }, { "auxiliary_loss_clip": 0.01396867, "auxiliary_loss_mlp": 0.0103111, "balance_loss_clip": 1.23773015, "balance_loss_mlp": 1.01180971, "epoch": 0.9954005711709003, "flos": 21945543684480.0, "grad_norm": 1.8263083780306948, "language_loss": 0.86491007, "learning_rate": 2.1902250183902793e-10, "loss": 0.88918984, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.19299316, "step": 16556, "time_per_iteration": 2.8362884521484375 }, { "auxiliary_loss_clip": 0.01386884, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.2299782, "balance_loss_mlp": 1.0130049, "epoch": 0.9954606944235683, "flos": 19364021049600.0, "grad_norm": 1.6052679986389666, "language_loss": 0.73544562, "learning_rate": 2.132967729762125e-10, "loss": 0.75962937, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 1.5703125, "router_z_loss_mlp": 0.18493652, "step": 16557, "time_per_iteration": 2.829871654510498 }, { "auxiliary_loss_clip": 0.01390554, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.23435366, "balance_loss_mlp": 1.01472962, "epoch": 0.9955208176762362, "flos": 30530816801280.0, "grad_norm": 1.8559815414510208, "language_loss": 0.76889741, "learning_rate": 2.0764687622554233e-10, "loss": 0.79313219, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 0.18188477, "step": 16558, "time_per_iteration": 2.9124436378479004 }, { "auxiliary_loss_clip": 0.01394311, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.23384953, "balance_loss_mlp": 1.01540112, "epoch": 0.9955809409289043, "flos": 30020435746560.0, "grad_norm": 1.9371249323555864, "language_loss": 0.64146626, "learning_rate": 2.0207281180129044e-10, "loss": 0.66575515, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19189453, "step": 16559, "time_per_iteration": 4.340106010437012 }, { "auxiliary_loss_clip": 0.01392463, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.23363566, "balance_loss_mlp": 1.01386976, "epoch": 0.9956410641815723, "flos": 21553197609600.0, "grad_norm": 2.2257896302363105, "language_loss": 0.75287116, "learning_rate": 1.965745799148433e-10, "loss": 0.77711761, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1829834, "step": 16560, "time_per_iteration": 2.964780330657959 }, { "auxiliary_loss_clip": 0.01395852, "auxiliary_loss_mlp": 0.01030271, "balance_loss_clip": 1.237746, "balance_loss_mlp": 1.01230633, "epoch": 0.9957011874342402, "flos": 21699492606720.0, "grad_norm": 1.8621997286815193, "language_loss": 0.80222762, "learning_rate": 1.9115218077470073e-10, "loss": 0.82648885, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.17956543, "step": 16561, "time_per_iteration": 4.256886005401611 }, { "auxiliary_loss_clip": 0.01392285, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.23596239, "balance_loss_mlp": 1.01158071, "epoch": 0.9957613106869082, "flos": 17709096816000.0, "grad_norm": 2.1983330892316446, "language_loss": 0.66828388, "learning_rate": 1.8580561458647614e-10, "loss": 0.69251168, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.18933105, "step": 16562, "time_per_iteration": 2.7956676483154297 }, { "auxiliary_loss_clip": 0.01413583, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.24991703, "balance_loss_mlp": 1.01641369, "epoch": 0.9958214339395761, "flos": 30568442492160.0, "grad_norm": 1.6603827982878892, "language_loss": 0.64787471, "learning_rate": 1.805348815528962e-10, "loss": 0.67238069, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 1.63671875, "router_z_loss_mlp": 0.20593262, "step": 16563, "time_per_iteration": 2.9021146297454834 }, { "auxiliary_loss_clip": 0.01388072, "auxiliary_loss_mlp": 0.0103451, "balance_loss_clip": 1.23055601, "balance_loss_mlp": 1.01560318, "epoch": 0.9958815571922441, "flos": 24179494613760.0, "grad_norm": 8.367077508876621, "language_loss": 0.65243554, "learning_rate": 1.7533998187380105e-10, "loss": 0.67666137, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18908691, "step": 16564, "time_per_iteration": 2.8833401203155518 }, { "auxiliary_loss_clip": 0.01394622, "auxiliary_loss_mlp": 0.01030681, "balance_loss_clip": 1.23707473, "balance_loss_mlp": 1.0115118, "epoch": 0.995941680444912, "flos": 15495460801920.0, "grad_norm": 1.9619007843621288, "language_loss": 0.74881655, "learning_rate": 1.7022091574636633e-10, "loss": 0.77306956, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.19165039, "step": 16565, "time_per_iteration": 2.8087565898895264 }, { "auxiliary_loss_clip": 0.01390181, "auxiliary_loss_mlp": 0.01028636, "balance_loss_clip": 1.23087716, "balance_loss_mlp": 1.01032579, "epoch": 0.9960018036975801, "flos": 18629559907200.0, "grad_norm": 1.8826164932645824, "language_loss": 0.79443848, "learning_rate": 1.6517768336443694e-10, "loss": 0.81862664, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.18310547, "step": 16566, "time_per_iteration": 2.8383727073669434 }, { "auxiliary_loss_clip": 0.01390328, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.23396802, "balance_loss_mlp": 1.01344299, "epoch": 0.996061926950248, "flos": 20093731488000.0, "grad_norm": 1.790822228282713, "language_loss": 0.71623898, "learning_rate": 1.6021028491941535e-10, "loss": 0.74046159, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 1.56347656, "router_z_loss_mlp": 0.18481445, "step": 16567, "time_per_iteration": 2.837270975112915 }, { "auxiliary_loss_clip": 0.01407763, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 1.24551928, "balance_loss_mlp": 1.01466179, "epoch": 0.996122050202916, "flos": 24357216764160.0, "grad_norm": 2.2269843901567383, "language_loss": 0.79630184, "learning_rate": 1.5531872059959538e-10, "loss": 0.82072598, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19995117, "step": 16568, "time_per_iteration": 2.875387668609619 }, { "auxiliary_loss_clip": 0.01376494, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.22247648, "balance_loss_mlp": 1.00873554, "epoch": 0.9961821734555839, "flos": 24207980855040.0, "grad_norm": 1.677873999686422, "language_loss": 0.82046902, "learning_rate": 1.5050299059060634e-10, "loss": 0.84449911, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.17785645, "step": 16569, "time_per_iteration": 4.355791807174683 }, { "auxiliary_loss_clip": 0.01381217, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.22702801, "balance_loss_mlp": 1.01405251, "epoch": 0.9962422967082519, "flos": 22642876828800.0, "grad_norm": 2.8244713516959354, "language_loss": 0.71636248, "learning_rate": 1.457630950747468e-10, "loss": 0.74049711, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 1.54003906, "router_z_loss_mlp": 0.18188477, "step": 16570, "time_per_iteration": 4.341177225112915 }, { "auxiliary_loss_clip": 0.01389352, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.23162413, "balance_loss_mlp": 1.01110005, "epoch": 0.9963024199609198, "flos": 26407247005440.0, "grad_norm": 1.5296586335916629, "language_loss": 0.75319993, "learning_rate": 1.4109903423209502e-10, "loss": 0.77739084, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18640137, "step": 16571, "time_per_iteration": 3.0083255767822266 }, { "auxiliary_loss_clip": 0.01395412, "auxiliary_loss_mlp": 0.01035145, "balance_loss_clip": 1.23737574, "balance_loss_mlp": 1.01509392, "epoch": 0.9963625432135879, "flos": 16590252683520.0, "grad_norm": 1.968891648190455, "language_loss": 0.81085801, "learning_rate": 1.3651080823939843e-10, "loss": 0.83516359, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.20043945, "step": 16572, "time_per_iteration": 2.8354341983795166 }, { "auxiliary_loss_clip": 0.01398146, "auxiliary_loss_mlp": 0.01029306, "balance_loss_clip": 1.23982239, "balance_loss_mlp": 1.01061392, "epoch": 0.9964226664662559, "flos": 26479647964800.0, "grad_norm": 1.9142900637593965, "language_loss": 0.7127713, "learning_rate": 1.3199841727074e-10, "loss": 0.73704582, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 1.58203125, "router_z_loss_mlp": 0.18688965, "step": 16573, "time_per_iteration": 2.885568857192993 }, { "auxiliary_loss_clip": 0.01421204, "auxiliary_loss_mlp": 0.01035964, "balance_loss_clip": 1.25591803, "balance_loss_mlp": 1.01593733, "epoch": 0.9964827897189238, "flos": 27458305148160.0, "grad_norm": 2.309299391862734, "language_loss": 0.64140701, "learning_rate": 1.275618614968721e-10, "loss": 0.66597867, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.20031738, "step": 16574, "time_per_iteration": 2.8985610008239746 }, { "auxiliary_loss_clip": 0.01425131, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.26037836, "balance_loss_mlp": 1.01521873, "epoch": 0.9965429129715918, "flos": 11727109082880.0, "grad_norm": 2.1496306052284884, "language_loss": 0.77342445, "learning_rate": 1.2320114108654856e-10, "loss": 0.79802024, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 1.64648438, "router_z_loss_mlp": 0.19238281, "step": 16575, "time_per_iteration": 2.809224843978882 }, { "auxiliary_loss_clip": 0.01390726, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.23156977, "balance_loss_mlp": 1.01279211, "epoch": 0.9966030362242597, "flos": 19765597063680.0, "grad_norm": 2.9846587427359714, "language_loss": 0.71143079, "learning_rate": 1.1891625620474855e-10, "loss": 0.73565412, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18823242, "step": 16576, "time_per_iteration": 2.863509178161621 }, { "auxiliary_loss_clip": 0.01384904, "auxiliary_loss_mlp": 0.01032784, "balance_loss_clip": 1.23074245, "balance_loss_mlp": 1.01357901, "epoch": 0.9966631594769277, "flos": 23925797164800.0, "grad_norm": 1.5790313537424852, "language_loss": 0.7326262, "learning_rate": 1.1470720701400871e-10, "loss": 0.75680315, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 1.54394531, "router_z_loss_mlp": 0.1920166, "step": 16577, "time_per_iteration": 2.8518576622009277 }, { "auxiliary_loss_clip": 0.01397947, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.23760796, "balance_loss_mlp": 1.01464009, "epoch": 0.9967232827295956, "flos": 15567952250880.0, "grad_norm": 3.0455610229032692, "language_loss": 0.79107273, "learning_rate": 1.1057399367397912e-10, "loss": 0.81538683, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18811035, "step": 16578, "time_per_iteration": 2.823291301727295 }, { "auxiliary_loss_clip": 0.01395038, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.2361629, "balance_loss_mlp": 1.01322067, "epoch": 0.9967834059822637, "flos": 20822627520000.0, "grad_norm": 1.7639218622557369, "language_loss": 0.76658219, "learning_rate": 1.0651661634142328e-10, "loss": 0.79084218, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.17749023, "step": 16579, "time_per_iteration": 2.835808753967285 }, { "auxiliary_loss_clip": 0.01412097, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.25088692, "balance_loss_mlp": 1.01380956, "epoch": 0.9968435292349316, "flos": 36732541121280.0, "grad_norm": 2.679600034790059, "language_loss": 0.70266658, "learning_rate": 1.0253507516999604e-10, "loss": 0.72712588, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 1.61328125, "router_z_loss_mlp": 0.20019531, "step": 16580, "time_per_iteration": 3.009549379348755 }, { "auxiliary_loss_clip": 0.0140181, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.24139857, "balance_loss_mlp": 1.01274133, "epoch": 0.9969036524875996, "flos": 26772237959040.0, "grad_norm": 1.6966015252597766, "language_loss": 0.79821157, "learning_rate": 9.862937031113184e-11, "loss": 0.82253438, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 1.60351562, "router_z_loss_mlp": 0.17724609, "step": 16581, "time_per_iteration": 2.9062957763671875 }, { "auxiliary_loss_clip": 0.01390278, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.23375285, "balance_loss_mlp": 1.01299334, "epoch": 0.9969637757402675, "flos": 24838070947200.0, "grad_norm": 1.797496817652888, "language_loss": 0.81369692, "learning_rate": 9.479950191249031e-11, "loss": 0.83790356, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 1.56542969, "router_z_loss_mlp": 0.17382812, "step": 16582, "time_per_iteration": 2.8926329612731934 }, { "auxiliary_loss_clip": 0.0138706, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.23246086, "balance_loss_mlp": 1.01387453, "epoch": 0.9970238989929355, "flos": 23048932078080.0, "grad_norm": 1.7024672963279959, "language_loss": 0.61400491, "learning_rate": 9.104547011951069e-11, "loss": 0.63819844, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 1.54296875, "router_z_loss_mlp": 0.18432617, "step": 16583, "time_per_iteration": 2.8819990158081055 }, { "auxiliary_loss_clip": 0.01399315, "auxiliary_loss_mlp": 0.01029634, "balance_loss_clip": 1.23956358, "balance_loss_mlp": 1.01151371, "epoch": 0.9970840222456034, "flos": 25309061763840.0, "grad_norm": 1.7140377300823582, "language_loss": 0.78340805, "learning_rate": 8.736727507452357e-11, "loss": 0.80769747, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 1.59765625, "router_z_loss_mlp": 0.18115234, "step": 16584, "time_per_iteration": 2.8664238452911377 }, { "auxiliary_loss_clip": 0.01393075, "auxiliary_loss_mlp": 0.01034068, "balance_loss_clip": 1.2362448, "balance_loss_mlp": 1.01634145, "epoch": 0.9971441454982715, "flos": 21625372344960.0, "grad_norm": 1.618210996073375, "language_loss": 0.6976645, "learning_rate": 8.376491691697297e-11, "loss": 0.72193593, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 1.56933594, "router_z_loss_mlp": 0.17724609, "step": 16585, "time_per_iteration": 2.8620927333831787 }, { "auxiliary_loss_clip": 0.01391839, "auxiliary_loss_mlp": 0.01033468, "balance_loss_clip": 1.23420978, "balance_loss_mlp": 1.01483548, "epoch": 0.9972042687509394, "flos": 14983767648000.0, "grad_norm": 2.894940839137588, "language_loss": 0.82285881, "learning_rate": 8.023839578363834e-11, "loss": 0.84711188, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 1.57714844, "router_z_loss_mlp": 0.18640137, "step": 16586, "time_per_iteration": 2.8479483127593994 }, { "auxiliary_loss_clip": 0.01403443, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.24305916, "balance_loss_mlp": 1.01246226, "epoch": 0.9972643920036074, "flos": 25816275682560.0, "grad_norm": 1.6613899031229435, "language_loss": 0.79136521, "learning_rate": 7.678771180796851e-11, "loss": 0.81570768, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 1.60546875, "router_z_loss_mlp": 0.18359375, "step": 16587, "time_per_iteration": 2.8912594318389893 }, { "auxiliary_loss_clip": 0.01411361, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.25063825, "balance_loss_mlp": 1.0190804, "epoch": 0.9973245152562754, "flos": 23335368779520.0, "grad_norm": 2.2102800654999966, "language_loss": 0.73601741, "learning_rate": 7.341286512074773e-11, "loss": 0.76050556, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 1.60839844, "router_z_loss_mlp": 0.18371582, "step": 16588, "time_per_iteration": 2.880502223968506 }, { "auxiliary_loss_clip": 0.01415396, "auxiliary_loss_mlp": 0.01034244, "balance_loss_clip": 1.25046158, "balance_loss_mlp": 1.01593387, "epoch": 0.9973846385089433, "flos": 12173640445440.0, "grad_norm": 2.718181350972074, "language_loss": 0.83705145, "learning_rate": 7.011385585031781e-11, "loss": 0.86154789, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 1.64746094, "router_z_loss_mlp": 0.18310547, "step": 16589, "time_per_iteration": 2.816628932952881 }, { "auxiliary_loss_clip": 0.0141155, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.24609351, "balance_loss_mlp": 1.01592755, "epoch": 0.9974447617616113, "flos": 20054115025920.0, "grad_norm": 1.9981506705331358, "language_loss": 0.71551174, "learning_rate": 6.689068412168986e-11, "loss": 0.73998255, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 1.65332031, "router_z_loss_mlp": 0.19592285, "step": 16590, "time_per_iteration": 2.908003330230713 }, { "auxiliary_loss_clip": 0.01408014, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.2456429, "balance_loss_mlp": 1.01354373, "epoch": 0.9975048850142793, "flos": 32027229941760.0, "grad_norm": 1.8417622543140382, "language_loss": 0.63712478, "learning_rate": 6.374335005676634e-11, "loss": 0.66153407, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 1.62304688, "router_z_loss_mlp": 0.19372559, "step": 16591, "time_per_iteration": 2.944382429122925 }, { "auxiliary_loss_clip": 0.01391563, "auxiliary_loss_mlp": 0.0103177, "balance_loss_clip": 1.23115504, "balance_loss_mlp": 1.01255369, "epoch": 0.9975650082669473, "flos": 36945943436160.0, "grad_norm": 1.901774205437116, "language_loss": 0.74192953, "learning_rate": 6.067185377522933e-11, "loss": 0.76616287, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.19213867, "step": 16592, "time_per_iteration": 2.9663896560668945 }, { "auxiliary_loss_clip": 0.01403864, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.24385607, "balance_loss_mlp": 1.01674843, "epoch": 0.9976251315196152, "flos": 16480814215680.0, "grad_norm": 1.4712718225120045, "language_loss": 0.85653269, "learning_rate": 5.767619539343016e-11, "loss": 0.88092691, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18811035, "step": 16593, "time_per_iteration": 2.838838815689087 }, { "auxiliary_loss_clip": 0.0139547, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.23801255, "balance_loss_mlp": 1.01357341, "epoch": 0.9976852547722832, "flos": 19656475309440.0, "grad_norm": 2.012454126069002, "language_loss": 0.70616508, "learning_rate": 5.4756375024833656e-11, "loss": 0.7304343, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 1.57617188, "router_z_loss_mlp": 0.17883301, "step": 16594, "time_per_iteration": 4.3265392780303955 }, { "auxiliary_loss_clip": 0.01409871, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.24752963, "balance_loss_mlp": 1.01117289, "epoch": 0.9977453780249511, "flos": 20457998524800.0, "grad_norm": 2.2252551533356257, "language_loss": 0.73193485, "learning_rate": 5.1912392780462113e-11, "loss": 0.75633556, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 1.62402344, "router_z_loss_mlp": 0.19018555, "step": 16595, "time_per_iteration": 2.827493667602539 }, { "auxiliary_loss_clip": 0.01178743, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 1.09158635, "balance_loss_mlp": 1.00185931, "epoch": 0.9978055012776191, "flos": 65481458221440.0, "grad_norm": 0.792229855630669, "language_loss": 0.60304224, "learning_rate": 4.9144248768007156e-11, "loss": 0.62507427, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 0.22558594, "step": 16596, "time_per_iteration": 4.636451005935669 }, { "auxiliary_loss_clip": 0.01403396, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.24274564, "balance_loss_mlp": 1.01459837, "epoch": 0.997865624530287, "flos": 20641285785600.0, "grad_norm": 1.85641297136318, "language_loss": 0.78347909, "learning_rate": 4.645194309227385e-11, "loss": 0.80784285, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 1.60449219, "router_z_loss_mlp": 0.18383789, "step": 16597, "time_per_iteration": 2.836442470550537 }, { "auxiliary_loss_clip": 0.01399435, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.23872483, "balance_loss_mlp": 1.01294494, "epoch": 0.9979257477829551, "flos": 29398896921600.0, "grad_norm": 1.8146283617060635, "language_loss": 0.8280766, "learning_rate": 4.383547585562475e-11, "loss": 0.85239506, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 1.60742188, "router_z_loss_mlp": 0.19470215, "step": 16598, "time_per_iteration": 2.8840250968933105 }, { "auxiliary_loss_clip": 0.0142017, "auxiliary_loss_mlp": 0.01036716, "balance_loss_clip": 1.25348186, "balance_loss_mlp": 1.016927, "epoch": 0.997985871035623, "flos": 22644550886400.0, "grad_norm": 1.809576199169239, "language_loss": 0.64948046, "learning_rate": 4.129484715709175e-11, "loss": 0.67404926, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 1.66699219, "router_z_loss_mlp": 0.19787598, "step": 16599, "time_per_iteration": 2.858654260635376 }, { "auxiliary_loss_clip": 0.01180519, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.09288085, "balance_loss_mlp": 1.00202513, "epoch": 0.998045994288291, "flos": 61832498826240.0, "grad_norm": 0.8619244498028018, "language_loss": 0.62409866, "learning_rate": 3.8830057093264256e-11, "loss": 0.64617968, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 0.25585938, "step": 16600, "time_per_iteration": 3.198683023452759 }, { "auxiliary_loss_clip": 0.01395378, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.23751843, "balance_loss_mlp": 1.01281726, "epoch": 0.998106117540959, "flos": 19255080274560.0, "grad_norm": 1.5956996145999227, "language_loss": 0.79177636, "learning_rate": 3.644110575717896e-11, "loss": 0.81604254, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 1.57910156, "router_z_loss_mlp": 0.18432617, "step": 16601, "time_per_iteration": 2.8326947689056396 }, { "auxiliary_loss_clip": 0.01404637, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.24252868, "balance_loss_mlp": 1.01420021, "epoch": 0.9981662407936269, "flos": 21116212899840.0, "grad_norm": 1.8422130828921273, "language_loss": 0.82738602, "learning_rate": 3.412799323987414e-11, "loss": 0.85175931, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 1.62207031, "router_z_loss_mlp": 0.18493652, "step": 16602, "time_per_iteration": 2.878427267074585 }, { "auxiliary_loss_clip": 0.01406451, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.24720526, "balance_loss_mlp": 1.01292861, "epoch": 0.998226364046295, "flos": 24327644647680.0, "grad_norm": 1.9545090990409062, "language_loss": 0.63167262, "learning_rate": 3.189071962883538e-11, "loss": 0.65605712, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 1.58984375, "router_z_loss_mlp": 0.1907959, "step": 16603, "time_per_iteration": 2.860781192779541 }, { "auxiliary_loss_clip": 0.01400942, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.24061799, "balance_loss_mlp": 1.01485372, "epoch": 0.9982864872989629, "flos": 23845478365440.0, "grad_norm": 1.8614833045719852, "language_loss": 0.71905941, "learning_rate": 2.972928500866168e-11, "loss": 0.74341702, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.19958496, "step": 16604, "time_per_iteration": 2.877634048461914 }, { "auxiliary_loss_clip": 0.01397329, "auxiliary_loss_mlp": 0.01028447, "balance_loss_clip": 1.23838484, "balance_loss_mlp": 1.01007736, "epoch": 0.9983466105516309, "flos": 18342354044160.0, "grad_norm": 1.7367280129996039, "language_loss": 0.65306723, "learning_rate": 2.7643689461953613e-11, "loss": 0.67732495, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18371582, "step": 16605, "time_per_iteration": 4.352726459503174 }, { "auxiliary_loss_clip": 0.01392571, "auxiliary_loss_mlp": 0.01033673, "balance_loss_clip": 1.23534393, "balance_loss_mlp": 1.01438451, "epoch": 0.9984067338042988, "flos": 17245254677760.0, "grad_norm": 1.635113885871471, "language_loss": 0.71749115, "learning_rate": 2.5633933067092938e-11, "loss": 0.74175358, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 1.57226562, "router_z_loss_mlp": 0.19299316, "step": 16606, "time_per_iteration": 4.232144594192505 }, { "auxiliary_loss_clip": 0.01394121, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.23600972, "balance_loss_mlp": 1.01351452, "epoch": 0.9984668570569668, "flos": 20677644622080.0, "grad_norm": 2.0419980330408642, "language_loss": 0.82525891, "learning_rate": 2.370001590090709e-11, "loss": 0.84954298, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 1.58007812, "router_z_loss_mlp": 0.20739746, "step": 16607, "time_per_iteration": 2.8524351119995117 }, { "auxiliary_loss_clip": 0.01403551, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.24010694, "balance_loss_mlp": 1.01549208, "epoch": 0.9985269803096347, "flos": 30274495153920.0, "grad_norm": 1.557093765990386, "language_loss": 0.67675805, "learning_rate": 2.184193803622669e-11, "loss": 0.70115358, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 1.63378906, "router_z_loss_mlp": 0.20507812, "step": 16608, "time_per_iteration": 2.961256742477417 }, { "auxiliary_loss_clip": 0.01400924, "auxiliary_loss_mlp": 0.01028543, "balance_loss_clip": 1.24183714, "balance_loss_mlp": 1.01030397, "epoch": 0.9985871035623027, "flos": 10568015061120.0, "grad_norm": 1.9038565287429294, "language_loss": 0.81254333, "learning_rate": 2.0059699543883978e-11, "loss": 0.83683801, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 1.59082031, "router_z_loss_mlp": 0.18237305, "step": 16609, "time_per_iteration": 2.813807964324951 }, { "auxiliary_loss_clip": 0.0139348, "auxiliary_loss_mlp": 0.01035191, "balance_loss_clip": 1.23352456, "balance_loss_mlp": 1.01662993, "epoch": 0.9986472268149706, "flos": 16882254495360.0, "grad_norm": 1.612761275562252, "language_loss": 0.63325381, "learning_rate": 1.8353300491158462e-11, "loss": 0.6575405, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.18554688, "step": 16610, "time_per_iteration": 2.8661773204803467 }, { "auxiliary_loss_clip": 0.01398896, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.23960698, "balance_loss_mlp": 1.01545191, "epoch": 0.9987073500676387, "flos": 22064709784320.0, "grad_norm": 16.767731120781075, "language_loss": 0.68186361, "learning_rate": 1.672274094288717e-11, "loss": 0.70620394, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 1.59277344, "router_z_loss_mlp": 0.19677734, "step": 16611, "time_per_iteration": 2.816735029220581 }, { "auxiliary_loss_clip": 0.01398712, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.23761225, "balance_loss_mlp": 1.01451087, "epoch": 0.9987674733203066, "flos": 30495272371200.0, "grad_norm": 1.4330105718083253, "language_loss": 0.70623183, "learning_rate": 1.5168020961020544e-11, "loss": 0.730564, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 1.61132812, "router_z_loss_mlp": 0.1998291, "step": 16612, "time_per_iteration": 2.9174954891204834 }, { "auxiliary_loss_clip": 0.01389505, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.23344541, "balance_loss_mlp": 1.01336682, "epoch": 0.9988275965729746, "flos": 27756595987200.0, "grad_norm": 1.5849791146017618, "language_loss": 0.74494529, "learning_rate": 1.3689140604400407e-11, "loss": 0.76915789, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 1.56152344, "router_z_loss_mlp": 0.18383789, "step": 16613, "time_per_iteration": 2.8861424922943115 }, { "auxiliary_loss_clip": 0.01389624, "auxiliary_loss_mlp": 0.01032459, "balance_loss_clip": 1.22996783, "balance_loss_mlp": 1.01315928, "epoch": 0.9988877198256426, "flos": 17531600889600.0, "grad_norm": 2.1873032954509215, "language_loss": 0.74338591, "learning_rate": 1.2286099928981996e-11, "loss": 0.76760674, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 1.59667969, "router_z_loss_mlp": 0.19299316, "step": 16614, "time_per_iteration": 2.801055669784546 }, { "auxiliary_loss_clip": 0.01398696, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.23942804, "balance_loss_mlp": 1.01443589, "epoch": 0.9989478430783105, "flos": 21006412473600.0, "grad_norm": 2.065331199839602, "language_loss": 0.73211551, "learning_rate": 1.0958898988278065e-11, "loss": 0.75644135, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 1.59375, "router_z_loss_mlp": 0.19445801, "step": 16615, "time_per_iteration": 2.957162857055664 }, { "auxiliary_loss_clip": 0.01408518, "auxiliary_loss_mlp": 0.01029028, "balance_loss_clip": 1.24497175, "balance_loss_mlp": 1.01065826, "epoch": 0.9990079663309785, "flos": 13378458977280.0, "grad_norm": 3.056121369958587, "language_loss": 0.79146641, "learning_rate": 9.70753783247069e-12, "loss": 0.81584185, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 1.63476562, "router_z_loss_mlp": 0.18359375, "step": 16616, "time_per_iteration": 2.8193466663360596 }, { "auxiliary_loss_clip": 0.0140043, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.24193227, "balance_loss_mlp": 1.01364553, "epoch": 0.9990680895836465, "flos": 17318877246720.0, "grad_norm": 1.8849169299726047, "language_loss": 0.83677983, "learning_rate": 8.532016508855378e-12, "loss": 0.86111474, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 1.58789062, "router_z_loss_mlp": 0.1940918, "step": 16617, "time_per_iteration": 2.8249504566192627 }, { "auxiliary_loss_clip": 0.01393575, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.23639393, "balance_loss_mlp": 1.01130497, "epoch": 0.9991282128363145, "flos": 24218930096640.0, "grad_norm": 1.9948768914063952, "language_loss": 0.79220641, "learning_rate": 7.43233506206309e-12, "loss": 0.81643271, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 1.57324219, "router_z_loss_mlp": 0.1776123, "step": 16618, "time_per_iteration": 2.879115104675293 }, { "auxiliary_loss_clip": 0.01392465, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.23491514, "balance_loss_mlp": 1.01414728, "epoch": 0.9991883360889824, "flos": 21184315603200.0, "grad_norm": 1.627230997868662, "language_loss": 0.75904751, "learning_rate": 6.408493534060255e-12, "loss": 0.78329885, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 1.57519531, "router_z_loss_mlp": 0.18518066, "step": 16619, "time_per_iteration": 2.8527066707611084 }, { "auxiliary_loss_clip": 0.01386741, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.2310704, "balance_loss_mlp": 1.01328635, "epoch": 0.9992484593416504, "flos": 19910579961600.0, "grad_norm": 1.9238489520353774, "language_loss": 0.87078863, "learning_rate": 5.460491963260594e-12, "loss": 0.89497507, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 1.55664062, "router_z_loss_mlp": 0.18603516, "step": 16620, "time_per_iteration": 2.851719617843628 }, { "auxiliary_loss_clip": 0.01389313, "auxiliary_loss_mlp": 0.01027935, "balance_loss_clip": 1.23224902, "balance_loss_mlp": 1.010638, "epoch": 0.9993085825943183, "flos": 24867145370880.0, "grad_norm": 1.7690788998381917, "language_loss": 0.7305612, "learning_rate": 4.58833038607942e-12, "loss": 0.75473368, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 1.57128906, "router_z_loss_mlp": 0.17297363, "step": 16621, "time_per_iteration": 2.89024019241333 }, { "auxiliary_loss_clip": 0.01179163, "auxiliary_loss_mlp": 0.01029627, "balance_loss_clip": 1.09071612, "balance_loss_mlp": 1.00349665, "epoch": 0.9993687058469863, "flos": 71319657404160.0, "grad_norm": 0.7382580761413181, "language_loss": 0.56546247, "learning_rate": 3.79200883515729e-12, "loss": 0.5875504, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 0.26171875, "step": 16622, "time_per_iteration": 3.567823886871338 }, { "auxiliary_loss_clip": 0.01402878, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.24267018, "balance_loss_mlp": 1.01430511, "epoch": 0.9994288290996542, "flos": 12206198718720.0, "grad_norm": 1.9077223565094001, "language_loss": 0.71944743, "learning_rate": 3.071527340914315e-12, "loss": 0.74380636, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 1.60253906, "router_z_loss_mlp": 0.18725586, "step": 16623, "time_per_iteration": 2.854400634765625 }, { "auxiliary_loss_clip": 0.01390473, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.23234987, "balance_loss_mlp": 1.00983417, "epoch": 0.9994889523523223, "flos": 17897677718400.0, "grad_norm": 1.8034415668210102, "language_loss": 0.75078022, "learning_rate": 2.4268859304399368e-12, "loss": 0.77497524, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 1.58105469, "router_z_loss_mlp": 0.19189453, "step": 16624, "time_per_iteration": 2.897861957550049 }, { "auxiliary_loss_clip": 0.01393988, "auxiliary_loss_mlp": 0.01029158, "balance_loss_clip": 1.23479736, "balance_loss_mlp": 1.00940514, "epoch": 0.9995490756049902, "flos": 26590262797440.0, "grad_norm": 1.5451847888440806, "language_loss": 0.74692374, "learning_rate": 1.8580846286031514e-12, "loss": 0.77115524, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 1.59179688, "router_z_loss_mlp": 0.19750977, "step": 16625, "time_per_iteration": 2.9265923500061035 }, { "auxiliary_loss_clip": 0.0138045, "auxiliary_loss_mlp": 0.0103054, "balance_loss_clip": 1.22587824, "balance_loss_mlp": 1.01153851, "epoch": 0.9996091988576582, "flos": 22210145130240.0, "grad_norm": 2.0296810382562644, "language_loss": 0.78008509, "learning_rate": 1.3651234567202408e-12, "loss": 0.80419505, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 1.546875, "router_z_loss_mlp": 0.19006348, "step": 16626, "time_per_iteration": 2.8440189361572266 }, { "auxiliary_loss_clip": 0.01395946, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.23829889, "balance_loss_mlp": 1.01222777, "epoch": 0.9996693221103262, "flos": 27382601318400.0, "grad_norm": 2.281150078360747, "language_loss": 0.82095659, "learning_rate": 9.480024334429515e-13, "loss": 0.84521902, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 0.18066406, "step": 16627, "time_per_iteration": 2.9165232181549072 }, { "auxiliary_loss_clip": 0.01403332, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.24057233, "balance_loss_mlp": 1.01290917, "epoch": 0.9997294453629941, "flos": 26881766916480.0, "grad_norm": 2.217513637178297, "language_loss": 0.72398841, "learning_rate": 6.067215747584952e-13, "loss": 0.7483536, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 1.62695312, "router_z_loss_mlp": 0.20263672, "step": 16628, "time_per_iteration": 2.9542076587677 }, { "auxiliary_loss_clip": 0.01392562, "auxiliary_loss_mlp": 0.01031561, "balance_loss_clip": 1.23247933, "balance_loss_mlp": 1.01277328, "epoch": 0.9997895686156621, "flos": 23487409866240.0, "grad_norm": 1.796385392838348, "language_loss": 0.7607643, "learning_rate": 3.4128089332341456e-13, "loss": 0.78500557, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 0.18786621, "step": 16629, "time_per_iteration": 4.279237747192383 }, { "auxiliary_loss_clip": 0.01418061, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.25414956, "balance_loss_mlp": 1.01437283, "epoch": 0.9998496918683301, "flos": 20233285009920.0, "grad_norm": 1.6544386008431677, "language_loss": 0.6107229, "learning_rate": 1.5168039935176126e-13, "loss": 0.63523757, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 1.63867188, "router_z_loss_mlp": 0.19030762, "step": 16630, "time_per_iteration": 2.8854622840881348 }, { "auxiliary_loss_clip": 0.01399146, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.23930359, "balance_loss_mlp": 1.01429462, "epoch": 0.9999098151209981, "flos": 21662907546240.0, "grad_norm": 2.1908451151571398, "language_loss": 0.61663115, "learning_rate": 3.792010017100722e-14, "loss": 0.64095813, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 1.59960938, "router_z_loss_mlp": 0.19262695, "step": 16631, "time_per_iteration": 4.266470432281494 }, { "auxiliary_loss_clip": 0.01384094, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.22959709, "balance_loss_mlp": 1.01159823, "epoch": 0.999969938373666, "flos": 11551513438080.0, "grad_norm": 2.3658797066110053, "language_loss": 0.73203969, "learning_rate": 0.0, "loss": 0.75617027, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 1.54492188, "router_z_loss_mlp": 0.17358398, "step": 16632, "time_per_iteration": 2.78379225730896 }, { "epoch": 0.999969938373666, "num_input_tokens_seen": 358911570, "step": 16632, "total_flos": 1.3998867231602115e+18, "train_loss": 0.78706475517687, "train_runtime": 52477.3142, "train_samples_per_second": 12.678, "train_steps_per_second": 0.317 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3998867231602115e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }